From 7416ba97ef8e5f9592c32dae6caaf46c1dbd7610 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 12 Jan 2023 13:25:09 +0100 Subject: Talk 2023-01-18 WIP --- doc/logo/garage_hires_crop.png | Bin 0 -> 41924 bytes doc/talks/2023-01-18-tocatta/.gitignore | 17 + doc/talks/2023-01-18-tocatta/Makefile | 12 + doc/talks/2023-01-18-tocatta/abstract.md | 39 + .../2023-01-18-tocatta/assets/AGPLv3_Logo.png | Bin 0 -> 32497 bytes doc/talks/2023-01-18-tocatta/assets/aerogramme.png | Bin 0 -> 117865 bytes doc/talks/2023-01-18-tocatta/assets/aerogramme.svg | 1241 ++++++ .../assets/aerogramme_components1.drawio.pdf | Bin 0 -> 31966 bytes .../assets/aerogramme_components1.png | Bin 0 -> 26898 bytes .../assets/aerogramme_components2.drawio.pdf | Bin 0 -> 31688 bytes .../assets/aerogramme_components2.png | Bin 0 -> 27405 bytes .../assets/aerogramme_datatype.drawio.pdf | Bin 0 -> 31073 bytes .../assets/aerogramme_datatype.png | Bin 0 -> 9090 bytes .../assets/aerogramme_keys.drawio.pdf | Bin 0 -> 25145 bytes .../2023-01-18-tocatta/assets/aerogramme_keys.png | Bin 0 -> 17869 bytes doc/talks/2023-01-18-tocatta/assets/alex.jpg | Bin 0 -> 4914 bytes doc/talks/2023-01-18-tocatta/assets/atuin.jpg | Bin 0 -> 269747 bytes .../2023-01-18-tocatta/assets/compatibility.png | Bin 0 -> 84505 bytes .../assets/consistent_hashing_1.svg | 301 ++ .../assets/consistent_hashing_2.svg | 334 ++ .../assets/consistent_hashing_3.svg | 358 ++ .../assets/consistent_hashing_4.svg | 377 ++ doc/talks/2023-01-18-tocatta/assets/deuxfleurs.svg | 91 + .../assets/endpoint-latency-dc.png | Bin 0 -> 131776 bytes .../2023-01-18-tocatta/assets/garage.drawio.pdf | Bin 0 -> 26098 bytes .../2023-01-18-tocatta/assets/garage.drawio.png | Bin 0 -> 13463 bytes .../2023-01-18-tocatta/assets/garage2.drawio.png | Bin 0 -> 89618 bytes .../2023-01-18-tocatta/assets/garage2a.drawio.pdf | Bin 0 -> 33911 bytes .../2023-01-18-tocatta/assets/garage2b.drawio.pdf | Bin 0 -> 31051 bytes .../2023-01-18-tocatta/assets/garage_tables.svg | 537 
+++ doc/talks/2023-01-18-tocatta/assets/inframap.jpg | Bin 0 -> 38247 bytes .../2023-01-18-tocatta/assets/location-aware.png | Bin 0 -> 99269 bytes .../2023-01-18-tocatta/assets/logo_chatons.png | Bin 0 -> 203533 bytes doc/talks/2023-01-18-tocatta/assets/map.png | Bin 0 -> 148270 bytes doc/talks/2023-01-18-tocatta/assets/minio.png | Bin 0 -> 13497 bytes doc/talks/2023-01-18-tocatta/assets/neptune.jpg | Bin 0 -> 177936 bytes doc/talks/2023-01-18-tocatta/assets/quentin.jpg | Bin 0 -> 39221 bytes doc/talks/2023-01-18-tocatta/assets/rust_logo.png | Bin 0 -> 14835 bytes doc/talks/2023-01-18-tocatta/assets/slide1.png | Bin 0 -> 89059 bytes doc/talks/2023-01-18-tocatta/assets/slide2.png | Bin 0 -> 83364 bytes doc/talks/2023-01-18-tocatta/assets/slide3.png | Bin 0 -> 127275 bytes doc/talks/2023-01-18-tocatta/assets/slideB1.png | Bin 0 -> 86072 bytes doc/talks/2023-01-18-tocatta/assets/slideB2.png | Bin 0 -> 83399 bytes doc/talks/2023-01-18-tocatta/assets/slideB3.png | Bin 0 -> 82581 bytes doc/talks/2023-01-18-tocatta/assets/slides.svg | 4326 ++++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/slidesB.svg | 444 ++ doc/talks/2023-01-18-tocatta/talk.pdf | Bin 0 -> 2572439 bytes doc/talks/2023-01-18-tocatta/talk.tex | 623 +++ 48 files changed, 8700 insertions(+) create mode 100644 doc/logo/garage_hires_crop.png create mode 100644 doc/talks/2023-01-18-tocatta/.gitignore create mode 100644 doc/talks/2023-01-18-tocatta/Makefile create mode 100644 doc/talks/2023-01-18-tocatta/abstract.md create mode 100644 doc/talks/2023-01-18-tocatta/assets/AGPLv3_Logo.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.drawio.pdf create mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.drawio.pdf create mode 100644 
doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.drawio.pdf create mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.drawio.pdf create mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/alex.jpg create mode 100644 doc/talks/2023-01-18-tocatta/assets/atuin.jpg create mode 100644 doc/talks/2023-01-18-tocatta/assets/compatibility.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/consistent_hashing_1.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/consistent_hashing_2.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/consistent_hashing_3.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/consistent_hashing_4.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/deuxfleurs.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/endpoint-latency-dc.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf create mode 100644 doc/talks/2023-01-18-tocatta/assets/garage.drawio.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/garage2.drawio.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/garage2a.drawio.pdf create mode 100644 doc/talks/2023-01-18-tocatta/assets/garage2b.drawio.pdf create mode 100644 doc/talks/2023-01-18-tocatta/assets/garage_tables.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/inframap.jpg create mode 100644 doc/talks/2023-01-18-tocatta/assets/location-aware.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/logo_chatons.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/map.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/minio.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/neptune.jpg create mode 100644 doc/talks/2023-01-18-tocatta/assets/quentin.jpg create mode 100644 
doc/talks/2023-01-18-tocatta/assets/rust_logo.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/slide1.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/slide2.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/slide3.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/slideB1.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/slideB2.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/slideB3.png create mode 100644 doc/talks/2023-01-18-tocatta/assets/slides.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/slidesB.svg create mode 100644 doc/talks/2023-01-18-tocatta/talk.pdf create mode 100644 doc/talks/2023-01-18-tocatta/talk.tex (limited to 'doc') diff --git a/doc/logo/garage_hires_crop.png b/doc/logo/garage_hires_crop.png new file mode 100644 index 00000000..2fd0babc Binary files /dev/null and b/doc/logo/garage_hires_crop.png differ diff --git a/doc/talks/2023-01-18-tocatta/.gitignore b/doc/talks/2023-01-18-tocatta/.gitignore new file mode 100644 index 00000000..9f1f00e6 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/.gitignore @@ -0,0 +1,17 @@ +* + +!*.txt +!*.md + +!assets + +!.gitignore +!*.svg +!*.png +!*.jpg +!*.tex +!Makefile +!.gitignore +!assets/*.drawio.pdf + +!talk.pdf diff --git a/doc/talks/2023-01-18-tocatta/Makefile b/doc/talks/2023-01-18-tocatta/Makefile new file mode 100644 index 00000000..4d600178 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/Makefile @@ -0,0 +1,12 @@ +ASSETS=assets/consistent_hashing_1.pdf \ + assets/consistent_hashing_2.pdf \ + assets/consistent_hashing_3.pdf \ + assets/consistent_hashing_4.pdf \ + assets/garage_tables.pdf \ + assets/deuxfleurs.pdf + +talk.pdf: talk.tex $(ASSETS) + pdflatex talk.tex + +assets/%.pdf: assets/%.svg + inkscape -D -z --file=$^ --export-pdf=$@ diff --git a/doc/talks/2023-01-18-tocatta/abstract.md b/doc/talks/2023-01-18-tocatta/abstract.md new file mode 100644 index 00000000..b2658868 --- /dev/null +++ 
b/doc/talks/2023-01-18-tocatta/abstract.md @@ -0,0 +1,39 @@ +### (fr) Garage, un système de stockage de données géo-distribué léger et robuste + +Garage est un système de stockage de données léger, géo-distribué, qui +implémente le protocole de stockage S3 d'Amazon. Garage est destiné +principalement à l'auto-hébergement sur du matériel courant d'occasion. À ce +titre, il doit tolérer un grand nombre de pannes: coupures de courant, coupures +de connexion Internet, pannes de machines, ... Il doit également être facile à +déployer et à maintenir, afin de pouvoir être facilement utilisé par des +amateurs ou des petites organisations. + +Cette présentation vous proposera un aperçu de Garage et du choix technique +principal qui rend un système comme Garage possible: le refus d'utiliser des +algorithmes de consensus, remplacés avantageusement par des méthodes à +cohérence faible. Notre modèle est fortement inspiré de la base de données +Dynamo (DeCandia et al, 2007), et fait usage des types de données CRDT (Shapiro +et al, 2011). Nous explorerons comment ces méthodes s'appliquent à la +construction de l'abstraction "stockage objet" dans un système distribué, et +quelles autres abstractions peuvent ou ne peuvent pas être construites dans ce +modèle. + +### (en) Garage, a lightweight and robust geo-distributed data storage system + +Garage is a lightweight geo-distributed data store that implements the Amazon +S3 object storage protocol. Garage is meant primarily for self-hosting at home +on second-hand commodity hardware, meaning it has to tolerate a wide variety of +failure scenarios such as power cuts, Internet disconnections and machine +crashes. It also has to be easy to deploy and maintain, so that hobbyists and +small organizations can use it without trouble. 
+ +This talk will present Garage and the key technical choice that made Garage +possible: refusing to use consensus algorithms and using instead weak +consistency methods, with a model that is loosely based on that of the Dynamo +database (DeCandia et al, 2007) and that makes heavy use of conflict-free +replicated data types (Shapiro et al, 2011). We will explore how these methods +are suited to building the "object store" abstraction in a distributed system, +and what other abstractions are possible or impossible to build in this model. + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/AGPLv3_Logo.png b/doc/talks/2023-01-18-tocatta/assets/AGPLv3_Logo.png new file mode 100644 index 00000000..445284a3 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/AGPLv3_Logo.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme.png b/doc/talks/2023-01-18-tocatta/assets/aerogramme.png new file mode 100644 index 00000000..3aabe3ad Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme.svg b/doc/talks/2023-01-18-tocatta/assets/aerogramme.svg new file mode 100644 index 00000000..0c1ee127 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/aerogramme.svg @@ -0,0 +1,1241 @@ + + + + + + K2V APIS3 APIAerogramme + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +IMAPIMAPIMAPMessageindexMessagebodies diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.drawio.pdf new file mode 100644 index 00000000..71a90f26 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.drawio.pdf differ diff --git 
a/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.png b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.png new file mode 100644 index 00000000..fb81b460 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.drawio.pdf new file mode 100644 index 00000000..87e42eed Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.png b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.png new file mode 100644 index 00000000..f9e2df14 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.drawio.pdf new file mode 100644 index 00000000..0606e059 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.png b/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.png new file mode 100644 index 00000000..c3b015a1 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.drawio.pdf new file mode 100644 index 00000000..8fea81c7 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.png b/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.png new file mode 100644 index 00000000..ed2077d9 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.png differ diff 
--git a/doc/talks/2023-01-18-tocatta/assets/alex.jpg b/doc/talks/2023-01-18-tocatta/assets/alex.jpg new file mode 100644 index 00000000..eac0f0a9 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/alex.jpg differ diff --git a/doc/talks/2023-01-18-tocatta/assets/atuin.jpg b/doc/talks/2023-01-18-tocatta/assets/atuin.jpg new file mode 100644 index 00000000..f2fbd61d Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/atuin.jpg differ diff --git a/doc/talks/2023-01-18-tocatta/assets/compatibility.png b/doc/talks/2023-01-18-tocatta/assets/compatibility.png new file mode 100644 index 00000000..ce364a9b Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/compatibility.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_1.svg b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_1.svg new file mode 100644 index 00000000..f8d24fd8 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_1.svg @@ -0,0 +1,301 @@ + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + 1 + + + + 2 + + + + 3 + + + + 4 + + + + 5 + + + + 6 + + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_2.svg b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_2.svg new file mode 100644 index 00000000..5ac8faf6 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_2.svg @@ -0,0 +1,334 @@ + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + 1 + + + + 2 + + + + 3 + + + + 4 + + + + 5 + + + + 6 + + + + + + + + + + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_3.svg b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_3.svg new file mode 100644 index 00000000..fdfd3efc --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_3.svg @@ -0,0 +1,358 @@ + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + 1 + + + + 2 + 
+ + + 3 + + + + 4 + + + + 5 + + + + 6 + + + + + + + + + + + + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_4.svg b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_4.svg new file mode 100644 index 00000000..95ed0e02 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/consistent_hashing_4.svg @@ -0,0 +1,377 @@ + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + 1 + + + + 2 + + + + 3 + + + + 4 + + + + 5 + + + + 6 + + + + + + + + + + + + + + + + + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/deuxfleurs.svg b/doc/talks/2023-01-18-tocatta/assets/deuxfleurs.svg new file mode 100644 index 00000000..c298c22b --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/deuxfleurs.svg @@ -0,0 +1,91 @@ + + + + + + + + + + + + + + + D + F + diff --git a/doc/talks/2023-01-18-tocatta/assets/endpoint-latency-dc.png b/doc/talks/2023-01-18-tocatta/assets/endpoint-latency-dc.png new file mode 100644 index 00000000..7c7411cd Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/endpoint-latency-dc.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf new file mode 100644 index 00000000..a54a163c Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png new file mode 100644 index 00000000..386dd862 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage2.drawio.png b/doc/talks/2023-01-18-tocatta/assets/garage2.drawio.png new file mode 100644 index 00000000..8562fbcf Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/garage2.drawio.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage2a.drawio.pdf 
b/doc/talks/2023-01-18-tocatta/assets/garage2a.drawio.pdf new file mode 100644 index 00000000..422c9343 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/garage2a.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage2b.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/garage2b.drawio.pdf new file mode 100644 index 00000000..05a9710e Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/garage2b.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage_tables.svg b/doc/talks/2023-01-18-tocatta/assets/garage_tables.svg new file mode 100644 index 00000000..c7172713 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/garage_tables.svg @@ -0,0 +1,537 @@ + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + Object + + + + + + bucket + + + + + + file path + + = partition key + + = sort key + + + + + + Version 1 + deleted + + + + + + + Version 2 + id + + size + MIME type + ... + + + + + + Version + + id + h(block 1) + h(block 2) + ... 
+ + + + + Data block + + hash + data + + + + Objects table + Versions table + Blocks table + + diff --git a/doc/talks/2023-01-18-tocatta/assets/inframap.jpg b/doc/talks/2023-01-18-tocatta/assets/inframap.jpg new file mode 100644 index 00000000..19905a99 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/inframap.jpg differ diff --git a/doc/talks/2023-01-18-tocatta/assets/location-aware.png b/doc/talks/2023-01-18-tocatta/assets/location-aware.png new file mode 100644 index 00000000..f5966865 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/location-aware.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/logo_chatons.png b/doc/talks/2023-01-18-tocatta/assets/logo_chatons.png new file mode 100644 index 00000000..890cf17e Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/logo_chatons.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/map.png b/doc/talks/2023-01-18-tocatta/assets/map.png new file mode 100644 index 00000000..1dff3ab6 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/map.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/minio.png b/doc/talks/2023-01-18-tocatta/assets/minio.png new file mode 100644 index 00000000..a71e9ccc Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/minio.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/neptune.jpg b/doc/talks/2023-01-18-tocatta/assets/neptune.jpg new file mode 100644 index 00000000..e59f0bfa Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/neptune.jpg differ diff --git a/doc/talks/2023-01-18-tocatta/assets/quentin.jpg b/doc/talks/2023-01-18-tocatta/assets/quentin.jpg new file mode 100644 index 00000000..d9a7b1e7 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/quentin.jpg differ diff --git a/doc/talks/2023-01-18-tocatta/assets/rust_logo.png b/doc/talks/2023-01-18-tocatta/assets/rust_logo.png new file mode 100644 index 00000000..0e4809ec Binary files /dev/null and 
b/doc/talks/2023-01-18-tocatta/assets/rust_logo.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/slide1.png b/doc/talks/2023-01-18-tocatta/assets/slide1.png new file mode 100644 index 00000000..eb2e67a0 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/slide1.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/slide2.png b/doc/talks/2023-01-18-tocatta/assets/slide2.png new file mode 100644 index 00000000..126a39b8 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/slide2.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/slide3.png b/doc/talks/2023-01-18-tocatta/assets/slide3.png new file mode 100644 index 00000000..a39f96bf Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/slide3.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/slideB1.png b/doc/talks/2023-01-18-tocatta/assets/slideB1.png new file mode 100644 index 00000000..b14b6070 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/slideB1.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/slideB2.png b/doc/talks/2023-01-18-tocatta/assets/slideB2.png new file mode 100644 index 00000000..a881a796 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/slideB2.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/slideB3.png b/doc/talks/2023-01-18-tocatta/assets/slideB3.png new file mode 100644 index 00000000..830709d2 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/slideB3.png differ diff --git a/doc/talks/2023-01-18-tocatta/assets/slides.svg b/doc/talks/2023-01-18-tocatta/assets/slides.svg new file mode 100644 index 00000000..9946c6fb --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/slides.svg @@ -0,0 +1,4326 @@ + + + + + + + + + + + + + + + + + + + + + + User-facing application + Database + Filesystem + + + + + + + + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/slidesB.svg b/doc/talks/2023-01-18-tocatta/assets/slidesB.svg new file mode 100644 index 
00000000..c0a6e97c --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/slidesB.svg @@ -0,0 +1,444 @@ + + + +User-facing applicationDatabase*K2VObject storage*(not really a database)Database diff --git a/doc/talks/2023-01-18-tocatta/talk.pdf b/doc/talks/2023-01-18-tocatta/talk.pdf new file mode 100644 index 00000000..5acb9198 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/talk.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/talk.tex b/doc/talks/2023-01-18-tocatta/talk.tex new file mode 100644 index 00000000..566f56ec --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/talk.tex @@ -0,0 +1,623 @@ +%\nonstopmode +\documentclass[aspectratio=169]{beamer} +\usepackage[utf8]{inputenc} +% \usepackage[frenchb]{babel} +\usepackage{amsmath} +\usepackage{mathtools} +\usepackage{breqn} +\usepackage{multirow} +\usetheme{boxes} +\usepackage{graphicx} +\usepackage{adjustbox} +%\useoutertheme[footline=authortitle,subsection=false]{miniframes} +%\useoutertheme[footline=authorinstitute,subsection=false]{miniframes} +\useoutertheme{infolines} +\setbeamertemplate{headline}{} + +\beamertemplatenavigationsymbolsempty + +\definecolor{TitleOrange}{RGB}{255,137,0} +\setbeamercolor{title}{fg=TitleOrange} +\setbeamercolor{frametitle}{fg=TitleOrange} + +\definecolor{ListOrange}{RGB}{255,145,5} +\setbeamertemplate{itemize item}{\color{ListOrange}$\blacktriangleright$} + +\definecolor{verygrey}{RGB}{70,70,70} +\setbeamercolor{normal text}{fg=verygrey} + + +\usepackage{tabu} +\usepackage{multicol} +\usepackage{vwcol} +\usepackage{stmaryrd} +\usepackage{graphicx} + +\usepackage[normalem]{ulem} + +\AtBeginSection[]{ + \begin{frame} + \vfill + \centering + \begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title} + \usebeamerfont{title}\insertsectionhead\par% + \end{beamercolorbox} + \vfill + \end{frame} +} + +\title{Garage} +\subtitle{a lightweight and robust geo-distributed data storage system} +\author{Deuxfleurs Association} +\date{Inria, 2023-01-18} + 
+\begin{document} + +\begin{frame} + \centering + \includegraphics[width=.3\linewidth]{../../sticker/Garage.pdf} + \vspace{1em} + + {\large\bf Deuxfleurs Association} + \vspace{1em} + + \url{https://garagehq.deuxfleurs.fr/} + + Matrix channel: \texttt{\#garage:deuxfleurs.fr} +\end{frame} + +\begin{frame} + \frametitle{Who we are} + \begin{columns}[t] + \begin{column}{.2\textwidth} + \centering + \adjincludegraphics[width=.4\linewidth, valign=t]{assets/alex.jpg} + \end{column} + \begin{column}{.6\textwidth} + \textbf{Alex Auvolat}\\ + PhD; co-founder of Deuxfleurs + \end{column} + \begin{column}{.2\textwidth} + ~ + \end{column} + \end{columns} + \vspace{1em} + + \begin{columns}[t] + \begin{column}{.2\textwidth} + ~ + \end{column} + \begin{column}{.6\textwidth} + \textbf{Quentin Dufour}\\ + PhD; co-founder of Deuxfleurs + \end{column} + \begin{column}{.2\textwidth} + \centering + \adjincludegraphics[width=.5\linewidth, valign=t]{assets/quentin.jpg} + \end{column} + \end{columns} + \vspace{2em} + + \begin{columns}[t] + \begin{column}{.2\textwidth} + \centering + \adjincludegraphics[width=.5\linewidth, valign=t]{assets/deuxfleurs.pdf} + \end{column} + \begin{column}{.6\textwidth} + \textbf{Deuxfleurs}\\ + A non-profit self-hosting collective,\\ + member of the CHATONS network + \end{column} + \begin{column}{.2\textwidth} + \centering + \adjincludegraphics[width=.7\linewidth, valign=t]{assets/logo_chatons.png} + \end{column} + \end{columns} + +\end{frame} + +\begin{frame} + \frametitle{Our objective at Deuxfleurs} + + \begin{center} + \textbf{Promote self-hosting and small-scale hosting\\ + as an alternative to large cloud providers} + \end{center} + \vspace{2em} + \visible<2->{ + Why is it hard? 
+ } + \visible<3->{ + \vspace{2em} + \begin{center} + \textbf{\underline{Resilience}}\\ + {\footnotesize (we want good uptime/availability with low supervision)} + \end{center} + } +\end{frame} + +\begin{frame} + \frametitle{How to make a \underline{stable} system} + + Enterprise-grade systems typically employ: + \vspace{1em} + \begin{itemize} + \item RAID + \item Redundant power grid + UPS + \item Redundant Internet connections + \item Low-latency links + \item ... + \end{itemize} + \vspace{1em} + $\to$ it's costly and only worth it at DC scale +\end{frame} + +\begin{frame} + \frametitle{How to make a \underline{resilient} system} + + \only<1,4-5>{ + Instead, we use: + \vspace{1em} + \begin{itemize} + \item \textcolor<2->{gray}{Commodity hardware (e.g. old desktop PCs)} + \vspace{.5em} + \item<4-> \textcolor<5->{gray}{Commodity Internet (e.g. FTTB, FTTH) and power grid} + \vspace{.5em} + \item<5-> \textcolor<6->{gray}{\textbf{Geographical redundancy} (multi-site replication)} + \end{itemize} + } + \only<2>{ + \begin{center} + \includegraphics[width=.8\linewidth]{assets/atuin.jpg} + \end{center} + } + \only<3>{ + \begin{center} + \includegraphics[width=.8\linewidth]{assets/neptune.jpg} + \end{center} + } + \only<6>{ + \begin{center} + \includegraphics[width=.5\linewidth]{assets/inframap.jpg} + \end{center} + } +\end{frame} + +\begin{frame} + \frametitle{How to make this happen} + \begin{center} + \only<1>{\includegraphics[width=.8\linewidth]{assets/slide1.png}}% + \only<2>{\includegraphics[width=.8\linewidth]{assets/slide2.png}}% + \only<3>{\includegraphics[width=.8\linewidth]{assets/slide3.png}}% + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Distributed file systems are slow} + File systems are complex, for example: + \vspace{1em} + \begin{itemize} + \item Concurrent modification by several processes + \vspace{1em} + \item Folder hierarchies + \vspace{1em} + \item Other requirements of the POSIX spec + \end{itemize} + \vspace{1em} + Coordination in a 
distributed system is costly + + \vspace{1em} + Costs explode with commodity hardware / Internet connections\\ + {\small (we experienced this!)} +\end{frame} + +\begin{frame} + \frametitle{A simpler solution: object storage} + Only two operations: + \vspace{1em} + \begin{itemize} + \item Put an object at a key + \vspace{1em} + \item Retrieve an object from its key + \end{itemize} + \vspace{1em} + {\footnotesize (and a few others)} + + \vspace{1em} + Sufficient for many applications! +\end{frame} + +\begin{frame} + \frametitle{A simpler solution: object storage} + \begin{center} + \includegraphics[height=6em]{../2020-12-02_wide-team/img/Amazon-S3.jpg} + \hspace{3em} + \includegraphics[height=5em]{assets/minio.png} + \hspace{3em} + \includegraphics[height=6em]{../../logo/garage_hires_crop.png} + \end{center} + \vspace{1em} + S3: a de-facto standard, many compatible applications + + \vspace{1em} + + MinIO is self-hostable but not suited for geo-distributed deployments + + \vspace{1em} + + \textbf{Garage is a self-hosted drop-in replacement for the Amazon S3 object store} +\end{frame} + + +\begin{frame} + \frametitle{The data model of object storage} + Object storage is basically a key-value store: + \vspace{1em} + + \begin{center} + \begin{tabular}{|l|p{8cm}|} + \hline + \textbf{Key: file path + name} & \textbf{Value: file data + metadata} \\ + \hline + \hline + \texttt{index.html} & + \texttt{Content-Type: text/html; charset=utf-8} \newline + \texttt{Content-Length: 24929} \newline + \texttt{} \\ + \hline + \texttt{img/logo.svg} & + \texttt{Content-Type: image/svg+xml} \newline + \texttt{Content-Length: 13429} \newline + \texttt{} \\ + \hline + \texttt{download/index.html} & + \texttt{Content-Type: text/html; charset=utf-8} \newline + \texttt{Content-Length: 26563} \newline + \texttt{} \\ + \hline + \end{tabular} + \end{center} + +\end{frame} + +\begin{frame} + \frametitle{Two big problems} + \begin{enumerate} + \item \textbf{How to place data on different nodes?}\\ + 
\vspace{1em} + \underline{Constraints:} heterogeneous hardware\\ + \underline{Objective:} $n$ copies of everything, maximize usable capacity, maximize resilience\\ + \vspace{1em} + $\to$ the Dynamo model + optimization algorithms + \vspace{2em} + \item<2-> \textbf{How to guarantee consistency?}\\ + \vspace{1em} + \underline{Constraints:} slow network (geographical distance), node unavailability/crashes\\ + \underline{Objective:} maximize availability, read-after-write guarantee\\ + \vspace{1em} + $\to$ CRDTs, monotonicity, read and write quorums + \end{enumerate} +\end{frame} + +\section{Problem 1: placing data} + +\begin{frame} + \frametitle{Key-value stores, upgraded: the Dynamo model} + \textbf{Two keys:} + \begin{itemize} + \item Partition key: used to divide data into partitions (shards) + \item Sort key: used to identify items inside a partition + \end{itemize} + + \vspace{1em} + + \begin{center} + \begin{tabular}{|l|l|p{3cm}|} + \hline + \textbf{Partition key: bucket} & \textbf{Sort key: filename} & \textbf{Value} \\ + \hline + \hline + \texttt{website} & \texttt{index.html} & (file data) \\ + \hline + \texttt{website} & \texttt{img/logo.svg} & (file data) \\ + \hline + \texttt{website} & \texttt{download/index.html} & (file data) \\ + \hline + \hline + \texttt{backup} & \texttt{borg/index.2822} & (file data) \\ + \hline + \texttt{backup} & \texttt{borg/data/2/2329} & (file data) \\ + \hline + \texttt{backup} & \texttt{borg/data/2/2680} & (file data) \\ + \hline + \hline + \texttt{private} & \texttt{qq3a2nbe1qjq0ebbvo6ocsp6co} & (file data) \\ + \hline + \end{tabular} + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Key-value stores, upgraded: the Dynamo model} + \begin{itemize} + \item Data with different partition keys is stored independently,\\ + on a different set of nodes\\ + \vspace{.5em} + $\to$ no easy way to list all partition keys\\ + $\to$ no cross-shard transactions\\ + \vspace{2em} + \item Placing data: hash the partition key, select 
nodes accordingly\\ + \vspace{.5em} + $\to$ distributed hash table (DHT) + \vspace{2em} + \item For a given value of the partition key, items can be listed using their sort keys + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{How to spread files over different cluster nodes?} + \textbf{Consistent hashing (Dynamo):} + \vspace{1em} + + \begin{center} + \only<1>{\includegraphics[width=.40\columnwidth]{assets/consistent_hashing_1.pdf}}% + \only<2>{\includegraphics[width=.40\columnwidth]{assets/consistent_hashing_2.pdf}}% + \only<3>{\includegraphics[width=.40\columnwidth]{assets/consistent_hashing_3.pdf}}% + \only<4>{\includegraphics[width=.40\columnwidth]{assets/consistent_hashing_4.pdf}}% + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Constraint: location-awareness} + \begin{center} + \includegraphics[width=\linewidth]{assets/location-aware.png} + \end{center} + \vspace{2em} + Garage replicates data on different zones when possible +\end{frame} + +\begin{frame} + \frametitle{Constraint: location-awareness} + \begin{center} + \includegraphics[width=.8\linewidth]{assets/map.png} + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Issues with consistent hashing} + \begin{itemize} + \item Consistent hashing doesn't dispatch data based on geographical location of nodes + \vspace{1em} + \item<2-> Geographically aware adaptation, try 1:\\ + data quantities not well balanced between nodes + \vspace{1em} + \item<3-> Geographically aware adaptation, try 2:\\ + too many reshuffles when adding/removing nodes + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{How to spread files over different cluster nodes?} + \textbf{Garage's method: build an index table} + \vspace{1em} + + Realization: we can actually precompute an optimal solution + \vspace{1em} + + \visible<2->{ + \begin{center} + \begin{tabular}{|l|l|l|l|} + \hline + \textbf{Partition} & \textbf{Node 1} & \textbf{Node 2} & \textbf{Node 3} \\ + \hline + \hline + Partition 0 & Io 
(jupiter) & Drosera (atuin) & Courgette (neptune) \\ + \hline + Partition 1 & Datura (atuin) & Courgette (neptune) & Io (jupiter) \\ + \hline + Partition 2 & Io (jupiter) & Celeri (neptune) & Drosera (atuin) \\ + \hline + \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ & \hspace{1em}$\vdots$ \\ + \hline + Partition 255 & Concombre (neptune) & Io (jupiter) & Drosera (atuin) \\ + \hline + \end{tabular} + \end{center} + } + \vspace{1em} + \visible<3->{ + The index table is built centrally using an optimal algorithm,\\ + then propagated to all nodes + } +\end{frame} + +\begin{frame} + \frametitle{The relationship between \emph{partition} and \emph{partition key}} + \begin{center} + \begin{tabular}{|l|l|l|l|} + \hline + \textbf{Partition key} & \textbf{Partition} & \textbf{Sort key} & \textbf{Value} \\ + \hline + \hline + \texttt{website} & Partition 12 & \texttt{index.html} & (file data) \\ + \hline + \texttt{website} & Partition 12 & \texttt{img/logo.svg} & (file data) \\ + \hline + \texttt{website} & Partition 12 &\texttt{download/index.html} & (file data) \\ + \hline + \hline + \texttt{backup} & Partition 42 & \texttt{borg/index.2822} & (file data) \\ + \hline + \texttt{backup} & Partition 42 & \texttt{borg/data/2/2329} & (file data) \\ + \hline + \texttt{backup} & Partition 42 & \texttt{borg/data/2/2680} & (file data) \\ + \hline + \hline + \texttt{private} & Partition 42 & \texttt{qq3a2nbe1qjq0ebbvo6ocsp6co} & (file data) \\ + \hline + \end{tabular} + \end{center} + \vspace{1em} + \textbf{To read or write an item:} hash partition key + \\ \hspace{5cm} $\to$ determine partition number (first 8 bits) + \\ \hspace{5cm} $\to$ find associated nodes +\end{frame} + +\begin{frame} + \frametitle{Garage's internal data structures} + \centering + \includegraphics[width=.75\columnwidth]{assets/garage_tables.pdf} +\end{frame} + +\begin{frame} + \frametitle{Storing and retrieving files} + \begin{center} + 
\only<1>{\includegraphics[width=.45\linewidth]{assets/garage2a.drawio.pdf}}% + \only<2>{\includegraphics[width=.45\linewidth]{assets/garage2b.drawio.pdf}}% + \end{center} +\end{frame} + +\section{Problem 2: ensuring consistency} + +%\begin{frame} +% \frametitle{Garage's architecture} +% \begin{center} +% \includegraphics[width=.35\linewidth]{assets/garage.drawio.pdf} +% \end{center} +%\end{frame} + +\begin{frame} + \frametitle{Garage is \emph{coordination-free}:} + \begin{itemize} + \item No Raft or Paxos + \vspace{1em} + \item Internal data types are CRDTs + \vspace{1em} + \item All nodes are equivalent (no master/leader/index node) + \end{itemize} + \vspace{2em} + $\to$ less sensitive to higher latencies between nodes +\end{frame} + +\begin{frame} + \frametitle{Consistency model} + \begin{itemize} + \item Not ACID (not required by S3 spec) / not linearizable + \vspace{1em} + \item \textbf{Read-after-write consistency}\\ + {\footnotesize (stronger than eventual consistency)} + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Impact on performances} + \begin{center} + \includegraphics[width=.8\linewidth]{assets/endpoint-latency-dc.png} + \end{center} +\end{frame} + + +\begin{frame} + \frametitle{An ever-increasing compatibility list} + \begin{center} + \includegraphics[width=.7\linewidth]{assets/compatibility.png} + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Further plans for Garage} + \begin{center} + \only<1>{\includegraphics[width=.8\linewidth]{assets/slideB1.png}}% + \only<2>{\includegraphics[width=.8\linewidth]{assets/slideB2.png}}% + \only<3>{\includegraphics[width=.8\linewidth]{assets/slideB3.png}}% + \end{center} +\end{frame} + +\begin{frame} + \frametitle{K2V Design} + \begin{itemize} + \item A new, custom, minimal API + \vspace{1em} + \item<2-> Exposes the partitioning mechanism of Garage\\ + K2V = partition key / sort key / value (like Dynamo) + \vspace{1em} + \item<3-> Coordination-free, CRDT-friendly (inspired by Riak)\\ + 
\vspace{1em} + \item<4-> Cryptography-friendly: values are binary blobs + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Application: an e-mail storage server} + \begin{center} + \only<1>{\includegraphics[width=.9\linewidth]{assets/aerogramme.png}}% + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Aerogramme data model} + \begin{center} + \only<1>{\includegraphics[width=.4\linewidth]{assets/aerogramme_datatype.drawio.pdf}}% + \only<2->{\includegraphics[width=.9\linewidth]{assets/aerogramme_keys.drawio.pdf}\vspace{1em}}% + \end{center} + \visible<3->{Aerogramme encrypts all stored values for privacy\\ + (Garage server administrators can't read your mail)} +\end{frame} + +\begin{frame} + \frametitle{Different deployment scenarios} + \begin{center} + \only<1>{\includegraphics[width=.9\linewidth]{assets/aerogramme_components1.drawio.pdf}}% + \only<2>{\includegraphics[width=.9\linewidth]{assets/aerogramme_components2.drawio.pdf}}% + \end{center} +\end{frame} + +\begin{frame} + \frametitle{A new model for building resilient software} + \begin{itemize} + \item Design a data model suited to K2V\\ + {\footnotesize (see Cassandra docs on porting SQL data models to Cassandra)} + \vspace{1em} + \begin{itemize} + \item Use CRDTs or other eventually consistent data types (see e.g. Bayou) + \vspace{1em} + \item Store opaque binary blobs to provide End-to-End Encryption\\ + \end{itemize} + \vspace{1em} + \item Store big blobs (files) in S3 + \vspace{1em} + \item Let Garage manage sharding, replication, failover, etc. 
+ \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Research perspectives} + \begin{itemize} + \item Write about Garage's global architecture \emph{(paper in progress)} + \vspace{1em} + \item Measure and improve Garage's performances + \vspace{1em} + \item Discuss the optimal layout algorithm, provide proofs + \vspace{1em} + \item Write about our proposed architecture for (E2EE) apps over K2V+S3 + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Where to find us} + \begin{center} + \includegraphics[width=.25\linewidth]{../../logo/garage_hires.png}\\ + \vspace{-1em} + \url{https://garagehq.deuxfleurs.fr/}\\ + \url{mailto:garagehq@deuxfleurs.fr}\\ + \texttt{\#garage:deuxfleurs.fr} on Matrix + + \vspace{1.5em} + \includegraphics[width=.06\linewidth]{assets/rust_logo.png} + \includegraphics[width=.13\linewidth]{assets/AGPLv3_Logo.png} + \end{center} +\end{frame} + +\end{document} + +%% vim: set ts=4 sw=4 tw=0 noet spelllang=en : -- cgit v1.2.3 From fe850f62c908492d2c3cbe4c55c3cf3b3d097de0 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 12 Jan 2023 16:27:02 +0100 Subject: Talk 2023-01-18: some WIP talking about consensus --- doc/talks/2023-01-18-tocatta/Makefile | 4 + doc/talks/2023-01-18-tocatta/assets/consensus.svg | 137 +++++++++++++++ .../2023-01-18-tocatta/assets/garage.drawio.pdf | Bin 26098 -> 0 bytes .../2023-01-18-tocatta/assets/garage.drawio.png | Bin 13463 -> 0 bytes doc/talks/2023-01-18-tocatta/talk.pdf | Bin 2572439 -> 2594434 bytes doc/talks/2023-01-18-tocatta/talk.tex | 187 +++++++++++++++++++-- 6 files changed, 311 insertions(+), 17 deletions(-) create mode 100644 doc/talks/2023-01-18-tocatta/assets/consensus.svg delete mode 100644 doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf delete mode 100644 doc/talks/2023-01-18-tocatta/assets/garage.drawio.png (limited to 'doc') diff --git a/doc/talks/2023-01-18-tocatta/Makefile b/doc/talks/2023-01-18-tocatta/Makefile index 4d600178..4a967d24 100644 --- 
a/doc/talks/2023-01-18-tocatta/Makefile +++ b/doc/talks/2023-01-18-tocatta/Makefile @@ -3,6 +3,7 @@ ASSETS=assets/consistent_hashing_1.pdf \ assets/consistent_hashing_3.pdf \ assets/consistent_hashing_4.pdf \ assets/garage_tables.pdf \ + assets/consensus.pdf_tex \ assets/deuxfleurs.pdf talk.pdf: talk.tex $(ASSETS) @@ -10,3 +11,6 @@ talk.pdf: talk.tex $(ASSETS) assets/%.pdf: assets/%.svg inkscape -D -z --file=$^ --export-pdf=$@ + +assets/%.pdf_tex: assets/%.svg + inkscape -D -z --file=$^ --export-pdf=$@ --export-latex diff --git a/doc/talks/2023-01-18-tocatta/assets/consensus.svg b/doc/talks/2023-01-18-tocatta/assets/consensus.svg new file mode 100644 index 00000000..8321e383 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/consensus.svg @@ -0,0 +1,137 @@ + + + + + + + + + + + + + + $\bot$ + + + + $x$ + + + $propose(x) / x$ + $propose(y) / x$ + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf deleted file mode 100644 index a54a163c..00000000 Binary files a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf and /dev/null differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png deleted file mode 100644 index 386dd862..00000000 Binary files a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png and /dev/null differ diff --git a/doc/talks/2023-01-18-tocatta/talk.pdf b/doc/talks/2023-01-18-tocatta/talk.pdf index 5acb9198..ba9bde3d 100644 Binary files a/doc/talks/2023-01-18-tocatta/talk.pdf and b/doc/talks/2023-01-18-tocatta/talk.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/talk.tex b/doc/talks/2023-01-18-tocatta/talk.tex index 566f56ec..ac9b4077 100644 --- a/doc/talks/2023-01-18-tocatta/talk.tex +++ b/doc/talks/2023-01-18-tocatta/talk.tex @@ -8,6 +8,7 @@ \usepackage{multirow} \usetheme{boxes} \usepackage{graphicx} +\usepackage{import} \usepackage{adjustbox} 
%\useoutertheme[footline=authortitle,subsection=false]{miniframes} %\useoutertheme[footline=authorinstitute,subsection=false]{miniframes} @@ -479,33 +480,185 @@ \section{Problem 2: ensuring consistency} -%\begin{frame} -% \frametitle{Garage's architecture} -% \begin{center} -% \includegraphics[width=.35\linewidth]{assets/garage.drawio.pdf} -% \end{center} -%\end{frame} +\begin{frame} + \frametitle{Consensus vs weak consistency} + + \hspace{1em} + \begin{minipage}{7cm} + \textbf{Consensus-based systems:} + \vspace{1em} + \begin{itemize} + \item \textbf{Leader-based:} a leader is elected to coordinate + all reads and writes + \vspace{1em} + \item \textbf{Linearizability} of all operations\\ + (strongest consistency guarantee) + \vspace{1em} + \item \textbf{Replicated state machines} that can implement + any sequential specification + \vspace{1em} + \item \textbf{Costly}, the leader is a bottleneck; + leader elections on failure take time + \end{itemize} + \end{minipage} + \hfill + \begin{minipage}{7cm} \visible<2->{ + \textbf{Weakly consistent systems:} + \vspace{1em} + \begin{itemize} + \item \textbf{Nodes are equivalent}, any node + can originate a read or write operation + \vspace{1em} + \item \textbf{Read-after-write consistency} with quorums, + eventual consistency without + \vspace{1em} + \item \textbf{Operations have to commute}, i.e.~we + can only implement CRDTs + \vspace{1em} + \item \textbf{Fast}, no node is a bottleneck;\\ + works the same with offline nodes + \end{itemize} + } \end{minipage} + \hspace{1em} +\end{frame} + +\begin{frame} + \frametitle{Consensus vs weak consistency} + \begin{center} + \textbf{The same objects cannot be implemented in both models.} + \end{center} + \vspace{2em} + + \hspace{1em} + \begin{minipage}{7cm} + \underline{Consensus-based systems:} + + \vspace{1em} + + \textbf{Any sequential specification}\\~ + \end{minipage} + \hfill + \begin{minipage}{7cm} + \underline{Weakly consistent systems:} + + \vspace{1em} + + \textbf{CRDTs 
only}\\(conflict-free replicated data types) + \end{minipage} + \hspace{1em} + + \vspace{3em} + \begin{center} + Part of the complexity is \textbf{reported to the consumer of the API} + \end{center} +\end{frame} + +\begin{frame} + \frametitle{Consensus vs weak consistency} + \begin{center} + \textbf{From a theoretical point of view:}\\ + + \end{center} + \vspace{2em} + + \hspace{1em} + \begin{minipage}{6.5cm} + \underline{Consensus-based systems:} + + \vspace{1em} + + Require \textbf{additional assumptions} such as a fault detector or a strong RNG\\~ + \end{minipage} + \hfill + \begin{minipage}{6.5cm} + \underline{Weakly consistent systems:} + + \vspace{1em} + + Can be implemented in \textbf{any asynchronous message passing distributed system} + \end{minipage} + \hspace{1em} + + \vspace{3em} + \begin{center} + They represent \textbf{different classes of computational capability} + \end{center} +\end{frame} \begin{frame} - \frametitle{Garage is \emph{coordination-free}:} + \frametitle{Understanding the power of consensus} + \textbf{Consensus:} an API with a single operation, $propose(x)$ + \begin{enumerate} + \item nodes all call $propose(x)$ with their proposed value; + \item nodes all receive the same value as a return value, which is one of the proposed values + \end{enumerate} + \vspace{1em} + + \visible<2->{ + \textbf{Equivalent to} a distributed algorithm that gives a total order on all requests + } + \vspace{1em} + + \visible<3->{ + \textbf{Implemented by} this simple replicated state machine: + \vspace{.5em} + \begin{figure} + \centering + \def\svgwidth{.5\textwidth} + \large + \import{assets/}{consensus.pdf_tex} + \end{figure} + \vspace{1em} + } +\end{frame} + +\begin{frame} + \frametitle{Can my object be implemented without consensus?} + \underline{Given the specification of an API:} + \vspace{2em} \begin{itemize} - \item No Raft or Paxos - \vspace{1em} - \item Internal data types are CRDTs + \item \textbf{Using this API, we can implement the consensus 
object} (the $propose$ function)\\ + $\to$ the API is equivalent to consensus/total ordering of messages\\ + $\to$ the API cannot be implemented in a weakly consistent system + \vspace{2em} + \item \textbf{This API can be implemented using only weak primitives}\\ + (e.g. a bunch of atomic registers)\\ + $\to$ the API is strictly weaker than consensus\\ + $\to$ we can implement it in Garage! + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Why avoid consensus?} + Consensus can be implemented reasonably well in practice, so why avoid it? + \vspace{2em} + \begin{itemize} + \item \textbf{Software complexity:} RAFT and PAXOS are complex beasts;\\ + harder to prove, harder to reason about + \vspace{1.5em} + \item \textbf{Performance issues:} \vspace{1em} - \item All nodes are equivalent (no master/leader/index node) + \begin{itemize} + \item The leader is a \textbf{bottleneck} for all requests + \vspace{1em} + \item Particularly \textbf{sensitive to higher latency} between nodes + \end{itemize} \end{itemize} - \vspace{2em} - $\to$ less sensitive to higher latencies between nodes \end{frame} \begin{frame} - \frametitle{Consistency model} + \frametitle{What can we implement without consensus?} \begin{itemize} - \item Not ACID (not required by S3 spec) / not linearizable + \item Any \textbf{conflict-free replicated data type} (CRDT) + \vspace{1em} + \item Non-transactional key-value stores such as S3 are equivalent to a simple CRDT:\\ + a \textbf{last-writer-wins registry} + \vspace{1em} + \item \textbf{Read-after-write consistency} can be implemented + using quorums on read and write operations \vspace{1em} - \item \textbf{Read-after-write consistency}\\ - {\footnotesize (stronger than eventual consistency)} + \item \textbf{Monotonicity of reads} can be implemented with repair-on-read\\ + (makes reads more costly, not implemented in Garage) \end{itemize} \end{frame} -- cgit v1.2.3 From f5a7bc37365e8e593359db114a4e44f8e8c65207 Mon Sep 17 00:00:00 2001 From: Alex 
Auvolat Date: Thu, 12 Jan 2023 17:17:13 +0100 Subject: Add 12 lattice diagrams to explain CRDTs and quorums --- doc/talks/2023-01-18-tocatta/Makefile | 12 + doc/talks/2023-01-18-tocatta/assets/lattice1.svg | 433 ++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/lattice2.svg | 514 +++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/lattice3.svg | 515 +++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/lattice4.svg | 525 ++++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/lattice5.svg | 536 ++++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/lattice6.svg | 553 +++++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/lattice7.svg | 581 ++++++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/lattice8.svg | 587 ++++++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/lattice9.svg | 587 ++++++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/latticeA.svg | 587 ++++++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/latticeB.svg | 598 +++++++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/latticeC.svg | 598 +++++++++++++++++++++++ doc/talks/2023-01-18-tocatta/talk.pdf | Bin 2594434 -> 2643346 bytes doc/talks/2023-01-18-tocatta/talk.tex | 43 +- 15 files changed, 6658 insertions(+), 11 deletions(-) create mode 100644 doc/talks/2023-01-18-tocatta/assets/lattice1.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/lattice2.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/lattice3.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/lattice4.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/lattice5.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/lattice6.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/lattice7.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/lattice8.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/lattice9.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/latticeA.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/latticeB.svg 
create mode 100644 doc/talks/2023-01-18-tocatta/assets/latticeC.svg (limited to 'doc') diff --git a/doc/talks/2023-01-18-tocatta/Makefile b/doc/talks/2023-01-18-tocatta/Makefile index 4a967d24..a1f76e15 100644 --- a/doc/talks/2023-01-18-tocatta/Makefile +++ b/doc/talks/2023-01-18-tocatta/Makefile @@ -4,6 +4,18 @@ ASSETS=assets/consistent_hashing_1.pdf \ assets/consistent_hashing_4.pdf \ assets/garage_tables.pdf \ assets/consensus.pdf_tex \ + assets/lattice1.pdf_tex \ + assets/lattice2.pdf_tex \ + assets/lattice3.pdf_tex \ + assets/lattice4.pdf_tex \ + assets/lattice5.pdf_tex \ + assets/lattice6.pdf_tex \ + assets/lattice7.pdf_tex \ + assets/lattice8.pdf_tex \ + assets/lattice9.pdf_tex \ + assets/latticeA.pdf_tex \ + assets/latticeB.pdf_tex \ + assets/latticeC.pdf_tex \ assets/deuxfleurs.pdf talk.pdf: talk.tex $(ASSETS) diff --git a/doc/talks/2023-01-18-tocatta/assets/lattice1.svg b/doc/talks/2023-01-18-tocatta/assets/lattice1.svg new file mode 100644 index 00000000..8bfa5aa7 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/lattice1.svg @@ -0,0 +1,433 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/lattice2.svg b/doc/talks/2023-01-18-tocatta/assets/lattice2.svg new file mode 100644 index 00000000..adcd92cb --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/lattice2.svg @@ -0,0 +1,514 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\not\sqsupseteq \{a\}$ + $\not\sqsupseteq \{a\}$ + $\not\sqsupseteq \{a\}$ + + diff --git a/doc/talks/2023-01-18-tocatta/assets/lattice3.svg b/doc/talks/2023-01-18-tocatta/assets/lattice3.svg new file mode 
100644 index 00000000..640dc468 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/lattice3.svg @@ -0,0 +1,515 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + $\not\sqsupseteq \{a\}$ + $\not\sqsupseteq \{a\}$ + + diff --git a/doc/talks/2023-01-18-tocatta/assets/lattice4.svg b/doc/talks/2023-01-18-tocatta/assets/lattice4.svg new file mode 100644 index 00000000..b2a99e28 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/lattice4.svg @@ -0,0 +1,525 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + $\sqsupseteq \{a\} \to$ OK + $\not\sqsupseteq \{a\}$ + return OK + + diff --git a/doc/talks/2023-01-18-tocatta/assets/lattice5.svg b/doc/talks/2023-01-18-tocatta/assets/lattice5.svg new file mode 100644 index 00000000..bc6b7195 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/lattice5.svg @@ -0,0 +1,536 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $read()$: + $\sqsupseteq \{a\} \to$ OK + $\sqsupseteq \{a\} \to$ OK + $\not\sqsupseteq \{a\}$ + return OK + + diff --git a/doc/talks/2023-01-18-tocatta/assets/lattice6.svg b/doc/talks/2023-01-18-tocatta/assets/lattice6.svg new file mode 100644 index 00000000..176b1715 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/lattice6.svg @@ -0,0 +1,553 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + 
$\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $read()$: + $\sqsupseteq \{a\} \to$ OK + $\sqsupseteq \{a\} \to$ OK + $\not\sqsupseteq \{a\}$ + + $\to \{\}$ + return OK + + diff --git a/doc/talks/2023-01-18-tocatta/assets/lattice7.svg b/doc/talks/2023-01-18-tocatta/assets/lattice7.svg new file mode 100644 index 00000000..7ce8bda8 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/lattice7.svg @@ -0,0 +1,581 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $read()$: + $\sqsupseteq \{a\} \to$ OK + $\sqsupseteq \{a\} \to$ OK + $\not\sqsupseteq \{a\}$ + + $\to \{\}$ + return OK + return $\{\}\sqcup\{a\}=\{a\}$ + + $\to \{a\}$ + + diff --git a/doc/talks/2023-01-18-tocatta/assets/lattice8.svg b/doc/talks/2023-01-18-tocatta/assets/lattice8.svg new file mode 100644 index 00000000..3bada791 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/lattice8.svg @@ -0,0 +1,587 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + $\sqsupseteq \{a\} \to$ OK + $\not\sqsupseteq \{a\}$ + return OK + + + + $write(\{b\})$: + $\not\sqsupseteq \{b\}$ + $\not\sqsupseteq \{b\}$ + $\not\sqsupseteq \{b\}$ + + diff --git a/doc/talks/2023-01-18-tocatta/assets/lattice9.svg b/doc/talks/2023-01-18-tocatta/assets/lattice9.svg new file mode 100644 index 00000000..8b3c6585 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/lattice9.svg @@ -0,0 +1,587 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + 
$\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + $\sqsupseteq \{a\} \to$ OK + $\not\sqsupseteq \{a\}$ + return OK + + + + $write(\{b\})$: + $\not\sqsupseteq \{b\}$ + $\not\sqsupseteq \{b\}$ + $\sqsupseteq \{b\} \to$ OK + + diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeA.svg b/doc/talks/2023-01-18-tocatta/assets/latticeA.svg new file mode 100644 index 00000000..400ccff8 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/latticeA.svg @@ -0,0 +1,587 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + $\sqsupseteq \{a\} \to$ OK + $\sqsupseteq \{a\}$ + return OK + + + + $write(\{b\})$: + $\not\sqsupseteq \{b\}$ + $\not\sqsupseteq \{b\}$ + $\sqsupseteq \{b\} \to$ OK + + diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeB.svg b/doc/talks/2023-01-18-tocatta/assets/latticeB.svg new file mode 100644 index 00000000..06725d75 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/latticeB.svg @@ -0,0 +1,598 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + $\sqsupseteq \{a\} \to$ OK + $\sqsupseteq \{a\}$ + return OK + return OK + + + + $write(\{b\})$: + $\not\sqsupseteq \{b\}$ + $\sqsupseteq \{b\} \to$ OK + $\sqsupseteq \{b\} \to$ OK + + diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeC.svg b/doc/talks/2023-01-18-tocatta/assets/latticeC.svg new file mode 100644 index 00000000..c815af94 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/latticeC.svg @@ -0,0 +1,598 @@ + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + $\sqsupseteq \{a\} \to$ OK + $\sqsupseteq \{a\}$ + return OK + return OK + + + + $write(\{b\})$: + $\sqsupseteq \{b\}$ + $\sqsupseteq \{b\} \to$ OK + $\sqsupseteq \{b\} \to$ OK + + diff --git a/doc/talks/2023-01-18-tocatta/talk.pdf b/doc/talks/2023-01-18-tocatta/talk.pdf index ba9bde3d..02f605e8 100644 Binary files a/doc/talks/2023-01-18-tocatta/talk.pdf and b/doc/talks/2023-01-18-tocatta/talk.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/talk.tex b/doc/talks/2023-01-18-tocatta/talk.tex index ac9b4077..e789f597 100644 --- a/doc/talks/2023-01-18-tocatta/talk.tex +++ b/doc/talks/2023-01-18-tocatta/talk.tex @@ -494,8 +494,7 @@ \item \textbf{Linearizability} of all operations\\ (strongest consistency guarantee) \vspace{1em} - \item \textbf{Replicated state machines} that can implement - any sequential specification + \item Any sequential specification can be implemented as a \textbf{replicated state machine} \vspace{1em} \item \textbf{Costly}, the leader is a bottleneck; leader elections on failure take time @@ -515,7 +514,7 @@ \item \textbf{Operations have to commute}, i.e.~we can only implement CRDTs \vspace{1em} - \item \textbf{Fast}, no node is a bottleneck;\\ + \item \textbf{Fast}, no single bottleneck;\\ works the same with offline nodes \end{itemize} } \end{minipage} @@ -530,27 +529,29 @@ \vspace{2em} \hspace{1em} - \begin{minipage}{7cm} + \begin{minipage}{6.5cm} \underline{Consensus-based systems:} \vspace{1em} \textbf{Any sequential specification}\\~ + + \vspace{1em} + \textbf{Easier to program for}: just write your program as if it were sequential on a single machine + \end{minipage} \hfill - \begin{minipage}{7cm} + \begin{minipage}{6.5cm} \underline{Weakly consistent systems:} \vspace{1em} \textbf{CRDTs 
only}\\(conflict-free replicated data types) + + \vspace{1em} + Part of the complexity is \textbf{reported to the consumer of the API}\\~ \end{minipage} \hspace{1em} - - \vspace{3em} - \begin{center} - Part of the complexity is \textbf{reported to the consumer of the API} - \end{center} \end{frame} \begin{frame} @@ -663,7 +664,27 @@ \end{frame} \begin{frame} - \frametitle{Impact on performances} + \frametitle{Understanding CRDTs and quorums} + \begin{figure} + \centering + \def\svgwidth{.8\textwidth} + \only<1>{\import{assets/}{lattice1.pdf_tex}}% + \only<2>{\import{assets/}{lattice2.pdf_tex}}% + \only<3>{\import{assets/}{lattice3.pdf_tex}}% + \only<4>{\import{assets/}{lattice4.pdf_tex}}% + \only<5>{\import{assets/}{lattice5.pdf_tex}}% + \only<6>{\import{assets/}{lattice6.pdf_tex}}% + \only<7>{\import{assets/}{lattice7.pdf_tex}}% + \only<8>{\import{assets/}{lattice8.pdf_tex}}% + \only<9>{\import{assets/}{lattice9.pdf_tex}}% + \only<10>{\import{assets/}{latticeA.pdf_tex}}% + \only<11>{\import{assets/}{latticeB.pdf_tex}}% + \only<12>{\import{assets/}{latticeC.pdf_tex}}% + \end{figure} +\end{frame} + +\begin{frame} + \frametitle{Performance gains in practice} \begin{center} \includegraphics[width=.8\linewidth]{assets/endpoint-latency-dc.png} \end{center} -- cgit v1.2.3 From 1f5e3aaf8e704ae35a1ccea0f923ce92d4f9ed73 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 12 Jan 2023 17:39:12 +0100 Subject: Add explanations about quorums --- .../assets/aerogramme_components1.drawio.pdf | Bin 31966 -> 0 bytes .../assets/aerogramme_components2.drawio.pdf | Bin 31688 -> 0 bytes .../assets/aerogramme_datatype.drawio.pdf | Bin 31073 -> 0 bytes .../assets/aerogramme_keys.drawio.pdf | Bin 25145 -> 0 bytes doc/talks/2023-01-18-tocatta/talk.pdf | Bin 2643346 -> 2494390 bytes doc/talks/2023-01-18-tocatta/talk.tex | 91 +++++++++++++++------ 6 files changed, 65 insertions(+), 26 deletions(-) delete mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.drawio.pdf 
delete mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.drawio.pdf delete mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.drawio.pdf delete mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.drawio.pdf (limited to 'doc') diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.drawio.pdf deleted file mode 100644 index 71a90f26..00000000 Binary files a/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.drawio.pdf and /dev/null differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.drawio.pdf deleted file mode 100644 index 87e42eed..00000000 Binary files a/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.drawio.pdf and /dev/null differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.drawio.pdf deleted file mode 100644 index 0606e059..00000000 Binary files a/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.drawio.pdf and /dev/null differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.drawio.pdf deleted file mode 100644 index 8fea81c7..00000000 Binary files a/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.drawio.pdf and /dev/null differ diff --git a/doc/talks/2023-01-18-tocatta/talk.pdf b/doc/talks/2023-01-18-tocatta/talk.pdf index 02f605e8..e4acf75e 100644 Binary files a/doc/talks/2023-01-18-tocatta/talk.pdf and b/doc/talks/2023-01-18-tocatta/talk.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/talk.tex b/doc/talks/2023-01-18-tocatta/talk.tex index e789f597..db6d26ef 100644 --- a/doc/talks/2023-01-18-tocatta/talk.tex +++ b/doc/talks/2023-01-18-tocatta/talk.tex @@ -684,20 +684,77 @@ \end{frame} \begin{frame} - 
\frametitle{Performance gains in practice} - \begin{center} - \includegraphics[width=.8\linewidth]{assets/endpoint-latency-dc.png} - \end{center} + \frametitle{Read-after-write consistency:} + \textbf{Property:} If node $A$ did an operation $write(x)$ and received an OK response,\\ + \hspace{2cm} and node $B$ starts an operation $read()$ after $A$ received OK,\\ + \hspace{2cm} then $B$ will read a value $x' \sqsupseteq x$. + + \vspace{1em} + + \hspace{1em} + \begin{minipage}{6.8cm} + \textbf{Algorithm $write(x)$:} + \begin{enumerate} + \item Broadcast $write(x)$ to all nodes + \item Wait for $k > n/2$ nodes to reply OK + \item Return OK + \end{enumerate} + \end{minipage} + \hfill + \begin{minipage}{6.8cm} + \vspace{1em} + \textbf{Algorithm $read()$:} + \begin{enumerate} + \item Broadcast $read()$ to all nodes + \item Wait for $k > n/2$ nodes to reply\\ + with values $x_1, \dots, x_k$ + \item Return $x_1 \sqcup \dots \sqcup x_k$ + \end{enumerate} + \end{minipage} + \hspace{1em} + + \vspace{2em} + \textbf{Why does it work?} There is at least one node at the intersection between the two sets of nodes that replied to each request, that ``saw'' $x$ before the $read()$ started ($x_i \sqsupseteq x$). +\end{frame} + +\begin{frame} + \frametitle{Monotonical read consistency:} + \textbf{Property:} If node $A$ did an operation $read()$ and received $x$ as a response,\\ + \hspace{2cm} and node $B$ starts an operation $read()$ after $A$ received $x$,\\ + \hspace{2cm} then $B$ will read a value $x' \sqsupseteq x$. 
+ + \vspace{1em} + + \textbf{Algorithm $read()$:} + \begin{enumerate} + \item Broadcast $read()$ to all nodes + \item Wait for $k > n/2$ nodes to reply with values $x_1, \dots, x_k$ + \item If $x_i \ne x_j$ for some nodes $i$ and $j$,\\ + \hspace{1cm}then call $write(x_1 \sqcup \dots \sqcup x_k)$ and wait for OK from $k' > n/2$ nodes + \item Return $x_1 \sqcup \dots \sqcup x_k$ + \end{enumerate} + + \vspace{1em} + This makes reads slower in some cases, and is \textbf{not implemented in Garage}. \end{frame} +\begin{frame} + \frametitle{Performance gains in practice} + \begin{center} + \includegraphics[width=.8\linewidth]{assets/endpoint-latency-dc.png} + \end{center} +\end{frame} \begin{frame} - \frametitle{An ever-increasing compatibility list} - \begin{center} - \includegraphics[width=.7\linewidth]{assets/compatibility.png} - \end{center} + \frametitle{The hard parts we don't address (yet!)} + \begin{itemize} + \item Maintain consistency changes when nodes assigned to a partition change:\\ + \item TODO + \end{itemize} \end{frame} +\section{Going further than the S3 API} + \begin{frame} \frametitle{Further plans for Garage} \begin{center} @@ -728,24 +785,6 @@ \end{center} \end{frame} -\begin{frame} - \frametitle{Aerogramme data model} - \begin{center} - \only<1>{\includegraphics[width=.4\linewidth]{assets/aerogramme_datatype.drawio.pdf}}% - \only<2->{\includegraphics[width=.9\linewidth]{assets/aerogramme_keys.drawio.pdf}\vspace{1em}}% - \end{center} - \visible<3->{Aerogramme encrypts all stored values for privacy\\ - (Garage server administrators can't read your mail)} -\end{frame} - -\begin{frame} - \frametitle{Different deployment scenarios} - \begin{center} - \only<1>{\includegraphics[width=.9\linewidth]{assets/aerogramme_components1.drawio.pdf}}% - \only<2>{\includegraphics[width=.9\linewidth]{assets/aerogramme_components2.drawio.pdf}}% - \end{center} -\end{frame} - \begin{frame} \frametitle{A new model for building resilient software} \begin{itemize} -- cgit 
v1.2.3 From 9bf94faaa147e0209188db27fc9e5d6ee49656b3 Mon Sep 17 00:00:00 2001 From: kaiyou Date: Thu, 12 Jan 2023 20:46:17 +0100 Subject: Add docs about running pict-rs with garage --- doc/book/connect/apps/index.md | 65 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) (limited to 'doc') diff --git a/doc/book/connect/apps/index.md b/doc/book/connect/apps/index.md index 737351a0..4d556ff8 100644 --- a/doc/book/connect/apps/index.md +++ b/doc/book/connect/apps/index.md @@ -13,7 +13,7 @@ In this section, we cover the following web applications: | [Matrix](#matrix) | ✅ | Tested with `synapse-s3-storage-provider` | | [Pixelfed](#pixelfed) | ❓ | Not yet tested | | [Pleroma](#pleroma) | ❓ | Not yet tested | -| [Lemmy](#lemmy) | ❓ | Not yet tested | +| [Lemmy](#lemmy) | ✅ | Supported with pict-rs | | [Funkwhale](#funkwhale) | ❓ | Not yet tested | | [Misskey](#misskey) | ❓ | Not yet tested | | [Prismo](#prismo) | ❓ | Not yet tested | @@ -484,7 +484,68 @@ And add a new line. For example, to run it every 10 minutes: ## Lemmy -Lemmy uses pict-rs that [supports S3 backends](https://git.asonix.dog/asonix/pict-rs/commit/f9f4fc63d670f357c93f24147c2ee3e1278e2d97) +Lemmy uses pict-rs that [supports S3 backends](https://git.asonix.dog/asonix/pict-rs/commit/f9f4fc63d670f357c93f24147c2ee3e1278e2d97). +This feature requires `pict-rs >= 4.0.0`. + +### Creating your bucket + +This is the usual Garage setup: + +```bash +garage key new --name pictrs-key +garage bucket create pictrs-data +garage bucket allow pictrs-data --read --write --key pictrs-key +``` + +Note the Key ID and Secret Key. + +### Migrating your data + +If your pict-rs instance holds existing data, you first need to migrate to the S3 bucket. 
+ +Stop pict-rs, then run the migration utility from local filesystem to the bucket: + +``` +pict-rs \ + filesystem -p /path/to/existing/files \ + object-store \ + -e my-garage-instance.mydomain.tld:3900 \ + -b pictrs-data \ + -r garage \ + -a GK... \ + -s abcdef0123456789... +``` + +This is pretty slow, so hold on while migrating. + +### Running pict-rs with an S3 backend + +Pict-rs supports both a configuration file and environment variables. + +Either set the following section in your `pict-rs.toml`: + +``` +[store] +type = 'object_storage' +endpoint = 'http://my-garage-instance.mydomain.tld:3900' +bucket_name = 'pictrs-data' +region = 'garage' +access_key = 'GK...' +secret_key = 'abcdef0123456789...' +``` + +... or set these environment variables: + + +``` +PICTRS__STORE__TYPE=object_storage +PICTRS__STORE__ENDPOINT=http://my-garage-instance.mydomain.tld:3900 +PICTRS__STORE__BUCKET_NAME=pictrs-data +PICTRS__STORE__REGION=garage +PICTRS__STORE__ACCESS_KEY=GK... +PICTRS__STORE__SECRET_KEY=abcdef0123456789... 
+``` + ## Funkwhale -- cgit v1.2.3 From cbb522e17942797ea1f0fd972225b6945a775368 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 13 Jan 2023 12:33:27 +0100 Subject: Different lattice figures --- doc/talks/2023-01-18-tocatta/Makefile | 14 +- doc/talks/2023-01-18-tocatta/assets/lattice8.svg | 112 ++-- doc/talks/2023-01-18-tocatta/assets/lattice9.svg | 587 ----------------- doc/talks/2023-01-18-tocatta/assets/latticeA.svg | 587 ----------------- doc/talks/2023-01-18-tocatta/assets/latticeB.svg | 598 ----------------- doc/talks/2023-01-18-tocatta/assets/latticeB_1.svg | 576 +++++++++++++++++ .../2023-01-18-tocatta/assets/latticeB_10.svg | 715 +++++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/latticeB_2.svg | 576 +++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/latticeB_3.svg | 576 +++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/latticeB_4.svg | 587 +++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/latticeB_5.svg | 604 +++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/latticeB_6.svg | 632 ++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/latticeB_7.svg | 654 +++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/latticeB_8.svg | 671 +++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/latticeB_9.svg | 699 ++++++++++++++++++++ doc/talks/2023-01-18-tocatta/assets/latticeC.svg | 598 ----------------- doc/talks/2023-01-18-tocatta/talk.pdf | Bin 2494390 -> 2488793 bytes doc/talks/2023-01-18-tocatta/talk.tex | 66 +- 18 files changed, 6386 insertions(+), 2466 deletions(-) delete mode 100644 doc/talks/2023-01-18-tocatta/assets/lattice9.svg delete mode 100644 doc/talks/2023-01-18-tocatta/assets/latticeA.svg delete mode 100644 doc/talks/2023-01-18-tocatta/assets/latticeB.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/latticeB_1.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/latticeB_10.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/latticeB_2.svg create mode 100644 
doc/talks/2023-01-18-tocatta/assets/latticeB_3.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/latticeB_4.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/latticeB_5.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/latticeB_6.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/latticeB_7.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/latticeB_8.svg create mode 100644 doc/talks/2023-01-18-tocatta/assets/latticeB_9.svg delete mode 100644 doc/talks/2023-01-18-tocatta/assets/latticeC.svg (limited to 'doc') diff --git a/doc/talks/2023-01-18-tocatta/Makefile b/doc/talks/2023-01-18-tocatta/Makefile index a1f76e15..554f7b97 100644 --- a/doc/talks/2023-01-18-tocatta/Makefile +++ b/doc/talks/2023-01-18-tocatta/Makefile @@ -12,10 +12,16 @@ ASSETS=assets/consistent_hashing_1.pdf \ assets/lattice6.pdf_tex \ assets/lattice7.pdf_tex \ assets/lattice8.pdf_tex \ - assets/lattice9.pdf_tex \ - assets/latticeA.pdf_tex \ - assets/latticeB.pdf_tex \ - assets/latticeC.pdf_tex \ + assets/latticeB_1.pdf_tex \ + assets/latticeB_2.pdf_tex \ + assets/latticeB_3.pdf_tex \ + assets/latticeB_4.pdf_tex \ + assets/latticeB_5.pdf_tex \ + assets/latticeB_6.pdf_tex \ + assets/latticeB_7.pdf_tex \ + assets/latticeB_8.pdf_tex \ + assets/latticeB_9.pdf_tex \ + assets/latticeB_10.pdf_tex \ assets/deuxfleurs.pdf talk.pdf: talk.tex $(ASSETS) diff --git a/doc/talks/2023-01-18-tocatta/assets/lattice8.svg b/doc/talks/2023-01-18-tocatta/assets/lattice8.svg index 3bada791..c94a69b2 100644 --- a/doc/talks/2023-01-18-tocatta/assets/lattice8.svg +++ b/doc/talks/2023-01-18-tocatta/assets/lattice8.svg @@ -25,8 +25,8 @@ inkscape:document-units="mm" showgrid="false" inkscape:zoom="1.4734708" - inkscape:cx="451.65469" - inkscape:cy="272.14655" + inkscape:cx="399.39712" + inkscape:cy="248.39311" inkscape:window-width="1920" inkscape:window-height="999" inkscape:window-x="0" @@ -433,8 +433,8 @@ $write(\{a\})$: + $read()$: $\sqsupseteq \{a\} \to$ OK $\not\sqsupseteq \{a\}$ 
+ y="41.972523">$\sqsupseteq \{a\}$ + + $\to \{\}$ return OK - - - $write(\{b\})$: - $\not\sqsupseteq \{b\}$ - $\not\sqsupseteq \{b\}$ + x="14.395845" + y="92.005798">return $\{\}\sqcup\{a\}=\{a\}$ + $\not\sqsupseteq \{b\}$ + y="83.577797">$\to \{a\}$ diff --git a/doc/talks/2023-01-18-tocatta/assets/lattice9.svg b/doc/talks/2023-01-18-tocatta/assets/lattice9.svg deleted file mode 100644 index 8b3c6585..00000000 --- a/doc/talks/2023-01-18-tocatta/assets/lattice9.svg +++ /dev/null @@ -1,587 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - $\{\}$ - $\{a,b,c\}$ - - $\{a\}$ - $\{b\}$ - $\{c\}$ - - - $\{a,c\}$ - $\{a,b\}$ - $\{b,c\}$ - - - - - - - - - - - - - - - - - - - - $write(\{a\})$: - $\sqsupseteq \{a\} \to$ OK - $\sqsupseteq \{a\} \to$ OK - $\not\sqsupseteq \{a\}$ - return OK - - - - $write(\{b\})$: - $\not\sqsupseteq \{b\}$ - $\not\sqsupseteq \{b\}$ - $\sqsupseteq \{b\} \to$ OK - - diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeA.svg b/doc/talks/2023-01-18-tocatta/assets/latticeA.svg deleted file mode 100644 index 400ccff8..00000000 --- a/doc/talks/2023-01-18-tocatta/assets/latticeA.svg +++ /dev/null @@ -1,587 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - $\{\}$ - $\{a,b,c\}$ - - $\{a\}$ - $\{b\}$ - $\{c\}$ - - - $\{a,c\}$ - $\{a,b\}$ - $\{b,c\}$ - - - - - - - - - - - - - - - - - - - - $write(\{a\})$: - $\sqsupseteq \{a\} \to$ OK - $\sqsupseteq \{a\} \to$ OK - $\sqsupseteq \{a\}$ - return OK - - - - $write(\{b\})$: - $\not\sqsupseteq \{b\}$ - $\not\sqsupseteq \{b\}$ - $\sqsupseteq \{b\} \to$ OK - - diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeB.svg b/doc/talks/2023-01-18-tocatta/assets/latticeB.svg deleted file mode 100644 index 06725d75..00000000 --- a/doc/talks/2023-01-18-tocatta/assets/latticeB.svg +++ /dev/null @@ -1,598 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
$\{\}$ - $\{a,b,c\}$ - - $\{a\}$ - $\{b\}$ - $\{c\}$ - - - $\{a,c\}$ - $\{a,b\}$ - $\{b,c\}$ - - - - - - - - - - - - - - - - - - - - $write(\{a\})$: - $\sqsupseteq \{a\} \to$ OK - $\sqsupseteq \{a\} \to$ OK - $\sqsupseteq \{a\}$ - return OK - return OK - - - - $write(\{b\})$: - $\not\sqsupseteq \{b\}$ - $\sqsupseteq \{b\} \to$ OK - $\sqsupseteq \{b\} \to$ OK - - diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeB_1.svg b/doc/talks/2023-01-18-tocatta/assets/latticeB_1.svg new file mode 100644 index 00000000..92232a1b --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/latticeB_1.svg @@ -0,0 +1,576 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\not\sqsupseteq \{a\}$ + $\not\sqsupseteq \{a\}$ + $\not\sqsupseteq \{a\}$ + + + + $write(\{b\})$: + $\not\sqsupseteq \{b\}$ + $\not\sqsupseteq \{b\}$ + $\not\sqsupseteq \{b\}$ + + diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeB_10.svg b/doc/talks/2023-01-18-tocatta/assets/latticeB_10.svg new file mode 100644 index 00000000..34c24e0d --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/latticeB_10.svg @@ -0,0 +1,715 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + $\not\sqsupseteq \{a\}$ + $\not\sqsupseteq \{a\}$ + + + + $write(\{b\})$: + $read()$: + $\not\sqsupseteq \{b\}$ + $\sqsupseteq \{b\} \to$ OK + $\not\sqsupseteq \{b\}$ + + + $\to \{a\}$ + return $\{a\}$ + + $\to \{\}$ + + $\to \{\}$ + $read()$: + ; + return $\{b\}$ + + $\to \{b\}$ + ${\Large\textbf{??!}}$~~~~~$\{a\} \not\sqsubseteq \{b\}$ + + diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeB_2.svg 
b/doc/talks/2023-01-18-tocatta/assets/latticeB_2.svg new file mode 100644 index 00000000..c07cba2b --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/latticeB_2.svg @@ -0,0 +1,576 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + $\not\sqsupseteq \{a\}$ + $\not\sqsupseteq \{a\}$ + + + + $write(\{b\})$: + $\not\sqsupseteq \{b\}$ + $\not\sqsupseteq \{b\}$ + $\not\sqsupseteq \{b\}$ + + diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeB_3.svg b/doc/talks/2023-01-18-tocatta/assets/latticeB_3.svg new file mode 100644 index 00000000..198d1f5d --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/latticeB_3.svg @@ -0,0 +1,576 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + $\not\sqsupseteq \{a\}$ + $\not\sqsupseteq \{a\}$ + + + + $write(\{b\})$: + $\not\sqsupseteq \{b\}$ + $\sqsupseteq \{b\} \to$ OK + $\not\sqsupseteq \{b\}$ + + + diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeB_4.svg b/doc/talks/2023-01-18-tocatta/assets/latticeB_4.svg new file mode 100644 index 00000000..c5f6148d --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/latticeB_4.svg @@ -0,0 +1,587 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + $\not\sqsupseteq \{a\}$ + $\not\sqsupseteq \{a\}$ + + + + $write(\{b\})$: + $read()$: + $\not\sqsupseteq \{b\}$ + $\sqsupseteq \{b\} \to$ OK + $\not\sqsupseteq \{b\}$ 
+ + + diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeB_5.svg b/doc/talks/2023-01-18-tocatta/assets/latticeB_5.svg new file mode 100644 index 00000000..c2b668be --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/latticeB_5.svg @@ -0,0 +1,604 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + $\not\sqsupseteq \{a\}$ + $\not\sqsupseteq \{a\}$ + + + + $write(\{b\})$: + $read()$: + $\not\sqsupseteq \{b\}$ + $\sqsupseteq \{b\} \to$ OK + $\not\sqsupseteq \{b\}$ + + + $\to \{a\}$ + + diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeB_6.svg b/doc/talks/2023-01-18-tocatta/assets/latticeB_6.svg new file mode 100644 index 00000000..980823fc --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/latticeB_6.svg @@ -0,0 +1,632 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + $\not\sqsupseteq \{a\}$ + $\not\sqsupseteq \{a\}$ + + + + $write(\{b\})$: + $read()$: + $\not\sqsupseteq \{b\}$ + $\sqsupseteq \{b\} \to$ OK + $\not\sqsupseteq \{b\}$ + + + $\to \{a\}$ + return $\{a\}$ + + $\to \{\}$ + + diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeB_7.svg b/doc/talks/2023-01-18-tocatta/assets/latticeB_7.svg new file mode 100644 index 00000000..154c0b7d --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/latticeB_7.svg @@ -0,0 +1,654 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + 
$\not\sqsupseteq \{a\}$ + $\not\sqsupseteq \{a\}$ + + + + $write(\{b\})$: + $read()$: + $\not\sqsupseteq \{b\}$ + $\sqsupseteq \{b\} \to$ OK + $\not\sqsupseteq \{b\}$ + + + $\to \{a\}$ + return $\{a\}$ + + $\to \{\}$ + $read()$: + ; + + diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeB_8.svg b/doc/talks/2023-01-18-tocatta/assets/latticeB_8.svg new file mode 100644 index 00000000..21766415 --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/latticeB_8.svg @@ -0,0 +1,671 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + $\not\sqsupseteq \{a\}$ + $\not\sqsupseteq \{a\}$ + + + + $write(\{b\})$: + $read()$: + $\not\sqsupseteq \{b\}$ + $\sqsupseteq \{b\} \to$ OK + $\not\sqsupseteq \{b\}$ + + + $\to \{a\}$ + return $\{a\}$ + + $\to \{\}$ + + $\to \{\}$ + $read()$: + ; + + diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeB_9.svg b/doc/talks/2023-01-18-tocatta/assets/latticeB_9.svg new file mode 100644 index 00000000..b60f8afe --- /dev/null +++ b/doc/talks/2023-01-18-tocatta/assets/latticeB_9.svg @@ -0,0 +1,699 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $\{\}$ + $\{a,b,c\}$ + + $\{a\}$ + $\{b\}$ + $\{c\}$ + + + $\{a,c\}$ + $\{a,b\}$ + $\{b,c\}$ + + + + + + + + + + + + + + + + + + + $write(\{a\})$: + $\sqsupseteq \{a\} \to$ OK + $\not\sqsupseteq \{a\}$ + $\not\sqsupseteq \{a\}$ + + + + $write(\{b\})$: + $read()$: + $\not\sqsupseteq \{b\}$ + $\sqsupseteq \{b\} \to$ OK + $\not\sqsupseteq \{b\}$ + + + $\to \{a\}$ + return $\{a\}$ + + $\to \{\}$ + + $\to \{\}$ + $read()$: + ; + return $\{b\}$ + + $\to \{b\}$ + + diff --git a/doc/talks/2023-01-18-tocatta/assets/latticeC.svg b/doc/talks/2023-01-18-tocatta/assets/latticeC.svg deleted file mode 100644 index c815af94..00000000 --- 
a/doc/talks/2023-01-18-tocatta/assets/latticeC.svg +++ /dev/null @@ -1,598 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - $\{\}$ - $\{a,b,c\}$ - - $\{a\}$ - $\{b\}$ - $\{c\}$ - - - $\{a,c\}$ - $\{a,b\}$ - $\{b,c\}$ - - - - - - - - - - - - - - - - - - - - $write(\{a\})$: - $\sqsupseteq \{a\} \to$ OK - $\sqsupseteq \{a\} \to$ OK - $\sqsupseteq \{a\}$ - return OK - return OK - - - - $write(\{b\})$: - $\sqsupseteq \{b\}$ - $\sqsupseteq \{b\} \to$ OK - $\sqsupseteq \{b\} \to$ OK - - diff --git a/doc/talks/2023-01-18-tocatta/talk.pdf b/doc/talks/2023-01-18-tocatta/talk.pdf index e4acf75e..6a70fcd7 100644 Binary files a/doc/talks/2023-01-18-tocatta/talk.pdf and b/doc/talks/2023-01-18-tocatta/talk.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/talk.tex b/doc/talks/2023-01-18-tocatta/talk.tex index db6d26ef..4c3e4eeb 100644 --- a/doc/talks/2023-01-18-tocatta/talk.tex +++ b/doc/talks/2023-01-18-tocatta/talk.tex @@ -49,7 +49,7 @@ \title{Garage} \subtitle{a lightweight and robust geo-distributed data storage system} -\author{Deuxfleurs Association} +\author{Alex Auvolat, Deuxfleurs Association} \date{Inria, 2023-01-18} \begin{document} @@ -59,7 +59,7 @@ \includegraphics[width=.3\linewidth]{../../sticker/Garage.pdf} \vspace{1em} - {\large\bf Deuxfleurs Association} + {\large\bf Alex Auvolat, Deuxfleurs Association} \vspace{1em} \url{https://garagehq.deuxfleurs.fr/} @@ -68,7 +68,7 @@ \end{frame} \begin{frame} - \frametitle{Who we are} + \frametitle{Who I am} \begin{columns}[t] \begin{column}{.2\textwidth} \centering @@ -82,21 +82,6 @@ ~ \end{column} \end{columns} - \vspace{1em} - - \begin{columns}[t] - \begin{column}{.2\textwidth} - ~ - \end{column} - \begin{column}{.6\textwidth} - \textbf{Quentin Dufour}\\ - PhD; co-founder of Deuxfleurs - \end{column} - \begin{column}{.2\textwidth} - \centering - \adjincludegraphics[width=.5\linewidth, valign=t]{assets/quentin.jpg} - \end{column} - \end{columns} \vspace{2em} 
\begin{columns}[t] @@ -546,7 +531,7 @@ \vspace{1em} - \textbf{CRDTs only}\\(conflict-free replicated data types) + \textbf{Limited objects such as CRDTs}\\(conflict-free replicated data types) \vspace{1em} Part of the complexity is \textbf{reported to the consumer of the API}\\~ @@ -647,6 +632,13 @@ \end{itemize} \end{frame} +\begin{frame} + \frametitle{Performance gains in practice} + \begin{center} + \includegraphics[width=.8\linewidth]{assets/endpoint-latency-dc.png} + \end{center} +\end{frame} + \begin{frame} \frametitle{What can we implement without consensus?} \begin{itemize} @@ -664,7 +656,7 @@ \end{frame} \begin{frame} - \frametitle{Understanding CRDTs and quorums} + \frametitle{CRDTs and quorums: read-after-write consistency} \begin{figure} \centering \def\svgwidth{.8\textwidth} @@ -676,15 +668,11 @@ \only<6>{\import{assets/}{lattice6.pdf_tex}}% \only<7>{\import{assets/}{lattice7.pdf_tex}}% \only<8>{\import{assets/}{lattice8.pdf_tex}}% - \only<9>{\import{assets/}{lattice9.pdf_tex}}% - \only<10>{\import{assets/}{latticeA.pdf_tex}}% - \only<11>{\import{assets/}{latticeB.pdf_tex}}% - \only<12>{\import{assets/}{latticeC.pdf_tex}}% \end{figure} \end{frame} \begin{frame} - \frametitle{Read-after-write consistency:} + \frametitle{CRDTs and quorums: read-after-write consistency} \textbf{Property:} If node $A$ did an operation $write(x)$ and received an OK response,\\ \hspace{2cm} and node $B$ starts an operation $read()$ after $A$ received OK,\\ \hspace{2cm} then $B$ will read a value $x' \sqsupseteq x$. 
@@ -718,7 +706,25 @@ \end{frame} \begin{frame} - \frametitle{Monotonical read consistency:} + \frametitle{CRDTs and quorums: monotonic-reads consistency} + \begin{figure} + \centering + \def\svgwidth{.8\textwidth} + \only<1>{\import{assets/}{latticeB_1.pdf_tex}}% + \only<2>{\import{assets/}{latticeB_2.pdf_tex}}% + \only<3>{\import{assets/}{latticeB_3.pdf_tex}}% + \only<4>{\import{assets/}{latticeB_4.pdf_tex}}% + \only<5>{\import{assets/}{latticeB_5.pdf_tex}}% + \only<6>{\import{assets/}{latticeB_6.pdf_tex}}% + \only<7>{\import{assets/}{latticeB_7.pdf_tex}}% + \only<8>{\import{assets/}{latticeB_8.pdf_tex}}% + \only<9>{\import{assets/}{latticeB_9.pdf_tex}}% + \only<10>{\import{assets/}{latticeB_10.pdf_tex}}% + \end{figure} +\end{frame} + +\begin{frame} + \frametitle{CRDTs and quorums: monotonic-reads consistency} \textbf{Property:} If node $A$ did an operation $read()$ and received $x$ as a response,\\ \hspace{2cm} and node $B$ starts an operation $read()$ after $A$ received $x$,\\ \hspace{2cm} then $B$ will read a value $x' \sqsupseteq x$. @@ -735,14 +741,8 @@ \end{enumerate} \vspace{1em} - This makes reads slower in some cases, and is \textbf{not implemented in Garage}. -\end{frame} -\begin{frame} - \frametitle{Performance gains in practice} - \begin{center} - \includegraphics[width=.8\linewidth]{assets/endpoint-latency-dc.png} - \end{center} + This makes reads slower in some cases, and is \textbf{not implemented in Garage}. 
\end{frame} \begin{frame} -- cgit v1.2.3 From d44e8366e7b9ab2ad352ecee189231430ee713df Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 13 Jan 2023 13:16:55 +0100 Subject: Reorder and add a hard problem --- doc/talks/2023-01-18-tocatta/talk.pdf | Bin 2488793 -> 2490671 bytes doc/talks/2023-01-18-tocatta/talk.tex | 61 +++++++++++++++++++++------------- 2 files changed, 37 insertions(+), 24 deletions(-) (limited to 'doc') diff --git a/doc/talks/2023-01-18-tocatta/talk.pdf b/doc/talks/2023-01-18-tocatta/talk.pdf index 6a70fcd7..3d0c8830 100644 Binary files a/doc/talks/2023-01-18-tocatta/talk.pdf and b/doc/talks/2023-01-18-tocatta/talk.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/talk.tex b/doc/talks/2023-01-18-tocatta/talk.tex index 4c3e4eeb..09250cf1 100644 --- a/doc/talks/2023-01-18-tocatta/talk.tex +++ b/doc/talks/2023-01-18-tocatta/talk.tex @@ -509,7 +509,8 @@ \begin{frame} \frametitle{Consensus vs weak consistency} \begin{center} - \textbf{The same objects cannot be implemented in both models.} + \textbf{From a theoretical point of view:}\\ + \end{center} \vspace{2em} @@ -519,11 +520,8 @@ \vspace{1em} - \textbf{Any sequential specification}\\~ - - \vspace{1em} - \textbf{Easier to program for}: just write your program as if it were sequential on a single machine - + Require \textbf{additionnal assumptions} such as a fault detector or a strong RNG\\ + (FLP impossibility theorem) \end{minipage} \hfill \begin{minipage}{6.5cm} @@ -531,19 +529,20 @@ \vspace{1em} - \textbf{Limited objects such as CRDTs}\\(conflict-free replicated data types) - - \vspace{1em} - Part of the complexity is \textbf{reported to the consumer of the API}\\~ + Can be implemented in \textbf{any\\asynchronous message passing\\distributed system} with node crashes \end{minipage} \hspace{1em} + + \vspace{3em} + \begin{center} + They represent \textbf{different classes of computational capability}\\ + \end{center} \end{frame} \begin{frame} \frametitle{Consensus vs weak consistency} 
\begin{center} - \textbf{From a theoretical point of view:}\\ - + \textbf{The same objects cannot be implemented in both models.} \end{center} \vspace{2em} @@ -553,7 +552,11 @@ \vspace{1em} - Require \textbf{additionnal assumptions} such as a fault detector or a strong RNG\\~ + \textbf{Any sequential specification}\\~ + + \vspace{1em} + \textbf{Easier to program for}: just write your program as if it were sequential on a single machine + \end{minipage} \hfill \begin{minipage}{6.5cm} @@ -561,14 +564,12 @@ \vspace{1em} - Can be implemented in \textbf{any asynchronous message passing distributed system} + \textbf{Only CRDTs}\\(conflict-free replicated data types) + + \vspace{1em} + Part of the complexity is \textbf{reported to the consumer of the API}\\~ \end{minipage} \hspace{1em} - - \vspace{3em} - \begin{center} - They represent \textbf{different classes of computational capability} - \end{center} \end{frame} \begin{frame} @@ -608,7 +609,7 @@ $\to$ the API cannot be implemented in a weakly consistent system \vspace{2em} \item \textbf{This API can be implemented using only weak primitives}\\ - (e.g. a bunch of atomic registers)\\ + (e.g. in the asynchronous message passing model with no further assumption)\\ $\to$ the API is strictly weaker than consensus\\ $\to$ we can implement it in Garage! 
\end{itemize} @@ -625,7 +626,10 @@ \item \textbf{Performance issues:} \vspace{1em} \begin{itemize} - \item The leader is a \textbf{bottleneck} for all requests + \item Theoretical requirements (RNG, failure detector) translate into \textbf{practical costs} + \vspace{1em} + \item The leader is a \textbf{bottleneck} for all requests;\\ + even in leaderless approaches, \textbf{all nodes must process all operations in order} \vspace{1em} \item Particularly \textbf{sensitive to higher latency} between nodes \end{itemize} @@ -746,10 +750,19 @@ \end{frame} \begin{frame} - \frametitle{The hard parts we don't address (yet!)} + \frametitle{A hard problem: layout changes} \begin{itemize} - \item Maintain consistency changes when nodes assigned to a partition change:\\ - \item TODO + \item We rely on quorums $k > n/2$ within each partition:\\ + $$n=3,~~~~~~~k\ge 2$$ + \item When rebalancing, the set of nodes responsible for a partition can change:\\ + $$\{n_A, n_B, n_C\} \to \{n_A, n_D, n_E\}$$ + \vspace{.01em} + \item During the rebalancing, $D$ and $E$ don't yet have the data,\\ + ~~~~~~~~~~~~~~~~~~~and $B$ and $C$ want to get rid of the data to free up space\\ + \vspace{.2em} + $\to$ quorums only within the new set of nodes don't work\\ + $\to$ how to coordinate? 
\textbf{currently, we don't...} + + \end{itemize} \end{frame} -- cgit v1.2.3 From 065d6e1e06e97d60443b78ec2ac5da0bb2abb760 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 13 Jan 2023 13:51:39 +0100 Subject: Talk about K2V specifics --- doc/talks/2023-01-18-tocatta/talk.pdf | Bin 2490671 -> 2497912 bytes doc/talks/2023-01-18-tocatta/talk.tex | 74 ++++++++++++++++++++++++++++------ 2 files changed, 62 insertions(+), 12 deletions(-) (limited to 'doc') diff --git a/doc/talks/2023-01-18-tocatta/talk.pdf b/doc/talks/2023-01-18-tocatta/talk.pdf index 3d0c8830..9522f8b0 100644 Binary files a/doc/talks/2023-01-18-tocatta/talk.pdf and b/doc/talks/2023-01-18-tocatta/talk.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/talk.tex b/doc/talks/2023-01-18-tocatta/talk.tex index 09250cf1..1a5b18a8 100644 --- a/doc/talks/2023-01-18-tocatta/talk.tex +++ b/doc/talks/2023-01-18-tocatta/talk.tex @@ -780,17 +780,73 @@ \begin{frame} \frametitle{K2V Design} \begin{itemize} - \item A new, custom, minimal API + \item A new, custom, minimal API\\ + \vspace{.5em} + \begin{itemize} + \item Single-item operations + \item Operations on ranges and batches of items + \item Polling operations to help implement a PubSub pattern + \end{itemize} \vspace{1em} \item<2-> Exposes the partitioning mechanism of Garage\\ K2V = partition key / sort key / value (like Dynamo) \vspace{1em} - \item<3-> Coordination-free, CRDT-friendly (inspired by Riak)\\ + \item<3-> Weakly consistent, CRDT-friendly\\ + $\to$ no support for transactions (not ACID) \vspace{1em} \item<4-> Cryptography-friendly: values are binary blobs \end{itemize} \end{frame} +\begin{frame} + \frametitle{Handling concurrent values} + \textbf{How to handle concurrency?} Example: + \vspace{1em} + \begin{enumerate} + \item Client $A$ reads the initial value of a key, $x_0$ + \vspace{1em} + \item<2-> Client $B$ also reads the initial value $x_0$ of that key + \vspace{1em} + \item<3-> Client $A$ modifies $x_0$, and writes a new value $x_1$ + 
\vspace{1em} + \item<4-> Client $B$ also modifies $x_0$, and writes a new value $x'_1$,\\ + without having a chance to first read $x_1$\\ + \vspace{1em} + $\to$ what should the final state be? + \end{enumerate} +\end{frame} + +\begin{frame} + \frametitle{Handling concurrent values} + \begin{itemize} + \item If we keep only $x_1$ or $x'_1$, we risk \textbf{losing application data} + \vspace{1.5em} + \item Values are opaque binary blobs, \textbf{K2V cannot resolve conflicts} by itself\\ + (e.g. by implementing a CRDT) + \vspace{1.5em} + \item Solution: \textbf{we keep both!}\\ + $\to$ the value of the key is now $\{x_1, x'_1\}$\\ + $\to$ the client application can decide how to resolve conflicts on the next read + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Keeping track of causality} + How does K2V know that $x_1$ and $x'_1$ are concurrent? + \vspace{1em} + \begin{itemize} + \item $read()$ returns \textbf{a set of values} and an associated \textbf{causality token}\\ + \vspace{1.5em} + \item When calling $write()$, the client sends \textbf{the causality token from its last read} + \vspace{1.5em} + \item The causality token represents the set of values \textbf{already seen by the client}\\ + $\to$ those values are the \textbf{causal past} of the write operation\\ + $\to$ K2V can keep concurrent values and overwrite all those in the causal past + \vspace{1.5em} + \item Internally, the causality token is \textbf{a vector clock} + \end{itemize} +\end{frame} + \begin{frame} \frametitle{Application: an e-mail storage server} \begin{center} @@ -800,7 +856,7 @@ \begin{frame} \frametitle{A new model for building resilient software} - \begin{itemize} + \begin{enumerate} \item Design a data model suited to K2V\\ {\footnotesize (see Cassandra docs on porting SQL data models to Cassandra)} \vspace{1em} @@ -810,22 +866,16 @@ \item Store opaque binary blobs to provide End-to-End Encryption\\ \end{itemize} \vspace{1em} - \item Store big blobs (files) in S3 + \item 
Store big blobs (files) using the S3 API \vspace{1em} \item Let Garage manage sharding, replication, failover, etc. - \end{itemize} + \end{enumerate} \end{frame} \begin{frame} \frametitle{Research perspectives} \begin{itemize} - \item Write about Garage's global architecture \emph{(paper in progress)} - \vspace{1em} - \item Measure and improve Garage's performances - \vspace{1em} - \item Discuss the optimal layout algorithm, provide proofs - \vspace{1em} - \item Write about our proposed architecture for (E2EE) apps over K2V+S3 + \item TODO \end{itemize} \end{frame} -- cgit v1.2.3 From 0010f705ef6d6816cb819d4d30417e56ddc7a209 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 13 Jan 2023 15:28:17 +0100 Subject: Talk for 2023-01-18 pretty much finished --- doc/talks/2023-01-18-tocatta/talk.pdf | Bin 2497912 -> 2632153 bytes doc/talks/2023-01-18-tocatta/talk.tex | 73 ++++++++++++++++++++++++---------- 2 files changed, 51 insertions(+), 22 deletions(-) (limited to 'doc') diff --git a/doc/talks/2023-01-18-tocatta/talk.pdf b/doc/talks/2023-01-18-tocatta/talk.pdf index 9522f8b0..c3265542 100644 Binary files a/doc/talks/2023-01-18-tocatta/talk.pdf and b/doc/talks/2023-01-18-tocatta/talk.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/talk.tex b/doc/talks/2023-01-18-tocatta/talk.tex index 1a5b18a8..7fad6065 100644 --- a/doc/talks/2023-01-18-tocatta/talk.tex +++ b/doc/talks/2023-01-18-tocatta/talk.tex @@ -187,7 +187,7 @@ \vspace{1em} \item Folder hierarchies \vspace{1em} - \item Other requirements of the POSIX spec + \item Other requirements of the POSIX spec (e.g.~locks) \end{itemize} \vspace{1em} Coordination in a distributed system is costly @@ -291,7 +291,7 @@ \frametitle{Key-value stores, upgraded: the Dynamo model} \textbf{Two keys:} \begin{itemize} - \item Partition key: used to divide data into partitions (shards) + \item Partition key: used to divide data into partitions {\small (a.k.a.~shards)} \item Sort key: used to identify items inside a partition 
\end{itemize} @@ -326,7 +326,7 @@ \begin{frame} \frametitle{Key-value stores, upgraded: the Dynamo model} \begin{itemize} - \item Data with different partition keys is stored independantly,\\ + \item Data with different partition keys is stored independently,\\ on a different set of nodes\\ \vspace{.5em} $\to$ no easy way to list all partition keys\\ @@ -520,7 +520,7 @@ \vspace{1em} - Require \textbf{additionnal assumptions} such as a fault detector or a strong RNG\\ + Require \textbf{additional assumptions} such as a fault detector or a strong RNG\\ (FLP impossibility theorem) \end{minipage} \hfill @@ -608,7 +608,7 @@ $\to$ the API is equivalent to consensus/total ordering of messages\\ $\to$ the API cannot be implemented in a weakly consistent system \vspace{2em} - \item \textbf{This API can be implemented using only weak primitives}\\ + \item<2-> \textbf{This API can be implemented using only weak primitives}\\ (e.g. in the asynchronous message passing model with no further assumption)\\ $\to$ the API is strictly weaker than consensus\\ $\to$ we can implement it in Garage! 
@@ -648,13 +648,13 @@ \begin{itemize} \item Any \textbf{conflict-free replicated data type} (CRDT) \vspace{1em} - \item Non-transactional key-value stores such as S3 are equivalent to a simple CRDT:\\ - a \textbf{last-writer-wins registry} + \item<2-> Non-transactional key-value stores such as S3 are equivalent to a simple CRDT:\\ + a map of \textbf{last-writer-wins registers} (each key is its own CRDT) \vspace{1em} - \item \textbf{Read-after-write consistency} can be implemented + \item<3-> \textbf{Read-after-write consistency} can be implemented using quorums on read and write operations \vspace{1em} - \item \textbf{Monotonicity of reads} can be implemented with repair-on-read\\ + \item<4-> \textbf{Monotonicity of reads} can be implemented with repair-on-read\\ (makes reads more costly, not implemented in Garage) \end{itemize} \end{frame} @@ -735,7 +735,7 @@ \vspace{1em} - \textbf{Algorithm $read()$:} + \textbf{Algorithm $monotonic\_read()$:} {\small (a.k.a. repair-on-read)} \begin{enumerate} \item Broadcast $read()$ to all nodes \item Wait for $k > n/2$ nodes to reply with values $x_1, \dots, x_k$ @@ -754,10 +754,10 @@ \begin{itemize} \item We rely on quorums $k > n/2$ within each partition:\\ $$n=3,~~~~~~~k\ge 2$$ - \item When rebalancing, the set of nodes responsible for a partition can change:\\ + \item<2-> When rebalancing, the set of nodes responsible for a partition can change:\\ $$\{n_A, n_B, n_C\} \to \{n_A, n_D, n_E\}$$ \vspace{.01em} - \item During the rebalancing, $D$ and $E$ don't yet have the data,\\ + \item<3-> During the rebalancing, $D$ and $E$ don't yet have the data,\\ ~~~~~~~~~~~~~~~~~~~and $B$ and $C$ want to get rid of the data to free up space\\ \vspace{.2em} $\to$ quorums only within the new set of nodes don't work\\ @@ -769,7 +769,7 @@ \section{Going further than the S3 API} \begin{frame} - \frametitle{Further plans for Garage} + \frametitle{Using Garage for everything} \begin{center} 
\only<1>{\includegraphics[width=.8\linewidth]{assets/slideB1.png}}% \only<2>{\includegraphics[width=.8\linewidth]{assets/slideB2.png}}% @@ -821,10 +821,10 @@ \begin{itemize} \item If we keep only $x_1$ or $x'_1$, we risk \textbf{losing application data} \vspace{1.5em} - \item Values are opaque binary blobs, \textbf{K2V cannot resolve conflicts} by itself\\ + \item<2-> Values are opaque binary blobs, \textbf{K2V cannot resolve conflicts} by itself\\ (e.g. by implementing a CRDT) \vspace{1.5em} - \item Solution: \textbf{we keep both!}\\ + \item<3-> Solution: \textbf{we keep both!}\\ $\to$ the value of the key is now $\{x_1, x'_1\}$\\ $\to$ the client application can decide how to resolve conflicts on the next read \end{itemize} @@ -837,13 +837,13 @@ \begin{itemize} \item $read()$ returns \textbf{a set of values} and an associated \textbf{causality token}\\ \vspace{1.5em} - \item When calling $write()$, the client sends \textbf{the causality token from its last read} + \item<2-> When calling $write()$, the client sends \textbf{the causality token from its last read} \vspace{1.5em} - \item The causality token represents the set of values \textbf{already seen by the client}\\ + \item<3-> The causality token represents the set of values \textbf{already seen by the client}\\ $\to$ those values are the \textbf{causal past} of the write operation\\ $\to$ K2V can keep concurrent values and overwrite all those in the causal past \vspace{1.5em} - \item Internally, the causality token is \textbf{a vector clock} + \item<4-> Internally, the causality token is \textbf{a vector clock} \end{itemize} \end{frame} @@ -854,8 +854,28 @@ \end{center} \end{frame} +\begin{frame} + \frametitle{Aerogramme data model} + \begin{center} + \only<1>{\includegraphics[width=.4\linewidth]{assets/aerogramme_datatype.drawio.pdf}}% + \only<2->{\includegraphics[width=.9\linewidth]{assets/aerogramme_keys.drawio.pdf}\vspace{1em}}% + \end{center} + \visible<3->{Aerogramme encrypts all stored values for
privacy\\ + (Garage server administrators can't read your mail)} +\end{frame} + +\begin{frame} + \frametitle{Different deployment scenarios} + \begin{center} + \only<1>{\includegraphics[width=.9\linewidth]{assets/aerogramme_components1.drawio.pdf}}% + \only<2>{\includegraphics[width=.9\linewidth]{assets/aerogramme_components2.drawio.pdf}}% + \end{center} +\end{frame} + \begin{frame} \frametitle{A new model for building resilient software} + How to build an application using only Garage as a data store: + \vspace{1em} \begin{enumerate} \item Design a data model suited to K2V\\ {\footnotesize (see Cassandra docs on porting SQL data models to Cassandra)} @@ -866,16 +886,25 @@ \item Store opaque binary blobs to provide End-to-End Encryption\\ \end{itemize} \vspace{1em} - \item Store big blobs (files) using the S3 API + \item<2-> Store big blobs (files) using the S3 API \vspace{1em} - \item Let Garage manage sharding, replication, failover, etc. + \item<3-> Let Garage manage sharding, replication, failover, etc. \end{enumerate} \end{frame} +\section{Conclusion} + \begin{frame} - \frametitle{Research perspectives} + \frametitle{Perspectives} \begin{itemize} - \item TODO + \item Fix the consistency issue when rebalancing + \vspace{1em} + \item Write about Garage's architecture and properties,\\ + and about our proposed architecture for (E2EE) apps over K2V+S3 + \vspace{1em} + \item Continue developing Garage; finish Aerogramme; build new applications... + \vspace{1em} + \item Anything else? \end{itemize} \end{frame} -- cgit v1.2.3 From 97bb1102193e274208ed1db7dc2a8b9445959f94 Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Fri, 13 Jan 2023 14:12:02 +0000 Subject: doc: Added observability.md. 
--- doc/book/connect/_index.md | 7 ++--- doc/book/connect/observability.md | 57 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 3 deletions(-) create mode 100644 doc/book/connect/observability.md (limited to 'doc') diff --git a/doc/book/connect/_index.md b/doc/book/connect/_index.md index ca44ac17..93a2b87e 100644 --- a/doc/book/connect/_index.md +++ b/doc/book/connect/_index.md @@ -10,11 +10,12 @@ Garage implements the Amazon S3 protocol, which makes it compatible with many ex In particular, you will find here instructions to connect it with: - - [Browsing tools](@/documentation/connect/cli.md) - [Applications](@/documentation/connect/apps/index.md) - - [Website hosting](@/documentation/connect/websites.md) - - [Software repositories](@/documentation/connect/repositories.md) + - [Browsing tools](@/documentation/connect/cli.md) - [FUSE](@/documentation/connect/fs.md) + - [Observability](@/documentation/connect/observability.md) + - [Software repositories](@/documentation/connect/repositories.md) + - [Website hosting](@/documentation/connect/websites.md) ### Generic instructions diff --git a/doc/book/connect/observability.md b/doc/book/connect/observability.md new file mode 100644 index 00000000..c5037fa4 --- /dev/null +++ b/doc/book/connect/observability.md @@ -0,0 +1,57 @@ ++++ +title = "Observability" +weight = 25 ++++ + +An object store can be used as a data storage location for metrics and logs, which +can then be leveraged for systems observability. + +## Metrics + +### Prometheus + +Prometheus itself has no object store capabilities; however, two projects exist +which support storing metrics in an object store: + + - [Cortex](https://cortexmetrics.io/) + - [Thanos](https://thanos.io/) + +## System logs + +### Vector + +[Vector](https://vector.dev/) natively supports S3 as a +[data sink](https://vector.dev/docs/reference/configuration/sinks/aws_s3/) +(and [source](https://vector.dev/docs/reference/configuration/sources/aws_s3/)).
+ +This can be configured with Garage with the following: + +```bash +garage key new --name vector-system-logs +garage bucket create system-logs +garage bucket allow system-logs --read --write --key vector-system-logs +``` + +The `vector.toml` can then be configured as follows: + +```toml +[sources.journald] +type = "journald" +current_boot_only = true + +[sinks.out] +encoding.codec = "json" +type = "aws_s3" +inputs = [ "journald" ] +bucket = "system-logs" +key_prefix = "%F/" +compression = "none" +region = "garage" +endpoint = "https://my-garage-instance.mydomain.tld" +auth.access_key_id = "" +auth.secret_access_key = "" +``` + +This is an example configuration - please refer to the Vector documentation for +all configuration and transformation possibilities. Also note that Garage +performs its own compression, so this should be disabled in Vector. -- cgit v1.2.3 From fcc5033466e58e3beec05ee7748d33522b6b32b0 Mon Sep 17 00:00:00 2001 From: Mike Coleman Date: Mon, 16 Jan 2023 23:57:23 -0800 Subject: Change some integer types to int64 Modified integer types representing byte or object count to int64 to prevent overflow. 
--- doc/api/garage-admin-v0.yml | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'doc') diff --git a/doc/api/garage-admin-v0.yml b/doc/api/garage-admin-v0.yml index a841f8d9..51968894 100644 --- a/doc/api/garage-admin-v0.yml +++ b/doc/api/garage-admin-v0.yml @@ -678,10 +678,12 @@ paths: properties: maxSize: type: integer + format: int64 nullable: true example: 19029801 maxObjects: type: integer + format: int64 nullable: true example: null @@ -1158,9 +1160,11 @@ components: $ref: '#/components/schemas/BucketKeyInfo' objects: type: integer + format: int64 example: 14827 bytes: type: integer + format: int64 example: 13189855625 unfinishedUploads: type: integer @@ -1171,10 +1175,12 @@ components: maxSize: nullable: true type: integer + format: int64 example: null maxObjects: nullable: true type: integer + format: int64 example: null -- cgit v1.2.3 From 3250be7c48b0789d864e43dc44e238eb7c939500 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 18 Jan 2023 15:25:04 +0100 Subject: Update tocatta talk, add talks shell.nix and .envrc --- doc/talks/.envrc | 1 + .../assets/aerogramme_components1.drawio.pdf | Bin 0 -> 31966 bytes .../assets/aerogramme_components2.drawio.pdf | Bin 0 -> 31688 bytes .../assets/aerogramme_datatype.drawio.pdf | Bin 0 -> 31073 bytes .../assets/aerogramme_keys.drawio.pdf | Bin 0 -> 25145 bytes .../2023-01-18-tocatta/assets/garage.drawio.pdf | Bin 0 -> 26098 bytes .../2023-01-18-tocatta/assets/garage.drawio.png | Bin 0 -> 13463 bytes doc/talks/2023-01-18-tocatta/talk.pdf | Bin 2632153 -> 2657696 bytes doc/talks/2023-01-18-tocatta/talk.tex | 7 +++++++ doc/talks/shell.nix | 12 ++++++++++++ 10 files changed, 20 insertions(+) create mode 100644 doc/talks/.envrc create mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.drawio.pdf create mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.drawio.pdf create mode 100644 doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.drawio.pdf create mode 100644 
doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.drawio.pdf create mode 100644 doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf create mode 100644 doc/talks/2023-01-18-tocatta/assets/garage.drawio.png create mode 100644 doc/talks/shell.nix (limited to 'doc') diff --git a/doc/talks/.envrc b/doc/talks/.envrc new file mode 100644 index 00000000..4a4726a5 --- /dev/null +++ b/doc/talks/.envrc @@ -0,0 +1 @@ +use_nix diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.drawio.pdf new file mode 100644 index 00000000..71a90f26 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components1.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.drawio.pdf new file mode 100644 index 00000000..87e42eed Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_components2.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.drawio.pdf new file mode 100644 index 00000000..0606e059 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_datatype.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.drawio.pdf new file mode 100644 index 00000000..8fea81c7 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/aerogramme_keys.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf new file mode 100644 index 00000000..a54a163c Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png 
b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png new file mode 100644 index 00000000..386dd862 Binary files /dev/null and b/doc/talks/2023-01-18-tocatta/assets/garage.drawio.png differ diff --git a/doc/talks/2023-01-18-tocatta/talk.pdf b/doc/talks/2023-01-18-tocatta/talk.pdf index c3265542..97966061 100644 Binary files a/doc/talks/2023-01-18-tocatta/talk.pdf and b/doc/talks/2023-01-18-tocatta/talk.pdf differ diff --git a/doc/talks/2023-01-18-tocatta/talk.tex b/doc/talks/2023-01-18-tocatta/talk.tex index 7fad6065..43399d8e 100644 --- a/doc/talks/2023-01-18-tocatta/talk.tex +++ b/doc/talks/2023-01-18-tocatta/talk.tex @@ -266,6 +266,13 @@ \end{frame} +\begin{frame} + \frametitle{Garage's architecture} + \begin{center} + \includegraphics[width=.35\linewidth]{assets/garage.drawio.pdf} + \end{center} +\end{frame} + \begin{frame} \frametitle{Two big problems} \begin{enumerate} diff --git a/doc/talks/shell.nix b/doc/talks/shell.nix new file mode 100644 index 00000000..161a61e1 --- /dev/null +++ b/doc/talks/shell.nix @@ -0,0 +1,12 @@ +{ pkgs ? import <nixpkgs> { } }: +let + latex = (pkgs.texlive.combine { + inherit (pkgs.texlive) + scheme-basic + beamer amsmath mathtools breqn + environ + multirow graphics import adjustbox tabu vwcol stmaryrd ulem ragged2e + dvisvgm dvipng wrapfig hyperref capt-of; + }); +in pkgs.mkShell { nativeBuildInputs = [ pkgs.gnumake latex ]; } + -- cgit v1.2.3 From f2492107d7858882adf386f8925829659755f1e5 Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Wed, 25 Jan 2023 12:00:01 +0000 Subject: cookbook/real-world.md: Added note about mesh network options.
--- doc/book/cookbook/real-world.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'doc') diff --git a/doc/book/cookbook/real-world.md b/doc/book/cookbook/real-world.md index 5423bbab..9be1ba44 100644 --- a/doc/book/cookbook/real-world.md +++ b/doc/book/cookbook/real-world.md @@ -19,8 +19,12 @@ To run a real-world deployment, make sure the following conditions are met: - You have at least three machines with sufficient storage space available. -- Each machine has a public IP address which is reachable by other machines. - Running behind a NAT is likely to be possible but hasn't been tested for the latest version (TODO). +- Each machine has a public IP address which is reachable by other machines. It + is highly recommended that you use IPv6 for this end-to-end connectivity. If + IPv6 is not available, then using a mesh VPN such as + [Nebula](https://github.com/slackhq/nebula) or + [Yggdrasil](https://yggdrasil-network.github.io/) is an approach to consider + in addition to building out your own VPN tunneling. - This guide will assume you are using Docker containers to deploy Garage on each node. Garage can also be run independently, for instance as a [Systemd service](@/documentation/cookbook/systemd.md). -- cgit v1.2.3