From f5afa3d974e631de75c438cf2941a88440e2cf69 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 2 Feb 2022 10:07:26 +0100 Subject: Documentation from garage submodule --- content/documentation/working-documents | 1 + content/documentation/working-documents/_index.md | 14 -- .../working-documents/compatibility_target.md | 109 ----------- .../working-documents/design_draft.md | 166 ----------------- .../working-documents/load_balancing.md | 203 --------------------- .../working-documents/migration_04.md | 109 ----------- .../working-documents/migration_06.md | 50 ----- 7 files changed, 1 insertion(+), 651 deletions(-) create mode 120000 content/documentation/working-documents delete mode 100644 content/documentation/working-documents/_index.md delete mode 100644 content/documentation/working-documents/compatibility_target.md delete mode 100644 content/documentation/working-documents/design_draft.md delete mode 100644 content/documentation/working-documents/load_balancing.md delete mode 100644 content/documentation/working-documents/migration_04.md delete mode 100644 content/documentation/working-documents/migration_06.md (limited to 'content/documentation/working-documents') diff --git a/content/documentation/working-documents b/content/documentation/working-documents new file mode 120000 index 0000000..3ca002f --- /dev/null +++ b/content/documentation/working-documents @@ -0,0 +1 @@ +../../garage/doc/book/working-documents \ No newline at end of file diff --git a/content/documentation/working-documents/_index.md b/content/documentation/working-documents/_index.md deleted file mode 100644 index cb78fc4..0000000 --- a/content/documentation/working-documents/_index.md +++ /dev/null @@ -1,14 +0,0 @@ -+++ -title = "Working Documents" -weight = 7 -sort_by = "weight" -template = "documentation.html" -+++ - - -Working documents are documents that reflect the fact that Garage is a software that evolves quickly. 
-They are a way to communicate our ideas, our changes, and so on before or while we are implementing them in Garage. -If you like to live on the edge, it could also serve as a documentation of our next features to be released. - -Ideally, once the feature/patch has been merged, the working document should serve as a source to -update the rest of the documentation and then be removed. diff --git a/content/documentation/working-documents/compatibility_target.md b/content/documentation/working-documents/compatibility_target.md deleted file mode 100644 index 6ac47cc..0000000 --- a/content/documentation/working-documents/compatibility_target.md +++ /dev/null @@ -1,109 +0,0 @@ -+++ -title = "S3 compatibility target" -weight = 5 -+++ - - -If there is a specific S3 functionnality you have a need for, feel free to open -a PR to put the corresponding endpoints higher in the list. Please explain -your motivations for doing so in the PR message. - -| Priority | Endpoints | -| -------------------------- | --------- | -| **S-tier** (high priority) | | -| | HeadBucket | -| | GetBucketLocation | -| | CreateBucket | -| | DeleteBucket | -| | ListBuckets | -| | ListObjects | -| | ListObjectsV2 | -| | HeadObject | -| | GetObject | -| | PutObject | -| | CopyObject | -| | DeleteObject | -| | DeleteObjects | -| | CreateMultipartUpload | -| | CompleteMultipartUpload | -| | AbortMultipartUpload | -| | UploadPart | -| | [*ListMultipartUploads*](https://git.deuxfleurs.fr/Deuxfleurs/garage/issues/103) | -| | [*ListParts*](https://git.deuxfleurs.fr/Deuxfleurs/garage/issues/103) | -| **A-tier** (will implement) | | -| | [*GetBucketCors*](https://git.deuxfleurs.fr/Deuxfleurs/garage/issues/138) | -| | [*PutBucketCors*](https://git.deuxfleurs.fr/Deuxfleurs/garage/issues/138) | -| | [*DeleteBucketCors*](https://git.deuxfleurs.fr/Deuxfleurs/garage/issues/138) | -| | UploadPartCopy | -| | GetBucketWebsite | -| | PutBucketWebsite | -| | DeleteBucketWebsite | -| ~~~~~~~~~~~~~~~~~~~~~~~~~~ | 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | -| **B-tier** | | -| | GetBucketAcl | -| | PutBucketAcl | -| | GetObjectLockConfiguration | -| | PutObjectLockConfiguration | -| | GetObjectRetention | -| | PutObjectRetention | -| | GetObjectLegalHold | -| | PutObjectLegalHold | -| **C-tier** | | -| | GetBucketVersioning | -| | PutBucketVersioning | -| | ListObjectVersions | -| | GetObjectAcl | -| | PutObjectAcl | -| | GetBucketLifecycleConfiguration | -| | PutBucketLifecycleConfiguration | -| | DeleteBucketLifecycle | -| **garbage-tier** | | -| | DeleteBucketEncryption | -| | DeleteBucketAnalyticsConfiguration | -| | DeleteBucketIntelligentTieringConfiguration | -| | DeleteBucketInventoryConfiguration | -| | DeleteBucketMetricsConfiguration | -| | DeleteBucketOwnershipControls | -| | DeleteBucketPolicy | -| | DeleteBucketReplication | -| | DeleteBucketTagging | -| | DeleteObjectTagging | -| | DeletePublicAccessBlock | -| | GetBucketAccelerateConfiguration | -| | GetBucketAnalyticsConfiguration | -| | GetBucketEncryption | -| | GetBucketIntelligentTieringConfiguration | -| | GetBucketInventoryConfiguration | -| | GetBucketLogging | -| | GetBucketMetricsConfiguration | -| | GetBucketNotificationConfiguration | -| | GetBucketOwnershipControls | -| | GetBucketPolicy | -| | GetBucketPolicyStatus | -| | GetBucketReplication | -| | GetBucketRequestPayment | -| | GetBucketTagging | -| | GetObjectTagging | -| | GetObjectTorrent | -| | GetPublicAccessBlock | -| | ListBucketAnalyticsConfigurations | -| | ListBucketIntelligentTieringConfigurations | -| | ListBucketInventoryConfigurations | -| | ListBucketMetricsConfigurations | -| | PutBucketAccelerateConfiguration | -| | PutBucketAnalyticsConfiguration | -| | PutBucketEncryption | -| | PutBucketIntelligentTieringConfiguration | -| | PutBucketInventoryConfiguration | -| | PutBucketLogging | -| | PutBucketMetricsConfiguration | -| | PutBucketNotificationConfiguration | -| | PutBucketOwnershipControls | -| | PutBucketPolicy | -| | 
PutBucketReplication | -| | PutBucketRequestPayment | -| | PutBucketTagging | -| | PutObjectTagging | -| | PutPublicAccessBlock | -| | RestoreObject | -| | SelectObjectContent | diff --git a/content/documentation/working-documents/design_draft.md b/content/documentation/working-documents/design_draft.md deleted file mode 100644 index 78e82c3..0000000 --- a/content/documentation/working-documents/design_draft.md +++ /dev/null @@ -1,166 +0,0 @@ -+++ -title = "Design draft" -weight = 25 -+++ - - -**WARNING: this documentation is a design draft which was written before Garage's actual implementation. -The general principle are similar, but details have not been updated.** - - -#### Modules - -- `membership/`: configuration, membership management (gossip of node's presence and status), ring generation --> what about Serf (used by Consul/Nomad) : https://www.serf.io/? Seems a huge library with many features so maybe overkill/hard to integrate -- `metadata/`: metadata management -- `blocks/`: block management, writing, GC and rebalancing -- `internal/`: server to server communication (HTTP server and client that reuses connections, TLS if we want, etc) -- `api/`: S3 API -- `web/`: web management interface - -#### Metadata tables - -**Objects:** - -- *Hash key:* Bucket name (string) -- *Sort key:* Object key (string) -- *Sort key:* Version timestamp (int) -- *Sort key:* Version UUID (string) -- Complete: bool -- Inline: bool, true for objects < threshold (say 1024) -- Object size (int) -- Mime type (string) -- Data for inlined objects (blob) -- Hash of first block otherwise (string) - -*Having only a hash key on the bucket name will lead to storing all file entries of this table for a specific bucket on a single node. 
At the same time, it is the only way I see to be able to rapidly list all bucket entries...* - -**Blocks:** - -- *Hash key:* Version UUID (string) -- *Sort key:* Offset of block in total file (int) -- Hash of data block (string) - -A version is defined by the existence of at least one entry in the blocks table for a certain version UUID. -We must keep the following invariant: if a version exists in the blocks table, it has to be referenced in the objects table. -We explicitly manage concurrent versions of an object: the version timestamp and version UUID columns are index columns, thus we may have several concurrent versions of an object. -Important: before deleting an older version from the objects table, we must make sure that we did a successful delete of the blocks of that version from the blocks table. - -Thus, the workflow for reading an object is as follows: - -1. Check permissions (LDAP) -2. Read entry in object table. If data is inline, we have its data, stop here. - -> if several versions, take newest one and launch deletion of old ones in background -3. Read first block from cluster. If size <= 1 block, stop here. -4. Simultaneously with previous step, if size > 1 block: query the Blocks table for the IDs of the next blocks -5. Read subsequent blocks from cluster - -Workflow for PUT: - -1. Check write permission (LDAP) -2. Select a new version UUID -3. Write a preliminary entry for the new version in the objects table with complete = false -4. Send blocks to cluster and write entries in the blocks table -5. Update the version with complete = true and all of the accurate information (size, etc) -6. Return success to the user -7. Launch a background job to check and delete older versions - -Workflow for DELETE: - -1. Check write permission (LDAP) -2. Get current version (or versions) in object table -3. Do the deletion of those versions NOT IN A BACKGROUND JOB THIS TIME -4. 
Return success to the user if we were able to delete blocks from the blocks table and entries from the object table - -To delete a version: - -1. List the blocks from Cassandra -2. For each block, delete it from cluster. Don't care if some deletions fail, we can do GC. -3. Delete all of the blocks from the blocks table -4. Finally, delete the version from the objects table - -Known issue: if someone is reading from a version that we want to delete and the object is big, the read might be interrupted. I think it is ok to leave it like this, we just cut the connection if data disappears during a read. - -("Soit P un problème, on s'en fout est une solution à ce problème") - -#### Block storage on disk - -**Blocks themselves:** - -- file path = /blobs/(first 3 hex digits of hash)/(rest of hash) - -**Reverse index for GC & other block-level metadata:** - -- file path = /meta/(first 3 hex digits of hash)/(rest of hash) -- map block hash -> set of version UUIDs where it is referenced - -Useful metadata: - -- list of versions that reference this block in the Cassandra table, so that we can do GC by checking in Cassandra that the lines still exist -- list of other nodes that we know have acknowledged a write of this block, useful in the rebalancing algorithm - -Write strategy: have a single thread that does all write IO so that it is serialized (or have several threads that manage independent parts of the hash space). When writing a blob, write it to a temporary file, close, then rename so that a concurrent read gets a consistent result (either not found or found with whole content). - -Read strategy: the only read operation is get(hash) that returns either the data or not found (can do a corruption check as well and return corrupted state if it is the case). Can be done concurrently with writes. 
- -**Internal API:** - -- get(block hash) -> ok+data/not found/corrupted -- put(block hash & data, version uuid + offset) -> ok/error -- put with no data(block hash, version uuid + offset) -> ok/not found plz send data/error -- delete(block hash, version uuid + offset) -> ok/error - -GC: when last ref is deleted, delete block. -Long GC procedure: check in Cassandra that version UUIDs still exist and reference this block. - -Rebalancing: takes as argument the list of newly added nodes. - -- List all blocks that we have. For each block: -- If it hits a newly introduced node, send it to them. - Use put with no data first to check if it has to be sent to them already or not. - Use a random listing order to avoid race conditions (they do no harm but we might have two nodes sending the same thing at the same time thus wasting time). -- If it doesn't hit us anymore, delete it and its reference list. - -Only one balancing can be running at the same time. It can be restarted at the beginning with new parameters. 
- -#### Membership management - -Two sets of nodes: - -- set of nodes from which a ping was recently received, with status: number of stored blocks, request counters, error counters, GC%, rebalancing% - (eviction from this set after say 30 seconds without ping) -- set of nodes that are part of the system, explicitly modified by the operator using the web UI (persisted to disk), - is a CRDT using a version number for the value of the whole set - -Thus, three states for nodes: - -- healthy: in both sets -- missing: not pingable but part of desired cluster -- unused/draining: currently present but not part of the desired cluster, empty = if contains nothing, draining = if still contains some blocks - -Membership messages between nodes: - -- ping with current state + hash of current membership info -> reply with same info -- send&get back membership info (the ids of nodes that are in the two sets): used when no local membership change in a long time and membership info hash discrepancy detected with first message (passive membership fixing with full CRDT gossip) -- inform of newly pingable node(s) -> no result, when receive new info repeat to all (reliable broadcast) -- inform of operator membership change -> no result, when receive new info repeat to all (reliable broadcast) - -Ring: generated from the desired set of nodes, however when doing read/writes on the ring, skip nodes that are known to be not pingable. -The tokens are generated in a deterministic fashion from node IDs (hash of node id + token number from 1 to K). -Number K of tokens per node: decided by the operator & stored in the operator's list of nodes CRDT. Default value proposal: with node status information also broadcast disk total size and free space, and propose a default number of tokens equal to 80%Free space / 10Gb. (this is all user interface) - - -#### Constants - -- Block size: around 1MB ? 
--> Exoscale uses 16MB chunks -- Number of tokens in the hash ring: one every 10Gb of allocated storage -- Threshold for storing data directly in Cassandra objects table: 1kb (maybe up to 4kb?) -- Ping timeout (time after which a node is registered as unresponsive/missing): 30 seconds -- Ping interval: 10 seconds -- ?? - -#### Links - -- CDC: -- Erasure coding: -- [Openstack Storage Concepts](https://docs.openstack.org/arch-design/design-storage/design-storage-concepts.html) -- [RADOS](https://ceph.com/wp-content/uploads/2016/08/weil-rados-pdsw07.pdf) diff --git a/content/documentation/working-documents/load_balancing.md b/content/documentation/working-documents/load_balancing.md deleted file mode 100644 index ba208fa..0000000 --- a/content/documentation/working-documents/load_balancing.md +++ /dev/null @@ -1,203 +0,0 @@ -+++ -title = "Load balancing data" -weight = 10 -+++ - - -**This is still being improved in release 0.5. The working document has not been updated yet, it still only applies to Garage 0.2 through 0.4.** - -I have conducted a quick study of different methods to load-balance data over different Garage nodes using consistent hashing. 
- -## Requirements - -- *good balancing*: two nodes that have the same announced capacity should receive close to the same number of items - -- *multi-datacenter*: the replicas of a partition should be distributed over as many datacenters as possible - -- *minimal disruption*: when adding or removing a node, as few partitions as possible should have to move around - -- *order-agnostic*: the same set of nodes (each associated with a datacenter name - and a capacity) should always return the same distribution of partition - replicas, independently of the order in which nodes were added/removed (this - is to keep the implementation simple) - -## Methods - -### Naive multi-DC ring walking strategy - -This strategy can be used with any ring-like algorithm to make it aware of the *multi-datacenter* requirement: - -In this method, the ring is a list of positions, each associated with a single node in the cluster. -Partitions contain all the keys between two consecutive items of the ring. -To find the nodes that store replicas of a given partition: - -- select the node for the position of the partition's lower bound -- go clockwise on the ring, skipping nodes that: - - we have already selected - - are in a datacenter of a node we have selected, except if we already have nodes from all possible datacenters - -In this way the selected nodes will always be distributed over -`min(n_datacenters, n_replicas)` different datacenters, which is the best we -can do. - -This method was implemented in the first version of Garage, with the basic -ring construction from Dynamo DB that consists in associating `n_token` random positions to -each node (I know it's not optimal, the Dynamo paper already studies this). - -### Better rings - -The ring construction that selects `n_token` random positions for each node gives a ring of positions that -is not well-balanced: the space between the tokens varies a lot, and some partitions are thus bigger than others. 
-This problem was demonstrated in the original Dynamo DB paper. - -To solve this, we want to apply a better second method for partitioning our dataset: - -1. fix an initially large number of partitions (say 1024) with evenly-spaced delimiters, - -2. attribute each partition randomly to a node, with a probability - proportional to its capacity (which `n_tokens` represented in the first - method) - -For now we continue using the multi-DC ring walking described above. - -I have studied two ways to do the attribution of partitions to nodes, in a way that is deterministic: - -- Min-hash: for each partition, select node that minimizes `hash(node, partition_number)` -- MagLev: see [here](https://blog.acolyer.org/2016/03/21/maglev-a-fast-and-reliable-software-network-load-balancer/) - -MagLev provided significantly better balancing, as it guarantees that the exact -same number of partitions is attributed to all nodes that have the same -capacity (and that this number is proportional to the node's capacity, except -for large values), however in both cases: - -- the distribution is still bad, because we use the naive multi-DC ring walking - that behaves strangely due to interactions between consecutive positions on - the ring - -- the disruption in case of adding/removing a node is not as low as it can be, - as we show with the following method. - -A quick description of MagLev (backend = node, lookup table = ring): - -> The basic idea of Maglev hashing is to assign a preference list of all the -> lookup table positions to each backend. Then all the backends take turns -> filling their most-preferred table positions that are still empty, until the -> lookup table is completely filled in. Hence, Maglev hashing gives an almost -> equal share of the lookup table to each of the backends. 
Heterogeneous -> backend weights can be achieved by altering the relative frequency of the -> backends’ turns… - -Here are some stats (run `scripts/simulate_ring.py` to reproduce): - -``` -##### Custom-ring (min-hash) ##### - -#partitions per node (capacity in parenthesis): -- datura (8) : 227 -- digitale (8) : 351 -- drosera (8) : 259 -- geant (16) : 476 -- gipsie (16) : 410 -- io (16) : 495 -- isou (8) : 231 -- mini (4) : 149 -- mixi (4) : 188 -- modi (4) : 127 -- moxi (4) : 159 - -Variance of load distribution for load normalized to intra-class mean -(a class being the set of nodes with the same announced capacity): 2.18% <-- REALLY BAD - -Disruption when removing nodes (partitions moved on 0/1/2/3 nodes): -removing atuin digitale : 63.09% 30.18% 6.64% 0.10% -removing atuin drosera : 72.36% 23.44% 4.10% 0.10% -removing atuin datura : 73.24% 21.48% 5.18% 0.10% -removing jupiter io : 48.34% 38.48% 12.30% 0.88% -removing jupiter isou : 74.12% 19.73% 6.05% 0.10% -removing grog mini : 84.47% 12.40% 2.93% 0.20% -removing grog mixi : 80.76% 16.60% 2.64% 0.00% -removing grog moxi : 83.59% 14.06% 2.34% 0.00% -removing grog modi : 87.01% 11.43% 1.46% 0.10% -removing grisou geant : 48.24% 37.40% 13.67% 0.68% -removing grisou gipsie : 53.03% 33.59% 13.09% 0.29% -on average: 69.84% 23.53% 6.40% 0.23% <-- COULD BE BETTER - --------- - -##### MagLev ##### - -#partitions per node: -- datura (8) : 273 -- digitale (8) : 256 -- drosera (8) : 267 -- geant (16) : 452 -- gipsie (16) : 427 -- io (16) : 483 -- isou (8) : 272 -- mini (4) : 184 -- mixi (4) : 160 -- modi (4) : 144 -- moxi (4) : 154 - -Variance of load distribution: 0.37% <-- Already much better, but not optimal - -Disruption when removing nodes (partitions moved on 0/1/2/3 nodes): -removing atuin digitale : 62.60% 29.20% 7.91% 0.29% -removing atuin drosera : 65.92% 26.56% 7.23% 0.29% -removing atuin datura : 63.96% 27.83% 7.71% 0.49% -removing jupiter io : 44.63% 40.33% 14.06% 0.98% -removing jupiter isou : 63.38% 27.25% 
8.98% 0.39% -removing grog mini : 72.46% 21.00% 6.35% 0.20% -removing grog mixi : 72.95% 22.46% 4.39% 0.20% -removing grog moxi : 74.22% 20.61% 4.98% 0.20% -removing grog modi : 75.98% 18.36% 5.27% 0.39% -removing grisou geant : 46.97% 36.62% 15.04% 1.37% -removing grisou gipsie : 49.22% 36.52% 12.79% 1.46% -on average: 62.94% 27.89% 8.61% 0.57% <-- WORSE THAN PREVIOUSLY -``` - -### The magical solution: multi-DC aware MagLev - -Suppose we want to select three replicas for each partition (this is what we do in our simulation and in most Garage deployments). -We apply MagLev three times consecutively, one for each replica selection. -The first time is pretty much the same as normal MagLev, but for the following times, when a node runs through its preference -list to select a partition to replicate, we skip partitions for which adding this node would not bring datacenter-diversity. -More precisely, we skip a partition in the preference list if: - -- the node already replicates the partition (from one of the previous rounds of MagLev) -- the node is in a datacenter where a node already replicates the partition and there are other datacenters available - -Refer to `method4` in the simulation script for a formal definition. 
- -``` -##### Multi-DC aware MagLev ##### - -#partitions per node: -- datura (8) : 268 <-- NODES WITH THE SAME CAPACITY -- digitale (8) : 267 HAVE THE SAME NUM OF PARTITIONS -- drosera (8) : 267 (+- 1) -- geant (16) : 470 -- gipsie (16) : 472 -- io (16) : 516 -- isou (8) : 268 -- mini (4) : 136 -- mixi (4) : 136 -- modi (4) : 136 -- moxi (4) : 136 - -Variance of load distribution: 0.06% <-- CAN'T DO BETTER THAN THIS - -Disruption when removing nodes (partitions moved on 0/1/2/3 nodes): -removing atuin digitale : 65.72% 33.01% 1.27% 0.00% -removing atuin drosera : 64.65% 33.89% 1.37% 0.10% -removing atuin datura : 66.11% 32.62% 1.27% 0.00% -removing jupiter io : 42.97% 53.42% 3.61% 0.00% -removing jupiter isou : 66.11% 32.32% 1.56% 0.00% -removing grog mini : 80.47% 18.85% 0.68% 0.00% -removing grog mixi : 80.27% 18.85% 0.88% 0.00% -removing grog moxi : 80.18% 19.04% 0.78% 0.00% -removing grog modi : 79.69% 19.92% 0.39% 0.00% -removing grisou geant : 44.63% 52.15% 3.22% 0.00% -removing grisou gipsie : 43.55% 52.54% 3.91% 0.00% -on average: 64.94% 33.33% 1.72% 0.01% <-- VERY GOOD (VERY LOW VALUES FOR 2 AND 3 NODES) -``` diff --git a/content/documentation/working-documents/migration_04.md b/content/documentation/working-documents/migration_04.md deleted file mode 100644 index 302fdb1..0000000 --- a/content/documentation/working-documents/migration_04.md +++ /dev/null @@ -1,109 +0,0 @@ -+++ -title = "Migrating from 0.3 to 0.4" -weight = 20 -+++ - - -**Migrating from 0.3 to 0.4 is unsupported. This document is only intended to -document the process internally for the Deuxfleurs cluster where we have to do -it. Do not try it yourself, you will lose your data and we will not help you.** - -**Migrating from 0.2 to 0.4 will break everything for sure. Never try it.** - -The internal data format of Garage hasn't changed much between 0.3 and 0.4. -The Sled database is still the same, and the data directory as well. 
- -The following has changed, all in the meta directory: - -- `node_id` in 0.3 contains the identifier of the current node. In 0.4, this - file does nothing and should be deleted. It is replaced by `node_key` (the - secret key) and `node_key.pub` (the associated public key). A node's - identifier on the ring is its public key. - -- `peer_info` in 0.3 contains the list of peers saved automatically by Garage. - The format has changed and it is now stored in `peer_list` (`peer_info` - should be deleted). - -When migrating, all node identifiers will change. This also means that the -affectation of data partitions on the ring will change, and lots of data will -have to be rebalanced. - -- If your cluster has only 3 nodes, all nodes store everything, therefore nothing has to be rebalanced. - -- If your cluster has only 4 nodes, for any partition there will always be at - least 2 nodes that stored data before that still store it after. Therefore - the migration should in theory be transparent and Garage should continue to - work during the rebalance. - -- If your cluster has 5 or more nodes, data will disappear during the - migration. Do not migrate (fortunately we don't have this scenario at - Deuxfleurs), or if you do, make Garage unavailable until things stabilize - (disable web and api access). - - -The migration steps are as follows: - -1. Prepare a new configuration file for 0.4. For each node, point to the same - meta and data directories as Garage 0.3. Basically, the things that change - are the following: - - - No more `rpc_tls` section - - You have to generate a shared `rpc_secret` and put it in all config files - - `bootstrap_peers` has a different syntax as it has to contain node keys. 
- Leave it empty and use `garage node-id` and `garage node connect` instead (new features of 0.4) - - put the publicly accessible RPC address of your node in `rpc_public_addr` if possible (it's optional but recommended) - - If you are using Consul, change the `consul_service_name` to NOT be the name advertised by Nomad. - Now Garage is responsible for advertising its own service itself. - -2. Disable api and web access for some time (Garage does not support disabling - these endpoints but you can change the port number or stop your reverse - proxy for instance). - -3. Do `garage repair -a --yes tables` and `garage repair -a --yes blocks`, - check the logs and check that all data seems to be synced correctly between - nodes. - -4. Save somewhere the output of `garage status`. We will need this to remember - how to reconfigure nodes in 0.4. - -5. Turn off Garage 0.3 - -6. Back up metadata folders if you can (i.e. if you have space to do it - somewhere). Backing up data folders could also be useful but that's much - harder to do. If your filesystem supports snapshots, this could be a good - time to use them. - -7. Turn on Garage 0.4 - -8. At this point, running `garage status` should indicate that all nodes of the - previous cluster are "unavailable". The nodes have new identifiers that - should appear in healthy nodes once they can talk to one another (use - `garage node connect` if necessary). They should have NO ROLE ASSIGNED at - the moment. - -9. Prepare a script with several `garage node configure` commands that replace - each of the v0.3 node IDs with the corresponding v0.4 node ID, with the same - zone/tag/capacity. For example if your node `drosera` had identifier `c24e` - before and now has identifier `789a`, and it was configured with capacity - `2` in zone `dc1`, put the following command in your script: - -```bash -garage node configure 789a -z dc1 -c 2 -t drosera --replace c24e -``` - -10. Run your reconfiguration script. 
Check that the new output of `garage - status` contains the correct node IDs with the correct values for capacity - and zone. Old nodes should no longer be mentioned. - -11. If your cluster has 4 nodes or less, and you are feeling adventurous, you - can reenable Web and API access now. Things will probably work. - -12. Garage might already be resyncing stuff. Issue a `garage repair -a --yes - tables` and `garage repair -a --yes blocks` to force it to do so. - -13. Wait for resyncing activity to stop in the logs. Do steps 12 and 13 two or - three times, until you see that when you issue the repair commands, nothing - gets resynced any longer. - -14. Your upgraded cluster should be in a working state. Re-enable API and Web - access and check that everything went well. diff --git a/content/documentation/working-documents/migration_06.md b/content/documentation/working-documents/migration_06.md deleted file mode 100644 index 22d8274..0000000 --- a/content/documentation/working-documents/migration_06.md +++ /dev/null @@ -1,50 +0,0 @@ -+++ -title = "Migrating from 0.5 to 0.6" -weight = 15 -+++ - - -**This guide explains how to migrate to 0.6 if you have an existing 0.5 cluster. -We don't recommend trying to migrate directly from 0.4 or older to 0.6.** - -**We make no guarantee that this migration will work perfectly: -back up all your data before attempting it!** - -Garage v0.6 (not yet released) introduces a new data model for buckets, -that allows buckets to have many names (aliases). -Buckets can also have "private" aliases (called local aliases), -which are only visible when using a certain access key. - -This new data model means that the metadata tables have changed quite a bit in structure, -and a manual migration step is required. - -The migration steps are as follows: - -1. Disable api and web access for some time (Garage does not support disabling - these endpoints but you can change the port number or stop your reverse - proxy for instance). - -2. 
Do `garage repair -a --yes tables` and `garage repair -a --yes blocks`, - check the logs and check that all data seems to be synced correctly between - nodes. - -4. Turn off Garage 0.5 - -5. **Back up your metadata folders!!** - -6. Turn on Garage 0.6 - -7. At this point, `garage bucket list` should indicate that no buckets are present - in the cluster. `garage key list` should show all of the previously existing - access keys; however these keys should not have any permissions to access buckets. - -8. Run `garage migrate buckets050`: this will populate the new bucket table with - the buckets that existed previously. This will also give access to API keys - as it was before. - -9. Check that all your buckets indeed appear in `garage bucket list`, and that - keys have the proper access flags set. If that is not the case, revert - everything and file a bug! - -10. Your upgraded cluster should be in a working state. Re-enable API and Web - access and check that everything went well. -- cgit v1.2.3