diff options
Diffstat (limited to 'doc/book')
-rw-r--r-- | doc/book/build/golang.md | 82 | ||||
-rw-r--r-- | doc/book/build/javascript.md | 4 | ||||
-rw-r--r-- | doc/book/build/python.md | 11 | ||||
-rw-r--r-- | doc/book/connect/apps/index.md | 12 | ||||
-rw-r--r-- | doc/book/connect/backup.md | 2 | ||||
-rw-r--r-- | doc/book/connect/repositories.md | 4 | ||||
-rw-r--r-- | doc/book/cookbook/exposing-websites.md | 2 | ||||
-rw-r--r-- | doc/book/cookbook/real-world.md | 88 | ||||
-rw-r--r-- | doc/book/operations/durability-repairs.md | 11 | ||||
-rw-r--r-- | doc/book/operations/layout.md | 221 | ||||
-rw-r--r-- | doc/book/operations/multi-hdd.md | 101 | ||||
-rw-r--r-- | doc/book/operations/upgrading.md | 2 | ||||
-rw-r--r-- | doc/book/quick-start/_index.md | 21 | ||||
-rw-r--r-- | doc/book/reference-manual/admin-api.md | 11 | ||||
-rw-r--r-- | doc/book/reference-manual/configuration.md | 380 | ||||
-rw-r--r-- | doc/book/reference-manual/features.md | 2 | ||||
-rw-r--r-- | doc/book/reference-manual/s3-compatibility.md | 32 | ||||
-rw-r--r-- | doc/book/working-documents/migration-09.md | 72 |
18 files changed, 811 insertions, 247 deletions
diff --git a/doc/book/build/golang.md b/doc/book/build/golang.md index a508260e..f3f28a40 100644 --- a/doc/book/build/golang.md +++ b/doc/book/build/golang.md @@ -37,30 +37,84 @@ import ( "context" "fmt" "os" + "strings" garage "git.deuxfleurs.fr/garage-sdk/garage-admin-sdk-golang" ) func main() { - // Set Host and other parameters + // Initialization configuration := garage.NewConfiguration() configuration.Host = "127.0.0.1:3903" - - - // We can now generate a client client := garage.NewAPIClient(configuration) - - // Authentication is handled through the context pattern ctx := context.WithValue(context.Background(), garage.ContextAccessToken, "s3cr3t") - // Send a request - resp, r, err := client.NodesApi.GetNodes(ctx).Execute() - if err != nil { - fmt.Fprintf(os.Stderr, "Error when calling `NodesApi.GetNodes``: %v\n", err) - fmt.Fprintf(os.Stderr, "Full HTTP response: %v\n", r) + // Nodes + fmt.Println("--- nodes ---") + nodes, _, _ := client.NodesApi.GetNodes(ctx).Execute() + fmt.Fprintf(os.Stdout, "First hostname: %v\n", nodes.KnownNodes[0].Hostname) + capa := int64(1000000000) + change := []garage.NodeRoleChange{ + garage.NodeRoleChange{NodeRoleUpdate: &garage.NodeRoleUpdate { + Id: *nodes.KnownNodes[0].Id, + Zone: "dc1", + Capacity: *garage.NewNullableInt64(&capa), + Tags: []string{ "fast", "amd64" }, + }}, } - - // Process the response - fmt.Fprintf(os.Stdout, "Target hostname: %v\n", resp.KnownNodes[resp.Node].Hostname) + staged, _, _ := client.LayoutApi.AddLayout(ctx).NodeRoleChange(change).Execute() + msg, _, _ := client.LayoutApi.ApplyLayout(ctx).LayoutVersion(*garage.NewLayoutVersion(staged.Version + 1)).Execute() + fmt.Printf(strings.Join(msg.Message, "\n")) // Layout configured + + health, _, _ := client.NodesApi.GetHealth(ctx).Execute() + fmt.Printf("Status: %s, nodes: %v/%v, storage: %v/%v, partitions: %v/%v\n", health.Status, health.ConnectedNodes, health.KnownNodes, health.StorageNodesOk, health.StorageNodes, health.PartitionsAllOk, 
health.Partitions) + + // Key + fmt.Println("\n--- key ---") + key := "openapi-key" + keyInfo, _, _ := client.KeyApi.AddKey(ctx).AddKeyRequest(garage.AddKeyRequest{Name: *garage.NewNullableString(&key) }).Execute() + defer client.KeyApi.DeleteKey(ctx).Id(*keyInfo.AccessKeyId).Execute() + fmt.Printf("AWS_ACCESS_KEY_ID=%s\nAWS_SECRET_ACCESS_KEY=%s\n", *keyInfo.AccessKeyId, *keyInfo.SecretAccessKey.Get()) + + id := *keyInfo.AccessKeyId + canCreateBucket := true + updateKeyRequest := *garage.NewUpdateKeyRequest() + updateKeyRequest.SetName("openapi-key-updated") + updateKeyRequest.SetAllow(garage.UpdateKeyRequestAllow { CreateBucket: &canCreateBucket }) + update, _, _ := client.KeyApi.UpdateKey(ctx).Id(id).UpdateKeyRequest(updateKeyRequest).Execute() + fmt.Printf("Updated %v with key name %v\n", *update.AccessKeyId, *update.Name) + + keyList, _, _ := client.KeyApi.ListKeys(ctx).Execute() + fmt.Printf("Keys count: %v\n", len(keyList)) + + // Bucket + fmt.Println("\n--- bucket ---") + global_name := "global-ns-openapi-bucket" + local_name := "local-ns-openapi-bucket" + bucketInfo, _, _ := client.BucketApi.CreateBucket(ctx).CreateBucketRequest(garage.CreateBucketRequest{ + GlobalAlias: &global_name, + LocalAlias: &garage.CreateBucketRequestLocalAlias { + AccessKeyId: keyInfo.AccessKeyId, + Alias: &local_name, + }, + }).Execute() + defer client.BucketApi.DeleteBucket(ctx).Id(*bucketInfo.Id).Execute() + fmt.Printf("Bucket id: %s\n", *bucketInfo.Id) + + updateBucketRequest := *garage.NewUpdateBucketRequest() + website := garage.NewUpdateBucketRequestWebsiteAccess() + website.SetEnabled(true) + website.SetIndexDocument("index.html") + website.SetErrorDocument("errors/4xx.html") + updateBucketRequest.SetWebsiteAccess(*website) + quotas := garage.NewUpdateBucketRequestQuotas() + quotas.SetMaxSize(1000000000) + quotas.SetMaxObjects(999999999) + updateBucketRequest.SetQuotas(*quotas) + updatedBucket, _, _ := 
client.BucketApi.UpdateBucket(ctx).Id(*bucketInfo.Id).UpdateBucketRequest(updateBucketRequest).Execute() + fmt.Printf("Bucket %v website activation: %v\n", *updatedBucket.Id, *updatedBucket.WebsiteAccess) + + bucketList, _, _ := client.BucketApi.ListBuckets(ctx).Execute() + fmt.Printf("Bucket count: %v\n", len(bucketList)) } ``` diff --git a/doc/book/build/javascript.md b/doc/book/build/javascript.md index ff009ffe..a065c595 100644 --- a/doc/book/build/javascript.md +++ b/doc/book/build/javascript.md @@ -31,9 +31,9 @@ npm install --save git+https://git.deuxfleurs.fr/garage-sdk/garage-admin-sdk-js. A short example: ```javascript -const garage = require('garage_administration_api_v0garage_v0_8_0'); +const garage = require('garage_administration_api_v1garage_v0_9_0'); -const api = new garage.ApiClient("http://127.0.0.1:3903/v0"); +const api = new garage.ApiClient("http://127.0.0.1:3903/v1"); api.authentications['bearerAuth'].accessToken = "s3cr3t"; const [node, layout, key, bucket] = [ diff --git a/doc/book/build/python.md b/doc/book/build/python.md index 5b797897..896c99d3 100644 --- a/doc/book/build/python.md +++ b/doc/book/build/python.md @@ -80,7 +80,7 @@ from garage_admin_sdk.apis import * from garage_admin_sdk.models import * configuration = garage_admin_sdk.Configuration( - host = "http://localhost:3903/v0", + host = "http://localhost:3903/v1", access_token = "s3cr3t" ) @@ -94,13 +94,14 @@ print(f"running garage {status.garage_version}, node_id {status.node}") # Change layout of this node current = layout.get_layout() -layout.add_layout({ - status.node: NodeClusterInfo( +layout.add_layout([ + NodeRoleChange( + id = status.node, zone = "dc1", - capacity = 1, + capacity = 1000000000, tags = [ "dev" ], ) -}) +]) layout.apply_layout(LayoutVersion( version = current.version + 1 )) diff --git a/doc/book/connect/apps/index.md b/doc/book/connect/apps/index.md index 3f59530a..c8571fac 100644 --- a/doc/book/connect/apps/index.md +++ b/doc/book/connect/apps/index.md @@ 
-37,7 +37,7 @@ Second, we suppose you have created a key and a bucket. As a reminder, you can create a key for your nextcloud instance as follow: ```bash -garage key new --name nextcloud-key +garage key create nextcloud-key ``` Keep the Key ID and the Secret key in a pad, they will be needed later. @@ -139,14 +139,14 @@ a reasonable trade-off for some instances. Create a key for Peertube: ```bash -garage key new --name peertube-key +garage key create peertube-key ``` Keep the Key ID and the Secret key in a pad, they will be needed later. We need two buckets, one for normal videos (named peertube-video) and one for webtorrent videos (named peertube-playlist). ```bash -garage bucket create peertube-video +garage bucket create peertube-videos garage bucket create peertube-playlist ``` @@ -216,7 +216,7 @@ object_storage: # Same settings but for webtorrent videos videos: - bucket_name: 'peertube-video' + bucket_name: 'peertube-videos' prefix: '' # You must fill this field to make Peertube use our reverse proxy/website logic base_url: 'http://peertube-videos.web.garage.localhost' @@ -253,7 +253,7 @@ As such, your Garage cluster should be configured appropriately for good perform This is the usual Garage setup: ```bash -garage key new --name mastodon-key +garage key create mastodon-key garage bucket create mastodon-data garage bucket allow mastodon-data --read --write --key mastodon-key ``` @@ -379,7 +379,7 @@ Supposing you have a working synapse installation, you can add the module with p Now create a bucket and a key for your matrix instance (note your Key ID and Secret Key somewhere, they will be needed later): ```bash -garage key new --name matrix-key +garage key create matrix-key garage bucket create matrix garage bucket allow matrix --read --write --key matrix-key ``` diff --git a/doc/book/connect/backup.md b/doc/book/connect/backup.md index d20c3c96..585ec469 100644 --- a/doc/book/connect/backup.md +++ b/doc/book/connect/backup.md @@ -54,7 +54,7 @@ how to configure 
this. Create your key and bucket: ```bash -garage key new my-key +garage key create my-key garage bucket create backup garage bucket allow backup --read --write --key my-key ``` diff --git a/doc/book/connect/repositories.md b/doc/book/connect/repositories.md index 4b14bb46..66365d64 100644 --- a/doc/book/connect/repositories.md +++ b/doc/book/connect/repositories.md @@ -23,7 +23,7 @@ You can configure a different target for each data type (check `[lfs]` and `[att Let's start by creating a key and a bucket (your key id and secret will be needed later, keep them somewhere): ```bash -garage key new --name gitea-key +garage key create gitea-key garage bucket create gitea garage bucket allow gitea --read --write --key gitea-key ``` @@ -118,7 +118,7 @@ through another support, like a git repository. As a first step, we will need to create a bucket on Garage and enabling website access on it: ```bash -garage key new --name nix-key +garage key create nix-key garage bucket create nix.example.com garage bucket allow nix.example.com --read --write --key nix-key garage bucket website nix.example.com --allow diff --git a/doc/book/cookbook/exposing-websites.md b/doc/book/cookbook/exposing-websites.md index 5f6a5a28..9382a541 100644 --- a/doc/book/cookbook/exposing-websites.md +++ b/doc/book/cookbook/exposing-websites.md @@ -38,7 +38,7 @@ Our website serving logic is as follow: Now we need to infer the URL of your website through your bucket name. Let assume: - - we set `root_domain = ".web.example.com"` in `garage.toml` ([ref](@/documentation/reference-manual/configuration.md#root_domain)) + - we set `root_domain = ".web.example.com"` in `garage.toml` ([ref](@/documentation/reference-manual/configuration.md#web_root_domain)) - our bucket name is `garagehq.deuxfleurs.fr`. 
Our bucket will be served if the Host field matches one of these 2 values (the port is ignored): diff --git a/doc/book/cookbook/real-world.md b/doc/book/cookbook/real-world.md index 7061069f..ea4ce1f9 100644 --- a/doc/book/cookbook/real-world.md +++ b/doc/book/cookbook/real-world.md @@ -19,9 +19,10 @@ To run a real-world deployment, make sure the following conditions are met: - You have at least three machines with sufficient storage space available. -- Each machine has a public IP address which is reachable by other machines. It - is highly recommended that you use IPv6 for this end-to-end connectivity. If - IPv6 is not available, then using a mesh VPN such as +- Each machine has an IP address which makes it directly reachable by all other machines. + In many cases, nodes will be behind a NAT and will not each have a public + IPv4 address. In this case, it is recommended that you use IPv6 for this + end-to-end connectivity if it is available. Otherwise, using a mesh VPN such as [Nebula](https://github.com/slackhq/nebula) or [Yggdrasil](https://yggdrasil-network.github.io/) are approaches to consider in addition to building out your own VPN tunneling. @@ -42,7 +43,7 @@ For our example, we will suppose the following infrastructure with IPv6 connecti | Brussels | Mars | fc00:F::1 | 1.5 TB | Note that Garage will **always** store the three copies of your data on nodes at different -locations. This means that in the case of this small example, the available capacity +locations. This means that in the case of this small example, the usable capacity of the cluster is in fact only 1.5 TB, because nodes in Brussels can't store more than that. This also means that nodes in Paris and London will be under-utilized. To make better use of the available hardware, you should ensure that the capacity @@ -75,28 +76,23 @@ to store 2 TB of data in total. - For the metadata storage, Garage does not do checksumming and integrity verification on its own. 
If you are afraid of bitrot/data corruption, - put your metadata directory on a BTRFS partition. Otherwise, just use regular + put your metadata directory on a ZFS or BTRFS partition. Otherwise, just use regular EXT4 or XFS. -- Having a single server with several storage drives is currently not very well - supported in Garage ([#218](https://git.deuxfleurs.fr/Deuxfleurs/garage/issues/218)). - For an easy setup, just put all your drives in a RAID0 or a ZFS RAIDZ array. - If you're adventurous, you can try to format each of your disk as - a separate XFS partition, and then run one `garage` daemon per disk drive, - or use something like [`mergerfs`](https://github.com/trapexit/mergerfs) to merge - all your disks in a single union filesystem that spreads load over them. +- Servers with multiple HDDs are supported natively by Garage without resorting + to RAID, see [our dedicated documentation page](@/documentation/operations/multi-hdd.md). ## Get a Docker image Our docker image is currently named `dxflrs/garage` and is stored on the [Docker Hub](https://hub.docker.com/r/dxflrs/garage/tags?page=1&ordering=last_updated). -We encourage you to use a fixed tag (eg. `v0.8.0`) and not the `latest` tag. -For this example, we will use the latest published version at the time of the writing which is `v0.8.0` but it's up to you +We encourage you to use a fixed tag (eg. `v0.9.0`) and not the `latest` tag. +For this example, we will use the latest published version at the time of the writing which is `v0.9.0` but it's up to you to check [the most recent versions on the Docker Hub](https://hub.docker.com/r/dxflrs/garage/tags?page=1&ordering=last_updated). 
For example: ``` -sudo docker pull dxflrs/garage:v0.8.0 +sudo docker pull dxflrs/garage:v0.9.0 ``` ## Deploying and configuring Garage @@ -161,12 +157,13 @@ docker run \ -v /etc/garage.toml:/etc/garage.toml \ -v /var/lib/garage/meta:/var/lib/garage/meta \ -v /var/lib/garage/data:/var/lib/garage/data \ - dxflrs/garage:v0.8.0 + dxflrs/garage:v0.9.0 ``` -It should be restarted automatically at each reboot. -Please note that we use host networking as otherwise Docker containers -can not communicate with IPv6. +With this command line, Garage should be started automatically at each boot. +Please note that we use host networking as otherwise the network indirection +added by Docker would prevent Garage nodes from communicating with one another +(especially if using IPv6). If you want to use `docker-compose`, you may use the following `docker-compose.yml` file as a reference: @@ -174,7 +171,7 @@ If you want to use `docker-compose`, you may use the following `docker-compose.y version: "3" services: garage: - image: dxflrs/garage:v0.8.0 + image: dxflrs/garage:v0.9.0 network_mode: "host" restart: unless-stopped volumes: @@ -183,10 +180,12 @@ services: - /var/lib/garage/data:/var/lib/garage/data ``` -Upgrading between Garage versions should be supported transparently, -but please check the relase notes before doing so! -To upgrade, simply stop and remove this container and -start again the command with a new version of Garage. +If you wish to upgrade your cluster, make sure to read the corresponding +[documentation page](@/documentation/operations/upgrading.md) first, as well as +the documentation relevant to your version of Garage in the case of major +upgrades. With the containerized setup proposed here, the upgrade process +will require stopping and removing the existing container, and re-creating it +with the upgraded version. ## Controling the daemon @@ -270,12 +269,12 @@ of a role that is assigned to each active cluster node. 
For our example, we will suppose we have the following infrastructure (Capacity, Identifier and Zone are specific values to Garage described in the following): -| Location | Name | Disk Space | `Capacity` | `Identifier` | `Zone` | -|----------|---------|------------|------------|--------------|--------------| -| Paris | Mercury | 1 TB | `10` | `563e` | `par1` | -| Paris | Venus | 2 TB | `20` | `86f0` | `par1` | -| London | Earth | 2 TB | `20` | `6814` | `lon1` | -| Brussels | Mars | 1.5 TB | `15` | `212f` | `bru1` | +| Location | Name | Disk Space | Identifier | Zone (`-z`) | Capacity (`-c`) | +|----------|---------|------------|------------|-------------|-----------------| +| Paris | Mercury | 1 TB | `563e` | `par1` | `1T` | +| Paris | Venus | 2 TB | `86f0` | `par1` | `2T` | +| London | Earth | 2 TB | `6814` | `lon1` | `2T` | +| Brussels | Mars | 1.5 TB | `212f` | `bru1` | `1.5T` | #### Node identifiers @@ -297,6 +296,8 @@ garage status It will display the IP address associated with each node; from the IP address you will be able to recognize the node. +We will now use the `garage layout assign` command to configure the correct parameters for each node. + #### Zones Zones are simply a user-chosen identifier that identify a group of server that are grouped together logically. @@ -306,29 +307,29 @@ In most cases, a zone will correspond to a geographical location (i.e. a datacen Behind the scene, Garage will use zone definition to try to store the same data on different zones, in order to provide high availability despite failure of a zone. +Zones are passed to Garage using the `-z` flag of `garage layout assign` (see below). + #### Capacity -Garage reasons on an abstract metric about disk storage that is named the *capacity* of a node. -The capacity configured in Garage must be proportional to the disk space dedicated to the node. +Garage needs to know the storage capacity (disk space) it can/should use on +each node, to be able to correctly balance data. 
+ +Capacity values are expressed in bytes and are passed to Garage using the `-c` flag of `garage layout assign` (see below). -Capacity values must be **integers** but can be given any signification. -Here we chose that 1 unit of capacity = 100 GB. +#### Tags -Note that the amount of data stored by Garage on each server may not be strictly proportional to -its capacity value, as Garage will priorize having 3 copies of data in different zones, -even if this means that capacities will not be strictly respected. For example in our above examples, -nodes Earth and Mars will always store a copy of everything each, and the third copy will -have 66% chance of being stored by Venus and 33% chance of being stored by Mercury. +You can add additional tags to nodes using the `-t` flag of `garage layout assign` (see below). +Tags have no specific meaning for Garage and can be used at your convenience. #### Injecting the topology Given the information above, we will configure our cluster as follow: ```bash -garage layout assign 563e -z par1 -c 10 -t mercury -garage layout assign 86f0 -z par1 -c 20 -t venus -garage layout assign 6814 -z lon1 -c 20 -t earth -garage layout assign 212f -z bru1 -c 15 -t mars +garage layout assign 563e -z par1 -c 1T -t mercury +garage layout assign 86f0 -z par1 -c 2T -t venus +garage layout assign 6814 -z lon1 -c 2T -t earth +garage layout assign 212f -z bru1 -c 1.5T -t mars ``` At this point, the changes in the cluster layout have not yet been applied. @@ -338,6 +339,7 @@ To show the new layout that will be applied, call: garage layout show ``` +Make sure to read carefully the output of `garage layout show`. 
Once you are satisfied with your new layout, apply it with: ```bash diff --git a/doc/book/operations/durability-repairs.md b/doc/book/operations/durability-repairs.md index 498c8fda..b0d2c78a 100644 --- a/doc/book/operations/durability-repairs.md +++ b/doc/book/operations/durability-repairs.md @@ -91,6 +91,16 @@ is definitely lost, then there is no other choice than to declare your S3 object as unrecoverable, and to delete them properly from the data store. This can be done using the `garage block purge` command. +## Rebalancing data directories + +In [multi-HDD setups](@/documentation/operations/multi-hdd.md), to ensure that +data blocks are well balanced between storage locations, you may run a +rebalance operation using `garage repair rebalance`. This is useful when +adding storage locations or when capacities of the storage locations have been +changed. Once this is finished, Garage will know for each block a single +possible location where it can be, which can increase access speed. This +operation will also move out all data from locations marked as read-only. + # Metadata operations @@ -114,4 +124,3 @@ in your cluster, you can run one of the following repair procedures: - `garage repair versions`: checks that all versions belong to a non-deleted object, and purges any orphan version - `garage repair block_refs`: checks that all block references belong to a non-deleted object version, and purges any orphan block reference (this will then allow the blocks to be garbage-collected) - diff --git a/doc/book/operations/layout.md b/doc/book/operations/layout.md index 5e314246..ee05aba1 100644 --- a/doc/book/operations/layout.md +++ b/doc/book/operations/layout.md @@ -9,18 +9,30 @@ a certain capacity, or a gateway node that does not store data and is only used as an API entry point for faster cluster access. An introduction to building cluster layouts can be found in the [production deployment](@/documentation/cookbook/real-world.md) page. 
+In Garage, all of the data that can be stored in a given cluster is divided +into slices which we call *partitions*. Each partition is stored by +one or several nodes in the cluster +(see [`replication_mode`](@/documentation/reference-manual/configuration.md#replication_mode)). +The layout determines the correspondence between these partitions, +which exist on a logical level, and actual storage nodes. + ## How cluster layouts work in Garage -In Garage, a cluster layout is composed of the following components: +A cluster layout is composed of the following components: -- a table of roles assigned to nodes +- a table of roles assigned to nodes, defined by the user +- an optimal assignation of partitions to nodes, computed by an algorithm that is run once when calling `garage layout apply` or the ApplyClusterLayout API endpoint - a version number Garage nodes will always use the cluster layout with the highest version number. Garage nodes also maintain and synchronize between them a set of proposed role changes that haven't yet been applied. These changes will be applied (or -canceled) in the next version of the layout +canceled) in the next version of the layout. + +All operations on the layout can be realized using the `garage` CLI or using the +[administration API endpoint](@/documentation/reference-manual/admin-api.md). +We give here a description of CLI commands, the admin API semantics are very similar. The following commands insert modifications to the set of proposed role changes for the next layout version (but they do not create the new layout immediately): @@ -51,7 +63,7 @@ commands will fail otherwise. ## Warnings about Garage cluster layout management -**Warning: never make several calls to `garage layout apply` or `garage layout +**⚠️ Never make several calls to `garage layout apply` or `garage layout revert` with the same value of the `--version` flag. 
Doing so can lead to the creation of several different layouts with the same version number, in which case your Garage cluster will become inconsistent until fixed.** If a call to @@ -65,13 +77,198 @@ shell, you shouldn't have much issues as long as you run commands one after the other and take care of checking the output of `garage layout show` before applying any changes. -If you are using the `garage` CLI to script layout changes, follow the following recommendations: +If you are using the `garage` CLI or the admin API to script layout changes, +follow the following recommendations: + +- If using the CLI, make all of your `garage` CLI calls to the same RPC host. + If using the admin API, make all of your API calls to the same Garage node. Do + not connect to individual nodes to send them each a piece of the layout changes + you are making, as the changes propagate asynchronously between nodes and might + not all be taken into account at the time when the new layout is applied. + +- **Only call `garage layout apply`/ApplyClusterLayout once**, and call it + **strictly after** all of the `layout assign` and `layout remove` + commands/UpdateClusterLayout API calls have returned. + + +## Understanding unexpected layout calculations + +When adding, removing or modifying nodes in a cluster layout, sometimes +unexpected assignations of partitions to nodes can occur. These assignations +are in fact normal and logical, given the objectives of the algorithm. Indeed, +**the layout algorithm prioritizes moving less data between nodes over the fact +of achieving equal distribution of load. It also tries to use all links between +pairs of nodes in equal proportions when moving data.** This section presents +two examples and illustrates how one can control Garage's behavior to obtain +the desired results. + +### Example 1 + +In this example, a cluster is originally composed of 3 nodes in 3 different +zones (data centers). 
The three nodes are of equal capacity, therefore they +are all fully exploited and all store a copy of all of the data in the cluster. + +Then, a fourth node of the same size is added in the datacenter `dc1`. +As illustrated by the following, **Garage will by default not store any data on the new node**: + +``` +$ garage layout show +==== CURRENT CLUSTER LAYOUT ==== +ID Tags Zone Capacity Usable capacity +b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%) +a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%) +62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%) + +Zone redundancy: maximum + +Current cluster layout version: 6 + +==== STAGED ROLE CHANGES ==== +ID Tags Zone Capacity +a11c7cf18af29737 node4 dc1 1000.0 MB + + +==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ==== +ID Tags Zone Capacity Usable capacity +b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%) +a11c7cf18af29737 node4 dc1 1000.0 MB 0 B (0.0%) +a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%) +62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%) + +Zone redundancy: maximum + +==== COMPUTATION OF A NEW PARTITION ASSIGNATION ==== + +Partitions are replicated 3 times on at least 3 distinct zones. + +Optimal partition size: 3.9 MB (3.9 MB in previous layout) +Usable capacity / total cluster capacity: 3.0 GB / 4.0 GB (75.0 %) +Effective capacity (replication factor 3): 1000.0 MB + +A total of 0 new copies of partitions need to be transferred. 
+ +dc1 Tags Partitions Capacity Usable capacity + b10c110e4e854e5a node1 256 (0 new) 1000.0 MB 1000.0 MB (100.0%) + a11c7cf18af29737 node4 0 (0 new) 1000.0 MB 0 B (0.0%) + TOTAL 256 (256 unique) 2.0 GB 1000.0 MB (50.0%) + +dc2 Tags Partitions Capacity Usable capacity + a235ac7695e0c54d node2 256 (0 new) 1000.0 MB 1000.0 MB (100.0%) + TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%) + +dc3 Tags Partitions Capacity Usable capacity + 62b218d848e86a64 node3 256 (0 new) 1000.0 MB 1000.0 MB (100.0%) + TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%) +``` + +While unexpected, this is logical because of the following facts: + +- storing some data on the new node does not help increase the total quantity + of data that can be stored on the cluster, as the two other zones (`dc2` and + `dc3`) still need to store a full copy of everything, and their capacity is + still the same; + +- there is therefore no need to move any data on the new node as this would be pointless; + +- moving data to the new node has a cost which the algorithm decides to not pay if not necessary. + +This distribution of data can however not be what the administrator wanted: if +they added a new node to `dc1`, it might be because the existing node is too +slow, and they wish to divide its load by half. In that case, what they need to +do to force Garage to distribute the data between the two nodes is to attribute +only half of the capacity to each node in `dc1` (in our example, 500M instead of 1G). +In that case, Garage would determine that to be able to store 1G in total, it +would need to store 500M on the old node and 500M on the added one. + + +### Example 2 + +The following example is a slightly different scenario, where `dc1` had two +nodes that were used at 50%, and `dc2` and `dc3` each have one node that is +100% used. All node capacities are the same. + +Then, a node from `dc1` is moved into `dc3`. 
One could expect that the roles of +`dc1` and `dc3` would simply be swapped: the remaining node in `dc1` would be +used at 100%, and the two nodes now in `dc3` would be used at 50%. Instead, +this happens: + +``` +==== CURRENT CLUSTER LAYOUT ==== +ID Tags Zone Capacity Usable capacity +b10c110e4e854e5a node1 dc1 1000.0 MB 500.0 MB (50.0%) +a11c7cf18af29737 node4 dc1 1000.0 MB 500.0 MB (50.0%) +a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%) +62b218d848e86a64 node3 dc3 1000.0 MB 1000.0 MB (100.0%) + +Zone redundancy: maximum + +Current cluster layout version: 8 + +==== STAGED ROLE CHANGES ==== +ID Tags Zone Capacity +a11c7cf18af29737 node4 dc3 1000.0 MB + + +==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ==== +ID Tags Zone Capacity Usable capacity +b10c110e4e854e5a node1 dc1 1000.0 MB 1000.0 MB (100.0%) +a235ac7695e0c54d node2 dc2 1000.0 MB 1000.0 MB (100.0%) +62b218d848e86a64 node3 dc3 1000.0 MB 753.9 MB (75.4%) +a11c7cf18af29737 node4 dc3 1000.0 MB 246.1 MB (24.6%) + +Zone redundancy: maximum + +==== COMPUTATION OF A NEW PARTITION ASSIGNATION ==== + +Partitions are replicated 3 times on at least 3 distinct zones. + +Optimal partition size: 3.9 MB (3.9 MB in previous layout) +Usable capacity / total cluster capacity: 3.0 GB / 4.0 GB (75.0 %) +Effective capacity (replication factor 3): 1000.0 MB + +A total of 128 new copies of partitions need to be transferred. 
+ +dc1 Tags Partitions Capacity Usable capacity + b10c110e4e854e5a node1 256 (128 new) 1000.0 MB 1000.0 MB (100.0%) + TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%) + +dc2 Tags Partitions Capacity Usable capacity + a235ac7695e0c54d node2 256 (0 new) 1000.0 MB 1000.0 MB (100.0%) + TOTAL 256 (256 unique) 1000.0 MB 1000.0 MB (100.0%) + +dc3 Tags Partitions Capacity Usable capacity + 62b218d848e86a64 node3 193 (0 new) 1000.0 MB 753.9 MB (75.4%) + a11c7cf18af29737 node4 63 (0 new) 1000.0 MB 246.1 MB (24.6%) + TOTAL 256 (256 unique) 2.0 GB 1000.0 MB (50.0%) +``` + +As we can see, the node that was moved to `dc3` (node4) is only used at 25% (approximately), +whereas the node that was already in `dc3` (node3) is used at 75%. + +This can be explained by the following: + +- node1 will now be the only node remaining in `dc1`, thus it has to store all + of the data in the cluster. Since it was storing only half of it before, it has + to retrieve the other half from other nodes in the cluster. + +- The data which it does not have is entirely stored by the other node that was + in `dc1` and that is now in `dc3` (node4). There is also a copy of it on node2 + and node3 since both these nodes have a copy of everything. + +- node3 and node4 are the two nodes that will now be in a datacenter that is + under-utilized (`dc3`), this means that those are the two candidates from which + data can be removed to be moved to node1. + +- Garage will move data in equal proportions from all possible sources, in this + case it means that it will transfer 25% of the entire data set from node3 to + node1 and another 25% from node4 to node1. -- Make all of your `garage` CLI calls to the same RPC host. Do not use the - `garage` CLI to connect to individual nodes to send them each a piece of the - layout changes you are making, as the changes propagate asynchronously - between nodes and might not all be taken into account at the time when the - new layout is applied. 
+This explains why node3 ends with 75% utilization (100% from before minus 25% +that is moved to node1), and node4 ends with 25% (50% from before minus 25% +that is moved to node1). -- **Only call `garage layout apply` once**, and call it **strictly after** all - of the `layout assign` and `layout remove` commands have returned. +This illustrates the second principle of the layout computation: **if there is +a choice in moving data out of some nodes, then all links between pairs of +nodes are used in equal proportions** (this is approximately true, there is +randomness in the algorithm to achieve this so there might be some small +fluctuations, as we see above). diff --git a/doc/book/operations/multi-hdd.md b/doc/book/operations/multi-hdd.md new file mode 100644 index 00000000..36445b0a --- /dev/null +++ b/doc/book/operations/multi-hdd.md @@ -0,0 +1,101 @@ ++++ +title = "Multi-HDD support" +weight = 15 ++++ + + +Since v0.9, Garage natively supports nodes that have several storage drives +for storing data blocks (not for metadata storage). + +## Initial setup + +To set up a new Garage storage node with multiple HDDs, +format and mount all your drives in different directories, +and use a Garage configuration as follows: + +```toml +data_dir = [ + { path = "/path/to/hdd1", capacity = "2T" }, + { path = "/path/to/hdd2", capacity = "4T" }, +] +``` + +Garage will automatically balance all blocks stored by the node +among the different specified directories, proportionally to the +specified capacities. + +## Updating the list of storage locations + +If you add new storage locations to your `data_dir`, +Garage will not rebalance existing data between storage locations. +Newly written blocks will be balanced proportionally to the specified capacities, +and existing data may be moved between drives to improve balancing, +but only opportunistically when a data block is re-written (e.g. an object +is re-uploaded, or an object with a duplicate block is uploaded). 
+ +To understand precisely what is happening, we need to dive into how Garage +splits data among the different storage locations. + +First of all, Garage divides the set of all possible block hashes +into a fixed number of slices (currently 1024), and assigns +to each slice a primary storage location among the specified data directories. +The number of slices having their primary location in each data directory +is proportional to the capacity specified in the config file. + +When Garage receives a block to write, it will always write it in the primary +directory of the slice that contains its hash. + +Now, to be able to not lose existing data blocks when storage locations +are added, Garage also keeps a list of secondary data directories +for all of the hash slices. Secondary data directories for a slice indicate +storage locations that once were primary directories for that slice, i.e. where +Garage knows that data blocks of that slice might be stored. +When Garage is requested to read a certain data block, +it will first look in the primary storage directory of its slice, +and if it doesn't find it there it goes through all of the secondary storage +locations until it finds it. This allows Garage to continue operating +normally when storage locations are added, without having to shuffle +files between drives to place them in the correct location. + +This relatively simple strategy works well but does not ensure that data +is correctly balanced among drives according to their capacity. +To rebalance data, two strategies can be used: + +- Lazy rebalancing: when a block is re-written (e.g. the object is re-uploaded), + Garage checks whether the existing copy is in the primary directory of the slice + or in a secondary directory. If the current copy is in a secondary directory, + Garage re-writes a copy in the primary directory and deletes the one from the + secondary directory. 
This might never end up rebalancing everything if there + are data blocks that are only read and never written. + +- Active rebalancing: an operator of a Garage node can explicitly launch a repair + procedure that rebalances the data directories, moving all blocks to their + primary location. Once done, all secondary locations for all hash slices are + removed so that they won't be checked anymore when looking for a data block. + +## Read-only storage locations + +If you would like to move all data blocks from an existing data directory to one +or several new data directories, mark the old directory as read-only: + +```toml +data_dir = [ + { path = "/path/to/old_data", read_only = true }, + { path = "/path/to/new_hdd1", capacity = "2T" }, + { path = "/path/to/new_hdd2", capacity = "4T" }, +] +``` + +Garage will be able to read requested blocks from the read-only directory. +Garage will also move data out of the read-only directory either progressively +(lazy rebalancing) or if requested explicitly (active rebalancing). + +Once an active rebalancing has finished, your read-only directory should be empty: +it might still contain subdirectories, but no data files. You can check that +it contains no files using: + +```bash +find /path/to/old_data -type f # should not print anything +``` + +at which point it can be removed from the `data_dir` list in your config file. diff --git a/doc/book/operations/upgrading.md b/doc/book/operations/upgrading.md index e8919a19..9a738282 100644 --- a/doc/book/operations/upgrading.md +++ b/doc/book/operations/upgrading.md @@ -80,6 +80,6 @@ The entire procedure would look something like this: 5. If any specific migration procedure is required, it is usually in one of the two cases: - It can be run on online nodes after the new version has started, during regular cluster operation. 
- - it has to be run offline + - it has to be run offline, in which case you will have to again take all nodes offline one after the other to run the repair For this last step, please refer to the specific documentation pertaining to the version upgrade you are doing. diff --git a/doc/book/quick-start/_index.md b/doc/book/quick-start/_index.md index 08932775..1b129f36 100644 --- a/doc/book/quick-start/_index.md +++ b/doc/book/quick-start/_index.md @@ -84,9 +84,8 @@ admin_token = "$(openssl rand -base64 32)" EOF ``` -Now that your configuration file has been created, you can put -it in the right place. By default, garage looks at **`/etc/garage.toml`.** - +Now that your configuration file has been created, you may save it to the directory of your choice. +By default, Garage looks for **`/etc/garage.toml`.** You can also store it somewhere else, but you will have to specify `-c path/to/garage.toml` at each invocation of the `garage` binary (for example: `garage -c ./garage.toml server`, `garage -c ./garage.toml status`). @@ -103,12 +102,14 @@ your data to be persisted properly. ### Launching the Garage server -Use the following command to launch the Garage server with our configuration file: +Use the following command to launch the Garage server: ``` -garage server +garage -c path/to/garage.toml server ``` +If you have placed the `garage.toml` file in `/etc` (its default location), you can simply run `garage server`. + You can tune Garage's verbosity as follows (from less verbose to more verbose): ``` @@ -126,7 +127,7 @@ Log level `debug` can help you check why your S3 API calls are not working. The `garage` utility is also used as a CLI tool to configure your Garage deployment. It uses values from the TOML configuration file to find the Garage daemon running on the local node, therefore if your configuration file is not at `/etc/garage.toml` you will -again have to specify `-c path/to/garage.toml`. +again have to specify `-c path/to/garage.toml` at each invocation. 
If the `garage` CLI is able to correctly detect the parameters of your local Garage node, the following command should be enough to show the status of your cluster: @@ -140,7 +141,7 @@ This should show something like this: ``` ==== HEALTHY NODES ==== ID Hostname Address Tag Zone Capacity -563e1ac825ee3323… linuxbox 127.0.0.1:3901 NO ROLE ASSIGNED +563e1ac825ee3323 linuxbox 127.0.0.1:3901 NO ROLE ASSIGNED ``` ## Creating a cluster layout @@ -153,12 +154,12 @@ For our test deployment, we are using only one node. The way in which we configu it does not matter, you can simply write: ```bash -garage layout assign -z dc1 -c 1 <node_id> +garage layout assign -z dc1 -c 1G <node_id> ``` where `<node_id>` corresponds to the identifier of the node shown by `garage status` (first column). You can enter simply a prefix of that identifier. -For instance here you could write just `garage layout assign -z dc1 -c 1 563e`. +For instance here you could write just `garage layout assign -z dc1 -c 1G 563e`. The layout then has to be applied to the cluster, using: @@ -209,7 +210,7 @@ one key can access multiple buckets, multiple keys can access one bucket. Create an API key using the following command: ``` -garage key new --name nextcloud-app-key +garage key create nextcloud-app-key ``` The output should look as follows: diff --git a/doc/book/reference-manual/admin-api.md b/doc/book/reference-manual/admin-api.md index 6932ac60..15630788 100644 --- a/doc/book/reference-manual/admin-api.md +++ b/doc/book/reference-manual/admin-api.md @@ -13,8 +13,11 @@ We will bump the version numbers prefixed to each API endpoint at each time the or semantics change, meaning that code that relies on these endpoint will break when changes are introduced. -The Garage administration API was introduced in version 0.7.2, this document -does not apply to older versions of Garage. 
+Versions: + - Before Garage 0.7.2 - no admin API + - Garage 0.7.2 - admin APIv0 + - Garage 0.9.0 - admin APIv1, deprecate admin APIv0 + ## Access control @@ -131,7 +134,9 @@ $ curl -so /dev/null -w "%{http_code}" http://localhost:3903/check?domain=exampl ### Cluster operations -These endpoints are defined on a dedicated [Redocly page](https://garagehq.deuxfleurs.fr/api/garage-admin-v0.html). You can also download its [OpenAPI specification](https://garagehq.deuxfleurs.fr/api/garage-admin-v0.yml). +These endpoints have a dedicated OpenAPI spec. + - APIv1 - [HTML spec](https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html) - [OpenAPI YAML](https://garagehq.deuxfleurs.fr/api/garage-admin-v1.yml) + - APIv0 (deprecated) - [HTML spec](https://garagehq.deuxfleurs.fr/api/garage-admin-v0.html) - [OpenAPI YAML](https://garagehq.deuxfleurs.fr/api/garage-admin-v0.yml) Requesting the API from the command line can be as simple as running: diff --git a/doc/book/reference-manual/configuration.md b/doc/book/reference-manual/configuration.md index 77720f7b..5e12a7da 100644 --- a/doc/book/reference-manual/configuration.md +++ b/doc/book/reference-manual/configuration.md @@ -8,8 +8,12 @@ weight = 20 Here is an example `garage.toml` configuration file that illustrates all of the possible options: ```toml +replication_mode = "3" + metadata_dir = "/var/lib/garage/meta" data_dir = "/var/lib/garage/data" +metadata_fsync = true +data_fsync = false db_engine = "lmdb" @@ -19,8 +23,6 @@ sled_cache_capacity = "128MiB" sled_flush_every_ms = 2000 lmdb_map_size = "1T" -replication_mode = "3" - compression_level = 1 rpc_secret = "4425f5c26c5e11581d3223904324dcb5b5d5dfb14e5e7f35e38c595424f5f1e6" @@ -75,101 +77,64 @@ The following gives details about each available configuration option. ## Available configuration options -### `metadata_dir` - -The directory in which Garage will store its metadata. 
This contains the node identifier, -the network configuration and the peer list, the list of buckets and keys as well -as the index of all objects, object version and object blocks. - -Store this folder on a fast SSD drive if possible to maximize Garage's performance. - -### `data_dir` - -The directory in which Garage will store the data blocks of objects. -This folder can be placed on an HDD. The space available for `data_dir` -should be counted to determine a node's capacity -when [adding it to the cluster layout](@/documentation/cookbook/real-world.md). - -### `db_engine` (since `v0.8.0`) - -By default, Garage uses the Sled embedded database library -to store its metadata on-disk. Since `v0.8.0`, Garage can use alternative storage backends as follows: - -| DB engine | `db_engine` value | Database path | -| --------- | ----------------- | ------------- | -| [Sled](https://sled.rs) | `"sled"` | `<metadata_dir>/db/` | -| [LMDB](https://www.lmdb.tech) | `"lmdb"` | `<metadata_dir>/db.lmdb/` | -| [Sqlite](https://sqlite.org) | `"sqlite"` | `<metadata_dir>/db.sqlite` | - -Performance characteristics of the different DB engines are as follows: - -- Sled: the default database engine, which tends to produce - large data files and also has performance issues, especially when the metadata folder - is on a traditional HDD and not on SSD. -- LMDB: the recommended alternative on 64-bit systems, - much more space-efficiant and slightly faster. Note that the data format of LMDB is not portable - between architectures, so for instance the Garage database of an x86-64 - node cannot be moved to an ARM64 node. Also note that, while LMDB can technically be used on 32-bit systems, - this will limit your node to very small database sizes due to how LMDB works; it is therefore not recommended. 
-- Sqlite: Garage supports Sqlite as a storage backend for metadata, - however it may have issues and is also very slow in its current implementation, - so it is not recommended to be used for now. - -It is possible to convert Garage's metadata directory from one format to another with a small utility named `convert_db`, -which can be downloaded at the following locations: -[for amd64](https://garagehq.deuxfleurs.fr/_releases/convert_db/amd64/convert_db), -[for i386](https://garagehq.deuxfleurs.fr/_releases/convert_db/i386/convert_db), -[for arm64](https://garagehq.deuxfleurs.fr/_releases/convert_db/arm64/convert_db), -[for arm](https://garagehq.deuxfleurs.fr/_releases/convert_db/arm/convert_db). -The `convert_db` utility is used as folows: - -``` -convert-db -a <input db engine> -i <input db path> \ - -b <output db engine> -o <output db path> -``` - -Make sure to specify the full database path as presented in the table above, -and not just the path to the metadata directory. - -### `block_size` - -Garage splits stored objects in consecutive chunks of size `block_size` -(except the last one which might be smaller). The default size is 1MiB and -should work in most cases. We recommend increasing it to e.g. 10MiB if -you are using Garage to store large files and have fast network connections -between all nodes (e.g. 1gbps). - -If you are interested in tuning this, feel free to do so (and remember to -report your findings to us!). When this value is changed for a running Garage -installation, only files newly uploaded will be affected. Previously uploaded -files will remain available. This however means that chunks from existing files -will not be deduplicated with chunks from newly uploaded files, meaning you -might use more storage space that is optimally possible. - -### `sled_cache_capacity` - -This parameter can be used to tune the capacity of the cache used by -[sled](https://sled.rs), the database Garage uses internally to store metadata. 
-Tune this to fit the RAM you wish to make available to your Garage instance. -This value has a conservative default (128MB) so that Garage doesn't use too much -RAM by default, but feel free to increase this for higher performance. - -### `sled_flush_every_ms` - -This parameters can be used to tune the flushing interval of sled. -Increase this if sled is thrashing your SSD, at the risk of losing more data in case -of a power outage (though this should not matter much as data is replicated on other -nodes). The default value, 2000ms, should be appropriate for most use cases. - -### `lmdb_map_size` - -This parameters can be used to set the map size used by LMDB, -which is the size of the virtual memory region used for mapping the database file. -The value of this parameter is the maximum size the metadata database can take. -This value is not bound by the physical RAM size of the machine running Garage. -If not specified, it defaults to 1GiB on 32-bit machines and 1TiB on 64-bit machines. - -### `replication_mode` +### Index + +Top-level configuration options: +[`block_size`](#block_size), +[`bootstrap_peers`](#bootstrap_peers), +[`compression_level`](#compression_level), +[`data_dir`](#data_dir), +[`data_fsync`](#data_fsync), +[`db_engine`](#db_engine), +[`lmdb_map_size`](#lmdb_map_size), +[`metadata_dir`](#metadata_dir), +[`metadata_fsync`](#metadata_fsync), +[`replication_mode`](#replication_mode), +[`rpc_bind_addr`](#rpc_bind_addr), +[`rpc_public_addr`](#rpc_public_addr), +[`rpc_secret`](#rpc_secret), +[`rpc_secret_file`](#rpc_secret), +[`sled_cache_capacity`](#sled_cache_capacity), +[`sled_flush_every_ms`](#sled_flush_every_ms). 
+ +The `[consul_discovery]` section: +[`api`](#consul_api), +[`ca_cert`](#consul_ca_cert), +[`client_cert`](#consul_client_cert), +[`client_key`](#consul_client_cert), +[`consul_http_addr`](#consul_http_addr), +[`meta`](#consul_tags), +[`service_name`](#consul_service_name), +[`tags`](#consul_tags), +[`tls_skip_verify`](#consul_tls_skip_verify), +[`token`](#consul_token). + +The `[kubernetes_discovery]` section: +[`namespace`](#kube_namespace), +[`service_name`](#kube_service_name), +[`skip_crd`](#kube_skip_crd). + +The `[s3_api]` section: +[`api_bind_addr`](#s3_api_bind_addr), +[`root_domain`](#s3_root_domain), +[`s3_region`](#s3_region). + +The `[s3_web]` section: +[`bind_addr`](#web_bind_addr), +[`root_domain`](#web_root_domain). + +The `[admin]` section: +[`api_bind_addr`](#admin_api_bind_addr), +[`metrics_token`](#admin_metrics_token), +[`metrics_token_file`](#admin_metrics_token), +[`admin_token`](#admin_token), +[`admin_token_file`](#admin_token), +[`trace_sink`](#admin_trace_sink), + + +### Top-level configuration options + +#### `replication_mode` {#replication_mode} Garage supports the following replication modes: @@ -252,7 +217,160 @@ to the cluster while rebalancing is in progress. In theory, no data should be lost as rebalancing is a routine operation for Garage, although we cannot guarantee you that everything will go right in such an extreme scenario. -### `compression_level` +#### `metadata_dir` {#metadata_dir} + +The directory in which Garage will store its metadata. This contains the node identifier, +the network configuration and the peer list, the list of buckets and keys as well +as the index of all objects, object version and object blocks. + +Store this folder on a fast SSD drive if possible to maximize Garage's performance. + +#### `data_dir` {#data_dir} + +The directory in which Garage will store the data blocks of objects. +This folder can be placed on an HDD. 
The space available for `data_dir` +should be counted to determine a node's capacity +when [adding it to the cluster layout](@/documentation/cookbook/real-world.md). + +Since `v0.9.0`, Garage supports multiple data directories with the following syntax: + +```toml +data_dir = [ + { path = "/path/to/old_data", read_only = true }, + { path = "/path/to/new_hdd1", capacity = "2T" }, + { path = "/path/to/new_hdd2", capacity = "4T" }, +] +``` + +See [the dedicated documentation page](@/documentation/operations/multi-hdd.md) +on how to operate Garage in such a setup. + +#### `db_engine` (since `v0.8.0`) {#db_engine} + +Since `v0.8.0`, Garage can use alternative storage backends as follows: + +| DB engine | `db_engine` value | Database path | +| --------- | ----------------- | ------------- | +| [LMDB](https://www.lmdb.tech) (default since `v0.9.0`) | `"lmdb"` | `<metadata_dir>/db.lmdb/` | +| [Sled](https://sled.rs) (default up to `v0.8.0`) | `"sled"` | `<metadata_dir>/db/` | +| [Sqlite](https://sqlite.org) | `"sqlite"` | `<metadata_dir>/db.sqlite` | + +Sled was the only database engine up to Garage v0.7.0. Performance issues and +API limitations of Sled prompted the addition of alternative engines in v0.8.0. +Since v0.9.0, LMDB is the default engine instead of Sled, and Sled is +deprecated. We plan to remove Sled in Garage v1.0. + +Performance characteristics of the different DB engines are as follows: + +- Sled: tends to produce large data files and also has performance issues, + especially when the metadata folder is on a traditional HDD and not on SSD. + +- LMDB: the recommended database engine on 64-bit systems, much more + space-efficient and slightly faster. Note that the data format of LMDB is not + portable between architectures, so for instance the Garage database of an + x86-64 node cannot be moved to an ARM64 node. 
Also note that, while LMDB can + technically be used on 32-bit systems, this will limit your node to very + small database sizes due to how LMDB works; it is therefore not recommended. + +- Sqlite: Garage supports Sqlite as an alternative storage backend for + metadata, and although it has not been tested as much, it is expected to work + satisfactorily. Since Garage v0.9.0, performance issues have largely been + fixed by allowing for a no-fsync mode (see `metadata_fsync`). Sqlite does not + have the database size limitation of LMDB on 32-bit systems. + +It is possible to convert Garage's metadata directory from one format to another +using the `garage convert-db` command, which should be used as follows: + +``` +garage convert-db -a <input db engine> -i <input db path> \ + -b <output db engine> -o <output db path> +``` + +Make sure to specify the full database path as presented in the table above +(third column), and not just the path to the metadata directory. + +#### `metadata_fsync` {#metadata_fsync} + +Whether to enable synchronous mode for the database engine or not. +This is disabled (`false`) by default. + +Enabling this option reduces the risk of metadata corruption in case of power failures, +at the cost of a significant drop in write performance, +as Garage will have to pause to sync data to disk much more often +(several times for API calls such as PutObject). + +Using this option reduces the risk of simultaneous metadata corruption on several +cluster nodes, which could lead to data loss. + +If multi-site replication is used, this option is most likely not necessary, as +it is extremely unlikely that two nodes in different locations will have a +power failure at the exact same time. + +(Metadata corruption on a single node is not an issue, the corrupted data file +can always be deleted and reconstructed from the other nodes in the cluster.) 
+ +Here is how this option impacts the different database engines: + +| Database | `metadata_fsync = false` (default) | `metadata_fsync = true` | +|----------|------------------------------------|-------------------------------| +| Sled | default options | *unsupported* | +| Sqlite | `PRAGMA synchronous = OFF` | `PRAGMA synchronous = NORMAL` | +| LMDB | `MDB_NOMETASYNC` + `MDB_NOSYNC` | `MDB_NOMETASYNC` | + +Note that the Sqlite database is always run in `WAL` mode (`PRAGMA journal_mode = WAL`). + +#### `data_fsync` {#data_fsync} + +Whether to `fsync` data blocks and their containing directory after they are +saved to disk. +This is disabled (`false`) by default. + +This might reduce the risk that a data block is lost in rare +situations such as several nodes losing power simultaneously, +at the cost of a moderate drop in write performance. + +Similarly to `metadata_fsync`, this is likely not necessary +if geographical replication is used. + +#### `block_size` {#block_size} + +Garage splits stored objects in consecutive chunks of size `block_size` +(except the last one which might be smaller). The default size is 1MiB and +should work in most cases. We recommend increasing it to e.g. 10MiB if +you are using Garage to store large files and have fast network connections +between all nodes (e.g. 1gbps). + +If you are interested in tuning this, feel free to do so (and remember to +report your findings to us!). When this value is changed for a running Garage +installation, only files newly uploaded will be affected. Previously uploaded +files will remain available. This however means that chunks from existing files +will not be deduplicated with chunks from newly uploaded files, meaning you +might use more storage space than is optimally possible. + +#### `sled_cache_capacity` {#sled_cache_capacity} + +This parameter can be used to tune the capacity of the cache used by +[sled](https://sled.rs), the database Garage uses internally to store metadata. 
+Tune this to fit the RAM you wish to make available to your Garage instance. +This value has a conservative default (128MB) so that Garage doesn't use too much +RAM by default, but feel free to increase this for higher performance. + +#### `sled_flush_every_ms` {#sled_flush_every_ms} + +This parameter can be used to tune the flushing interval of sled. +Increase this if sled is thrashing your SSD, at the risk of losing more data in case +of a power outage (though this should not matter much as data is replicated on other +nodes). The default value, 2000ms, should be appropriate for most use cases. + +#### `lmdb_map_size` {#lmdb_map_size} + +This parameter can be used to set the map size used by LMDB, +which is the size of the virtual memory region used for mapping the database file. +The value of this parameter is the maximum size the metadata database can take. +This value is not bound by the physical RAM size of the machine running Garage. +If not specified, it defaults to 1GiB on 32-bit machines and 1TiB on 64-bit machines. + +#### `compression_level` {#compression_level} Zstd compression level to use for storing blocks. @@ -276,7 +394,7 @@ Compression is done synchronously, setting a value too high will add latency to This value can be different between nodes, compression is done by the node which receives the API call. -### `rpc_secret`, `rpc_secret_file` or `GARAGE_RPC_SECRET`, `GARAGE_RPC_SECRET_FILE` (env) +#### `rpc_secret`, `rpc_secret_file` or `GARAGE_RPC_SECRET`, `GARAGE_RPC_SECRET_FILE` (env) {#rpc_secret} Garage uses a secret key, called an RPC secret, that is shared between all nodes of the cluster in order to identify these nodes and allow them to @@ -288,10 +406,10 @@ Since Garage `v0.8.2`, the RPC secret can also be stored in a file whose path is given in the configuration variable `rpc_secret_file`, or specified as an environment variable `GARAGE_RPC_SECRET`. 
-Since Garage `v0.9.0`, you can also specify the path of a file storing the secret -as the `GARAGE_RPC_SECRET_FILE` environment variable. +Since Garage `v0.8.5` and `v0.9.1`, you can also specify the path of a file +storing the secret as the `GARAGE_RPC_SECRET_FILE` environment variable. -### `rpc_bind_addr` +#### `rpc_bind_addr` {#rpc_bind_addr} The address and port on which to bind for inter-cluster communcations (reffered to as RPC for remote procedure calls). @@ -300,14 +418,14 @@ the node, even in the case of a NAT: the NAT should be configured to forward the port number to the same internal port nubmer. This means that if you have several nodes running behind a NAT, they should each use a different RPC port number. -### `rpc_public_addr` +#### `rpc_public_addr` {#rpc_public_addr} The address and port that other nodes need to use to contact this node for RPC calls. **This parameter is optional but recommended.** In case you have a NAT that binds the RPC port to a port that is different on your public IP, this field might help making it work. -### `bootstrap_peers` +#### `bootstrap_peers` {#bootstrap_peers} A list of peer identifiers on which to contact other Garage peers of this cluster. These peer identifiers have the following syntax: @@ -335,42 +453,42 @@ permission verification. Alternatively, you can set the `GARAGE_ALLOW_WORLD_READABLE_SECRETS` environment variable to `true` to bypass the permissions check. -## The `[consul_discovery]` section +### The `[consul_discovery]` section Garage supports discovering other nodes of the cluster using Consul. For this to work correctly, nodes need to know their IP address by which they can be reached by other nodes of the cluster, which should be set in `rpc_public_addr`. -### `consul_http_addr` and `service_name` +#### `consul_http_addr` {#consul_http_addr} The `consul_http_addr` parameter should be set to the full HTTP(S) address of the Consul server. 
-### `api` +#### `api` {#consul_api} Two APIs for service registration are supported: `catalog` and `agent`. `catalog`, the default, will register a service using the `/v1/catalog` endpoints, enabling mTLS if `client_cert` and `client_key` are provided. The `agent` API uses the `v1/agent` endpoints instead, where an optional `token` may be provided. -### `service_name` +#### `service_name` {#consul_service_name} `service_name` should be set to the service name under which Garage's RPC ports are announced. -### `client_cert`, `client_key` +#### `client_cert`, `client_key` {#consul_client_cert} TLS client certificate and client key to use when communicating with Consul over TLS. Both are mandatory when doing so. Only available when `api = "catalog"`. -### `ca_cert` +#### `ca_cert` {#consul_ca_cert} TLS CA certificate to use when communicating with Consul over TLS. -### `tls_skip_verify` +#### `tls_skip_verify` {#consul_tls_skip_verify} Skip server hostname verification in TLS handshake. `ca_cert` is ignored when this is set. -### `token` +#### `token` {#consul_token} Uses the provided token for communication with Consul. Only available when `api = "agent"`. The policy assigned to this token should at least have these rules: @@ -390,49 +508,49 @@ node_prefix "" { } ``` -### `tags` and `meta` +#### `tags` and `meta` {#consul_tags} Additional list of tags and map of service meta to add during service registration. -## The `[kubernetes_discovery]` section +### The `[kubernetes_discovery]` section Garage supports discovering other nodes of the cluster using kubernetes custom resources. For this to work, a `[kubernetes_discovery]` section must be present with at least the `namespace` and `service_name` parameters. -### `namespace` +#### `namespace` {#kube_namespace} `namespace` sets the namespace in which the custom resources are configured. 
-### `service_name` +#### `service_name` {#kube_service_name} `service_name` is added as a label to the advertised resources to filter them, to allow for multiple deployments in a single namespace. -### `skip_crd` +#### `skip_crd` {#kube_skip_crd} `skip_crd` can be set to true to disable the automatic creation and patching of the `garagenodes.deuxfleurs.fr` CRD. You will need to create the CRD manually. -## The `[s3_api]` section +### The `[s3_api]` section -### `api_bind_addr` +#### `api_bind_addr` {#s3_api_bind_addr} The IP and port on which to bind for accepting S3 API calls. This endpoint does not suport TLS: a reverse proxy should be used to provide it. Alternatively, since `v0.8.5`, a path can be used to create a unix socket with 0222 mode. -### `s3_region` +#### `s3_region` {#s3_region} Garage will accept S3 API calls that are targetted to the S3 region defined here. API calls targetted to other regions will fail with a AuthorizationHeaderMalformed error message that redirects the client to the correct region. -### `root_domain` {#root_domain} +#### `root_domain` {#s3_root_domain} The optional suffix to access bucket using vhost-style in addition to path-style request. Note path-style requests are always enabled, whether or not vhost-style is configured. @@ -444,12 +562,12 @@ using the hostname `my-bucket.s3.garage.eu`. -## The `[s3_web]` section +### The `[s3_web]` section Garage allows to publish content of buckets as websites. This section configures the behaviour of this module. -### `bind_addr` +#### `bind_addr` {#web_bind_addr} The IP and port on which to bind for accepting HTTP requests to buckets configured for website access. @@ -457,7 +575,7 @@ This endpoint does not suport TLS: a reverse proxy should be used to provide it. Alternatively, since `v0.8.5`, a path can be used to create a unix socket with 0222 mode. -### `root_domain` +#### `root_domain` {#web_root_domain} The optional suffix appended to bucket names for the corresponding HTTP Host. 
@@ -466,11 +584,11 @@ will be accessible either with hostname `deuxfleurs.fr.web.garage.eu` or with hostname `deuxfleurs.fr`. -## The `[admin]` section +### The `[admin]` section Garage has a few administration capabilities, in particular to allow remote monitoring. These features are detailed below. -### `api_bind_addr` +#### `api_bind_addr` {#admin_api_bind_addr} If specified, Garage will bind an HTTP server to this port and address, on which it will listen to requests for administration features. @@ -479,7 +597,7 @@ See [administration API reference](@/documentation/reference-manual/admin-api.md Alternatively, since `v0.8.5`, a path can be used to create a unix socket. Note that for security reasons, the socket will have 0220 mode. Make sure to set user and group permissions accordingly. -### `metrics_token`, `metrics_token_file` or `GARAGE_METRICS_TOKEN`, `GARAGE_METRICS_TOKEN_FILE` (env) +#### `metrics_token`, `metrics_token_file` or `GARAGE_METRICS_TOKEN`, `GARAGE_METRICS_TOKEN_FILE` (env) {#admin_metrics_token} The token for accessing the Metrics endpoint. If this token is not set, the Metrics endpoint can be accessed without access control. @@ -489,9 +607,9 @@ You can use any random string for this value. We recommend generating a random t `metrics_token` was introduced in Garage `v0.7.2`. `metrics_token_file` and the `GARAGE_METRICS_TOKEN` environment variable are supported since Garage `v0.8.2`. -`GARAGE_METRICS_TOKEN_FILE` is supported since `v0.9.0`. +`GARAGE_METRICS_TOKEN_FILE` is supported since `v0.8.5` / `v0.9.1`. -### `admin_token`, `admin_token_file` or `GARAGE_ADMIN_TOKEN`, `GARAGE_ADMIN_TOKEN_FILE` (env) +#### `admin_token`, `admin_token_file` or `GARAGE_ADMIN_TOKEN`, `GARAGE_ADMIN_TOKEN_FILE` (env) {#admin_token} The token for accessing all of the other administration endpoints. If this token is not set, access to these endpoints is disabled entirely. @@ -501,9 +619,9 @@ You can use any random string for this value. 
We recommend generating a random t `admin_token` was introduced in Garage `v0.7.2`. `admin_token_file` and the `GARAGE_ADMIN_TOKEN` environment variable are supported since Garage `v0.8.2`. -`GARAGE_ADMIN_TOKEN_FILE` is supported since `v0.9.0`. +`GARAGE_ADMIN_TOKEN_FILE` is supported since `v0.8.5` / `v0.9.1`. -### `trace_sink` +#### `trace_sink` {#admin_trace_sink} Optionally, the address of an OpenTelemetry collector. If specified, Garage will send traces in the OpenTelemetry format to this endpoint. These diff --git a/doc/book/reference-manual/features.md b/doc/book/reference-manual/features.md index 2f8e633a..e8ba9510 100644 --- a/doc/book/reference-manual/features.md +++ b/doc/book/reference-manual/features.md @@ -52,7 +52,7 @@ This is particularly usefull when nodes are far from one another and talk to one Garage supports a variety of replication modes, with 1 copy, 2 copies or 3 copies of your data, and with various levels of consistency, in order to adapt to a variety of usage scenarios. -Read our reference page on [supported replication modes](@/documentation/reference-manual/configuration.md#replication-mode) +Read our reference page on [supported replication modes](@/documentation/reference-manual/configuration.md#replication_mode) to select the replication mode best suited to your use case (hint: in most cases, `replication_mode = "3"` is what you want). ### Web server for static websites diff --git a/doc/book/reference-manual/s3-compatibility.md b/doc/book/reference-manual/s3-compatibility.md index 15b29bd1..1bcfd123 100644 --- a/doc/book/reference-manual/s3-compatibility.md +++ b/doc/book/reference-manual/s3-compatibility.md @@ -75,16 +75,13 @@ but these endpoints are documented in [Red Hat Ceph Storage - Chapter 2. 
Ceph Ob | Endpoint | Garage | [Openstack Swift](https://docs.openstack.org/swift/latest/s3_compat.html) | [Ceph Object Gateway](https://docs.ceph.com/en/latest/radosgw/s3/) | [Riak CS](https://docs.riak.com/riak/cs/2.1.1/references/apis/storage/s3/index.html) | [OpenIO](https://docs.openio.io/latest/source/arch-design/s3_compliancy.html) | |------------------------------|----------------------------------|-----------------|---------------|---------|-----| -| [AbortMultipartUpload](https://docs.aws.amazon.com/AmazonS3/latest/API/API_AbortMultipartUpload.html) | ✅ Implemented | ✅ | ✅ | ✅ | ✅ | -| [CompleteMultipartUpload](https://docs.aws.amazon.com/AmazonS3/latest/API/API_CompleteMultipartUpload.html) | ✅ Implemented (see details below) | ✅ | ✅ | ✅ | ✅ | -| [CreateMultipartUpload](https://docs.aws.amazon.com/AmazonS3/latest/API/API_CreateMultipartUpload.html) | ✅ Implemented | ✅| ✅ | ✅ | ✅ | -| [ListMultipartUpload](https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListMultipartUpload.html) | ✅ Implemented | ✅ | ✅ | ✅ | ✅ | -| [ListParts](https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListParts.html) | ✅ Implemented | ✅ | ✅ | ✅ | ✅ | -| [UploadPart](https://docs.aws.amazon.com/AmazonS3/latest/API/API_UploadPart.html) | ✅ Implemented (see details below) | ✅ | ✅| ✅ | ✅ | -| [UploadPartCopy](https://docs.aws.amazon.com/AmazonS3/latest/API/API_UploadPartCopy.html) | ✅ Implemented | ✅ | ✅ | ✅ | ✅ | - -Our implementation of Multipart Upload is currently a bit more restrictive than Amazon's one in some edge cases. -For more information, please refer to our [issue tracker](https://git.deuxfleurs.fr/Deuxfleurs/garage/issues/204). 
+| [AbortMultipartUpload](https://docs.aws.amazon.com/AmazonS3/latest/API/API_AbortMultipartUpload.html) | ✅ Implemented | ✅ | ✅ | ✅ | ✅ | +| [CompleteMultipartUpload](https://docs.aws.amazon.com/AmazonS3/latest/API/API_CompleteMultipartUpload.html) | ✅ Implemented | ✅ | ✅ | ✅ | ✅ | +| [CreateMultipartUpload](https://docs.aws.amazon.com/AmazonS3/latest/API/API_CreateMultipartUpload.html) | ✅ Implemented | ✅| ✅ | ✅ | ✅ | +| [ListMultipartUpload](https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListMultipartUpload.html) | ✅ Implemented | ✅ | ✅ | ✅ | ✅ | +| [ListParts](https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListParts.html) | ✅ Implemented | ✅ | ✅ | ✅ | ✅ | +| [UploadPart](https://docs.aws.amazon.com/AmazonS3/latest/API/API_UploadPart.html) | ✅ Implemented | ✅ | ✅| ✅ | ✅ | +| [UploadPartCopy](https://docs.aws.amazon.com/AmazonS3/latest/API/API_UploadPartCopy.html) | ✅ Implemented | ✅ | ✅ | ✅ | ✅ | ### Website endpoints @@ -127,15 +124,22 @@ If you need this feature, please [share your use case in our dedicated issue](ht | Endpoint | Garage | [Openstack Swift](https://docs.openstack.org/swift/latest/s3_compat.html) | [Ceph Object Gateway](https://docs.ceph.com/en/latest/radosgw/s3/) | [Riak CS](https://docs.riak.com/riak/cs/2.1.1/references/apis/storage/s3/index.html) | [OpenIO](https://docs.openio.io/latest/source/arch-design/s3_compliancy.html) | |------------------------------|----------------------------------|-----------------|---------------|---------|-----| -| [DeleteBucketLifecycle](https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteBucketLifecycle.html) | ❌ Missing | ❌| ✅| ❌| ✅| -| [GetBucketLifecycleConfiguration](https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetBucketLifecycleConfiguration.html) | ❌ Missing | ❌| ✅ | ❌| ✅| -| [PutBucketLifecycleConfiguration](https://docs.aws.amazon.com/AmazonS3/latest/API/API_PutBucketLifecycleConfiguration.html) | ❌ Missing | ❌| ✅ | ❌| ✅| +| 
[DeleteBucketLifecycle](https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteBucketLifecycle.html) | ✅ Implemented | ❌| ✅| ❌| ✅| +| [GetBucketLifecycleConfiguration](https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetBucketLifecycleConfiguration.html) | ✅ Implemented | ❌| ✅ | ❌| ✅| +| [PutBucketLifecycleConfiguration](https://docs.aws.amazon.com/AmazonS3/latest/API/API_PutBucketLifecycleConfiguration.html) | ⚠ Partially implemented (see below) | ❌| ✅ | ❌| ✅| | [GetBucketVersioning](https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetBucketVersioning.html) | ❌ Stub (see below) | ✅| ✅ | ❌| ✅| | [ListObjectVersions](https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectVersions.html) | ❌ Missing | ❌| ✅ | ❌| ✅| | [PutBucketVersioning](https://docs.aws.amazon.com/AmazonS3/latest/API/API_PutBucketVersioning.html) | ❌ Missing | ❌| ✅| ❌| ✅| +**PutBucketLifecycleConfiguration:** The only actions supported are +`AbortIncompleteMultipartUpload` and `Expiration` (without the +`ExpiredObjectDeleteMarker` field). All other operations are dependent on +either bucket versioning or storage classes which Garage currently does not +implement. The deprecated `Prefix` member directly in the `Rule` +structure/XML tag is not supported; specified prefixes must be inside the +`Filter` structure/XML tag. -**GetBucketVersioning:** Stub implementation (Garage does not yet support versionning so this always returns "versionning not enabled"). +**GetBucketVersioning:** Stub implementation which always returns "versioning not enabled", since Garage does not yet support bucket versioning. 
### Replication endpoints diff --git a/doc/book/working-documents/migration-09.md b/doc/book/working-documents/migration-09.md new file mode 100644 index 00000000..ba758093 --- /dev/null +++ b/doc/book/working-documents/migration-09.md @@ -0,0 +1,72 @@ ++++ +title = "Migrating from 0.8 to 0.9" +weight = 12 ++++ + +**This guide explains how to migrate to 0.9 if you have an existing 0.8 cluster. +We don't recommend trying to migrate to 0.9 directly from 0.7 or older.** + +This migration procedure has been tested on several clusters without issues. +However, it is still a *critical procedure* that might cause issues. +**Make sure to back up all your data before attempting it!** + +You might also want to read our [general documentation on upgrading Garage](@/documentation/operations/upgrading.md). + +The following are **breaking changes** in Garage v0.9 that require your attention when migrating: + +- LMDB is now the default metadata db engine and Sled is deprecated. If you were using Sled, make sure to specify `db_engine = "sled"` in your configuration file, or take the time to [convert your database](https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#db-engine-since-v0-8-0). + +- Capacity values are now in actual byte units. The translation from the old layout will assign 1 capacity = 1Gb by default, which might be wrong for your cluster. This does not cause any data to be moved around, but you might want to re-assign correct capacity values post-migration. + +- Multipart uploads that were started in Garage v0.8 will not be visible in Garage v0.9 and will have to be restarted from scratch. + +- Changes to the admin API: some `v0/` endpoints have been replaced by `v1/` counterparts with updated/uniformized syntax. All other endpoints have also moved to `v1/` by default, without syntax changes, but are still available under `v0/` for compatibility. 
+ + +## Simple migration procedure (takes cluster offline for a while) + +The migration steps are as follows: + +1. Disable API and web access. You may do this by stopping your reverse proxy or by commenting out + the `api_bind_addr` values in your `config.toml` file and restarting Garage. +2. Do `garage repair --all-nodes --yes tables` and `garage repair --all-nodes --yes blocks`, + check the logs and check that all data seems to be synced correctly between + nodes. If you have time, do additional checks (`versions`, `block_refs`, etc.) +3. Check that the block resync queue and Merkle queue are empty: + run `garage stats -a` to query them or inspect metrics in the Grafana dashboard. +4. Turn off Garage v0.8 +5. **Backup the metadata folder of all your nodes!** For instance, use the following command + if your metadata directory is `/var/lib/garage/meta`: `cd /var/lib/garage ; tar -acf meta-v0.8.tar.zst meta/` +6. Install Garage v0.9 +7. Update your configuration file if necessary. +8. Turn on Garage v0.9 +9. Do `garage repair --all-nodes --yes tables` and `garage repair --all-nodes --yes blocks`. + Wait for a full table sync to run. +10. Your upgraded cluster should be in a working state. Re-enable API and Web + access and check that everything went well. +11. Monitor your cluster in the next hours to see if it works well under your production load, report any issue. +12. You might want to assign correct capacity values to all your nodes. Doing so might cause data to be moved + in your cluster, which should also be monitored carefully. + +## Minimal downtime migration procedure + +The migration to Garage v0.9 can be done with almost no downtime, +by restarting all nodes at once in the new version. + +The migration steps are as follows: + +1. Do `garage repair --all-nodes --yes tables` and `garage repair --all-nodes --yes blocks`, + check the logs and check that all data seems to be synced correctly between + nodes. 
If you have time, do additional checks (`versions`, `block_refs`, etc.) + +2. Turn off each node individually; back up its metadata folder (see above); turn it back on again. + This will allow you to take a backup of all nodes without impacting global cluster availability. + You can do all nodes of a single zone at once as this does not impact the availability of Garage. + +3. Prepare your binaries and configuration files for Garage v0.9 + +4. Shut down all v0.8 nodes simultaneously, and restart them all simultaneously in v0.9. + Use your favorite deployment tool (Ansible, Kubernetes, Nomad) to achieve this as fast as possible. + Garage v0.9 should be in a working state as soon as it starts. + +5. Proceed with repair and monitoring as described in steps 9-12 above. |