diff options
34 files changed, 2732 insertions, 180 deletions
diff --git a/doc/api/garage-admin-v1.html b/doc/api/garage-admin-v1.html new file mode 100644 index 00000000..783d459e --- /dev/null +++ b/doc/api/garage-admin-v1.html @@ -0,0 +1,24 @@ +<!DOCTYPE html> +<html> + <head> + <title>Garage Administration API v0</title> + <!-- needed for adaptive design --> + <meta charset="utf-8"/> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <link href="./css/redoc.css" rel="stylesheet"> + + <!-- + Redoc doesn't change outer page styles + --> + <style> + body { + margin: 0; + padding: 0; + } + </style> + </head> + <body> + <redoc spec-url='./garage-admin-v1.yml'></redoc> + <script src="./redoc.standalone.js"> </script> + </body> +</html> diff --git a/doc/api/garage-admin-v1.yml b/doc/api/garage-admin-v1.yml new file mode 100644 index 00000000..fd78feb1 --- /dev/null +++ b/doc/api/garage-admin-v1.yml @@ -0,0 +1,1363 @@ +openapi: 3.0.0 +info: + version: v0.9.0 + title: Garage Administration API v0+garage-v0.9.0 + description: | + Administrate your Garage cluster programmatically, including status, layout, keys, buckets, and maintenance tasks. + + *Disclaimer: The API is not stable yet, hence its v0 tag. The API can change at any time, and changes can include breaking backward compatibility. Read the changelog and upgrade your scripts before upgrading. Additionally, this specification is very early stage and can contain bugs, especially on error return codes/types that are not tested yet. Do not expect a well finished and polished product!* +paths: + /health: + get: + tags: + - Nodes + operationId: "GetHealth" + summary: "Cluster health report" + description: | + Returns the global status of the cluster, the number of connected nodes (over the number of known ones), the number of healthy storage nodes (over the declared ones), and the number of healthy partitions (over the total). 
+ responses: + '500': + description: | + The server can not answer your request because it is in a bad state + '200': + description: | + Information about the queried node, its environment and the current layout + content: + application/json: + schema: + type: object + required: [ status, knownNodes, connectedNodes, storageNodes, storageNodesOk, partitions, partitionsQuorum, partitionsAllOk ] + properties: + status: + type: string + example: "healthy" + knownNodes: + type: integer + format: int64 + example: 4 + connectedNodes: + type: integer + format: int64 + example: 4 + storageNodes: + type: integer + format: int64 + example: 3 + storageNodesOk: + type: integer + format: int64 + example: 3 + partitions: + type: integer + format: int64 + example: 256 + partitionsQuorum: + type: integer + format: int64 + example: 256 + partitionsAllOk: + type: integer + format: int64 + example: 256 + /status: + get: + tags: + - Nodes + operationId: "GetNodes" + summary: "Describe cluster" + description: | + Returns the cluster's current status, including: + - ID of the node being queried and its version of the Garage daemon + - Live nodes + - Currently configured cluster layout + - Staged changes to the cluster layout + + *Capacity is given in bytes* + responses: + '500': + description: | + The server can not answer your request because it is in a bad state + '200': + description: | + Information about the queried node, its environment and the current layout + content: + application/json: + schema: + type: object + required: [ node, garageVersion, garageFeatures, rustVersion, dbEngine, knownNodes, layout ] + properties: + node: + type: string + example: "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f" + garageVersion: + type: string + example: "v0.9.0" + garageFeatures: + type: array + items: + type: string + example: + - "k2v" + - "sled" + - "lmdb" + - "sqlite" + - "consul-discovery" + - "kubernetes-discovery" + - "metrics" + - "telemetry-otlp" + - 
"bundled-libs" + rustVersion: + type: string + example: "1.68.0" + dbEngine: + type: string + example: "LMDB (using Heed crate)" + knownNodes: + type: array + example: + - id: "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f" + addr: "10.0.0.11:3901" + isUp: true + lastSeenSecsAgo: 9 + hostname: orion + - id: "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff" + addr: "10.0.0.12:3901" + isUp: true + lastSeenSecsAgo: 13 + hostname: pegasus + - id: "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b" + addr: "10.0.0.13:3901" + isUp: true + lastSeenSecsAgo: 2 + hostname: neptune + items: + $ref: '#/components/schemas/NodeNetworkInfo' + layout: + $ref: '#/components/schemas/ClusterLayout' + + /connect: + post: + tags: + - Nodes + operationId: "AddNode" + summary: "Connect a new node" + description: | + Instructs this Garage node to connect to other Garage nodes at specified `<node_id>@<net_address>`. `node_id` is generated automatically on node start. + requestBody: + required: true + content: + application/json: + schema: + type: array + example: + - "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f@10.0.0.11:3901" + - "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff@10.0.0.12:3901" + items: + type: string + + responses: + '500': + description: | + The server can not answer your request because it is in a bad state + '400': + description: | + Your request is malformed, check your JSON + '200': + description: | + The request has been handled correctly but it does not mean that all connection requests succeeded; some might have fail, you need to check the body! 
+ content: + application/json: + schema: + type: array + example: + - success: true + error: + - success: false + error: "Handshake error" + items: + type: object + properties: + success: + type: boolean + example: true + error: + type: string + nullable: true + example: null + + /layout: + get: + tags: + - Layout + operationId: "GetLayout" + summary: "Details on the current and staged layout" + description: | + Returns the cluster's current layout, including: + - Currently configured cluster layout + - Staged changes to the cluster layout + + *Capacity is given in bytes* + *The info returned by this endpoint is a subset of the info returned by `GET /status`.* + responses: + '500': + description: | + The server can not answer your request because it is in a bad state + '200': + description: | + Returns the cluster's current layout: + - Currently configured cluster layout + - Staged changes to the cluster layout + content: + application/json: + schema: + $ref: '#/components/schemas/ClusterLayout' + + post: + tags: + - Layout + operationId: "AddLayout" + summary: "Send modifications to the cluster layout" + description: | + Send modifications to the cluster layout. These modifications will be included in the staged role changes, visible in subsequent calls of `GET /layout`. Once the set of staged changes is satisfactory, the user may call `POST /layout/apply` to apply the staged changes, or `POST /layout/revert` to clear all of the staged changes in the layout. + + Setting the capacity to `null` will configure the node as a gateway. + Otherwise, capacity must now be set in bytes (before Garage 0.9 it was arbitrary weights). + For example to declare 100GB, you must set `capacity: 100000000000`. + + Garage internally uses the International System of Units (SI): it assumes that 1kB = 1000 bytes, and displays storage as kB, MB, GB (and not KiB, MiB, GiB that assume 1KiB = 1024 bytes). 
+ requestBody: + description: | + To add a new node to the layout or to change the configuration of an existing node, simply set the values you want (`zone`, `capacity`, and `tags`). + To remove a node, simply pass the `remove: true` field. + This logic is represented in OpenAPI with a "One Of" object. + + Contrary to the CLI that may update only a subset of the fields capacity, zone and tags, when calling this API all of these values must be specified. + required: true + content: + application/json: + schema: + type: array + example: + - id: "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b" + zone: "geneva" + capacity: 100000000000 + tags: + - gateway + - id: "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff" + remove: true + items: + $ref: '#/components/schemas/NodeRoleChange' + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." + '400': + description: "Invalid syntax or requested change" + '200': + description: "The layout modification has been correctly staged" + content: + application/json: + schema: + $ref: '#/components/schemas/ClusterLayout' + + /layout/apply: + post: + tags: + - Layout + operationId: "ApplyLayout" + summary: "Apply staged layout" + description: | + Applies to the cluster the layout changes currently registered as staged layout changes. + + *Note: do not try to parse the `message` field of the response, it is given as an array of string specifically because its format is not stable.* + requestBody: + description: | + Similarly to the CLI, the body must include the version of the new layout that will be created, which MUST be 1 + the value of the currently existing layout in the cluster. + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/LayoutVersion' + + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." 
+ '400': + description: "Invalid syntax or requested change" + '200': + description: "The staged layout has been applied as the new layout of the cluster, a rebalance has been triggered." + content: + application/json: + schema: + type: object + required: [ message, layout ] + properties: + message: + type: array + items: + type: string + example: + - "==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====" + - "" + - "Partitions are replicated 1 times on at least 1 distinct zones." + - "" + - "Optimal partition size: 419.4 MB (3 B in previous layout)" + - "Usable capacity / total cluster capacity: 107.4 GB / 107.4 GB (100.0 %)" + - "Effective capacity (replication factor 1): 107.4 GB" + - "" + - "A total of 0 new copies of partitions need to be transferred." + - "" + - "dc1 Tags Partitions Capacity Usable capacity\n 6a8e08af2aab1083 a,v 256 (0 new) 107.4 GB 107.4 GB (100.0%)\n TOTAL 256 (256 unique) 107.4 GB 107.4 GB (100.0%)\n\n" + layout: + $ref: '#/components/schemas/ClusterLayout' + + + /layout/revert: + post: + tags: + - Layout + operationId: "RevertLayout" + summary: "Clear staged layout" + description: | + Clears all of the staged layout changes. + requestBody: + description: | + Reverting the staged changes is done by incrementing the version number and clearing the contents of the staged change list. Similarly to the CLI, the body must include the incremented version number, which MUST be 1 + the value of the currently existing layout in the cluster. + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/LayoutVersion' + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." + '400': + description: "Invalid syntax or requested change" + '200': + description: "The staged layout has been cleared, you can start again sending modification from a fresh copy with `POST /layout`." 
+ + "/key?list": + get: + tags: + - Key + operationId: "ListKeys" + summary: "List all keys" + description: | + Returns all API access keys in the cluster. + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." + '200': + description: | + Returns the key identifier (aka `AWS_ACCESS_KEY_ID`) and its associated, human friendly, name if any (otherwise return an empty string) + content: + application/json: + schema: + type: array + example: + - id: "GK31c2f218a2e44f485b94239e" + name: "test-key" + - id: "GKe10061ac9c2921f09e4c5540" + name: "" + items: + type: object + required: [ id ] + properties: + id: + type: string + name: + type: string + post: + tags: + - Key + operationId: "AddKey" + summary: "Create a new API key" + description: | + Creates a new API access key. + requestBody: + description: | + You can set a friendly name for this key. + If you don't want to, you can set the name to `null`. + + *Note: the secret key is returned in the response.* + required: true + content: + application/json: + schema: + type: object + properties: + name: + type: string + nullable: true + example: "test-key" + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." + '400': + description: "Invalid syntax or requested change" + '200': + description: "The key has been added" + content: + application/json: + schema: + $ref: '#/components/schemas/KeyInfo' + + "/key": + get: + tags: + - Key + operationId: "GetKey" + summary: "Get key information" + description: | + Return information about a specific key like its identifiers, its permissions and buckets on which it has permissions. + You can search by specifying the exact key identifier (`id`) or by specifying a pattern (`search`). + + For confidentiality reasons, the secret key is not returned by default: you must pass the `showSecretKey` query parameter to get it. 
+ parameters: + - name: id + in: query + description: | + The exact API access key generated by Garage. + + Incompatible with `search`. + example: "GK31c2f218a2e44f485b94239e" + schema: + type: string + - name: search + in: query + description: | + A pattern (beginning or full string) corresponding to a key identifier or friendly name. + + Incompatible with `id`. + example: "test-k" + schema: + type: string + - name: showSecretKey + in: query + schema: + type: string + default: "false" + enum: + - "true" + - "false" + example: "true" + required: false + description: "Whether or not the secret key should be returned in the response" + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." + '200': + description: | + Returns information about the key + content: + application/json: + schema: + $ref: '#/components/schemas/KeyInfo' + + delete: + tags: + - Key + operationId: "DeleteKey" + summary: "Delete a key" + description: | + Delete a key from the cluster. Its access will be removed from all the buckets. Buckets are not automatically deleted and can be dangling. You should manually delete them beforehand. + parameters: + - name: id + in: query + required: true + description: "The exact API access key generated by Garage" + example: "GK31c2f218a2e44f485b94239e" + schema: + type: string + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." + '200': + description: "The key has been deleted" + + + post: + tags: + - Key + operationId: "UpdateKey" + summary: "Update a key" + description: | + Updates information about the specified API access key. 
+ + *Note: the secret key is not returned in the response, `null` is sent instead.* + parameters: + - name: id + in: query + required: true + description: "The exact API access key generated by Garage" + example: "GK31c2f218a2e44f485b94239e" + schema: + type: string + requestBody: + description: | + For a given key, provide a first set with the permissions to grant, and a second set with the permissions to remove + required: true + content: + application/json: + schema: + type: object + properties: + name: + type: string + example: "test-key" + allow: + type: object + example: + properties: + createBucket: + type: boolean + example: true + deny: + type: object + properties: + createBucket: + type: boolean + example: true + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." + '400': + description: "Invalid syntax or requested change" + '200': + description: | + Returns information about the key + content: + application/json: + schema: + $ref: '#/components/schemas/KeyInfo' + + + /key/import: + post: + tags: + - Key + operationId: "ImportKey" + summary: "Import an existing key" + description: | + Imports an existing API key. This feature must only be used for migrations and backup restore. + + **Do not use it to generate custom key identifiers or you will break your Garage cluster.** + requestBody: + description: | + Information on the key to import + required: true + content: + application/json: + schema: + type: object + required: [ name, accessKeyId, secretAccessKey ] + properties: + name: + type: string + example: "test-key" + nullable: true + accessKeyId: + type: string + example: "GK31c2f218a2e44f485b94239e" + secretAccessKey: + type: string + example: "b892c0665f0ada8a4755dae98baa3b133590e11dae3bcc1f9d769d67f16c3835" + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." 
+ '400': + description: "Invalid syntax or requested change" + '200': + description: "The key has been imported into the system" + content: + application/json: + schema: + $ref: '#/components/schemas/KeyInfo' + + "/bucket?list": + get: + tags: + - Bucket + operationId: "ListBuckets" + summary: "List all buckets" + description: | + List all the buckets on the cluster with their UUID and their global and local aliases. + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." + '200': + description: | + Returns the UUID of the bucket and all its aliases + content: + application/json: + schema: + type: array + example: + - id: "70dc3bed7fe83a75e46b66e7ddef7d56e65f3c02f9f80b6749fb97eccb5e1033" + globalAliases: + - "container_registry" + - id: "96470e0df00ec28807138daf01915cfda2bee8eccc91dea9558c0b4855b5bf95" + localAliases: + - alias: "my_documents" + accessKeyid: "GK31c2f218a2e44f485b94239e" + - id: "d7452a935e663fc1914f3a5515163a6d3724010ce8dfd9e4743ca8be5974f995" + globalAliases: + - "example.com" + - "www.example.com" + localAliases: + - alias: "corp_website" + accessKeyId: "GKe10061ac9c2921f09e4c5540" + - alias: "web" + accessKeyid: "GK31c2f218a2e44f485b94239e" + - id: "" + items: + type: object + required: [ id ] + properties: + id: + type: string + globalAliases: + type: array + items: + type: string + localAliases: + type: array + items: + type: object + required: [ alias, accessKeyId ] + properties: + alias: + type: string + accessKeyId: + type: string + + /bucket: + post: + tags: + - Bucket + operationId: "CreateBucket" + summary: "Create a bucket" + description: | + Creates a new bucket, either with a global alias, a local one, or no alias at all. + Technically, you can also specify both `globalAlias` and `localAlias` and that would create two aliases. 
+ requestBody: + description: | + Aliases to put on the new bucket + required: true + content: + application/json: + schema: + type: object + required: [ ] + properties: + globalAlias: + type: string + example: "my_documents" + localAlias: + type: object + properties: + accessKeyId: + type: string + alias: + type: string + allow: + type: object + properties: + read: + type: boolean + example: true + write: + type: boolean + example: true + owner: + type: boolean + example: true + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." + '400': + description: "The payload is not formatted correctly" + '200': + description: Returns exhaustive information about the bucket + content: + application/json: + schema: + $ref: '#/components/schemas/BucketInfo' + get: + tags: + - Bucket + operationId: "GetBucketInfo" + summary: "Get a bucket" + description: | + Given a bucket identifier (`id`) or a global alias (`alias`), get its information. + It includes its aliases, its web configuration, keys that have some permissions + on it, some statistics (number of objects, size), number of dangling multipart uploads, + and its quotas (if any). + parameters: + - name: id + in: query + description: | + The exact bucket identifier, a 32 bytes hexadecimal string. + + Incompatible with `alias`. + example: "b4018dc61b27ccb5c64ec1b24f53454bbbd180697c758c4d47a22a8921864a87" + schema: + type: string + - name: alias + in: query + description: | + The exact global alias of one of the existing buckets. + + Incompatible with `id`. + example: "my_documents" + schema: + type: string + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." 
+ '404': + description: "Bucket not found" + '200': + description: Returns exhaustive information about the bucket + content: + application/json: + schema: + $ref: '#/components/schemas/BucketInfo' + + + delete: + tags: + - Bucket + operationId: "DeleteBucket" + summary: "Delete a bucket" + description: | + Deletes a storage bucket. A bucket cannot be deleted if it is not empty. + + **Warning:** this will delete all aliases associated with the bucket! + parameters: + - name: id + in: query + required: true + description: "The exact bucket identifier, a 32 bytes hexadecimal string" + example: "b4018dc61b27ccb5c64ec1b24f53454bbbd180697c758c4d47a22a8921864a87" + schema: + type: string + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." + '400': + description: "Bucket is not empty" + '404': + description: "Bucket not found" + '204': + description: Bucket has been deleted + + + + put: + tags: + - Bucket + operationId: "UpdateBucket" + summary: "Update a bucket" + description: | + All fields (`websiteAccess` and `quotas`) are optional. + If they are present, the corresponding modifications are applied to the bucket, otherwise nothing is changed. + + In `websiteAccess`: if `enabled` is `true`, `indexDocument` must be specified. + The field `errorDocument` is optional, if no error document is set a generic + error message is displayed when errors happen. Conversely, if `enabled` is + `false`, neither `indexDocument` nor `errorDocument` must be specified. + + In `quotas`: new values of `maxSize` and `maxObjects` must both be specified, or set to `null` + to remove the quotas. An absent value will be considered the same as a `null`. It is not possible + to change only one of the two quotas. 
+ parameters: + - name: id + in: query + required: true + description: "The exact bucket identifier, a 32 bytes hexadecimal string" + example: "b4018dc61b27ccb5c64ec1b24f53454bbbd180697c758c4d47a22a8921864a87" + schema: + type: string + requestBody: + description: | + Requested changes on the bucket. Both root fields are optional. + required: true + content: + application/json: + schema: + type: object + required: [ ] + properties: + websiteAccess: + type: object + properties: + enabled: + type: boolean + example: true + indexDocument: + type: string + example: "index.html" + errorDocument: + type: string + example: "error/400.html" + quotas: + type: object + properties: + maxSize: + type: integer + format: int64 + nullable: true + example: 19029801 + maxObjects: + type: integer + format: int64 + nullable: true + example: null + + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." + '400': + description: "Bad request, check your body." + '404': + description: "Bucket not found" + '200': + description: Returns exhaustive information about the bucket + content: + application/json: + schema: + $ref: '#/components/schemas/BucketInfo' + + /bucket/allow: + post: + tags: + - Bucket + operationId: "AllowBucketKey" + summary: "Allow key" + description: | + ⚠️ **DISCLAIMER**: Garage's developers are aware that this endpoint has an unconventional semantic. Be extra careful when implementing it, its behavior is not obvious. + + Allows a key to do read/write/owner operations on a bucket. + + Flags in permissions which have the value true will be activated. Other flags will remain unchanged (i.e. they will keep their internal value). + + For example, if you set read to true, the key will be allowed to read the bucket. + If you set it to false, the key will keep its previous read permission. + If you want to disallow read for the key, check the DenyBucketKey operation. 
+ + requestBody: + description: | + The bucket, the access key, and the permission flags to grant + required: true + content: + application/json: + schema: + type: object + required: [ bucketId, accessKeyId, permissions ] + properties: + bucketId: + type: string + example: "e6a14cd6a27f48684579ec6b381c078ab11697e6bc8513b72b2f5307e25fff9b" + accessKeyId: + type: string + example: "GK31c2f218a2e44f485b94239e" + permissions: + type: object + required: [ read, write, owner ] + properties: + read: + type: boolean + example: true + write: + type: boolean + example: true + owner: + type: boolean + example: true + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." + '400': + description: "Bad request, check your request body" + '404': + description: "Bucket not found" + '200': + description: Returns exhaustive information about the bucket + content: + application/json: + schema: + $ref: '#/components/schemas/BucketInfo' + + /bucket/deny: + post: + tags: + - Bucket + operationId: "DenyBucketKey" + summary: "Deny key" + description: | + ⚠️ **DISCLAIMER**: Garage's developers are aware that this endpoint has an unconventional semantic. Be extra careful when implementing it, its behavior is not obvious. + + Denies a key from doing read/write/owner operations on a bucket. + + Flags in permissions which have the value true will be deactivated. Other flags will remain unchanged. + + For example, if you set read to true, the key will be denied from reading. + If you set read to false, the key will keep its previous permissions. + If you want the key to have the reading permission, check the AllowBucketKey operation. 
+ + requestBody: + description: | + The bucket, the access key, and the permission flags to revoke + required: true + content: + application/json: + schema: + type: object + required: [ bucketId, accessKeyId, permissions ] + properties: + bucketId: + type: string + example: "e6a14cd6a27f48684579ec6b381c078ab11697e6bc8513b72b2f5307e25fff9b" + accessKeyId: + type: string + example: "GK31c2f218a2e44f485b94239e" + permissions: + type: object + required: [ read, write, owner ] + properties: + read: + type: boolean + example: true + write: + type: boolean + example: true + owner: + type: boolean + example: true + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." + '400': + description: "Bad request, check your request body" + '404': + description: "Bucket not found" + '200': + description: Returns exhaustive information about the bucket + content: + application/json: + schema: + $ref: '#/components/schemas/BucketInfo' + + /bucket/alias/global: + put: + tags: + - Bucket + operationId: "PutBucketGlobalAlias" + summary: "Add a global alias" + description: | + Add a global alias to the target bucket + parameters: + - name: id + in: query + required: true + schema: + type: string + example: e6a14cd6a27f48684579ec6b381c078ab11697e6bc8513b72b2f5307e25fff9b + - name: alias + in: query + required: true + example: my_documents + schema: + type: string + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." 
+ '400': + description: "Bad request, check your request body" + '404': + description: "Bucket not found" + '200': + description: Returns exhaustive information about the bucket + content: + application/json: + schema: + $ref: '#/components/schemas/BucketInfo' + + delete: + tags: + - Bucket + operationId: "DeleteBucketGlobalAlias" + summary: "Delete a global alias" + description: | + Delete a global alias from the target bucket + parameters: + - name: id + in: query + required: true + schema: + type: string + example: e6a14cd6a27f48684579ec6b381c078ab11697e6bc8513b72b2f5307e25fff9b + - name: alias + in: query + required: true + schema: + type: string + example: my_documents + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." + '400': + description: "Bad request, check your request body" + '404': + description: "Bucket not found" + '200': + description: Returns exhaustive information about the bucket + content: + application/json: + schema: + $ref: '#/components/schemas/BucketInfo' + + /bucket/alias/local: + put: + tags: + - Bucket + operationId: "PutBucketLocalAlias" + summary: "Add a local alias" + description: | + Add a local alias, bound to specified account, to the target bucket + parameters: + - name: id + in: query + required: true + schema: + type: string + example: e6a14cd6a27f48684579ec6b381c078ab11697e6bc8513b72b2f5307e25fff9b + - name: accessKeyId + in: query + required: true + schema: + type: string + example: GK31c2f218a2e44f485b94239e + - name: alias + in: query + required: true + schema: + type: string + example: my_documents + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." 
+ '400': + description: "Bad request, check your request body" + '404': + description: "Bucket not found" + '200': + description: Returns exhaustive information about the bucket + content: + application/json: + schema: + $ref: '#/components/schemas/BucketInfo' + + delete: + tags: + - Bucket + operationId: "DeleteBucketLocalAlias" + summary: "Delete a local alias" + description: | + Delete a local alias, bound to specified account, from the target bucket + parameters: + - name: id + in: query + required: true + schema: + type: string + example: e6a14cd6a27f48684579ec6b381c078ab11697e6bc8513b72b2f5307e25fff9b + - name: accessKeyId + in: query + schema: + type: string + required: true + example: GK31c2f218a2e44f485b94239e + - name: alias + in: query + schema: + type: string + required: true + example: my_documents + responses: + '500': + description: "The server can not handle your request. Check your connectivity with the rest of the cluster." + '400': + description: "Bad request, check your request body" + '404': + description: "Bucket not found" + '200': + description: Returns exhaustive information about the bucket + content: + application/json: + schema: + $ref: '#/components/schemas/BucketInfo' + +components: + securitySchemes: + bearerAuth: + type: http + scheme: bearer + schemas: + NodeNetworkInfo: + type: object + required: [ addr, isUp, lastSeenSecsAgo, hostname ] + properties: + id: + type: string + example: "6a8e08af2aab1083ebab9b22165ea8b5b9d333b60a39ecd504e85cc1f432c36f" + addr: + type: string + example: "10.0.0.11:3901" + isUp: + type: boolean + example: true + lastSeenSecsAgo: + type: integer + nullable: true + example: 9 + hostname: + type: string + example: "node1" + NodeClusterInfo: + type: object + required: [ id, zone, tags ] + properties: + zone: + type: string + example: dc1 + capacity: + type: integer + format: int64 + nullable: true + example: 4 + tags: + type: array + description: | + User defined tags, put whatever makes sense for you, these 
tags are not interpreted by Garage + example: + - gateway + - fast + items: + type: string + NodeRoleChange: + oneOf: + - $ref: '#/components/schemas/NodeRoleRemove' + - $ref: '#/components/schemas/NodeRoleUpdate' + NodeRoleRemove: + type: object + required: [ id, remove ] + properties: + id: + type: string + example: "6a8e08af2aab1083ebab9b22165ea8b5b9d333b60a39ecd504e85cc1f432c36f" + remove: + type: boolean + example: true + NodeRoleUpdate: + type: object + required: [ id, zone, capacity, tags ] + properties: + id: + type: string + example: "6a8e08af2aab1083ebab9b22165ea8b5b9d333b60a39ecd504e85cc1f432c36f" + zone: + type: string + example: "dc1" + capacity: + type: integer + format: int64 + nullable: true + example: 150000000000 + tags: + type: array + items: + type: string + example: + - gateway + - fast + + ClusterLayout: + type: object + required: [ version, roles, stagedRoleChanges ] + properties: + version: + type: integer + example: 12 + roles: + type: array + example: + - id: "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f" + zone: "madrid" + capacity: 300000000000 + tags: + - fast + - amd64 + - id: "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff" + zone: "geneva" + capacity: 700000000000 + tags: + - arm64 + items: + $ref: '#/components/schemas/NodeClusterInfo' + stagedRoleChanges: + type: array + example: + - id: "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b" + zone: "geneva" + capacity: 800000000000 + tags: + - gateway + - id: "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff" + remove: true + items: + $ref: '#/components/schemas/NodeRoleChange' + LayoutVersion: + type: object + required: [ version ] + properties: + version: + type: integer + #format: int64 + example: 13 + + KeyInfo: + type: object + properties: + name: + type: string + example: "test-key" + accessKeyId: + type: string + example: "GK31c2f218a2e44f485b94239e" + secretAccessKey: + type: string + nullable: true + 
example: "b892c0665f0ada8a4755dae98baa3b133590e11dae3bcc1f9d769d67f16c3835" + permissions: + type: object + properties: + createBucket: + type: boolean + example: false + buckets: + type: array + items: + type: object + properties: + id: + type: string + example: "70dc3bed7fe83a75e46b66e7ddef7d56e65f3c02f9f80b6749fb97eccb5e1033" + globalAliases: + type: array + items: + type: string + example: "my-bucket" + localAliases: + type: array + items: + type: string + example: "GK31c2f218a2e44f485b94239e:localname" + permissions: + type: object + properties: + read: + type: boolean + example: true + write: + type: boolean + example: true + owner: + type: boolean + example: false + BucketInfo: + type: object + properties: + id: + type: string + example: afa8f0a22b40b1247ccd0affb869b0af5cff980924a20e4b5e0720a44deb8d39 + globalAliases: + type: array + items: + type: string + example: "my_documents" + websiteAccess: + type: boolean + example: true + websiteConfig: + type: object + nullable: true + properties: + indexDocument: + type: string + example: "index.html" + errorDocument: + type: string + example: "error/400.html" + keys: + type: array + items: + $ref: '#/components/schemas/BucketKeyInfo' + objects: + type: integer + format: int64 + example: 14827 + bytes: + type: integer + format: int64 + example: 13189855625 + unfinishedUploads: + type: integer + example: 0 + quotas: + type: object + properties: + maxSize: + nullable: true + type: integer + format: int64 + example: null + maxObjects: + nullable: true + type: integer + format: int64 + example: null + + + BucketKeyInfo: + type: object + properties: + accessKeyId: + type: string + name: + type: string + permissions: + type: object + properties: + read: + type: boolean + example: true + write: + type: boolean + example: true + owner: + type: boolean + example: true + bucketLocalAliases: + type: array + items: + type: string + example: "my_documents" + + +security: + - bearerAuth: [] + +servers: + - description: A local 
server + url: http://localhost:3903/v1/ diff --git a/doc/book/build/golang.md b/doc/book/build/golang.md index a508260e..f3f28a40 100644 --- a/doc/book/build/golang.md +++ b/doc/book/build/golang.md @@ -37,30 +37,84 @@ import ( "context" "fmt" "os" + "strings" garage "git.deuxfleurs.fr/garage-sdk/garage-admin-sdk-golang" ) func main() { - // Set Host and other parameters + // Initialization configuration := garage.NewConfiguration() configuration.Host = "127.0.0.1:3903" - - - // We can now generate a client client := garage.NewAPIClient(configuration) - - // Authentication is handled through the context pattern ctx := context.WithValue(context.Background(), garage.ContextAccessToken, "s3cr3t") - // Send a request - resp, r, err := client.NodesApi.GetNodes(ctx).Execute() - if err != nil { - fmt.Fprintf(os.Stderr, "Error when calling `NodesApi.GetNodes``: %v\n", err) - fmt.Fprintf(os.Stderr, "Full HTTP response: %v\n", r) + // Nodes + fmt.Println("--- nodes ---") + nodes, _, _ := client.NodesApi.GetNodes(ctx).Execute() + fmt.Fprintf(os.Stdout, "First hostname: %v\n", nodes.KnownNodes[0].Hostname) + capa := int64(1000000000) + change := []garage.NodeRoleChange{ + garage.NodeRoleChange{NodeRoleUpdate: &garage.NodeRoleUpdate { + Id: *nodes.KnownNodes[0].Id, + Zone: "dc1", + Capacity: *garage.NewNullableInt64(&capa), + Tags: []string{ "fast", "amd64" }, + }}, } - - // Process the response - fmt.Fprintf(os.Stdout, "Target hostname: %v\n", resp.KnownNodes[resp.Node].Hostname) + staged, _, _ := client.LayoutApi.AddLayout(ctx).NodeRoleChange(change).Execute() + msg, _, _ := client.LayoutApi.ApplyLayout(ctx).LayoutVersion(*garage.NewLayoutVersion(staged.Version + 1)).Execute() + fmt.Printf(strings.Join(msg.Message, "\n")) // Layout configured + + health, _, _ := client.NodesApi.GetHealth(ctx).Execute() + fmt.Printf("Status: %s, nodes: %v/%v, storage: %v/%v, partitions: %v/%v\n", health.Status, health.ConnectedNodes, health.KnownNodes, health.StorageNodesOk, 
health.StorageNodes, health.PartitionsAllOk, health.Partitions) + + // Key + fmt.Println("\n--- key ---") + key := "openapi-key" + keyInfo, _, _ := client.KeyApi.AddKey(ctx).AddKeyRequest(garage.AddKeyRequest{Name: *garage.NewNullableString(&key) }).Execute() + defer client.KeyApi.DeleteKey(ctx).Id(*keyInfo.AccessKeyId).Execute() + fmt.Printf("AWS_ACCESS_KEY_ID=%s\nAWS_SECRET_ACCESS_KEY=%s\n", *keyInfo.AccessKeyId, *keyInfo.SecretAccessKey.Get()) + + id := *keyInfo.AccessKeyId + canCreateBucket := true + updateKeyRequest := *garage.NewUpdateKeyRequest() + updateKeyRequest.SetName("openapi-key-updated") + updateKeyRequest.SetAllow(garage.UpdateKeyRequestAllow { CreateBucket: &canCreateBucket }) + update, _, _ := client.KeyApi.UpdateKey(ctx).Id(id).UpdateKeyRequest(updateKeyRequest).Execute() + fmt.Printf("Updated %v with key name %v\n", *update.AccessKeyId, *update.Name) + + keyList, _, _ := client.KeyApi.ListKeys(ctx).Execute() + fmt.Printf("Keys count: %v\n", len(keyList)) + + // Bucket + fmt.Println("\n--- bucket ---") + global_name := "global-ns-openapi-bucket" + local_name := "local-ns-openapi-bucket" + bucketInfo, _, _ := client.BucketApi.CreateBucket(ctx).CreateBucketRequest(garage.CreateBucketRequest{ + GlobalAlias: &global_name, + LocalAlias: &garage.CreateBucketRequestLocalAlias { + AccessKeyId: keyInfo.AccessKeyId, + Alias: &local_name, + }, + }).Execute() + defer client.BucketApi.DeleteBucket(ctx).Id(*bucketInfo.Id).Execute() + fmt.Printf("Bucket id: %s\n", *bucketInfo.Id) + + updateBucketRequest := *garage.NewUpdateBucketRequest() + website := garage.NewUpdateBucketRequestWebsiteAccess() + website.SetEnabled(true) + website.SetIndexDocument("index.html") + website.SetErrorDocument("errors/4xx.html") + updateBucketRequest.SetWebsiteAccess(*website) + quotas := garage.NewUpdateBucketRequestQuotas() + quotas.SetMaxSize(1000000000) + quotas.SetMaxObjects(999999999) + updateBucketRequest.SetQuotas(*quotas) + updatedBucket, _, _ := 
client.BucketApi.UpdateBucket(ctx).Id(*bucketInfo.Id).UpdateBucketRequest(updateBucketRequest).Execute() + fmt.Printf("Bucket %v website activation: %v\n", *updatedBucket.Id, *updatedBucket.WebsiteAccess) + + bucketList, _, _ := client.BucketApi.ListBuckets(ctx).Execute() + fmt.Printf("Bucket count: %v\n", len(bucketList)) } ``` diff --git a/doc/book/build/javascript.md b/doc/book/build/javascript.md index ff009ffe..a065c595 100644 --- a/doc/book/build/javascript.md +++ b/doc/book/build/javascript.md @@ -31,9 +31,9 @@ npm install --save git+https://git.deuxfleurs.fr/garage-sdk/garage-admin-sdk-js. A short example: ```javascript -const garage = require('garage_administration_api_v0garage_v0_8_0'); +const garage = require('garage_administration_api_v1garage_v0_9_0'); -const api = new garage.ApiClient("http://127.0.0.1:3903/v0"); +const api = new garage.ApiClient("http://127.0.0.1:3903/v1"); api.authentications['bearerAuth'].accessToken = "s3cr3t"; const [node, layout, key, bucket] = [ diff --git a/doc/book/build/python.md b/doc/book/build/python.md index 5b797897..896c99d3 100644 --- a/doc/book/build/python.md +++ b/doc/book/build/python.md @@ -80,7 +80,7 @@ from garage_admin_sdk.apis import * from garage_admin_sdk.models import * configuration = garage_admin_sdk.Configuration( - host = "http://localhost:3903/v0", + host = "http://localhost:3903/v1", access_token = "s3cr3t" ) @@ -94,13 +94,14 @@ print(f"running garage {status.garage_version}, node_id {status.node}") # Change layout of this node current = layout.get_layout() -layout.add_layout({ - status.node: NodeClusterInfo( +layout.add_layout([ + NodeRoleChange( + id = status.node, zone = "dc1", - capacity = 1, + capacity = 1000000000, tags = [ "dev" ], ) -}) +]) layout.apply_layout(LayoutVersion( version = current.version + 1 )) diff --git a/doc/book/cookbook/exposing-websites.md b/doc/book/cookbook/exposing-websites.md index 5f6a5a28..9382a541 100644 --- a/doc/book/cookbook/exposing-websites.md +++ 
b/doc/book/cookbook/exposing-websites.md @@ -38,7 +38,7 @@ Our website serving logic is as follow: Now we need to infer the URL of your website through your bucket name. Let assume: - - we set `root_domain = ".web.example.com"` in `garage.toml` ([ref](@/documentation/reference-manual/configuration.md#root_domain)) + - we set `root_domain = ".web.example.com"` in `garage.toml` ([ref](@/documentation/reference-manual/configuration.md#web_root_domain)) - our bucket name is `garagehq.deuxfleurs.fr`. Our bucket will be served if the Host field matches one of these 2 values (the port is ignored): diff --git a/doc/book/operations/layout.md b/doc/book/operations/layout.md index ece17ddb..ee05aba1 100644 --- a/doc/book/operations/layout.md +++ b/doc/book/operations/layout.md @@ -12,7 +12,7 @@ An introduction to building cluster layouts can be found in the [production depl In Garage, all of the data that can be stored in a given cluster is divided into slices which we call *partitions*. Each partition is stored by one or several nodes in the cluster -(see [`replication_mode`](@/documentation/reference-manual/configuration.md#replication-mode)). +(see [`replication_mode`](@/documentation/reference-manual/configuration.md#replication_mode)). The layout determines the correspondence between these partition, which exist on a logical level, and actual storage nodes. diff --git a/doc/book/reference-manual/admin-api.md b/doc/book/reference-manual/admin-api.md index 6932ac60..15630788 100644 --- a/doc/book/reference-manual/admin-api.md +++ b/doc/book/reference-manual/admin-api.md @@ -13,8 +13,11 @@ We will bump the version numbers prefixed to each API endpoint at each time the or semantics change, meaning that code that relies on these endpoint will break when changes are introduced. -The Garage administration API was introduced in version 0.7.2, this document -does not apply to older versions of Garage. 
+Versions: + - Before Garage 0.7.2 - no admin API + - Garage 0.7.2 - admin APIv0 + - Garage 0.9.0 - admin APIv1, deprecate admin APIv0 + ## Access control @@ -131,7 +134,9 @@ $ curl -so /dev/null -w "%{http_code}" http://localhost:3903/check?domain=exampl ### Cluster operations -These endpoints are defined on a dedicated [Redocly page](https://garagehq.deuxfleurs.fr/api/garage-admin-v0.html). You can also download its [OpenAPI specification](https://garagehq.deuxfleurs.fr/api/garage-admin-v0.yml). +These endpoints have a dedicated OpenAPI spec. + - APIv1 - [HTML spec](https://garagehq.deuxfleurs.fr/api/garage-admin-v1.html) - [OpenAPI YAML](https://garagehq.deuxfleurs.fr/api/garage-admin-v1.yml) + - APIv0 (deprecated) - [HTML spec](https://garagehq.deuxfleurs.fr/api/garage-admin-v0.html) - [OpenAPI YAML](https://garagehq.deuxfleurs.fr/api/garage-admin-v0.yml) Requesting the API from the command line can be as simple as running: diff --git a/doc/book/reference-manual/configuration.md b/doc/book/reference-manual/configuration.md index 1ac681cf..18d160bb 100644 --- a/doc/book/reference-manual/configuration.md +++ b/doc/book/reference-manual/configuration.md @@ -8,6 +8,8 @@ weight = 20 Here is an example `garage.toml` configuration file that illustrates all of the possible options: ```toml +replication_mode = "3" + metadata_dir = "/var/lib/garage/meta" data_dir = "/var/lib/garage/data" metadata_fsync = true @@ -21,8 +23,6 @@ sled_cache_capacity = "128MiB" sled_flush_every_ms = 2000 lmdb_map_size = "1T" -replication_mode = "3" - compression_level = 1 rpc_secret = "4425f5c26c5e11581d3223904324dcb5b5d5dfb14e5e7f35e38c595424f5f1e6" @@ -77,7 +77,147 @@ The following gives details about each available configuration option. 
## Available configuration options -### `metadata_dir` +### Index + +Top-level configuration options: +[`block_size`](#block_size), +[`bootstrap_peers`](#bootstrap_peers), +[`compression_level`](#compression_level), +[`data_dir`](#data_dir), +[`data_fsync`](#data_fsync), +[`db_engine`](#db_engine), +[`lmdb_map_size`](#lmdb_map_size), +[`metadata_dir`](#metadata_dir), +[`metadata_fsync`](#metadata_fsync), +[`replication_mode`](#replication_mode), +[`rpc_bind_addr`](#rpc_bind_addr), +[`rpc_public_addr`](#rpc_public_addr), +[`rpc_secret`](#rpc_secret), +[`rpc_secret_file`](#rpc_secret), +[`sled_cache_capacity`](#sled_cache_capacity), +[`sled_flush_every_ms`](#sled_flush_every_ms). + +The `[consul_discovery]` section: +[`api`](#consul_api), +[`ca_cert`](#consul_ca_cert), +[`client_cert`](#consul_client_cert), +[`client_key`](#consul_client_cert), +[`consul_http_addr`](#consul_http_addr), +[`meta`](#consul_tags), +[`service_name`](#consul_service_name), +[`tags`](#consul_tags), +[`tls_skip_verify`](#consul_tls_skip_verify), +[`token`](#consul_token). + +The `[kubernetes_discovery]` section: +[`namespace`](#kube_namespace), +[`service_name`](#kube_service_name), +[`skip_crd`](#kube_skip_crd). + +The `[s3_api]` section: +[`api_bind_addr`](#s3_api_bind_addr), +[`root_domain`](#s3_root_domain), +[`s3_region`](#s3_region). + +The `[s3_web]` section: +[`bind_addr`](#web_bind_addr), +[`root_domain`](#web_root_domain). + +The `[admin]` section: +[`api_bind_addr`](#admin_api_bind_addr), +[`metrics_token`](#admin_metrics_token), +[`metrics_token_file`](#admin_metrics_token), +[`admin_token`](#admin_token), +[`admin_token_file`](#admin_token), +[`trace_sink`](#admin_trace_sink). + + +### Top-level configuration options + +#### `replication_mode` {#replication_mode} + +Garage supports the following replication modes: + +- `none` or `1`: data stored on Garage is stored on a single node.
There is no + redundancy, and data will be unavailable as soon as one node fails or its + network is disconnected. Do not use this for anything else than test + deployments. + +- `2`: data stored on Garage will be stored on two different nodes, if possible + in different zones. Garage tolerates one node failure, or several nodes + failing but all in a single zone (in a deployment with at least two zones), + before losing data. Data remains available in read-only mode when one node is + down, but write operations will fail. + + - `2-dangerous`: a variant of mode `2`, where written objects are written to + the second replica asynchronously. This means that Garage will return `200 + OK` to a PutObject request before the second copy is fully written (or even + before it even starts being written). This means that data can more easily + be lost if the node crashes before a second copy can be completed. This + also means that written objects might not be visible immediately in read + operations. In other words, this mode severely breaks the consistency and + durability guarantees of standard Garage cluster operation. Benefits of + this mode: you can still write to your cluster when one node is + unavailable. + +- `3`: data stored on Garage will be stored on three different nodes, if + possible each in a different zone. Garage tolerates two node failures, or + several node failures but in no more than two zones (in a deployment with at + least three zones), before losing data. As long as only a single node fails, + or node failures are only in a single zone, reading and writing data to + Garage can continue normally. + + - `3-degraded`: a variant of replication mode `3`, that lowers the read + quorum to `1`, to allow you to read data from your cluster when several + nodes (or nodes in several zones) are unavailable. In this mode, Garage + does not provide read-after-write consistency anymore.
The write quorum is + still 2, ensuring that data successfully written to Garage is stored on at + least two nodes. + + - `3-dangerous`: a variant of replication mode `3` that lowers both the read + and write quorums to `1`, to allow you to both read and write to your + cluster when several nodes (or nodes in several zones) are unavailable. It + is the least consistent mode of operation proposed by Garage, and also one + that should probably never be used. + +Note that in modes `2` and `3`, +if at least the same number of zones are available, an arbitrary number of failures in +any given zone is tolerated as copies of data will be spread over several zones. + +**Make sure `replication_mode` is the same in the configuration files of all nodes. +Never run a Garage cluster where that is not the case.** + +The quorums associated with each replication mode are described below: + +| `replication_mode` | Number of replicas | Write quorum | Read quorum | Read-after-write consistency? | +| ------------------ | ------------------ | ------------ | ----------- | ----------------------------- | +| `none` or `1` | 1 | 1 | 1 | yes | +| `2` | 2 | 2 | 1 | yes | +| `2-dangerous` | 2 | 1 | 1 | NO | +| `3` | 3 | 2 | 2 | yes | +| `3-degraded` | 3 | 2 | 1 | NO | +| `3-dangerous` | 3 | 1 | 1 | NO | + +Changing the `replication_mode` between modes with the same number of replicas +(e.g. from `3` to `3-degraded`, or from `2-dangerous` to `2`), can be done easily by +just changing the `replication_mode` parameter in your config files and restarting all your +Garage nodes. + +It is also technically possible to change the replication mode to a mode with a +different numbers of replicas, although it's a dangerous operation that is not +officially supported. This requires you to delete the existing cluster layout +and create a new layout from scratch, meaning that a full rebalancing of your +cluster's data will be needed. 
To do it, shut down your cluster entirely, +delete the `cluster_layout` files in the meta directories of all your nodes, +update all your configuration files with the new `replication_mode` parameter, +restart your cluster, and then create a new layout with all the nodes you want +to keep. Rebalancing data will take some time, and data might temporarily +appear unavailable to your users. It is recommended to shut down public access +to the cluster while rebalancing is in progress. In theory, no data should be +lost as rebalancing is a routine operation for Garage, although we cannot +guarantee you that everything will go right in such an extreme scenario. + +#### `metadata_dir` {#metadata_dir} The directory in which Garage will store its metadata. This contains the node identifier, the network configuration and the peer list, the list of buckets and keys as well @@ -85,7 +225,7 @@ as the index of all objects, object version and object blocks. Store this folder on a fast SSD drive if possible to maximize Garage's performance. -### `data_dir` +#### `data_dir` {#data_dir} The directory in which Garage will store the data blocks of objects. This folder can be placed on an HDD. The space available for `data_dir` @@ -105,48 +245,51 @@ data_dir = [ See [the dedicated documentation page](@/documentation/operations/multi-hdd.md) on how to operate Garage in such a setup. -### `db_engine` (since `v0.8.0`) +#### `db_engine` (since `v0.8.0`) {#db_engine} -By default, Garage uses the Sled embedded database library -to store its metadata on-disk.
Since `v0.8.0`, Garage can use alternative storage backends as follows: +Since `v0.8.0`, Garage can use alternative storage backends as follows: | DB engine | `db_engine` value | Database path | | --------- | ----------------- | ------------- | -| [Sled](https://sled.rs) | `"sled"` | `<metadata_dir>/db/` | -| [LMDB](https://www.lmdb.tech) | `"lmdb"` | `<metadata_dir>/db.lmdb/` | +| [LMDB](https://www.lmdb.tech) (default since `v0.9.0`) | `"lmdb"` | `<metadata_dir>/db.lmdb/` | +| [Sled](https://sled.rs) (default up to `v0.8.0`) | `"sled"` | `<metadata_dir>/db/` | | [Sqlite](https://sqlite.org) | `"sqlite"` | `<metadata_dir>/db.sqlite` | +Sled was the only database engine up to Garage v0.7.0. Performance issues and +API limitations of Sled prompted the addition of alternative engines in v0.8.0. +Since v0.9.0, LMDB is the default engine instead of Sled, and Sled is +deprecated. We plan to remove Sled in Garage v1.0. + Performance characteristics of the different DB engines are as follows: -- Sled: the default database engine, which tends to produce - large data files and also has performance issues, especially when the metadata folder - is on a traditional HDD and not on SSD. -- LMDB: the recommended alternative on 64-bit systems, - much more space-efficiant and slightly faster. Note that the data format of LMDB is not portable - between architectures, so for instance the Garage database of an x86-64 - node cannot be moved to an ARM64 node. Also note that, while LMDB can technically be used on 32-bit systems, - this will limit your node to very small database sizes due to how LMDB works; it is therefore not recommended. -- Sqlite: Garage supports Sqlite as a storage backend for metadata, - however it may have issues and is also very slow in its current implementation, - so it is not recommended to be used for now. 
- -It is possible to convert Garage's metadata directory from one format to another with a small utility named `convert_db`, -which can be downloaded at the following locations: -[for amd64](https://garagehq.deuxfleurs.fr/_releases/convert_db/amd64/convert_db), -[for i386](https://garagehq.deuxfleurs.fr/_releases/convert_db/i386/convert_db), -[for arm64](https://garagehq.deuxfleurs.fr/_releases/convert_db/arm64/convert_db), -[for arm](https://garagehq.deuxfleurs.fr/_releases/convert_db/arm/convert_db). -The `convert_db` utility is used as folows: +- Sled: tends to produce large data files and also has performance issues, + especially when the metadata folder is on a traditional HDD and not on SSD. + +- LMDB: the recommended database engine on 64-bit systems, much more + space-efficient and slightly faster. Note that the data format of LMDB is not + portable between architectures, so for instance the Garage database of an + x86-64 node cannot be moved to an ARM64 node. Also note that, while LMDB can + technically be used on 32-bit systems, this will limit your node to very + small database sizes due to how LMDB works; it is therefore not recommended. + +- Sqlite: Garage supports Sqlite as an alternative storage backend for + metadata, and although it has not been tested as much, it is expected to work + satisfactorily. Since Garage v0.9.0, performance issues have largely been + fixed by allowing for a no-fsync mode (see `metadata_fsync`). Sqlite does not + have the database size limitation of LMDB on 32-bit systems. 
+ +It is possible to convert Garage's metadata directory from one format to another +using the `garage convert-db` command, which should be used as follows: ``` -convert-db -a <input db engine> -i <input db path> \ - -b <output db engine> -o <output db path> +garage convert-db -a <input db engine> -i <input db path> \ + -b <output db engine> -o <output db path> ``` -Make sure to specify the full database path as presented in the table above, -and not just the path to the metadata directory. +Make sure to specify the full database path as presented in the table above +(third column), and not just the path to the metadata directory. -### `metadata_fsync` +#### `metadata_fsync` {#metadata_fsync} Whether to enable synchronous mode for the database engine or not. This is disabled (`false`) by default. @@ -176,7 +319,7 @@ Here is how this option impacts the different database engines: Note that the Sqlite database is always run in `WAL` mode (`PRAGMA journal_mode = WAL`). -### `data_fsync` +#### `data_fsync` {#data_fsync} Whether to `fsync` data blocks and their containing directory after they are saved to disk. @@ -189,7 +332,7 @@ at the cost of a moderate drop in write performance. Similarly to `metadata_fsync`, this is likely not necessary if geographical replication is used. -### `block_size` +#### `block_size` {#block_size} Garage splits stored objects in consecutive chunks of size `block_size` (except the last one which might be smaller). The default size is 1MiB and @@ -204,7 +347,7 @@ files will remain available. This however means that chunks from existing files will not be deduplicated with chunks from newly uploaded files, meaning you might use more storage space than is optimally possible. -### `sled_cache_capacity` +#### `sled_cache_capacity` {#sled_cache_capacity} This parameter can be used to tune the capacity of the cache used by [sled](https://sled.rs), the database Garage uses internally to store metadata.
@@ -212,14 +355,14 @@ Tune this to fit the RAM you wish to make available to your Garage instance. This value has a conservative default (128MB) so that Garage doesn't use too much RAM by default, but feel free to increase this for higher performance. -### `sled_flush_every_ms` +#### `sled_flush_every_ms` {#sled_flush_every_ms} This parameters can be used to tune the flushing interval of sled. Increase this if sled is thrashing your SSD, at the risk of losing more data in case of a power outage (though this should not matter much as data is replicated on other nodes). The default value, 2000ms, should be appropriate for most use cases. -### `lmdb_map_size` +#### `lmdb_map_size` {#lmdb_map_size} This parameters can be used to set the map size used by LMDB, which is the size of the virtual memory region used for mapping the database file. @@ -227,90 +370,7 @@ The value of this parameter is the maximum size the metadata database can take. This value is not bound by the physical RAM size of the machine running Garage. If not specified, it defaults to 1GiB on 32-bit machines and 1TiB on 64-bit machines. -### `replication_mode` - -Garage supports the following replication modes: - -- `none` or `1`: data stored on Garage is stored on a single node. There is no - redundancy, and data will be unavailable as soon as one node fails or its - network is disconnected. Do not use this for anything else than test - deployments. - -- `2`: data stored on Garage will be stored on two different nodes, if possible - in different zones. Garage tolerates one node failure, or several nodes - failing but all in a single zone (in a deployment with at least two zones), - before losing data. Data remains available in read-only mode when one node is - down, but write operations will fail. - - - `2-dangerous`: a variant of mode `2`, where written objects are written to - the second replica asynchronously. 
This means that Garage will return `200 - OK` to a PutObject request before the second copy is fully written (or even - before it even starts being written). This means that data can more easily - be lost if the node crashes before a second copy can be completed. This - also means that written objects might not be visible immediately in read - operations. In other words, this mode severely breaks the consistency and - durability guarantees of standard Garage cluster operation. Benefits of - this mode: you can still write to your cluster when one node is - unavailable. - -- `3`: data stored on Garage will be stored on three different nodes, if - possible each in a different zones. Garage tolerates two node failure, or - several node failures but in no more than two zones (in a deployment with at - least three zones), before losing data. As long as only a single node fails, - or node failures are only in a single zone, reading and writing data to - Garage can continue normally. - - - `3-degraded`: a variant of replication mode `3`, that lowers the read - quorum to `1`, to allow you to read data from your cluster when several - nodes (or nodes in several zones) are unavailable. In this mode, Garage - does not provide read-after-write consistency anymore. The write quorum is - still 2, ensuring that data successfully written to Garage is stored on at - least two nodes. - - - `3-dangerous`: a variant of replication mode `3` that lowers both the read - and write quorums to `1`, to allow you to both read and write to your - cluster when several nodes (or nodes in several zones) are unavailable. It - is the least consistent mode of operation proposed by Garage, and also one - that should probably never be used. - -Note that in modes `2` and `3`, -if at least the same number of zones are available, an arbitrary number of failures in -any given zone is tolerated as copies of data will be spread over several zones. 
- -**Make sure `replication_mode` is the same in the configuration files of all nodes. -Never run a Garage cluster where that is not the case.** - -The quorums associated with each replication mode are described below: - -| `replication_mode` | Number of replicas | Write quorum | Read quorum | Read-after-write consistency? | -| ------------------ | ------------------ | ------------ | ----------- | ----------------------------- | -| `none` or `1` | 1 | 1 | 1 | yes | -| `2` | 2 | 2 | 1 | yes | -| `2-dangerous` | 2 | 1 | 1 | NO | -| `3` | 3 | 2 | 2 | yes | -| `3-degraded` | 3 | 2 | 1 | NO | -| `3-dangerous` | 3 | 1 | 1 | NO | - -Changing the `replication_mode` between modes with the same number of replicas -(e.g. from `3` to `3-degraded`, or from `2-dangerous` to `2`), can be done easily by -just changing the `replication_mode` parameter in your config files and restarting all your -Garage nodes. - -It is also technically possible to change the replication mode to a mode with a -different numbers of replicas, although it's a dangerous operation that is not -officially supported. This requires you to delete the existing cluster layout -and create a new layout from scratch, meaning that a full rebalancing of your -cluster's data will be needed. To do it, shut down your cluster entirely, -delete the `custer_layout` files in the meta directories of all your nodes, -update all your configuration files with the new `replication_mode` parameter, -restart your cluster, and then create a new layout with all the nodes you want -to keep. Rebalancing data will take some time, and data might temporarily -appear unavailable to your users. It is recommended to shut down public access -to the cluster while rebalancing is in progress. In theory, no data should be -lost as rebalancing is a routine operation for Garage, although we cannot -guarantee you that everything will go right in such an extreme scenario. 
- -### `compression_level` +#### `compression_level` {#compression_level} Zstd compression level to use for storing blocks. @@ -334,7 +394,7 @@ Compression is done synchronously, setting a value too high will add latency to This value can be different between nodes, compression is done by the node which receive the API call. -### `rpc_secret`, `rpc_secret_file` or `GARAGE_RPC_SECRET` (env) +#### `rpc_secret`, `rpc_secret_file` or `GARAGE_RPC_SECRET` (env) {#rpc_secret} Garage uses a secret key, called an RPC secret, that is shared between all nodes of the cluster in order to identify these nodes and allow them to @@ -346,7 +406,7 @@ Since Garage `v0.8.2`, the RPC secret can also be stored in a file whose path is given in the configuration variable `rpc_secret_file`, or specified as an environment variable `GARAGE_RPC_SECRET`. -### `rpc_bind_addr` +#### `rpc_bind_addr` {#rpc_bind_addr} The address and port on which to bind for inter-cluster communcations (reffered to as RPC for remote procedure calls). @@ -355,14 +415,14 @@ the node, even in the case of a NAT: the NAT should be configured to forward the port number to the same internal port nubmer. This means that if you have several nodes running behind a NAT, they should each use a different RPC port number. -### `rpc_public_addr` +#### `rpc_public_addr` {#rpc_public_addr} The address and port that other nodes need to use to contact this node for RPC calls. **This parameter is optional but recommended.** In case you have a NAT that binds the RPC port to a port that is different on your public IP, this field might help making it work. -### `bootstrap_peers` +#### `bootstrap_peers` {#bootstrap_peers} A list of peer identifiers on which to contact other Garage peers of this cluster. These peer identifiers have the following syntax: @@ -379,42 +439,42 @@ key will be returned by `garage node id` and you will have to add the IP yourself. 
-## The `[consul_discovery]` section +### The `[consul_discovery]` section Garage supports discovering other nodes of the cluster using Consul. For this to work correctly, nodes need to know their IP address by which they can be reached by other nodes of the cluster, which should be set in `rpc_public_addr`. -### `consul_http_addr` and `service_name` +#### `consul_http_addr` {#consul_http_addr} The `consul_http_addr` parameter should be set to the full HTTP(S) address of the Consul server. -### `api` +#### `api` {#consul_api} Two APIs for service registration are supported: `catalog` and `agent`. `catalog`, the default, will register a service using the `/v1/catalog` endpoints, enabling mTLS if `client_cert` and `client_key` are provided. The `agent` API uses the `v1/agent` endpoints instead, where an optional `token` may be provided. -### `service_name` +#### `service_name` {#consul_service_name} `service_name` should be set to the service name under which Garage's RPC ports are announced. -### `client_cert`, `client_key` +#### `client_cert`, `client_key` {#consul_client_cert} TLS client certificate and client key to use when communicating with Consul over TLS. Both are mandatory when doing so. Only available when `api = "catalog"`. -### `ca_cert` +#### `ca_cert` {#consul_ca_cert} TLS CA certificate to use when communicating with Consul over TLS. -### `tls_skip_verify` +#### `tls_skip_verify` {#consul_tls_skip_verify} Skip server hostname verification in TLS handshake. `ca_cert` is ignored when this is set. -### `token` +#### `token` {#consul_token} Uses the provided token for communication with Consul. Only available when `api = "agent"`. The policy assigned to this token should at least have these rules: @@ -434,49 +494,49 @@ node_prefix "" { } ``` -### `tags` and `meta` +#### `tags` and `meta` {#consul_tags} Additional list of tags and map of service meta to add during service registration. 
-## The `[kubernetes_discovery]` section +### The `[kubernetes_discovery]` section Garage supports discovering other nodes of the cluster using kubernetes custom resources. For this to work, a `[kubernetes_discovery]` section must be present with at least the `namespace` and `service_name` parameters. -### `namespace` +#### `namespace` {#kube_namespace} `namespace` sets the namespace in which the custom resources are configured. -### `service_name` +#### `service_name` {#kube_service_name} `service_name` is added as a label to the advertised resources to filter them, to allow for multiple deployments in a single namespace. -### `skip_crd` +#### `skip_crd` {#kube_skip_crd} `skip_crd` can be set to true to disable the automatic creation and patching of the `garagenodes.deuxfleurs.fr` CRD. You will need to create the CRD manually. -## The `[s3_api]` section +### The `[s3_api]` section -### `api_bind_addr` +#### `api_bind_addr` {#s3_api_bind_addr} The IP and port on which to bind for accepting S3 API calls. This endpoint does not suport TLS: a reverse proxy should be used to provide it. Alternatively, since `v0.8.5`, a path can be used to create a unix socket with 0222 mode. -### `s3_region` +#### `s3_region` {#s3_region} Garage will accept S3 API calls that are targetted to the S3 region defined here. API calls targetted to other regions will fail with a AuthorizationHeaderMalformed error message that redirects the client to the correct region. -### `root_domain` {#root_domain} +#### `root_domain` {#s3_root_domain} The optional suffix to access bucket using vhost-style in addition to path-style request. Note path-style requests are always enabled, whether or not vhost-style is configured. @@ -488,12 +548,12 @@ using the hostname `my-bucket.s3.garage.eu`. -## The `[s3_web]` section +### The `[s3_web]` section Garage allows to publish content of buckets as websites. This section configures the behaviour of this module. 
-### `bind_addr` +#### `bind_addr` {#web_bind_addr} The IP and port on which to bind for accepting HTTP requests to buckets configured for website access. @@ -501,7 +561,7 @@ This endpoint does not suport TLS: a reverse proxy should be used to provide it. Alternatively, since `v0.8.5`, a path can be used to create a unix socket with 0222 mode. -### `root_domain` +#### `root_domain` {#web_root_domain} The optional suffix appended to bucket names for the corresponding HTTP Host. @@ -510,11 +570,11 @@ will be accessible either with hostname `deuxfleurs.fr.web.garage.eu` or with hostname `deuxfleurs.fr`. -## The `[admin]` section +### The `[admin]` section Garage has a few administration capabilities, in particular to allow remote monitoring. These features are detailed below. -### `api_bind_addr` +#### `api_bind_addr` {#admin_api_bind_addr} If specified, Garage will bind an HTTP server to this port and address, on which it will listen to requests for administration features. @@ -523,7 +583,7 @@ See [administration API reference](@/documentation/reference-manual/admin-api.md Alternatively, since `v0.8.5`, a path can be used to create a unix socket. Note that for security reasons, the socket will have 0220 mode. Make sure to set user and group permissions accordingly. -### `metrics_token`, `metrics_token_file` or `GARAGE_METRICS_TOKEN` (env) +#### `metrics_token`, `metrics_token_file` or `GARAGE_METRICS_TOKEN` (env) {#admin_metrics_token} The token for accessing the Metrics endpoint. If this token is not set, the Metrics endpoint can be accessed without access control. @@ -534,7 +594,7 @@ You can use any random string for this value. We recommend generating a random t `metrics_token_file` and the `GARAGE_METRICS_TOKEN` environment variable are supported since Garage `v0.8.2`. 
-### `admin_token`, `admin_token_file` or `GARAGE_ADMIN_TOKEN` (env) +#### `admin_token`, `admin_token_file` or `GARAGE_ADMIN_TOKEN` (env) {#admin_token} The token for accessing all of the other administration endpoints. If this token is not set, access to these endpoints is disabled entirely. @@ -545,7 +605,7 @@ You can use any random string for this value. We recommend generating a random t `admin_token_file` and the `GARAGE_ADMIN_TOKEN` environment variable are supported since Garage `v0.8.2`. -### `trace_sink` +#### `trace_sink` {#admin_trace_sink} Optionally, the address of an OpenTelemetry collector. If specified, Garage will send traces in the OpenTelemetry format to this endpoint. These diff --git a/doc/book/reference-manual/features.md b/doc/book/reference-manual/features.md index 2f8e633a..e8ba9510 100644 --- a/doc/book/reference-manual/features.md +++ b/doc/book/reference-manual/features.md @@ -52,7 +52,7 @@ This is particularly usefull when nodes are far from one another and talk to one Garage supports a variety of replication modes, with 1 copy, 2 copies or 3 copies of your data, and with various levels of consistency, in order to adapt to a variety of usage scenarios. -Read our reference page on [supported replication modes](@/documentation/reference-manual/configuration.md#replication-mode) +Read our reference page on [supported replication modes](@/documentation/reference-manual/configuration.md#replication_mode) to select the replication mode best suited to your use case (hint: in most cases, `replication_mode = "3"` is what you want). 
### Web server for static websites diff --git a/script/jepsen.garage/.envrc b/script/jepsen.garage/.envrc new file mode 100644 index 00000000..1d953f4b --- /dev/null +++ b/script/jepsen.garage/.envrc @@ -0,0 +1 @@ +use nix diff --git a/script/jepsen.garage/.gitignore b/script/jepsen.garage/.gitignore new file mode 100644 index 00000000..31842a96 --- /dev/null +++ b/script/jepsen.garage/.gitignore @@ -0,0 +1,17 @@ +/target +/classes +/checkouts +profiles.clj +pom.xml +pom.xml.asc +*.jar +*.class +/.lein-* +/.nrepl-port +/.prepl-port +.hgignore +.hg/ +.direnv +/store +/store.* +.vagrant diff --git a/script/jepsen.garage/README.md b/script/jepsen.garage/README.md new file mode 100644 index 00000000..50c7eb38 --- /dev/null +++ b/script/jepsen.garage/README.md @@ -0,0 +1,166 @@ +# jepsen.garage + +Jepsen checking of Garage consistency properties. + +## Usage + +Requirements: + +- vagrant +- VirtualBox, configured so that nodes can take an IP in a private network `192.168.56.0/24` (it's the default) +- a user that can create VirtualBox VMs +- leiningen +- gnuplot + +Set up VMs before running tests: + +``` +vagrant up +``` + +Run tests: see commands below. + + +## Results + +### Register linear, without timestamp patch + +Command: `lein run test --nodes-file nodes.vagrant --time-limit 60 --rate 100 --concurrency 20 --workload reg1 --ops-per-key 100` + +Results without timestamp patch: + +- Fails with a simple clock-scramble nemesis (`--scenario c`). + Explanation: without the timestamp patch, nodes will create objects using their + local clock only as a timestamp, so the ordering will be all over the place if + clocks are scrambled. + +Results with timestamp patch (`--patch tsfix2`): + +- No failure with clock-scramble nemesis + +- Fails with clock-scramble nemesis + partition nemesis (`--scenario cp`). + +**This test is expected to fail.** +Indeed, S3 objects are not meant to behave like linearizable registers. 
+TODO explain using a counter-example + + +### Read-after-write CRDT register model + +Command: `lein run test --nodes-file nodes.vagrant --time-limit 60 --rate 100 --concurrency 100 --workload reg2 --ops-per-key 100` + +Results without timestamp patch: + +- Fails with a simple clock-scramble nemesis (`--scenario c`). + Explanation: old values are not overwritten correctly when their timestamps are in the future. + +Results with timestamp patch (`--patch tsfix2`): + +- No failures with clock-scramble nemesis + partition nemesis (`--scenario cp`). + This proves that `tsfix2` (PR#543) does improve consistency. + +- **Fails with layout reconfiguration nemesis** (`--scenario r`). + Example of a failed run: `garage reg2/20231024T120806.899+0200`. + This is the failure mode we are looking for and trying to fix for NLnet task 3. + +Results with NLnet task 3 code (commit 707442f5de, `--patch task3a`): + +- No failures with `--scenario r` (0 of 10 runs), `--scenario pr` (0 of 10 runs), + `--scenario cpr` (0 of 10 runs) and `--scenario dpr` (0 of 10 runs). + +- Same with `--patch task3c` (commit `0041b013`, the final version). + + +### Set, basic test (write some items, then read) + +Command: `lein run test --nodes-file nodes.vagrant --time-limit 60 --rate 200 --concurrency 200 --workload set1 --ops-per-key 100` + +Results without NLnet task3 code (`--patch tsfix2`): + +- For now, no failures with clock-scramble nemesis + partition nemesis -> TODO long test run + +- Does not seem to fail with only the layout reconfiguration nemesis (<10 runs), although theoretically it could + +- **Fails with the partition + layout reconfiguration nemesis** (`--scenario pr`). + Example of a failed run: `garage set1/20231024T172214.488+0200` (1 failure in 4 runs). + This is the failure mode we are looking for and trying to fix for NLnet task 3. 
+ +Results with NLnet task 3 code (commit 707442f5de, `--patch task3a`): + +- The tests are buggy and often result in an "unknown" validity status, which + is caused by some requests not returning results during network partitions or + other nemesis-induced broken cluster states. However, when the tests were + able to finish, there were no failures with scenarios `r`, `pr`, `cpr`, + `dpr`. + + +### Set, continuous test (interspersed reads and writes) + +Command: `lein run test --nodes-file nodes.vagrant --time-limit 60 --rate 100 --concurrency 100 --workload set2 --ops-per-key 100` + +Results without NLnet task3 code (`--patch tsfix2`): + +- No failures with clock-scramble nemesis + db nemesis + partition nemesis (`--scenario cdp`) (0 failures in 10 runs). + +- **Fails with just layout reconfiguration nemesis** (`--scenario r`). + Example of a failed run: `garage set2/20231025T141940.198+0200` (10 failures in 10 runs). + This is the failure mode we are looking for and trying to fix for NLnet task 3. + +Results with NLnet task3 code (commit 707442f5de, `--patch task3a`): + +- No failures with `--scenario r` (0 of 10 runs), `--scenario pr` (0 of 10 runs), + `--scenario cpr` (0 of 10 runs) and `--scenario dpr` (0 of 10 runs). + +- Same with `--patch task3c` (commit `0041b013`, the final version). + + +## NLnet task 3 final results + +- With code from task3 (`--patch task3c`): [reg2 and set2](results/Results-2023-12-13-task3c.png), [set1](results/Results-2023-12-14-task3-set1.png). +- Without (`--patch tsfix2`): [reg2 and set2](results/Results-2023-12-13-tsfix2.png), set1 TBD. + +## Investigating (and fixing) errors + +### Segfaults + +They are due to the download being interrupted in the middle (^C during first launch on clean VMs), the `garage` binary is truncated. +Add `:force?` to the `cached-wget!` call in `daemon.clj` to re-download the binary, +or restart the VMs to clear temporary files. 
+ +### In `jepsen.garage`: prefix weirdness + +In `store/garage set1/20231019T163358.615+0200`: + +``` +INFO [2023-10-19 16:35:20,977] clojure-agent-send-off-pool-207 - jepsen.garage.set list results for prefix set20/ : (set13/0 set13/1 set13/10 set13/11 set13/12 set13/13 set13/14 set13/15 set13/16 set13/17 set13/18 set13/19 set13/2 set13/20 set13/21 set13/22 set13/23 set13/24 set13/25 set13/26 set13/27 set13/28 set13/29 set13/3 set13/30 set13/31 set13/32 set13/33 set13/34 set13/35 set13/36 set13/37 set13/38 set13/39 set13/4 set13/40 set13/41 set13/42 set13/43 set13/44 set13/45 set13/46 set13/47 set13/48 set13/49 set13/5 set13/50 set13/51 set13/52 set13/53 set13/54 set13/55 set13/56 set13/57 set13/58 set13/59 set13/6 set13/60 set13/61 set13/62 set13/63 set13/64 set13/65 set13/66 set13/67 set13/68 set13/69 set13/7 set13/70 set13/71 set13/72 set13/73 set13/74 set13/75 set13/76 set13/77 set13/78 set13/79 set13/8 set13/80 set13/81 set13/82 set13/83 set13/84 set13/85 set13/86 set13/87 set13/88 set13/89 set13/9 set13/90 set13/91 set13/92 set13/93 set13/94 set13/95 set13/96 set13/97 set13/98 set13/99) (node: http://192.168.56.25:3900 ) +``` + +After inspecting, the actual S3 call made was with prefix "set13/", so at least this is not an error in Garage itself but in the jepsen code. + +Finally found out that this was due to closures not correctly capturing their context in the list function in s3api.clj (wtf clojure?) +Not sure exactly where it came from but it seems to have been fixed by making list-inner a separate function and not a sub-function, +and passing all values that were previously in the context (creds and prefix) as additional arguments. + +### `reg2` test inconsistency, even with timestamp fix + +The reg2 test is our custom checker for CRDT read-after-write on individual object keys, acting as registers which can be updated. 
+The test fails without the timestamp fix, which is expected as the clock scrambler will prevent nodes from having a correct ordering of objects. + +With the timestamp fix (`--patch tsfix1`), the happened-before relationship should at least be respected, meaning that when a PutObject call starts +after another PutObject call has ended, the second call should overwrite the value of the first call, and that value should not be +readable by future GetObject calls. +However, we observed inconsistencies even with the timestamp fix. + +The inconsistencies seemed to always happen after writing a nil value, which translates to a DeleteObject call +instead of a PutObject. By removing the possibility of writing nil values, therefore only doing +PutObject calls, the issue disappears. There is therefore an issue to fix in DeleteObject. + +The issue in DeleteObject seems to have been fixed by commit `c82d91c6bccf307186332b6c5c6fc0b128b1b2b1`, which can be used using `--patch tsfix2`. + + +## License + +Copyright © 2023 Alex Auvolat + +This program and the accompanying materials are made available under the +terms of the GNU Affero General Public License v3.0. 
diff --git a/script/jepsen.garage/Vagrantfile b/script/jepsen.garage/Vagrantfile new file mode 100644 index 00000000..b54c2426 --- /dev/null +++ b/script/jepsen.garage/Vagrantfile @@ -0,0 +1,40 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : +# + +def vm(config, hostname, ip) + config.vm.hostname = hostname + config.vm.network "private_network", ip: ip +end + +Vagrant.configure("2") do |config| + config.vm.box = "generic/debian10" + + config.vm.provider "virtualbox" do |vb| + vb.gui = false + vb.memory = "512" + vb.customize ["modifyvm", :id, "--vram=12"] + end + + config.vm.provision "shell", inline: <<-SHELL + echo "root:root" | chpasswd + mkdir -p /root/.ssh + echo "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJpaBZdYxHqMxhv2RExAOa7nkKhPBOHupMP3mYaZ73w9 lx@lindy" >> /root/.ssh/authorized_keys + SHELL + + config.vm.define "n1" do |config| vm(config, "n1", "192.168.56.21") end + config.vm.define "n2" do |config| vm(config, "n2", "192.168.56.22") end + config.vm.define "n3" do |config| vm(config, "n3", "192.168.56.23") end + config.vm.define "n4" do |config| vm(config, "n4", "192.168.56.24") end + config.vm.define "n5" do |config| vm(config, "n5", "192.168.56.25") end + config.vm.define "n6" do |config| vm(config, "n6", "192.168.56.26") end + config.vm.define "n7" do |config| vm(config, "n7", "192.168.56.27") end + + config.vm.define "n8" do |config| vm(config, "n8", "192.168.56.28") end + config.vm.define "n9" do |config| vm(config, "n9", "192.168.56.29") end + config.vm.define "n10" do |config| vm(config, "n10", "192.168.56.30") end + config.vm.define "n11" do |config| vm(config, "n11", "192.168.56.31") end + config.vm.define "n12" do |config| vm(config, "n12", "192.168.56.32") end + config.vm.define "n13" do |config| vm(config, "n13", "192.168.56.33") end + config.vm.define "n14" do |config| vm(config, "n14", "192.168.56.34") end +end diff --git a/script/jepsen.garage/all_tests_1.sh b/script/jepsen.garage/all_tests_1.sh new file mode 100755 index 00000000..b5397d13 --- 
/dev/null +++ b/script/jepsen.garage/all_tests_1.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -x + +#for ppatch in task3c task3a tsfix2; do +for ppatch in tsfix2; do + #for psc in c cp cdp r pr cpr dpr; do + for psc in cdp r pr cpr dpr; do + #for ptsk in reg2 set1 set2; do + for ptsk in set1; do + for irun in $(seq 10); do + lein run test --nodes-file nodes.vagrant \ + --time-limit 60 --rate 100 --concurrency 100 --ops-per-key 100 \ + --workload $ptsk --patch $ppatch --scenario $psc + done + done + done +done diff --git a/script/jepsen.garage/all_tests_2.sh b/script/jepsen.garage/all_tests_2.sh new file mode 100755 index 00000000..641643ed --- /dev/null +++ b/script/jepsen.garage/all_tests_2.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +set -x + +#for ppatch in task3c tsfix2; do +for ppatch in tsfix2; do + for psc in cdp r pr cpr dpr; do + for ptsk in set1; do + for irun in $(seq 10); do + lein run test --nodes-file nodes2.vagrant \ + --time-limit 60 --rate 100 --concurrency 100 --ops-per-key 100 \ + --workload $ptsk --patch $ppatch --scenario $psc + done + done + done +done diff --git a/script/jepsen.garage/jaeger.sh b/script/jepsen.garage/jaeger.sh new file mode 100644 index 00000000..7f67b61b --- /dev/null +++ b/script/jepsen.garage/jaeger.sh @@ -0,0 +1,13 @@ +docker stop jaeger +docker rm jaeger + +# UI is on localhost:16686 +# otel-grpc collector is on localhost:4317 +# otel-http collector is on localhost:4318 + +docker run -d --name jaeger \ + -e COLLECTOR_OTLP_ENABLED=true \ + -p 4317:4317 \ + -p 4318:4318 \ + -p 16686:16686 \ + jaegertracing/all-in-one:1.50 diff --git a/script/jepsen.garage/nodes.vagrant b/script/jepsen.garage/nodes.vagrant new file mode 100644 index 00000000..9e5694e6 --- /dev/null +++ b/script/jepsen.garage/nodes.vagrant @@ -0,0 +1,7 @@ +192.168.56.21 +192.168.56.22 +192.168.56.23 +192.168.56.24 +192.168.56.25 +192.168.56.26 +192.168.56.27 diff --git a/script/jepsen.garage/nodes2.vagrant b/script/jepsen.garage/nodes2.vagrant new file 
mode 100644 index 00000000..842bf276 --- /dev/null +++ b/script/jepsen.garage/nodes2.vagrant @@ -0,0 +1,7 @@ +192.168.56.28 +192.168.56.29 +192.168.56.30 +192.168.56.31 +192.168.56.32 +192.168.56.33 +192.168.56.34 diff --git a/script/jepsen.garage/project.clj b/script/jepsen.garage/project.clj new file mode 100644 index 00000000..59d45484 --- /dev/null +++ b/script/jepsen.garage/project.clj @@ -0,0 +1,10 @@ +(defproject jepsen.garage "0.1.0-SNAPSHOT" + :description "Jepsen testing for Garage" + :url "https://git.deuxfleurs.fr/Deuxfleurs/garage" + :license {:name "AGPLv3" + :url "https://www.gnu.org/licenses/agpl-3.0.en.html"} + :main jepsen.garage + :dependencies [[org.clojure/clojure "1.11.1"] + [jepsen "0.3.3-SNAPSHOT"] + [amazonica "0.3.163"]] + :repl-options {:init-ns jepsen.garage}) diff --git a/script/jepsen.garage/results/Results-2023-11-16.png b/script/jepsen.garage/results/Results-2023-11-16.png Binary files differnew file mode 100644 index 00000000..26dac833 --- /dev/null +++ b/script/jepsen.garage/results/Results-2023-11-16.png diff --git a/script/jepsen.garage/results/Results-2023-12-13-task3c.png b/script/jepsen.garage/results/Results-2023-12-13-task3c.png Binary files differnew file mode 100644 index 00000000..216043c3 --- /dev/null +++ b/script/jepsen.garage/results/Results-2023-12-13-task3c.png diff --git a/script/jepsen.garage/results/Results-2023-12-13-tsfix2.png b/script/jepsen.garage/results/Results-2023-12-13-tsfix2.png Binary files differnew file mode 100644 index 00000000..147d25e9 --- /dev/null +++ b/script/jepsen.garage/results/Results-2023-12-13-tsfix2.png diff --git a/script/jepsen.garage/results/Results-2023-12-14-task3-set1.png b/script/jepsen.garage/results/Results-2023-12-14-task3-set1.png Binary files differnew file mode 100644 index 00000000..dbff3a95 --- /dev/null +++ b/script/jepsen.garage/results/Results-2023-12-14-task3-set1.png diff --git a/script/jepsen.garage/shell.nix b/script/jepsen.garage/shell.nix new file mode 100644 
index 00000000..01e4c845 --- /dev/null +++ b/script/jepsen.garage/shell.nix @@ -0,0 +1,18 @@ +{ pkgs ? import <nixpkgs> { + overlays = [ + (self: super: { + jdk = super.jdk11; + jre = super.jre11; + }) + ]; +} }: +pkgs.mkShell { + nativeBuildInputs = with pkgs; [ + leiningen + jdk + jna + vagrant + gnuplot + graphviz + ]; +} diff --git a/script/jepsen.garage/src/jepsen/garage.clj b/script/jepsen.garage/src/jepsen/garage.clj new file mode 100644 index 00000000..446b81de --- /dev/null +++ b/script/jepsen.garage/src/jepsen/garage.clj @@ -0,0 +1,105 @@ +(ns jepsen.garage + (:require + [clojure.string :as str] + [jepsen + [checker :as checker] + [cli :as cli] + [generator :as gen] + [nemesis :as nemesis] + [tests :as tests]] + [jepsen.os.debian :as debian] + [jepsen.garage + [daemon :as grg] + [nemesis :as grgNemesis] + [reg :as reg] + [set :as set]])) + +(def workloads + "A map of workload names to functions that construct workloads, given opts." + {"reg1" reg/workload1 + "reg2" reg/workload2 + "set1" set/workload1 + "set2" set/workload2}) + +(def scenari + "A map of scenari to the associated nemesis" + {"c" grgNemesis/scenario-c + "cp" grgNemesis/scenario-cp + "r" grgNemesis/scenario-r + "pr" grgNemesis/scenario-pr + "cpr" grgNemesis/scenario-cpr + "cdp" grgNemesis/scenario-cdp + "dpr" grgNemesis/scenario-dpr}) + +(def patches + "A map of patch names to Garage builds" + {"default" "v0.9.0" + "tsfix1" "d146cdd5b66ca1d3ed65ce93ca42c6db22defc09" + "tsfix2" "c82d91c6bccf307186332b6c5c6fc0b128b1b2b1" + "task3a" "707442f5de416fdbed4681a33b739f0a787b7834" + "task3b" "431b28e0cfdc9cac6c649193cf602108a8b02997" + "task3c" "0041b013a473e3ae72f50209d8f79db75a72848b"}) + +(def cli-opts + "Additional command line options." 
+ [["-p" "--patch NAME" "Garage patch to use" + :default "default" + :validate [patches (cli/one-of patches)]] + ["-s" "--scenario NAME" "Nemesis scenario to run" + :default "cp" + :validate [scenari (cli/one-of scenari)]] + ["-r" "--rate HZ" "Approximate number of requests per second, per thread." + :default 10 + :parse-fn read-string + :validate [#(and (number? %) (pos? %)) "Must be a positive number"]] + [nil "--ops-per-key NUM" "Maximum number of operations on any given key." + :default 100 + :parse-fn parse-long + :validate [pos? "Must be a positive integer."]] + ["-w" "--workload NAME" "Workload of test to run" + :default "reg1" + :validate [workloads (cli/one-of workloads)]]]) + +(defn garage-test + "Given an options map from the command line runner (e.g. :nodes, :ssh, + :concurrency, ...), constructs a test map." + [opts] + (let [garage-version (get patches (:patch opts)) + db (grg/db garage-version) + workload ((get workloads (:workload opts)) opts) + scenario ((get scenari (:scenario opts)) (assoc opts :db db))] + (merge tests/noop-test + opts + {:pure-generators true + :name (str "garage-" (name (:patch opts)) " " (name (:workload opts)) " " (name (:scenario opts))) + :os debian/os + :db db + :client (:client workload) + :generator (gen/phases + (->> + (:generator workload) + (gen/stagger (/ (:rate opts))) + (gen/nemesis (:generator scenario)) + (gen/time-limit (:time-limit opts))) + (gen/log "Healing cluster") + (gen/nemesis (:final-generator scenario)) + (gen/log "Waiting for recovery") + (gen/sleep 10) + (gen/log "Running final generator") + (gen/clients (:final-generator workload)) + (gen/log "Generators all done")) + :nemesis (:nemesis scenario) + :checker (checker/compose + {:perf (checker/perf (:perf scenario)) + :workload (:checker workload)}) + }))) + + +(defn -main + "Handles command line arguments. Can either run a test, or a web server for + browsing results." + [& args] + (cli/run! 
(merge (cli/single-test-cmd {:test-fn garage-test + :opt-spec cli-opts}) + (cli/serve-cmd)) + args)) diff --git a/script/jepsen.garage/src/jepsen/garage/daemon.clj b/script/jepsen.garage/src/jepsen/garage/daemon.clj new file mode 100644 index 00000000..d407dd29 --- /dev/null +++ b/script/jepsen.garage/src/jepsen/garage/daemon.clj @@ -0,0 +1,152 @@ +(ns jepsen.garage.daemon + (:require [clojure.tools.logging :refer :all] + [jepsen [control :as c] + [core :as jepsen] + [db :as db]] + [jepsen.control.util :as cu])) + +; CONSTANTS -- HOW GARAGE IS SET UP + +(def base-dir "/opt/garage") +(def data-dir (str base-dir "/data")) +(def meta-dir (str base-dir "/meta")) +(def binary (str base-dir "/garage")) +(def logfile (str base-dir "/garage.log")) +(def pidfile (str base-dir "/garage.pid")) + +(def admin-token "icanhazadmin") +(def access-key-id "GK8bfb6a51286071c6c9cd8bc3") +(def secret-access-key "b0be95f71c1c6f16858a9edf395078b75c12ecb6b1c03385c4ae92076e4994a3") +(def bucket-name "jepsen") + +; THE GARAGE DB + +(defn install! + "Download and install Garage" + [node version] + (c/su + (c/trace + (info node "installing garage" version) + (c/exec :mkdir :-p base-dir) + (let [url (str "https://garagehq.deuxfleurs.fr/_releases/" version "/x86_64-unknown-linux-musl/garage") + cache (cu/cached-wget! url)] + (c/exec :cp cache binary)) + (c/exec :chmod :+x binary)))) + +(defn configure! + "Configure Garage" + [node] + (c/su + (c/trace + (cu/write-file! 
+ (str "rpc_secret = \"0fffabe52542c2b89a56b2efb7dfd477e9dafb285c9025cbdf1de7ca21a6b372\"\n" + "rpc_bind_addr = \"0.0.0.0:3901\"\n" + "rpc_public_addr = \"" node ":3901\"\n" + "db_engine = \"lmdb\"\n" + "replication_mode = \"2\"\n" + "data_dir = \"" data-dir "\"\n" + "metadata_dir = \"" meta-dir "\"\n" + "[s3_api]\n" + "s3_region = \"us-east-1\"\n" + "api_bind_addr = \"0.0.0.0:3900\"\n" + "[k2v_api]\n" + "api_bind_addr = \"0.0.0.0:3902\"\n" + "[admin]\n" + "api_bind_addr = \"0.0.0.0:3903\"\n" + "admin_token = \"" admin-token "\"\n" + "trace_sink = \"http://192.168.56.1:4317\"\n") + "/etc/garage.toml")))) + +(defn connect-node! + "Connect a Garage node to the rest of the cluster" + [test node] + (c/trace + (let [node-id (c/exec binary :node :id :-q)] + (info node "node id:" node-id) + (c/on-many (:nodes test) + (c/exec binary :node :connect node-id))))) + +(defn configure-node! + "Configure a Garage node to be part of a cluster layout" + [test node] + (c/trace + (let [node-id (c/exec binary :node :id :-q)] + (c/on (jepsen/primary test) + (c/exec binary :layout :assign (subs node-id 0 16) :-c :1G :-z :dc1 :-t node))))) + +(defn finalize-config! + "Apply the layout and create a key/bucket pair in the cluster" + [node] + (c/trace + (c/exec binary :layout :apply :--version 1) + (info node "garage status:" (c/exec binary :status)) + (c/exec binary :key :import access-key-id secret-access-key :--yes) + (c/exec binary :bucket :create bucket-name) + (c/exec binary :bucket :allow :--read :--write bucket-name :--key access-key-id) + (info node "key info: " (c/exec binary :key :info access-key-id)))) + +(defn db + "Garage DB for a particular version" + [version] + (reify db/DB + (setup! [_ test node] + (install! node version) + (configure! node) + (cu/start-daemon! + {:logfile logfile + :pidfile pidfile + :chdir base-dir + :env {:RUST_LOG "garage=debug,garage_api=trace"}} + binary + :server) + (c/exec :sleep 3) + + (jepsen/synchronize test) + (connect-node! 
test node) + + (jepsen/synchronize test) + (configure-node! test node) + + (jepsen/synchronize test) + (when (= node (jepsen/primary test)) + (finalize-config! node))) + + (teardown! [_ test node] + (info node "tearing down garage" version) + (c/su + (cu/stop-daemon! binary pidfile) + (c/exec :rm :-rf logfile) + (c/exec :rm :-rf data-dir) + (c/exec :rm :-rf meta-dir))) + + db/Pause + (pause! [_ test node] + (cu/grepkill! :stop binary)) + (resume! [_ test node] + (cu/grepkill! :cont binary)) + + db/Kill + (kill! [_ test node] + (cu/stop-daemon! binary pidfile)) + (start! [_ test node] + (cu/start-daemon! + {:logfile logfile + :pidfile pidfile + :chdir base-dir + :env {:RUST_LOG "garage=debug,garage_api=trace"}} + binary + :server)) + + db/LogFiles + (log-files [_ test node] + [logfile]))) + +(defn creds + "Obtain Garage credentials for node" + [node] + {:access-key access-key-id + :secret-key secret-access-key + :endpoint (str "http://" node ":3900") + :bucket bucket-name + :client-config {:path-style-access-enabled true}}) + diff --git a/script/jepsen.garage/src/jepsen/garage/nemesis.clj b/script/jepsen.garage/src/jepsen/garage/nemesis.clj new file mode 100644 index 00000000..dfce0255 --- /dev/null +++ b/script/jepsen.garage/src/jepsen/garage/nemesis.clj @@ -0,0 +1,142 @@ +(ns jepsen.garage.nemesis + (:require [clojure.tools.logging :refer :all] + [jepsen [control :as c] + [core :as jepsen] + [generator :as gen] + [nemesis :as nemesis]] + [jepsen.nemesis.combined :as combined] + [jepsen.garage.daemon :as grg] + [jepsen.control.util :as cu])) + +; ---- reconfiguration nemesis ---- + +(defn configure-present! + "Configure node to be active in new cluster layout" + [test nodes] + (info "configure-present!" 
nodes) + (let [node-ids (c/on-many nodes (c/exec grg/binary :node :id :-q)) + node-id-strs (map (fn [[_ v]] (subs v 0 16)) node-ids)] + (c/on + (jepsen/primary test) + (apply c/exec (concat [grg/binary :layout :assign :-c :1G] node-id-strs))))) + +(defn configure-absent! + "Configure nodes to be absent from new cluster layout (assigned with the -g flag)" + [test nodes] + (info "configure-absent!" nodes) + (let [node-ids (c/on-many nodes (c/exec grg/binary :node :id :-q)) + node-id-strs (map (fn [[_ v]] (subs v 0 16)) node-ids)] + (c/on + (jepsen/primary test) + (apply c/exec (concat [grg/binary :layout :assign :-g] node-id-strs))))) + +(defn finalize-config! + "Apply the proposed cluster layout" + [test] + (let [layout-show (c/on (jepsen/primary test) (c/exec grg/binary :layout :show)) + [_ layout-next-version] (re-find #"apply --version (\d+)\n" layout-show)] + (if layout-next-version + (do + (info "layout show: " layout-show "; next-version: " layout-next-version) + (c/on (jepsen/primary test) + (c/exec grg/binary :layout :apply :--version layout-next-version))) + (info "no layout changes to apply")))) + +(defn reconfigure-subset + "Reconfigure cluster with only a subset of nodes" + [cnt] + (reify nemesis/Nemesis + (setup! [this test] this) + + (invoke! [this test op] op + (case (:f op) + :start + (let [[keep-nodes remove-nodes] + (->> (:nodes test) + shuffle + (split-at cnt))] + (info "layout split: keep " keep-nodes ", remove " remove-nodes) + (configure-present! test keep-nodes) + (configure-absent! test remove-nodes) + (finalize-config! test) + (assoc op :value keep-nodes)) + :stop + (do + (info "layout un-split: all nodes=" (:nodes test)) + (configure-present! test (:nodes test)) + (finalize-config! test) + (assoc op :value (:nodes test))))) + + (teardown! 
[this test] this))) + +; ---- nemesis scenari ---- + +(defn nemesis-op + "A generator for a single nemesis operation" + [op] + (fn [_ _] {:type :info, :f op})) + +(defn reconfiguration-package + "Cluster reconfiguration nemesis package: generator, final generator, nemesis and perf plot spec (:interval in opts sets the stagger, default 5)" + [opts] + {:generator (->> + (gen/mix [(nemesis-op :reconfigure-start) + (nemesis-op :reconfigure-stop)]) + (gen/stagger (:interval opts 5))) + :final-generator {:type :info, :f :reconfigure-stop} + :nemesis (nemesis/compose + {{:reconfigure-start :start + :reconfigure-stop :stop} (reconfigure-subset 3)}) + :perf #{{:name "reconfigure" + :start #{:reconfigure-start} + :stop #{:reconfigure-stop} + :color "#A197E9"}}}) + +(defn scenario-c + "Clock modifying scenario" + [opts] + (combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}})) + +(defn scenario-cp + "Clock modifying + partition scenario" + [opts] + (combined/compose-packages + [(combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}}) + (combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}})])) + +(defn scenario-r + "Cluster reconfiguration scenario" + [opts] + (reconfiguration-package {:interval 1})) + +(defn scenario-pr + "Partition + cluster reconfiguration scenario" + [opts] + (combined/compose-packages + [(combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}}) + (reconfiguration-package {:interval 1})])) + +(defn scenario-cpr + "Clock scramble + partition + cluster reconfiguration scenario" + [opts] + (combined/compose-packages + [(combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}}) + (combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}}) + (reconfiguration-package {:interval 1})])) + +(defn scenario-cdp + "Clock modifying + db + partition scenario" + [opts] + (combined/compose-packages + [(combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}}) + (combined/db-package {:db (:db opts), :interval 1, :faults #{:db :pause 
:kill}}) + (combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}})])) + +(defn scenario-dpr + "Db + partition + cluster reconfiguration scenario" + [opts] + (combined/compose-packages + [(combined/db-package {:db (:db opts), :interval 1, :faults #{:db :pause :kill}}) + (combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}}) + (reconfiguration-package {:interval 1})])) + diff --git a/script/jepsen.garage/src/jepsen/garage/reg.clj b/script/jepsen.garage/src/jepsen/garage/reg.clj new file mode 100644 index 00000000..39708c0b --- /dev/null +++ b/script/jepsen.garage/src/jepsen/garage/reg.clj @@ -0,0 +1,143 @@ +(ns jepsen.garage.reg + (:require [clojure.tools.logging :refer :all] + [clojure.string :as str] + [clojure.set :as set] + [jepsen [checker :as checker] + [cli :as cli] + [client :as client] + [control :as c] + [db :as db] + [generator :as gen] + [independent :as independent] + [nemesis :as nemesis] + [util :as util] + [tests :as tests]] + [jepsen.checker.timeline :as timeline] + [jepsen.control.util :as cu] + [jepsen.os.debian :as debian] + [jepsen.garage.daemon :as grg] + [jepsen.garage.s3api :as s3] + [knossos.model :as model] + [slingshot.slingshot :refer [try+]])) + +(defn op-get [_ _] {:type :invoke, :f :read, :value nil}) +(defn op-put [_ _] {:type :invoke, :f :write, :value (str (rand-int 99))}) +(defn op-del [_ _] {:type :invoke, :f :write, :value nil}) + +(defrecord RegClient [creds] + client/Client + (open! [this test node] + (assoc this :creds (grg/creds node))) + (setup! [this test]) + (invoke! 
[this test op] + (try+ + (let [[k v] (:value op)] + (case (:f op) + :read + (util/timeout + 10000 + (assoc op :type :fail, :error ::timeout) + (let [value (s3/get (:creds this) k)] + (assoc op :type :ok, :value (independent/tuple k value)))) + :write + (util/timeout + 10000 + (assoc op :type :info, :error ::timeout) + (do + (s3/put (:creds this) k v) + (assoc op :type :ok))))) + (catch (re-find #"Unavailable" (.getMessage %)) ex + (assoc op :type :info, :error ::unavailable)) + (catch (re-find #"Broken pipe" (.getMessage %)) ex + (assoc op :type :info, :error ::broken-pipe)) + (catch (re-find #"Connection refused" (.getMessage %)) ex + (assoc op :type :info, :error ::connection-refused)))) + (teardown! [this test]) + (close! [this test])) + +(defn reg-read-after-write + "Read-after-Write checker for register operations" + [] + (reify checker/Checker + (check [this test history opts] + (let [init {:put-values {-1 nil} + :put-done #{-1} + :put-in-progress {} + :read-can-contain {} + :bad-reads #{}} + final (reduce + (fn [state op] + (let [current-values (set/union + (set (map (fn [idx] (get (:put-values state) idx)) (:put-done state))) + (set (map (fn [[_ [idx _]]] (get (:put-values state) idx)) (:put-in-progress state)))) + read-can-contain (reduce + (fn [rcc [idx v]] (assoc rcc idx (set/union current-values v))) + {} (:read-can-contain state))] + (info "--------") + (info "state: " state) + (info "current-values: " current-values) + (info "read-can-contain: " read-can-contain) + (info "op: " op) + (case [(:type op) (:f op)] + ([:invoke :write]) + (assoc state + :read-can-contain read-can-contain + :put-values (assoc (:put-values state) (:index op) (:value op)) + :put-in-progress (assoc (:put-in-progress state) (:process op) [(:index op) (:put-done state)])) + ([:ok :write]) + (let [[index overwrites] (get (:put-in-progress state) (:process op))] + (assoc state + :read-can-contain read-can-contain + :put-in-progress (dissoc (:put-in-progress state) (:process op)) + 
:put-done + (conj + (set/difference (:put-done state) overwrites) + index))) + ([:invoke :read]) + (assoc state + :read-can-contain (assoc read-can-contain (:process op) current-values)) + ([:ok :read]) + (let [this-read-can-contain (get read-can-contain (:process op)) + bad-reads (if (contains? this-read-can-contain (:value op)) + (:bad-reads state) + (conj (:bad-reads state) [(:process op) (:index op) (:value op) this-read-can-contain]))] + (info "this-read-can-contain: " this-read-can-contain) + (assoc state + :read-can-contain (dissoc read-can-contain (:process op)) + :bad-reads bad-reads)) + state))) + init history) + valid? (empty? (:bad-reads final))] + (assoc final :valid? valid?))))) + +(defn workload-common + "Common parts of workload" + [opts] + {:client (RegClient. nil) + :generator (independent/concurrent-generator + 10 + (range) + (fn [k] + (->> + (gen/mix [op-get op-put op-del]) + (gen/limit (:ops-per-key opts)))))}) + +(defn workload1 + "Tests linearizable reads and writes" + [opts] + (assoc (workload-common opts) + :checker (independent/checker + (checker/compose + {:linear (checker/linearizable + {:model (model/register) + :algorithm :linear}) + :timeline (timeline/html)})))) + +(defn workload2 + "Tests CRDT reads and writes" + [opts] + (assoc (workload-common opts) + :checker (independent/checker + (checker/compose + {:reg-read-after-write (reg-read-after-write) + :timeline (timeline/html)})))) diff --git a/script/jepsen.garage/src/jepsen/garage/s3api.clj b/script/jepsen.garage/src/jepsen/garage/s3api.clj new file mode 100644 index 00000000..531e0157 --- /dev/null +++ b/script/jepsen.garage/src/jepsen/garage/s3api.clj @@ -0,0 +1,48 @@ +(ns jepsen.garage.s3api + (:require [clojure.tools.logging :refer :all] + [jepsen [control :as c]] + [amazonica.aws.s3 :as s3] + [slingshot.slingshot :refer [try+]])) + +; GARAGE S3 HELPER FUNCTIONS + +(defn get + "Helper for GetObject" + [creds k] + (try+ + (-> (s3/get-object creds (:bucket creds) k) + 
:input-stream + slurp) + (catch (re-find #"Key not found" (.getMessage %)) ex + nil))) + +(defn put + "Helper for PutObject or DeleteObject (is a delete if value is nil)" + [creds k v] + (if (= v nil) + (s3/delete-object creds + :bucket-name (:bucket creds) + :key k) + (let [some-bytes (.getBytes v "UTF-8") + bytes-stream (java.io.ByteArrayInputStream. some-bytes)] + (s3/put-object creds + :bucket-name (:bucket creds) + :key k + :input-stream bytes-stream + :metadata {:content-length (count some-bytes)})))) + +(defn list-inner [creds prefix ct accum] + (let [list-result (s3/list-objects-v2 creds + {:bucket-name (:bucket creds) + :prefix prefix + :continuation-token ct}) + new-object-summaries (:object-summaries list-result) + new-objects (map (fn [d] (:key d)) new-object-summaries) + objects (concat new-objects accum)] + (if (:truncated? list-result) + (list-inner creds prefix (:next-continuation-token list-result) objects) + objects))) +(defn list + "Helper for ListObjects -- just lists everything in the bucket" + [creds prefix] + (list-inner creds prefix nil [])) diff --git a/script/jepsen.garage/src/jepsen/garage/set.clj b/script/jepsen.garage/src/jepsen/garage/set.clj new file mode 100644 index 00000000..2c7a2ccd --- /dev/null +++ b/script/jepsen.garage/src/jepsen/garage/set.clj @@ -0,0 +1,135 @@ +(ns jepsen.garage.set + (:require [clojure.tools.logging :refer :all] + [clojure.string :as str] + [clojure.set :as set] + [jepsen [checker :as checker] + [cli :as cli] + [client :as client] + [control :as c] + [checker :as checker] + [db :as db] + [generator :as gen] + [independent :as independent] + [nemesis :as nemesis] + [util :as util] + [tests :as tests]] + [jepsen.checker.timeline :as timeline] + [jepsen.control.util :as cu] + [jepsen.os.debian :as debian] + [jepsen.garage.daemon :as grg] + [jepsen.garage.s3api :as s3] + [knossos.model :as model] + [slingshot.slingshot :refer [try+]])) + +(defn op-add-rand100 [_ _] {:type :invoke, :f :add, :value (rand-int 
100)}) +(defn op-read [_ _] {:type :invoke, :f :read, :value nil}) + +(defrecord SetClient [creds] + client/Client + (open! [this test node] + (assoc this :creds (grg/creds node))) + (setup! [this test]) + (invoke! [this test op] + (try+ + (let [[k v] (:value op) + prefix (str "set" k "/")] + (case (:f op) + :add + (util/timeout + 10000 + (assoc op :type :info, :error ::timeout) + (do + (s3/put (:creds this) (str prefix v) "present") + (assoc op :type :ok))) + :read + (util/timeout + 10000 + (assoc op :type :fail, :error ::timeout) + (do + (let [items (s3/list (:creds this) prefix)] + (let [items-stripped (map (fn [o] + (assert (str/starts-with? o prefix)) + (str/replace-first o prefix "")) items) + items-set (set (map parse-long items-stripped))] + (assoc op :type :ok, :value (independent/tuple k items-set)))))))) + (catch (re-find #"Unavailable" (.getMessage %)) ex + (assoc op :type :info, :error ::unavailable)) + (catch (re-find #"Broken pipe" (.getMessage %)) ex + (assoc op :type :info, :error ::broken-pipe)) + (catch (re-find #"Connection refused" (.getMessage %)) ex + (assoc op :type :info, :error ::connection-refused)))) + (teardown! [this test]) + (close! 
[this test])) + +(defn set-read-after-write + "Read-after-Write checker for set operations" + [] + (reify checker/Checker + (check [this test history opts] + (let [init {:add-started #{} + :add-done #{} + :read-must-contain {} + :missed #{} + :unexpected #{}} + final (reduce + (fn [state op] + (case [(:type op) (:f op)] + ([:invoke :add]) + (assoc state :add-started (conj (:add-started state) (:value op))) + ([:ok :add]) + (assoc state :add-done (conj (:add-done state) (:value op))) + ([:invoke :read]) + (assoc-in state [:read-must-contain (:process op)] (:add-done state)) + ([:ok :read]) + (let [read-must-contain (get (:read-must-contain state) (:process op)) + new-missed (set/difference read-must-contain (:value op)) + new-unexpected (set/difference (:value op) (:add-started state))] + (assoc state + :read-must-contain (dissoc (:read-must-contain state) (:process op)) + :missed (set/union (:missed state) new-missed), + :unexpected (set/union (:unexpected state) new-unexpected))) + state)) + init history) + valid? (and (empty? (:missed final)) (empty? (:unexpected final)))] + (assoc final :valid? valid?))))) + +(defn workload1 + "Tests insertions followed by a final read of each set" + [opts] + {:client (SetClient. nil) + :checker (independent/checker + (checker/compose + {:set (checker/set) + :timeline (timeline/html)})) + :generator (independent/concurrent-generator + 10 + (range 100) + (fn [k] + (->> (range) + (map (fn [x] {:type :invoke, :f :add, :value x})) + (gen/limit (:ops-per-key opts))))) + :final-generator (independent/concurrent-generator + 10 + (range 100) + (fn [k] + (gen/phases + (gen/once op-read) + (gen/sleep 5))))}) + +(defn workload2 + "Tests concurrent insertions and reads (read-after-write)" + [opts] + {:client (SetClient. nil) + :checker (independent/checker + (checker/compose + {:set-read-after-write (set-read-after-write) + ; :set-full (checker/set-full {:linearizable? 
false}) + :timeline (timeline/html)})) + :generator (independent/concurrent-generator + 10 + (range) + (fn [k] + (->> (gen/mix [op-add-rand100 op-read]) + (gen/limit (:ops-per-key opts)))))}) + + diff --git a/script/jepsen.garage/test/jepsen/garage_test.clj b/script/jepsen.garage/test/jepsen/garage_test.clj new file mode 100644 index 00000000..055392a1 --- /dev/null +++ b/script/jepsen.garage/test/jepsen/garage_test.clj @@ -0,0 +1,7 @@ +(ns jepsen.garage-test + (:require [clojure.test :refer :all] + [jepsen.garage :refer :all])) + +(deftest a-test + (testing "FIXME, I fail." + (is (= 0 1)))) diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs index d9bd600e..41a5e68c 100644 --- a/src/api/admin/api_server.rs +++ b/src/api/admin/api_server.rs @@ -182,7 +182,7 @@ impl AdminApiServer { ), }; let status_str = format!( - "{}\nConsult the full health check API endpoint at /v0/health for more details\n", + "{}\nConsult the full health check API endpoint at /v1/health for more details\n", status_str ); diff --git a/src/api/s3/api_server.rs b/src/api/s3/api_server.rs index d675ab61..887839dd 100644 --- a/src/api/s3/api_server.rs +++ b/src/api/s3/api_server.rs @@ -344,7 +344,7 @@ impl ApiHandler for S3ApiServer { bucket_id, key, upload_id, - part_number_marker: part_number_marker.map(|p| p.clamp(1, 10000)), + part_number_marker: part_number_marker.map(|p| p.min(10000)), max_parts: max_parts.unwrap_or(1000).clamp(1, 1000), }, ) |