diff options
-rw-r--r-- | doc/book/cookbook/monitoring.md | 305 | ||||
-rw-r--r-- | doc/book/cookbook/real-world.md | 46 | ||||
-rw-r--r-- | doc/book/cookbook/recovering.md | 2 | ||||
-rw-r--r-- | doc/book/cookbook/upgrading.md | 2 | ||||
-rw-r--r-- | doc/book/quick-start/_index.md | 3 | ||||
-rw-r--r-- | script/telemetry/grafana-garage-dashboard-prometheus.json | 1053 |
6 files changed, 1400 insertions, 11 deletions
diff --git a/doc/book/cookbook/monitoring.md b/doc/book/cookbook/monitoring.md new file mode 100644 index 00000000..0a3a33e7 --- /dev/null +++ b/doc/book/cookbook/monitoring.md @@ -0,0 +1,305 @@ ++++ +title = "Monitoring Garage" +weight = 40 ++++ + +Garage exposes some internal metrics in the Prometheus data format. +This page explains how to exploit these metrics. + +## Setting up monitoring + +### Enabling the Admin API endpoint + +If you have not already enabled the [administration API endpoint](@/documentation/reference-manual/admin-api.md), do so by adding the following lines to your configuration file: + +```toml +[admin] +api_bind_addr = "0.0.0.0:3903" +``` + +This will allow anyone to scrape Prometheus metrics by fetching +`http://localhost:3093/metrics`. If you want to restrict access +to the exported metrics, set the `metrics_token` configuration value +to a bearer token to be used when fetching the metrics endpoint. + +### Setting up Prometheus and Grafana + +Add a scrape config to your Prometheus daemon to scrape metrics from +all of your nodes: + +```yaml +scrape_configs: + - job_name: 'garage' + static_configs: + - targets: + - 'node1.mycluster:3903' + - 'node2.mycluster:3903' + - 'node3.mycluster:3903' +``` + +If you have set a metrics token in your Garage configuration file, +add the following lines in your Prometheus scrape config: + +```yaml + authorization: + type: Bearer + credentials: 'your metrics token' +``` + +To visualize the scraped data in Grafana, +you can either import our [Grafana dashboard for Garage](https://git.deuxfleurs.fr/Deuxfleurs/garage/raw/branch/main/script/telemetry/grafana-garage-dashboard-prometheus.json) +or make your own. +We detail below the list of exposed metrics and their meaning. + + + +## List of exported metrics + + +### API endpoints + +#### `api_admin_request_counter` (counter) + +Counts the number of requests to a given endpoint of the administration API. Example: + +``` +api_admin_request_counter{api_endpoint="Metrics"} 127041 +``` + +#### `api_admin_request_duration` (histogram) + +Evaluates the duration of API calls to the various administration API endpoint. Example: + +``` +api_admin_request_duration_bucket{api_endpoint="Metrics",le="0.5"} 127041 +api_admin_request_duration_sum{api_endpoint="Metrics"} 605.250344830999 +api_admin_request_duration_count{api_endpoint="Metrics"} 127041 +``` + +#### `api_s3_request_counter` (counter) + +Counts the number of requests to a given endpoint of the S3 API. Example: + +``` +api_s3_request_counter{api_endpoint="CreateMultipartUpload"} 1 +``` + +#### `api_s3_error_counter` (counter) + +Counts the number of requests to a given endpoint of the S3 API that returned an error. Example: + +``` +api_s3_error_counter{api_endpoint="GetObject",status_code="404"} 39 +``` + +#### `api_s3_request_duration` (histogram) + +Evaluates the duration of API calls to the various S3 API endpoints. Example: + +``` +api_s3_request_duration_bucket{api_endpoint="CreateMultipartUpload",le="0.5"} 1 +api_s3_request_duration_sum{api_endpoint="CreateMultipartUpload"} 0.046340762 +api_s3_request_duration_count{api_endpoint="CreateMultipartUpload"} 1 +``` + +#### `api_k2v_request_counter` (counter), `api_k2v_error_counter` (counter), `api_k2v_error_duration` (histogram) + +Same as for S3, for the K2V API. + + +### Web endpoint + + +#### `web_request_counter` (counter) + +Number of requests to the web endpoint + +``` +web_request_counter{method="GET"} 80 +``` + +#### `web_request_duration` (histogram) + +Duration of requests to the web endpoint + +``` +web_request_duration_bucket{method="GET",le="0.5"} 80 +web_request_duration_sum{method="GET"} 1.0528433229999998 +web_request_duration_count{method="GET"} 80 +``` + +#### `web_error_counter` (counter) + +Number of requests to the web endpoint resulting in errors + +``` +web_error_counter{method="GET",status_code="404 Not Found"} 64 +``` + + +### Data block manager + +#### `block_bytes_read`, `block_bytes_written` (counter) + +Number of bytes read/written to/from disk in the data storage directory. + +``` +block_bytes_read 120586322022 +block_bytes_written 3386618077 +``` + +#### `block_read_duration`, `block_write_duration` (histograms) + +Evaluates the duration of the reading/writing of individual data blocks in the data storage directory. + +``` +block_read_duration_bucket{le="0.5"} 169229 +block_read_duration_sum 2761.6902550310056 +block_read_duration_count 169240 +block_write_duration_bucket{le="0.5"} 3559 +block_write_duration_sum 195.59170078500006 +block_write_duration_count 3571 +``` + +#### `block_delete_counter` (counter) + +Counts the number of data blocks that have been deleted from storage. + +``` +block_delete_counter 122 +``` + +#### `block_resync_counter` (counter), `block_resync_duration` (histogram) + +Counts the number of resync operations the node has executed, and evaluates their duration. + +``` +block_resync_counter 308897 +block_resync_duration_bucket{le="0.5"} 308892 +block_resync_duration_sum 139.64204196100016 +block_resync_duration_count 308897 +``` + +#### `block_resync_queue_length` (gauge) + +The number of block hashes currently queued for a resync. +This is normal to be nonzero for long periods of time. + +``` +block_resync_queue_length 0 +``` + +#### `block_resync_errored_blocks` (gauge) + +The number of block hashes that we were unable to resync last time we tried. +**THIS SHOULD BE ZERO, OR FALL BACK TO ZERO RAPIDLY, IN A HEALTHY CLUSTER.** + +``` +block_resync_errored_blocks 0 +``` + + +### RPC (remote procedure calls) between nodes + +#### `rpc_netapp_request_counter` (counter) + +Number of RPC requests emitted + +``` +rpc_request_counter{from="<this node>",rpc_endpoint="garage_block/manager.rs/Rpc",to="<remote node>"} 176 +``` + +#### `rpc_netapp_error_counter` (counter) + +Number of communication errors (errors in the Netapp library) + +``` +rpc_netapp_error_counter{from="<this node>",rpc_endpoint="garage_block/manager.rs/Rpc",to="<remote node>"} 354 +``` + +#### `rpc_timeout_counter` (counter) + +Number of RPC timeouts + +``` +rpc_timeout_counter{from="<this node>",rpc_endpoint="garage_rpc/membership.rs/SystemRpc",to="<remote node>"} 1 +``` + +#### `rpc_duration` (histogram) + +The duration of internal RPC calls between Garage nodes. + +``` +rpc_duration_bucket{from="<this node>",rpc_endpoint="garage_block/manager.rs/Rpc",to="<remote node>",le="0.5"} 166 +rpc_duration_sum{from="<this node>",rpc_endpoint="garage_block/manager.rs/Rpc",to="<remote node>"} 35.172253716 +rpc_duration_count{from="<this node>",rpc_endpoint="garage_block/manager.rs/Rpc",to="<remote node>"} 174 +``` + + +### Metadata table manager + +#### `table_gc_todo_queue_length` (gauge) + +Table garbage collector TODO queue length + +``` +table_gc_todo_queue_length{table_name="block_ref"} 0 +``` + +#### `table_get_request_counter` (counter), `table_get_request_duration` (histogram) + +Number of get/get_range requests internally made on each table, and their duration. + +``` +table_get_request_counter{table_name="bucket_alias"} 315 +table_get_request_duration_bucket{table_name="bucket_alias",le="0.5"} 315 +table_get_request_duration_sum{table_name="bucket_alias"} 0.048509778000000024 +table_get_request_duration_count{table_name="bucket_alias"} 315 +``` + + +#### `table_put_request_counter` (counter), `table_put_request_duration` (histogram) + +Number of insert/insert_many requests internally made on this table, and their duration + +``` +table_put_request_counter{table_name="block_ref"} 677 +table_put_request_duration_bucket{table_name="block_ref",le="0.5"} 677 +table_put_request_duration_sum{table_name="block_ref"} 61.617528636 +table_put_request_duration_count{table_name="block_ref"} 677 +``` + +#### `table_internal_delete_counter` (counter) + +Number of value deletions in the tree (due to GC or repartitioning) + +``` +table_internal_delete_counter{table_name="block_ref"} 2296 +``` + +#### `table_internal_update_counter` (counter) + +Number of value updates where the value actually changes (includes creation of new key and update of existing key) + +``` +table_internal_update_counter{table_name="block_ref"} 5996 +``` + +#### `table_merkle_updater_todo_queue_length` (gauge) + +Merkle tree updater TODO queue length (should fall to zero rapidly) + +``` +table_merkle_updater_todo_queue_length{table_name="block_ref"} 0 +``` + +#### `table_sync_items_received`, `table_sync_items_sent` (counters) + +Number of data items sent to/recieved from other nodes during resync procedures + +``` +table_sync_items_received{from="<remote node>",table_name="bucket_v2"} 3 +table_sync_items_sent{table_name="block_ref",to="<remote node>"} 2 +``` + + diff --git a/doc/book/cookbook/real-world.md b/doc/book/cookbook/real-world.md index 4fcb5cf7..6fe7e547 100644 --- a/doc/book/cookbook/real-world.md +++ b/doc/book/cookbook/real-world.md @@ -11,8 +11,9 @@ We recommend first following the [quick start guide](@/documentation/quick-start to get familiar with Garage's command line and usage patterns. +## Preparing your environment -## Prerequisites +### Prerequisites To run a real-world deployment, make sure the following conditions are met: @@ -21,10 +22,6 @@ To run a real-world deployment, make sure the following conditions are met: - Each machine has a public IP address which is reachable by other machines. Running behind a NAT is likely to be possible but hasn't been tested for the latest version (TODO). -- Ideally, each machine should have a SSD available in addition to the HDD you are dedicating - to Garage. This will allow for faster access to metadata and has the potential - to significantly reduce Garage's response times. - - This guide will assume you are using Docker containers to deploy Garage on each node. Garage can also be run independently, for instance as a [Systemd service](@/documentation/cookbook/systemd.md). You can also use an orchestrator such as Nomad or Kubernetes to automatically manage @@ -49,6 +46,42 @@ available in the different locations of your cluster is roughly the same. For instance, here, the Mercury node could be moved to Brussels; this would allow the cluster to store 2 TB of data in total. +### Best practices + +- If you have fast dedicated networking between all your nodes, and are planing to store + very large files, bump the `block_size` configuration parameter to 10 MB + (`block_size = 10485760`). + +- Garage stores its files in two locations: it uses a metadata directory to store frequently-accessed + small metadata items, and a data directory to store data blocks of uploaded objects. + Ideally, the metadata directory would be stored on an SSD (smaller but faster), + and the data directory would be stored on an HDD (larger but slower). + +- For the data directory, Garage already does checksumming and integrity verification, + so there is no need to use a filesystem such as BTRFS or ZFS that does it. + We recommend using XFS for the data partition, as it has the best performance. + Ext4 is not recommended as it has more strict limitations on the number of inodes, + which might cause issues with Garage when large numbers of objects are stored. + +- If you only have an HDD and no SSD, it's fine to put your metadata alongside the data + on the same drive. Having lots of RAM for your kernel to cache the metadata will + help a lot with performance. Make sure to use the LMDB database engine, + instead of Sled, which suffers from quite bad performance degradation on HDDs. + Sled is still the default for legacy reasons, but is not recommended anymore. + +- For the metadata storage, Garage does not do checksumming and integrity + verification on its own. If you are afraid of bitrot/data corruption, + put your metadata directory on a BTRFS partition. Otherwise, just use regular + EXT4 or XFS. + +- Having a single server with several storage drives is currently not very well + supported in Garage ([#218](https://git.deuxfleurs.fr/Deuxfleurs/garage/issues/218)). + For an easy setup, just put all your drives in a RAID0 or a ZFS RAIDZ array. + If you're adventurous, you can try to format each of your disk as + a separate XFS partition, and then run one `garage` daemon per disk drive, + or use something like [`mergerfs`](https://github.com/trapexit/mergerfs) to merge + all your disks in a single union filesystem that spreads load over them. + ## Get a Docker image Our docker image is currently named `dxflrs/garage` and is stored on the [Docker Hub](https://hub.docker.com/r/dxflrs/garage/tags?page=1&ordering=last_updated). @@ -81,6 +114,7 @@ A valid `/etc/garage/garage.toml` for our cluster would look as follows: ```toml metadata_dir = "/var/lib/garage/meta" data_dir = "/var/lib/garage/data" +db_engine = "lmdb" replication_mode = "3" @@ -90,8 +124,6 @@ rpc_bind_addr = "[::]:3901" rpc_public_addr = "<this node's public IP>:3901" rpc_secret = "<RPC secret>" -bootstrap_peers = [] - [s3_api] s3_region = "garage" api_bind_addr = "[::]:3900" diff --git a/doc/book/cookbook/recovering.md b/doc/book/cookbook/recovering.md index 2424558c..2129a7f3 100644 --- a/doc/book/cookbook/recovering.md +++ b/doc/book/cookbook/recovering.md @@ -1,6 +1,6 @@ +++ title = "Recovering from failures" -weight = 35 +weight = 50 +++ Garage is meant to work on old, second-hand hardware. diff --git a/doc/book/cookbook/upgrading.md b/doc/book/cookbook/upgrading.md index f7659921..9f2ba73b 100644 --- a/doc/book/cookbook/upgrading.md +++ b/doc/book/cookbook/upgrading.md @@ -1,6 +1,6 @@ +++ title = "Upgrading Garage" -weight = 40 +weight = 60 +++ Garage is a stateful clustered application, where all nodes are communicating together and share data structures. diff --git a/doc/book/quick-start/_index.md b/doc/book/quick-start/_index.md index 03c542f9..ac55d2f7 100644 --- a/doc/book/quick-start/_index.md +++ b/doc/book/quick-start/_index.md @@ -54,6 +54,7 @@ to generate unique and private secrets for security reasons: cat > garage.toml <<EOF metadata_dir = "/tmp/meta" data_dir = "/tmp/data" +db_engine = "lmdb" replication_mode = "none" @@ -61,8 +62,6 @@ rpc_bind_addr = "[::]:3901" rpc_public_addr = "127.0.0.1:3901" rpc_secret = "$(openssl rand -hex 32)" -bootstrap_peers = [] - [s3_api] s3_region = "garage" api_bind_addr = "[::]:3900" diff --git a/script/telemetry/grafana-garage-dashboard-prometheus.json b/script/telemetry/grafana-garage-dashboard-prometheus.json new file mode 100644 index 00000000..28ef1ec0 --- /dev/null +++ b/script/telemetry/grafana-garage-dashboard-prometheus.json @@ -0,0 +1,1053 @@ +{ + "__inputs": [ + { + "name": "DS_DS_PROMETHEUS", + "label": "DS_PROMETHEUS", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.2.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 24, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "sum(rate(block_bytes_read{job=\"garage\"}[$__rate_interval]) )", + "hide": false, + "interval": "", + "legendFormat": "Disk bytes read", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "-sum(rate(block_bytes_written{job=\"garage\"}[$__rate_interval]) )", + "hide": false, + "interval": "", + "legendFormat": "Disk bytes written", + "refId": "B" + } + ], + "title": "Disk I/O", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum by (api_endpoint) (rate(api_s3_request_counter {job=\"garage\"}[$__rate_interval]))", + "hide": false, + "interval": "", + "legendFormat": "{{api_endpoint}}", + "range": true, + "refId": "A" + } + ], + "title": "API requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "sum(rate(web_request_counter {job=\"garage\"}[$__rate_interval]))", + "hide": false, + "interval": "", + "legendFormat": "Web request rate", + "refId": "A" + } + ], + "title": "Web requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 9 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "sum by (rpc_endpoint) (rate(rpc_request_counter {job=\"garage\"}[$__rate_interval]))", + "hide": false, + "interval": "", + "legendFormat": "{{rpc_endpoint}}", + "refId": "A" + } + ], + "title": "RPC requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 9 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum by (api_endpoint, status_code) (rate(api_s3_error_counter {job=\"garage\"}[$__rate_interval]))", + "hide": false, + "interval": "", + "legendFormat": "{{api_endpoint}} {{status_code}}", + "range": true, + "refId": "A" + } + ], + "title": "API errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 9 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "sum by(status_code) (rate(web_error_counter {job=\"garage\"}[$__rate_interval]))", + "hide": false, + "interval": "", + "legendFormat": "{{status_code}}", + "refId": "A" + } + ], + "title": "Web errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "10.83.2.3:3903" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 17 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "block_resync_queue_length{job=\"garage\"}", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Resync queue length", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 17 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "sum by(table_name) (table_gc_todo_queue_length{job=\"garage\"})", + "interval": "", + "legendFormat": "{{ table_name}}", + "refId": "A" + } + ], + "title": "Table GC queue length", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 17 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "sum by(table_name) (table_merkle_updater_todo_queue_length{job=\"garage\"})", + "interval": "", + "legendFormat": "{{ table_name}}", + "refId": "A" + } + ], + "title": "Table Merkle updater queue length", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 25 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "block_resync_errored_blocks{job=\"garage\"}", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Resync errored blocks", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Garage", + "uid": "ys3pnpZ4k", + "version": 26, + "weekStart": "" +}
\ No newline at end of file |