From 72d033dcd40a65ccf7f41f51af356ffc20144c30 Mon Sep 17 00:00:00 2001
From: Alex Auvolat
Date: Thu, 25 Aug 2022 13:59:40 +0200
Subject: Remove garage files at bad location, add basic telemetry

---
 cluster/prod/app/garage/config/garage.toml          |   4 +-
 cluster/prod/app/garage/deploy/garage.hcl           |  19 +++
 cluster/prod/app/garage/secrets/garage/admin_token  |   1 +
 .../prod/app/garage/secrets/garage/metrics_token    |   1 +
 .../config/grafana-datasource-prometheus.yaml       |   7 +
 .../app/telemetry/config/grafana-litestream.yml     |  10 ++
 cluster/prod/app/telemetry/config/prometheus.yml    |  30 ++++
 .../prod/app/telemetry/deploy/telemetry-system.hcl  |  49 ++++++
 cluster/prod/app/telemetry/deploy/telemetry.hcl     | 189 +++++++++++++++++++++
 .../secrets/telemetry/grafana/admin_password        |   1 +
 .../secrets/telemetry/grafana/s3_access_key         |   1 +
 .../secrets/telemetry/grafana/s3_secret_key         |   1 +
 cluster/prod/garage/config/garage.toml              |  24 ---
 cluster/prod/garage/deploy/garage.hcl               | 131 --------------
 cluster/prod/garage/secrets/garage/rpc_secret       |   1 -
 15 files changed, 312 insertions(+), 157 deletions(-)
 create mode 100644 cluster/prod/app/garage/secrets/garage/admin_token
 create mode 100644 cluster/prod/app/garage/secrets/garage/metrics_token
 create mode 100644 cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml
 create mode 100644 cluster/prod/app/telemetry/config/grafana-litestream.yml
 create mode 100644 cluster/prod/app/telemetry/config/prometheus.yml
 create mode 100644 cluster/prod/app/telemetry/deploy/telemetry-system.hcl
 create mode 100644 cluster/prod/app/telemetry/deploy/telemetry.hcl
 create mode 100644 cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password
 create mode 100644 cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key
 create mode 100644 cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key
 delete mode 100644 cluster/prod/garage/config/garage.toml
 delete mode 100644 cluster/prod/garage/deploy/garage.hcl
 delete mode 100644 cluster/prod/garage/secrets/garage/rpc_secret

diff --git a/cluster/prod/app/garage/config/garage.toml b/cluster/prod/app/garage/config/garage.toml
index a721886..224c755 100644
--- a/cluster/prod/app/garage/config/garage.toml
+++ b/cluster/prod/app/garage/config/garage.toml
@@ -21,4 +21,6 @@ bind_addr = "[::]:3902"
 root_domain = ".web.deuxfleurs.fr"
 
 [admin]
-api_bind_addr = "[::1]:3903"
+api_bind_addr = "[::]:3903"
+metrics_token = "{{ key "secrets/garage/metrics_token" | trimSpace }}"
+admin_token = "{{ key "secrets/garage/admin_token" | trimSpace }}"
diff --git a/cluster/prod/app/garage/deploy/garage.hcl b/cluster/prod/app/garage/deploy/garage.hcl
index bbaaec1..5a9c6b5 100644
--- a/cluster/prod/app/garage/deploy/garage.hcl
+++ b/cluster/prod/app/garage/deploy/garage.hcl
@@ -18,6 +18,7 @@ job "garage" {
       port "s3" { static = 3900 }
       port "rpc" { static = 3901 }
       port "web" { static = 3902 }
+      port "admin" { static = 3903 }
     }
 
     update {
@@ -125,6 +126,24 @@ job "garage" {
       }
     }
 
+    service {
+      port = 3903
+      address_mode = "driver"
+      name = "garage-admin"
+      check {
+        type = "tcp"
+        port = 3903
+        address_mode = "driver"
+        interval = "60s"
+        timeout = "5s"
+        check_restart {
+          limit = 3
+          grace = "90s"
+          ignore_warnings = false
+        }
+      }
+    }
+
     restart {
       interval = "30m"
       attempts = 10
diff --git a/cluster/prod/app/garage/secrets/garage/admin_token b/cluster/prod/app/garage/secrets/garage/admin_token
new file mode 100644
index 0000000..d831d53
--- /dev/null
+++ b/cluster/prod/app/garage/secrets/garage/admin_token
@@ -0,0 +1 @@
+CMD_ONCE openssl rand -hex 32
diff --git a/cluster/prod/app/garage/secrets/garage/metrics_token b/cluster/prod/app/garage/secrets/garage/metrics_token
new file mode 100644
index 0000000..d831d53
--- /dev/null
+++ b/cluster/prod/app/garage/secrets/garage/metrics_token
@@ -0,0 +1 @@
+CMD_ONCE openssl rand -hex 32
diff --git a/cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml b/cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml
new file mode 100644
index 0000000..9be89f9
--- /dev/null
+++ b/cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml
@@ -0,0 +1,7 @@
+apiVersion: 1
+
+datasources:
+  - name: DS_PROMETHEUS
+    type: prometheus
+    access: proxy
+    url: http://prometheus.service.prod.consul:9090
diff --git a/cluster/prod/app/telemetry/config/grafana-litestream.yml b/cluster/prod/app/telemetry/config/grafana-litestream.yml
new file mode 100644
index 0000000..9d4d48a
--- /dev/null
+++ b/cluster/prod/app/telemetry/config/grafana-litestream.yml
@@ -0,0 +1,10 @@
+dbs:
+  - path: /ephemeral/grafana.db
+    replicas:
+      - url: s3://grafana-db/grafana.db
+        region: garage
+        endpoint: http://{{ env "attr.unique.network.ip-address" }}:3900
+        access-key-id: {{ key "secrets/telemetry/grafana/s3_access_key" | trimSpace }}
+        secret-access-key: {{ key "secrets/telemetry/grafana/s3_secret_key" | trimSpace }}
+        force-path-style: true
+        sync-interval: 60s
diff --git a/cluster/prod/app/telemetry/config/prometheus.yml b/cluster/prod/app/telemetry/config/prometheus.yml
new file mode 100644
index 0000000..d30ee13
--- /dev/null
+++ b/cluster/prod/app/telemetry/config/prometheus.yml
@@ -0,0 +1,30 @@
+global:
+  scrape_interval: 15s # By default, scrape targets every 15 seconds.
+
+scrape_configs:
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'node-exporter'
+    consul_sd_configs:
+      - server: 'https://localhost:8501'
+        services:
+          - 'node-exporter'
+    tls_config:
+      ca_file: /etc/prometheus/consul.crt
+      cert_file: /etc/prometheus/consul-client.crt
+      key_file: /etc/prometheus/consul-client.key
+
+  - job_name: 'garage'
+    authorization:
+      type: Bearer
+      credentials: {{ key "secrets/garage/metrics_token" }}
+    consul_sd_configs:
+      - server: 'https://localhost:8501'
+        services:
+          - 'garage-admin'
+    tls_config:
+      ca_file: /etc/prometheus/consul.crt
+      cert_file: /etc/prometheus/consul-client.crt
+      key_file: /etc/prometheus/consul-client.key
diff --git a/cluster/prod/app/telemetry/deploy/telemetry-system.hcl b/cluster/prod/app/telemetry/deploy/telemetry-system.hcl
new file mode 100644
index 0000000..e4bde1a
--- /dev/null
+++ b/cluster/prod/app/telemetry/deploy/telemetry-system.hcl
@@ -0,0 +1,49 @@
+job "telemetry-system" {
+  datacenters = ["neptune", "orion"]
+  type = "system"
+  priority = "100"
+
+  group "collector" {
+    network {
+      port "node_exporter" { static = 9100 }
+    }
+
+    task "node_exporter" {
+      driver = "docker"
+
+      config {
+        image = "quay.io/prometheus/node-exporter:v1.1.2"
+        network_mode = "host"
+        volumes = [
+          "/:/host:ro,rslave"
+        ]
+        args = [ "--path.rootfs=/host" ]
+      }
+
+      resources {
+        cpu = 50
+        memory = 40
+      }
+
+      service {
+        tags = [ "telemetry" ]
+        port = 9100
+        address_mode = "driver"
+        name = "node-exporter"
+        check {
+          type = "http"
+          path = "/"
+          port = 9100
+          address_mode = "driver"
+          interval = "60s"
+          timeout = "5s"
+          check_restart {
+            limit = 3
+            grace = "90s"
+            ignore_warnings = false
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/cluster/prod/app/telemetry/deploy/telemetry.hcl b/cluster/prod/app/telemetry/deploy/telemetry.hcl
new file mode 100644
index 0000000..e1f1000
--- /dev/null
+++ b/cluster/prod/app/telemetry/deploy/telemetry.hcl
@@ -0,0 +1,189 @@
+job "telemetry" {
+  datacenters = ["neptune"]
+  type = "service"
+
+  group "prometheus" {
+    count = 1
+
+    network {
+      port "prometheus" {
+        static = 9090
+      }
+    }
+
+    task "prometheus" {
+      driver = "docker"
+      config {
+        image = "prom/prometheus:v2.38.0"
+        network_mode = "host"
+        ports = [ "prometheus" ]
+        volumes = [
+          "secrets:/etc/prometheus"
+        ]
+      }
+
+      template {
+        data = file("../config/prometheus.yml")
+        destination = "secrets/prometheus.yml"
+      }
+
+      template {
+        data = "{{ key \"secrets/consul/consul.crt\" }}"
+        destination = "secrets/consul.crt"
+      }
+
+      template {
+        data = "{{ key \"secrets/consul/consul-client.crt\" }}"
+        destination = "secrets/consul-client.crt"
+      }
+
+      template {
+        data = "{{ key \"secrets/consul/consul-client.key\" }}"
+        destination = "secrets/consul-client.key"
+      }
+
+      resources {
+        memory = 500
+        cpu = 500
+      }
+
+      service {
+        port = 9090
+        address_mode = "driver"
+        name = "prometheus"
+        check {
+          type = "http"
+          path = "/"
+          port = 9090
+          address_mode = "driver"
+          interval = "60s"
+          timeout = "5s"
+          check_restart {
+            limit = 3
+            grace = "90s"
+            ignore_warnings = false
+          }
+        }
+      }
+    }
+  }
+
+  group "grafana" {
+    count = 1
+
+    network {
+      port "grafana" {
+        static = 3719
+      }
+    }
+
+    task "restore-db" {
+      lifecycle {
+        hook = "prestart"
+        sidecar = false
+      }
+
+      driver = "docker"
+      config {
+        image = "litestream/litestream:0.3.7"
+        args = [
+          "restore", "-config", "/etc/litestream.yml", "/ephemeral/grafana.db"
+        ]
+        volumes = [
+          "../alloc/data:/ephemeral",
+          "secrets/litestream.yml:/etc/litestream.yml"
+        ]
+      }
+      user = "472"
+
+      template {
+        data = file("../config/grafana-litestream.yml")
+        destination = "secrets/litestream.yml"
+      }
+
+      resources {
+        memory = 200
+        cpu = 1000
+      }
+    }
+
+    task "grafana" {
+      driver = "docker"
+      config {
+        image = "grafana/grafana:8.4.3"
+        network_mode = "host"
+        ports = [ "grafana" ]
+        volumes = [
+          "../alloc/data:/var/lib/grafana",
+          "secrets/prometheus.yaml:/etc/grafana/provisioning/datasources/prometheus.yaml"
+        ]
+      }
+
+      template {
+        data = file("../config/grafana-datasource-prometheus.yaml")
+        destination = "secrets/prometheus.yaml"
+      }
+
+      template {
+        data = <
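
A quick way to sanity-check the wiring introduced above once it is deployed: garage.toml now binds the Garage admin API on port 3903 and protects its metrics with the generated metrics token, which prometheus.yml passes as a bearer credential when scraping the garage-admin Consul service. A minimal manual check from a cluster node could look like the sketch below, assuming the token value produced by the CMD_ONCE secret is exported in a hypothetical METRICS_TOKEN shell variable (not something the patch itself defines):

    # Query Garage's metrics endpoint through the admin API (port 3903, as bound in garage.toml).
    # METRICS_TOKEN is assumed to hold the value generated for secrets/garage/metrics_token.
    curl -H "Authorization: Bearer $METRICS_TOKEN" http://localhost:3903/metrics

If the admin port and token are wired correctly, this should return Prometheus-formatted metrics, i.e. the same data the 'garage' scrape job collects via Consul service discovery.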