diff options
Diffstat (limited to 'cluster/prod/app/telemetry')
8 files changed, 288 insertions, 0 deletions
Note: this patch adds the prod telemetry stack: Prometheus and Grafana (with a
Litestream-replicated SQLite database) as the Nomad service job "telemetry", and
node-exporter as the Nomad system job "telemetry-system".
Note (review): line breaks and indentation below were reconstructed from a
whitespace-collapsed listing. Added-line counts match every hunk header, but the
indentation widths and the YAML nesting (in particular tls_config placed under
each consul_sd_configs entry) are inferred; confirm against the repository
before applying.

diff --git a/cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml b/cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml
new file mode 100644
index 0000000..9be89f9
--- /dev/null
+++ b/cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml
@@ -0,0 +1,7 @@
+apiVersion: 1
+
+datasources:
+  - name: DS_PROMETHEUS
+    type: prometheus
+    access: proxy
+    url: http://prometheus.service.prod.consul:9090
diff --git a/cluster/prod/app/telemetry/config/grafana-litestream.yml b/cluster/prod/app/telemetry/config/grafana-litestream.yml
new file mode 100644
index 0000000..9d4d48a
--- /dev/null
+++ b/cluster/prod/app/telemetry/config/grafana-litestream.yml
@@ -0,0 +1,10 @@
+dbs:
+  - path: /ephemeral/grafana.db
+    replicas:
+      - url: s3://grafana-db/grafana.db
+        region: garage
+        endpoint: http://{{ env "attr.unique.network.ip-address" }}:3900
+        access-key-id: {{ key "secrets/telemetry/grafana/s3_access_key" | trimSpace }}
+        secret-access-key: {{ key "secrets/telemetry/grafana/s3_secret_key" | trimSpace }}
+        force-path-style: true
+        sync-interval: 60s
diff --git a/cluster/prod/app/telemetry/config/prometheus.yml b/cluster/prod/app/telemetry/config/prometheus.yml
new file mode 100644
index 0000000..d30ee13
--- /dev/null
+++ b/cluster/prod/app/telemetry/config/prometheus.yml
@@ -0,0 +1,30 @@
+global:
+  scrape_interval: 15s # By default, scrape targets every 15 seconds.
+
+scrape_configs:
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'node-exporter'
+    consul_sd_configs:
+      - server: 'https://localhost:8501'
+        services:
+          - 'node-exporter'
+        tls_config:
+          ca_file: /etc/prometheus/consul.crt
+          cert_file: /etc/prometheus/consul-client.crt
+          key_file: /etc/prometheus/consul-client.key
+
+  - job_name: 'garage'
+    authorization:
+      type: Bearer
+      credentials: {{ key "secrets/garage/metrics_token" }}
+    consul_sd_configs:
+      - server: 'https://localhost:8501'
+        services:
+          - 'garage-admin'
+        tls_config:
+          ca_file: /etc/prometheus/consul.crt
+          cert_file: /etc/prometheus/consul-client.crt
+          key_file: /etc/prometheus/consul-client.key
diff --git a/cluster/prod/app/telemetry/deploy/telemetry-system.hcl b/cluster/prod/app/telemetry/deploy/telemetry-system.hcl
new file mode 100644
index 0000000..e4bde1a
--- /dev/null
+++ b/cluster/prod/app/telemetry/deploy/telemetry-system.hcl
@@ -0,0 +1,49 @@
+job "telemetry-system" {
+  datacenters = ["neptune", "orion"]
+  type = "system"
+  priority = "100"
+
+  group "collector" {
+    network {
+      port "node_exporter" { static = 9100 }
+    }
+
+    task "node_exporter" {
+      driver = "docker"
+
+      config {
+        image = "quay.io/prometheus/node-exporter:v1.1.2"
+        network_mode = "host"
+        volumes = [
+          "/:/host:ro,rslave"
+        ]
+        args = [ "--path.rootfs=/host" ]
+      }
+
+      resources {
+        cpu = 50
+        memory = 40
+      }
+
+      service {
+        tags = [ "telemetry" ]
+        port = 9100
+        address_mode = "driver"
+        name = "node-exporter"
+        check {
+          type = "http"
+          path = "/"
+          port = 9100
+          address_mode = "driver"
+          interval = "60s"
+          timeout = "5s"
+          check_restart {
+            limit = 3
+            grace = "90s"
+            ignore_warnings = false
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/cluster/prod/app/telemetry/deploy/telemetry.hcl b/cluster/prod/app/telemetry/deploy/telemetry.hcl
new file mode 100644
index 0000000..e1f1000
--- /dev/null
+++ b/cluster/prod/app/telemetry/deploy/telemetry.hcl
@@ -0,0 +1,189 @@
+job "telemetry" {
+  datacenters = ["neptune"]
+  type = "service"
+
+  group "prometheus" {
+    count = 1
+
+    network {
+      port "prometheus" {
+        static = 9090
+      }
+    }
+
+    task "prometheus" {
+      driver = "docker"
+      config {
+        image = "prom/prometheus:v2.38.0"
+        network_mode = "host"
+        ports = [ "prometheus" ]
+        volumes = [
+          "secrets:/etc/prometheus"
+        ]
+      }
+
+      template {
+        data = file("../config/prometheus.yml")
+        destination = "secrets/prometheus.yml"
+      }
+
+      template {
+        data = "{{ key \"secrets/consul/consul.crt\" }}"
+        destination = "secrets/consul.crt"
+      }
+
+      template {
+        data = "{{ key \"secrets/consul/consul-client.crt\" }}"
+        destination = "secrets/consul-client.crt"
+      }
+
+      template {
+        data = "{{ key \"secrets/consul/consul-client.key\" }}"
+        destination = "secrets/consul-client.key"
+      }
+
+      resources {
+        memory = 500
+        cpu = 500
+      }
+
+      service {
+        port = 9090
+        address_mode = "driver"
+        name = "prometheus"
+        check {
+          type = "http"
+          path = "/"
+          port = 9090
+          address_mode = "driver"
+          interval = "60s"
+          timeout = "5s"
+          check_restart {
+            limit = 3
+            grace = "90s"
+            ignore_warnings = false
+          }
+        }
+      }
+    }
+  }
+
+  group "grafana" {
+    count = 1
+
+    network {
+      port "grafana" {
+        static = 3719
+      }
+    }
+
+    task "restore-db" {
+      lifecycle {
+        hook = "prestart"
+        sidecar = false
+      }
+
+      driver = "docker"
+      config {
+        image = "litestream/litestream:0.3.7"
+        args = [
+          "restore", "-config", "/etc/litestream.yml", "/ephemeral/grafana.db"
+        ]
+        volumes = [
+          "../alloc/data:/ephemeral",
+          "secrets/litestream.yml:/etc/litestream.yml"
+        ]
+      }
+      user = "472"
+
+      template {
+        data = file("../config/grafana-litestream.yml")
+        destination = "secrets/litestream.yml"
+      }
+
+      resources {
+        memory = 200
+        cpu = 1000
+      }
+    }
+
+    task "grafana" {
+      driver = "docker"
+      config {
+        image = "grafana/grafana:8.4.3"
+        network_mode = "host"
+        ports = [ "grafana" ]
+        volumes = [
+          "../alloc/data:/var/lib/grafana",
+          "secrets/prometheus.yaml:/etc/grafana/provisioning/datasources/prometheus.yaml"
+        ]
+      }
+
+      template {
+        data = file("../config/grafana-datasource-prometheus.yaml")
+        destination = "secrets/prometheus.yaml"
+      }
+
+      template {
+        data = <<EOH
+GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel,grafana-worldmap-panel,grafana-polystat-panel
+GF_SERVER_HTTP_PORT=3719
+EOH
+        destination = "secrets/env"
+        env = true
+      }
+
+      resources {
+        memory = 500
+        cpu = 100
+      }
+
+      service {
+        tags = [
+          "grafana",
+          "tricot grafana-new.deuxfleurs.fr",
+        ]
+        port = 3719
+        address_mode = "driver"
+        name = "grafana"
+        check {
+          type = "tcp"
+          port = 3719
+          address_mode = "driver"
+          interval = "60s"
+          timeout = "5s"
+          check_restart {
+            limit = 3
+            grace = "90s"
+            ignore_warnings = false
+          }
+        }
+      }
+    }
+
+    task "replicate-db" {
+      driver = "docker"
+      config {
+        image = "litestream/litestream:0.3.7"
+        args = [
+          "replicate", "-config", "/etc/litestream.yml"
+        ]
+        volumes = [
+          "../alloc/data:/ephemeral",
+          "secrets/litestream.yml:/etc/litestream.yml"
+        ]
+      }
+      user = "472"
+
+      template {
+        data = file("../config/grafana-litestream.yml")
+        destination = "secrets/litestream.yml"
+      }
+
+      resources {
+        memory = 200
+        cpu = 100
+      }
+    }
+  }
+}
diff --git a/cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password b/cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password
new file mode 100644
index 0000000..2f36e97
--- /dev/null
+++ b/cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password
@@ -0,0 +1 @@
+CMD openssl rand -base64 12
diff --git a/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key
new file mode 100644
index 0000000..c7e41a4
--- /dev/null
+++ b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key
@@ -0,0 +1 @@
+USER S3 access key for grafana db
diff --git a/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key
new file mode 100644
index 0000000..051f41a
--- /dev/null
+++ b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key
@@ -0,0 +1 @@
+USER S3 secret key for grafana db