aboutsummaryrefslogtreecommitdiff
path: root/cluster/prod/app/telemetry
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2022-08-25 13:59:40 +0200
committer Alex Auvolat <alex@adnab.me> 2022-08-25 13:59:40 +0200
commit 72d033dcd40a65ccf7f41f51af356ffc20144c30 (patch)
tree 2b1247bcae11a1f993590bc9db8bf08a53564350 /cluster/prod/app/telemetry
parent fd3ed44dad783c15f2793788f9384d48760666a3 (diff)
download nixcfg-72d033dcd40a65ccf7f41f51af356ffc20144c30.tar.gz
nixcfg-72d033dcd40a65ccf7f41f51af356ffc20144c30.zip
Remove garage files at bad location, add basic telemetry
Diffstat (limited to 'cluster/prod/app/telemetry')
-rw-r--r-- cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml 7
-rw-r--r-- cluster/prod/app/telemetry/config/grafana-litestream.yml 10
-rw-r--r-- cluster/prod/app/telemetry/config/prometheus.yml 30
-rw-r--r-- cluster/prod/app/telemetry/deploy/telemetry-system.hcl 49
-rw-r--r-- cluster/prod/app/telemetry/deploy/telemetry.hcl 189
-rw-r--r-- cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password 1
-rw-r--r-- cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key 1
-rw-r--r-- cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key 1
8 files changed, 288 insertions, 0 deletions
diff --git a/cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml b/cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml
new file mode 100644
index 0000000..9be89f9
--- /dev/null
+++ b/cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml
@@ -0,0 +1,7 @@
+apiVersion: 1
+
+datasources:
+  - name: DS_PROMETHEUS
+    type: prometheus
+    access: proxy
+    url: 'http://prometheus.service.prod.consul:9090'  # "prometheus" service on port 9090, addressed by its Consul name
diff --git a/cluster/prod/app/telemetry/config/grafana-litestream.yml b/cluster/prod/app/telemetry/config/grafana-litestream.yml
new file mode 100644
index 0000000..9d4d48a
--- /dev/null
+++ b/cluster/prod/app/telemetry/config/grafana-litestream.yml
@@ -0,0 +1,10 @@
+dbs:
+  - path: /ephemeral/grafana.db  # same path the restore-db task passes to "litestream restore"
+    replicas:
+      - url: s3://grafana-db/grafana.db
+        region: garage
+        endpoint: 'http://{{ env "attr.unique.network.ip-address" }}:3900'  # S3 endpoint on port 3900 of the address in this node attribute
+        access-key-id: '{{ key "secrets/telemetry/grafana/s3_access_key" | trimSpace }}'  # quoted so the rendered key is always a YAML string
+        secret-access-key: '{{ key "secrets/telemetry/grafana/s3_secret_key" | trimSpace }}'  # quoted for the same reason
+        force-path-style: true
+        sync-interval: 60s
diff --git a/cluster/prod/app/telemetry/config/prometheus.yml b/cluster/prod/app/telemetry/config/prometheus.yml
new file mode 100644
index 0000000..d30ee13
--- /dev/null
+++ b/cluster/prod/app/telemetry/config/prometheus.yml
@@ -0,0 +1,30 @@
+global:
+  scrape_interval: 15s  # By default, scrape targets every 15 seconds.
+
+scrape_configs:
+  - job_name: 'prometheus'  # Prometheus scraping itself
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'node-exporter'  # targets discovered from the Consul service "node-exporter"
+    consul_sd_configs:
+      - server: 'https://localhost:8501'
+        services:
+          - 'node-exporter'
+        tls_config:
+          ca_file: /etc/prometheus/consul.crt
+          cert_file: /etc/prometheus/consul-client.crt
+          key_file: /etc/prometheus/consul-client.key
+
+  - job_name: 'garage'  # targets discovered from the Consul service "garage-admin"
+    authorization:
+      type: Bearer
+      credentials: '{{ key "secrets/garage/metrics_token" | trimSpace }}'  # trimmed and quoted: stray whitespace in the KV value must not reach the token
+    consul_sd_configs:
+      - server: 'https://localhost:8501'
+        services:
+          - 'garage-admin'
+        tls_config:
+          ca_file: /etc/prometheus/consul.crt
+          cert_file: /etc/prometheus/consul-client.crt
+          key_file: /etc/prometheus/consul-client.key
diff --git a/cluster/prod/app/telemetry/deploy/telemetry-system.hcl b/cluster/prod/app/telemetry/deploy/telemetry-system.hcl
new file mode 100644
index 0000000..e4bde1a
--- /dev/null
+++ b/cluster/prod/app/telemetry/deploy/telemetry-system.hcl
@@ -0,0 +1,49 @@
+job "telemetry-system" {
+  datacenters = ["neptune", "orion"]
+  type        = "system"
+  priority    = 100  # integer, as documented for the Nomad job stanza (was the string "100")
+
+  group "collector" {
+    network {
+      port "node_exporter" { static = 9100 }
+    }
+
+    task "node_exporter" {
+      driver = "docker"
+
+      config {
+        image        = "quay.io/prometheus/node-exporter:v1.1.2"
+        network_mode = "host"
+        volumes = [
+          "/:/host:ro,rslave"  # host root filesystem, mounted read-only at /host
+        ]
+        args = [ "--path.rootfs=/host" ]  # points node-exporter at the mount above
+      }
+
+      resources {
+        cpu    = 50
+        memory = 40
+      }
+
+      service {
+        tags         = [ "telemetry" ]
+        port         = 9100
+        address_mode = "driver"
+        name         = "node-exporter"  # service name looked up by the 'node-exporter' scrape job in prometheus.yml
+        check {
+          type         = "http"
+          path         = "/"
+          port         = 9100
+          address_mode = "driver"
+          interval     = "60s"
+          timeout      = "5s"
+          check_restart {
+            limit           = 3
+            grace           = "90s"
+            ignore_warnings = false
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/cluster/prod/app/telemetry/deploy/telemetry.hcl b/cluster/prod/app/telemetry/deploy/telemetry.hcl
new file mode 100644
index 0000000..e1f1000
--- /dev/null
+++ b/cluster/prod/app/telemetry/deploy/telemetry.hcl
@@ -0,0 +1,189 @@
+job "telemetry" {
+  datacenters = ["neptune"]
+  type        = "service"
+
+  # Prometheus server; its scrape configuration is rendered from ../config/prometheus.yml.
+  group "prometheus" {
+    count = 1
+
+    network {
+      port "prometheus" { static = 9090 }
+    }
+
+    task "prometheus" {
+      driver = "docker"
+      config {
+        image        = "prom/prometheus:v2.38.0"
+        network_mode = "host"
+        ports        = [ "prometheus" ]
+        volumes = [
+          "secrets:/etc/prometheus"  # rendered templates below appear under /etc/prometheus
+        ]
+      }
+
+      template {
+        data        = file("../config/prometheus.yml")
+        destination = "secrets/prometheus.yml"
+      }
+
+      template {
+        data        = "{{ key \"secrets/consul/consul.crt\" }}"
+        destination = "secrets/consul.crt"
+      }
+
+      template {
+        data        = "{{ key \"secrets/consul/consul-client.crt\" }}"
+        destination = "secrets/consul-client.crt"
+      }
+
+      template {
+        data        = "{{ key \"secrets/consul/consul-client.key\" }}"
+        destination = "secrets/consul-client.key"
+      }
+
+      resources {
+        memory = 500
+        cpu    = 500
+      }
+
+      service {
+        port         = 9090
+        address_mode = "driver"
+        name         = "prometheus"
+        check {
+          type         = "http"
+          path         = "/"
+          port         = 9090
+          address_mode = "driver"
+          interval     = "60s"
+          timeout      = "5s"
+          check_restart {
+            limit           = 3
+            grace           = "90s"
+            ignore_warnings = false
+          }
+        }
+      }
+    }
+  }
+
+  group "grafana" {
+    count = 1
+
+    network {
+      port "grafana" { static = 3719 }
+    }
+
+    # Prestart, non-sidecar task: runs "litestream restore" for /ephemeral/grafana.db before Grafana starts.
+    task "restore-db" {
+      lifecycle {
+        hook    = "prestart"
+        sidecar = false
+      }
+
+      driver = "docker"
+      config {
+        image = "litestream/litestream:0.3.7"
+        args = [
+          "restore", "-config", "/etc/litestream.yml", "/ephemeral/grafana.db"
+        ]
+        volumes = [
+          "../alloc/data:/ephemeral",
+          "secrets/litestream.yml:/etc/litestream.yml"
+        ]
+      }
+      user = "472"  # NOTE(review): presumably the UID the grafana image runs as, so both containers can access the DB file; confirm
+
+      template {
+        data        = file("../config/grafana-litestream.yml")
+        destination = "secrets/litestream.yml"
+      }
+
+      resources {
+        memory = 200
+        cpu    = 1000
+      }
+    }
+
+    task "grafana" {
+      driver = "docker"
+      config {
+        image        = "grafana/grafana:8.4.3"
+        network_mode = "host"
+        ports        = [ "grafana" ]
+        volumes = [
+          "../alloc/data:/var/lib/grafana",  # same alloc directory the litestream tasks mount at /ephemeral
+          "secrets/prometheus.yaml:/etc/grafana/provisioning/datasources/prometheus.yaml"
+        ]
+      }
+
+      template {
+        data        = file("../config/grafana-datasource-prometheus.yaml")
+        destination = "secrets/prometheus.yaml"
+      }
+
+      # Grafana environment. The admin password is read from Consul KV so a fresh database does not keep Grafana's default credentials.
+      template {
+        data = <<EOH
+GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel,grafana-worldmap-panel,grafana-polystat-panel
+GF_SECURITY_ADMIN_PASSWORD={{ key "secrets/telemetry/grafana/admin_password" | trimSpace }}
+GF_SERVER_HTTP_PORT=3719
+EOH
+        destination = "secrets/env"
+        env         = true
+      }
+
+      resources {
+        memory = 500
+        cpu    = 100
+      }
+
+      service {
+        tags = [
+          "grafana",
+          "tricot grafana-new.deuxfleurs.fr",
+        ]
+        port         = 3719
+        address_mode = "driver"
+        name         = "grafana"
+        check {
+          type         = "tcp"
+          port         = 3719
+          address_mode = "driver"
+          interval     = "60s"
+          timeout      = "5s"
+          check_restart {
+            limit           = 3
+            grace           = "90s"
+            ignore_warnings = false
+          }
+        }
+      }
+    }
+
+    task "replicate-db" {  # runs "litestream replicate" alongside Grafana, using the same litestream.yml
+      driver = "docker"
+      config {
+        image = "litestream/litestream:0.3.7"
+        args = [
+          "replicate", "-config", "/etc/litestream.yml"
+        ]
+        volumes = [
+          "../alloc/data:/ephemeral",
+          "secrets/litestream.yml:/etc/litestream.yml"
+        ]
+      }
+      user = "472"  # same UID as the restore-db task
+
+      template {
+        data        = file("../config/grafana-litestream.yml")
+        destination = "secrets/litestream.yml"
+      }
+
+      resources {
+        memory = 200
+        cpu    = 100
+      }
+    }
+  }
+}
diff --git a/cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password b/cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password
new file mode 100644
index 0000000..2f36e97
--- /dev/null
+++ b/cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password
@@ -0,0 +1 @@
+CMD openssl rand -base64 12
diff --git a/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key
new file mode 100644
index 0000000..c7e41a4
--- /dev/null
+++ b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key
@@ -0,0 +1 @@
+USER S3 access key for grafana db
diff --git a/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key
new file mode 100644
index 0000000..051f41a
--- /dev/null
+++ b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key
@@ -0,0 +1 @@
+USER S3 secret key for grafana db