aboutsummaryrefslogtreecommitdiff
path: root/cluster/prod
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2022-08-25 13:59:40 +0200
committer Alex Auvolat <alex@adnab.me> 2022-08-25 13:59:40 +0200
commit 72d033dcd40a65ccf7f41f51af356ffc20144c30 (patch)
tree 2b1247bcae11a1f993590bc9db8bf08a53564350 /cluster/prod
parent fd3ed44dad783c15f2793788f9384d48760666a3 (diff)
download nixcfg-72d033dcd40a65ccf7f41f51af356ffc20144c30.tar.gz
nixcfg-72d033dcd40a65ccf7f41f51af356ffc20144c30.zip
Remove garage files at bad location, add basic telemetry
Diffstat (limited to 'cluster/prod')
-rw-r--r-- cluster/prod/app/garage/config/garage.toml 4
-rw-r--r-- cluster/prod/app/garage/deploy/garage.hcl 19
-rw-r--r-- cluster/prod/app/garage/secrets/garage/admin_token (renamed from cluster/prod/garage/secrets/garage/rpc_secret) 0
-rw-r--r-- cluster/prod/app/garage/secrets/garage/metrics_token 1
-rw-r--r-- cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml 7
-rw-r--r-- cluster/prod/app/telemetry/config/grafana-litestream.yml 10
-rw-r--r-- cluster/prod/app/telemetry/config/prometheus.yml 30
-rw-r--r-- cluster/prod/app/telemetry/deploy/telemetry-system.hcl 49
-rw-r--r-- cluster/prod/app/telemetry/deploy/telemetry.hcl 189
-rw-r--r-- cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password 1
-rw-r--r-- cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key 1
-rw-r--r-- cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key 1
-rw-r--r-- cluster/prod/garage/config/garage.toml 24
-rw-r--r-- cluster/prod/garage/deploy/garage.hcl 131
14 files changed, 311 insertions, 156 deletions
diff --git a/cluster/prod/app/garage/config/garage.toml b/cluster/prod/app/garage/config/garage.toml
index a721886..224c755 100644
--- a/cluster/prod/app/garage/config/garage.toml
+++ b/cluster/prod/app/garage/config/garage.toml
@@ -21,4 +21,6 @@ bind_addr = "[::]:3902"
root_domain = ".web.deuxfleurs.fr"
[admin]
-api_bind_addr = "[::1]:3903"
+api_bind_addr = "[::]:3903" # wildcard bind; the line this replaces bound the admin API to loopback [::1] only
+metrics_token = "{{ key "secrets/garage/metrics_token" | trimSpace }}" # same key that telemetry/config/prometheus.yml sends as Bearer credentials for its 'garage' scrape job
+admin_token = "{{ key "secrets/garage/admin_token" | trimSpace }}" # template lookup of secrets/garage/admin_token, surrounding whitespace trimmed
diff --git a/cluster/prod/app/garage/deploy/garage.hcl b/cluster/prod/app/garage/deploy/garage.hcl
index bbaaec1..5a9c6b5 100644
--- a/cluster/prod/app/garage/deploy/garage.hcl
+++ b/cluster/prod/app/garage/deploy/garage.hcl
@@ -18,6 +18,7 @@ job "garage" {
port "s3" { static = 3900 }
port "rpc" { static = 3901 }
port "web" { static = 3902 }
+ port "admin" { static = 3903 }
}
update {
@@ -125,6 +126,24 @@ job "garage" {
}
}
+ service { # registers port 3903 (the "admin" static port declared in this group's network stanza) under the service name that prometheus.yml's 'garage' job looks up
+ port = 3903
+ address_mode = "driver"
+ name = "garage-admin"
+ check { # TCP probe of the admin port, run every 60s with a 5s timeout
+ type = "tcp"
+ port = 3903
+ address_mode = "driver"
+ interval = "60s"
+ timeout = "5s"
+ check_restart { # same limit/grace/ignore_warnings values as the other check_restart stanzas in this diff
+ limit = 3
+ grace = "90s"
+ ignore_warnings = false
+ }
+ }
+ }
+
restart {
interval = "30m"
attempts = 10
diff --git a/cluster/prod/garage/secrets/garage/rpc_secret b/cluster/prod/app/garage/secrets/garage/admin_token
index d831d53..d831d53 100644
--- a/cluster/prod/garage/secrets/garage/rpc_secret
+++ b/cluster/prod/app/garage/secrets/garage/admin_token
diff --git a/cluster/prod/app/garage/secrets/garage/metrics_token b/cluster/prod/app/garage/secrets/garage/metrics_token
new file mode 100644
index 0000000..d831d53
--- /dev/null
+++ b/cluster/prod/app/garage/secrets/garage/metrics_token
@@ -0,0 +1 @@
+CMD_ONCE openssl rand -hex 32
diff --git a/cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml b/cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml
new file mode 100644
index 0000000..9be89f9
--- /dev/null
+++ b/cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml
@@ -0,0 +1,7 @@
+apiVersion: 1
+
+datasources: # Grafana datasource provisioning file; telemetry.hcl mounts it at /etc/grafana/provisioning/datasources/prometheus.yaml
+  - name: DS_PROMETHEUS
+    type: prometheus
+    access: proxy
+    url: http://prometheus.service.prod.consul:9090 # port 9090 and service name "prometheus" match the prometheus task in deploy/telemetry.hcl
diff --git a/cluster/prod/app/telemetry/config/grafana-litestream.yml b/cluster/prod/app/telemetry/config/grafana-litestream.yml
new file mode 100644
index 0000000..9d4d48a
--- /dev/null
+++ b/cluster/prod/app/telemetry/config/grafana-litestream.yml
@@ -0,0 +1,10 @@
+dbs: # Litestream config shared by the restore-db and replicate-db tasks in deploy/telemetry.hcl
+  - path: /ephemeral/grafana.db # /ephemeral is where both litestream tasks mount the allocation's data dir
+    replicas:
+      - url: s3://grafana-db/grafana.db
+        region: garage
+        endpoint: http://{{ env "attr.unique.network.ip-address" }}:3900 # port 3900 is the static "s3" port of the garage job
+        access-key-id: {{ key "secrets/telemetry/grafana/s3_access_key" | trimSpace }}
+        secret-access-key: {{ key "secrets/telemetry/grafana/s3_secret_key" | trimSpace }}
+        force-path-style: true
+        sync-interval: 60s
diff --git a/cluster/prod/app/telemetry/config/prometheus.yml b/cluster/prod/app/telemetry/config/prometheus.yml
new file mode 100644
index 0000000..d30ee13
--- /dev/null
+++ b/cluster/prod/app/telemetry/config/prometheus.yml
@@ -0,0 +1,30 @@
+global:
+  scrape_interval: 15s # By default, scrape targets every 15 seconds.
+
+scrape_configs:
+  - job_name: 'prometheus' # Prometheus scraping its own static port
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'node-exporter' # targets discovered through the Consul service registered by telemetry-system.hcl
+    consul_sd_configs:
+      - server: 'https://localhost:8501'
+        services:
+          - 'node-exporter'
+        tls_config: # certificate files are rendered into /etc/prometheus by the templates in deploy/telemetry.hcl
+          ca_file: /etc/prometheus/consul.crt
+          cert_file: /etc/prometheus/consul-client.crt
+          key_file: /etc/prometheus/consul-client.key
+
+  - job_name: 'garage' # targets are the "garage-admin" services registered by the garage job
+    authorization:
+      type: Bearer
+      credentials: {{ key "secrets/garage/metrics_token" | trimSpace }}
+    consul_sd_configs:
+      - server: 'https://localhost:8501'
+        services:
+          - 'garage-admin'
+        tls_config:
+          ca_file: /etc/prometheus/consul.crt
+          cert_file: /etc/prometheus/consul-client.crt
+          key_file: /etc/prometheus/consul-client.key
diff --git a/cluster/prod/app/telemetry/deploy/telemetry-system.hcl b/cluster/prod/app/telemetry/deploy/telemetry-system.hcl
new file mode 100644
index 0000000..e4bde1a
--- /dev/null
+++ b/cluster/prod/app/telemetry/deploy/telemetry-system.hcl
@@ -0,0 +1,49 @@
+job "telemetry-system" { # runs the node_exporter collector as a Nomad "system" job in both listed datacenters
+ datacenters = ["neptune", "orion"]
+ type = "system"
+ priority = 100 # Nomad job priority is an integer; written as a number rather than the string "100"
+
+ group "collector" {
+ network {
+ port "node_exporter" { static = 9100 }
+ }
+
+ task "node_exporter" {
+ driver = "docker"
+
+ config {
+ image = "quay.io/prometheus/node-exporter:v1.1.2"
+ network_mode = "host"
+ volumes = [
+ "/:/host:ro,rslave" # host root filesystem mounted read-only at /host
+ ]
+ args = [ "--path.rootfs=/host" ] # points the exporter at the mounted host root
+ }
+
+ resources {
+ cpu = 50
+ memory = 40
+ }
+
+ service { # "node-exporter" is the service name the 'node-exporter' scrape job in prometheus.yml discovers
+ tags = [ "telemetry" ]
+ port = 9100
+ address_mode = "driver"
+ name = "node-exporter"
+ check {
+ type = "http"
+ path = "/"
+ port = 9100
+ address_mode = "driver"
+ interval = "60s"
+ timeout = "5s"
+ check_restart {
+ limit = 3
+ grace = "90s"
+ ignore_warnings = false
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/cluster/prod/app/telemetry/deploy/telemetry.hcl b/cluster/prod/app/telemetry/deploy/telemetry.hcl
new file mode 100644
index 0000000..e1f1000
--- /dev/null
+++ b/cluster/prod/app/telemetry/deploy/telemetry.hcl
@@ -0,0 +1,189 @@
+job "telemetry" {
+ datacenters = ["neptune"]
+ type = "service"
+
+ group "prometheus" {
+ count = 1
+
+ network {
+ port "prometheus" {
+ static = 9090
+ }
+ }
+
+ task "prometheus" { # Prometheus server; its config and the Consul TLS material are all rendered into the task's secrets dir
+ driver = "docker"
+ config {
+ image = "prom/prometheus:v2.38.0"
+ network_mode = "host"
+ ports = [ "prometheus" ]
+ volumes = [
+ "secrets:/etc/prometheus" # every template destination below therefore appears under /etc/prometheus in the container
+ ]
+ }
+
+ template {
+ data = file("../config/prometheus.yml")
+ destination = "secrets/prometheus.yml"
+ }
+
+ template { # CA file referenced as /etc/prometheus/consul.crt by tls_config in prometheus.yml
+ data = "{{ key \"secrets/consul/consul.crt\" }}"
+ destination = "secrets/consul.crt"
+ }
+
+ template { # client certificate referenced as /etc/prometheus/consul-client.crt
+ data = "{{ key \"secrets/consul/consul-client.crt\" }}"
+ destination = "secrets/consul-client.crt"
+ }
+
+ template { # client key referenced as /etc/prometheus/consul-client.key
+ data = "{{ key \"secrets/consul/consul-client.key\" }}"
+ destination = "secrets/consul-client.key"
+ }
+
+ resources {
+ memory = 500
+ cpu = 500
+ }
+
+ service { # service name "prometheus" is what the Grafana datasource URL (prometheus.service.prod.consul:9090) points at
+ port = 9090
+ address_mode = "driver"
+ name = "prometheus"
+ check {
+ type = "http"
+ path = "/"
+ port = 9090
+ address_mode = "driver"
+ interval = "60s"
+ timeout = "5s"
+ check_restart {
+ limit = 3
+ grace = "90s"
+ ignore_warnings = false
+ }
+ }
+ }
+ }
+ }
+
+ group "grafana" {
+ count = 1
+
+ network {
+ port "grafana" {
+ static = 3719
+ }
+ }
+
+ task "restore-db" { # one-shot prestart task: pulls grafana.db from the S3 replica before the grafana task starts
+ lifecycle {
+ hook = "prestart"
+ sidecar = false
+ }
+
+ driver = "docker"
+ config {
+ image = "litestream/litestream:0.3.7"
+ args = [
+ "restore", "-if-replica-exists", "-config", "/etc/litestream.yml", "/ephemeral/grafana.db" # -if-replica-exists: exit 0 when the bucket holds no backup yet, so a first deployment is not blocked by this task failing
+ ]
+ volumes = [
+ "../alloc/data:/ephemeral", # allocation data dir; the grafana task mounts the same dir at /var/lib/grafana
+ "secrets/litestream.yml:/etc/litestream.yml"
+ ]
+ }
+ user = "472" # presumably the uid the grafana container runs as, so the restored file is writable by it; TODO confirm
+
+ template {
+ data = file("../config/grafana-litestream.yml")
+ destination = "secrets/litestream.yml"
+ }
+
+ resources {
+ memory = 200
+ cpu = 1000
+ }
+ }
+
+ task "grafana" { # Grafana server listening on static port 3719 and provisioned with the Prometheus datasource
+ driver = "docker"
+ config {
+ image = "grafana/grafana:8.4.3"
+ network_mode = "host"
+ ports = [ "grafana" ]
+ volumes = [
+ "../alloc/data:/var/lib/grafana", # same allocation data dir the litestream tasks mount at /ephemeral
+ "secrets/prometheus.yaml:/etc/grafana/provisioning/datasources/prometheus.yaml"
+ ]
+ }
+
+ template {
+ data = file("../config/grafana-datasource-prometheus.yaml")
+ destination = "secrets/prometheus.yaml"
+ }
+
+ template { # environment file: plugin list plus the HTTP port, which matches the group's static port 3719
+ data = <<EOH
+GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel,grafana-worldmap-panel,grafana-polystat-panel
+GF_SERVER_HTTP_PORT=3719
+EOH
+ destination = "secrets/env" # NOTE(review): this commit also adds secrets/telemetry/grafana/admin_password, but nothing visible here exports it (e.g. as GF_SECURITY_ADMIN_PASSWORD); confirm this is intended
+ env = true
+ }
+
+ resources {
+ memory = 500
+ cpu = 100
+ }
+
+ service {
+ tags = [
+ "grafana",
+ "tricot grafana-new.deuxfleurs.fr",
+ ]
+ port = 3719
+ address_mode = "driver"
+ name = "grafana"
+ check { # TCP probe of port 3719, run every 60s with a 5s timeout
+ type = "tcp"
+ port = 3719
+ address_mode = "driver"
+ interval = "60s"
+ timeout = "5s"
+ check_restart {
+ limit = 3
+ grace = "90s"
+ ignore_warnings = false
+ }
+ }
+ }
+ }
+
+ task "replicate-db" { # runs "litestream replicate" with the same image, mounts and config file as the restore-db task
+ driver = "docker"
+ config {
+ image = "litestream/litestream:0.3.7"
+ args = [
+ "replicate", "-config", "/etc/litestream.yml"
+ ]
+ volumes = [
+ "../alloc/data:/ephemeral", # allocation data dir; the grafana task mounts the same dir at /var/lib/grafana
+ "secrets/litestream.yml:/etc/litestream.yml"
+ ]
+ }
+ user = "472" # same uid as the restore-db task
+
+ template { # rendered separately from restore-db's copy because each task has its own secrets dir
+ data = file("../config/grafana-litestream.yml")
+ destination = "secrets/litestream.yml"
+ }
+
+ resources {
+ memory = 200
+ cpu = 100
+ }
+ }
+ }
+}
diff --git a/cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password b/cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password
new file mode 100644
index 0000000..2f36e97
--- /dev/null
+++ b/cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password
@@ -0,0 +1 @@
+CMD openssl rand -base64 12
diff --git a/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key
new file mode 100644
index 0000000..c7e41a4
--- /dev/null
+++ b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key
@@ -0,0 +1 @@
+USER S3 access key for grafana db
diff --git a/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key
new file mode 100644
index 0000000..051f41a
--- /dev/null
+++ b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key
@@ -0,0 +1 @@
+USER S3 secret key for grafana db
diff --git a/cluster/prod/garage/config/garage.toml b/cluster/prod/garage/config/garage.toml
deleted file mode 100644
index a721886..0000000
--- a/cluster/prod/garage/config/garage.toml
+++ /dev/null
@@ -1,24 +0,0 @@
-block_size = 1048576
-
-metadata_dir = "/meta"
-data_dir = "/data"
-
-replication_mode = "3"
-
-rpc_bind_addr = "[::]:3901"
-rpc_secret = "{{ key "secrets/garage/rpc_secret" | trimSpace }}"
-
-sled_cache_capacity = 536870912
-sled_sync_interval_ms = 10000
-
-[s3_api]
-s3_region = "garage"
-api_bind_addr = "[::]:3900"
-root_domain = ".garage.deuxfleurs.fr"
-
-[s3_web]
-bind_addr = "[::]:3902"
-root_domain = ".web.deuxfleurs.fr"
-
-[admin]
-api_bind_addr = "[::1]:3903"
diff --git a/cluster/prod/garage/deploy/garage.hcl b/cluster/prod/garage/deploy/garage.hcl
deleted file mode 100644
index 665515a..0000000
--- a/cluster/prod/garage/deploy/garage.hcl
+++ /dev/null
@@ -1,131 +0,0 @@
-job "garage" {
- datacenters = ["dc1", "saturne", "neptune"]
- type = "system"
- priority = 80
-
- constraint {
- attribute = "${attr.cpu.arch}"
- value = "amd64"
- }
-
- group "garage" {
- network {
- port "s3" { static = 3900 }
- port "rpc" { static = 3901 }
- port "web" { static = 3902 }
- }
-
- update {
- max_parallel = 1
- min_healthy_time = "30s"
- healthy_deadline = "5m"
- }
-
- task "server" {
- driver = "docker"
- config {
- advertise_ipv6_address = true
- image = "dxflrs/amd64_garage:v0.7.1"
- command = "/garage"
- args = [ "server" ]
- network_mode = "host"
- volumes = [
- "/mnt/storage/garage/data:/data",
- "/mnt/ssd/garage/meta:/meta",
- "secrets/garage.toml:/etc/garage.toml",
- ]
- logging {
- type = "journald"
- }
- }
-
- template {
- data = file("../config/garage.toml")
- destination = "secrets/garage.toml"
- }
-
- resources {
- memory = 1500
- cpu = 1000
- }
-
- kill_signal = "SIGINT"
- kill_timeout = "20s"
-
- service {
- tags = [
- "garage_api",
- "tricot garage.deuxfleurs.fr",
- "tricot *.garage.deuxfleurs.fr",
- ]
- port = 3900
- address_mode = "driver"
- name = "garage-api"
- check {
- type = "tcp"
- port = 3900
- address_mode = "driver"
- interval = "60s"
- timeout = "5s"
- check_restart {
- limit = 3
- grace = "90s"
- ignore_warnings = false
- }
- }
- }
-
- service {
- tags = ["garage-rpc"]
- port = 3901
- address_mode = "driver"
- name = "garage-rpc"
- check {
- type = "tcp"
- port = 3901
- address_mode = "driver"
- interval = "60s"
- timeout = "5s"
- check_restart {
- limit = 3
- grace = "90s"
- ignore_warnings = false
- }
- }
- }
-
- service {
- tags = [
- "garage-web",
- "tricot * 1",
- "tricot-add-header Content-Security-Policy default-src 'self' 'unsafe-inline'; script-src 'self' 'unsafe-inline' https://code.jquery.com/; frame-ancestors 'self'",
- "tricot-add-header Strict-Transport-Security max-age=63072000; includeSubDomains; preload",
- "tricot-add-header X-Frame-Options SAMEORIGIN",
- "tricot-add-header X-XSS-Protection 1; mode=block",
- ]
- port = 3902
- address_mode = "driver"
- name = "garage-web"
- check {
- type = "tcp"
- port = 3902
- address_mode = "driver"
- interval = "60s"
- timeout = "5s"
- check_restart {
- limit = 3
- grace = "90s"
- ignore_warnings = false
- }
- }
- }
-
- restart {
- interval = "30m"
- attempts = 10
- delay = "15s"
- mode = "delay"
- }
- }
- }
-}