diff options
-rw-r--r-- | cluster/prod/app/garage/config/garage.toml | 4 | ||||
-rw-r--r-- | cluster/prod/app/garage/deploy/garage.hcl | 19 | ||||
-rw-r--r-- | cluster/prod/app/garage/secrets/garage/admin_token (renamed from cluster/prod/garage/secrets/garage/rpc_secret) | 0 | ||||
-rw-r--r-- | cluster/prod/app/garage/secrets/garage/metrics_token | 1 | ||||
-rw-r--r-- | cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml | 7 | ||||
-rw-r--r-- | cluster/prod/app/telemetry/config/grafana-litestream.yml | 10 | ||||
-rw-r--r-- | cluster/prod/app/telemetry/config/prometheus.yml | 30 | ||||
-rw-r--r-- | cluster/prod/app/telemetry/deploy/telemetry-system.hcl | 49 | ||||
-rw-r--r-- | cluster/prod/app/telemetry/deploy/telemetry.hcl | 189 | ||||
-rw-r--r-- | cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password | 1 | ||||
-rw-r--r-- | cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key | 1 | ||||
-rw-r--r-- | cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key | 1 | ||||
-rw-r--r-- | cluster/prod/garage/config/garage.toml | 24 | ||||
-rw-r--r-- | cluster/prod/garage/deploy/garage.hcl | 131 |
14 files changed, 311 insertions, 156 deletions
diff --git a/cluster/prod/app/garage/config/garage.toml b/cluster/prod/app/garage/config/garage.toml index a721886..224c755 100644 --- a/cluster/prod/app/garage/config/garage.toml +++ b/cluster/prod/app/garage/config/garage.toml @@ -21,4 +21,6 @@ bind_addr = "[::]:3902" root_domain = ".web.deuxfleurs.fr" [admin] -api_bind_addr = "[::1]:3903" +api_bind_addr = "[::]:3903" +metrics_token = "{{ key "secrets/garage/metrics_token" | trimSpace }}" +admin_token = "{{ key "secrets/garage/admin_token" | trimSpace }}" diff --git a/cluster/prod/app/garage/deploy/garage.hcl b/cluster/prod/app/garage/deploy/garage.hcl index bbaaec1..5a9c6b5 100644 --- a/cluster/prod/app/garage/deploy/garage.hcl +++ b/cluster/prod/app/garage/deploy/garage.hcl @@ -18,6 +18,7 @@ job "garage" { port "s3" { static = 3900 } port "rpc" { static = 3901 } port "web" { static = 3902 } + port "admin" { static = 3903 } } update { @@ -125,6 +126,24 @@ job "garage" { } } + service { + port = 3903 + address_mode = "driver" + name = "garage-admin" + check { + type = "tcp" + port = 3903 + address_mode = "driver" + interval = "60s" + timeout = "5s" + check_restart { + limit = 3 + grace = "90s" + ignore_warnings = false + } + } + } + restart { interval = "30m" attempts = 10 diff --git a/cluster/prod/garage/secrets/garage/rpc_secret b/cluster/prod/app/garage/secrets/garage/admin_token index d831d53..d831d53 100644 --- a/cluster/prod/garage/secrets/garage/rpc_secret +++ b/cluster/prod/app/garage/secrets/garage/admin_token diff --git a/cluster/prod/app/garage/secrets/garage/metrics_token b/cluster/prod/app/garage/secrets/garage/metrics_token new file mode 100644 index 0000000..d831d53 --- /dev/null +++ b/cluster/prod/app/garage/secrets/garage/metrics_token @@ -0,0 +1 @@ +CMD_ONCE openssl rand -hex 32 diff --git a/cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml b/cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml new file mode 100644 index 0000000..9be89f9 --- /dev/null +++ b/cluster/prod/app/telemetry/config/grafana-datasource-prometheus.yaml @@ -0,0 +1,7 @@ +apiVersion: 1 + +datasources: + - name: DS_PROMETHEUS + type: prometheus + access: proxy + url: http://prometheus.service.prod.consul:9090 diff --git a/cluster/prod/app/telemetry/config/grafana-litestream.yml b/cluster/prod/app/telemetry/config/grafana-litestream.yml new file mode 100644 index 0000000..9d4d48a --- /dev/null +++ b/cluster/prod/app/telemetry/config/grafana-litestream.yml @@ -0,0 +1,10 @@ +dbs: + - path: /ephemeral/grafana.db + replicas: + - url: s3://grafana-db/grafana.db + region: garage + endpoint: http://{{ env "attr.unique.network.ip-address" }}:3900 + access-key-id: {{ key "secrets/telemetry/grafana/s3_access_key" | trimSpace }} + secret-access-key: {{ key "secrets/telemetry/grafana/s3_secret_key" | trimSpace }} + force-path-style: true + sync-interval: 60s diff --git a/cluster/prod/app/telemetry/config/prometheus.yml b/cluster/prod/app/telemetry/config/prometheus.yml new file mode 100644 index 0000000..d30ee13 --- /dev/null +++ b/cluster/prod/app/telemetry/config/prometheus.yml @@ -0,0 +1,30 @@ +global: + scrape_interval: 15s # By default, scrape targets every 15 seconds. + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'node-exporter' + consul_sd_configs: + - server: 'https://localhost:8501' + services: + - 'node-exporter' + tls_config: + ca_file: /etc/prometheus/consul.crt + cert_file: /etc/prometheus/consul-client.crt + key_file: /etc/prometheus/consul-client.key + + - job_name: 'garage' + authorization: + type: Bearer + credentials: {{ key "secrets/garage/metrics_token" }} + consul_sd_configs: + - server: 'https://localhost:8501' + services: + - 'garage-admin' + tls_config: + ca_file: /etc/prometheus/consul.crt + cert_file: /etc/prometheus/consul-client.crt + key_file: /etc/prometheus/consul-client.key diff --git a/cluster/prod/app/telemetry/deploy/telemetry-system.hcl b/cluster/prod/app/telemetry/deploy/telemetry-system.hcl new file mode 100644 index 0000000..e4bde1a --- /dev/null +++ b/cluster/prod/app/telemetry/deploy/telemetry-system.hcl @@ -0,0 +1,49 @@ +job "telemetry-system" { + datacenters = ["neptune", "orion"] + type = "system" + priority = "100" + + group "collector" { + network { + port "node_exporter" { static = 9100 } + } + + task "node_exporter" { + driver = "docker" + + config { + image = "quay.io/prometheus/node-exporter:v1.1.2" + network_mode = "host" + volumes = [ + "/:/host:ro,rslave" + ] + args = [ "--path.rootfs=/host" ] + } + + resources { + cpu = 50 + memory = 40 + } + + service { + tags = [ "telemetry" ] + port = 9100 + address_mode = "driver" + name = "node-exporter" + check { + type = "http" + path = "/" + port = 9100 + address_mode = "driver" + interval = "60s" + timeout = "5s" + check_restart { + limit = 3 + grace = "90s" + ignore_warnings = false + } + } + } + } + } +} diff --git a/cluster/prod/app/telemetry/deploy/telemetry.hcl b/cluster/prod/app/telemetry/deploy/telemetry.hcl new file mode 100644 index 0000000..e1f1000 --- /dev/null +++ b/cluster/prod/app/telemetry/deploy/telemetry.hcl @@ -0,0 +1,189 @@ +job "telemetry" { + datacenters = ["neptune"] + type = "service" + + group "prometheus" { + count = 1 + + network { + port "prometheus" { + static = 9090 + } + } + + task "prometheus" { + driver = "docker" + config { + image = "prom/prometheus:v2.38.0" + network_mode = "host" + ports = [ "prometheus" ] + volumes = [ + "secrets:/etc/prometheus" + ] + } + + template { + data = file("../config/prometheus.yml") + destination = "secrets/prometheus.yml" + } + + template { + data = "{{ key \"secrets/consul/consul.crt\" }}" + destination = "secrets/consul.crt" + } + + template { + data = "{{ key \"secrets/consul/consul-client.crt\" }}" + destination = "secrets/consul-client.crt" + } + + template { + data = "{{ key \"secrets/consul/consul-client.key\" }}" + destination = "secrets/consul-client.key" + } + + resources { + memory = 500 + cpu = 500 + } + + service { + port = 9090 + address_mode = "driver" + name = "prometheus" + check { + type = "http" + path = "/" + port = 9090 + address_mode = "driver" + interval = "60s" + timeout = "5s" + check_restart { + limit = 3 + grace = "90s" + ignore_warnings = false + } + } + } + } + } + + group "grafana" { + count = 1 + + network { + port "grafana" { + static = 3719 + } + } + + task "restore-db" { + lifecycle { + hook = "prestart" + sidecar = false + } + + driver = "docker" + config { + image = "litestream/litestream:0.3.7" + args = [ + "restore", "-config", "/etc/litestream.yml", "/ephemeral/grafana.db" + ] + volumes = [ + "../alloc/data:/ephemeral", + "secrets/litestream.yml:/etc/litestream.yml" + ] + } + user = "472" + + template { + data = file("../config/grafana-litestream.yml") + destination = "secrets/litestream.yml" + } + + resources { + memory = 200 + cpu = 1000 + } + } + + task "grafana" { + driver = "docker" + config { + image = "grafana/grafana:8.4.3" + network_mode = "host" + ports = [ "grafana" ] + volumes = [ + "../alloc/data:/var/lib/grafana", + "secrets/prometheus.yaml:/etc/grafana/provisioning/datasources/prometheus.yaml" + ] + } + + template { + data = file("../config/grafana-datasource-prometheus.yaml") + destination = "secrets/prometheus.yaml" + } + + template { + data = <<EOH +GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel,grafana-worldmap-panel,grafana-polystat-panel +GF_SERVER_HTTP_PORT=3719 +EOH + destination = "secrets/env" + env = true + } + + resources { + memory = 500 + cpu = 100 + } + + service { + tags = [ + "grafana", + "tricot grafana-new.deuxfleurs.fr", + ] + port = 3719 + address_mode = "driver" + name = "grafana" + check { + type = "tcp" + port = 3719 + address_mode = "driver" + interval = "60s" + timeout = "5s" + check_restart { + limit = 3 + grace = "90s" + ignore_warnings = false + } + } + } + } + + task "replicate-db" { + driver = "docker" + config { + image = "litestream/litestream:0.3.7" + args = [ + "replicate", "-config", "/etc/litestream.yml" + ] + volumes = [ + "../alloc/data:/ephemeral", + "secrets/litestream.yml:/etc/litestream.yml" + ] + } + user = "472" + + template { + data = file("../config/grafana-litestream.yml") + destination = "secrets/litestream.yml" + } + + resources { + memory = 200 + cpu = 100 + } + } + } +} diff --git a/cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password b/cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password new file mode 100644 index 0000000..2f36e97 --- /dev/null +++ b/cluster/prod/app/telemetry/secrets/telemetry/grafana/admin_password @@ -0,0 +1 @@ +CMD openssl rand -base64 12 diff --git a/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key new file mode 100644 index 0000000..c7e41a4 --- /dev/null +++ b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_access_key @@ -0,0 +1 @@ +USER S3 access key for grafana db diff --git a/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key new file mode 100644 index 0000000..051f41a --- /dev/null +++ b/cluster/prod/app/telemetry/secrets/telemetry/grafana/s3_secret_key @@ -0,0 +1 @@ +USER S3 secret key for grafana db diff --git a/cluster/prod/garage/config/garage.toml b/cluster/prod/garage/config/garage.toml deleted file mode 100644 index a721886..0000000 --- a/cluster/prod/garage/config/garage.toml +++ /dev/null @@ -1,24 +0,0 @@ -block_size = 1048576 - -metadata_dir = "/meta" -data_dir = "/data" - -replication_mode = "3" - -rpc_bind_addr = "[::]:3901" -rpc_secret = "{{ key "secrets/garage/rpc_secret" | trimSpace }}" - -sled_cache_capacity = 536870912 -sled_sync_interval_ms = 10000 - -[s3_api] -s3_region = "garage" -api_bind_addr = "[::]:3900" -root_domain = ".garage.deuxfleurs.fr" - -[s3_web] -bind_addr = "[::]:3902" -root_domain = ".web.deuxfleurs.fr" - -[admin] -api_bind_addr = "[::1]:3903" diff --git a/cluster/prod/garage/deploy/garage.hcl b/cluster/prod/garage/deploy/garage.hcl deleted file mode 100644 index 665515a..0000000 --- a/cluster/prod/garage/deploy/garage.hcl +++ /dev/null @@ -1,131 +0,0 @@ -job "garage" { - datacenters = ["dc1", "saturne", "neptune"] - type = "system" - priority = 80 - - constraint { - attribute = "${attr.cpu.arch}" - value = "amd64" - } - - group "garage" { - network { - port "s3" { static = 3900 } - port "rpc" { static = 3901 } - port "web" { static = 3902 } - } - - update { - max_parallel = 1 - min_healthy_time = "30s" - healthy_deadline = "5m" - } - - task "server" { - driver = "docker" - config { - advertise_ipv6_address = true - image = "dxflrs/amd64_garage:v0.7.1" - command = "/garage" - args = [ "server" ] - network_mode = "host" - volumes = [ - "/mnt/storage/garage/data:/data", - "/mnt/ssd/garage/meta:/meta", - "secrets/garage.toml:/etc/garage.toml", - ] - logging { - type = "journald" - } - } - - template { - data = file("../config/garage.toml") - destination = "secrets/garage.toml" - } - - resources { - memory = 1500 - cpu = 1000 - } - - kill_signal = "SIGINT" - kill_timeout = "20s" - - service { - tags = [ - "garage_api", - "tricot garage.deuxfleurs.fr", - "tricot *.garage.deuxfleurs.fr", - ] - port = 3900 - address_mode = "driver" - name = "garage-api" - check { - type = "tcp" - port = 3900 - address_mode = "driver" - interval = "60s" - timeout = "5s" - check_restart { - limit = 3 - grace = "90s" - ignore_warnings = false - } - } - } - - service { - tags = ["garage-rpc"] - port = 3901 - address_mode = "driver" - name = "garage-rpc" - check { - type = "tcp" - port = 3901 - address_mode = "driver" - interval = "60s" - timeout = "5s" - check_restart { - limit = 3 - grace = "90s" - ignore_warnings = false - } - } - } - - service { - tags = [ - "garage-web", - "tricot * 1", - "tricot-add-header Content-Security-Policy default-src 'self' 'unsafe-inline'; script-src 'self' 'unsafe-inline' https://code.jquery.com/; frame-ancestors 'self'", - "tricot-add-header Strict-Transport-Security max-age=63072000; includeSubDomains; preload", - "tricot-add-header X-Frame-Options SAMEORIGIN", - "tricot-add-header X-XSS-Protection 1; mode=block", - ] - port = 3902 - address_mode = "driver" - name = "garage-web" - check { - type = "tcp" - port = 3902 - address_mode = "driver" - interval = "60s" - timeout = "5s" - check_restart { - limit = 3 - grace = "90s" - ignore_warnings = false - } - } - } - - restart { - interval = "30m" - attempts = 10 - delay = "15s" - mode = "delay" - } - } - } -} |