author | Alex Auvolat <alex@adnab.me> | 2022-09-20 17:13:46 +0200
---|---|---
committer | Alex Auvolat <alex@adnab.me> | 2022-09-20 17:13:46 +0200
commit | 56ff4c5cfdfc7fd84a10bd1d69418109e25c2560 (patch) |
tree | 2dced4fd3861b147e1f3b0f617ecad2cd627571a |
parent | 9b6bdc709253ba20d344bee711e5b7bf29bf03c7 (diff) |
download | nixcfg-56ff4c5cfdfc7fd84a10bd1d69418109e25c2560.tar.gz nixcfg-56ff4c5cfdfc7fd84a10bd1d69418109e25c2560.zip |
Prod-like telemetry into staging
14 files changed, 494 insertions, 206 deletions
diff --git a/cluster/staging/app/telemetry/config/grafana-datasource-prometheus.yaml b/cluster/staging/app/telemetry/config/grafana-datasource-prometheus.yaml
new file mode 100644
index 0000000..36b67e6
--- /dev/null
+++ b/cluster/staging/app/telemetry/config/grafana-datasource-prometheus.yaml
@@ -0,0 +1,7 @@
+apiVersion: 1
+
+datasources:
+  - name: DS_PROMETHEUS
+    type: prometheus
+    access: proxy
+    url: http://prometheus.service.staging.consul:9090
diff --git a/cluster/staging/app/telemetry/config/prometheus.yml b/cluster/staging/app/telemetry/config/prometheus.yml
new file mode 100644
index 0000000..e0e786d
--- /dev/null
+++ b/cluster/staging/app/telemetry/config/prometheus.yml
@@ -0,0 +1,30 @@
+global:
+  scrape_interval: 15s # By default, scrape targets every 15 seconds.
+
+scrape_configs:
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'node-exporter'
+    consul_sd_configs:
+      - server: 'https://localhost:8501'
+        services:
+          - 'node-exporter'
+        tls_config:
+          ca_file: /etc/prometheus/consul.crt
+          cert_file: /etc/prometheus/consul-client.crt
+          key_file: /etc/prometheus/consul-client.key
+
+  - job_name: 'garage'
+    authorization:
+      type: Bearer
+      credentials: {{ key "secrets/garage-staging/metrics_token" }}
+    consul_sd_configs:
+      - server: 'https://localhost:8501'
+        services:
+          - 'garage-staging-admin'
+        tls_config:
+          ca_file: /etc/prometheus/consul.crt
+          cert_file: /etc/prometheus/consul-client.crt
+          key_file: /etc/prometheus/consul-client.key
diff --git a/cluster/staging/app/telemetry/deploy/telemetry-system.hcl b/cluster/staging/app/telemetry/deploy/telemetry-system.hcl
index 3e26c2e..e2bad61 100644
--- a/cluster/staging/app/telemetry/deploy/telemetry-system.hcl
+++ b/cluster/staging/app/telemetry/deploy/telemetry-system.hcl
@@ -1,182 +1,49 @@
 job "telemetry-system" {
-  datacenters = ["neptune"]
-  type = "system"
+  datacenters = ["neptune"]
+  type = "system"
+  priority = "100"
 
-  group "elasticsearch" {
+  group "collector" {
     network {
-      port "elastic" {
-        static = 9200
-      }
-      port "elastic_internal" {
-        static = 9300
-      }
+      port "node_exporter" { static = 9100 }
     }
 
-    task "elastic" {
-      driver = "docker"
-      config {
-        image = "docker.elastic.co/elasticsearch/elasticsearch:8.2.0"
-        network_mode = "host"
-        volumes = [
-          "/mnt/ssd/telemetry/es_data:/usr/share/elasticsearch/data",
-          "secrets/elastic-certificates.p12:/usr/share/elasticsearch/config/elastic-certificates.p12",
-        ]
-        ports = [ "elastic", "elastic_internal" ]
-        sysctl = {
-          #"vm.max_map_count" = "262144",
-        }
-        ulimit = {
-          memlock = "9223372036854775807:9223372036854775807",
+    task "node_exporter" {
+      driver = "docker"
+
+      config {
+        image = "quay.io/prometheus/node-exporter:v1.1.2"
+        network_mode = "host"
+        volumes = [
+          "/:/host:ro,rslave"
+        ]
+        args = [ "--path.rootfs=/host" ]
+      }
+
+      resources {
+        cpu = 50
+        memory = 40
+      }
+
+      service {
+        tags = [ "telemetry" ]
+        port = 9100
+        address_mode = "driver"
+        name = "node-exporter"
+        check {
+          type = "http"
+          path = "/"
+          port = 9100
+          address_mode = "driver"
+          interval = "60s"
+          timeout = "5s"
+          check_restart {
+            limit = 3
+            grace = "90s"
+            ignore_warnings = false
+          }
         }
       }
-
-      user = "1000"
-
-      resources {
-        memory = 1500
-        cpu = 500
-      }
-
-      template {
-        data = "{{ key \"secrets/telemetry/elasticsearch/elastic-certificates.p12\" }}"
-        destination = "secrets/elastic-certificates.p12"
-      }
-
-      template {
-        data = <<EOH
-node.name={{ env "attr.unique.hostname" }}
-http.port=9200
-transport.port=9300
-cluster.name=es-deuxfleurs
-cluster.initial_master_nodes=carcajou,caribou,cariacou
-discovery.seed_hosts=carcajou,caribou,cariacou
-bootstrap.memory_lock=true
-xpack.security.enabled=true
-xpack.security.authc.api_key.enabled=true
-xpack.security.transport.ssl.enabled=true
-xpack.security.transport.ssl.verification_mode=certificate
-xpack.security.transport.ssl.client_authentication=required
-xpack.security.transport.ssl.keystore.path=/usr/share/elasticsearch/config/elastic-certificates.p12
-xpack.security.transport.ssl.truststore.path=/usr/share/elasticsearch/config/elastic-certificates.p12
-cluster.routing.allocation.disk.watermark.high=75%
-cluster.routing.allocation.disk.watermark.low=65%
-ES_JAVA_OPTS=-Xms512M -Xmx512M
-EOH
-        destination = "secrets/env"
-        env = true
-      }
-    }
-  }
-
-  group "collector" {
-    network {
-      port "otel_grpc" {
-        static = 4317
-      }
-      port "apm" {
-        static = 8200
-      }
-      port "node_exporter" {
-        static = 9100
-      }
-    }
-
-    task "otel" {
-      driver = "docker"
-      config {
-        image = "otel/opentelemetry-collector-contrib:0.46.0"
-        args = [
-          "--config=/etc/otel-config.yaml",
-        ]
-        network_mode = "host"
-        ports= [ "otel_grpc" ]
-        volumes = [
-          "secrets/otel-config.yaml:/etc/otel-config.yaml"
-        ]
-      }
-
-      template {
-        data = file("../config/otel-config.yaml")
-        destination = "secrets/otel-config.yaml"
-      }
-
-      resources {
-        memory = 100
-        cpu = 100
-      }
-    }
-
-    task "apm" {
-      driver = "docker"
-      config {
-        image = "docker.elastic.co/apm/apm-server:8.2.0"
-        network_mode = "host"
-        ports = [ "apm" ]
-        args = [ "--strict.perms=false" ]
-        volumes = [
-          "secrets/apm-config.yaml:/usr/share/apm-server/apm-server.yml:ro"
-        ]
-      }
-
-      template {
-        data = file("../config/apm-config.yaml")
-        destination = "secrets/apm-config.yaml"
-      }
-
-      resources {
-        memory = 100
-        cpu = 100
-      }
-    }
-
-/*
-    task "node_exporter" {
-      driver = "docker"
-      config {
-        image = "quay.io/prometheus/node-exporter:v1.1.2"
-        network_mode = "host"
-        ports = [ "node_exporter" ]
-        volumes = [
-          "/:/host:ro,rslave"
-        ]
-        args = [ "--path.rootfs=/host" ]
-      }
-
-      resources {
-        cpu = 50
-        memory = 40
-      }
-    }
-*/
-
-    task "filebeat" {
-      driver = "docker"
-      config {
-        image = "docker.elastic.co/beats/filebeat:8.2.0"
-        network_mode = "host"
-        volumes = [
-          "/mnt/ssd/telemetry/filebeat:/usr/share/filebeat/data",
-          "secrets/filebeat.yml:/usr/share/filebeat/filebeat.yml",
-          "/var/run/docker.sock:/var/run/docker.sock",
-          "/var/lib/docker/containers/:/var/lib/docker/containers/:ro",
-          "/var/log/:/var/log/:ro",
-        ]
-        args = [ "--strict.perms=false" ]
-        privileged = true
-      }
-      user = "root"
-
-
-      template {
-        data = file("../config/filebeat.yml")
-        destination = "secrets/filebeat.yml"
-      }
-
-      resources {
-        memory = 100
-        cpu = 100
-      }
-    }
-  }
+    }
+  }
 }
-
diff --git a/cluster/staging/app/telemetry/deploy/telemetry.hcl b/cluster/staging/app/telemetry/deploy/telemetry.hcl
index 21685a1..cfd26f3 100644
--- a/cluster/staging/app/telemetry/deploy/telemetry.hcl
+++ b/cluster/staging/app/telemetry/deploy/telemetry.hcl
@@ -2,51 +2,59 @@ job "telemetry" {
   datacenters = ["neptune"]
   type = "service"
 
-  group "kibana" {
+  group "prometheus" {
     count = 1
 
     network {
-      port "kibana" {
-        static = 5601
+      port "prometheus" {
+        static = 9090
       }
     }
 
-    task "kibana" {
+    task "prometheus" {
       driver = "docker"
       config {
-        image = "docker.elastic.co/kibana/kibana:8.2.0"
+        image = "prom/prometheus:v2.38.0"
         network_mode = "host"
-        ports = [ "kibana" ]
+        ports = [ "prometheus" ]
+        volumes = [
+          "secrets:/etc/prometheus"
+        ]
       }
 
       template {
-        data = <<EOH
-SERVER_NAME={{ env "attr.unique.hostname" }}
-ELASTICSEARCH_HOSTS=http://localhost:9200
-ELASTICSEARCH_USERNAME=kibana_system
-ELASTICSEARCH_PASSWORD={{ key "secrets/telemetry/elastic_passwords/kibana_system" }}
-SERVER_PUBLICBASEURL=https://kibana.home.adnab.me
-EOH
-        destination = "secrets/env"
-        env = true
+        data = file("../config/prometheus.yml")
+        destination = "secrets/prometheus.yml"
+      }
+
+      template {
+        data = "{{ key \"secrets/consul/consul.crt\" }}"
+        destination = "secrets/consul.crt"
+      }
+
+      template {
+        data = "{{ key \"secrets/consul/consul-client.crt\" }}"
+        destination = "secrets/consul-client.crt"
+      }
+
+      template {
+        data = "{{ key \"secrets/consul/consul-client.key\" }}"
+        destination = "secrets/consul-client.key"
       }
 
       resources {
-        memory = 1000
+        memory = 500
         cpu = 500
       }
 
       service {
-        tags = [
-          "kibana",
-          "tricot kibana.staging.deuxfleurs.org",
-        ]
-        port = 5601
+        port = 9090
         address_mode = "driver"
-        name = "kibana"
+        name = "prometheus"
         check {
-          type = "tcp"
-          port = 5601
+          type = "http"
+          path = "/"
+          port = 9090
           address_mode = "driver"
           interval = "60s"
           timeout = "5s"
@@ -59,13 +67,13 @@ EOH
       }
     }
   }
-  
+
   group "grafana" {
     count = 1
 
     network {
       port "grafana" {
-        static = 3333
+        static = 3719
       }
     }
 
@@ -107,19 +115,19 @@ EOH
         ports = [ "grafana" ]
         volumes = [
           "../alloc/data:/var/lib/grafana",
-          "secrets/elastic.yaml:/etc/grafana/provisioning/datasources/elastic.yaml"
+          "secrets/prometheus.yaml:/etc/grafana/provisioning/datasources/prometheus.yaml"
         ]
       }
 
       template {
-        data = file("../config/grafana/provisioning/datasources/elastic.yaml")
-        destination = "secrets/elastic.yaml"
+        data = file("../config/grafana-datasource-prometheus.yaml")
+        destination = "secrets/prometheus.yaml"
       }
 
       template {
         data = <<EOH
 GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel,grafana-worldmap-panel,grafana-polystat-panel
-GF_SERVER_HTTP_PORT=3333
+GF_SERVER_HTTP_PORT=3719
 EOH
         destination = "secrets/env"
         env = true
@@ -135,12 +143,12 @@ EOH
           "grafana",
           "tricot grafana.staging.deuxfleurs.org",
         ]
-        port = 3333
+        port = 3719
         address_mode = "driver"
         name = "grafana"
         check {
           type = "tcp"
-          port = 3333
+          port = 3719
           address_mode = "driver"
           interval = "60s"
           timeout = "5s"
diff --git a/cluster/staging/app/telemetry/secrets/telemetry/grafana/admin_password b/cluster/staging/app/telemetry/secrets/telemetry/grafana/admin_password
new file mode 100644
index 0000000..2f36e97
--- /dev/null
+++ b/cluster/staging/app/telemetry/secrets/telemetry/grafana/admin_password
@@ -0,0 +1 @@
+CMD openssl rand -base64 12
diff --git a/cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_access_key b/cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_access_key
new file mode 100644
index 0000000..c7e41a4
--- /dev/null
+++ b/cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_access_key
@@ -0,0 +1 @@
+USER S3 access key for grafana db
diff --git a/cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_secret_key b/cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_secret_key
new file mode 100644
index 0000000..051f41a
--- /dev/null
+++ b/cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_secret_key
@@ -0,0 +1 @@
+USER S3 secret key for grafana db
diff --git a/cluster/staging/app/telemetry/config/apm-config.yaml b/experimental/bad.telemetry-elastic/config/apm-config.yaml
index 07a88bd..07a88bd 100644
--- a/cluster/staging/app/telemetry/config/apm-config.yaml
+++ b/experimental/bad.telemetry-elastic/config/apm-config.yaml
diff --git a/cluster/staging/app/telemetry/config/filebeat.yml b/experimental/bad.telemetry-elastic/config/filebeat.yml
index 310afd1..310afd1 100644
--- a/cluster/staging/app/telemetry/config/filebeat.yml
+++ b/experimental/bad.telemetry-elastic/config/filebeat.yml
diff --git a/experimental/bad.telemetry-elastic/config/grafana-litestream.yml b/experimental/bad.telemetry-elastic/config/grafana-litestream.yml
new file mode 100644
index 0000000..a537d9c
--- /dev/null
+++ b/experimental/bad.telemetry-elastic/config/grafana-litestream.yml
@@ -0,0 +1,10 @@
+dbs:
+  - path: /ephemeral/grafana.db
+    replicas:
+      - url: s3://grafana-db/grafana.db
+        region: garage-staging
+        endpoint: http://{{ env "attr.unique.network.ip-address" }}:3990
+        access-key-id: {{ key "secrets/telemetry/grafana/s3_access_key" | trimSpace }}
+        secret-access-key: {{ key "secrets/telemetry/grafana/s3_secret_key" | trimSpace }}
+        force-path-style: true
+        sync-interval: 60s
diff --git a/cluster/staging/app/telemetry/config/grafana/provisioning/datasources/elastic.yaml b/experimental/bad.telemetry-elastic/config/grafana/provisioning/datasources/elastic.yaml
index 7d2277c..7d2277c 100644
--- a/cluster/staging/app/telemetry/config/grafana/provisioning/datasources/elastic.yaml
+++ b/experimental/bad.telemetry-elastic/config/grafana/provisioning/datasources/elastic.yaml
diff --git a/cluster/staging/app/telemetry/config/otel-config.yaml b/experimental/bad.telemetry-elastic/config/otel-config.yaml
index bcf1baa..bcf1baa 100644
--- a/cluster/staging/app/telemetry/config/otel-config.yaml
+++ b/experimental/bad.telemetry-elastic/config/otel-config.yaml
diff --git a/experimental/bad.telemetry-elastic/deploy/telemetry-system.hcl b/experimental/bad.telemetry-elastic/deploy/telemetry-system.hcl
new file mode 100644
index 0000000..3e26c2e
--- /dev/null
+++ b/experimental/bad.telemetry-elastic/deploy/telemetry-system.hcl
@@ -0,0 +1,182 @@
+job "telemetry-system" {
+  datacenters = ["neptune"]
+  type = "system"
+
+  group "elasticsearch" {
+    network {
+      port "elastic" {
+        static = 9200
+      }
+      port "elastic_internal" {
+        static = 9300
+      }
+    }
+
+    task "elastic" {
+      driver = "docker"
+      config {
+        image = "docker.elastic.co/elasticsearch/elasticsearch:8.2.0"
+        network_mode = "host"
+        volumes = [
+          "/mnt/ssd/telemetry/es_data:/usr/share/elasticsearch/data",
+          "secrets/elastic-certificates.p12:/usr/share/elasticsearch/config/elastic-certificates.p12",
+        ]
+        ports = [ "elastic", "elastic_internal" ]
+        sysctl = {
+          #"vm.max_map_count" = "262144",
+        }
+        ulimit = {
+          memlock = "9223372036854775807:9223372036854775807",
+        }
+      }
+
+      user = "1000"
+
+      resources {
+        memory = 1500
+        cpu = 500
+      }
+
+      template {
+        data = "{{ key \"secrets/telemetry/elasticsearch/elastic-certificates.p12\" }}"
+        destination = "secrets/elastic-certificates.p12"
+      }
+
+      template {
+        data = <<EOH
+node.name={{ env "attr.unique.hostname" }}
+http.port=9200
+transport.port=9300
+cluster.name=es-deuxfleurs
+cluster.initial_master_nodes=carcajou,caribou,cariacou
+discovery.seed_hosts=carcajou,caribou,cariacou
+bootstrap.memory_lock=true
+xpack.security.enabled=true
+xpack.security.authc.api_key.enabled=true
+xpack.security.transport.ssl.enabled=true
+xpack.security.transport.ssl.verification_mode=certificate
+xpack.security.transport.ssl.client_authentication=required
+xpack.security.transport.ssl.keystore.path=/usr/share/elasticsearch/config/elastic-certificates.p12
+xpack.security.transport.ssl.truststore.path=/usr/share/elasticsearch/config/elastic-certificates.p12
+cluster.routing.allocation.disk.watermark.high=75%
+cluster.routing.allocation.disk.watermark.low=65%
+ES_JAVA_OPTS=-Xms512M -Xmx512M
+EOH
+        destination = "secrets/env"
+        env = true
+      }
+    }
+  }
+
+  group "collector" {
+    network {
+      port "otel_grpc" {
+        static = 4317
+      }
+      port "apm" {
+        static = 8200
+      }
+      port "node_exporter" {
+        static = 9100
+      }
+    }
+
+    task "otel" {
+      driver = "docker"
+      config {
+        image = "otel/opentelemetry-collector-contrib:0.46.0"
+        args = [
+          "--config=/etc/otel-config.yaml",
+        ]
+        network_mode = "host"
+        ports= [ "otel_grpc" ]
+        volumes = [
+          "secrets/otel-config.yaml:/etc/otel-config.yaml"
+        ]
+      }
+
+      template {
+        data = file("../config/otel-config.yaml")
+        destination = "secrets/otel-config.yaml"
+      }
+
+      resources {
+        memory = 100
+        cpu = 100
+      }
+    }
+
+    task "apm" {
+      driver = "docker"
+      config {
+        image = "docker.elastic.co/apm/apm-server:8.2.0"
+        network_mode = "host"
+        ports = [ "apm" ]
+        args = [ "--strict.perms=false" ]
+        volumes = [
+          "secrets/apm-config.yaml:/usr/share/apm-server/apm-server.yml:ro"
+        ]
+      }
+
+      template {
+        data = file("../config/apm-config.yaml")
+        destination = "secrets/apm-config.yaml"
+      }
+
+      resources {
+        memory = 100
+        cpu = 100
+      }
+    }
+
+/*
+    task "node_exporter" {
+      driver = "docker"
+      config {
+        image = "quay.io/prometheus/node-exporter:v1.1.2"
+        network_mode = "host"
+        ports = [ "node_exporter" ]
+        volumes = [
+          "/:/host:ro,rslave"
+        ]
+        args = [ "--path.rootfs=/host" ]
+      }
+
+      resources {
+        cpu = 50
+        memory = 40
+      }
+    }
+*/
+
+    task "filebeat" {
+      driver = "docker"
+      config {
+        image = "docker.elastic.co/beats/filebeat:8.2.0"
+        network_mode = "host"
+        volumes = [
+          "/mnt/ssd/telemetry/filebeat:/usr/share/filebeat/data",
+          "secrets/filebeat.yml:/usr/share/filebeat/filebeat.yml",
+          "/var/run/docker.sock:/var/run/docker.sock",
+          "/var/lib/docker/containers/:/var/lib/docker/containers/:ro",
+          "/var/log/:/var/log/:ro",
+        ]
+        args = [ "--strict.perms=false" ]
+        privileged = true
+      }
+      user = "root"
+
+
+      template {
+        data = file("../config/filebeat.yml")
+        destination = "secrets/filebeat.yml"
+      }
+
+      resources {
+        memory = 100
+        cpu = 100
+      }
+    }
+  }
+}
+
diff --git a/experimental/bad.telemetry-elastic/deploy/telemetry.hcl b/experimental/bad.telemetry-elastic/deploy/telemetry.hcl
new file mode 100644
index 0000000..21685a1
--- /dev/null
+++ b/experimental/bad.telemetry-elastic/deploy/telemetry.hcl
@@ -0,0 +1,181 @@
+job "telemetry" {
+  datacenters = ["neptune"]
+  type = "service"
+
+  group "kibana" {
+    count = 1
+
+    network {
+      port "kibana" {
+        static = 5601
+      }
+    }
+
+    task "kibana" {
+      driver = "docker"
+      config {
+        image = "docker.elastic.co/kibana/kibana:8.2.0"
+        network_mode = "host"
+        ports = [ "kibana" ]
+      }
+
+      template {
+        data = <<EOH
+SERVER_NAME={{ env "attr.unique.hostname" }}
+ELASTICSEARCH_HOSTS=http://localhost:9200
+ELASTICSEARCH_USERNAME=kibana_system
+ELASTICSEARCH_PASSWORD={{ key "secrets/telemetry/elastic_passwords/kibana_system" }}
+SERVER_PUBLICBASEURL=https://kibana.home.adnab.me
+EOH
+        destination = "secrets/env"
+        env = true
+      }
+
+      resources {
+        memory = 1000
+        cpu = 500
+      }
+
+      service {
+        tags = [
+          "kibana",
+          "tricot kibana.staging.deuxfleurs.org",
+        ]
+        port = 5601
+        address_mode = "driver"
+        name = "kibana"
+        check {
+          type = "tcp"
+          port = 5601
+          address_mode = "driver"
+          interval = "60s"
+          timeout = "5s"
+          check_restart {
+            limit = 3
+            grace = "90s"
+            ignore_warnings = false
+          }
+        }
+      }
+    }
+  }
+
+  group "grafana" {
+    count = 1
+
+    network {
+      port "grafana" {
+        static = 3333
+      }
+    }
+
+    task "restore-db" {
+      lifecycle {
+        hook = "prestart"
+        sidecar = false
+      }
+
+      driver = "docker"
+      config {
+        image = "litestream/litestream:0.3.7"
+        args = [
+          "restore", "-config", "/etc/litestream.yml", "/ephemeral/grafana.db"
+        ]
+        volumes = [
+          "../alloc/data:/ephemeral",
+          "secrets/litestream.yml:/etc/litestream.yml"
+        ]
+      }
+      user = "472"
+
+      template {
+        data = file("../config/grafana-litestream.yml")
+        destination = "secrets/litestream.yml"
+      }
+
+      resources {
+        memory = 200
+        cpu = 1000
+      }
+    }
+
+    task "grafana" {
+      driver = "docker"
+      config {
+        image = "grafana/grafana:8.4.3"
+        network_mode = "host"
+        ports = [ "grafana" ]
+        volumes = [
+          "../alloc/data:/var/lib/grafana",
+          "secrets/elastic.yaml:/etc/grafana/provisioning/datasources/elastic.yaml"
+        ]
+      }
+
+      template {
+        data = file("../config/grafana/provisioning/datasources/elastic.yaml")
+        destination = "secrets/elastic.yaml"
+      }
+
+      template {
+        data = <<EOH
+GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel,grafana-worldmap-panel,grafana-polystat-panel
+GF_SERVER_HTTP_PORT=3333
+EOH
+        destination = "secrets/env"
+        env = true
+      }
+
+      resources {
+        memory = 500
+        cpu = 100
+      }
+
+      service {
+        tags = [
+          "grafana",
+          "tricot grafana.staging.deuxfleurs.org",
+        ]
+        port = 3333
+        address_mode = "driver"
+        name = "grafana"
+        check {
+          type = "tcp"
+          port = 3333
+          address_mode = "driver"
+          interval = "60s"
+          timeout = "5s"
+          check_restart {
+            limit = 3
+            grace = "90s"
+            ignore_warnings = false
+          }
+        }
+      }
+    }
+
+    task "replicate-db" {
+      driver = "docker"
+      config {
+        image = "litestream/litestream:0.3.7"
+        args = [
+          "replicate", "-config", "/etc/litestream.yml"
+        ]
+        volumes = [
+          "../alloc/data:/ephemeral",
+          "secrets/litestream.yml:/etc/litestream.yml"
+        ]
+      }
+      user = "472"
+
+      template {
+        data = file("../config/grafana-litestream.yml")
+        destination = "secrets/litestream.yml"
+      }
+
+      resources {
+        memory = 200
+        cpu = 100
+      }
+    }
+  }
+}
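Not part of the commit itself, but a quick way to sanity-check the result once these jobs are scheduled: the hypothetical sketch below asks the Prometheus HTTP API which scrape targets it discovered through Consul. The Prometheus address is taken from the grafana-datasource-prometheus.yaml added above; everything else is an assumption to adapt to your environment.

```python
# Hypothetical smoke test (not part of this commit): list the scrape targets
# Prometheus discovered via Consul service discovery and their current health.
# Assumes the Prometheus API is reachable at the address provisioned for
# Grafana in grafana-datasource-prometheus.yaml.
import json
import urllib.request

PROMETHEUS_URL = "http://prometheus.service.staging.consul:9090"

with urllib.request.urlopen(f"{PROMETHEUS_URL}/api/v1/targets") as resp:
    active = json.load(resp)["data"]["activeTargets"]

for target in active:
    # Each active target carries the scrape job name, the resolved scrape URL
    # and its health as reported by Prometheus.
    print(target["labels"]["job"], target["scrapeUrl"], target["health"])
```

If the Consul client certificates and the Garage metrics token render correctly, the 'prometheus', 'node-exporter' and 'garage' jobs from config/prometheus.yml should all report "up".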