From 56ff4c5cfdfc7fd84a10bd1d69418109e25c2560 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 20 Sep 2022 17:13:46 +0200 Subject: Prod-like telemetry into staging --- .../staging/app/telemetry/config/apm-config.yaml | 20 -- cluster/staging/app/telemetry/config/filebeat.yml | 46 ----- .../config/grafana-datasource-prometheus.yaml | 7 + .../grafana/provisioning/datasources/elastic.yaml | 21 -- .../staging/app/telemetry/config/otel-config.yaml | 56 ------ .../staging/app/telemetry/config/prometheus.yml | 30 +++ .../app/telemetry/deploy/telemetry-system.hcl | 215 ++++----------------- cluster/staging/app/telemetry/deploy/telemetry.hcl | 72 ++++--- .../secrets/telemetry/grafana/admin_password | 1 + .../secrets/telemetry/grafana/s3_access_key | 1 + .../secrets/telemetry/grafana/s3_secret_key | 1 + 11 files changed, 121 insertions(+), 349 deletions(-) delete mode 100644 cluster/staging/app/telemetry/config/apm-config.yaml delete mode 100644 cluster/staging/app/telemetry/config/filebeat.yml create mode 100644 cluster/staging/app/telemetry/config/grafana-datasource-prometheus.yaml delete mode 100644 cluster/staging/app/telemetry/config/grafana/provisioning/datasources/elastic.yaml delete mode 100644 cluster/staging/app/telemetry/config/otel-config.yaml create mode 100644 cluster/staging/app/telemetry/config/prometheus.yml create mode 100644 cluster/staging/app/telemetry/secrets/telemetry/grafana/admin_password create mode 100644 cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_access_key create mode 100644 cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_secret_key (limited to 'cluster/staging/app') diff --git a/cluster/staging/app/telemetry/config/apm-config.yaml b/cluster/staging/app/telemetry/config/apm-config.yaml deleted file mode 100644 index 07a88bd..0000000 --- a/cluster/staging/app/telemetry/config/apm-config.yaml +++ /dev/null @@ -1,20 +0,0 @@ -apm-server: - # Defines the host and port the server is listening on. Use "unix:/path/to.sock" to listen on a unix domain socket. - host: "0.0.0.0:8200" -#-------------------------- Elasticsearch output -------------------------- -output.elasticsearch: - # Array of hosts to connect to. - # Scheme and port can be left out and will be set to the default (`http` and `9200`). - # In case you specify and additional path, the scheme is required: `http://localhost:9200/path`. - # IPv6 addresses should always be defined as: `https://[2001:db8::1]:9200`. - hosts: ["localhost:9200"] - username: "elastic" - password: "{{ key "secrets/telemetry/elastic_passwords/elastic" }}" - -instrumentation: - enabled: true - environment: staging - -logging: - level: warning - to_stderr: true diff --git a/cluster/staging/app/telemetry/config/filebeat.yml b/cluster/staging/app/telemetry/config/filebeat.yml deleted file mode 100644 index 310afd1..0000000 --- a/cluster/staging/app/telemetry/config/filebeat.yml +++ /dev/null @@ -1,46 +0,0 @@ -# see https://github.com/elastic/beats/blob/master/filebeat/filebeat.reference.yml -filebeat.modules: -- module: system - syslog: - enabled: true - auth: - enabled: true - -#filebeat.inputs: -#- type: container -# enabled: true -# paths: -# -/var/lib/docker/containers/*/*.log -# stream: all # can be all, stdout or stderr - -#========================== Filebeat autodiscover ============================== -filebeat.autodiscover: - providers: - - type: docker - # https://www.elastic.co/guide/en/beats/filebeat/current/configuration-autodiscover-hints.html - # This URL alos contains instructions on multi-line logs - hints.enabled: true - -#================================ Processors =================================== -processors: -# - add_cloud_metadata: ~ -- add_docker_metadata: ~ -- add_locale: - format: offset -- add_host_metadata: - netinfo.enabled: true - -#========================== Elasticsearch output =============================== -output.elasticsearch: - hosts: ["localhost:9200"] - username: elastic - password: {{ key "secrets/telemetry/elastic_passwords/elastic" }} - -#============================== Dashboards ===================================== -setup.dashboards: - enabled: false - -#============================== Xpack Monitoring =============================== -xpack.monitoring: - enabled: true - elasticsearch: diff --git a/cluster/staging/app/telemetry/config/grafana-datasource-prometheus.yaml b/cluster/staging/app/telemetry/config/grafana-datasource-prometheus.yaml new file mode 100644 index 0000000..36b67e6 --- /dev/null +++ b/cluster/staging/app/telemetry/config/grafana-datasource-prometheus.yaml @@ -0,0 +1,7 @@ +apiVersion: 1 + +datasources: + - name: DS_PROMETHEUS + type: prometheus + access: proxy + url: http://prometheus.service.staging.consul:9090 diff --git a/cluster/staging/app/telemetry/config/grafana/provisioning/datasources/elastic.yaml b/cluster/staging/app/telemetry/config/grafana/provisioning/datasources/elastic.yaml deleted file mode 100644 index 7d2277c..0000000 --- a/cluster/staging/app/telemetry/config/grafana/provisioning/datasources/elastic.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: 1 - -datasources: - - name: DS_ELASTICSEARCH - type: elasticsearch - access: proxy - url: http://localhost:9200 - password: '{{ key "secrets/telemetry/elastic_passwords/elastic" }}' - user: 'elastic' - database: metrics-* - basicAuth: false - isDefault: true - jsonData: - esVersion: "8.2.0" - includeFrozen: false - logLevelField: '' - logMessageField: '' - maxConcurrentShardRequests: 5 - timeField: "@timestamp" - timeInterval: "5s" - readOnly: false diff --git a/cluster/staging/app/telemetry/config/otel-config.yaml b/cluster/staging/app/telemetry/config/otel-config.yaml deleted file mode 100644 index bcf1baa..0000000 --- a/cluster/staging/app/telemetry/config/otel-config.yaml +++ /dev/null @@ -1,56 +0,0 @@ -receivers: - # Data sources: metrics, traces - otlp: - protocols: - grpc: - endpoint: ":4317" - http: - endpoint: ":55681" - # Data sources: metrics - prometheus: - config: - scrape_configs: - - job_name: "garage" - scrape_interval: 5s - static_configs: - - targets: - - "{{ env "attr.unique.network.ip-address" }}:3909" - - job_name: "node_exporter" - scrape_interval: 5s - static_configs: - - targets: - - "{{ env "attr.unique.network.ip-address" }}:9100" - -exporters: - logging: - logLevel: info - # see https://www.elastic.co/guide/en/apm/get-started/current/open-telemetry-elastic.html#open-telemetry-collector - otlp/elastic: - endpoint: "localhost:8200" - tls: - insecure: true - -processors: - batch: - probabilistic_sampler: - hash_seed: 42 - sampling_percentage: 10 - -extensions: - health_check: - pprof: - endpoint: :1888 - zpages: - endpoint: :55679 - -service: - extensions: [pprof, zpages, health_check] - pipelines: - traces: - receivers: [otlp] - processors: [probabilistic_sampler, batch] - exporters: [logging, otlp/elastic] - metrics: - receivers: [otlp, prometheus] - processors: [batch] - exporters: [logging, otlp/elastic] diff --git a/cluster/staging/app/telemetry/config/prometheus.yml b/cluster/staging/app/telemetry/config/prometheus.yml new file mode 100644 index 0000000..e0e786d --- /dev/null +++ b/cluster/staging/app/telemetry/config/prometheus.yml @@ -0,0 +1,30 @@ +global: + scrape_interval: 15s # By default, scrape targets every 15 seconds. + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'node-exporter' + consul_sd_configs: + - server: 'https://localhost:8501' + services: + - 'node-exporter' + tls_config: + ca_file: /etc/prometheus/consul.crt + cert_file: /etc/prometheus/consul-client.crt + key_file: /etc/prometheus/consul-client.key + + - job_name: 'garage' + authorization: + type: Bearer + credentials: {{ key "secrets/garage-staging/metrics_token" }} + consul_sd_configs: + - server: 'https://localhost:8501' + services: + - 'garage-staging-admin' + tls_config: + ca_file: /etc/prometheus/consul.crt + cert_file: /etc/prometheus/consul-client.crt + key_file: /etc/prometheus/consul-client.key diff --git a/cluster/staging/app/telemetry/deploy/telemetry-system.hcl b/cluster/staging/app/telemetry/deploy/telemetry-system.hcl index 3e26c2e..e2bad61 100644 --- a/cluster/staging/app/telemetry/deploy/telemetry-system.hcl +++ b/cluster/staging/app/telemetry/deploy/telemetry-system.hcl @@ -1,182 +1,49 @@ job "telemetry-system" { - datacenters = ["neptune"] - type = "system" + datacenters = ["neptune"] + type = "system" + priority = "100" - group "elasticsearch" { + group "collector" { network { - port "elastic" { - static = 9200 - } - port "elastic_internal" { - static = 9300 - } + port "node_exporter" { static = 9100 } } - task "elastic" { - driver = "docker" - config { - image = "docker.elastic.co/elasticsearch/elasticsearch:8.2.0" - network_mode = "host" - volumes = [ - "/mnt/ssd/telemetry/es_data:/usr/share/elasticsearch/data", - "secrets/elastic-certificates.p12:/usr/share/elasticsearch/config/elastic-certificates.p12", - ] - ports = [ "elastic", "elastic_internal" ] - sysctl = { - #"vm.max_map_count" = "262144", - } - ulimit = { - memlock = "9223372036854775807:9223372036854775807", + task "node_exporter" { + driver = "docker" + + config { + image = "quay.io/prometheus/node-exporter:v1.1.2" + network_mode = "host" + volumes = [ + "/:/host:ro,rslave" + ] + args = [ "--path.rootfs=/host" ] + } + + resources { + cpu = 50 + memory = 40 + } + + service { + tags = [ "telemetry" ] + port = 9100 + address_mode = "driver" + name = "node-exporter" + check { + type = "http" + path = "/" + port = 9100 + address_mode = "driver" + interval = "60s" + timeout = "5s" + check_restart { + limit = 3 + grace = "90s" + ignore_warnings = false + } } } - - user = "1000" - - resources { - memory = 1500 - cpu = 500 - } - - template { - data = "{{ key \"secrets/telemetry/elasticsearch/elastic-certificates.p12\" }}" - destination = "secrets/elastic-certificates.p12" - } - - template { - data = <