aboutsummaryrefslogtreecommitdiff
path: root/cluster
diff options
context:
space:
mode:
authorAlex Auvolat <alex@adnab.me>2022-09-20 17:13:46 +0200
committerAlex Auvolat <alex@adnab.me>2022-09-20 17:13:46 +0200
commit56ff4c5cfdfc7fd84a10bd1d69418109e25c2560 (patch)
tree2dced4fd3861b147e1f3b0f617ecad2cd627571a /cluster
parent9b6bdc709253ba20d344bee711e5b7bf29bf03c7 (diff)
downloadnixcfg-56ff4c5cfdfc7fd84a10bd1d69418109e25c2560.tar.gz
nixcfg-56ff4c5cfdfc7fd84a10bd1d69418109e25c2560.zip
Prod-like telemetry into staging
Diffstat (limited to 'cluster')
-rw-r--r--cluster/staging/app/telemetry/config/apm-config.yaml20
-rw-r--r--cluster/staging/app/telemetry/config/filebeat.yml46
-rw-r--r--cluster/staging/app/telemetry/config/grafana-datasource-prometheus.yaml7
-rw-r--r--cluster/staging/app/telemetry/config/grafana/provisioning/datasources/elastic.yaml21
-rw-r--r--cluster/staging/app/telemetry/config/otel-config.yaml56
-rw-r--r--cluster/staging/app/telemetry/config/prometheus.yml30
-rw-r--r--cluster/staging/app/telemetry/deploy/telemetry-system.hcl215
-rw-r--r--cluster/staging/app/telemetry/deploy/telemetry.hcl72
-rw-r--r--cluster/staging/app/telemetry/secrets/telemetry/grafana/admin_password1
-rw-r--r--cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_access_key1
-rw-r--r--cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_secret_key1
11 files changed, 121 insertions, 349 deletions
diff --git a/cluster/staging/app/telemetry/config/apm-config.yaml b/cluster/staging/app/telemetry/config/apm-config.yaml
deleted file mode 100644
index 07a88bd..0000000
--- a/cluster/staging/app/telemetry/config/apm-config.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-apm-server:
- # Defines the host and port the server is listening on. Use "unix:/path/to.sock" to listen on a unix domain socket.
- host: "0.0.0.0:8200"
-#-------------------------- Elasticsearch output --------------------------
-output.elasticsearch:
- # Array of hosts to connect to.
- # Scheme and port can be left out and will be set to the default (`http` and `9200`).
- # In case you specify and additional path, the scheme is required: `http://localhost:9200/path`.
- # IPv6 addresses should always be defined as: `https://[2001:db8::1]:9200`.
- hosts: ["localhost:9200"]
- username: "elastic"
- password: "{{ key "secrets/telemetry/elastic_passwords/elastic" }}"
-
-instrumentation:
- enabled: true
- environment: staging
-
-logging:
- level: warning
- to_stderr: true
diff --git a/cluster/staging/app/telemetry/config/filebeat.yml b/cluster/staging/app/telemetry/config/filebeat.yml
deleted file mode 100644
index 310afd1..0000000
--- a/cluster/staging/app/telemetry/config/filebeat.yml
+++ /dev/null
@@ -1,46 +0,0 @@
-# see https://github.com/elastic/beats/blob/master/filebeat/filebeat.reference.yml
-filebeat.modules:
-- module: system
- syslog:
- enabled: true
- auth:
- enabled: true
-
-#filebeat.inputs:
-#- type: container
-# enabled: true
-# paths:
-# -/var/lib/docker/containers/*/*.log
-# stream: all # can be all, stdout or stderr
-
-#========================== Filebeat autodiscover ==============================
-filebeat.autodiscover:
- providers:
- - type: docker
- # https://www.elastic.co/guide/en/beats/filebeat/current/configuration-autodiscover-hints.html
- # This URL alos contains instructions on multi-line logs
- hints.enabled: true
-
-#================================ Processors ===================================
-processors:
-# - add_cloud_metadata: ~
-- add_docker_metadata: ~
-- add_locale:
- format: offset
-- add_host_metadata:
- netinfo.enabled: true
-
-#========================== Elasticsearch output ===============================
-output.elasticsearch:
- hosts: ["localhost:9200"]
- username: elastic
- password: {{ key "secrets/telemetry/elastic_passwords/elastic" }}
-
-#============================== Dashboards =====================================
-setup.dashboards:
- enabled: false
-
-#============================== Xpack Monitoring ===============================
-xpack.monitoring:
- enabled: true
- elasticsearch:
diff --git a/cluster/staging/app/telemetry/config/grafana-datasource-prometheus.yaml b/cluster/staging/app/telemetry/config/grafana-datasource-prometheus.yaml
new file mode 100644
index 0000000..36b67e6
--- /dev/null
+++ b/cluster/staging/app/telemetry/config/grafana-datasource-prometheus.yaml
@@ -0,0 +1,7 @@
+apiVersion: 1  # Grafana datasource provisioning file; deploy/telemetry.hcl mounts it at /etc/grafana/provisioning/datasources/prometheus.yaml
+
+datasources:
+ - name: DS_PROMETHEUS
+ type: prometheus
+ access: proxy
+ url: http://prometheus.service.staging.consul:9090  # the "prometheus" service and its static port 9090 are declared in deploy/telemetry.hcl
diff --git a/cluster/staging/app/telemetry/config/grafana/provisioning/datasources/elastic.yaml b/cluster/staging/app/telemetry/config/grafana/provisioning/datasources/elastic.yaml
deleted file mode 100644
index 7d2277c..0000000
--- a/cluster/staging/app/telemetry/config/grafana/provisioning/datasources/elastic.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
-apiVersion: 1
-
-datasources:
- - name: DS_ELASTICSEARCH
- type: elasticsearch
- access: proxy
- url: http://localhost:9200
- password: '{{ key "secrets/telemetry/elastic_passwords/elastic" }}'
- user: 'elastic'
- database: metrics-*
- basicAuth: false
- isDefault: true
- jsonData:
- esVersion: "8.2.0"
- includeFrozen: false
- logLevelField: ''
- logMessageField: ''
- maxConcurrentShardRequests: 5
- timeField: "@timestamp"
- timeInterval: "5s"
- readOnly: false
diff --git a/cluster/staging/app/telemetry/config/otel-config.yaml b/cluster/staging/app/telemetry/config/otel-config.yaml
deleted file mode 100644
index bcf1baa..0000000
--- a/cluster/staging/app/telemetry/config/otel-config.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-receivers:
- # Data sources: metrics, traces
- otlp:
- protocols:
- grpc:
- endpoint: ":4317"
- http:
- endpoint: ":55681"
- # Data sources: metrics
- prometheus:
- config:
- scrape_configs:
- - job_name: "garage"
- scrape_interval: 5s
- static_configs:
- - targets:
- - "{{ env "attr.unique.network.ip-address" }}:3909"
- - job_name: "node_exporter"
- scrape_interval: 5s
- static_configs:
- - targets:
- - "{{ env "attr.unique.network.ip-address" }}:9100"
-
-exporters:
- logging:
- logLevel: info
- # see https://www.elastic.co/guide/en/apm/get-started/current/open-telemetry-elastic.html#open-telemetry-collector
- otlp/elastic:
- endpoint: "localhost:8200"
- tls:
- insecure: true
-
-processors:
- batch:
- probabilistic_sampler:
- hash_seed: 42
- sampling_percentage: 10
-
-extensions:
- health_check:
- pprof:
- endpoint: :1888
- zpages:
- endpoint: :55679
-
-service:
- extensions: [pprof, zpages, health_check]
- pipelines:
- traces:
- receivers: [otlp]
- processors: [probabilistic_sampler, batch]
- exporters: [logging, otlp/elastic]
- metrics:
- receivers: [otlp, prometheus]
- processors: [batch]
- exporters: [logging, otlp/elastic]
diff --git a/cluster/staging/app/telemetry/config/prometheus.yml b/cluster/staging/app/telemetry/config/prometheus.yml
new file mode 100644
index 0000000..e0e786d
--- /dev/null
+++ b/cluster/staging/app/telemetry/config/prometheus.yml
@@ -0,0 +1,30 @@
+global:
+ scrape_interval: 15s # By default, scrape targets every 15 seconds.
+
+scrape_configs:
+ - job_name: 'prometheus'  # self-scrape; 9090 is the static port declared in deploy/telemetry.hcl
+ static_configs:
+ - targets: ['localhost:9090']
+
+ - job_name: 'node-exporter'  # targets are discovered from the Consul service listed under `services`
+ consul_sd_configs:
+ - server: 'https://localhost:8501'
+ services:
+ - 'node-exporter'
+ tls_config:  # these files are rendered by the templates in deploy/telemetry.hcl, which mounts secrets/ at /etc/prometheus
+ ca_file: /etc/prometheus/consul.crt
+ cert_file: /etc/prometheus/consul-client.crt
+ key_file: /etc/prometheus/consul-client.key
+
+ - job_name: 'garage'  # same Consul discovery as above, plus a bearer token read from Consul KV
+ authorization:
+ type: Bearer
+ credentials: '{{ key "secrets/garage-staging/metrics_token" }}'  # quoted so the rendered token is always parsed as a YAML string
+ consul_sd_configs:
+ - server: 'https://localhost:8501'
+ services:
+ - 'garage-staging-admin'
+ tls_config:
+ ca_file: /etc/prometheus/consul.crt
+ cert_file: /etc/prometheus/consul-client.crt
+ key_file: /etc/prometheus/consul-client.key
diff --git a/cluster/staging/app/telemetry/deploy/telemetry-system.hcl b/cluster/staging/app/telemetry/deploy/telemetry-system.hcl
index 3e26c2e..e2bad61 100644
--- a/cluster/staging/app/telemetry/deploy/telemetry-system.hcl
+++ b/cluster/staging/app/telemetry/deploy/telemetry-system.hcl
@@ -1,182 +1,49 @@
job "telemetry-system" {
- datacenters = ["neptune"]
- type = "system"
+ datacenters = ["neptune"]
+ type = "system"
+ priority = 100  # job scheduling priority; Nomad documents this field as an integer
- group "elasticsearch" {
+ group "collector" {
network {
- port "elastic" {
- static = 9200
- }
- port "elastic_internal" {
- static = 9300
- }
+ port "node_exporter" { static = 9100 }
}
- task "elastic" {
- driver = "docker"
- config {
- image = "docker.elastic.co/elasticsearch/elasticsearch:8.2.0"
- network_mode = "host"
- volumes = [
- "/mnt/ssd/telemetry/es_data:/usr/share/elasticsearch/data",
- "secrets/elastic-certificates.p12:/usr/share/elasticsearch/config/elastic-certificates.p12",
- ]
- ports = [ "elastic", "elastic_internal" ]
- sysctl = {
- #"vm.max_map_count" = "262144",
- }
- ulimit = {
- memlock = "9223372036854775807:9223372036854775807",
+ task "node_exporter" {
+ driver = "docker"
+
+ config {
+ image = "quay.io/prometheus/node-exporter:v1.1.2"
+ network_mode = "host"
+ volumes = [
+ "/:/host:ro,rslave"
+ ]
+ args = [ "--path.rootfs=/host" ]
+ }
+
+ resources {
+ cpu = 50
+ memory = 40
+ }
+
+ service {
+ tags = [ "telemetry" ]
+ port = 9100
+ address_mode = "driver"
+ name = "node-exporter"
+ check {
+ type = "http"
+ path = "/"
+ port = 9100
+ address_mode = "driver"
+ interval = "60s"
+ timeout = "5s"
+ check_restart {
+ limit = 3
+ grace = "90s"
+ ignore_warnings = false
+ }
}
}
-
- user = "1000"
-
- resources {
- memory = 1500
- cpu = 500
- }
-
- template {
- data = "{{ key \"secrets/telemetry/elasticsearch/elastic-certificates.p12\" }}"
- destination = "secrets/elastic-certificates.p12"
- }
-
- template {
- data = <<EOH
-node.name={{ env "attr.unique.hostname" }}
-http.port=9200
-transport.port=9300
-cluster.name=es-deuxfleurs
-cluster.initial_master_nodes=carcajou,caribou,cariacou
-discovery.seed_hosts=carcajou,caribou,cariacou
-bootstrap.memory_lock=true
-xpack.security.enabled=true
-xpack.security.authc.api_key.enabled=true
-xpack.security.transport.ssl.enabled=true
-xpack.security.transport.ssl.verification_mode=certificate
-xpack.security.transport.ssl.client_authentication=required
-xpack.security.transport.ssl.keystore.path=/usr/share/elasticsearch/config/elastic-certificates.p12
-xpack.security.transport.ssl.truststore.path=/usr/share/elasticsearch/config/elastic-certificates.p12
-cluster.routing.allocation.disk.watermark.high=75%
-cluster.routing.allocation.disk.watermark.low=65%
-ES_JAVA_OPTS=-Xms512M -Xmx512M
-EOH
- destination = "secrets/env"
- env = true
- }
- }
- }
-
- group "collector" {
- network {
- port "otel_grpc" {
- static = 4317
- }
- port "apm" {
- static = 8200
- }
- port "node_exporter" {
- static = 9100
- }
- }
-
- task "otel" {
- driver = "docker"
- config {
- image = "otel/opentelemetry-collector-contrib:0.46.0"
- args = [
- "--config=/etc/otel-config.yaml",
- ]
- network_mode = "host"
- ports= [ "otel_grpc" ]
- volumes = [
- "secrets/otel-config.yaml:/etc/otel-config.yaml"
- ]
- }
-
- template {
- data = file("../config/otel-config.yaml")
- destination = "secrets/otel-config.yaml"
- }
-
- resources {
- memory = 100
- cpu = 100
- }
- }
-
- task "apm" {
- driver = "docker"
- config {
- image = "docker.elastic.co/apm/apm-server:8.2.0"
- network_mode = "host"
- ports = [ "apm" ]
- args = [ "--strict.perms=false" ]
- volumes = [
- "secrets/apm-config.yaml:/usr/share/apm-server/apm-server.yml:ro"
- ]
- }
-
- template {
- data = file("../config/apm-config.yaml")
- destination = "secrets/apm-config.yaml"
- }
-
- resources {
- memory = 100
- cpu = 100
- }
- }
-
-/*
- task "node_exporter" {
- driver = "docker"
- config {
- image = "quay.io/prometheus/node-exporter:v1.1.2"
- network_mode = "host"
- ports = [ "node_exporter" ]
- volumes = [
- "/:/host:ro,rslave"
- ]
- args = [ "--path.rootfs=/host" ]
- }
-
- resources {
- cpu = 50
- memory = 40
- }
- }
-*/
-
- task "filebeat" {
- driver = "docker"
- config {
- image = "docker.elastic.co/beats/filebeat:8.2.0"
- network_mode = "host"
- volumes = [
- "/mnt/ssd/telemetry/filebeat:/usr/share/filebeat/data",
- "secrets/filebeat.yml:/usr/share/filebeat/filebeat.yml",
- "/var/run/docker.sock:/var/run/docker.sock",
- "/var/lib/docker/containers/:/var/lib/docker/containers/:ro",
- "/var/log/:/var/log/:ro",
- ]
- args = [ "--strict.perms=false" ]
- privileged = true
- }
- user = "root"
-
-
- template {
- data = file("../config/filebeat.yml")
- destination = "secrets/filebeat.yml"
- }
-
- resources {
- memory = 100
- cpu = 100
- }
- }
- }
+ }
+ }
}
-
diff --git a/cluster/staging/app/telemetry/deploy/telemetry.hcl b/cluster/staging/app/telemetry/deploy/telemetry.hcl
index 21685a1..cfd26f3 100644
--- a/cluster/staging/app/telemetry/deploy/telemetry.hcl
+++ b/cluster/staging/app/telemetry/deploy/telemetry.hcl
@@ -2,51 +2,59 @@ job "telemetry" {
datacenters = ["neptune"]
type = "service"
- group "kibana" {
+ group "prometheus" {
count = 1
network {
- port "kibana" {
- static = 5601
+ port "prometheus" {
+ static = 9090
}
}
- task "kibana" {
+ task "prometheus" {
driver = "docker"
config {
- image = "docker.elastic.co/kibana/kibana:8.2.0"
+ image = "prom/prometheus:v2.38.0"
network_mode = "host"
- ports = [ "kibana" ]
+ ports = [ "prometheus" ]
+ volumes = [
+ "secrets:/etc/prometheus"
+ ]
}
template {
- data = <<EOH
-SERVER_NAME={{ env "attr.unique.hostname" }}
-ELASTICSEARCH_HOSTS=http://localhost:9200
-ELASTICSEARCH_USERNAME=kibana_system
-ELASTICSEARCH_PASSWORD={{ key "secrets/telemetry/elastic_passwords/kibana_system" }}
-SERVER_PUBLICBASEURL=https://kibana.home.adnab.me
-EOH
- destination = "secrets/env"
- env = true
+ data = file("../config/prometheus.yml")
+ destination = "secrets/prometheus.yml"
+ }
+
+ template {
+ data = "{{ key \"secrets/consul/consul.crt\" }}"
+ destination = "secrets/consul.crt" # seen in the container as /etc/prometheus/consul.crt (ca_file in prometheus.yml)
+ }
+
+ template {
+ data = "{{ key \"secrets/consul/consul-client.crt\" }}"
+ destination = "secrets/consul-client.crt" # seen in the container as /etc/prometheus/consul-client.crt (cert_file in prometheus.yml)
+ }
+
+ template {
+ data = "{{ key \"secrets/consul/consul-client.key\" }}"
+ destination = "secrets/consul-client.key" # seen in the container as /etc/prometheus/consul-client.key (key_file in prometheus.yml)
}
resources {
- memory = 1000
+ memory = 500
cpu = 500
}
service {
- tags = [
- "kibana",
- "tricot kibana.staging.deuxfleurs.org",
- ]
- port = 5601
+ port = 9090
address_mode = "driver"
- name = "kibana"
+ name = "prometheus"
check {
- type = "tcp"
- port = 5601
+ type = "http"
+ path = "/"
+ port = 9090
address_mode = "driver"
interval = "60s"
timeout = "5s"
@@ -59,13 +67,13 @@ EOH
}
}
}
-
+
group "grafana" {
count = 1
network {
port "grafana" {
- static = 3333
+ static = 3719
}
}
@@ -107,19 +115,19 @@ EOH
ports = [ "grafana" ]
volumes = [
"../alloc/data:/var/lib/grafana",
- "secrets/elastic.yaml:/etc/grafana/provisioning/datasources/elastic.yaml"
+ "secrets/prometheus.yaml:/etc/grafana/provisioning/datasources/prometheus.yaml"
]
}
template {
- data = file("../config/grafana/provisioning/datasources/elastic.yaml")
- destination = "secrets/elastic.yaml"
+ data = file("../config/grafana-datasource-prometheus.yaml")
+ destination = "secrets/prometheus.yaml"
}
template {
data = <<EOH
GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel,grafana-worldmap-panel,grafana-polystat-panel
-GF_SERVER_HTTP_PORT=3333
+GF_SERVER_HTTP_PORT=3719
EOH
destination = "secrets/env"
env = true
@@ -135,12 +143,12 @@ EOH
"grafana",
"tricot grafana.staging.deuxfleurs.org",
]
- port = 3333
+ port = 3719
address_mode = "driver"
name = "grafana"
check {
type = "tcp"
- port = 3333
+ port = 3719
address_mode = "driver"
interval = "60s"
timeout = "5s"
diff --git a/cluster/staging/app/telemetry/secrets/telemetry/grafana/admin_password b/cluster/staging/app/telemetry/secrets/telemetry/grafana/admin_password
new file mode 100644
index 0000000..2f36e97
--- /dev/null
+++ b/cluster/staging/app/telemetry/secrets/telemetry/grafana/admin_password
@@ -0,0 +1 @@
+CMD openssl rand -base64 12
diff --git a/cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_access_key b/cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_access_key
new file mode 100644
index 0000000..c7e41a4
--- /dev/null
+++ b/cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_access_key
@@ -0,0 +1 @@
+USER S3 access key for grafana db
diff --git a/cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_secret_key b/cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_secret_key
new file mode 100644
index 0000000..051f41a
--- /dev/null
+++ b/cluster/staging/app/telemetry/secrets/telemetry/grafana/s3_secret_key
@@ -0,0 +1 @@
+USER S3 secret key for grafana db