10 files changed, 230 insertions, 176 deletions
diff --git a/cluster/prod/app/plume/config/app.env b/cluster/prod/app/plume/config/app.env
index b663d81..36000c2 100644
--- a/cluster/prod/app/plume/config/app.env
+++ b/cluster/prod/app/plume/config/app.env
@@ -28,7 +28,7 @@ MIGRATION_DIRECTORY=migrations/postgres
 
 USE_HTTPS=0
 ROCKET_ADDRESS=::
-ROCKET_PORT={{ env "NOMAD_PORT_web_port" }}
+ROCKET_PORT={{ env "NOMAD_PORT_back_port" }}
 
 MEDIA_UPLOAD_DIRECTORY=/app/static/media
 SEARCH_INDEX=/app/search_index
diff --git a/cluster/prod/app/plume/deploy/plume.hcl b/cluster/prod/app/plume/deploy/plume.hcl
index d9e276e..c759a02 100644
--- a/cluster/prod/app/plume/deploy/plume.hcl
+++ b/cluster/prod/app/plume/deploy/plume.hcl
@@ -6,7 +6,45 @@ job "plume-blog" {
     count = 1
 
     network {
-      port "web_port" { }
+      port "back_port" { }
+      port "cache_port" { }
+    }
+
+    task "varnish" {
+      driver = "docker"
+      config {
+        image = "varnish:7.6.1"
+        network_mode = "host"
+        ports = [ "cache_port" ]
+
+        # tmpfs mount intended for varnishd's directory under /var/lib/varnish
+        mount {
+          type = "tmpfs"
+          target = "/var/lib/varnish/varnishd:exec" # NOTE(review): ":exec" is `docker run --tmpfs` option syntax; here it likely becomes part of the path - confirm
+          readonly = false
+          tmpfs_options {
+            size = 2684354560 # 2.5 GiB in bytes (2.5 * 1024^3)
+          }
+        }
+      }
+
+      env {
+        VARNISH_SIZE = "2G" # NOTE(review): this task declares no resources block - confirm its memory limit covers the cache plus the tmpfs
+        VARNISH_BACKEND_HOST = "localhost"
+        VARNISH_BACKEND_PORT = "${NOMAD_PORT_back_port}"
+        VARNISH_HTTP_PORT = "${NOMAD_PORT_cache_port}"
+      }
+
+      service {
+        name = "plume-cache"
+        tags = [
+          "plume",
+          "tricot plume.deuxfleurs.fr",
+          "d53-cname plume.deuxfleurs.fr",
+        ]
+        port = "cache_port"
+        address_mode = "host"
+      }
     }
 
     task "plume" {
@@ -14,9 +52,9 @@ job "plume-blog" {
       config {
         image = "lxpz/plume_s3:v1"
         network_mode = "host"
-        ports = [ "web_port" ]
+        ports = [ "back_port" ]
         command = "sh"
-        args = [ "-c", "plm search init; plm search refill; plume" ]
+        args = [ "-c", "plm search init; plume" ]
       }
 
       template {
@@ -26,24 +64,22 @@ job "plume-blog" {
       }
 
       resources {
-        memory = 1024
-        memory_max = 1024
+        memory = 512
+        memory_max = 512
         cpu = 100
       }
 
       service {
-        name = "plume"
+        name = "plume-back"
         tags = [
           "plume",
-          "tricot plume.deuxfleurs.fr",
-          "d53-cname plume.deuxfleurs.fr",
         ]
-        port = "web_port"
+        port = "back_port"
         address_mode = "host"
         check {
           type = "http"
           protocol = "http"
-          port = "web_port"
+          port = "back_port"
           path = "/"
           interval = "60s"
           timeout = "5s"
@@ -55,7 +91,7 @@ job "plume-blog" {
         }
       }
       restart {
-        interval = "30m"
+        interval = "20m"
         attempts = 20
         delay    = "15s"
         mode     = "delay"
diff --git a/cluster/prod/app/telemetry/deploy/telemetry-service.hcl b/cluster/prod/app/telemetry/deploy/telemetry-service.hcl
index 8b120e6..0744abc 100644
--- a/cluster/prod/app/telemetry/deploy/telemetry-service.hcl
+++ b/cluster/prod/app/telemetry/deploy/telemetry-service.hcl
@@ -45,7 +45,7 @@ job "telemetry-service" {
     task "grafana" {
       driver = "docker"
       config {
-        image = "grafana/grafana:11.4.0"
+        image = "grafana/grafana:11.4.1"
         network_mode = "host"
         ports = [ "grafana" ]
         volumes = [
@@ -76,9 +76,9 @@ EOH
       }
 
       resources {
-        memory = 100
+        memory = 200
         memory_max = 400
-        cpu = 500
+        cpu = 300
       }
 
       service {
diff --git a/cluster/prod/app/woodpecker-ci/deploy/server.hcl b/cluster/prod/app/woodpecker-ci/deploy/server.hcl
index e0788de..60806b9 100644
--- a/cluster/prod/app/woodpecker-ci/deploy/server.hcl
+++ b/cluster/prod/app/woodpecker-ci/deploy/server.hcl
@@ -23,7 +23,7 @@ job "woodpecker-ci" {
     task "server" {
       driver = "docker"
       config {
-        image = "woodpeckerci/woodpecker-server:v2.7.1"
+        image = "woodpeckerci/woodpecker-server:v3.0.1"
         ports = [ "web_port", "grpc_port" ]
         network_mode = "host"
       }
diff --git a/cluster/prod/app/woodpecker-ci/integration/docker-compose.yml b/cluster/prod/app/woodpecker-ci/integration/docker-compose.yml
index 7b825df..5756b25 100644
--- a/cluster/prod/app/woodpecker-ci/integration/docker-compose.yml
+++ b/cluster/prod/app/woodpecker-ci/integration/docker-compose.yml
@@ -10,7 +10,7 @@ services:
       - "./nix.conf:/etc/nix/nix.conf:ro"
 
   woodpecker-runner:
-    image: woodpeckerci/woodpecker-agent:v2.4.1
+    image: woodpeckerci/woodpecker-agent:v3.0.1
     restart: always
     environment:
       # -- change these for each agent
diff --git a/cluster/prod/cluster.nix b/cluster/prod/cluster.nix
index 080b258..66da48d 100644
--- a/cluster/prod/cluster.nix
+++ b/cluster/prod/cluster.nix
@@ -133,6 +133,9 @@
     kokakiwi = [
       "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFPTsEgcOtb2bij+Ih8eg8ZqO7d3IMiWykv6deMzlSSS kokakiwi@kira"
     ];
+    stitch = [
+      "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAILdT28Emp9yJqTPrxz+oDP08KZaN1kbsNyVqt9p9IMED"
+    ];
   };
 
   # For Garage external communication
diff --git a/cluster/staging/app/telemetry/deploy/telemetry-service.hcl b/cluster/staging/app/telemetry/deploy/telemetry-service.hcl
index 47554e2..5fcaa7a 100644
--- a/cluster/staging/app/telemetry/deploy/telemetry-service.hcl
+++ b/cluster/staging/app/telemetry/deploy/telemetry-service.hcl
@@ -2,95 +2,6 @@ job "telemetry-service" {
   datacenters = ["neptune", "dathomir", "corrin", "bespin"]
   type = "service"
 
-  group "prometheus" {
-    count = 2
-
-    network {
-      port "prometheus" {
-        static = 9090
-      }
-    }
-
-    constraint {
-      attribute = "${attr.unique.hostname}"
-      operator = "set_contains_any"
-      value = "df-pw5,origan"
-    }
-
-    task "prometheus" {
-      driver = "nix2"
-      config {
-        nixpkgs = "github:nixos/nixpkgs/nixos-22.11"
-        packages = [ "#prometheus", "#coreutils", "#findutils", "#bash" ]
-        command = "prometheus"
-        args = [
-          "--config.file=/etc/prom/prometheus.yml",
-          "--storage.tsdb.path=/data",
-          "--storage.tsdb.retention.size=5GB",
-        ]
-        bind = {
-          "/mnt/ssd/prometheus" = "/data"
-        }
-      }
-
-      template {
-        data = file("../config/prometheus.yml")
-        destination = "etc/prom/prometheus.yml"
-      }
-
-      template {
-        data = "{{ key \"secrets/consul/consul-ca.crt\" }}"
-        destination = "etc/prom/consul.crt"
-      }
-
-      template {
-        data = "{{ key \"secrets/consul/consul-client.crt\" }}"
-        destination = "etc/prom/consul-client.crt"
-      }
-
-      template {
-        data = "{{ key \"secrets/consul/consul-client.key\" }}"
-        destination = "etc/prom/consul-client.key"
-      }
-
-      template {
-        data = "{{ key \"secrets/nomad/nomad-ca.crt\" }}"
-        destination = "etc/prom/nomad-ca.crt"
-      }
-
-      template {
-        data = "{{ key \"secrets/nomad/nomad-client.crt\" }}"
-        destination = "etc/prom/nomad-client.crt"
-      }
-
-      template {
-        data = "{{ key \"secrets/nomad/nomad-client.key\" }}"
-        destination = "etc/prom/nomad-client.key"
-      }
-
-      resources {
-        memory = 500
-        cpu = 200
-      }
-
-      service {
-        port = "prometheus"
-        name = "prometheus"
-        check {
-          type = "http"
-          path = "/"
-          interval = "60s"
-          timeout = "5s"
-          check_restart {
-            limit = 3
-            grace = "90s"
-            ignore_warnings = false
-          }
-        }
-      }
-    }
-  }
-
   group "grafana" {
     count = 1
 
@@ -106,50 +17,46 @@ job "telemetry-service" {
         sidecar = false
       }
 
-      driver = "nix2"
+      driver = "docker"
       config {
-        packages = [ "#litestream" ]
-        command = "litestream"
+        image = "litestream/litestream:0.3.13"
         args = [
           "restore", "-config", "/etc/litestream.yml", "/ephemeral/grafana.db"
         ]
-        bind = {
-          "../alloc/data" = "/ephemeral",
-        }
+        volumes = [
+          "../alloc/data:/ephemeral",
+          "secrets/litestream.yml:/etc/litestream.yml"
+        ]
       }
+      user = "472"
 
       template {
         data = file("../config/grafana-litestream.yml")
-        destination = "etc/litestream.yml"
+        destination = "secrets/litestream.yml"
       }
 
       resources {
-        memory = 100
-        memory_max = 1000
+        memory = 50
+        memory_max = 200
         cpu = 100
       }
     }
 
     task "grafana" {
-      driver = "nix2"
+      driver = "docker"
       config {
-        nixpkgs = "github:nixos/nixpkgs/nixos-22.11"
-        packages = [ "#grafana" ]
-        command = "grafana-server"
-        args = [
-          "-homepath", "/share/grafana",
-          "cfg:default.paths.data=/grafana",
-          "cfg:default.paths.provisioning=/grafana-provisioning"
+        image = "grafana/grafana:11.4.1"
+        network_mode = "host"
+        ports = [ "grafana" ]
+        volumes = [
+          "../alloc/data:/var/lib/grafana",
+          "secrets/prometheus.yaml:/etc/grafana/provisioning/datasources/prometheus.yaml"
         ]
-
-        bind = {
-          "../alloc/data" = "/grafana",
-        }
       }
 
       template {
         data = file("../config/grafana-datasource-prometheus.yaml")
-        destination = "grafana-provisioning/datasources/prometheus.yaml"
+        destination = "secrets/prometheus.yaml"
       }
 
       template {
@@ -163,8 +70,9 @@ GF_SECURITY_ADMIN_PASSWORD={{ key "secrets/telemetry/grafana/admin_password" }}
       }
 
       resources {
-        memory = 300
-          cpu = 300
+        memory = 100
+        memory_max = 400
+        cpu = 300
       }
 
       restart {
@@ -181,9 +89,12 @@ GF_SECURITY_ADMIN_PASSWORD={{ key "secrets/telemetry/grafana/admin_password" }}
           "tricot grafana.staging.deuxfleurs.org",
           "d53-cname grafana.staging.deuxfleurs.org",
         ]
-        port = "grafana"
+        port = 3719
+        address_mode = "driver"
         check {
           type = "tcp"
+          port = 3719
+          address_mode = "driver"
           interval = "60s"
           timeout = "5s"
           check_restart {
@@ -196,26 +107,27 @@ GF_SECURITY_ADMIN_PASSWORD={{ key "secrets/telemetry/grafana/admin_password" }}
     }
 
     task "replicate-db" {
-      driver = "nix2"
+      driver = "docker"
       config {
-        packages = [ "#litestream" ]
-        command = "litestream"
+        image = "litestream/litestream:0.3.13"
         args = [
           "replicate", "-config", "/etc/litestream.yml"
         ]
-        bind = {
-          "../alloc/data" = "/ephemeral",
-        }
+        volumes = [
+          "../alloc/data:/ephemeral",
+          "secrets/litestream.yml:/etc/litestream.yml"
+        ]
       }
+      user = "472"
 
       template {
         data = file("../config/grafana-litestream.yml")
-        destination = "etc/litestream.yml"
+        destination = "secrets/litestream.yml"
       }
 
       resources {
-        memory = 100
-        memory_max = 500
+        memory = 50
+        memory_max = 200
         cpu = 100
       }
     }
diff --git a/cluster/staging/app/telemetry/deploy/telemetry-storage.hcl b/cluster/staging/app/telemetry/deploy/telemetry-storage.hcl
new file mode 100644
index 0000000..fbde697
--- /dev/null
+++ b/cluster/staging/app/telemetry/deploy/telemetry-storage.hcl
@@ -0,0 +1,97 @@
+job "telemetry-storage" {
+  datacenters = ["neptune", "dathomir", "corrin", "bespin"]
+  type = "service"
+
+  group "prometheus" {
+    count = 2
+
+    network {
+      port "prometheus" {
+        static = 9090
+      }
+    }
+
+    constraint {
+      attribute = "${attr.unique.hostname}"
+      operator = "set_contains_any"
+      value = "df-pw5,origan"
+    }
+
+    task "prometheus" {
+      driver = "docker"
+      config {
+        image = "prom/prometheus:v3.1.0"
+        network_mode = "host"
+        ports = [ "prometheus" ]
+        args = [
+          "--config.file=/etc/prometheus/prometheus.yml",
+          "--storage.tsdb.path=/data",
+          "--storage.tsdb.retention.size=20GB",
+        ]
+        volumes = [
+          "secrets:/etc/prometheus",
+          "/mnt/ssd/prometheus:/data"
+        ]
+      }
+
+      template {
+        data = file("../config/prometheus.yml")
+        destination = "secrets/prometheus.yml"
+      }
+
+      template {
+        data = "{{ key \"secrets/consul/consul-ca.crt\" }}"
+        destination = "secrets/consul.crt"
+      }
+
+      template {
+        data = "{{ key \"secrets/consul/consul-client.crt\" }}"
+        destination = "secrets/consul-client.crt"
+      }
+
+      template {
+        data = "{{ key \"secrets/consul/consul-client.key\" }}"
+        destination = "secrets/consul-client.key"
+      }
+
+      template {
+        data = "{{ key \"secrets/nomad/nomad-ca.crt\" }}"
+        destination = "secrets/nomad-ca.crt"
+      }
+
+      template {
+        data = "{{ key \"secrets/nomad/nomad-client.crt\" }}"
+        destination = "secrets/nomad-client.crt"
+      }
+
+      template {
+        data = "{{ key \"secrets/nomad/nomad-client.key\" }}"
+        destination = "secrets/nomad-client.key"
+      }
+
+      resources {
+        memory = 500
+        cpu = 200
+      }
+
+      service {
+        port = 9090
+        address_mode = "driver"
+        name = "prometheus"
+        check {
+          type = "http"
+          path = "/"
+          port = 9090
+          address_mode = "driver"
+          interval = "60s"
+          timeout = "5s"
+          check_restart {
+            limit = 3
+            grace = "90s"
+            ignore_warnings = false
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/cluster/staging/app/telemetry/deploy/telemetry-system.hcl b/cluster/staging/app/telemetry/deploy/telemetry-system.hcl
index a97c7b1..9cd254a 100644
--- a/cluster/staging/app/telemetry/deploy/telemetry-system.hcl
+++ b/cluster/staging/app/telemetry/deploy/telemetry-system.hcl
@@ -4,43 +4,46 @@ job "telemetry-system" {
   priority = "100"
 
   group "collector" {
-    network {
-      port "node_exporter" { static = 9100 }
-    }
+    network {
+      port "node_exporter" { static = 9100 }
+    }
 
-    task "node_exporter" {
-      driver = "nix2"
+    task "node_exporter" {
+      driver = "docker"
 
-      config {
-        packages = [ "#prometheus-node-exporter" ]
-        command = "node_exporter"
-        args = [ "--path.rootfs=/host" ]
-        bind_read_only = {
-          "/" = "/host"
-        }
-      }
+      config {
+        image = "quay.io/prometheus/node-exporter:v1.8.1"
+        network_mode = "host"
+        volumes = [
+          "/:/host:ro,rslave"
+        ]
+        args = [ "--path.rootfs=/host" ]
+      }
 
-      resources {
-        cpu = 50
-        memory = 40
-      }
+      resources {
+        cpu = 50
+        memory = 40
+      }
 
-      service {
-        name = "node-exporter"
-        tags = [ "telemetry" ]
-        port = "node_exporter"
-        check {
-          type = "http"
-          path = "/"
-          interval = "60s"
-          timeout = "5s"
-          check_restart {
-            limit = 3
-            grace = "90s"
-            ignore_warnings = false
-          }
-        }
-      }
-    }
-  }
-}
+      service {
+        tags = [ "telemetry" ]
+        port = 9100
+        address_mode = "driver"
+        name = "node-exporter"
+        check {
+          type = "http"
+          path = "/"
+          port = 9100
+          address_mode = "driver"
+          interval = "60s"
+          timeout = "5s"
+          check_restart {
+            limit = 3
+            grace = "90s"
+            ignore_warnings = false
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/cluster/staging/cluster.nix b/cluster/staging/cluster.nix
index 26011d2..8a71424 100644
--- a/cluster/staging/cluster.nix
+++ b/cluster/staging/cluster.nix
@@ -90,6 +90,9 @@
       "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJX0A2P59or83EKhh32o8XumGz0ToTEsoq89hMbMtr7h"
       "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIB540H9kn+Ocs4Wjc1Y3f3OkHFYEqc5IM/FiCyoVVoh3"
     ];
+    stitch = [
+      "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAILdT28Emp9yJqTPrxz+oDP08KZaN1kbsNyVqt9p9IMED"
+    ];
   };
 
   # For Garage ipv6 communication