author    | Quentin Dufour <quentin@deuxfleurs.fr> | 2022-09-01 18:05:50 +0200
committer | Quentin Dufour <quentin@deuxfleurs.fr> | 2022-09-01 18:05:50 +0200
commit    | 02c65de5fec7242d225d6e052117f108f0a5e035 (patch)
tree      | 38ccd91ed7ca6c865bc1acb544d206b5a1b365f8 /cluster/prod
parent    | 1749a98e86b0ea33131bbc3511fb7ed6fb39375d (diff)
download  | nixcfg-02c65de5fec7242d225d6e052117f108f0a5e035.tar.gz, nixcfg-02c65de5fec7242d225d6e052117f108f0a5e035.zip
Restart backups
Diffstat (limited to 'cluster/prod')
26 files changed, 487 insertions(+), 1 deletion(-)
diff --git a/cluster/prod/app/backup/build/backup-consul/Dockerfile b/cluster/prod/app/backup/build/backup-consul/Dockerfile
new file mode 100644
index 0000000..0a5c38f
--- /dev/null
+++ b/cluster/prod/app/backup/build/backup-consul/Dockerfile
@@ -0,0 +1,28 @@
+FROM golang:buster as builder
+
+WORKDIR /root
+RUN git clone https://filippo.io/age && cd age/cmd/age && go build -o age .
+
+FROM amd64/debian:buster
+
+COPY --from=builder /root/age/cmd/age/age /usr/local/bin/age
+
+RUN apt-get update && \
+    apt-get -qq -y full-upgrade && \
+    apt-get install -y rsync wget openssh-client unzip && \
+    apt-get clean && \
+    rm -f /var/lib/apt/lists/*_*
+
+RUN mkdir -p /root/.ssh
+WORKDIR /root
+
+RUN wget https://releases.hashicorp.com/consul/1.8.5/consul_1.8.5_linux_amd64.zip && \
+    unzip consul_1.8.5_linux_amd64.zip && \
+    chmod +x consul && \
+    mv consul /usr/local/bin && \
+    rm consul_1.8.5_linux_amd64.zip
+
+COPY do_backup.sh /root/do_backup.sh
+
+CMD "/root/do_backup.sh"
+
diff --git a/cluster/prod/app/backup/build/backup-consul/do_backup.sh b/cluster/prod/app/backup/build/backup-consul/do_backup.sh
new file mode 100755
index 0000000..a34e7b7
--- /dev/null
+++ b/cluster/prod/app/backup/build/backup-consul/do_backup.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+set -x -e
+
+cd /root
+
+chmod 0600 .ssh/id_ed25519
+
+cat > .ssh/config <<EOF
+Host backuphost
+  HostName $TARGET_SSH_HOST
+  Port $TARGET_SSH_PORT
+  User $TARGET_SSH_USER
+EOF
+
+consul kv export | \
+  gzip | \
+  age -r "$(cat /root/.ssh/id_ed25519.pub)" | \
+  ssh backuphost "cat > $TARGET_SSH_DIR/consul/$(date --iso-8601=minute)_consul_kv_export.gz.age"
+
diff --git a/cluster/prod/app/backup/build/backup-psql/.gitignore b/cluster/prod/app/backup/build/backup-psql/.gitignore
new file mode 100644
index 0000000..b2be92b
--- /dev/null
+++ b/cluster/prod/app/backup/build/backup-psql/.gitignore
@@ -0,0 +1 @@
+result
diff --git a/cluster/prod/app/backup/build/backup-psql/README.md b/cluster/prod/app/backup/build/backup-psql/README.md
new file mode 100644
index 0000000..97929db
--- /dev/null
+++ b/cluster/prod/app/backup/build/backup-psql/README.md
@@ -0,0 +1,8 @@
+## Build
+
+```bash
+docker load < $(nix-build docker.nix)
+docker push superboum/backup-psql:???
+```
+
+
diff --git a/cluster/prod/app/backup/build/backup-psql/backup-psql.py b/cluster/prod/app/backup/build/backup-psql/backup-psql.py
new file mode 100755
index 0000000..291cf50
--- /dev/null
+++ b/cluster/prod/app/backup/build/backup-psql/backup-psql.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+import shutil,sys,os,datetime,minio,subprocess
+
+working_directory = "."
+if 'CACHE_DIR' in os.environ: working_directory = os.environ['CACHE_DIR']
+required_space_in_bytes = 20 * 1024 * 1024 * 1024
+bucket = os.environ['AWS_BUCKET']
+key = os.environ['AWS_ACCESS_KEY_ID']
+secret = os.environ['AWS_SECRET_ACCESS_KEY']
+endpoint = os.environ['AWS_ENDPOINT']
+pubkey = os.environ['CRYPT_PUBLIC_KEY']
+psql_host = os.environ['PSQL_HOST']
+psql_user = os.environ['PSQL_USER']
+s3_prefix = str(datetime.datetime.now())
+files = [ "backup_manifest", "base.tar.gz", "pg_wal.tar.gz" ]
+clear_paths = [ os.path.join(working_directory, f) for f in files ]
+crypt_paths = [ os.path.join(working_directory, f) + ".age" for f in files ]
+s3_keys = [ s3_prefix + "/" + f for f in files ]
+
+def abort(msg):
+    for p in clear_paths + crypt_paths:
+        if os.path.exists(p):
+            print(f"Remove {p}")
+            os.remove(p)
+
+    if msg: sys.exit(msg)
+    else: print("success")
+
+# Check we have enough space on disk
+if shutil.disk_usage(working_directory).free < required_space_in_bytes:
+    abort(f"Not enough space on disk at path {working_directory} to perform a backup, aborting")
+
+# Check postgres password is set
+if 'PGPASSWORD' not in os.environ:
+    abort(f"You must pass postgres' password through the environment variable PGPASSWORD")
+
+# Check our working directory is empty
+if len(os.listdir(working_directory)) != 0:
+    abort(f"Working directory {working_directory} is not empty, aborting")
+
+# Check Minio
+client = minio.Minio(endpoint, key, secret)
+if not client.bucket_exists(bucket):
+    abort(f"Bucket {bucket} does not exist or its access is forbidden, aborting")
+
+# Perform the backup locally
+try:
+    ret = subprocess.run(["pg_basebackup",
+        f"--host={psql_host}",
+        f"--username={psql_user}",
+        f"--pgdata={working_directory}",
+        f"--format=tar",
+        "--wal-method=stream",
+        "--gzip",
+        "--compress=6",
+        "--progress",
+        "--max-rate=5M",
+    ])
+    if ret.returncode != 0:
+        abort(f"pg_basebackup exited, expected return code 0, got {ret.returncode}. aborting")
+except Exception as e:
+    abort(f"pg_basebackup raised exception {e}. aborting")
+
+# Check that the expected files are here
+for p in clear_paths:
+    print(f"Checking that {p} exists locally")
+    if not os.path.exists(p):
+        abort(f"File {p} expected but not found, aborting")
+
+# Cipher them
+for c, e in zip(clear_paths, crypt_paths):
+    print(f"Ciphering {c} to {e}")
+    try:
+        ret = subprocess.run(["age", "-r", pubkey, "-o", e, c])
+        if ret.returncode != 0:
+            abort(f"age exit code is {ret}, 0 expected. aborting")
+    except Exception as e:
+        abort(f"aged raised an exception. {e}. aborting")
+
+# Upload the backup to S3
+for p, k in zip(crypt_paths, s3_keys):
+    try:
+        print(f"Uploading {p} to {k}")
+        result = client.fput_object(bucket, k, p)
+        print(
+            "created {0} object; etag: {1}, version-id: {2}".format(
+                result.object_name, result.etag, result.version_id,
+            ),
+        )
+    except Exception as e:
+        abort(f"Exception {e} occured while upload {p}. aborting")
+
+# Check that the files have been uploaded
+for k in s3_keys:
+    try:
+        print(f"Checking that {k} exists remotely")
+        result = client.stat_object(bucket, k)
+        print(
+            "last-modified: {0}, size: {1}".format(
+                result.last_modified, result.size,
+            ),
+        )
+    except Exception as e:
+        abort(f"{k} not found on S3. {e}. aborting")
+
+abort(None)
diff --git a/cluster/prod/app/backup/build/backup-psql/common.nix b/cluster/prod/app/backup/build/backup-psql/common.nix
new file mode 100644
index 0000000..639d9a1
--- /dev/null
+++ b/cluster/prod/app/backup/build/backup-psql/common.nix
@@ -0,0 +1,8 @@
+{
+  pkgsSrc = fetchTarball {
+    # Latest commit on https://github.com/NixOS/nixpkgs/tree/nixos-21.11
+    # As of 2022-04-15
+    url ="https://github.com/NixOS/nixpkgs/archive/2f06b87f64bc06229e05045853e0876666e1b023.tar.gz";
+    sha256 = "sha256:1d7zg96xw4qsqh7c89pgha9wkq3rbi9as3k3d88jlxy2z0ns0cy2";
+  };
+}
diff --git a/cluster/prod/app/backup/build/backup-psql/default.nix b/cluster/prod/app/backup/build/backup-psql/default.nix
new file mode 100644
index 0000000..2cd8d93
--- /dev/null
+++ b/cluster/prod/app/backup/build/backup-psql/default.nix
@@ -0,0 +1,37 @@
+let
+  common = import ./common.nix;
+  pkgs = import common.pkgsSrc {};
+  python-with-my-packages = pkgs.python3.withPackages (p: with p; [
+    minio
+  ]);
+in
+  pkgs.stdenv.mkDerivation {
+    name = "backup-psql";
+    src = pkgs.lib.sourceFilesBySuffices ./. [ ".py" ];
+
+    buildInputs = [
+      python-with-my-packages
+      pkgs.age
+      pkgs.postgresql_14
+    ];
+
+    buildPhase = ''
+      cat > backup-psql <<EOF
+      #!${pkgs.bash}/bin/bash
+
+      export PYTHONPATH=${python-with-my-packages}/${python-with-my-packages.sitePackages}
+      export PATH=${python-with-my-packages}/bin:${pkgs.age}/bin:${pkgs.postgresql_14}/bin
+
+      ${python-with-my-packages}/bin/python3 $out/lib/backup-psql.py
+      EOF
+
+      chmod +x backup-psql
+    '';
+
+    installPhase = ''
+      mkdir -p $out/{bin,lib}
+      cp *.py $out/lib/backup-psql.py
+      cp backup-psql $out/bin/backup-psql
+    '';
+  }
+
diff --git a/cluster/prod/app/backup/build/backup-psql/docker.nix b/cluster/prod/app/backup/build/backup-psql/docker.nix
new file mode 100644
index 0000000..693943a
--- /dev/null
+++ b/cluster/prod/app/backup/build/backup-psql/docker.nix
@@ -0,0 +1,11 @@
+let
+  common = import ./common.nix;
+  app = import ./default.nix;
+  pkgs = import common.pkgsSrc {};
+in
+  pkgs.dockerTools.buildImage {
+    name = "superboum/backup-psql-docker";
+    config = {
+      Cmd = [ "${app}/bin/backup-psql" ];
+    };
+  }
diff --git a/cluster/prod/app/backup/deploy/backup-daily.hcl b/cluster/prod/app/backup/deploy/backup-daily.hcl
new file mode 100644
index 0000000..df592ce
--- /dev/null
+++ b/cluster/prod/app/backup/deploy/backup-daily.hcl
@@ -0,0 +1,196 @@
+job "backup_daily" {
+  datacenters = ["orion", "neptune"]
+  type = "batch"
+
+  priority = "60"
+
+  periodic {
+    cron = "@daily"
+    // Do not allow overlapping runs.
+    prohibit_overlap = true
+  }
+
+  group "backup-dovecot" {
+    constraint {
+      attribute = "${attr.unique.hostname}"
+      operator = "="
+      value = "doradille"
+    }
+
+    task "main" {
+      driver = "docker"
+
+      config {
+        image = "restic/restic:0.14.0"
+        entrypoint = [ "/bin/sh", "-c" ]
+        args = [ "restic backup /mail && restic forget --keep-within 1m1d --keep-within-weekly 3m --keep-within-monthly 1y && restic prune --max-unused 50% --max-repack-size 2G && restic check" ]
+        volumes = [
+          "/mnt/ssd/mail:/mail"
+        ]
+      }
+
+      template {
+        data = <<EOH
+AWS_ACCESS_KEY_ID={{ key "secrets/email/dovecot/backup_aws_access_key_id" }}
+AWS_SECRET_ACCESS_KEY={{ key "secrets/email/dovecot/backup_aws_secret_access_key" }}
+RESTIC_REPOSITORY={{ key "secrets/email/dovecot/backup_restic_repository" }}
+RESTIC_PASSWORD={{ key "secrets/email/dovecot/backup_restic_password" }}
+EOH
+
+        destination = "secrets/env_vars"
+        env = true
+      }
+
+      resources {
+        cpu = 500
+        memory = 100
+        memory_max = 300
+      }
+
+      restart {
+        attempts = 2
+        interval = "30m"
+        delay = "15s"
+        mode = "fail"
+      }
+    }
+  }
+
+  group "backup-plume" {
+    constraint {
+      attribute = "${attr.unique.hostname}"
+      operator = "="
+      value = "dahlia"
+    }
+
+    task "main" {
+      driver = "docker"
+
+      config {
+        image = "restic/restic:0.14.0"
+        entrypoint = [ "/bin/sh", "-c" ]
+        args = [ "restic backup /plume && restic forget --keep-within 1m1d --keep-within-weekly 3m --keep-within-monthly 1y && restic prune --max-unused 50% --max-repack-size 2G && restic check" ]
+        volumes = [
+          "/mnt/ssd/plume/media:/plume"
+        ]
+      }
+
+      template {
+        data = <<EOH
+AWS_ACCESS_KEY_ID={{ key "secrets/plume/backup_aws_access_key_id" }}
+AWS_SECRET_ACCESS_KEY={{ key "secrets/plume/backup_aws_secret_access_key" }}
+RESTIC_REPOSITORY={{ key "secrets/plume/backup_restic_repository" }}
+RESTIC_PASSWORD={{ key "secrets/plume/backup_restic_password" }}
+EOH
+
+        destination = "secrets/env_vars"
+        env = true
+      }
+
+      resources {
+        cpu = 500
+        memory = 100
+        memory_max = 300
+      }
+
+      restart {
+        attempts = 2
+        interval = "30m"
+        delay = "15s"
+        mode = "fail"
+      }
+    }
+  }
+
+  group "backup-consul" {
+    task "consul-kv-export" {
+      driver = "docker"
+
+      lifecycle {
+        hook = "prestart"
+        sidecar = false
+      }
+
+      config {
+        image = "consul:1.13.1"
+        network_mode = "host"
+        entrypoint = [ "/bin/sh", "-c" ]
+        args = [ "/bin/consul kv export > $NOMAD_ALLOC_DIR/consul.json" ]
+        volumes = [
+          "secrets:/etc/consul",
+        ]
+      }
+
+      env {
+        CONSUL_HTTP_ADDR = "https://consul.service.prod.consul:8501"
+        CONSUL_HTTP_SSL = "true"
+        CONSUL_CACERT = "/etc/consul/consul.crt"
+        CONSUL_CLIENT_CERT = "/etc/consul/consul-client.crt"
+        CONSUL_CLIENT_KEY = "/etc/consul/consul-client.key"
+      }
+
+      resources {
+        cpu = 200
+        memory = 200
+      }
+
+
+      template {
+        data = "{{ key \"secrets/consul/consul.crt\" }}"
+        destination = "secrets/consul.crt"
+      }
+
+      template {
+        data = "{{ key \"secrets/consul/consul-client.crt\" }}"
+        destination = "secrets/consul-client.crt"
+      }
+
+      template {
+        data = "{{ key \"secrets/consul/consul-client.key\" }}"
+        destination = "secrets/consul-client.key"
+      }
+
+      restart {
+        attempts = 2
+        interval = "30m"
+        delay = "15s"
+        mode = "fail"
+      }
+    }
+
+    task "restic-backup" {
+      driver = "docker"
+
+      config {
+        image = "restic/restic:0.12.1"
+        entrypoint = [ "/bin/sh", "-c" ]
+        args = [ "restic backup $NOMAD_ALLOC_DIR/consul.json && restic forget --keep-within 1m1d --keep-within-weekly 3m --keep-within-monthly 1y && restic prune --max-unused 50% --max-repack-size 2G && restic check" ]
+      }
+
+
+      template {
+        data = <<EOH
+AWS_ACCESS_KEY_ID={{ key "secrets/backup/consul/backup_aws_access_key_id" }}
+AWS_SECRET_ACCESS_KEY={{ key "secrets/backup/consul/backup_aws_secret_access_key" }}
+RESTIC_REPOSITORY={{ key "secrets/backup/consul/backup_restic_repository" }}
+RESTIC_PASSWORD={{ key "secrets/backup/consul/backup_restic_password" }}
+EOH
+
+        destination = "secrets/env_vars"
+        env = true
+      }
+
+      resources {
+        cpu = 200
+        memory = 200
+      }
+
+      restart {
+        attempts = 2
+        interval = "30m"
+        delay = "15s"
+        mode = "fail"
+      }
+    }
+  }
+}
diff --git a/cluster/prod/app/backup/deploy/backup-weekly.hcl b/cluster/prod/app/backup/deploy/backup-weekly.hcl
new file mode 100644
index 0000000..36a507a
--- /dev/null
+++ b/cluster/prod/app/backup/deploy/backup-weekly.hcl
@@ -0,0 +1,55 @@
+job "backup_weekly" {
+  datacenters = ["orion"]
+  type = "batch"
+
+  priority = "60"
+
+  periodic {
+    cron = "@weekly"
+    // Do not allow overlapping runs.
+    prohibit_overlap = true
+  }
+
+  group "backup-psql" {
+    task "main" {
+      driver = "docker"
+
+      config {
+        image = "superboum/backup-psql-docker:gyr3aqgmhs0hxj0j9hkrdmm1m07i8za2"
+        volumes = [
+          // Mount a cache on the hard disk to avoid filling up the SSD
+          "/mnt/storage/tmp_bckp_psql:/mnt/cache"
+        ]
+      }
+
+      template {
+        data = <<EOH
+CACHE_DIR=/mnt/cache
+AWS_BUCKET=backups-pgbasebackup
+AWS_ENDPOINT=s3.deuxfleurs.shirokumo.net
+AWS_ACCESS_KEY_ID={{ key "secrets/postgres/backup/aws_access_key_id" }}
+AWS_SECRET_ACCESS_KEY={{ key "secrets/postgres/backup/aws_secret_access_key" }}
+CRYPT_PUBLIC_KEY={{ key "secrets/postgres/backup/crypt_public_key" }}
+PSQL_HOST=psql-proxy.service.prod.consul
+PSQL_USER={{ key "secrets/postgres/keeper/pg_repl_username" }}
+PGPASSWORD={{ key "secrets/postgres/keeper/pg_repl_pwd" }}
+EOH
+
+        destination = "secrets/env_vars"
+        env = true
+      }
+
+      resources {
+        cpu = 200
+        memory = 200
+      }
+
+      restart {
+        attempts = 2
+        interval = "30m"
+        delay = "15s"
+        mode = "fail"
+      }
+    }
+  }
+}
diff --git a/cluster/prod/app/backup/secrets/backup/consul/backup_aws_access_key_id b/cluster/prod/app/backup/secrets/backup/consul/backup_aws_access_key_id
new file mode 100644
index 0000000..9235e53
--- /dev/null
+++ b/cluster/prod/app/backup/secrets/backup/consul/backup_aws_access_key_id
@@ -0,0 +1 @@
+USER Backup AWS access key ID
diff --git a/cluster/prod/app/backup/secrets/backup/consul/backup_aws_secret_access_key b/cluster/prod/app/backup/secrets/backup/consul/backup_aws_secret_access_key
new file mode 100644
index 0000000..f34677e
--- /dev/null
+++ b/cluster/prod/app/backup/secrets/backup/consul/backup_aws_secret_access_key
@@ -0,0 +1 @@
+USER Backup AWS secret access key
diff --git a/cluster/prod/app/backup/secrets/backup/consul/backup_restic_password b/cluster/prod/app/backup/secrets/backup/consul/backup_restic_password
new file mode 100644
index 0000000..fbaa5fa
--- /dev/null
+++ b/cluster/prod/app/backup/secrets/backup/consul/backup_restic_password
@@ -0,0 +1 @@
+USER Restic password to encrypt backups
diff --git a/cluster/prod/app/backup/secrets/backup/consul/backup_restic_repository b/cluster/prod/app/backup/secrets/backup/consul/backup_restic_repository
new file mode 100644
index 0000000..3f6cb93
--- /dev/null
+++ b/cluster/prod/app/backup/secrets/backup/consul/backup_restic_repository
@@ -0,0 +1 @@
+USER Restic repository, eg. s3:https://s3.garage.tld
diff --git a/cluster/prod/app/backup/secrets/backup/id_ed25519 b/cluster/prod/app/backup/secrets/backup/id_ed25519
new file mode 100644
index 0000000..9d7fd46
--- /dev/null
+++ b/cluster/prod/app/backup/secrets/backup/id_ed25519
@@ -0,0 +1 @@
+USER_LONG Private ed25519 key of the container doing the backup
diff --git a/cluster/prod/app/backup/secrets/backup/id_ed25519.pub b/cluster/prod/app/backup/secrets/backup/id_ed25519.pub
new file mode 100644
index 0000000..0a2ab35
--- /dev/null
+++ b/cluster/prod/app/backup/secrets/backup/id_ed25519.pub
@@ -0,0 +1 @@
+USER Public ed25519 key of the container doing the backup (this key must be in authorized_keys on the backup target host)
diff --git a/cluster/prod/app/backup/secrets/backup/psql/aws_access_key_id b/cluster/prod/app/backup/secrets/backup/psql/aws_access_key_id
new file mode 100644
index 0000000..82375d7
--- /dev/null
+++ b/cluster/prod/app/backup/secrets/backup/psql/aws_access_key_id
@@ -0,0 +1 @@
+USER Minio access key
diff --git a/cluster/prod/app/backup/secrets/backup/psql/aws_secret_access_key b/cluster/prod/app/backup/secrets/backup/psql/aws_secret_access_key
new file mode 100644
index 0000000..de5090c
--- /dev/null
+++ b/cluster/prod/app/backup/secrets/backup/psql/aws_secret_access_key
@@ -0,0 +1 @@
+USER Minio secret key
diff --git a/cluster/prod/app/backup/secrets/backup/psql/crypt_private_key b/cluster/prod/app/backup/secrets/backup/psql/crypt_private_key
new file mode 100644
index 0000000..4abece9
--- /dev/null
+++ b/cluster/prod/app/backup/secrets/backup/psql/crypt_private_key
@@ -0,0 +1 @@
+USER a private key to decript backups from age
diff --git a/cluster/prod/app/backup/secrets/backup/psql/crypt_public_key b/cluster/prod/app/backup/secrets/backup/psql/crypt_public_key
new file mode 100644
index 0000000..156ad47
--- /dev/null
+++ b/cluster/prod/app/backup/secrets/backup/psql/crypt_public_key
@@ -0,0 +1 @@
+USER A public key to encypt backups with age
diff --git a/cluster/prod/app/backup/secrets/backup/target_ssh_dir b/cluster/prod/app/backup/secrets/backup/target_ssh_dir
new file mode 100644
index 0000000..3b2a4da
--- /dev/null
+++ b/cluster/prod/app/backup/secrets/backup/target_ssh_dir
@@ -0,0 +1 @@
+USER Directory where to store backups on target host
diff --git a/cluster/prod/app/backup/secrets/backup/target_ssh_fingerprint b/cluster/prod/app/backup/secrets/backup/target_ssh_fingerprint
new file mode 100644
index 0000000..608f3ec
--- /dev/null
+++ b/cluster/prod/app/backup/secrets/backup/target_ssh_fingerprint
@@ -0,0 +1 @@
+USER SSH fingerprint of the target machine (format: copy here the corresponding line from your known_hosts file)
diff --git a/cluster/prod/app/backup/secrets/backup/target_ssh_host b/cluster/prod/app/backup/secrets/backup/target_ssh_host
new file mode 100644
index 0000000..6268f87
--- /dev/null
+++ b/cluster/prod/app/backup/secrets/backup/target_ssh_host
@@ -0,0 +1 @@
+USER Hostname of the backup target host
diff --git a/cluster/prod/app/backup/secrets/backup/target_ssh_port b/cluster/prod/app/backup/secrets/backup/target_ssh_port
new file mode 100644
index 0000000..309dd38
--- /dev/null
+++ b/cluster/prod/app/backup/secrets/backup/target_ssh_port
@@ -0,0 +1 @@
+USER SSH port number to connect to the target host
diff --git a/cluster/prod/app/backup/secrets/backup/target_ssh_user b/cluster/prod/app/backup/secrets/backup/target_ssh_user
new file mode 100644
index 0000000..98b3046
--- /dev/null
+++ b/cluster/prod/app/backup/secrets/backup/target_ssh_user
@@ -0,0 +1 @@
+USER SSH username to log in as on the target host
diff --git a/cluster/prod/app/email/deploy/email.hcl b/cluster/prod/app/email/deploy/email.hcl
index 0c6308a..b28f3bc 100644
--- a/cluster/prod/app/email/deploy/email.hcl
+++ b/cluster/prod/app/email/deploy/email.hcl
@@ -475,7 +475,8 @@ job "email" {
 
       resources {
         cpu = 200
-        memory = 1000
+        memory = 500
+        memory_max = 1000
       }
 
       service {