From 02c65de5fec7242d225d6e052117f108f0a5e035 Mon Sep 17 00:00:00 2001 From: Quentin Dufour Date: Thu, 1 Sep 2022 18:05:50 +0200 Subject: Restart backups --- .../prod/app/backup/build/backup-consul/Dockerfile | 28 +++ .../app/backup/build/backup-consul/do_backup.sh | 20 +++ .../prod/app/backup/build/backup-psql/.gitignore | 1 + .../prod/app/backup/build/backup-psql/README.md | 8 + .../app/backup/build/backup-psql/backup-psql.py | 106 +++++++++++ .../prod/app/backup/build/backup-psql/common.nix | 8 + .../prod/app/backup/build/backup-psql/default.nix | 37 ++++ .../prod/app/backup/build/backup-psql/docker.nix | 11 ++ cluster/prod/app/backup/deploy/backup-daily.hcl | 196 +++++++++++++++++++++ cluster/prod/app/backup/deploy/backup-weekly.hcl | 55 ++++++ .../secrets/backup/consul/backup_aws_access_key_id | 1 + .../backup/consul/backup_aws_secret_access_key | 1 + .../secrets/backup/consul/backup_restic_password | 1 + .../secrets/backup/consul/backup_restic_repository | 1 + cluster/prod/app/backup/secrets/backup/id_ed25519 | 1 + .../prod/app/backup/secrets/backup/id_ed25519.pub | 1 + .../backup/secrets/backup/psql/aws_access_key_id | 1 + .../secrets/backup/psql/aws_secret_access_key | 1 + .../backup/secrets/backup/psql/crypt_private_key | 1 + .../backup/secrets/backup/psql/crypt_public_key | 1 + .../prod/app/backup/secrets/backup/target_ssh_dir | 1 + .../backup/secrets/backup/target_ssh_fingerprint | 1 + .../prod/app/backup/secrets/backup/target_ssh_host | 1 + .../prod/app/backup/secrets/backup/target_ssh_port | 1 + .../prod/app/backup/secrets/backup/target_ssh_user | 1 + cluster/prod/app/email/deploy/email.hcl | 3 +- 26 files changed, 487 insertions(+), 1 deletion(-) create mode 100644 cluster/prod/app/backup/build/backup-consul/Dockerfile create mode 100755 cluster/prod/app/backup/build/backup-consul/do_backup.sh create mode 100644 cluster/prod/app/backup/build/backup-psql/.gitignore create mode 100644 cluster/prod/app/backup/build/backup-psql/README.md create mode 100755 cluster/prod/app/backup/build/backup-psql/backup-psql.py create mode 100644 cluster/prod/app/backup/build/backup-psql/common.nix create mode 100644 cluster/prod/app/backup/build/backup-psql/default.nix create mode 100644 cluster/prod/app/backup/build/backup-psql/docker.nix create mode 100644 cluster/prod/app/backup/deploy/backup-daily.hcl create mode 100644 cluster/prod/app/backup/deploy/backup-weekly.hcl create mode 100644 cluster/prod/app/backup/secrets/backup/consul/backup_aws_access_key_id create mode 100644 cluster/prod/app/backup/secrets/backup/consul/backup_aws_secret_access_key create mode 100644 cluster/prod/app/backup/secrets/backup/consul/backup_restic_password create mode 100644 cluster/prod/app/backup/secrets/backup/consul/backup_restic_repository create mode 100644 cluster/prod/app/backup/secrets/backup/id_ed25519 create mode 100644 cluster/prod/app/backup/secrets/backup/id_ed25519.pub create mode 100644 cluster/prod/app/backup/secrets/backup/psql/aws_access_key_id create mode 100644 cluster/prod/app/backup/secrets/backup/psql/aws_secret_access_key create mode 100644 cluster/prod/app/backup/secrets/backup/psql/crypt_private_key create mode 100644 cluster/prod/app/backup/secrets/backup/psql/crypt_public_key create mode 100644 cluster/prod/app/backup/secrets/backup/target_ssh_dir create mode 100644 cluster/prod/app/backup/secrets/backup/target_ssh_fingerprint create mode 100644 cluster/prod/app/backup/secrets/backup/target_ssh_host create mode 100644 cluster/prod/app/backup/secrets/backup/target_ssh_port create mode 100644 cluster/prod/app/backup/secrets/backup/target_ssh_user (limited to 'cluster/prod') diff --git a/cluster/prod/app/backup/build/backup-consul/Dockerfile b/cluster/prod/app/backup/build/backup-consul/Dockerfile new file mode 100644 index 0000000..0a5c38f --- /dev/null +++ b/cluster/prod/app/backup/build/backup-consul/Dockerfile @@ -0,0 +1,28 @@ +FROM golang:buster as builder + +WORKDIR /root +RUN git clone https://filippo.io/age && cd age/cmd/age && go build -o age . + +FROM amd64/debian:buster + +COPY --from=builder /root/age/cmd/age/age /usr/local/bin/age + +RUN apt-get update && \ + apt-get -qq -y full-upgrade && \ + apt-get install -y rsync wget openssh-client unzip && \ + apt-get clean && \ + rm -f /var/lib/apt/lists/*_* + +RUN mkdir -p /root/.ssh +WORKDIR /root + +RUN wget https://releases.hashicorp.com/consul/1.8.5/consul_1.8.5_linux_amd64.zip && \ + unzip consul_1.8.5_linux_amd64.zip && \ + chmod +x consul && \ + mv consul /usr/local/bin && \ + rm consul_1.8.5_linux_amd64.zip + +COPY do_backup.sh /root/do_backup.sh + +CMD "/root/do_backup.sh" + diff --git a/cluster/prod/app/backup/build/backup-consul/do_backup.sh b/cluster/prod/app/backup/build/backup-consul/do_backup.sh new file mode 100755 index 0000000..a34e7b7 --- /dev/null +++ b/cluster/prod/app/backup/build/backup-consul/do_backup.sh @@ -0,0 +1,20 @@ +#!/bin/sh + +set -x -e + +cd /root + +chmod 0600 .ssh/id_ed25519 + +cat > .ssh/config < $TARGET_SSH_DIR/consul/$(date --iso-8601=minute)_consul_kv_export.gz.age" + diff --git a/cluster/prod/app/backup/build/backup-psql/.gitignore b/cluster/prod/app/backup/build/backup-psql/.gitignore new file mode 100644 index 0000000..b2be92b --- /dev/null +++ b/cluster/prod/app/backup/build/backup-psql/.gitignore @@ -0,0 +1 @@ +result diff --git a/cluster/prod/app/backup/build/backup-psql/README.md b/cluster/prod/app/backup/build/backup-psql/README.md new file mode 100644 index 0000000..97929db --- /dev/null +++ b/cluster/prod/app/backup/build/backup-psql/README.md @@ -0,0 +1,8 @@ +## Build + +```bash +docker load < $(nix-build docker.nix) +docker push superboum/backup-psql:??? +``` + + diff --git a/cluster/prod/app/backup/build/backup-psql/backup-psql.py b/cluster/prod/app/backup/build/backup-psql/backup-psql.py new file mode 100755 index 0000000..291cf50 --- /dev/null +++ b/cluster/prod/app/backup/build/backup-psql/backup-psql.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +import shutil,sys,os,datetime,minio,subprocess + +working_directory = "." +if 'CACHE_DIR' in os.environ: working_directory = os.environ['CACHE_DIR'] +required_space_in_bytes = 20 * 1024 * 1024 * 1024 +bucket = os.environ['AWS_BUCKET'] +key = os.environ['AWS_ACCESS_KEY_ID'] +secret = os.environ['AWS_SECRET_ACCESS_KEY'] +endpoint = os.environ['AWS_ENDPOINT'] +pubkey = os.environ['CRYPT_PUBLIC_KEY'] +psql_host = os.environ['PSQL_HOST'] +psql_user = os.environ['PSQL_USER'] +s3_prefix = str(datetime.datetime.now()) +files = [ "backup_manifest", "base.tar.gz", "pg_wal.tar.gz" ] +clear_paths = [ os.path.join(working_directory, f) for f in files ] +crypt_paths = [ os.path.join(working_directory, f) + ".age" for f in files ] +s3_keys = [ s3_prefix + "/" + f for f in files ] + +def abort(msg): + for p in clear_paths + crypt_paths: + if os.path.exists(p): + print(f"Remove {p}") + os.remove(p) + + if msg: sys.exit(msg) + else: print("success") + +# Check we have enough space on disk +if shutil.disk_usage(working_directory).free < required_space_in_bytes: + abort(f"Not enough space on disk at path {working_directory} to perform a backup, aborting") + +# Check postgres password is set +if 'PGPASSWORD' not in os.environ: + abort(f"You must pass postgres' password through the environment variable PGPASSWORD") + +# Check our working directory is empty +if len(os.listdir(working_directory)) != 0: + abort(f"Working directory {working_directory} is not empty, aborting") + +# Check Minio +client = minio.Minio(endpoint, key, secret) +if not client.bucket_exists(bucket): + abort(f"Bucket {bucket} does not exist or its access is forbidden, aborting") + +# Perform the backup locally +try: + ret = subprocess.run(["pg_basebackup", + f"--host={psql_host}", + f"--username={psql_user}", + f"--pgdata={working_directory}", + f"--format=tar", + "--wal-method=stream", + "--gzip", + "--compress=6", + "--progress", + "--max-rate=5M", + ]) + if ret.returncode != 0: + abort(f"pg_basebackup exited, expected return code 0, got {ret.returncode}. aborting") +except Exception as e: + abort(f"pg_basebackup raised exception {e}. aborting") + +# Check that the expected files are here +for p in clear_paths: + print(f"Checking that {p} exists locally") + if not os.path.exists(p): + abort(f"File {p} expected but not found, aborting") + +# Cipher them +for c, e in zip(clear_paths, crypt_paths): + print(f"Ciphering {c} to {e}") + try: + ret = subprocess.run(["age", "-r", pubkey, "-o", e, c]) + if ret.returncode != 0: + abort(f"age exit code is {ret}, 0 expected. aborting") + except Exception as e: + abort(f"aged raised an exception. {e}. aborting") + +# Upload the backup to S3 +for p, k in zip(crypt_paths, s3_keys): + try: + print(f"Uploading {p} to {k}") + result = client.fput_object(bucket, k, p) + print( + "created {0} object; etag: {1}, version-id: {2}".format( + result.object_name, result.etag, result.version_id, + ), + ) + except Exception as e: + abort(f"Exception {e} occured while upload {p}. aborting") + +# Check that the files have been uploaded +for k in s3_keys: + try: + print(f"Checking that {k} exists remotely") + result = client.stat_object(bucket, k) + print( + "last-modified: {0}, size: {1}".format( + result.last_modified, result.size, + ), + ) + except Exception as e: + abort(f"{k} not found on S3. {e}. aborting") + +abort(None) diff --git a/cluster/prod/app/backup/build/backup-psql/common.nix b/cluster/prod/app/backup/build/backup-psql/common.nix new file mode 100644 index 0000000..639d9a1 --- /dev/null +++ b/cluster/prod/app/backup/build/backup-psql/common.nix @@ -0,0 +1,8 @@ +{ + pkgsSrc = fetchTarball { + # Latest commit on https://github.com/NixOS/nixpkgs/tree/nixos-21.11 + # As of 2022-04-15 + url ="https://github.com/NixOS/nixpkgs/archive/2f06b87f64bc06229e05045853e0876666e1b023.tar.gz"; + sha256 = "sha256:1d7zg96xw4qsqh7c89pgha9wkq3rbi9as3k3d88jlxy2z0ns0cy2"; + }; +} diff --git a/cluster/prod/app/backup/build/backup-psql/default.nix b/cluster/prod/app/backup/build/backup-psql/default.nix new file mode 100644 index 0000000..2cd8d93 --- /dev/null +++ b/cluster/prod/app/backup/build/backup-psql/default.nix @@ -0,0 +1,37 @@ +let + common = import ./common.nix; + pkgs = import common.pkgsSrc {}; + python-with-my-packages = pkgs.python3.withPackages (p: with p; [ + minio + ]); +in + pkgs.stdenv.mkDerivation { + name = "backup-psql"; + src = pkgs.lib.sourceFilesBySuffices ./. [ ".py" ]; + + buildInputs = [ + python-with-my-packages + pkgs.age + pkgs.postgresql_14 + ]; + + buildPhase = '' + cat > backup-psql < $NOMAD_ALLOC_DIR/consul.json" ] + volumes = [ + "secrets:/etc/consul", + ] + } + + env { + CONSUL_HTTP_ADDR = "https://consul.service.prod.consul:8501" + CONSUL_HTTP_SSL = "true" + CONSUL_CACERT = "/etc/consul/consul.crt" + CONSUL_CLIENT_CERT = "/etc/consul/consul-client.crt" + CONSUL_CLIENT_KEY = "/etc/consul/consul-client.key" + } + + resources { + cpu = 200 + memory = 200 + } + + + template { + data = "{{ key \"secrets/consul/consul.crt\" }}" + destination = "secrets/consul.crt" + } + + template { + data = "{{ key \"secrets/consul/consul-client.crt\" }}" + destination = "secrets/consul-client.crt" + } + + template { + data = "{{ key \"secrets/consul/consul-client.key\" }}" + destination = "secrets/consul-client.key" + } + + restart { + attempts = 2 + interval = "30m" + delay = "15s" + mode = "fail" + } + } + + task "restic-backup" { + driver = "docker" + + config { + image = "restic/restic:0.12.1" + entrypoint = [ "/bin/sh", "-c" ] + args = [ "restic backup $NOMAD_ALLOC_DIR/consul.json && restic forget --keep-within 1m1d --keep-within-weekly 3m --keep-within-monthly 1y && restic prune --max-unused 50% --max-repack-size 2G && restic check" ] + } + + + template { + data = <