From c92f1c5ed8215dd524914815b2c4d2894ba86926 Mon Sep 17 00:00:00 2001
From: Max
Date: Sat, 10 Aug 2024 02:39:52 +0200
Subject: [PATCH 01/10] cluster/services/locksmith: support skipping secret updates

---
Note: adds a per-secret `checkUpdate` option (shell command or package,
default "true"). The generated provider script only writes the secret's
mode/owner/group and the per-node encrypted payloads to Consul KV when
`checkUpdate` exits successfully; otherwise it prints a "Skipping update"
message for that secret path.

 cluster/services/locksmith/provider.nix | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/cluster/services/locksmith/provider.nix b/cluster/services/locksmith/provider.nix
index e6af83d..060afa6 100644
--- a/cluster/services/locksmith/provider.nix
+++ b/cluster/services/locksmith/provider.nix
@@ -28,6 +28,10 @@ in
           command = mkOption {
             type = types.coercedTo types.package (package: "${package}") types.str;
           };
+          checkUpdate = mkOption {
+            type = types.coercedTo types.package (package: "${package}") types.str;
+            default = "true";
+          };
           owner = mkOption {
             type = types.str;
             default = "root";
@@ -72,20 +76,24 @@ in
       activeNodes = lib.unique (lib.flatten (lib.mapAttrsToList (_: secret: secret.nodes) activeSecrets));
       secretNames = map (name: "${providerRoot}-${name}/") (lib.attrNames activeSecrets);
 
-      createSecret = { path, nodes, owner, mode, group, command }: ''
-        consul kv put ${lib.escapeShellArg path}/mode ${lib.escapeShellArg mode}
-        consul kv put ${lib.escapeShellArg path}/owner ${lib.escapeShellArg owner}
-        consul kv put ${lib.escapeShellArg path}/group ${lib.escapeShellArg group}
-        ${lib.concatStringsSep "\n" (map (node: ''
-          consul kv put ${lib.escapeShellArg path}/recipient/${node} "$( (${command}) | age --encrypt --armor -r ${lib.escapeShellArg depot.hours.${node}.ssh.id.publicKey})"
-        '') nodes)}
+      createSecret = { path, nodes, owner, mode, group, command, checkUpdate }: ''
+        if (${checkUpdate}); then
+          consul kv put ${lib.escapeShellArg path}/mode ${lib.escapeShellArg mode}
+          consul kv put ${lib.escapeShellArg path}/owner ${lib.escapeShellArg owner}
+          consul kv put ${lib.escapeShellArg path}/group ${lib.escapeShellArg group}
+          ${lib.concatStringsSep "\n" (map (node: ''
+            consul kv put ${lib.escapeShellArg path}/recipient/${node} "$( (${command}) | age --encrypt --armor -r ${lib.escapeShellArg depot.hours.${node}.ssh.id.publicKey})"
+          '') nodes)}
+        else
+          echo Skipping update for ${lib.escapeShellArg path}
+        fi
       '';
     in ''
       # create/update secrets
       ${lib.pipe activeSecrets [
         (lib.mapAttrsToList (secretName: secretConfig: createSecret {
           path = "${providerRoot}-${secretName}";
-          inherit (secretConfig) nodes mode owner group command;
+          inherit (secretConfig) nodes mode owner group command checkUpdate;
         }))
         (lib.concatStringsSep "\n")
       ]}
From 3b1e82b33f8aa5fd3adabccf140ae538c79a017c Mon Sep 17 00:00:00 2001
From: Max
Date: Sat, 10 Aug 2024 02:48:34 +0200
Subject: [PATCH 02/10] cluster/services/locksmith: only run secret generation command once

---
Note: the secret command now runs once per secret; its output is staged in a
temporary file and then encrypted separately for each recipient node. The
script sets `umask 77` before any secret is processed.

Review changes relative to the original patch: the staging file is created
with `mktemp -t <template>` rather than `mktemp -ut` (which only prints an
unused name without creating the file), and it is removed with `rm -f` once
all recipients have been written, so the plaintext secret does not stay
behind in the temporary directory. Hunk sizes and the diffstat are adjusted
for the added line; the post-image blob id on the index line predates this
change and needs to be regenerated.

 cluster/services/locksmith/provider.nix | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cluster/services/locksmith/provider.nix b/cluster/services/locksmith/provider.nix
index 060afa6..99a2270 100644
--- a/cluster/services/locksmith/provider.nix
+++ b/cluster/services/locksmith/provider.nix
@@ -81,8 +81,11 @@ in
           consul kv put ${lib.escapeShellArg path}/mode ${lib.escapeShellArg mode}
           consul kv put ${lib.escapeShellArg path}/owner ${lib.escapeShellArg owner}
           consul kv put ${lib.escapeShellArg path}/group ${lib.escapeShellArg group}
+          secret="$(mktemp -t locksmith-secret.XXXXXXXXXXXXXXXX)"
+          (${command}) > "$secret"
           ${lib.concatStringsSep "\n" (map (node: ''
-            consul kv put ${lib.escapeShellArg path}/recipient/${node} "$( (${command}) | age --encrypt --armor -r ${lib.escapeShellArg depot.hours.${node}.ssh.id.publicKey})"
+            consul kv put ${lib.escapeShellArg path}/recipient/${node} "$(age < "$secret" --encrypt --armor -r ${lib.escapeShellArg depot.hours.${node}.ssh.id.publicKey})"
           '') nodes)}
+          rm -f "$secret"
         else
           echo Skipping update for ${lib.escapeShellArg path}
@@ -90,6 +93,7 @@ in
       '';
     in ''
       # create/update secrets
+      umask 77
       ${lib.pipe activeSecrets [
         (lib.mapAttrsToList (secretName: secretConfig: createSecret {
           path = "${providerRoot}-${secretName}";
From
204d3f77ebb391f92b75915020bb51eba2fa8c8c Mon Sep 17 00:00:00 2001
From: Max
Date: Sat, 10 Aug 2024 02:51:53 +0200
Subject: [PATCH 03/10] cluster/services/patroni: implement incandescence provider for databases and users

---
Note: adds
  * an incandescence provider with `user` and `database` formulae that
    create and drop PostgreSQL roles/databases by running psql against the
    patroni-pg-access link (psql is retried every 3 seconds until it exits
    successfully);
  * a locksmith provider that publishes each generated user password in the
    configured format (envFile, pgpass or raw) and uses `checkUpdate` to
    skip users for which no freshly generated password file exists;
  * the `patroni.users` and `patroni.databases` cluster options.

Review change relative to the original patch: psqlSecret creates its two
scratch files with `mktemp -t <template>` rather than `mktemp -ut`. `-u`
only prints an unused name and leaves file creation to the later
redirection/cp; without it mktemp creates the files itself. The existing
EXIT trap still removes both files. Line counts are unchanged; the
post-image blob id of create-databases.nix on the index line predates this
change and needs to be regenerated.

 cluster/services/patroni/create-databases.nix | 91 +++++++++++++++++++
 cluster/services/patroni/default.nix          |  8 +-
 cluster/services/patroni/incandescence.nix    | 10 ++
 cluster/services/patroni/options.nix          | 39 ++++++++
 4 files changed, 147 insertions(+), 1 deletion(-)
 create mode 100644 cluster/services/patroni/create-databases.nix
 create mode 100644 cluster/services/patroni/incandescence.nix
 create mode 100644 cluster/services/patroni/options.nix

diff --git a/cluster/services/patroni/create-databases.nix b/cluster/services/patroni/create-databases.nix
new file mode 100644
index 0000000..677e6a0
--- /dev/null
+++ b/cluster/services/patroni/create-databases.nix
@@ -0,0 +1,91 @@
+{ cluster, config, lib, pkgs, ... }:
+
+let
+  inherit (cluster.config.services.patroni) secrets;
+
+  patroni = cluster.config.links.patroni-pg-access;
+
+  cfg = cluster.config.patroni;
+
+  writeQueryFile = pkgs.writeText "patroni-query.sql";
+
+  psqlRunFile = file: ''
+    export PGPASSWORD="$(< ${secrets.PATRONI_SUPERUSER_PASSWORD.path})"
+    while ! ${config.services.patroni.postgresqlPackage}/bin/psql 'host=${patroni.ipv4} port=${patroni.portStr} dbname=postgres user=postgres' --tuples-only --csv --file="${file}"; do
+      sleep 3
+    done
+  '';
+
+  psql = query: psqlRunFile (writeQueryFile query);
+
+  psqlSecret = getSecret: queryTemplate: let
+    queryTemplateFile = writeQueryFile queryTemplate;
+  in ''
+    umask 77
+    secretFile="$(mktemp -t patroniSecret.XXXXXXXXXXXXXXXX)"
+    queryFile="$(mktemp -t patroniQuery.XXXXXXXXXXXXXXXX)"
+    trap "rm -f $secretFile $queryFile" EXIT
+    ${getSecret} > "$secretFile"
+    cp --no-preserve=mode ${queryTemplateFile} "$queryFile"
+    ${pkgs.replace-secret}/bin/replace-secret '@SECRET@' "$secretFile" "$queryFile"
+    ${psqlRunFile "$queryFile"}
+  '';
+
+  genPassword = pkgs.writeShellScript "patroni-generate-user-password" ''
+    umask 77
+    base64 -w0 /dev/urandom | tr -d /+ | head -c256 | tee "/run/keys/locksmith-provider-patroni-$1"
+  '';
+in
+
+{
+  services.incandescence.providers.patroni = lib.mkIf config.services.haproxy.enable {
+    locksmith = true;
+    wantedBy = [ "patroni.service" "multi-user.target" ];
+    partOf = [ "patroni.service" ];
+    wants = [ "postgresql.service" ];
+    after = [ "postgresql.service" ];
+
+    formulae = {
+      user = {
+        destroyAfterDays = 0;
+        create = user: psqlSecret "${genPassword} ${user}" ''
+          CREATE USER ${user} PASSWORD '@SECRET@';
+        '';
+        destroy = psqlSecret "printenv OBJECT" ''
+          DROP USER @SECRET@;
+        '';
+      };
+      database = {
+        destroyAfterDays = 30;
+        deps = [ "user" ];
+        create = db: psql ''
+          CREATE DATABASE ${db} OWNER ${cfg.databases.${db}.owner};
+        '';
+        destroy = psqlSecret "printenv OBJECT" ''
+          DROP DATABASE @SECRET@;
+        '';
+      };
+    };
+  };
+
+  services.locksmith.providers.patroni = lib.mkIf config.services.haproxy.enable {
+    secrets = lib.mapAttrs (user: userConfig: {
+      command = {
+        envFile = ''
+          echo "PGPASSWORD=$(cat /run/keys/locksmith-provider-patroni-${user})"
+          rm -f /run/keys/locksmith-provider-patroni-${user}
+        '';
+        pgpass = ''
+          echo "*:*:*:${user}:$(cat /run/keys/locksmith-provider-patroni-${user})"
+          rm -f /run/keys/locksmith-provider-patroni-${user}
+        '';
+        raw = ''
+          cat /run/keys/locksmith-provider-patroni-${user}
+          rm -f /run/keys/locksmith-provider-patroni-${user}
+        '';
+      }.${userConfig.locksmith.format};
+      checkUpdate = "test -e /run/keys/locksmith-provider-patroni-${user}";
+      inherit (userConfig.locksmith) nodes;
+    }) cfg.users;
+  };
+}
diff --git a/cluster/services/patroni/default.nix b/cluster/services/patroni/default.nix
index 15353f3..2e9bcff 100644
--- a/cluster/services/patroni/default.nix
+++ b/cluster/services/patroni/default.nix
@@ -1,6 +1,11 @@
-{ config, lib, ... }:
+{ config, ... }:
 
 {
+  imports = [
+    ./options.nix
+    ./incandescence.nix
+  ];
+
   links = {
     patroni-pg-internal.ipv4 = "0.0.0.0";
     patroni-api.ipv4 = "0.0.0.0";
@@ -15,6 +20,7 @@
       worker = [
         ./worker.nix
         ./metrics.nix
+        ./create-databases.nix
       ];
       haproxy = ./haproxy.nix;
     };
diff --git a/cluster/services/patroni/incandescence.nix b/cluster/services/patroni/incandescence.nix
new file mode 100644
index 0000000..f24dbca
--- /dev/null
+++ b/cluster/services/patroni/incandescence.nix
@@ -0,0 +1,10 @@
+{ config, lib, ... }:
+
+{
+  incandescence.providers.patroni = {
+    objects = {
+      user = lib.attrNames config.patroni.users;
+      database = lib.attrNames config.patroni.databases;
+    };
+  };
+}
diff --git a/cluster/services/patroni/options.nix b/cluster/services/patroni/options.nix
new file mode 100644
index 0000000..e4f7435
--- /dev/null
+++ b/cluster/services/patroni/options.nix
@@ -0,0 +1,39 @@
+{ lib, ... }:
+
+let
+  inherit (lib) mkOption;
+  inherit (lib.types) attrsOf enum listOf submodule str;
+in
+
+{
+  options.patroni = {
+    databases = mkOption {
+      type = attrsOf (submodule ({ name, ... }: {
+        options = {
+          owner = mkOption {
+            type = str;
+            default = name;
+          };
+        };
+      }));
+      default = {};
+    };
+    users = mkOption {
+      type = attrsOf (submodule ({ ... }: {
+        options = {
+          locksmith = {
+            nodes = mkOption {
+              type = listOf str;
+              default = [];
+            };
+            format = mkOption {
+              type = enum [ "pgpass" "envFile" "raw" ];
+              default = "pgpass";
+            };
+          };
+        };
+      }));
+      default = {};
+    };
+  };
+}
From fe89d1d3c3a791f122c72fa1c10ec81249286b8f Mon Sep 17 00:00:00 2001
From: Max
Date: Sun, 11 Aug 2024 00:51:31 +0200
Subject: [PATCH 04/10] cluster/services/chant: add simulacrum deps

---
 cluster/services/chant/default.nix | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cluster/services/chant/default.nix b/cluster/services/chant/default.nix
index 4d0f203..dfb1073 100644
--- a/cluster/services/chant/default.nix
+++ b/cluster/services/chant/default.nix
@@ -6,5 +6,6 @@
     nixos.listener = [
       ./listener.nix
     ];
+    simulacrum.deps = [ "consul" ];
   };
 }
From e87a1b23e9e187fdc43ba6c0fe5ba37c47e1baa1 Mon Sep 17 00:00:00 2001
From: Max
Date: Sun, 11 Aug 2024 00:54:14 +0200
Subject: [PATCH 05/10] cluster/services/locksmith: add simulacrum deps

---
 cluster/services/locksmith/default.nix | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cluster/services/locksmith/default.nix b/cluster/services/locksmith/default.nix
index 0b782b2..978413c 100644
--- a/cluster/services/locksmith/default.nix
+++ b/cluster/services/locksmith/default.nix
@@ -14,5 +14,6 @@
         ./provider.nix
       ];
     };
+    simulacrum.deps = [ "chant" "consul" ];
   };
 }
From c57976a29914aa174af2997a8c30c1b640541b4b Mon Sep 17 00:00:00 2001
From: Max
Date: Sun, 11 Aug 2024 00:54:17 +0200
Subject: [PATCH 06/10] cluster/services/patroni: add simulacrum deps

---
 cluster/services/patroni/default.nix | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cluster/services/patroni/default.nix b/cluster/services/patroni/default.nix
index 2e9bcff..b7ebe6f 100644
--- a/cluster/services/patroni/default.nix
+++ b/cluster/services/patroni/default.nix
@@ -36,5 +36,6 @@
       PATRONI_REWIND_PASSWORD = default;
       metricsCredentials.nodes = nodes.worker;
     };
+    simulacrum.deps = [ "consul" "incandescence" "locksmith" ];
   };
 }
From
ca4564f25dc8fb30860204c04f695838aa733aef Mon Sep 17 00:00:00 2001
From: Max
Date: Wed, 14 Aug 2024 02:59:33 +0200
Subject: [PATCH 07/10] cluster/services/patroni: test in simulacrum

---
Note: imports ./simulacrum/test-data.nix and turns the patroni service's
`simulacrum` attribute into a set with `enable = true`, the existing deps
and `settings = ./simulacrum/test.nix`.

test-data.nix declares, only when `config.simulacrum` is set, a `testdb`
database owned by `testuser` and a `testuser` whose pgpass-format secret is
distributed to the patroni haproxy nodes.

test.nix adds jq and the depot postgresql package to every machine, adds a
`trust` pg_hba rule for the postgres user/database, and runs a test script
(adapted from the nixpkgs patroni test linked in the file) that:
  * waits until every booted worker reports one running leader and the
    expected number of streaming replicas, and every booted client can run
    `select 1`;
  * creates a `dummy` table and checks it from every booted client;
  * crashes and restarts all workers, then crashes each worker in turn
    (leader last) while running insert/select/delete queries;
  * finally checks that `testuser` can create a table in `testdb` using
    PGPASSFILE=/run/locksmith/patroni-testuser and that the same user's
    `select * from dummy` against the `postgres` database fails.

 cluster/services/patroni/default.nix | 7 +-
 .../services/patroni/simulacrum/test-data.nix | 14 +++
 cluster/services/patroni/simulacrum/test.nix | 91 +++++++++++++++++++
 3 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 cluster/services/patroni/simulacrum/test-data.nix
 create mode 100644 cluster/services/patroni/simulacrum/test.nix

diff --git a/cluster/services/patroni/default.nix b/cluster/services/patroni/default.nix
index b7ebe6f..0d9c977 100644
--- a/cluster/services/patroni/default.nix
+++ b/cluster/services/patroni/default.nix
@@ -4,6 +4,7 @@
   imports = [
     ./options.nix
     ./incandescence.nix
+    ./simulacrum/test-data.nix
   ];
 
   links = {
@@ -36,6 +37,10 @@
       PATRONI_REWIND_PASSWORD = default;
       metricsCredentials.nodes = nodes.worker;
     };
-    simulacrum.deps = [ "consul" "incandescence" "locksmith" ];
+    simulacrum = {
+      enable = true;
+      deps = [ "consul" "incandescence" "locksmith" ];
+      settings = ./simulacrum/test.nix;
+    };
   };
 }
diff --git a/cluster/services/patroni/simulacrum/test-data.nix b/cluster/services/patroni/simulacrum/test-data.nix
new file mode 100644
index 0000000..e56e862
--- /dev/null
+++ b/cluster/services/patroni/simulacrum/test-data.nix
@@ -0,0 +1,14 @@
+{ config, lib, ... }:
+{
+  patroni = lib.mkIf config.simulacrum {
+    databases = config.lib.forService "patroni" {
+      testdb.owner = "testuser";
+    };
+    users = config.lib.forService "patroni" {
+      testuser.locksmith = {
+        nodes = config.services.patroni.nodes.haproxy;
+        format = "pgpass";
+      };
+    };
+  };
+}
diff --git a/cluster/services/patroni/simulacrum/test.nix b/cluster/services/patroni/simulacrum/test.nix
new file mode 100644
index 0000000..438bfed
--- /dev/null
+++ b/cluster/services/patroni/simulacrum/test.nix
@@ -0,0 +1,91 @@
+{ cluster, ... }:
+
+let
+  clusterName = "poseidon";
+  link = cluster.config.links.patroni-pg-access;
+in
+{
+  defaults = { depot, pkgs, ... }: {
+    environment.systemPackages = [
+      pkgs.jq
+      depot.packages.postgresql
+    ];
+    services.patroni.settings.postgresql.pg_hba = [
+      "host postgres postgres 0.0.0.0/0 trust"
+    ];
+  };
+
+  # taken from https://github.com/phfroidmont/nixpkgs/blob/patroni-module/nixos/tests/patroni.nix
+  testScript = ''
+    import json
+    nodeNames = json.loads('${builtins.toJSON cluster.config.services.patroni.nodes.worker}')
+    clientNames = json.loads('${builtins.toJSON cluster.config.services.patroni.nodes.haproxy}')
+    nodes = [ n for n in machines if n.name in nodeNames ]
+    clients = [ n for n in machines if n.name in clientNames ]
+
+    def booted(nodes):
+      return filter(lambda node: node.booted, nodes)
+
+    def wait_for_all_nodes_ready(expected_replicas=2):
+        booted_nodes = booted(nodes)
+        for node in booted_nodes:
+            node.wait_for_unit("patroni.service")
+            print(node.succeed("patronictl list ${clusterName}"))
+            node.wait_until_succeeds(f"[ $(patronictl list -f json ${clusterName} | jq 'length') == {expected_replicas + 1} ]")
+            node.wait_until_succeeds("[ $(patronictl list -f json ${clusterName} | jq 'map(select(.Role | test(\"^Leader$\"))) | map(select(.State | test(\"^running$\"))) | length') == 1 ]")
+            node.wait_until_succeeds(f"[ $(patronictl list -f json ${clusterName} | jq 'map(select(.Role | test(\"^Replica$\"))) | map(select(.State | test(\"^streaming$\"))) | length') == {expected_replicas} ]")
+            print(node.succeed("patronictl list ${clusterName}"))
+        for client in booted(clients):
+            client.wait_until_succeeds("psql -h ${link.ipv4} -p ${link.portStr} -U postgres --command='select 1;'")
+
+    def run_dummy_queries():
+        for client in booted(clients):
+            client.succeed("psql -h ${link.ipv4} -p ${link.portStr} -U postgres --pset='pager=off' --tuples-only --command='insert into dummy(val) values (101);'")
+            client.succeed("test $(psql -h ${link.ipv4} -p ${link.portStr} -U postgres --pset='pager=off' --tuples-only --command='select val from dummy where val = 101;') -eq 101")
+            client.succeed("psql -h ${link.ipv4} -p ${link.portStr} -U postgres --pset='pager=off' --tuples-only --command='delete from dummy where val = 101;'")
+
+    start_all()
+
+    with subtest("should bootstrap a new patroni cluster"):
+        wait_for_all_nodes_ready()
+
+    with subtest("should be able to insert and select"):
+        booted_clients = list(booted(clients))
+        booted_clients[0].succeed("psql -h ${link.ipv4} -p ${link.portStr} -U postgres --command='create table dummy as select * from generate_series(1, 100) as val;'")
+        for client in booted_clients:
+            client.succeed("test $(psql -h ${link.ipv4} -p ${link.portStr} -U postgres --pset='pager=off' --tuples-only --command='select count(distinct val) from dummy;') -eq 100")
+
+    with subtest("should restart after all nodes are crashed"):
+        for node in nodes:
+            node.crash()
+        for node in nodes:
+            node.start()
+        wait_for_all_nodes_ready()
+
+    with subtest("should be able to run queries while any one node is crashed"):
+        masterNodeName = nodes[0].succeed("patronictl list -f json ${clusterName} | jq '.[] | select(.Role | test(\"^Leader$\")) | .Member' -r").strip()
+        masterNodeIndex = next((i for i, v in enumerate(nodes) if v.name == masterNodeName))
+
+        # Move master node at the end of the list to avoid multiple failovers (makes the test faster and more consistent)
+        nodes.append(nodes.pop(masterNodeIndex))
+
+        for node in nodes:
+            node.crash()
+            wait_for_all_nodes_ready(1)
+
+            # Execute some queries while a node is down.
+            run_dummy_queries()
+
+            # Restart crashed node.
+            node.start()
+            wait_for_all_nodes_ready()
+
+            # Execute some queries with the node back up.
+            run_dummy_queries()
+
+    with subtest("should create databases and users via incandescence"):
+        for client in clients:
+            client.succeed(f"PGPASSFILE=/run/locksmith/patroni-testuser psql -h ${link.ipv4} -p ${link.portStr} -U testuser -d testdb --command='create table test_table_{client.name} as select * from generate_series(1, 10) as val;'")
+            client.fail("PGPASSFILE=/run/locksmith/patroni-testuser psql -h ${link.ipv4} -p ${link.portStr} -U testuser -d postgres --command='select * from dummy;'")
+  '';
+}
From 2a45b0b8e90b8811449fa2fa7025138c4e104146 Mon Sep 17 00:00:00 2001
From: Max
Date: Wed, 14 Aug 2024 02:55:28 +0200
Subject: [PATCH 08/10] checks/patroni: drop

---
Note: removes the `patroni` entry from packages/checks/default.nix and
deletes packages/checks/patroni.nix. Presumably the standalone check is
superseded by the simulacrum test added in patch 07 - confirm before
relying on that.

 packages/checks/default.nix | 5 -
 packages/checks/patroni.nix | 211 ------------------------------------
 2 files changed, 216 deletions(-)
 delete mode 100644 packages/checks/patroni.nix

diff --git a/packages/checks/default.nix b/packages/checks/default.nix
index c09b60d..ed21da1 100644
--- a/packages/checks/default.nix
+++ b/packages/checks/default.nix
@@ -35,11 +35,6 @@ in
       inherit (self'.packages) keycloak;
     };
 
-    patroni = pkgs.callPackage ./patroni.nix {
-      inherit (self) nixosModules;
-      inherit (self'.packages) postgresql;
-    };
-
     s3ql-upgrade = pkgs.callPackage ./s3ql-upgrade.nix {
       inherit (self'.packages) s3ql;
       inherit (self) nixosModules;
diff --git a/packages/checks/patroni.nix b/packages/checks/patroni.nix
deleted file mode 100644
index dd24f33..0000000
--- a/packages/checks/patroni.nix
+++ /dev/null
@@ -1,211 +0,0 @@
-{ nixosTest, nixosModules, postgresql }:
-
-# taken from https://github.com/phfroidmont/nixpkgs/blob/patroni-module/nixos/tests/patroni.nix
-nixosTest (
-  let
-    nodesIps = [
-      "192.168.1.1"
-      "192.168.1.2"
-      "192.168.1.3"
-    ];
-
-    createNode = index: { pkgs, ...
}: - let - ip = builtins.elemAt nodesIps index; # since we already use IPs to identify servers - in - { - imports = [ - nixosModules.patroni - nixosModules.systemd-extras - ]; - - networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [ - { address = ip; prefixLength = 16; } - ]; - - networking.firewall.allowedTCPPorts = [ 5432 8008 5010 ]; - - environment.systemPackages = [ pkgs.jq ]; - - services.patroni = { - - enable = true; - - postgresqlPackage = postgresql.withPackages (p: [ p.pg_safeupdate ]); - - scope = "cluster1"; - name = "node${toString(index + 1)}"; - nodeIp = ip; - otherNodesIps = builtins.filter (h: h != ip) nodesIps; - softwareWatchdog = true; - - settings = { - bootstrap = { - dcs = { - ttl = 30; - loop_wait = 10; - retry_timeout = 10; - maximum_lag_on_failover = 1048576; - }; - initdb = [ - { encoding = "UTF8"; } - "data-checksums" - ]; - }; - - postgresql = { - use_pg_rewind = true; - use_slots = true; - authentication = { - replication = { - username = "replicator"; - }; - superuser = { - username = "postgres"; - }; - rewind = { - username = "rewind"; - }; - }; - parameters = { - listen_addresses = "${ip}"; - wal_level = "replica"; - hot_standby_feedback = "on"; - unix_socket_directories = "/tmp"; - }; - pg_hba = [ - "host replication replicator 192.168.1.0/24 md5" - # Unsafe, do not use for anything other than tests - "host all all 0.0.0.0/0 trust" - ]; - }; - - etcd3 = { - host = "192.168.1.4:2379"; - }; - }; - - environmentFiles = { - PATRONI_REPLICATION_PASSWORD = pkgs.writeText "replication-password" "postgres"; - PATRONI_SUPERUSER_PASSWORD = pkgs.writeText "superuser-password" "postgres"; - PATRONI_REWIND_PASSWORD = pkgs.writeText "rewind-password" "postgres"; - }; - }; - - # We always want to restart so the tests never hang - systemd.services.patroni.serviceConfig.StartLimitIntervalSec = 0; - }; - in - { - name = "patroni"; - - nodes = { - node1 = createNode 0; - node2 = createNode 1; - node3 = createNode 2; - - etcd = { pkgs, 
... }: { - - networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [ - { address = "192.168.1.4"; prefixLength = 16; } - ]; - - services.etcd = { - enable = true; - listenClientUrls = [ "http://192.168.1.4:2379" ]; - }; - - networking.firewall.allowedTCPPorts = [ 2379 ]; - }; - - client = { pkgs, ... }: { - environment.systemPackages = [ postgresql ]; - - networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [ - { address = "192.168.2.1"; prefixLength = 16; } - ]; - - services.haproxy = { - enable = true; - config = '' - global - maxconn 100 - - defaults - log global - mode tcp - retries 2 - timeout client 30m - timeout connect 4s - timeout server 30m - timeout check 5s - - listen cluster1 - bind 127.0.0.1:5432 - option httpchk - http-check expect status 200 - default-server inter 3s fall 3 rise 2 on-marked-down shutdown-sessions - ${builtins.concatStringsSep "\n" (map (ip: "server postgresql_${ip}_5432 ${ip}:5432 maxconn 100 check port 8008") nodesIps)} - ''; - }; - }; - }; - - - - testScript = '' - nodes = [node1, node2, node3] - - def wait_for_all_nodes_ready(expected_replicas=2): - booted_nodes = filter(lambda node: node.booted, nodes) - for node in booted_nodes: - print(node.succeed("patronictl list cluster1")) - node.wait_until_succeeds(f"[ $(patronictl list -f json cluster1 | jq 'length') == {expected_replicas + 1} ]") - node.wait_until_succeeds("[ $(patronictl list -f json cluster1 | jq 'map(select(.Role | test(\"^Leader$\"))) | map(select(.State | test(\"^running$\"))) | length') == 1 ]") - node.wait_until_succeeds(f"[ $(patronictl list -f json cluster1 | jq 'map(select(.Role | test(\"^Replica$\"))) | map(select(.State | test(\"^streaming$\"))) | length') == {expected_replicas} ]") - print(node.succeed("patronictl list cluster1")) - client.wait_until_succeeds("psql -h 127.0.0.1 -U postgres --command='select 1;'") - - def run_dummy_queries(): - client.succeed("psql -h 127.0.0.1 -U postgres --pset='pager=off' --tuples-only 
--command='insert into dummy(val) values (101);'") - client.succeed("test $(psql -h 127.0.0.1 -U postgres --pset='pager=off' --tuples-only --command='select val from dummy where val = 101;') -eq 101") - client.succeed("psql -h 127.0.0.1 -U postgres --pset='pager=off' --tuples-only --command='delete from dummy where val = 101;'") - - start_all() - - with subtest("should bootstrap a new patroni cluster"): - wait_for_all_nodes_ready() - - with subtest("should be able to insert and select"): - client.succeed("psql -h 127.0.0.1 -U postgres --command='create table dummy as select * from generate_series(1, 100) as val;'") - client.succeed("test $(psql -h 127.0.0.1 -U postgres --pset='pager=off' --tuples-only --command='select count(distinct val) from dummy;') -eq 100") - - with subtest("should restart after all nodes are crashed"): - for node in nodes: - node.crash() - for node in nodes: - node.start() - wait_for_all_nodes_ready() - - with subtest("should be able to run queries while any one node is crashed"): - masterNodeName = node1.succeed("patronictl list -f json cluster1 | jq '.[] | select(.Role | test(\"^Leader$\")) | .Member' -r").strip() - masterNodeIndex = int(masterNodeName[len(masterNodeName)-1]) - 1 - - # Move master node at the end of the list to avoid multiple failovers (makes the test faster and more consistent) - nodes.append(nodes.pop(masterNodeIndex)) - - for node in nodes: - node.crash() - wait_for_all_nodes_ready(1) - - # Execute some queries while a node is down. - run_dummy_queries() - - # Restart crashed node. - node.start() - wait_for_all_nodes_ready() - - # Execute some queries with the node back up. 
-            run_dummy_queries()
-    '';
-  })
From a61f97cccf62456c63a9398d6634100106f6fc66 Mon Sep 17 00:00:00 2001
From: Max
Date: Wed, 14 Aug 2024 12:46:30 +0200
Subject: [PATCH 09/10] cluster/services/patroni: wait for consul

---
Note: makes patroni.service require consul-ready.service and start after
it.

 cluster/services/patroni/worker.nix | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cluster/services/patroni/worker.nix b/cluster/services/patroni/worker.nix
index b6d33ee..1e7def7 100644
--- a/cluster/services/patroni/worker.nix
+++ b/cluster/services/patroni/worker.nix
@@ -25,6 +25,10 @@ in
     "d '${baseDir}' 0700 patroni patroni - -"
     "d '${walDir}' 0700 patroni patroni - -"
   ];
+  systemd.services.patroni = {
+    requires = [ "consul-ready.service" ];
+    after = [ "consul-ready.service" ];
+  };
   services.patroni = {
     enable = true;
     name = hostName;
From ff0744f6006693b82d9796c232404da354167c66 Mon Sep 17 00:00:00 2001
From: Max
Date: Wed, 14 Aug 2024 14:23:17 +0200
Subject: [PATCH 10/10] cluster/services/patroni: enable synchronous mode

---
Note: sets `synchronous_mode = true` and adds `synchronous_commit = "on"`
to the parameter set that already holds wal_level and
hot_standby_feedback.

NOTE(review): judging only from the hunk context, `synchronous_mode` is
added next to use_pg_rewind / use_slots / authentication, i.e. presumably
inside the `postgresql` section of the Patroni settings. Patroni documents
`synchronous_mode` as a global dynamic-configuration key rather than a
`postgresql` key - confirm against the full worker.nix that the setting is
actually picked up from this location.

 cluster/services/patroni/worker.nix | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cluster/services/patroni/worker.nix b/cluster/services/patroni/worker.nix
index 1e7def7..de5333f 100644
--- a/cluster/services/patroni/worker.nix
+++ b/cluster/services/patroni/worker.nix
@@ -61,6 +61,7 @@ in
         };
         use_pg_rewind = true;
         use_slots = true;
+        synchronous_mode = true;
         authentication = {
           replication.username = "patronirep";
           rewind.username = "patronirew";
@@ -71,6 +72,7 @@ in
           wal_level = "replica";
           hot_standby_feedback = "on";
           unix_socket_directories = "/tmp";
+          synchronous_commit = "on";
         };
         pg_hba = [
           "host replication patronirep ${net} scram-sha-256"