From bb54c41a9a46d9bb1b5d5caf2dbd3ab3c780c1fc Mon Sep 17 00:00:00 2001
From: Max
Date: Wed, 14 Aug 2024 02:59:33 +0200
Subject: [PATCH] cluster/services/patroni: test in simulacrum

---
Review notes (not part of the commit message):
- test.nix: dropped the unused `createNode` let-binding; nothing in the file
  referenced it. Its "Unsafe" warning now sits on the equivalent trust rule
  in `defaults`.
- test.nix: assert that the reported leader maps to a worker machine before
  calling nodes.pop(), which would otherwise raise TypeError on None.
- Line layout and indentation were reconstructed by hand, and the blob id on
  the test.nix `index` line was not regenerated; refresh both against the
  tree before applying.

 cluster/services/patroni/default.nix |  6 +-
 cluster/services/patroni/test.nix    | 92 ++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100644 cluster/services/patroni/test.nix

diff --git a/cluster/services/patroni/default.nix b/cluster/services/patroni/default.nix
index b7ebe6f..c3e099d 100644
--- a/cluster/services/patroni/default.nix
+++ b/cluster/services/patroni/default.nix
@@ -36,6 +36,10 @@
       PATRONI_REWIND_PASSWORD = default;
       metricsCredentials.nodes = nodes.worker;
     };
-    simulacrum.deps = [ "consul" "incandescence" "locksmith" ];
+    simulacrum = {
+      enable = true;
+      deps = [ "consul" "incandescence" "locksmith" ];
+      settings = ./test.nix;
+    };
   };
 }
diff --git a/cluster/services/patroni/test.nix b/cluster/services/patroni/test.nix
new file mode 100644
index 0000000..1c0f38f
--- /dev/null
+++ b/cluster/services/patroni/test.nix
@@ -0,0 +1,92 @@
+# Simulacrum test for the patroni service: checks cluster bootstrap, client
+# access through the patroni-pg-access link, and recovery from node crashes.
+{ cluster, ... }:
+
+let
+  # Cluster name passed to `patronictl list` by the test script.
+  clusterName = "poseidon";
+  # Link whose ipv4/portStr the client machines hand to psql.
+  link = cluster.config.links.patroni-pg-access;
+in
+{
+  defaults = { depot, pkgs, ... }: {
+    # jq is used by the test script; depot.packages.postgresql presumably
+    # provides the psql client it calls.
+    environment.systemPackages = [
+      pkgs.jq
+      depot.packages.postgresql
+    ];
+    # Unsafe, do not use for anything other than tests
+    services.patroni.settings.postgresql.pg_hba = [
+      "host all all 0.0.0.0/0 trust"
+    ];
+  };
+
+  # taken from https://github.com/phfroidmont/nixpkgs/blob/patroni-module/nixos/tests/patroni.nix
+  testScript = ''
+    import json
+    nodeNames = json.loads('${builtins.toJSON cluster.config.services.patroni.nodes.worker}')
+    clientNames = json.loads('${builtins.toJSON cluster.config.services.patroni.nodes.haproxy}')
+    nodes = [ n for n in machines if n.name in nodeNames ]
+    clients = [ n for n in machines if n.name in clientNames ]
+
+    def wait_for_all_nodes_ready(expected_replicas=2):
+        # On every booted worker, wait for one running leader plus the expected streaming replicas; then wait for every client to connect.
+        booted_nodes = filter(lambda node: node.booted, nodes)
+        for node in booted_nodes:
+            print(node.succeed("patronictl list ${clusterName}"))
+            node.wait_until_succeeds(f"[ $(patronictl list -f json ${clusterName} | jq 'length') == {expected_replicas + 1} ]")
+            node.wait_until_succeeds("[ $(patronictl list -f json ${clusterName} | jq 'map(select(.Role | test(\"^Leader$\"))) | map(select(.State | test(\"^running$\"))) | length') == 1 ]")
+            node.wait_until_succeeds(f"[ $(patronictl list -f json ${clusterName} | jq 'map(select(.Role | test(\"^Replica$\"))) | map(select(.State | test(\"^streaming$\"))) | length') == {expected_replicas} ]")
+            print(node.succeed("patronictl list ${clusterName}"))
+        for client in clients:
+            client.wait_until_succeeds("psql -h ${link.ipv4} -p ${link.portStr} -U postgres --command='select 1;'")
+
+    def run_dummy_queries():
+        # Insert, read back and delete a marker row from every client.
+        for client in clients:
+            client.succeed("psql -h ${link.ipv4} -p ${link.portStr} -U postgres --pset='pager=off' --tuples-only --command='insert into dummy(val) values (101);'")
+            client.succeed("test $(psql -h ${link.ipv4} -p ${link.portStr} -U postgres --pset='pager=off' --tuples-only --command='select val from dummy where val = 101;') -eq 101")
+            client.succeed("psql -h ${link.ipv4} -p ${link.portStr} -U postgres --pset='pager=off' --tuples-only --command='delete from dummy where val = 101;'")
+
+    start_all()
+
+    with subtest("should bootstrap a new patroni cluster"):
+        wait_for_all_nodes_ready()
+
+    with subtest("should be able to insert and select"):
+        clients[0].succeed("psql -h ${link.ipv4} -p ${link.portStr} -U postgres --command='create table dummy as select * from generate_series(1, 100) as val;'")
+        for client in clients:
+            client.succeed("test $(psql -h ${link.ipv4} -p ${link.portStr} -U postgres --pset='pager=off' --tuples-only --command='select count(distinct val) from dummy;') -eq 100")
+
+    with subtest("should restart after all nodes are crashed"):
+        for node in nodes:
+            node.crash()
+        for node in nodes:
+            node.start()
+        wait_for_all_nodes_ready()
+
+    with subtest("should be able to run queries while any one node is crashed"):
+        masterNodeName = nodes[0].succeed("patronictl list -f json ${clusterName} | jq '.[] | select(.Role | test(\"^Leader$\")) | .Member' -r").strip()
+        masterNodeIndex = next((i for i, v in enumerate(nodes) if v.name == masterNodeName), None)
+        # nodes.pop(None) would raise an opaque TypeError, so fail with a readable message.
+        assert masterNodeIndex is not None, f"patroni leader {masterNodeName!r} is not one of the worker machines"
+
+        # Move master node to the end of the list to avoid multiple failovers (makes the test faster and more consistent)
+        nodes.append(nodes.pop(masterNodeIndex))
+
+        for node in nodes:
+            node.crash()
+            wait_for_all_nodes_ready(1)
+
+            # Execute some queries while a node is down.
+            run_dummy_queries()
+
+            # Restart crashed node.
+            node.start()
+            wait_for_all_nodes_ready()
+
+            # Execute some queries with the node back up.
+            run_dummy_queries()
+  '';
+}