Compare commits

..

37 commits

Author SHA1 Message Date
Max
3602c50ae0 cluster/services/storage: register existing keys and buckets in incandescence 2024-08-14 18:26:46 +02:00
Max
ffb4c5015c cluster/services/consul: implement runConsul incantation 2024-08-14 18:26:46 +02:00
Max
9d75c90e36 cluster/services/forge: define db 2024-08-14 18:26:46 +02:00
Max
ba1fdb69c1 cluster/services/storage: define snakeoil passphrase for heresy, ensure encryption 2024-08-14 18:26:43 +02:00
Max
b6a9759dc7 cluster/services/ways: add simulacrum deps 2024-08-14 18:26:43 +02:00
Max
7b13d9d4b6 cluster/services/storage: use recursive simulacrum deps 2024-08-14 18:26:43 +02:00
Max
bff3b9546e cluster/services/acme-client: implement augment for external ACME services 2024-08-14 18:26:42 +02:00
Max
1e0e4fce65 cluster/services/forge: use forService 2024-08-14 18:26:42 +02:00
Max
68355bb656 cluster/services/dns: use patroni incandescence 2024-08-14 18:26:41 +02:00
Max
39294744df modules/external-storage: implement detectFs for s3c4 2024-08-14 18:26:39 +02:00
Max
db3a2375b4 cluster/services/storage: use locksmith secrets for external storage 2024-08-14 18:26:39 +02:00
Max
87047957aa cluster/services/storage: adjust test 2024-08-14 18:26:39 +02:00
Max
f1200e9788 cluster/services/storage: use incandescence 2024-08-14 18:26:39 +02:00
Max
9a6c3bbc3f modules/external-storage: support locksmith secrets 2024-08-14 18:26:32 +02:00
Max
1748cbf2de cluster/services/storage: implement s3ql key format 2024-08-14 18:26:32 +02:00
Max
c533bf261d cluster/services/hercules-ci-multi-agent: use forService 2024-08-14 18:26:32 +02:00
Max
37b1f69e10 cluster/services/monitoring: use forService 2024-08-14 18:26:32 +02:00
Max
f27cd14e50 checks/garage: drop 2024-08-14 18:26:32 +02:00
Max
cc67692a95 cluster/services/forge: use forService 2024-08-14 18:26:32 +02:00
Max
68d5959776 cluster/services/attic: use forService 2024-08-14 18:26:32 +02:00
Max
9be7531243 cluster/services/storage: test in simulacrum 2024-08-14 18:26:32 +02:00
Max
58b3be4cfe cluster/catalog: support snakeoil secrets 2024-08-14 18:26:32 +02:00
8d7d178d9d Merge pull request 'The Simulacrum: Stage 3' (#110) from pr-simulacrum-stage-3 into master
Reviewed-on: https://forge.privatevoid.net/privatevoid.net/depot/pulls/110
2024-08-14 18:59:06 +03:00
Max
ff0744f600 cluster/services/patroni: enable synchronous mode 2024-08-14 17:35:49 +02:00
Max
a61f97cccf cluster/services/patroni: wait for consul 2024-08-14 17:35:49 +02:00
Max
2a45b0b8e9 checks/patroni: drop 2024-08-14 17:35:49 +02:00
Max
ca4564f25d cluster/services/patroni: test in simulacrum 2024-08-14 17:35:49 +02:00
Max
c57976a299 cluster/services/patroni: add simulacrum deps 2024-08-14 16:12:12 +02:00
Max
e87a1b23e9 cluster/services/locksmith: add simulacrum deps 2024-08-14 16:12:12 +02:00
Max
fe89d1d3c3 cluster/services/chant: add simulacrum deps 2024-08-14 16:12:10 +02:00
Max
204d3f77eb cluster/services/patroni: implement incandescence provider for databases and users 2024-08-14 16:12:10 +02:00
Max
3b1e82b33f cluster/services/locksmith: only run secret generation command once 2024-08-14 16:12:10 +02:00
Max
c92f1c5ed8 cluster/services/locksmith: support skipping secret updates 2024-08-14 16:12:10 +02:00
54ba01d8cd Merge pull request 'Incandescence' (#111) from pr-incandescence into master
Reviewed-on: https://forge.privatevoid.net/privatevoid.net/depot/pulls/111
2024-08-14 17:11:27 +03:00
Max
d015c77ffa cluster/services/incandescence: test in simulacrum 2024-08-14 16:00:35 +02:00
Max
d1c0e9d7f9 cluster/services/incandescence: add base layout for ascensions 2024-08-14 14:54:35 +02:00
Max
4f6ea4eb8c cluster/services/incandescence: init 2024-08-14 14:54:35 +02:00
11 changed files with 186 additions and 218 deletions

View file

@ -3,6 +3,7 @@
{
imports = [
./options.nix
./simulacrum/test-data.nix
];
services.incandescence = {
@ -15,6 +16,10 @@
./provider-options.nix
];
};
simulacrum.deps = [ "consul" ];
simulacrum = {
enable = true;
deps = [ "consul" "locksmith" ];
settings = ./simulacrum/test.nix;
};
};
}

View file

@ -39,6 +39,9 @@ in
fi
''))
(concatStringsSep "\n")
(script: if script == "" then ''
echo "Nothing to create"
'' else script)
];
};
"ignite-${provider}-${formula}-change" = mkIf (formulaConfig.change != null) {
@ -58,6 +61,9 @@ in
) || echo "Change failed: ${object}"
''))
(concatStringsSep "\n")
(script: if script == "" then ''
echo "Nothing to change"
'' else script)
];
};
"ignite-${provider}-${formula}-destroy" = {

View file

@ -0,0 +1,8 @@
# Test fixtures for incandescence: declares two example objects for the
# `test` provider. Everything here is gated on `config.simulacrum`.
{ config, lib, ... }:

let
  inherit (config.lib) forService;

  # Objects the `example` formula of the `test` provider should manage.
  exampleObjects = [ "example1" "example2" ];
in
{
  incandescence = lib.mkIf config.simulacrum {
    providers = forService "incandescence" {
      test = {
        objects = {
          example = exampleObjects;
        };
      };
    };
  };
}

View file

@ -0,0 +1,47 @@
# Simulacrum test settings for the incandescence service.
# Reads the provider node list from `cluster.config` and returns extra
# per-node configuration (`nodes`) plus a Python `testScript`.
{ cluster, lib, ... }:
let
# Only the first two incandescence provider nodes take part in the test.
providers = lib.take 2 cluster.config.services.incandescence.nodes.provider;
in
{
# Every selected node receives the same configuration (lib.const ignores the
# node name): a `test` provider with a single formula called `example`.
nodes = lib.genAttrs providers (lib.const {
services.incandescence.providers.test = {
wantedBy = [ "multi-user.target" ];
# NOTE(review): partOf is explicitly emptied here; presumably this overrides
# a default set by the provider options — confirm against provider-options.nix.
partOf = [ ];
formulae.example = {
# create is a function from object name to a shell command; it stores the
# object name under the Consul KV key testData/<name>, with the name as value.
create = x: "consul kv put testData/${x} ${x}";
# destroy is a plain command string that refers to the object via $OBJECT.
# NOTE(review): $OBJECT is presumably exported by the destroy unit — confirm.
destroy = "consul kv delete testData/$OBJECT";
};
};
});
# Python test driver. It reads the Consul HTTP address and port from
# /etc/consul.json on the first node, then runs two subtests:
#  - create: after incandescence-test.target is reached on every node,
#    testData/example1 and testData/example2 must hold their own names;
#  - destroy: testData/example3 and an `alive` marker under
#    services/incandescence/providers/test/formulae/example/example3 are
#    written by hand, every node runs `systemctl isolate default`, and once
#    incandescence-test.target is reached again example3 must be gone.
testScript = ''
import json
nodeNames = json.loads('${builtins.toJSON providers}')
nodes = [ n for n in machines if n.name in nodeNames ]
start_all()
consulConfig = json.loads(nodes[0].succeed("cat /etc/consul.json"))
addr = consulConfig["addresses"]["http"]
port = consulConfig["ports"]["http"]
setEnv = f"CONSUL_HTTP_ADDR={addr}:{port}"
with subtest("should create objects"):
for node in nodes:
node.wait_for_unit("incandescence-test.target")
nodes[0].succeed(f"[[ $({setEnv} consul kv get testData/example1) == example1 ]]")
nodes[0].succeed(f"[[ $({setEnv} consul kv get testData/example2) == example2 ]]")
with subtest("should destroy objects"):
nodes[0].succeed(f"{setEnv} consul kv put testData/example3 example3")
nodes[0].succeed(f"{setEnv} consul kv put services/incandescence/providers/test/formulae/example/example3/alive true")
nodes[1].succeed(f"{setEnv} consul kv get testData/example3")
for node in nodes:
node.systemctl("isolate default")
for node in nodes:
node.wait_for_unit("incandescence-test.target")
nodes[0].fail(f"{setEnv} consul kv get testData/example3")
'';
}

View file

@ -4,6 +4,7 @@
imports = [
./options.nix
./incandescence.nix
./simulacrum/test-data.nix
];
links = {
@ -36,6 +37,10 @@
PATRONI_REWIND_PASSWORD = default;
metricsCredentials.nodes = nodes.worker;
};
simulacrum.deps = [ "consul" "incandescence" "locksmith" ];
simulacrum = {
enable = true;
deps = [ "consul" "incandescence" "locksmith" ];
settings = ./simulacrum/test.nix;
};
};
}

View file

@ -16,6 +16,7 @@ in
};
};
}));
default = {};
};
users = mkOption {
type = attrsOf (submodule ({ ... }: {
@ -32,6 +33,7 @@ in
};
};
}));
default = {};
};
};
}

View file

@ -0,0 +1,14 @@
# Test fixtures for patroni: one database and one user, gated on
# `config.simulacrum`.
{ config, lib, ... }:

let
  inherit (config.lib) forService;

  # The test user's credentials are issued to the haproxy nodes of the
  # patroni service.
  haproxyNodes = config.services.patroni.nodes.haproxy;
in
{
  patroni = lib.mkIf config.simulacrum {
    # Database `testdb`, owned by the user defined below.
    databases = forService "patroni" {
      testdb = {
        owner = "testuser";
      };
    };

    # User `testuser`, with a locksmith secret in pgpass format.
    users = forService "patroni" {
      testuser = {
        locksmith = {
          nodes = haproxyNodes;
          format = "pgpass";
        };
      };
    };
  };
}

View file

@ -0,0 +1,91 @@
# Simulacrum test settings for the patroni service.
# Reads node lists and the access link from `cluster.config` and returns
# shared machine configuration (`defaults`) plus a Python `testScript`.
{ cluster, ... }:
let
# Cluster name passed to every `patronictl list` call in the test script.
clusterName = "poseidon";
# Link whose ipv4 and portStr are used as the psql host and port below.
link = cluster.config.links.patroni-pg-access;
in
{
defaults = { depot, pkgs, ... }: {
# jq is used by the test script to inspect `patronictl list -f json` output.
# NOTE(review): depot.packages.postgresql presumably supplies the psql
# client invoked by the test script — confirm.
environment.systemPackages = [
pkgs.jq
depot.packages.postgresql
];
# Test-only rule: user postgres may connect to database postgres from any
# IPv4 address without a password (trust authentication).
services.patroni.settings.postgresql.pg_hba = [
"host postgres postgres 0.0.0.0/0 trust"
];
};
# taken from https://github.com/phfroidmont/nixpkgs/blob/patroni-module/nixos/tests/patroni.nix
# Python test driver. `nodes` are the patroni worker machines and `clients`
# the haproxy machines, both selected by name from `machines`. Subtests:
#  - bootstrap of a new cluster (one running Leader, streaming Replicas);
#  - create/select on a `dummy` table through the link address;
#  - recovery after every node has been crashed and started again;
#  - queries keep working while each node in turn is crashed, with the
#    current leader moved to the end of the list first;
#  - the incandescence-provisioned testuser can create tables in testdb
#    using /run/locksmith/patroni-testuser as PGPASSFILE, and is expected
#    to fail when selecting from `dummy` in the postgres database.
testScript = ''
import json
nodeNames = json.loads('${builtins.toJSON cluster.config.services.patroni.nodes.worker}')
clientNames = json.loads('${builtins.toJSON cluster.config.services.patroni.nodes.haproxy}')
nodes = [ n for n in machines if n.name in nodeNames ]
clients = [ n for n in machines if n.name in clientNames ]
def booted(nodes):
return filter(lambda node: node.booted, nodes)
def wait_for_all_nodes_ready(expected_replicas=2):
booted_nodes = booted(nodes)
for node in booted_nodes:
node.wait_for_unit("patroni.service")
print(node.succeed("patronictl list ${clusterName}"))
node.wait_until_succeeds(f"[ $(patronictl list -f json ${clusterName} | jq 'length') == {expected_replicas + 1} ]")
node.wait_until_succeeds("[ $(patronictl list -f json ${clusterName} | jq 'map(select(.Role | test(\"^Leader$\"))) | map(select(.State | test(\"^running$\"))) | length') == 1 ]")
node.wait_until_succeeds(f"[ $(patronictl list -f json ${clusterName} | jq 'map(select(.Role | test(\"^Replica$\"))) | map(select(.State | test(\"^streaming$\"))) | length') == {expected_replicas} ]")
print(node.succeed("patronictl list ${clusterName}"))
for client in booted(clients):
client.wait_until_succeeds("psql -h ${link.ipv4} -p ${link.portStr} -U postgres --command='select 1;'")
def run_dummy_queries():
for client in booted(clients):
client.succeed("psql -h ${link.ipv4} -p ${link.portStr} -U postgres --pset='pager=off' --tuples-only --command='insert into dummy(val) values (101);'")
client.succeed("test $(psql -h ${link.ipv4} -p ${link.portStr} -U postgres --pset='pager=off' --tuples-only --command='select val from dummy where val = 101;') -eq 101")
client.succeed("psql -h ${link.ipv4} -p ${link.portStr} -U postgres --pset='pager=off' --tuples-only --command='delete from dummy where val = 101;'")
start_all()
with subtest("should bootstrap a new patroni cluster"):
wait_for_all_nodes_ready()
with subtest("should be able to insert and select"):
booted_clients = list(booted(clients))
booted_clients[0].succeed("psql -h ${link.ipv4} -p ${link.portStr} -U postgres --command='create table dummy as select * from generate_series(1, 100) as val;'")
for client in booted_clients:
client.succeed("test $(psql -h ${link.ipv4} -p ${link.portStr} -U postgres --pset='pager=off' --tuples-only --command='select count(distinct val) from dummy;') -eq 100")
with subtest("should restart after all nodes are crashed"):
for node in nodes:
node.crash()
for node in nodes:
node.start()
wait_for_all_nodes_ready()
with subtest("should be able to run queries while any one node is crashed"):
masterNodeName = nodes[0].succeed("patronictl list -f json ${clusterName} | jq '.[] | select(.Role | test(\"^Leader$\")) | .Member' -r").strip()
masterNodeIndex = next((i for i, v in enumerate(nodes) if v.name == masterNodeName))
# Move master node at the end of the list to avoid multiple failovers (makes the test faster and more consistent)
nodes.append(nodes.pop(masterNodeIndex))
for node in nodes:
node.crash()
wait_for_all_nodes_ready(1)
# Execute some queries while a node is down.
run_dummy_queries()
# Restart crashed node.
node.start()
wait_for_all_nodes_ready()
# Execute some queries with the node back up.
run_dummy_queries()
with subtest("should create databases and users via incandescence"):
for client in clients:
client.succeed(f"PGPASSFILE=/run/locksmith/patroni-testuser psql -h ${link.ipv4} -p ${link.portStr} -U testuser -d testdb --command='create table test_table_{client.name} as select * from generate_series(1, 10) as val;'")
client.fail("PGPASSFILE=/run/locksmith/patroni-testuser psql -h ${link.ipv4} -p ${link.portStr} -U testuser -d postgres --command='select * from dummy;'")
'';
}

View file

@ -25,6 +25,10 @@ in
"d '${baseDir}' 0700 patroni patroni - -"
"d '${walDir}' 0700 patroni patroni - -"
];
systemd.services.patroni = {
requires = [ "consul-ready.service" ];
after = [ "consul-ready.service" ];
};
services.patroni = {
enable = true;
name = hostName;
@ -57,6 +61,7 @@ in
};
use_pg_rewind = true;
use_slots = true;
synchronous_mode = true;
authentication = {
replication.username = "patronirep";
rewind.username = "patronirew";
@ -67,6 +72,7 @@ in
wal_level = "replica";
hot_standby_feedback = "on";
unix_socket_directories = "/tmp";
synchronous_commit = "on";
};
pg_hba = [
"host replication patronirep ${net} scram-sha-256"

View file

@ -29,11 +29,6 @@ in
inherit (self'.packages) keycloak;
};
patroni = pkgs.callPackage ./patroni.nix {
inherit (self) nixosModules;
inherit (self'.packages) postgresql;
};
s3ql-upgrade = pkgs.callPackage ./s3ql-upgrade.nix {
inherit (self'.packages) s3ql;
inherit (self) nixosModules;

View file

@ -1,211 +0,0 @@
# Standalone NixOS VM test for Patroni: three Patroni nodes, one etcd node
# and one client machine that reaches the cluster through HAProxy.
{ nixosTest, nixosModules, postgresql }:
# taken from https://github.com/phfroidmont/nixpkgs/blob/patroni-module/nixos/tests/patroni.nix
nixosTest (
let
# Static addresses of the three Patroni nodes; createNode picks one by index.
nodesIps = [
"192.168.1.1"
"192.168.1.2"
"192.168.1.3"
];
# Builds the NixOS module for the Patroni node at position `index` in
# nodesIps (node name is "node<index + 1>").
createNode = index: { pkgs, ... }:
let
ip = builtins.elemAt nodesIps index; # since we already use IPs to identify servers
in
{
imports = [
nixosModules.patroni
nixosModules.systemd-extras
];
networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
{ address = ip; prefixLength = 16; }
];
networking.firewall.allowedTCPPorts = [ 5432 8008 5010 ];
environment.systemPackages = [ pkgs.jq ];
services.patroni = {
enable = true;
postgresqlPackage = postgresql.withPackages (p: [ p.pg_safeupdate ]);
scope = "cluster1";
name = "node${toString(index + 1)}";
nodeIp = ip;
# Every address in nodesIps except this node's own.
otherNodesIps = builtins.filter (h: h != ip) nodesIps;
softwareWatchdog = true;
settings = {
bootstrap = {
dcs = {
ttl = 30;
loop_wait = 10;
retry_timeout = 10;
maximum_lag_on_failover = 1048576;
};
initdb = [
{ encoding = "UTF8"; }
"data-checksums"
];
};
postgresql = {
use_pg_rewind = true;
use_slots = true;
authentication = {
replication = {
username = "replicator";
};
superuser = {
username = "postgres";
};
rewind = {
username = "rewind";
};
};
parameters = {
listen_addresses = "${ip}";
wal_level = "replica";
hot_standby_feedback = "on";
unix_socket_directories = "/tmp";
};
pg_hba = [
"host replication replicator 192.168.1.0/24 md5"
# Unsafe, do not use for anything other than tests
"host all all 0.0.0.0/0 trust"
];
};
# Points at the etcd machine defined in `nodes` below.
etcd3 = {
host = "192.168.1.4:2379";
};
};
# All three passwords are the fixed test value "postgres".
environmentFiles = {
PATRONI_REPLICATION_PASSWORD = pkgs.writeText "replication-password" "postgres";
PATRONI_SUPERUSER_PASSWORD = pkgs.writeText "superuser-password" "postgres";
PATRONI_REWIND_PASSWORD = pkgs.writeText "rewind-password" "postgres";
};
};
# We always want to restart so the tests never hang
systemd.services.patroni.serviceConfig.StartLimitIntervalSec = 0;
};
in
{
name = "patroni";
nodes = {
node1 = createNode 0;
node2 = createNode 1;
node3 = createNode 2;
# etcd instance at 192.168.1.4:2379, the address used in etcd3.host above.
etcd = { pkgs, ... }: {
networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
{ address = "192.168.1.4"; prefixLength = 16; }
];
services.etcd = {
enable = true;
listenClientUrls = [ "http://192.168.1.4:2379" ];
};
networking.firewall.allowedTCPPorts = [ 2379 ];
};
# Client machine: HAProxy binds 127.0.0.1:5432 and lists every address in
# nodesIps as a server on port 5432, health-checked over HTTP on port 8008.
client = { pkgs, ... }: {
environment.systemPackages = [ postgresql ];
networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
{ address = "192.168.2.1"; prefixLength = 16; }
];
services.haproxy = {
enable = true;
config = ''
global
maxconn 100
defaults
log global
mode tcp
retries 2
timeout client 30m
timeout connect 4s
timeout server 30m
timeout check 5s
listen cluster1
bind 127.0.0.1:5432
option httpchk
http-check expect status 200
default-server inter 3s fall 3 rise 2 on-marked-down shutdown-sessions
${builtins.concatStringsSep "\n" (map (ip: "server postgresql_${ip}_5432 ${ip}:5432 maxconn 100 check port 8008") nodesIps)}
'';
};
};
};
# Python test driver. Subtests: cluster bootstrap, insert/select through the
# client, recovery after all nodes crash, and queries while each node in
# turn is crashed (current leader moved to the end of the list first).
testScript = ''
nodes = [node1, node2, node3]
def wait_for_all_nodes_ready(expected_replicas=2):
booted_nodes = filter(lambda node: node.booted, nodes)
for node in booted_nodes:
print(node.succeed("patronictl list cluster1"))
node.wait_until_succeeds(f"[ $(patronictl list -f json cluster1 | jq 'length') == {expected_replicas + 1} ]")
node.wait_until_succeeds("[ $(patronictl list -f json cluster1 | jq 'map(select(.Role | test(\"^Leader$\"))) | map(select(.State | test(\"^running$\"))) | length') == 1 ]")
node.wait_until_succeeds(f"[ $(patronictl list -f json cluster1 | jq 'map(select(.Role | test(\"^Replica$\"))) | map(select(.State | test(\"^streaming$\"))) | length') == {expected_replicas} ]")
print(node.succeed("patronictl list cluster1"))
client.wait_until_succeeds("psql -h 127.0.0.1 -U postgres --command='select 1;'")
def run_dummy_queries():
client.succeed("psql -h 127.0.0.1 -U postgres --pset='pager=off' --tuples-only --command='insert into dummy(val) values (101);'")
client.succeed("test $(psql -h 127.0.0.1 -U postgres --pset='pager=off' --tuples-only --command='select val from dummy where val = 101;') -eq 101")
client.succeed("psql -h 127.0.0.1 -U postgres --pset='pager=off' --tuples-only --command='delete from dummy where val = 101;'")
start_all()
with subtest("should bootstrap a new patroni cluster"):
wait_for_all_nodes_ready()
with subtest("should be able to insert and select"):
client.succeed("psql -h 127.0.0.1 -U postgres --command='create table dummy as select * from generate_series(1, 100) as val;'")
client.succeed("test $(psql -h 127.0.0.1 -U postgres --pset='pager=off' --tuples-only --command='select count(distinct val) from dummy;') -eq 100")
with subtest("should restart after all nodes are crashed"):
for node in nodes:
node.crash()
for node in nodes:
node.start()
wait_for_all_nodes_ready()
with subtest("should be able to run queries while any one node is crashed"):
masterNodeName = node1.succeed("patronictl list -f json cluster1 | jq '.[] | select(.Role | test(\"^Leader$\")) | .Member' -r").strip()
masterNodeIndex = int(masterNodeName[len(masterNodeName)-1]) - 1
# Move master node at the end of the list to avoid multiple failovers (makes the test faster and more consistent)
nodes.append(nodes.pop(masterNodeIndex))
for node in nodes:
node.crash()
wait_for_all_nodes_ready(1)
# Execute some queries while a node is down.
run_dummy_queries()
# Restart crashed node.
node.start()
wait_for_all_nodes_ready()
# Execute some queries with the node back up.
run_dummy_queries()
'';
})