diff --git a/cluster/services/monitoring/default.nix b/cluster/services/monitoring/default.nix index d530d30..7de69f3 100644 --- a/cluster/services/monitoring/default.nix +++ b/cluster/services/monitoring/default.nix @@ -1,9 +1,11 @@ -{ config, ... }: +{ config, lib, ... }: let nodeFor = nodeType: builtins.head config.services.monitoring.nodes.${nodeType}; meshIpFor = nodeType: config.vars.mesh.${nodeFor nodeType}.meshIp; + + meshIpForNode = name: config.vars.mesh.${name}.meshIp; in { @@ -37,16 +39,24 @@ in ipv4 = meshIpFor "server"; }; }; + hostLinks = lib.genAttrs config.services.monitoring.nodes.grafana (name: { + grafana = { + protocol = "http"; + ipv4 = meshIpForNode name; + }; + }); services.monitoring = { nodes = { client = [ "checkmate" "thunderskin" "VEGAS" "prophet" ]; blackbox = [ "checkmate" "VEGAS" "prophet" ]; + grafana = [ "VEGAS" "prophet" ]; logging = [ "VEGAS" ]; server = [ "VEGAS" ]; }; nixos = { client = ./client.nix; blackbox = ./blackbox.nix; + grafana = ./grafana-ha.nix; logging = ./logging.nix; server = [ ./server.nix diff --git a/cluster/services/monitoring/grafana-ha.nix b/cluster/services/monitoring/grafana-ha.nix new file mode 100644 index 0000000..02b298c --- /dev/null +++ b/cluster/services/monitoring/grafana-ha.nix @@ -0,0 +1,152 @@ +{ cluster, config, depot, lib, pkgs, tools, ... }: +let + inherit (tools.meta) domain; + + inherit (cluster.config.links) loki-ingest prometheus-ingest; + + inherit (cluster.config) hostLinks; + + inherit (config.networking) hostName; + + svc = cluster.config.services.monitoring; + + iniList = lib.concatStringsSep " "; + + login = x: "https://login.${domain}/auth/realms/master/protocol/openid-connect/${x}"; +in +{ + age.secrets = { + grafana-db-credentials = { + file = ./secrets/grafana-db-credentials.age; + owner = "grafana"; + }; + grafana-secrets.file = ./secrets/grafana-secrets.age; + }; + + services.grafana = { + enable = true; + package = depot.packages.grafana; + dataDir = "/srv/storage/private/grafana"; + settings = { + server = { + root_url = "https://monitoring.${domain}/"; + http_addr = hostLinks.${hostName}.grafana.ipv4; + http_port = hostLinks.${hostName}.grafana.port; + }; + database = { + type = "postgres"; + host = cluster.config.links.patroni-pg-access.tuple; + user = "grafana"; + password = "$__file{${config.age.secrets.grafana-db-credentials.path}}"; + }; + analytics.reporting_enabled = false; + "auth.generic_oauth" = { + enabled = true; + allow_sign_up = true; + client_id = "net.privatevoid.monitoring1"; + auth_url = login "auth"; + token_url = login "token"; + api_url = login "userinfo"; + scopes = iniList [ "openid" "profile" "email" "roles" ]; + role_attribute_strict = true; + role_attribute_path = "resource_access.monitoring.roles[0]"; + }; + security = { + cookie_secure = true; + disable_gravatar = true; + }; + feature_toggles.enable = iniList [ + "tempoSearch" + "tempoBackendSearch" + "tempoServiceGraph" + ]; + }; + provision = { + enable = true; + datasources.settings.datasources = [ + { + name = "Prometheus"; + uid = "PBFA97CFB590B2093"; + inherit (prometheus-ingest) url; + type = "prometheus"; + isDefault = true; + } + { + name = "Loki"; + uid = "P8E80F9AEF21F6940"; + inherit (loki-ingest) url; + type = "loki"; + } + ]; + }; + }; + + systemd.services = { + grafana = { + enable = false; + serviceConfig.EnvironmentFile = config.age.secrets.grafana-secrets.path; + }; + grafana-ha = let + base = config.systemd.services.grafana; + inherit (config.services) consul; + svc = config.consul.services.grafana; + run = pkgs.writeShellScript "grafana-ha-start" '' + trap '${svc.commands.deregister}' EXIT + ${svc.commands.register} + ${base.serviceConfig.ExecStart} + ''; + in { + inherit (base) wantedBy; + description = "Grafana | High Availability"; + aliases = [ "grafana.service" ]; + + after = base.after ++ [ "consul.service" ]; + requires = [ "consul.service" ]; + + serviceConfig = base.serviceConfig // { + ExecStart = "${consul.package}/bin/consul lock --shell=false services/grafana ${run}"; + # consul uses AF_NETLINK to determine interface addresses, even when just registering a service + RestrictAddressFamilies = base.serviceConfig.RestrictAddressFamilies ++ [ "AF_NETLINK" ]; + }; + }; + }; + + services.nginx = { + upstreams.grafana-ha.servers = lib.mapAttrs' (_: links: lib.nameValuePair links.grafana.tuple {}) (lib.getAttrs (svc.nodes.grafana) hostLinks); + + virtualHosts."monitoring.${domain}" = lib.recursiveUpdate (tools.nginx.vhosts.proxy "http://grafana-ha") { + locations."/".proxyWebsockets = true; + }; + }; + + security.acme.certs."monitoring.${domain}" = { + dnsProvider = "pdns"; + webroot = lib.mkForce null; + }; + + consul.services.grafana = { + mode = "manual"; + unit = "grafana-ha"; + definition = rec { + name = "grafana"; + address = depot.reflection.interfaces.primary.addrPublic; + port = 443; + checks = [ + { + name = "Frontend"; + id = "service:grafana:frontend"; + interval = "30s"; + http = "https://${address}"; + tls_server_name = "monitoring.${domain}"; + method = "HEAD"; + } + { + name = "Backend"; + id = "service:grafana:backend"; + interval = "5s"; + http = "${hostLinks.${hostName}.grafana.url}/healthz"; + } + ]; + }; + }; +} diff --git a/cluster/services/monitoring/server.nix b/cluster/services/monitoring/server.nix index a01ac58..3305166 100644 --- a/cluster/services/monitoring/server.nix +++ b/cluster/services/monitoring/server.nix @@ -1,92 +1,8 @@ -{ cluster, config, depot, lib, tools, ... }: +{ cluster, ... }: let - inherit (tools.meta) domain; - - inherit (config) links; - - inherit (cluster.config.links) loki-ingest prometheus-ingest; - - iniList = lib.concatStringsSep " "; - - login = x: "https://login.${domain}/auth/realms/master/protocol/openid-connect/${x}"; + inherit (cluster.config.links) prometheus-ingest; in { - age.secrets = { - grafana-db-credentials = { - file = ./secrets/grafana-db-credentials.age; - owner = "grafana"; - }; - grafana-secrets.file = ./secrets/grafana-secrets.age; - }; - - links = { - grafana.protocol = "http"; - }; - services.grafana = { - enable = true; - package = depot.packages.grafana; - dataDir = "/srv/storage/private/grafana"; - settings = { - server = { - root_url = "https://monitoring.${domain}/"; - http_port = links.grafana.port; - }; - database = { - type = "postgres"; - host = cluster.config.links.patroni-pg-access.tuple; - user = "grafana"; - password = "$__file{${config.age.secrets.grafana-db-credentials.path}}"; - }; - analytics.reporting_enabled = false; - "auth.generic_oauth" = { - enabled = true; - allow_sign_up = true; - client_id = "net.privatevoid.monitoring1"; - auth_url = login "auth"; - token_url = login "token"; - api_url = login "userinfo"; - scopes = iniList [ "openid" "profile" "email" "roles" ]; - role_attribute_strict = true; - role_attribute_path = "resource_access.monitoring.roles[0]"; - }; - security = { - cookie_secure = true; - disable_gravatar = true; - }; - feature_toggles.enable = iniList [ - "tempoSearch" - "tempoBackendSearch" - "tempoServiceGraph" - ]; - }; - provision = { - enable = true; - datasources.settings.datasources = [ - { - name = "Prometheus"; - uid = "PBFA97CFB590B2093"; - inherit (prometheus-ingest) url; - type = "prometheus"; - isDefault = true; - } - { - name = "Loki"; - uid = "P8E80F9AEF21F6940"; - inherit (loki-ingest) url; - type = "loki"; - } - ]; - }; - }; - - systemd.services.grafana.serviceConfig = { - EnvironmentFile = config.age.secrets.grafana-secrets.path; - }; - - services.nginx.virtualHosts."monitoring.${domain}" = lib.recursiveUpdate (tools.nginx.vhosts.proxy links.grafana.url) { - locations."/".proxyWebsockets = true; - }; - services.prometheus = { enable = true; listenAddress = prometheus-ingest.ipv4;