From 9063ecb5f4bc1f6fc0e5da2118e301ce9049a00b Mon Sep 17 00:00:00 2001 From: Max Date: Fri, 23 Aug 2024 02:50:18 +0200 Subject: [PATCH] cluster/services/monitoring: make tempo HA --- cluster/services/monitoring/default.nix | 70 +++++++++--------- cluster/services/monitoring/grafana-ha.nix | 10 +++ .../monitoring/secrets/tempo-secrets.age | 16 ----- cluster/services/monitoring/tracing.nix | 71 ++++++++++++++----- 4 files changed, 100 insertions(+), 67 deletions(-) delete mode 100644 cluster/services/monitoring/secrets/tempo-secrets.age diff --git a/cluster/services/monitoring/default.nix b/cluster/services/monitoring/default.nix index 6dfffc8..0b74b16 100644 --- a/cluster/services/monitoring/default.nix +++ b/cluster/services/monitoring/default.nix @@ -18,26 +18,6 @@ in protocol = "http"; ipv4 = meshIpFor "server"; }; - tempo = { - protocol = "http"; - ipv4 = meshIpFor "server"; - }; - tempo-grpc = { - protocol = "http"; - ipv4 = "127.0.0.1"; - }; - tempo-otlp-http = { - protocol = "http"; - ipv4 = meshIpFor "server"; - }; - tempo-otlp-grpc = { - protocol = "http"; - ipv4 = meshIpFor "server"; - }; - tempo-zipkin-http = { - protocol = "http"; - ipv4 = meshIpFor "server"; - }; }; hostLinks = lib.genAttrs config.services.monitoring.nodes.grafana (name: { grafana = { @@ -51,6 +31,7 @@ in blackbox = [ "checkmate" "grail" "prophet" ]; grafana = [ "VEGAS" "prophet" ]; logging = [ "VEGAS" "grail" ]; + tracing = [ "VEGAS" "grail" ]; server = [ "VEGAS" ]; }; nixos = { @@ -61,12 +42,20 @@ in ./provisioning/dashboards.nix ]; logging = ./logging.nix; + tracing = ./tracing.nix; server = [ ./server.nix - ./tracing.nix ]; }; - meshLinks.logging.loki.link.protocol = "http"; + meshLinks = { + logging.loki.link.protocol = "http"; + tracing = { + tempo.link.protocol = "http"; + tempo-otlp-http.link.protocol = "http"; + tempo-otlp-grpc.link.protocol = "grpc"; + tempo-zipkin-http.link.protocol = "http"; + }; + }; }; garage = config.lib.forService "monitoring" { @@ -79,36 +68,51 @@ in nodes = config.services.monitoring.nodes.logging; format = "envFile"; }; - tempo = { }; + tempo-ingest.locksmith = { + nodes = config.services.monitoring.nodes.tracing; + format = "envFile"; + }; + tempo-query.locksmith = { + nodes = config.services.monitoring.nodes.tracing; + format = "envFile"; + }; }; buckets = { loki-chunks.allow = { loki-ingest = [ "read" "write" ]; loki-query = [ "read" ]; }; - tempo-chunks.allow.tempo = [ "read" "write" ]; + tempo-chunks.allow = { + tempo-ingest = [ "read" "write" ]; + tempo-query = [ "read" ]; + }; }; }; - ways = config.lib.forService "monitoring" { - monitoring = { - consulService = "grafana"; - extras.locations."/".proxyWebsockets = true; - }; - monitoring-logs = { + ways = let + query = consulService: { + inherit consulService; internal = true; - consulService = "loki"; extras.extraConfig = '' proxy_read_timeout 3600s; ''; }; - ingest-logs = { + ingest = consulService: { + inherit consulService; internal = true; - consulService = "loki"; extras.extraConfig = '' client_max_body_size 4G; proxy_read_timeout 3600s; ''; }; + in config.lib.forService "monitoring" { + monitoring = { + consulService = "grafana"; + extras.locations."/".proxyWebsockets = true; + }; + monitoring-logs = query "loki"; + monitoring-traces = query "tempo"; + ingest-logs = ingest "loki"; + ingest-traces-otlp = ingest "tempo-ingest-otlp-grpc" // { grpc = true; }; }; } diff --git a/cluster/services/monitoring/grafana-ha.nix b/cluster/services/monitoring/grafana-ha.nix index 07b053e..303273b 100644 --- a/cluster/services/monitoring/grafana-ha.nix +++ b/cluster/services/monitoring/grafana-ha.nix @@ -73,6 +73,16 @@ in inherit (cluster.config.ways.monitoring-logs) url; type = "loki"; } + { + name = "Tempo"; + uid = "P214B5B846CF3925F"; + inherit (cluster.config.ways.monitoring-traces) url; + type = "tempo"; + jsonData = { + serviceMap.datasourceUid = "PBFA97CFB590B2093"; + nodeGraph.enabled = true; + }; + } ]; }; }; diff --git a/cluster/services/monitoring/secrets/tempo-secrets.age b/cluster/services/monitoring/secrets/tempo-secrets.age deleted file mode 100644 index 4144041..0000000 --- a/cluster/services/monitoring/secrets/tempo-secrets.age +++ /dev/null @@ -1,16 +0,0 @@ -age-encryption.org/v1 --> ssh-ed25519 NO562A KhCGp7PAMGrEdzRxBrsW4tRk30JwpI+4lPzrRCUhSw4 -8s7WqA5c3zS1euN5R+jfFNBdvr8OQW8P4NFeqtNsIKo --> ssh-ed25519 5/zT0w 79hJQ2H76EZTW7YcQFCtKaS5Kbssx4Z8dPFjIVzRgFk -A1fDJbUnyIRy+kWa3PhJNj/SdRPlcEy6FYsAfnuZ2AQ --> ssh-ed25519 d3WGuA aylkdL1KliM1NfrYDGlG8X6YjXvVUCU4sV90I+a840U -6sXdqIPjtoNSylZRh1DCghHOwDo+fC7WB4QWQoWmG48 --> //gd+2-grease baUWA$3 z-qs3W O/2.1W -Sfq3+rkMJhpUTTmcos5TaaUtX2Ip9pciHAZLiWPix+C9N7ccac/1W5RNedMJCLsq -MQ+xKzexf8+hgNVhKOksvbKBBROXqk1bUOKk8w3OgFPmmByzmCBUwkdkeu5DFTYR -rg ---- kUl1uIPRkM5y7C68kdN22pMKXP7gazyha4PE+ap0Jqw -w> -C,\ߜIHxG@^P lw6{rb5T> -WܤX4Kp(?9߭^^oP3f O+N0Dt5V# U#y|@GzSi*H] -ꎀ5 \ No newline at end of file diff --git a/cluster/services/monitoring/tracing.nix b/cluster/services/monitoring/tracing.nix index 5f2a35b..52ee5f9 100644 --- a/cluster/services/monitoring/tracing.nix +++ b/cluster/services/monitoring/tracing.nix @@ -1,14 +1,16 @@ { cluster, config, pkgs, ... }: let - inherit (cluster.config) links; + inherit (cluster.config.links) prometheus-ingest; + inherit (config.links) tempo-grpc; + links = cluster.config.hostLinks.${config.networking.hostName}; dataDir = "/srv/storage/private/tempo"; tempoConfig = { server = { http_listen_address = links.tempo.ipv4; http_listen_port = links.tempo.port; - grpc_listen_address = links.tempo-grpc.ipv4; - grpc_listen_port = links.tempo-grpc.port; + grpc_listen_address = tempo-grpc.ipv4; + grpc_listen_port = tempo-grpc.port; }; distributor.receivers = { otlp = { @@ -19,7 +21,7 @@ let }; zipkin.endpoint = links.tempo-zipkin-http.tuple; }; - querier.frontend_worker.frontend_address = links.tempo-grpc.tuple; + querier.frontend_worker.frontend_address = tempo-grpc.tuple; ingester = { trace_idle_period = "30s"; max_block_bytes = 1000000; @@ -56,7 +58,7 @@ let path = "${dataDir}/generator/wal"; remote_write = [ { - url = "${links.prometheus-ingest.url}/api/v1/write"; + url = "${prometheus-ingest.url}/api/v1/write"; send_exemplars = true; } ]; @@ -68,7 +70,11 @@ let ]; }; in { - age.secrets.tempoSecrets.file = ./secrets/tempo-secrets.age; + links.tempo-grpc.protocol = "http"; + + services.locksmith.waitForSecrets.tempo = [ + "garage-tempo-ingest" + ]; users.users.tempo = { isSystemUser = true; @@ -81,24 +87,53 @@ in { systemd.services.tempo = { wantedBy = [ "multi-user.target" ]; + distributed = { + enable = true; + registerServices = [ + "tempo" + "tempo-ingest-otlp-grpc" + ]; + }; serviceConfig = { User = "tempo"; Group = "tempo"; ExecStart = "${pkgs.tempo}/bin/tempo -config.file=${pkgs.writeText "tempo.yaml" (builtins.toJSON tempoConfig)}"; PrivateTmp = true; - EnvironmentFile = config.age.secrets.tempoSecrets.path; + EnvironmentFile = "/run/locksmith/garage-tempo-ingest"; }; }; - services.grafana.provision.datasources.settings.datasources = [ - { - name = "Tempo"; - uid = "P214B5B846CF3925F"; - inherit (links.tempo) url; - type = "tempo"; - jsonData = { - serviceMap.datasourceUid = "PBFA97CFB590B2093"; # prometheus - nodeGraph.enabled = true; + + consul.services = { + tempo = { + mode = "manual"; + definition = { + name = "tempo"; + address = links.tempo.ipv4; + inherit (links.tempo) port; + checks = [ + { + name = "Tempo"; + id = "service:tempo:backend"; + interval = "5s"; + http = "${links.tempo.url}/ready"; + } + ]; }; - } - ]; + }; + tempo-ingest-otlp-grpc = { + mode = "manual"; + definition = { + name = "tempo-ingest-otlp-grpc"; + address = links.tempo-otlp-grpc.ipv4; + inherit (links.tempo-otlp-grpc) port; + checks = [ + { + name = "Tempo Service Status"; + id = "service:tempo-ingest-otlp-grpc:tempo"; + alias_service = "tempo"; + } + ]; + }; + }; + }; }