diff --git a/checks/default.nix b/checks/default.nix
index 0379200d..78ab1e53 100644
--- a/checks/default.nix
+++ b/checks/default.nix
@@ -3,5 +3,6 @@
   imports = [
     ./packages-ci-matrix.nix
     ./pre-commit.nix
+    ./vm-tests.nix
   ];
 }
diff --git a/checks/vm-tests.nix b/checks/vm-tests.nix
new file mode 100644
index 00000000..6de756cf
--- /dev/null
+++ b/checks/vm-tests.nix
@@ -0,0 +1,39 @@
+{
+  lib,
+  inputs,
+  self,
+  ...
+}:
+{
+  perSystem =
+    {
+      inputs',
+      self',
+      pkgs,
+      system,
+      ...
+    }:
+    {
+      checks = {
+        "healthcheck-test-01" = pkgs.testers.runNixOSTest {
+          name = "healthcheck-test-01";
+          node.specialArgs = {
+            inherit self system lib;
+          };
+          testScript = ''
+            machine.start()
+            machine.wait_for_unit("default.target")
+          '';
+          nodes.machine =
+            { pkgs, ... }:
+            {
+              imports = [
+                ../machines/modules/base.nix
+                ../machines/modules/healthcheck.nix
+              ];
+            };
+
+        };
+      };
+    };
+}
diff --git a/flake.nix b/flake.nix
index d38132a3..d84e1889 100644
--- a/flake.nix
+++ b/flake.nix
@@ -221,6 +221,7 @@
         ./checks
         ./modules
         ./packages
+        ./machines
         ./shells
       ];
       systems = [
diff --git a/machines/blankpass.txt b/machines/blankpass.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/machines/default.nix b/machines/default.nix
new file mode 100644
index 00000000..b2d1091c
--- /dev/null
+++ b/machines/default.nix
@@ -0,0 +1,30 @@
+{
+  lib,
+  self,
+  ...
+}:
+let
+  system = "x86_64-linux";
+  # Build one nixosConfiguration per file in ./modules, always layered on
+  # top of base.nix (lib.unique de-duplicates when x is base.nix itself).
+  mkMachine = x: {
+    "${lib.removeSuffix ".nix" x}" = lib.nixosSystem {
+      specialArgs = { inherit self system; };
+      modules = lib.unique [
+        { nixpkgs.system = system; }
+        ./modules/base.nix
+        ./modules/${x}
+      ];
+    };
+  };
+  moduleFiles = builtins.attrNames (builtins.readDir ./modules);
+in
+{
+  flake.nixosConfigurations = lib.mergeAttrsList (lib.map mkMachine moduleFiles);
+  # Expose the raw module *paths* instead of calling each file with a fixed
+  # argument set: the module system supplies config/lib/pkgs itself, so
+  # modules that take `config` (e.g. lido-validator-ejector.nix) still work.
+  # Consumers must pass `self` via specialArgs, as mkMachine does above.
+  flake.modules.nixos = lib.mergeAttrsList (
+    lib.map (x: { "machine_${lib.removeSuffix ".nix" x}" = ./modules + "/${x}"; }) moduleFiles
+  );
+}
diff --git a/machines/modules/all.nix b/machines/modules/all.nix
new file mode 100644
index 00000000..e9800176
--- /dev/null
+++ b/machines/modules/all.nix
@@ -0,0 +1,18 @@
+{
+  self,
+  lib,
+  system,
+  ...
+}:
+{
+  imports = lib.map (x: ./. + "/${x}") (
+    builtins.attrNames (
+      lib.removeAttrs (builtins.readDir ./.) [
+        "base.nix"
+        "all.nix"
+      ]
+    )
+  );
+
+  environment.systemPackages = lib.attrValues self.packages."${system}";
+}
diff --git a/machines/modules/base.nix b/machines/modules/base.nix
new file mode 100644
index 00000000..757d5cd5
--- /dev/null
+++ b/machines/modules/base.nix
@@ -0,0 +1,15 @@
+{ lib, ... }:
+{
+  fileSystems = {
+    "/".device = lib.mkDefault "/dev/sda";
+  };
+  boot.loader.grub.devices = lib.mkDefault [ "/dev/sda" ];
+  virtualisation.vmVariant = {
+    # NOTE: the option is spelled `virtualisation`, and its sizes are in MiB.
+    virtualisation = {
+      diskSize = 10 * 1024; # 10 GiB
+      memorySize = 8192; # 8 GiB
+      cores = 4;
+    };
+  };
+}
diff --git a/machines/modules/ethereum-validators-monitoring.nix b/machines/modules/ethereum-validators-monitoring.nix
new file mode 100644
index 00000000..98f5c0c7
--- /dev/null
+++ b/machines/modules/ethereum-validators-monitoring.nix
@@ -0,0 +1,20 @@
+{ self, ... }:
+{
+  imports = [
+    self.modules.nixos.ethereum-validators-monitoring
+  ];
+
+  services.ethereum-validators-monitoring = {
+    db = {
+      host = "http://localhost:8123/";
+      user = "ethereum";
+      password-file = ../blankpass.txt;
+      name = "ethereum";
+    };
+    instances = {
+      # The Ethereum Validator Monitoring sends out too many requests to servers,
+      # which causes attestations to be missing. Therefore, as we lack a
+      # dedicated server, we are unable to have e-v-m.
+    };
+  };
+}
diff --git a/machines/modules/folder-size-metrics.nix b/machines/modules/folder-size-metrics.nix
new file mode 100644
index 00000000..ee37c11b
--- /dev/null
+++ b/machines/modules/folder-size-metrics.nix
@@ -0,0 +1,15 @@
+{ self, ... }:
+{
+  imports = [
+    self.modules.nixos.folder-size-metrics
+  ];
+
+  services.folder-size-metrics = {
+    enable = true;
+    # args = { #Unchanged
+    #   port = 8888;
+    #   base-path = "/var/lib";
+    #   interval-sec = 60;
+    # };
+  };
+}
diff --git a/machines/modules/healthcheck.nix b/machines/modules/healthcheck.nix
new file mode 100644
index 00000000..c9eac97b
--- /dev/null
+++ b/machines/modules/healthcheck.nix
@@ -0,0 +1,56 @@
+{ self, pkgs, ... }:
+{
+  imports = [
+    self.modules.nixos.healthcheck
+  ];
+
+  systemd.services.test = {
+    description = "Dummy service used to exercise the healthcheck module";
+    enable = true;
+    path = [
+      pkgs.netcat # provides `nc` used in the script below
+    ];
+    serviceConfig = {
+      Type = "simple";
+      Restart = "on-failure";
+      RestartSec = "10s";
+    };
+    # Use `script` (wrapped in a shell by NixOS) rather than ExecStart:
+    # systemd does not interpret shell syntax such as `;` or `while` there.
+    script = ''
+      sleep 10 # Simulate a startup delay.
+      echo "Starting test service..."
+      echo "Test" > /tmp/test.log
+      while true; do
+        echo "Test" | nc -l 8080
+      done
+    '';
+  };
+
+  # --- Health Check Configuration ---
+  mcl.services.test.healthcheck = {
+    runtimePackages = with pkgs; [
+      netcat
+      curl
+    ];
+
+    # READINESS: signal readiness once the service has produced its log file.
+    readiness-probe = {
+      enable = true;
+      command = "ls /tmp/test.log";
+      interval = 2;
+      statusWaitingMessage = "Test starting, waiting...";
+      statusReadyMessage = "Test is ready.";
+    };
+
+    # LIVENESS: after startup, use a timer to periodically check health.
+    liveness-probe = {
+      enable = true;
+      # NOTE(review): the service emits "Test", so comparing against "Test2"
+      # makes this probe always fail (exercising the restart path) — confirm
+      # this is intentional.
+      command = "[ \"$(nc -w 2 localhost 8080)\" = \"Test2\" ]";
+      initialDelay = 15;
+      interval = 30; # Check every 30 seconds.
+      timeout = 5;
+    };
+  };
+}
diff --git a/machines/modules/lido-keys-api.nix b/machines/modules/lido-keys-api.nix
new file mode 100644
index 00000000..c014cc16
--- /dev/null
+++ b/machines/modules/lido-keys-api.nix
@@ -0,0 +1,32 @@
+{ self, ... }:
+{
+  imports = [
+    self.modules.nixos.lido-keys-api
+  ];
+
+  services.lido-keys-api = {
+    enable = true;
+    args = {
+      port = 3000;
+      # `\\.` so the regex engine receives a literal `\.`; a single `\.` in a
+      # Nix double-quoted string collapses to just `.` (backslash is dropped).
+      cors-whitelist-regexp = "^https?://(?:.+?\\.)?(?:lido|testnet|mainnet|holesky)\\.fi$";
+      global-throttle-ttl = 5;
+      global-throttle-limit = 100;
+      global-cache-ttl = 1;
+      sentry-dsn = "";
+      log-level = "debug";
+      log-format = "json";
+      db-name = "node_operator_keys_service_db";
+      db-port = 5432;
+      db-host = "127.0.0.1";
+      db-user = "postgres";
+      db-password = "";
+      provider-json-rpc-max-batch-size = 100;
+      provider-concurrent-requests = 5;
+      provider-batch-aggregation-wait-ms = 10;
+      validator-registry-enable = true;
+    };
+  };
+
+}
diff --git a/machines/modules/lido-validator-ejector.nix b/machines/modules/lido-validator-ejector.nix
new file mode 100644
index 00000000..6f63b48b
--- /dev/null
+++ b/machines/modules/lido-validator-ejector.nix
@@ -0,0 +1,28 @@
+{ self, config, ... }:
+{
+  imports = [
+    self.modules.nixos.lido-validator-ejector
+  ];
+
+  services.lido-validator-ejector = {
+    enable = true;
+    args = {
+      messages-location = config.services.lido-withdrawals-automation.args.output-folder;
+      consensus-node = "";
+      execution-node = "";
+      locator-address = "";
+      blocks-preload = 100000;
+      http-port = 8989;
+      run-metrics = true;
+      run-health-check = true;
+      logger-level = "info";
+      logger-format = "simple";
+      logger-secrets = [
+        "MESSAGES_PASSWORD"
+        "EXECUTION_NODE"
+        "CONSENSUS_NODE"
+      ];
+      dry-run = false;
+    };
+  };
+}
diff --git a/machines/modules/lido-withdrawals-automation.nix b/machines/modules/lido-withdrawals-automation.nix
new file mode 100644
index 00000000..2f392749
--- /dev/null
+++ b/machines/modules/lido-withdrawals-automation.nix
@@ -0,0 +1,17 @@
+{ self, ... }:
+{
+  imports = [
+    self.modules.nixos.lido-withdrawals-automation
+  ];
+
+  services.lido-withdrawals-automation = {
+    enable = true;
+    args = {
+      operator-id = "";
+      password = "";
+      percentage = 10;
+      output-folder = "/ethereum/lido/withdrawal-automation";
+      overwrite = "always";
+    };
+  };
+}
diff --git a/machines/modules/mcl-disko.nix b/machines/modules/mcl-disko.nix
new file mode 100644
index 00000000..f34f4703
--- /dev/null
+++ b/machines/modules/mcl-disko.nix
@@ -0,0 +1,8 @@
+{ self, ... }:
+{
+  imports = [
+    self.modules.nixos.mcl-disko
+  ];
+
+  #TODO: Figure out the arguments for this service and enable it
+}
diff --git a/machines/modules/mcl-host-info.nix b/machines/modules/mcl-host-info.nix
new file mode 100644
index 00000000..0d4cdceb
--- /dev/null
+++ b/machines/modules/mcl-host-info.nix
@@ -0,0 +1,12 @@
+{ self, ... }:
+{
+  imports = [
+    self.modules.nixos.mcl-host-info
+  ];
+
+  mcl.host-info = {
+    type = "server";
+    isDebugVM = true;
+    configPath = ./.;
+  };
+}
diff --git a/machines/modules/mcl-secrets.nix b/machines/modules/mcl-secrets.nix
new file mode 100644
index 00000000..0db51c68
--- /dev/null
+++ b/machines/modules/mcl-secrets.nix
@@ -0,0 +1,8 @@
+{ self, ... }:
+{
+  imports = [
+    self.modules.nixos.mcl-secrets
+  ];
+
+  #TODO: Figure out the arguments for this service and enable it
+}
diff --git a/machines/modules/pharos.nix b/machines/modules/pharos.nix
new file mode 100644
index 00000000..d32367ec
--- /dev/null
+++ b/machines/modules/pharos.nix
@@ -0,0 +1,11 @@
+{ self, ... }:
+{
+  imports = [
+    self.modules.nixos.pharos
+  ];
+
+  services.pharos = {
+    enable = true;
+    network = "testnet";
+  };
+}
diff --git a/machines/modules/pyroscope.nix b/machines/modules/pyroscope.nix
new file mode 100644
index 00000000..39af965f
--- /dev/null
+++ b/machines/modules/pyroscope.nix
@@ -0,0 +1,10 @@
+{ self, ... }:
+{
+  imports = [
+    self.modules.nixos.pyroscope
+  ];
+
+  services.pyroscope = {
+    enable = true;
+  };
+}
diff --git a/machines/modules/random-alerts.nix b/machines/modules/random-alerts.nix
new file mode 100644
index 00000000..e71cf17f
--- /dev/null
+++ b/machines/modules/random-alerts.nix
@@ -0,0 +1,8 @@
+{ self, ... }:
+{
+  imports = [
+    self.modules.nixos.random-alerts
+  ];
+
+  #TODO: Figure out the arguments for this service and enable it
+}
diff --git a/modules/default.nix b/modules/default.nix
index 5698129d..2e4cd37f 100644
--- a/modules/default.nix
+++ b/modules/default.nix
@@ -9,5 +9,6 @@
     ./secrets.nix
     ./mcl-disko
     ./pharos
+    ./healthcheck
   ];
 }
diff --git a/modules/healthcheck/default.nix b/modules/healthcheck/default.nix
new file mode 100644
index 00000000..f9a34d0f
--- /dev/null
+++ b/modules/healthcheck/default.nix
@@ -0,0 +1,251 @@
+{ ... }:
+{
+  flake.modules.nixos.healthcheck =
+    {
+      config,
+      lib,
+      pkgs,
+      ...
+    }:
+    let
+      inherit (lib) types;
+      # Option set shared by both probes; x is "liveness" or "readiness".
+      mkProbeOptions = x: {
+        options =
+          {
+            enable = lib.mkEnableOption "the ${x} probe";
+
+            command = lib.mkOption {
+              type = types.str;
+              description = "The command to execute for the ${x} check. Any necessary programs should be added to the healthcheck.runtimePackages option.";
+            };
+
+            initialDelay = lib.mkOption {
+              type = types.int;
+              default = 15;
+              description = "Seconds to wait after the service is up before the first ${x} probe.";
+            };
+
+            interval = lib.mkOption {
+              type = types.int;
+              default = if x == "liveness" then 30 else 2;
+              description = "How often (in seconds) to perform the ${x} probe.";
+            };
+
+            timeout = lib.mkOption {
+              type = types.int;
+              default = 10;
+              description = "Seconds after which the ${x} probe command times out.";
+            };
+
+            # TODO: `{success,failure}_threshold`
+
+            retryCount = lib.mkOption {
+              type = types.int;
+              default = 10;
+              description = "Number of times to retry the ${x} probe before considering it failed. (-1 means infinite retries)";
+            };
+          }
+          // lib.optionalAttrs (x == "readiness") {
+            statusWaitingMessage = lib.mkOption {
+              type = types.str;
+              default = "Service starting, waiting for ready signal...";
+              description = "The status message to send to systemd while waiting.";
+            };
+
+            statusReadyMessage = lib.mkOption {
+              type = types.str;
+              default = "Service is ready.";
+              description = ''
+                The status message to send when the service is ready.
+                Use %OUTPUT% to substitute the output of the check command.
+              '';
+            };
+          };
+      };
+
+      # Options for the liveness probe (timer-based check)
+      livenessProbeOptions = mkProbeOptions "liveness";
+
+      # Options for the readiness probe (notify-based check)
+      readinessProbeOptions = mkProbeOptions "readiness";
+    in
+    {
+      config =
+        let
+          servicesWithHealthcheck = lib.filterAttrs (
+            _name: service: service.healthcheck != null
+          ) config.mcl.services;
+        in
+        {
+          assertions = lib.pipe config.mcl.services [
+            (lib.filterAttrs (_: service: service.healthcheck != null))
+            (lib.mapAttrsToList (
+              name: _:
+              let
+                serviceConfig = config.systemd.services.${name}.serviceConfig;
+              in
+              {
+                # NOTE: as per systemd.service(5), only Type=simple/idle units
+                # are considered started immediately, which is what allows a
+                # readiness probe to be attached via ExecStartPost.
+                assertion = lib.elem serviceConfig.Type [
+                  "simple"
+                  "idle"
+                ];
+                message = ''
+                  Service ${name} is not of type "simple" or "idle", but ${serviceConfig.Type}.
+                  Cannot attach a readiness probe to it.
+                '';
+              }
+            ))
+          ];
+          systemd = {
+            services =
+              let
+                mainServices = lib.mapAttrs (
+                  mainServiceName: serviceConfig:
+                  let
+                    cfg = serviceConfig.healthcheck;
+                    probeCfg = cfg.readiness-probe;
+                  in
+                  lib.mkIf (cfg != null && probeCfg.enable) {
+                    # Timeout is now handled manually by the new `ExecStartPost`
+                    serviceConfig.TimeoutStartSec = "infinity";
+
+                    # Add an `ExecStartPost` with a script that runs the readiness probe
+                    # WARN: cannot assure that there is no `ExecStartPost` in the original `serviceConfig`
+                    # (in order to avoid overriding/duplication)
+                    serviceConfig.ExecStartPost =
+                      let
+                        scriptPath = lib.makeBinPath (cfg.runtimePackages ++ (serviceConfig.path or [ ]));
+                      in
+                      lib.getExe (
+                        pkgs.writeShellScriptBin "${mainServiceName}-readiness-check" ''
+                          set -o nounset
+
+                          export PATH="${scriptPath}:$PATH"
+
+                          echo "Health check: starting background readiness probe for ${mainServiceName}."
+                          sleep ${toString probeCfg.initialDelay}
+                          retryCount=${toString probeCfg.retryCount}
+                          while true; do
+                            if (timeout ${toString probeCfg.timeout}s ${probeCfg.command} &> /dev/null); then
+                              echo "Health check: probe successful. Notifying systemd that the service is ready."
+                              exit 0
+                            else
+                              echo "Health check: probe not successful. Notifying systemd that the service is still waiting. Retrying in ${toString probeCfg.interval} seconds..."
+                              if [[ ''${retryCount} -ne -1 ]]; then
+                                retryCount=$((retryCount - 1))
+                                if [[ ''${retryCount} -le 0 ]]; then
+                                  echo "Health check: probe failed after maximum retries. Exiting."
+                                  exit 1
+                                fi
+                              fi
+                            fi
+                            sleep ${toString probeCfg.interval}
+                          done
+                        ''
+                      );
+                  }
+                ) servicesWithHealthcheck;
+
+                healthCheckServices = lib.mapAttrs' (
+                  mainServiceName: serviceConfig:
+                  let
+                    cfg = serviceConfig.healthcheck;
+                  in
+                  {
+                    name = "${mainServiceName}-liveness-check";
+                    value = lib.mkIf (cfg != null && cfg.liveness-probe.enable) (
+                      let
+                        probeCfg = cfg.liveness-probe;
+                        # writeShellScriptBin already emits a shebang, so the
+                        # body must not begin with one of its own.
+                        checkScript = pkgs.writeShellScriptBin "liveness-check" ''
+                          echo "Executing liveness probe for ${mainServiceName}..."
+                          if ! (timeout ${toString probeCfg.timeout}s ${probeCfg.command} &> /dev/null); then
+                            echo "Liveness probe for ${mainServiceName} failed. Triggering restart..."
+                            ${lib.getExe' pkgs.systemd "systemctl"} restart ${lib.escapeShellArg mainServiceName}.service
+                            exit 1
+                          fi
+                          echo "Liveness probe for ${mainServiceName} successful."
+                        '';
+                      in
+                      {
+                        description = "Liveness check for ${mainServiceName}";
+                        path = cfg.runtimePackages;
+                        serviceConfig = {
+                          Type = "oneshot";
+                          ExecStart = "${lib.getExe checkScript}";
+                        };
+                      }
+                    );
+                  }
+                ) servicesWithHealthcheck;
+              in
+              mainServices // healthCheckServices;
+
+            timers = lib.mapAttrs' (
+              mainServiceName: serviceConfig:
+              let
+                cfg = serviceConfig.healthcheck;
+              in
+              {
+                name = "${mainServiceName}-liveness-check";
+                value = lib.mkIf (cfg != null && cfg.liveness-probe.enable) (
+                  let
+                    probeCfg = cfg.liveness-probe;
+                  in
+                  {
+                    description = "Timer for ${mainServiceName} liveness probe";
+                    wantedBy = [ "timers.target" ];
+                    timerConfig = {
+                      OnActiveSec = "${toString probeCfg.initialDelay}s";
+                      OnUnitInactiveSec = "${toString probeCfg.interval}s";
+                    };
+                  }
+                );
+              }
+            ) servicesWithHealthcheck;
+          };
+        };
+
+      options.mcl.services = lib.mkOption {
+        default = { };
+        type = types.attrsOf (
+          types.submodule {
+            options = {
+              healthcheck = lib.mkOption {
+                default = null;
+                description = "Declarative health checks for this systemd service.";
+                type = types.nullOr (
+                  types.submodule {
+                    options = {
+                      # Programs to add to the PATH for the health check.
+                      runtimePackages = lib.mkOption {
+                        type = types.listOf types.package;
+                        default = [ ];
+                        description = "Additional programs to add to the PATH for health checks.";
+                      };
+
+                      # The new readiness probe that uses the notify pattern.
+                      readiness-probe = lib.mkOption {
+                        type = types.submodule readinessProbeOptions;
+                        default = { };
+                      };
+
+                      # The liveness probe (timer-based).
+                      liveness-probe = lib.mkOption {
+                        type = types.submodule livenessProbeOptions;
+                        default = { };
+                      };
+                    };
+                  }
+                );
+              };
+            };
+          }
+        );
+      };
+    };
+}