From 71c06d02dae502f15f0bca63627770030ae5edb2 Mon Sep 17 00:00:00 2001 From: Rodrigo Arias Mallo Date: Tue, 6 Apr 2021 15:23:26 +0200 Subject: [PATCH] stages: add baywatch stage to check the exit code This workaround stage prevents srun from returning 0 to the upper stages when a signal happens after MPI_Finalize. It writes the return code to a file named .srun.rc.$rank and later checks that the file exists and contains a 0. When the program is killed, it exits with a non-zero code and the error is propagated to the baywatch stage, which aborts immediately without creating the rc file. --- garlic/index.nix | 1 + garlic/stages/baywatch.nix | 26 ++++++++++++++++++++++++++ garlic/stages/srun.nix | 15 +++++++++++++++ garlic/stdexp.nix | 6 +++++- 4 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 garlic/stages/baywatch.nix diff --git a/garlic/index.nix b/garlic/index.nix index b73c86d..ed8e2d8 100644 --- a/garlic/index.nix +++ b/garlic/index.nix @@ -76,6 +76,7 @@ stages = { sbatch = callPackage ./stages/sbatch.nix { }; srun = callPackage ./stages/srun.nix { }; + baywatch = callPackage ./stages/baywatch.nix { }; control = callPackage ./stages/control.nix { }; exec = callPackage ./stages/exec.nix { }; script = callPackage ./stages/script.nix { }; diff --git a/garlic/stages/baywatch.nix b/garlic/stages/baywatch.nix new file mode 100644 index 0000000..2b72f4d --- /dev/null +++ b/garlic/stages/baywatch.nix @@ -0,0 +1,26 @@ +{ + stdenv +, garlicTools +}: +{ + nextStage +}: + +with garlicTools; + +stdenv.mkDerivation rec { + name = "baywatch"; + phases = [ "installPhase" ]; + preferLocalBuild = true; + dontPatchShebangs = true; + installPhase = '' + cat > $out <<'EOF' + #!/bin/sh -e + + ${stageProgram nextStage} + echo $? 
>> .srun.rc.$SLURM_PROCID + + EOF + chmod +x $out + ''; +} diff --git a/garlic/stages/srun.nix b/garlic/stages/srun.nix index 57fc667..a2481a5 100644 --- a/garlic/stages/srun.nix +++ b/garlic/stages/srun.nix @@ -35,6 +35,21 @@ stdenv.mkDerivation rec { ${srunOptions} \ ${nixPrefix}${stageProgram nextStage} + >&2 echo srun exit code: $? + + # Ensure that none failed, as srun fails to capture errors + # after MPI_Finalize + for i in $(seq 0 $(($SLURM_NTASKS - 1))); do + if [ ! -e .srun.rc.$i ]; then + >&2 echo "missing exit code for rank $i, aborting" + exit 1 + fi + if ! grep -q '^0$' .srun.rc.$i; then + >&2 echo "non-zero exit for rank $i, aborting" + exit 1 + fi + done + ${postSrun} EOF diff --git a/garlic/stdexp.nix b/garlic/stdexp.nix index 33a18b1..4fa4278 100644 --- a/garlic/stdexp.nix +++ b/garlic/stdexp.nix @@ -99,13 +99,17 @@ rec { inherit nextStage; } ); + + baywatch = {nextStage, ...}: stages.baywatch { + inherit nextStage; + }; }; stdPipelineOverride = {overrides ? {}}: let stages = stdStages // overrides; in - with stages; [ sbatch isolate control srun isolate ]; + with stages; [ sbatch isolate control srun isolate baywatch ]; stdPipeline = stdPipelineOverride {};