stages: add baywatch stage to check the exit code

This workaround stage prevents srun from returning 0 to the upper stages
when a signal happens after MPI_Finalize. It writes the return code to a
file named .srun.rc.$rank and later checks that it exists and contains a 0.

When the program is killed, it exits with a non-zero code and the error
is propagated to the baywatch stage, which aborts immediately without
creating the rc file.
This commit is contained in:
Rodrigo Arias Mallo 2021-04-06 15:23:26 +02:00
parent 604cfd90a3
commit 71c06d02da
4 changed files with 47 additions and 1 deletions

View File

@ -76,6 +76,7 @@
stages = {
sbatch = callPackage ./stages/sbatch.nix { };
srun = callPackage ./stages/srun.nix { };
baywatch = callPackage ./stages/baywatch.nix { };
control = callPackage ./stages/control.nix { };
exec = callPackage ./stages/exec.nix { };
script = callPackage ./stages/script.nix { };

View File

@ -0,0 +1,26 @@
# Baywatch stage.
#
# Builds a small /bin/sh wrapper script that runs the next stage and, only
# when that stage finishes successfully, records its exit code in the file
# .srun.rc.$SLURM_PROCID in the current working directory.
#
# Arguments:
#   stdenv, garlicTools - package-level dependencies; garlicTools provides
#                         stageProgram, used to get the program of a stage.
#   nextStage           - the stage whose program is run by the wrapper.
{
  stdenv
, garlicTools
}:

{
  nextStage
}:

with garlicTools;

stdenv.mkDerivation rec {
  name = "baywatch";
  phases = [ "installPhase" ];
  preferLocalBuild = true;
  dontPatchShebangs = true;

  # The heredoc delimiter is quoted, so the shell running installPhase does
  # not expand $? or $SLURM_PROCID; they are evaluated when the generated
  # script runs. Only the Nix antiquotation below is substituted at build time.
  installPhase = ''
    cat > $out <<'EOF'
    #!/bin/sh -e

    # Reset the rc file of this rank before starting, so that a 0 left by a
    # previous run in the same directory cannot be mistaken for the result
    # of this one. Only shell builtins are used here.
    : > ".srun.rc.$SLURM_PROCID"

    # With -e, a failing program aborts the script right here, leaving the
    # rc file without a 0.
    ${stageProgram nextStage}

    # Only reached when the program exited with 0.
    echo $? > ".srun.rc.$SLURM_PROCID"
    EOF
    chmod +x $out
  '';
}

View File

@ -35,6 +35,21 @@ stdenv.mkDerivation rec {
${srunOptions} \
${nixPrefix}${stageProgram nextStage}
>&2 echo srun exit code: $?
# Ensure that no rank failed. srun fails to capture errors that happen
# after MPI_Finalize, so its own exit code is not enough. Instead, inspect
# the .srun.rc.<rank> file of every rank, from 0 to SLURM_NTASKS - 1.
for i in $(seq 0 $(($SLURM_NTASKS - 1))); do
# A rank without an rc file is treated as failed.
if [ ! -e .srun.rc.$i ]; then
>&2 echo "missing exit code for rank $i, aborting"
exit 1
fi
# The rc file must contain a line consisting of exactly "0".
if ! grep -q '^0$' .srun.rc.$i; then
>&2 echo "non-zero exit for rank $i, aborting"
exit 1
fi
done
${postSrun}
EOF

View File

@ -99,13 +99,17 @@ rec {
inherit nextStage;
}
);
baywatch = {nextStage, ...}: stages.baywatch {
inherit nextStage;
};
};
stdPipelineOverride = {overrides ? {}}:
let
stages = stdStages // overrides;
in
with stages; [ sbatch isolate control srun isolate ];
with stages; [ sbatch isolate control srun isolate baywatch ];
stdPipeline = stdPipelineOverride {};