bscpkgs/garlic/stages/srun.nix
Rodrigo Arias Mallo 71c06d02da stages: add baywatch stage to check the exit code
This workaround stage prevents srun from returning 0 to the upper stages
when a signal happens after MPI_Finalize. It writes the return code to a
file named .srun.rc.$rank and later checks that it exists and contains a 0.

When the program is killed, it exits with a non-zero code and the error is
propagated to the baywatch stage, which aborts immediately without
creating the rc file.
2021-04-16 09:29:26 +02:00

59 lines
1.0 KiB
Nix

# srun stage: builds an executable shell script that launches the next stage
# through Slurm's srun and then verifies the per-rank exit-code files.
#
# First argument set (dependencies):
#   stdenv      - provides mkDerivation
#   slurm       - package whose bin/srun is invoked
#   garlicTools - brought into scope with `with`; stageProgram is presumably
#                 provided by it (not defined in this file)
#
# Second argument set (stage configuration):
#   nextStage   - stage whose program (stageProgram nextStage) is run by srun
#   cpuBind     - value passed to srun --cpu-bind
#   nixPrefix   - string prepended to the path of the next stage program
#   preSrun     - shell code inserted before the srun call (default: none)
#   postSrun    - shell code inserted after the rank checks (default: none)
#   srunOptions - extra options appended to the srun command line
#   output      - value passed to srun --output (default: stdout.log)
#   error       - value passed to srun --error (default: stderr.log)
#
# The generated script reads .srun.rc.0 ... .srun.rc.(SLURM_NTASKS - 1) from
# its working directory and aborts unless each file exists and contains a
# line that is exactly "0". Those files are expected to be written by an
# inner stage that is not part of this file.
#
# NOTE(review): if SLURM_NTASKS is unset, or srunOptions changes the number of
# tasks, the loop below does not cover the ranks actually launched (with an
# empty SLURM_NTASKS it iterates zero times) -- confirm that the callers
# always provide a matching SLURM_NTASKS.
{
  stdenv
, slurm
, garlicTools
}:

{
  nextStage
, cpuBind
, nixPrefix
, preSrun ? ""
, postSrun ? ""
, srunOptions ? ""
, output ? "stdout.log"
, error ? "stderr.log"
}:

with garlicTools;

# No attribute refers to a sibling attribute, so a plain (non-rec) set is used.
stdenv.mkDerivation {
  name = "srun";
  phases = [ "installPhase" ];
  preferLocalBuild = true;
  dontPatchShebangs = true;

  # The heredoc delimiter is quoted, so the shell running installPhase copies
  # the text verbatim; only the Nix interpolations are expanded at build time.
  installPhase = ''
    cat > $out <<'EOF'
    #!/bin/sh -e
    ${preSrun}

    # The shell runs with -e, so a failing srun would abort the script before
    # its exit code could be reported. Capture the code explicitly, report it
    # and then abort with the very same code.
    rc=0
    ${slurm}/bin/srun \
      --mpi=pmi2 \
      --cpu-bind=${cpuBind} \
      --output=${output} \
      --error=${error} \
      ${srunOptions} \
      ${nixPrefix}${stageProgram nextStage} || rc=$?

    >&2 echo srun exit code: $rc
    if [ "$rc" -ne 0 ]; then
      exit "$rc"
    fi

    # Ensure that none failed, as srun fails to capture errors
    # after MPI_Finalize
    for i in $(seq 0 $(($SLURM_NTASKS - 1))); do
      if [ ! -e .srun.rc.$i ]; then
        >&2 echo "missing exit code for rank $i, aborting"
        exit 1
      fi
      if ! grep -q '^0$' .srun.rc.$i; then
        >&2 echo "non-zero exit for rank $i, aborting"
        exit 1
      fi
    done

    ${postSrun}
    EOF
    chmod +x $out
  '';
}