71c06d02da
This workaround stage prevents srun from returning 0 to the upper stages when a signal happens after MPI_Finalize. It writes the return code to a file named .srun.rc.$rank, which is later checked to exist and to contain a 0. When the program is killed, it exits with a non-zero code and the error is propagated to the baywatch stage, which aborts immediately without creating the rc file.
59 lines
1.0 KiB
Nix
59 lines
1.0 KiB
Nix
# srun stage: builds an executable /bin/sh script that launches the next
# stage through Slurm srun and then verifies the per-rank exit code files
# named .srun.rc.<rank> in the current directory.
#
# Package arguments:
#   stdenv      - provides mkDerivation
#   slurm       - package whose bin/srun is invoked
#   garlicTools - attribute set opened with `with`; presumably the source of
#                 stageProgram (TODO confirm)
#
# Stage arguments:
#   nextStage   - stage launched by srun, resolved with stageProgram
#   cpuBind     - value passed to --cpu-bind
#   nixPrefix   - string prepended to the next stage program path
#   preSrun     - shell snippet inserted before the srun call (default "")
#   postSrun    - shell snippet inserted after the checks (default "")
#   srunOptions - extra srun options, inserted unquoted (default "")
#   output      - value passed to --output (default "stdout.log")
#   error       - value passed to --error (default "stderr.log")
#
# The .srun.rc.<rank> files are not written here; they are expected to be
# produced by a later stage running under srun.
#
# NOTE(review): .srun.rc.* files left over from an earlier run in the same
# directory would satisfy the checks; confirm that every run starts in a
# fresh directory.
{
  stdenv
, slurm
, garlicTools
}:

{
  nextStage
, cpuBind
, nixPrefix
, preSrun ? ""
, postSrun ? ""
, srunOptions ? ""
, output ? "stdout.log"
, error ? "stderr.log"
}:

with garlicTools;

stdenv.mkDerivation {
  name = "srun";
  phases = [ "installPhase" ];
  preferLocalBuild = true;
  dontPatchShebangs = true;

  # The heredoc delimiter is quoted so the shell variables of the generated
  # script are not expanded at build time; only the Nix interpolations are.
  installPhase = ''
    cat > $out <<'EOF'
    #!/bin/sh
    # Abort on the first unhandled failure. This is set here rather than in
    # the shebang so it also applies when the script is run as "sh script".
    set -e

    ${preSrun}

    # Capture the srun status instead of letting "set -e" abort right away,
    # so the exit code is reported on failures as well.
    srun_rc=0
    ${slurm}/bin/srun \
      --mpi=pmi2 \
      --cpu-bind=${cpuBind} \
      --output=${output} \
      --error=${error} \
      ${srunOptions} \
      ${nixPrefix}${stageProgram nextStage} || srun_rc=$?

    >&2 echo "srun exit code: $srun_rc"

    # Propagate the srun failure unchanged to the upper stages.
    if [ "$srun_rc" -ne 0 ]; then
      exit "$srun_rc"
    fi

    # Ensure that none failed, as srun fails to capture errors
    # after MPI_Finalize
    if [ -z "$SLURM_NTASKS" ]; then
      # Without the task count there is nothing to iterate over; say so
      # instead of silently skipping the verification.
      >&2 echo "warning: SLURM_NTASKS is not set, skipping rank exit code checks"
    else
      i=0
      while [ "$i" -lt "$SLURM_NTASKS" ]; do
        if [ ! -e ".srun.rc.$i" ]; then
          >&2 echo "missing exit code for rank $i, aborting"
          exit 1
        fi
        if ! grep -q '^0$' ".srun.rc.$i"; then
          >&2 echo "non-zero exit for rank $i, aborting"
          exit 1
        fi
        i=$((i + 1))
      done
    fi

    ${postSrun}
    EOF

    chmod +x $out
  '';
}