fwi: generate the model in every node

As we are using local storage, we need a copy of the input in every
node. The current method is to run the generator only in the rank that
has CPU 0 assigned in its CPU mask, so it runs once per node.
This commit is contained in:
Rodrigo Arias Mallo 2021-04-12 19:01:10 +02:00
parent 58dc277d3d
commit 99beac9b23

View File

@ -40,67 +40,70 @@ rec {
ntasksPerNode = hw.cpusPerNode;
};
srun = {nextStage, conf, ...}:
exec = {nextStage, conf, ...}:
let
fwiParams = bsc.apps.fwi.params.override {
inherit (conf) nx ny nz;
};
in
stdexp.stdStages.srun {
inherit nextStage conf;
# Now we add some commands to execute before calling srun. These will
# only run in one rank (the first in the list of allocated nodes)
preSrun = ''
export GARLIC_FWI_SRUNDIR=$(pwd)
export GARLIC_FWI_EXECDIR="${conf.tempDir}/out/$GARLIC_USER/$GARLIC_UNIT/$GARLIC_RUN"
mkdir -p "$GARLIC_FWI_EXECDIR"
export GARLIC_FWI_PARAMS="${fwiParams}/fwi_params.txt"
export GARLIC_FWI_FREQ="${fwiParams}/fwi_frequencies.txt"
# We cannot change the working directory of srun, so we use a
# subshell to ignore the cd
(
# Generate the input dataset
>&2 echo "generating the input dataset"
cd "$GARLIC_FWI_EXECDIR"
${fwiParams}/bin/ModelGenerator \
-m "$GARLIC_FWI_PARAMS" "$GARLIC_FWI_FREQ"
)
'';
postSrun = optionalString (conf.enableCTF) ''
# Save the traces
mv "$GARLIC_FWI_EXECDIR"/trace_* .
'' + ''
# Remove everything else
rm -rf "$GARLIC_FWI_EXECDIR"
'';
};
exec = {nextStage, conf, ...}: stages.exec {
in stages.exec {
inherit nextStage;
# FIXME: FWI should allow the I/O directory to be specified as a
# parameter
pre = ''
# Run fwi at the in a directory with fast local storage
cd "$GARLIC_FWI_EXECDIR"
FWI_SRUNDIR=$(pwd)
FWI_EXECDIR="${conf.tempDir}/out/$GARLIC_USER/$GARLIC_UNIT/$GARLIC_RUN"
FWI_PARAMS="${fwiParams}/fwi_params.txt"
FWI_FREQ="${fwiParams}/fwi_frequencies.txt"
# Run fwi in a directory with fast local storage
mkdir -p "$FWI_EXECDIR"
cd "$FWI_EXECDIR"
# Only generate the input if we have the CPU 0 (once per node)
if grep -o 'Cpus_allowed_list:[[:space:]]0' \
/proc/self/status > /dev/null;
then
FWI_CAPTAIN=1
fi
if [ $FWI_CAPTAIN ]; then
>&2 echo "generating the input dataset"
${fwiParams}/bin/ModelGenerator -m "$FWI_PARAMS" "$FWI_FREQ"
fi
echo >&2 "Current dir: $(pwd)"
echo >&2 "Using PARAMS=$GARLIC_FWI_PARAMS and FREQ=$GARLIC_FWI_FREQ"
echo >&2 "Using PARAMS=$FWI_PARAMS and FREQ=$FWI_FREQ"
'' + optionalString (conf.enableCTF) ''
export NANOS6_CONFIG_OVERRIDE="version.instrument=ctf"
'';
argv = [
''"$GARLIC_FWI_PARAMS"''
''"$GARLIC_FWI_FREQ"''
''"$FWI_PARAMS"''
''"$FWI_FREQ"''
] ++ optional (needsBlocksize conf) conf.blocksize ++ [
"-1" # Fordward steps
"-1" # Backward steps
conf.ioFreq # Write/read frequency
];
post = ''
# Go back to the garlic out directory
cd "$FWI_SRUNDIR"
if [ $FWI_CAPTAIN ]; then
'' + optionalString (conf.enableCTF) ''
# FIXME: We should specify the path in the nanos6 config, so we
# can avoid the race condition while they are generating the
# traces
sleep 3
# Save the traces
mv "$FWI_EXECDIR"/trace_* .
'' + ''
rm -rf "$FWI_EXECDIR"
fi
'';
};
apps = bsc.garlic.apps;
@ -117,8 +120,5 @@ rec {
inherit fwiParams;
};
pipeline = stdexp.stdPipelineOverride {
# Replace the stdandard srun stage with our own
overrides = { inherit srun; };
} ++ [ exec program ];
pipeline = stdexp.stdPipeline ++ [ exec program ];
}