# This test compares a FWI version with poor data locality (+NOREUSE) against
# the optimized version (used for all other experiments). The pseudocode
# snippet below illustrates the fundamental difference between the two versions.
#
# NOREUSE
# ----------------------
# for (y) for (x) for (z)
#   computA(v[y][x][z]);
# for (y) for (x) for (z)
#   computB(v[y][x][z]);
# for (y) for (x) for (z)
#   computC(v[y][x][z]);
#
# Optimized version
# ----------------------
# for (y) for (x) for (z)
#   computA(v[y][x][z]);
#   computB(v[y][x][z]);
#   computC(v[y][x][z]);

# Arguments of this experiment, as referenced in this file:
#   stdenv        - its lib attribute is brought into scope below
#   stdexp        - provides genExperiment, which builds the final result
#   bsc           - provides apps.fwi.input, overridden for each unit
#   targetMachine - provides the machine config and the fs paths
#   stages        - not referenced directly in this file
#   callPackage   - used to load ./common.nix
{
  stdenv
, stdexp
, bsc
, targetMachine
, stages
, callPackage
}:

# Make every attribute of stdenv.lib available by its bare name
with stdenv.lib;

let

  # File system paths of the target machine (fs.local.temp is used below)
  inherit (targetMachine) fs;

  # Variables explored by the experiment; each attribute lists the values
  # that the variable can take.
  varConf = {
    # Optimized FWI branch and its +NOREUSE counterpart
    gitBranch = [
      "garlic/mpi+send+oss+task"
      "garlic/mpi+send+oss+task+NOREUSE"
    ];

    blocksize = [ 1 2 4 8 ];

    # Problem sizes
    n = [
      { nx = 300; ny = 2000; nz = 300; } # / half node
    ];
  };

  # Configuration of the target machine: it is the base of every unit
  # configuration and also supplies the hardware description (hw).
  machineConfig = targetMachine.config;

  # Build the complete configuration of one unit: the machine configuration
  # updated with the attributes below (which win when a name collides).
  # The argument c carries one gitBranch, one blocksize and one problem size
  # n; presumably one combination of the varConf values produced by
  # getConfigs -- TODO confirm in ./common.nix.
  genConf = c: machineConfig // rec {
    # Variables selected for this unit
    inherit (c) gitBranch blocksize;
    inherit (c.n) nx ny nz;

    # Names of the experiment, the unit and the job
    expName = "fwi-reuse";
    unitName = "${expName}-bs${toString blocksize}-${toString gitBranch}";
    jobName = unitName;

    # FWI input overridden with the problem size of this unit
    fwiInput = bsc.apps.fwi.input.override {
      inherit nx ny nz;
    };

    # Repeat the execution of each unit several times
    loops = 10;

    # Resources: one task on a single node, with as many CPUs as one socket
    # has according to the machine hardware description
    inherit (machineConfig) hw;
    nodes = 1;
    ntasksPerNode = 1;
    cpusPerTask = hw.cpusPerSocket;
    qos = "debug";
    time = "02:00:00";

    enableCTF = false;
    ioFreq = -1;

    # Enable permissions to write in the local storage
    tempDir = fs.local.temp;
    extraMounts = [ fs.local.temp ];
  };

  # Helpers loaded from ./common.nix; this file uses its getConfigs function
  # and its pipeline.
  common = callPackage ./common.nix { };

  inherit (common) pipeline;

  # Unit configurations obtained by passing varConf and genConf to getConfigs
  configs = common.getConfigs {
    inherit genConf varConf;
  };

in

  # Result of this file: the experiment generated by stdexp.genExperiment
  # from the unit configurations and the pipeline taken from ./common.nix.
  stdexp.genExperiment {
    inherit configs pipeline;
  }