nixos/modules/services/slurm.nix

144 lines
5.1 KiB
Nix
Raw Normal View History

2024-02-24 15:21:47 +08:00
inputs:
{
# Option declarations for the single-node Slurm setup configured by this module.
options.nixos.services.slurm =
  let
    inherit (inputs.lib) mkOption types;
    # Unsigned integer with no default: the user has to provide a value.
    requiredUnsigned = mkOption { type = types.ints.unsigned; };
    # Unsigned integer that falls back to 1 when left unset.
    unsignedDefaultOne = mkOption { type = types.ints.unsigned; default = 1; };
  in
  {
    # Master switch; the whole `config` section is guarded by it.
    enable = mkOption { type = types.bool; default = false; };
    cpu =
    {
      # Rendered as Sockets= / CoresPerSocket= / ThreadsPerCore= in the node definition.
      sockets = unsignedDefaultOne;
      cores = requiredUnsigned;
      threads = unsignedDefaultOne;
      # Substituted into the sbatch-tui device.cpp template as CpuMpiThreads / CpuOpenmpThreads.
      mpiThreads = unsignedDefaultOne;
      openmpThreads = unsignedDefaultOne;
    };
    # Rendered as RealMemory= in the node definition.
    memoryMB = requiredUnsigned;
    # Attribute name is the GPU type, value is the count; used for Gres= and gres.conf.
    gpus = mkOption { type = types.attrsOf types.ints.unsigned; };
  };
config = let inherit (inputs.config.nixos.services) slurm; in inputs.lib.mkIf slurm.enable
{
services =
{
slurm =
{
# Turn on the Slurm server (controller) role on this machine.
server.enable = true;
2024-02-24 19:39:19 +08:00
# Slurm build used by the service: GTK2 enabled, plus the NVML development package.
package =
  let
    nvmlDev = inputs.pkgs.cudaPackages.cuda_nvml_dev;
    slurmWithGtk = inputs.pkgs.slurm.override { enableGtk2 = true; };
  in slurmWithGtk.overrideAttrs (old:
    {
      buildInputs = old.buildInputs ++ [ nvmlDev ];
      # Add the NVML stub library directory to the linker search path.
      # NOTE(review): presumably needed for the AutoDetect=nvml setting in gres.conf — confirm.
      LDFLAGS = [ "-L${nvmlDev}/lib/stubs" ];
      nativeBuildInputs = old.nativeBuildInputs ++ [ inputs.pkgs.wrapGAppsHook ];
    });
2024-02-24 15:21:47 +08:00
# The cluster is named after this machine's configured hostname.
clusterName = inputs.config.nixos.system.networking.hostname;
2024-02-25 15:04:44 +08:00
# Slurm accounting daemon settings: enabled, addressed as localhost, with the storage
# password read from the sops-managed "slurm/db" secret file.
# extraConfig contributes the StorageHost and StorageLoc directives.
# NOTE(review): the date-stamp line inside the ''…'' literal below is not a directive;
# it looks like blame-view residue from extraction — confirm against the repository.
dbdserver =
{
enable = true;
dbdHost = "localhost";
storagePassFile = inputs.config.sops.secrets."slurm/db".path;
extraConfig =
''
StorageHost=*
2024-02-25 15:04:44 +08:00
StorageLoc=slurm
'';
};
2024-02-24 15:21:47 +08:00
# This machine also takes the Slurm client role, and the controller is addressed as
# localhost — i.e. controller and client live on the same host.
client.enable = true;
controlMachine = "localhost";
2024-02-26 19:34:40 +08:00
# Single node definition for "localhost", assembled from the module's cpu/memory/gpu options.
nodeName =
  let
    # One "gpu:<type>:<count>" entry per configured GPU type.
    gresEntries = map
      (entry: "gpu:${entry.name}:${toString entry.value}")
      (inputs.localLib.attrsToList slurm.gpus);
    nodeFields =
      [
        "localhost"
        "RealMemory=${toString slurm.memoryMB}"
        "Sockets=${toString slurm.cpu.sockets}"
        "CoresPerSocket=${toString slurm.cpu.cores}"
        "ThreadsPerCore=${toString slurm.cpu.threads}"
        "Gres=${builtins.concatStringsSep "," gresEntries}"
        "State=UNKNOWN"
      ];
  # The option takes a list of node lines; this module emits exactly one.
  in [ (builtins.concatStringsSep " " nodeFields) ];
2024-02-24 15:21:47 +08:00
# One default partition, also called "localhost", containing only the local node.
partitionName =
  [
    (builtins.concatStringsSep " "
      [ "localhost" "Nodes=localhost" "Default=YES" "MaxTime=INFINITE" "State=UP" ])
  ];
# Track job processes through cgroups.
procTrackType = "proctrack/cgroup";
2024-02-24 19:39:19 +08:00
# Extra Slurm configuration directives.
# - `taskProlog` is a small shell snippet that echoes two `export …` lines
#   (CUDA_DEVICE_ORDER and SLURM_THREADS_PER_CPU, the latter taken from cpu.threads);
#   it is written out with writeShellScript as "set_env" and referenced by TaskProlog=.
# - The remaining directives select cons_tres/CR_Core scheduling, declare the gpu GRES
#   type, point accounting at slurmdbd on localhost, and log completed jobs to
#   /var/log/slurmctld/jobcomp.log (that directory is created by the tmpfiles rule
#   further down in this file).
# NOTE(review): the date-stamp lines interleaved below (including those inside the
# ''…'' literal) are not Slurm directives; they look like blame-view residue from
# extraction — confirm against the repository before relying on this text.
extraConfig =
2024-02-26 19:34:40 +08:00
let taskProlog =
''
echo export CUDA_DEVICE_ORDER=PCI_BUS_ID
echo export SLURM_THREADS_PER_CPU=${builtins.toString slurm.cpu.threads}
'';
in
''
SelectType=select/cons_tres
SelectTypeParameters=CR_Core
2024-02-26 19:34:40 +08:00
GresTypes=gpu
2024-03-22 20:32:46 +08:00
DefCpuPerGPU=1
2024-02-26 19:34:40 +08:00
TaskProlog=${inputs.pkgs.writeShellScript "set_env" taskProlog}
2024-02-25 15:04:44 +08:00
2024-02-26 19:34:40 +08:00
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=localhost
AccountingStoreFlags=job_comment,job_env,job_extra,job_script
2024-02-25 15:04:44 +08:00
2024-02-26 19:34:40 +08:00
JobCompType=jobcomp/filetxt
JobCompLoc=/var/log/slurmctld/jobcomp.log
SchedulerParameters=enable_user_top
SlurmdDebug=debug2
'';
# Ships a gres.conf: NVML auto-detection first, then one explicit line per GPU type.
extraConfigPaths =
  let
    gresLines = map
      (entry: "Name=gpu Type=${entry.name} Count=${toString entry.value}")
      (inputs.localLib.attrsToList slurm.gpus);
    gresConf = "AutoDetect=nvml\n${builtins.concatStringsSep "\n" gresLines}";
  in [ (inputs.pkgs.writeTextDir "gres.conf" gresConf) ];
2024-02-24 15:21:47 +08:00
};
# Munge is enabled, with its key taken from the sops-managed "munge.key" secret.
munge =
  {
    enable = true;
    password = inputs.config.sops.secrets."munge.key".path;
  };
};
2024-02-25 15:04:44 +08:00
# systemd glue: log directory for slurmctld and CUDA/NVIDIA environment for slurmd.
systemd =
  let nvidiaDriver = inputs.config.hardware.nvidia.package;
  in
  {
    # Directory that holds the job completion log configured via JobCompLoc.
    tmpfiles.rules = [ "d /var/log/slurmctld 700 slurm slurm" ];
    services.slurmd.environment =
      {
        CUDA_PATH = "${inputs.pkgs.cudatoolkit}";
        # Expose the NVIDIA driver package's libraries to slurmd.
        LD_LIBRARY_PATH = "${nvidiaDriver}/lib";
      };
  };
2024-02-24 15:21:47 +08:00
# Secrets consumed by this module: the munge key and the slurmdbd database password.
sops.secrets =
  let secretsDir = builtins.dirOf inputs.config.sops.defaultSopsFile;
  in
  {
    # Binary key file stored next to the default sops file, owned by the munged service user.
    "munge.key" =
      {
        format = "binary";
        sopsFile = "${secretsDir}/munge.key";
        owner = inputs.config.systemd.services.munged.serviceConfig.User;
      };
    # Read from the "mariadb/slurm" key and made readable by the slurm user.
    "slurm/db" =
      {
        owner = "slurm";
        key = "mariadb/slurm";
      };
  };
2024-06-08 21:03:33 +08:00
nixos =
{
# Installs sbatch-tui with its src/device.cpp generated from the module's cpu/gpu options.
packages._packages =
  let
    # Configured GPU type names, each wrapped in double quotes and joined with ", ".
    quotedGpuIds = builtins.concatStringsSep ", "
      (map (name: "\"${name}\"") (builtins.attrNames slurm.gpus));
    # overrideAttrs function: replaces `src` with a copy of the upstream tree in which
    # src/device.cpp is the template rendered with this machine's values.
    patchSource = old:
      let
        renderedDevice = inputs.pkgs.substituteAll
          {
            src = "${old.src}/src/device.cpp.template";
            CpuMpiThreads = slurm.cpu.mpiThreads;
            CpuOpenmpThreads = slurm.cpu.openmpThreads;
            GpuIds = quotedGpuIds;
          };
      in
      {
        # The copied tree is made writable so the rendered file can be placed into it.
        src = inputs.pkgs.runCommand "src" {}
          ''cp -r ${old.src} $out; chmod +w -R $out; cp ${renderedDevice} $out/src/device.cpp'';
      };
  in [ (inputs.pkgs.localPackages.sbatch-tui.overrideAttrs patchSource) ];
# Per-user `sbatch` wrapper, added to home.packages through the shared user module.
# - With no arguments AND an interactive stdin, it opens sbatch-tui.
# - Otherwise it hands over to the system-wide sbatch with the arguments untouched.
# The terminal check matters because plain `sbatch` with no arguments reads the batch
# script from standard input; without it, `sbatch < job.sh` or `sbatch <<EOF` would be
# diverted into the TUI instead of submitting the job.
# `exec` replaces the wrapper shell, so the real sbatch receives signals and returns
# its exit status directly.
user.sharedModules = [{ home.packages =
  [
    (inputs.pkgs.writeShellScriptBin "sbatch"
      ''
        if [ "$#" -eq 0 ] && [ -t 0 ]; then
          sbatch-tui
        else
          exec /run/current-system/sw/bin/sbatch "$@"
        fi
      '')
  ];}];
# Enable this repository's mariadb module and declare a "slurm" instance with default settings.
# NOTE(review): presumably this provisions the database that slurmdbd uses — confirm in the mariadb module.
services.mariadb =
  {
    enable = true;
    instances.slurm = {};
  };
};
2024-02-24 15:21:47 +08:00
};
}