2024-02-24 15:21:47 +08:00
|
|
|
inputs:
|
|
|
|
{
|
|
|
|
# Option declarations for the single-node Slurm setup implemented in `config` below.
options.nixos.services.slurm =
  let
    inherit (inputs.lib) mkOption types;
    # Every numeric option here is a non-negative integer.
    unsigned = types.ints.unsigned;
  in
  {
    # Master switch: the whole `config` section is guarded by this flag.
    enable = mkOption { type = types.bool; default = false; };

    # CPU layout of the node; rendered as CoresPerSocket / ThreadsPerCore on the NodeName line.
    cpu.cores = mkOption { type = unsigned; };
    cpu.threads = mkOption { type = unsigned; default = 1; };

    # Rendered as RealMemory on the NodeName line.
    memoryMB = mkOption { type = unsigned; };

    # Attribute name is used as the GPU "Type", value as its count
    # (see the Gres= node entry and the generated gres.conf).
    gpus = mkOption { type = types.attrsOf unsigned; };
  };
|
|
|
|
# Implementation of the single-node Slurm service: controller, slurmdbd and
# compute daemon all run on localhost. Only active when
# `nixos.services.slurm.enable` is true.
config =
  let
    inherit (inputs.config.nixos.services) slurm;

    # GPU declarations as a list of { name; value; } entries (name = GPU type, value = count).
    gpuList = inputs.localLib.attrsToList slurm.gpus;

    # Comma-separated Gres spec for the NodeName line, e.g. "gpu:<type>:<count>,gpu:<type>:<count>".
    nodeGres = builtins.concatStringsSep ","
      (builtins.map (gpu: "gpu:${gpu.name}:${builtins.toString gpu.value}") gpuList);

    # One gres.conf line per GPU type.
    gresConfLines = builtins.concatStringsSep "\n"
      (builtins.map (gpu: "Name=gpu Type=${gpu.name} Count=${builtins.toString gpu.value}") gpuList);

    # Script body for TaskProlog: each line it prints as "export NAME=value"
    # is meant to be added to the task environment.
    taskProlog =
      ''
        echo export CUDA_DEVICE_ORDER=PCI_BUS_ID
        echo export SLURM_THREADS_PER_CPU=${builtins.toString slurm.cpu.threads}
      '';
  in inputs.lib.mkIf slurm.enable
  {
    services =
    {
      slurm =
      {
        server.enable = true;
        # Rebuild Slurm with GTK2 enabled and with the NVML development package
        # available; the linker is pointed at the NVML stub directory.
        # NOTE(review): presumably GTK2 is for sview and NVML for AutoDetect=nvml — confirm.
        package = (inputs.pkgs.slurm.override { enableGtk2 = true; }).overrideAttrs
          (prev: let inherit (inputs.pkgs.cudaPackages) cuda_nvml_dev; in
          {
            buildInputs = prev.buildInputs ++ [ cuda_nvml_dev ];
            LDFLAGS = [ "-L${cuda_nvml_dev}/lib/stubs" ];
            nativeBuildInputs = prev.nativeBuildInputs ++ [ inputs.pkgs.wrapGAppsHook ];
          });
        clusterName = inputs.config.nixos.system.networking.hostname;
        dbdserver =
        {
          enable = true;
          dbdHost = "localhost";
          # Password file is the sops secret "slurm/db" declared further down.
          storagePassFile = inputs.config.sops.secrets."slurm/db".path;
          extraConfig =
          ''
            StorageHost=*
            StorageLoc=slurm
          '';
        };
        client.enable = true;
        controlMachine = "localhost";
        # Single node definition. The Gres= token is only emitted when at least
        # one GPU is declared: with an empty `gpus` set the original produced a
        # bare "Gres=" followed by " State=UNKNOWN", which is not a well-formed
        # key=value pair.
        nodeName = inputs.lib.singleton (builtins.concatStringsSep " "
          (
            [
              "localhost"
              "RealMemory=${builtins.toString slurm.memoryMB}"
              "Sockets=1"
              "CoresPerSocket=${builtins.toString slurm.cpu.cores}"
              "ThreadsPerCore=${builtins.toString slurm.cpu.threads}"
            ]
            ++ inputs.lib.optional (gpuList != []) "Gres=${nodeGres}"
            ++ [ "State=UNKNOWN" ]
          ));
        partitionName = [ "localhost Nodes=localhost Default=YES MaxTime=INFINITE State=UP" ];
        procTrackType = "proctrack/cgroup";
        extraConfig =
        ''
          SelectType=select/cons_tres
          SelectTypeParameters=CR_Core
          GresTypes=gpu
          TaskProlog=${inputs.pkgs.writeShellScript "set_env" taskProlog}

          AccountingStorageType=accounting_storage/slurmdbd
          AccountingStorageHost=localhost
          AccountingStoreFlags=job_comment,job_env,job_extra,job_script

          JobCompType=jobcomp/filetxt
          JobCompLoc=/var/log/slurmctld/jobcomp.log

          SchedulerParameters=enable_user_top

          SlurmdDebug=debug2
        '';
        # gres.conf: NVML autodetection plus one explicit line per declared GPU type.
        extraConfigPaths =
          [ (inputs.pkgs.writeTextDir "gres.conf" "AutoDetect=nvml\n${gresConfLines}") ];
      };
      # Munge key comes from the sops secret "munge.key" declared further down.
      munge = { enable = true; password = inputs.config.sops.secrets."munge.key".path; };
    };
    systemd =
    {
      # Give slurmd the CUDA toolkit location and the NVIDIA driver libraries.
      services.slurmd.environment =
      {
        CUDA_PATH = "${inputs.pkgs.cudatoolkit}";
        LD_LIBRARY_PATH = "${inputs.config.hardware.nvidia.package}/lib";
      };
      # Directory that holds the JobCompLoc log configured above.
      tmpfiles.rules = [ "d /var/log/slurmctld 700 slurm slurm" ];
    };
    sops =
    {
      secrets =
      {
        # Binary munge key stored next to the default sops file, owned by the munged service user.
        "munge.key" =
        {
          format = "binary";
          sopsFile = "${builtins.dirOf inputs.config.sops.defaultSopsFile}/munge.key";
          owner = inputs.config.systemd.services.munged.serviceConfig.User;
        };
        # Database password for slurmdbd, read from the "mariadb/slurm" key.
        "slurm/db" = { owner = "slurm"; key = "mariadb/slurm"; };
      };
    };
    # NOTE(review): project-local mariadb module; presumably provisions the "slurm"
    # database/user matching StorageLoc=slurm — confirm against that module.
    nixos.services.mariadb = { enable = true; instances.slurm = {}; };
  };
|
|
|
|
}
|