nixos/modules/services/slurm.nix

inputs:
{
options.nixos.services.slurm = let inherit (inputs.lib) mkOption types; in
{
enable = mkOption { type = types.bool; default = false; };
# Whether this machine is the control node; if not, set this to the control node's hostname.
master = mkOption { type = types.nonEmptyStr; default = inputs.config.nixos.model.hostname; };
node = mkOption { type = types.attrsOf (types.submodule (submoduleInputs: { options =
{
# node name used inside slurm
name = mkOption { type = types.nonEmptyStr; };
address = mkOption { type = types.nonEmptyStr; };
cpu =
{
sockets = mkOption { type = types.ints.unsigned; default = 1; };
cores = mkOption { type = types.ints.unsigned; default = 1; };
threads = mkOption { type = types.ints.unsigned; default = 1; };
};
memoryGB = mkOption { type = types.ints.unsigned; default = 1024; };
gpus = mkOption { type = types.nullOr (types.attrsOf types.ints.unsigned); default = null; };
};}));};
partitions = mkOption { type = types.attrsOf (types.listOf types.nonEmptyStr); default = {}; };
defaultPartition = mkOption { type = types.nonEmptyStr; default = "localhost"; };
tui =
{
cpuQueues = mkOption
{
type = types.nonEmptyListOf (types.submodule (submoduleInputs: { options =
{
name = mkOption { type = types.nonEmptyStr; default = "localhost"; };
mpiThreads = mkOption { type = types.ints.unsigned; default = 1; };
openmpThreads = mkOption { type = types.ints.unsigned; default = 1; };
memoryGB = mkOption { type = types.nullOr types.ints.unsigned; default = null; };
allocateCpus = mkOption { type = types.nullOr types.ints.unsigned; default = null; };
};}));
};
gpuQueues = mkOption
{
type = types.nullOr (types.nonEmptyListOf (types.submodule (submoduleInputs: { options =
{
name = mkOption { type = types.nonEmptyStr; default = "localhost"; };
gpuIds = mkOption { type = types.nullOr (types.listOf types.nonEmptyStr); default = null; };
};})));
default = null;
};
};
# Whether to open the relevant firewall ports; required for multi-node deployments.
setupFirewall = mkOption { type = types.bool; default = false; };
};
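# A minimal sketch of a hypothetical two-node deployment; every name,
# address, and size below is illustrative, not a recommendation:
# nixos.services.slurm =
# {
#   enable = true;
#   master = "login";
#   node =
#   {
#     login = { name = "login"; address = "192.168.1.10"; cpu = { sockets = 2; cores = 24; threads = 2; }; memoryGB = 256; };
#     gpu1 = { name = "gpu1"; address = "192.168.1.11"; memoryGB = 512; gpus = { "4090" = 8; }; };
#   };
#   partitions = { cpu = [ "login" ]; gpu = [ "gpu1" ]; };
#   defaultPartition = "cpu";
#   tui.cpuQueues = [{ name = "cpu"; mpiThreads = 4; openmpThreads = 12; }];
#   setupFirewall = true;
# };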
config = let inherit (inputs.config.nixos.services) slurm; in inputs.lib.mkIf slurm.enable (inputs.lib.mkMerge
[
# worker configuration
{
services =
{
slurm =
{
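# Build slurm with GTK2 (for sview), link against the NVML stubs when CUDA is
# enabled so slurmd can autodetect GPUs, and additionally install the PMI and
# PMI2 client libraries from contribs for MPI launchers.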
package = (inputs.pkgs.slurm.override { enableGtk2 = true; }).overrideAttrs
(prev:
let
inherit (inputs.config.nixos.system.nixpkgs) cuda;
inherit (inputs.pkgs.cudaPackages) cuda_nvml_dev;
additionalInputs = inputs.lib.optionals (cuda != null) [ cuda_nvml_dev cuda_nvml_dev.lib ];
additionalFlags = inputs.lib.optional (cuda != null) "-L${cuda_nvml_dev.lib}/lib/stubs";
in
{
buildInputs = prev.buildInputs or [] ++ additionalInputs;
LDFLAGS = prev.LDFLAGS or [] ++ additionalFlags;
nativeBuildInputs = prev.nativeBuildInputs ++ [ inputs.pkgs.wrapGAppsHook ];
postInstall =
''
pushd contribs/pmi2
make install
popd
pushd contribs/pmi
make install
popd
'' + prev.postInstall;
}
);
client.enable = true;
nodeName = builtins.map
(node:
let gpuString =
if node.value.gpus == null then ""
else "Gres=" + builtins.concatStringsSep "," (builtins.map
(gpu: "gpu:${gpu.name}:${builtins.toString gpu.value}")
(inputs.localLib.attrsToList node.value.gpus));
in builtins.concatStringsSep " "
[
node.value.name
"NodeHostname=${node.name}"
"NodeAddr=${node.value.address}"
"RealMemory=${builtins.toString (node.value.memoryGB * 1024)}"
"Sockets=${builtins.toString node.value.cpu.sockets}"
"CoresPerSocket=${builtins.toString node.value.cpu.cores}"
"ThreadsPerCore=${builtins.toString node.value.cpu.threads}"
"${gpuString}"
"State=UNKNOWN"
])
(inputs.localLib.attrsToList slurm.node);
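# With the illustrative "gpu1" node above, this renders a slurm.conf entry roughly like:
# NodeName=gpu1 NodeHostname=gpu1 NodeAddr=192.168.1.11 RealMemory=524288 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 Gres=gpu:4090:8 State=UNKNOWN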
partitionName = builtins.map
(partition: builtins.concatStringsSep " "
[
partition.name
"Nodes=${builtins.concatStringsSep "," (builtins.map (n: slurm.node.${n}.name) partition.value)}"
"Default=${if partition.name == slurm.defaultPartition then "YES" else "NO"}"
"MaxTime=INFINITE"
"State=UP"
])
(inputs.localLib.attrsToList slurm.partitions);
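# With the illustrative partitions above, this renders entries roughly like:
# PartitionName=gpu Nodes=gpu1 Default=NO MaxTime=INFINITE State=UP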
procTrackType = "proctrack/cgroup";
controlMachine = slurm.master;
controlAddr = slurm.node.${slurm.master}.address;
extraConfig =
''
SelectType=select/cons_tres
SelectTypeParameters=CR_Core_Memory
GresTypes=gpu
DefCpuPerGPU=1
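# TaskProlog output of the form "export NAME=value" is injected into each task's environment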
TaskProlog=${inputs.pkgs.writeShellScript "set_env" "echo export CUDA_DEVICE_ORDER=PCI_BUS_ID"}
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=localhost
AccountingStoreFlags=job_comment,job_env,job_extra,job_script
JobCompType=jobcomp/filetxt
JobCompLoc=/var/log/slurmctld/jobcomp.log
SchedulerParameters=enable_user_top
SlurmdDebug=debug2
SlurmdParameters=l3cache_as_socket
DebugFlags=NO_CONF_HASH
# automatically resume node after drain
ReturnToService=2
# enable task plugins
TaskPlugin=task/affinity,task/cgroup
# make pmix the default MPI plugin so --mpi=pmix can be omitted
MpiDefault=pmix
# record more info
JobAcctGatherType=jobacct_gather/cgroup
AccountingStorageTRES=gres/gpu
PrologFlags=contain
# append to output file
JobFileAppend=1
'';
extraConfigPaths =
let gpus = slurm.node.${inputs.config.nixos.model.hostname}.gpus or null;
in inputs.lib.mkIf (gpus != null)
(
let gpuString = builtins.concatStringsSep "\n" (builtins.map
(gpu: "Name=gpu Type=${gpu.name} Count=${builtins.toString gpu.value}")
(inputs.localLib.attrsToList gpus));
in [(inputs.pkgs.writeTextDir "gres.conf" "AutoDetect=nvml\n${gpuString}")]
);
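# With the illustrative GPU node above, the generated gres.conf reads roughly:
# AutoDetect=nvml
# Name=gpu Type=4090 Count=8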
extraCgroupConfig =
''
ConstrainCores=yes
ConstrainRAMSpace=yes
ConstrainSwapSpace=yes
AllowedSwapSpace=20
# this makes jobs hang, not sure why
# ConstrainDevices=yes
'';
};
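# all slurm daemons authenticate each other through a shared munge key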
munge = { enable = true; password = inputs.config.sops.secrets."munge.key".path; };
};
systemd =
{
services.slurmd.environment =
let gpus = slurm.node.${inputs.config.nixos.model.hostname}.gpus or null;
in inputs.lib.mkIf (gpus != null)
{
CUDA_PATH = "${inputs.pkgs.cudatoolkit}";
LD_LIBRARY_PATH = "${inputs.config.hardware.nvidia.package}/lib";
};
};
sops.secrets."munge.key" =
{
format = "binary";
sopsFile = inputs.localLib.mkConditional (inputs.config.nixos.model.cluster == null)
"${builtins.dirOf inputs.config.sops.defaultSopsFile}/munge.key"
"${inputs.config.nixos.system.sops.clusterSopsDir}/munge.key";
owner = inputs.config.systemd.services.munged.serviceConfig.User;
};
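# slurmd listens on port 6818 by default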
networking.firewall =
let ports = inputs.lib.mkIf slurm.setupFirewall [ 6818 ];
in { allowedTCPPorts = ports; allowedUDPPorts = ports; };
environment.sessionVariables = { SLURM_UNBUFFEREDIO = "1"; SLURM_CPU_BIND = "v"; };
}
# master configuration
(inputs.lib.mkIf (slurm.master == inputs.config.nixos.model.hostname)
{
services.slurm =
{
server.enable = true;
dbdserver =
{
enable = true;
dbdHost = "localhost";
storagePassFile = inputs.config.sops.secrets."slurm/db".path;
extraConfig =
''
StorageHost=*
StorageLoc=slurm
'';
};
extraConfig =
''
PrologSlurmctld=${inputs.config.security.wrapperDir}/slurm-info
EpilogSlurmctld=${inputs.config.security.wrapperDir}/slurm-info
'';
};
systemd =
{
services.slurmctld.after = [ "suid-sgid-wrappers.service" ];
tmpfiles.rules = [ "d /var/log/slurmctld 700 slurm slurm" ];
};
sops =
{
secrets = { "slurm/db" = { owner = "slurm"; key = "mariadb/slurm"; }; }
// builtins.listToAttrs (builtins.map
(n:
{
name = "telegram/${n}";
value.sopsFile = "${inputs.config.nixos.system.sops.crossSopsDir}/default.yaml";
})
[ "token" "user/chn" "user/hjp" ]);
templates."info.yaml" =
{
owner = "slurm";
content = let inherit (inputs.config.sops) placeholder; in builtins.toJSON
{
token = placeholder."telegram/token";
user = builtins.listToAttrs (builtins.map (n: { name = n; value = placeholder."telegram/user/${n}"; })
[ "chn" "hjp" ]);
slurmConf = "${inputs.config.services.slurm.etcSlurm}/slurm.conf";
};
};
};
security.wrappers.info =
{
source =
let info = inputs.pkgs.localPackages.info.override
{
slurm = inputs.config.services.slurm.package;
configFile = inputs.config.sops.templates."info.yaml".path;
};
in "${info}/bin/info";
program = "slurm-info";
owner = "slurm";
group = "slurm";
permissions = "544";
capabilities = "cap_setuid,cap_setgid+ep";
};
nixos =
{
packages.packages._packages = [(inputs.pkgs.localPackages.sbatch-tui.override
{
sbatchConfig = inputs.pkgs.writeText "sbatch.yaml" (builtins.toJSON
{
CpuQueues = builtins.map
(queue:
[
queue.name
{
CpuMpiThreads = queue.mpiThreads;
CpuOpenmpThreads = queue.openmpThreads;
MemoryGB = queue.memoryGB;
AllocateCpus = queue.allocateCpus;
}
])
slurm.tui.cpuQueues;
GpuQueues = if slurm.tui.gpuQueues == null then null else builtins.map
(queue: [ queue.name { GpuIds = queue.gpuIds; } ])
slurm.tui.gpuQueues;
});
})];
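# With the illustrative cpuQueues above, the generated sbatch.yaml contains roughly:
# {"CpuQueues":[["cpu",{"AllocateCpus":null,"CpuMpiThreads":4,"CpuOpenmpThreads":12,"MemoryGB":null}]],"GpuQueues":null}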
user.sharedModules = [{ home.packages =
[
(inputs.pkgs.writeShellScriptBin "sbatch"
''if [ "$#" -eq 0 ]; then sbatch-tui; else /run/current-system/sw/bin/sbatch "$@"; fi'')
];}];
services.mariadb = { enable = true; instances.slurm = {}; };
};
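# slurmctld listens on port 6817 by default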
networking.firewall =
let ports = inputs.lib.mkIf slurm.setupFirewall [ 6817 ];
in { allowedTCPPorts = ports; allowedUDPPorts = ports; };
})
]);
}