nixos/modules/services/slurm.nix

76 lines
2.6 KiB
Nix
Raw Normal View History

2024-02-24 15:21:47 +08:00
inputs:
{
# Option declarations for nixos.services.slurm.
# Every option except `enable` and `cpu.threads` has no default and must be set by the user.
options.nixos.services.slurm =
  let
    inherit (inputs.lib) mkOption types;
    # A required, non-negative integer option (no default).
    requiredCount = mkOption { type = types.ints.unsigned; };
  in
  {
    # Master switch; the config section of this module is only applied when this is true.
    enable = mkOption { type = types.bool; default = false; };
    cpu =
    {
      # Rendered as CoresPerSocket in the node definition.
      cores = requiredCount;
      # Rendered as ThreadsPerCore in the node definition; defaults to 1.
      threads = mkOption { type = types.ints.unsigned; default = 1; };
    };
    # Rendered as RealMemory in the node definition.
    memoryMB = requiredCount;
    # Rendered as the count in "Gres=gpu:<n>" in the node definition.
    gpus = requiredCount;
  };
# Single-node Slurm setup (controller and client both on localhost) with GPU support.
# Everything below is applied only when nixos.services.slurm.enable is true.
config =
  let
    cfg = inputs.config;
    inherit (inputs) lib pkgs;
    inherit (cfg.nixos.services) slurm;

    nvml = pkgs.cudaPackages.cuda_nvml_dev;

    # Slurm built with enableGtk2 = true, with cuda_nvml_dev added to the build inputs and
    # its lib/stubs directory added to the linker search path.
    slurmPackage = (pkgs.slurm.override { enableGtk2 = true; }).overrideAttrs (old:
    {
      buildInputs = old.buildInputs ++ [ nvml ];
      LDFLAGS = [ "-L${nvml}/lib/stubs" ];
    });

    # The one node definition line, assembled from the module options.
    nodeLine = builtins.concatStringsSep " "
    [
      "localhost"
      "RealMemory=${toString slurm.memoryMB}"
      "Sockets=1"
      "CoresPerSocket=${toString slurm.cpu.cores}"
      "ThreadsPerCore=${toString slurm.cpu.threads}"
      "Gres=gpu:${toString slurm.gpus}"
      "State=UNKNOWN"
    ];

    # Script referenced by TaskProlog; it prints an export line for CUDA_DEVICE_ORDER.
    cudaTaskProlog = pkgs.writeShellScript "set_cuda_env" "echo export CUDA_DEVICE_ORDER=PCI_BUS_ID";

    # Directory holding a gres.conf that enables NVML auto-detection.
    gresConfDir = pkgs.writeTextDir "gres.conf" "AutoDetect=nvml";
  in
    lib.mkIf slurm.enable
    {
      services.slurm =
      {
        server.enable = true;
        client.enable = true;
        package = slurmPackage;
        # The cluster is named after this machine's configured hostname.
        clusterName = cfg.nixos.system.networking.hostname;
        # dbdserver =
        # {
        #   enable = true;
        #   dbdHost = "localhost";
        #   # storagePassFile
        #   # extraConfig
        # };
        controlMachine = "localhost";
        nodeName = [ nodeLine ];
        partitionName = [ "localhost Nodes=localhost Default=YES MaxTime=INFINITE State=UP" ];
        procTrackType = "proctrack/cgroup";
        extraConfig =
        ''
          SelectType=select/cons_tres
          GresTypes=gpu
          SlurmdDebug=debug2
          TaskProlog=${cudaTaskProlog}
        '';
        extraConfigPaths = [ gresConfDir ];
      };

      # munge reads its key from the sops-managed secret declared below.
      services.munge = { enable = true; password = cfg.sops.secrets."munge.key".path; };

      # Environment for slurmd: the CUDA toolkit path and the configured NVIDIA package's lib directory.
      systemd.services.slurmd.environment =
      {
        CUDA_PATH = "${pkgs.cudatoolkit}";
        LD_LIBRARY_PATH = "${cfg.hardware.nvidia.package}/lib";
      };

      # Binary secret stored next to the default sops file, owned by the user the munged service runs as.
      sops.secrets."munge.key" =
      {
        format = "binary";
        sopsFile = "${builtins.dirOf cfg.sops.defaultSopsFile}/munge.key";
        owner = cfg.systemd.services.munged.serviceConfig.User;
      };
    };
}