mirror of https://github.com/CHN-beta/nixos.git
modules.services.slurm: rewrite
commit 39d64c3f88, parent c972d44c42
@@ -131,9 +131,15 @@ inputs:
   slurm =
   {
     enable = true;
-    cpu = { cores = 16; threads = 2; mpiThreads = 2; openmpThreads = 4; };
-    memoryMB = 90112;
-    gpus."4060" = 1;
+    master = "pc";
+    node.pc =
+    {
+      name = "pc"; address = "127.0.0.1";
+      cpu = { cores = 16; threads = 2; };
+      memoryMB = 90112;
+      gpus."4060" = 1;
+    };
+    partitions.default = [ "pc" ];
   };
   ollama = {};
 };
@@ -26,6 +26,31 @@ inputs:
   snapper.enable = true;
   sshd = {};
   smartd.enable = true;
+  slurm =
+  {
+    enable = true;
+    master = "srv1-node0";
+    node =
+    {
+      srv1-node0 =
+      {
+        name = "n0"; address = "192.168.178.1";
+        cpu = { sockets = 4; cores = 20; threads = 2; };
+        memoryMB = 122880;
+      };
+      srv1-node1 =
+      {
+        name = "n1"; address = "192.168.178.2";
+        cpu = { sockets = 4; cores = 8; threads = 2; };
+        memoryMB = 30720;
+      };
+    };
+    partitions =
+    {
+      default = [ "srv1-node0" ];
+      old = [ "srv1-node1" ];
+    };
+  };
 };
 user.users = [ "chn" ];
 };
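Fed through the nodeName and partitionName mappings in the module diff further down, this host set should render slurm.conf entries roughly as follows (assuming partition members are resolved to their slurm-side node names, n0 and n1):

  NodeName=n0 NodeHostname=srv1-node0 NodeAddr=192.168.178.1 RealMemory=122880 Sockets=4 CoresPerSocket=20 ThreadsPerCore=2 State=UNKNOWN
  NodeName=n1 NodeHostname=srv1-node1 NodeAddr=192.168.178.2 RealMemory=30720 Sockets=4 CoresPerSocket=8 ThreadsPerCore=2 State=UNKNOWN
  PartitionName=default Nodes=n0 Default=YES MaxTime=INFINITE State=UP
  PartitionName=old Nodes=n1 Default=NO MaxTime=INFINITE State=UP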
@@ -26,12 +26,6 @@ inputs:
   publicKey = "Br+ou+t9M9kMrnNnhTvaZi2oNFRygzebA1NqcHWADWM=";
   wireguardIp = "192.168.83.9";
 };
-slurm =
-{
-  enable = true;
-  cpu = { sockets = 4; cores = 20; threads = 2; mpiThreads = 8; openmpThreads = 10; };
-  memoryMB = 122880;
-};
 };
 };
 services.nfs.server =
@@ -14,16 +14,7 @@ inputs:
 };
 cluster.nodeType = "worker";
 };
-services =
-{
-  beesd.instances.root = { device = "/"; hashTableSizeMB = 256; threads = 4; };
-  # slurm =
-  # {
-  #   enable = true;
-  #   cpu = { sockets = 4; cores = 8; threads = 2; mpiThreads = 4; openmpThreads = 8; };
-  #   memoryMB = 30720;
-  # };
-};
+services.beesd.instances.root = { device = "/"; hashTableSizeMB = 256; threads = 4; };
 packages =
 {
   vasp = null;
@@ -74,9 +74,15 @@ inputs:
   slurm =
   {
     enable = true;
-    cpu = { cores = 16; threads = 2; mpiThreads = 3; openmpThreads = 4; };
-    memoryMB = 94208;
-    gpus = { "2080_ti" = 1; "3090" = 1; "4090" = 1; };
+    master = "xmupc1";
+    node.xmupc1 =
+    {
+      name = "xmupc1"; address = "127.0.0.1";
+      cpu = { cores = 16; threads = 2; };
+      memoryMB = 94208;
+      gpus = { "2080_ti" = 1; "3090" = 1; "4090" = 1; };
+    };
+    partitions.default = [ "xmupc1" ];
   };
   xrdp = { enable = true; hostname = [ "xmupc1.chn.moe" ]; };
   samba =
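For a node with several GPU types, the gpuString built in the module diff below joins one gpu:<type>:<count> entry per attribute, so xmupc1's node entry should end up carrying roughly:

  Gres=gpu:2080_ti:1,gpu:3090:1,gpu:4090:1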
@@ -73,9 +73,15 @@ inputs:
   slurm =
   {
     enable = true;
-    cpu = { sockets = 2; cores = 22; threads = 2; mpiThreads = 4; openmpThreads = 10; };
-    memoryMB = 253952;
-    gpus."4090" = 1;
+    master = "xmupc2";
+    node.xmupc2 =
+    {
+      name = "xmupc2"; address = "127.0.0.1";
+      cpu = { sockets = 2; cores = 22; threads = 2; };
+      memoryMB = 253952;
+      gpus."4090" = 1;
+    };
+    partitions.default = [ "xmupc2" ];
   };
   xrdp = { enable = true; hostname = [ "xmupc2.chn.moe" ]; };
   samba = { enable = true; hostsAllowed = ""; shares = { home.path = "/home"; root.path = "/"; }; };
@@ -3,77 +3,92 @@ inputs:
 options.nixos.services.slurm = let inherit (inputs.lib) mkOption types; in
 {
   enable = mkOption { type = types.bool; default = false; };
-  cpu =
+  # whether this machine is the control node; if not, fill in the control node's hostname
+  master = mkOption { type = types.nonEmptyStr; default = inputs.config.nixos.system.networking.hostname; };
+  node = mkOption { type = types.attrsOf (types.submodule (submoduleInputs: { options =
   {
-    sockets = mkOption { type = types.ints.unsigned; default = 1; };
-    cores = mkOption { type = types.ints.unsigned; };
-    threads = mkOption { type = types.ints.unsigned; default = 1; };
-    mpiThreads = mkOption { type = types.ints.unsigned; default = 1; };
-    openmpThreads = mkOption { type = types.ints.unsigned; default = 1; };
-  };
-  memoryMB = mkOption { type = types.ints.unsigned; };
-  gpus = mkOption { type = types.nullOr (types.attrsOf types.ints.unsigned); default = null; };
-};
-config = let inherit (inputs.config.nixos.services) slurm; in inputs.lib.mkIf slurm.enable
-{
-  services =
-  {
-    slurm =
+    # node name used inside slurm
+    name = mkOption { type = types.nonEmptyStr; };
+    address = mkOption { type = types.nonEmptyStr; };
+    cpu =
     {
-      server.enable = true;
-      package = (inputs.pkgs.slurm.override { enableGtk2 = true; }).overrideAttrs
-        (prev: let inherit (inputs.pkgs.cudaPackages) cuda_nvml_dev; in
-        {
-          buildInputs = prev.buildInputs ++ [ cuda_nvml_dev cuda_nvml_dev.lib ];
-          LDFLAGS = [ "-L${cuda_nvml_dev.lib}/lib/stubs" ];
-          nativeBuildInputs = prev.nativeBuildInputs ++ [ inputs.pkgs.wrapGAppsHook ];
-        });
-      clusterName = inputs.config.nixos.system.networking.hostname;
-      dbdserver =
+      sockets = mkOption { type = types.ints.unsigned; default = 1; };
+      cores = mkOption { type = types.ints.unsigned; default = 1; };
+      threads = mkOption { type = types.ints.unsigned; default = 1; };
+      mpiThreads = mkOption { type = types.ints.unsigned; default = 1; };
+      openmpThreads = mkOption { type = types.ints.unsigned; default = 1; };
+    };
+    memoryMB = mkOption { type = types.ints.unsigned; default = 1024; };
+    gpus = mkOption { type = types.nullOr (types.attrsOf types.ints.unsigned); default = null; };
+  };}));};
+  partitions = mkOption { type = types.attrsOf (types.listOf types.nonEmptyStr); default = {}; };
+  defaultPartition = mkOption { type = types.nonEmptyStr; default = "default"; };
+};
+config = let inherit (inputs.config.nixos.services) slurm; in inputs.lib.mkIf slurm.enable (inputs.lib.mkMerge
+[
+  # worker configuration
+  {
+    services =
+    {
+      slurm =
       {
-        enable = true;
-        dbdHost = "localhost";
-        storagePassFile = inputs.config.sops.secrets."slurm/db".path;
-        extraConfig =
-        ''
-          StorageHost=*
-          StorageLoc=slurm
-        '';
-      };
-      client.enable = true;
-      controlMachine = "localhost";
-      nodeName =
-        let gpuString =
-          if slurm.gpus == null then ""
-          else "Gres=" + builtins.concatStringsSep "," (builtins.map
-            (gpu: "gpu:${gpu.name}:${builtins.toString gpu.value}")
-            (inputs.lib.attrsToList slurm.gpus));
-        in inputs.lib.singleton (builtins.concatStringsSep " "
-        [
-          "localhost"
-          "RealMemory=${builtins.toString slurm.memoryMB}"
-          "Sockets=${builtins.toString slurm.cpu.sockets}"
-          "CoresPerSocket=${builtins.toString slurm.cpu.cores}"
-          "ThreadsPerCore=${builtins.toString slurm.cpu.threads}"
-          "${gpuString}"
-          "State=UNKNOWN"
-        ]);
-      partitionName = [ "localhost Nodes=localhost Default=YES MaxTime=INFINITE State=UP" ];
-      procTrackType = "proctrack/cgroup";
+        package = (inputs.pkgs.slurm.override { enableGtk2 = true; }).overrideAttrs
+          (prev:
+            let
+              inherit (inputs.config.nixos.system.nixpkgs) cuda;
+              inherit (inputs.pkgs.cudaPackages) cuda_nvml_dev;
+              additionalInputs = inputs.lib.optionals cuda.enable [ cuda_nvml_dev cuda_nvml_dev.lib ];
+              additionalFlags = inputs.lib.optional cuda.enable "-L${cuda_nvml_dev.lib}/lib/stubs";
+            in
+            {
+              buildInputs = prev.buildInputs or [] ++ additionalInputs;
+              LDFLAGS = prev.LDFLAGS or [] ++ additionalFlags;
+              nativeBuildInputs = prev.nativeBuildInputs ++ [ inputs.pkgs.wrapGAppsHook ];
+            }
+          );
+        client.enable = true;
+        nodeName = builtins.map
+          (node:
+            let gpuString =
+              if node.value.gpus == null then ""
+              else "Gres=" + builtins.concatStringsSep "," (builtins.map
+                (gpu: "gpu:${gpu.name}:${builtins.toString gpu.value}")
+                (inputs.lib.attrsToList node.value.gpus));
+            in builtins.concatStringsSep " "
+            [
+              node.value.name
+              "NodeHostname=${node.name}"
+              "NodeAddr=${node.value.address}"
+              "RealMemory=${builtins.toString node.value.memoryMB}"
+              "Sockets=${builtins.toString node.value.cpu.sockets}"
+              "CoresPerSocket=${builtins.toString node.value.cpu.cores}"
+              "ThreadsPerCore=${builtins.toString node.value.cpu.threads}"
+              "${gpuString}"
+              "State=UNKNOWN"
+            ])
+          (inputs.localLib.attrsToList slurm.node);
+        partitionName = builtins.map
+          (partition:
+            # resolve member attr names to their slurm-side node names
+            let nodes = builtins.concatStringsSep "," (builtins.map (node: slurm.node.${node}.name) partition.value);
+            in builtins.concatStringsSep " "
+            [
+              partition.name
+              "Nodes=${nodes}"
+              "Default=${if partition.name == slurm.defaultPartition then "YES" else "NO"}"
+              "MaxTime=INFINITE"
+              "State=UP"
+            ])
+          (inputs.localLib.attrsToList slurm.partitions);
+        procTrackType = "proctrack/cgroup";
+        controlMachine = slurm.master;
         extraConfig =
-          let taskProlog =
-          ''
-            echo export CUDA_DEVICE_ORDER=PCI_BUS_ID
-            echo export SLURM_THREADS_PER_CPU=${builtins.toString slurm.cpu.threads}
-          '';
-          in
         ''
           SelectType=select/cons_tres
           SelectTypeParameters=CR_Core
           GresTypes=gpu
           DefCpuPerGPU=1

-          TaskProlog=${inputs.pkgs.writeShellScript "set_env" taskProlog}
+          TaskProlog=${inputs.pkgs.writeShellScript "set_env" "echo export CUDA_DEVICE_ORDER=PCI_BUS_ID"}

           AccountingStorageType=accounting_storage/slurmdbd
           AccountingStorageHost=localhost
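A node entry that sets only cores falls back to the submodule defaults (sockets = 1, threads = 1, memoryMB = 1024, no GPUs). As a sketch, a hypothetical entry node.box = { name = "box"; address = "10.0.0.2"; cpu.cores = 8; } should render roughly as:

  NodeName=box NodeHostname=box NodeAddr=10.0.0.2 RealMemory=1024 Sockets=1 CoresPerSocket=8 ThreadsPerCore=1 State=UNKNOWN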
@@ -89,60 +104,79 @@ inputs:
           # automatically resume node after drain
           ReturnToService=2
         '';
-        extraConfigPaths = inputs.lib.mkIf (slurm.gpus != null)
-        (
-          let gpuString = builtins.concatStringsSep "\n" (builtins.map
-            (gpu: "Name=gpu Type=${gpu.name} Count=${builtins.toString gpu.value}")
-            (inputs.localLib.attrsToList slurm.gpus));
-          in [(inputs.pkgs.writeTextDir "gres.conf" "AutoDetect=nvml\n${gpuString}")]
-        );
+        extraConfigPaths =
+          let gpus = slurm.node.${inputs.config.nixos.system.networking.hostname}.gpus or null;
+          in inputs.lib.mkIf (gpus != null)
+          (
+            let gpuString = builtins.concatStringsSep "\n" (builtins.map
+              (gpu: "Name=gpu Type=${gpu.name} Count=${builtins.toString gpu.value}")
+              (inputs.localLib.attrsToList gpus));
+            in [(inputs.pkgs.writeTextDir "gres.conf" "AutoDetect=nvml\n${gpuString}")]
+          );
       };
       munge = { enable = true; password = inputs.config.sops.secrets."munge.key".path; };
     };
     systemd =
     {
       services.slurmd.environment =
-      {
-        CUDA_PATH = "${inputs.pkgs.cudatoolkit}";
-        LD_LIBRARY_PATH = "${inputs.config.hardware.nvidia.package}/lib";
-      };
-      tmpfiles.rules = [ "d /var/log/slurmctld 700 slurm slurm" ];
+        let gpus = slurm.node.${inputs.config.nixos.system.networking.hostname}.gpus or null;
+        in inputs.lib.mkIf (gpus != null)
+        {
+          CUDA_PATH = "${inputs.pkgs.cudatoolkit}";
+          LD_LIBRARY_PATH = "${inputs.config.hardware.nvidia.package}/lib";
+        };
     };
-    sops =
-    {
-      secrets =
-      {
-        "munge.key" =
-        {
-          format = "binary";
-          sopsFile = "${builtins.dirOf inputs.config.sops.defaultSopsFile}/munge.key";
-          owner = inputs.config.systemd.services.munged.serviceConfig.User;
-        };
-        "slurm/db" = { owner = "slurm"; key = "mariadb/slurm"; };
-      };
-    };
-    nixos =
-    {
-      packages.packages._packages = [(inputs.pkgs.localPackages.sbatch-tui.override { sbatchConfig =
-      {
-        cpuMpiThreads = slurm.cpu.mpiThreads;
-        cpuOpenmpThreads = slurm.cpu.openmpThreads;
-        gpuIds =
-          if slurm.gpus == null then ""
-          else builtins.concatStringsSep ", " (builtins.map (gpu: ''"${gpu}"'') (builtins.attrNames slurm.gpus));
-      };})];
-      user.sharedModules = [{ home.packages =
-      [
-        (inputs.pkgs.writeShellScriptBin "sbatch"
-        ''
-          if [ "$#" -eq 0 ]; then
-            sbatch-tui
-          else
-            /run/current-system/sw/bin/sbatch "$@"
-          fi
-        '')
-      ];}];
-      services.mariadb = { enable = true; instances.slurm = {}; };
-    };
-  };
+    sops.secrets."munge.key" =
+    {
+      format = "binary";
+      sopsFile = "${builtins.dirOf inputs.config.sops.defaultSopsFile}/munge.key";
+      owner = inputs.config.systemd.services.munged.serviceConfig.User;
+    };
+  }
+  # master configuration
+  (inputs.lib.mkIf (slurm.master == inputs.config.nixos.system.networking.hostname)
+  {
+    services.slurm =
+    {
+      server.enable = true;
+      dbdserver =
+      {
+        enable = true;
+        dbdHost = "localhost";
+        storagePassFile = inputs.config.sops.secrets."slurm/db".path;
+        extraConfig =
+        ''
+          StorageHost=*
+          StorageLoc=slurm
+        '';
+      };
+    };
+    systemd.tmpfiles.rules = [ "d /var/log/slurmctld 700 slurm slurm" ];
+    sops.secrets."slurm/db" = { owner = "slurm"; key = "mariadb/slurm"; };
+    nixos =
+    {
+      # TODO: rewrite
+      # packages.packages._packages = [(inputs.pkgs.localPackages.sbatch-tui.override { sbatchConfig =
+      # {
+      #   cpuMpiThreads = slurm.cpu.mpiThreads;
+      #   cpuOpenmpThreads = slurm.cpu.openmpThreads;
+      #   gpuIds =
+      #     if slurm.gpus == null then ""
+      #     else builtins.concatStringsSep ", " (builtins.map (gpu: ''"${gpu}"'') (builtins.attrNames slurm.gpus));
+      # };})];
+      # user.sharedModules = [{ home.packages =
+      # [
+      #   (inputs.pkgs.writeShellScriptBin "sbatch"
+      #   ''
+      #     if [ "$#" -eq 0 ]; then
+      #       sbatch-tui
+      #     else
+      #       /run/current-system/sw/bin/sbatch "$@"
+      #     fi
+      #   '')
+      # ];}];
+      services.mariadb = { enable = true; instances.slurm = {}; };
+    };
+  })
+]);
 }
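On a host whose own node entry declares gpus (xmupc1 above, for example), the worker's extraConfigPaths should drop a gres.conf beside the generated config, roughly:

  AutoDetect=nvml
  Name=gpu Type=2080_ti Count=1
  Name=gpu Type=3090 Count=1
  Name=gpu Type=4090 Count=1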