2025-02-27 17:42:23 +08:00
parent b3aea24c87
commit 2343bf3d2b
2 changed files with 33 additions and 8 deletions

File 1 of 2 (Slurm service module):

@@ -51,7 +51,8 @@ inputs:
 let
   inherit (inputs.config.nixos.system.nixpkgs) cuda;
   inherit (inputs.pkgs.cudaPackages) cuda_nvml_dev;
-  additionalInputs = inputs.lib.optionals (cuda != null) [ cuda_nvml_dev cuda_nvml_dev.lib ];
+  additionalInputs = [ inputs.pkgs.hdf5_1_10 ]
+    ++ inputs.lib.optionals (cuda != null) [ cuda_nvml_dev cuda_nvml_dev.lib ];
   additionalFlags = inputs.lib.optional (cuda != null) "-L${cuda_nvml_dev.lib}/lib/stubs";
 in
 {
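The new hdf5_1_10 build input is what lets Slurm's configure step detect HDF5 and build the acct_gather_profile/hdf5 plugin enabled in the next hunk; the NVML inputs and the stub-library link flag are unchanged and only apply when CUDA is configured. How these locals are consumed lies outside this hunk; a minimal sketch, assuming the module applies them via an overrideAttrs on pkgs.slurm (the option path and attribute names below are assumptions, not taken from the diff):

    services.slurm.package = inputs.pkgs.slurm.overrideAttrs (old:
    {
      # hdf5 enables the acct_gather_profile/hdf5 plugin at configure time;
      # cuda_nvml_dev provides NVML for gres autodetection
      buildInputs = old.buildInputs ++ additionalInputs;
      # link against the NVML stub so the build succeeds on machines
      # without a real driver installed
      NIX_LDFLAGS = toString additionalFlags;
    });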
@@ -138,27 +139,39 @@ inputs:
       # record more info
       JobAcctGatherType=jobacct_gather/cgroup
       AccountingStorageTRES=gres/gpu
+      AcctGatherProfileType=acct_gather_profile/hdf5
       # append to output file
       JobFileAppend=1
     '';
   extraConfigPaths =
+    [(inputs.pkgs.writeTextDir "acct_gather.conf"
+    ''
+      ProfileHDF5Dir=/var/spool/slurm-hdf5
+      ProfileHDF5Default=Task
+    '')]
+    ++ (
     let gpus = slurm.node.${inputs.config.nixos.model.hostname}.gpus or null;
-    in inputs.lib.mkIf (gpus != null)
+    in inputs.lib.optional (gpus != null)
     (
       let gpuString = builtins.concatStringsSep "\n" (builtins.map
         (gpu: "Name=gpu Type=${gpu.name} Count=${builtins.toString gpu.value}")
         (inputs.localLib.attrsToList gpus));
-      in [(inputs.pkgs.writeTextDir "gres.conf" "AutoDetect=nvml\n${gpuString}")]
-    );
+      in inputs.pkgs.writeTextDir "gres.conf" "AutoDetect=nvml\n${gpuString}"
+    )
+    );
   extraCgroupConfig =
     ''
       ConstrainCores=yes
       ConstrainRAMSpace=yes
       ConstrainSwapSpace=yes
       AllowedSwapSpace=20
       # this makes jobs hang, not sure why
       # ConstrainDevices=yes
+      # force use of cgroup v2
+      CgroupPlugin=cgroup/v2
     '';
 };
 munge = { enable = true; password = inputs.config.sops.secrets."munge.key".path; };
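Two details in this hunk are worth calling out. First, the switch from inputs.lib.mkIf to inputs.lib.optional fixes a type problem: mkIf returns a special attribute set that only the module system's option merging understands, so it cannot produce an element of a plain list like extraConfigPaths, whereas lib.optional is an ordinary function returning a one-element or empty list. For illustration (abbreviating inputs.lib as lib):

    lib.optional  (1 == 1) "x"         # => [ "x" ]
    lib.optional  (1 == 2) "x"         # => [ ]
    lib.optionals (1 == 1) [ "a" "b" ] # => [ "a" "b" ]

Second, pkgs.writeTextDir "acct_gather.conf" text builds a store directory containing just that one file, which is the shape extraConfigPaths consumes. With AcctGatherProfileType=acct_gather_profile/hdf5 and ProfileHDF5Default=Task, per-task usage samples are written as HDF5 files under /var/spool/slurm-hdf5 even for jobs that do not pass --profile explicitly, and Slurm's sh5util can merge and inspect them.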
@@ -210,7 +223,11 @@ inputs:
 systemd =
 {
   services.slurmctld.after = [ "suid-sgid-wrappers.service" ];
-  tmpfiles.rules = [ "d /var/log/slurmctld 700 slurm slurm" ];
+  tmpfiles.rules =
+  [
+    "d /var/log/slurmctld 700 slurm slurm"
+    "d /var/spool/slurm-hdf5 700 slurm slurm"
+  ];
 };
 sops =
 {
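The added tmpfiles rule makes the shared profile directory exist with the right ownership before the Slurm daemons start; each string is one tmpfiles.d(5) line of the form Type Path Mode User Group. As an illustration only, not part of this commit, a sixth Age field would additionally let systemd-tmpfiles --clean prune stale profiles:

    # hypothetical variant: prune profile files untouched for 30 days
    systemd.tmpfiles.rules = [ "d /var/spool/slurm-hdf5 700 slurm slurm 30d" ];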

File 2 of 2 (cluster NFS / home-manager module):

@@ -3,11 +3,19 @@ inputs:
 config = inputs.lib.mkMerge
 [
   # for cluster master, export NFS
-  (inputs.lib.mkIf (inputs.config.nixos.model.cluster.nodeType or null == "master")
-    { nixos.services.nfs = { root = "/"; exports = [ "/nix/persistent/home" ]; accessLimit = "192.168.178.0/24"; }; })
+  (inputs.lib.mkIf (inputs.config.nixos.model.cluster.nodeType or null == "master") { nixos.services.nfs =
+  {
+    root = "/";
+    exports = [ "/nix/persistent/home" "/nix/persistent/var/spool/slurm-hdf5" ];
+    accessLimit = "192.168.178.0/24";
+  };})
   # for cluster worker, mount NFS, disable some home-manager files
   (inputs.lib.mkIf (inputs.config.nixos.model.cluster.nodeType or null == "worker")
-    { nixos.system.fileSystems.mount.nfs."192.168.178.1:/nix/persistent/home" = "/remote/home"; })
+    { nixos.system.fileSystems.mount.nfs =
+    {
+      "192.168.178.1:/nix/persistent/home" = "/remote/home";
+      "192.168.178.1:/nix/persistent/var/spool/slurm-hdf5" = "/var/spool/slurm-hdf5";
+    };})
   # switch some files from home-manager-generated symlinks to direct mounts, for compatibility with the cluster setup
   {
     home-manager.users = builtins.listToAttrs (builtins.map
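The second file closes the loop: the master now also exports the profile spool over NFS, and every worker mounts it at exactly the path that ProfileHDF5Dir in acct_gather.conf points to, so all nodes write profiles into one shared directory. The nixos.services.nfs and nixos.system.fileSystems.mount.nfs options are this repository's own wrappers; a rough sketch of what they presumably lower to in stock NixOS options (the export flags are illustrative assumptions):

    services.nfs.server =
    {
      enable = true;
      exports = ''
        /nix/persistent/var/spool/slurm-hdf5 192.168.178.0/24(rw,no_subtree_check)
      '';
    };
    fileSystems."/var/spool/slurm-hdf5" =
    {
      device = "192.168.178.1:/nix/persistent/var/spool/slurm-hdf5";
      fsType = "nfs";
    };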