From 2343bf3d2b582e56ad045fe60b84fe5e4aea7608 Mon Sep 17 00:00:00 2001
From: chn
Date: Thu, 27 Feb 2025 17:42:23 +0800
Subject: [PATCH] test

---
 modules/services/slurm.nix             | 27 +++++++++++++++++++++-----
 modules/system/fileSystems/cluster.nix | 14 ++++++++++---
 2 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/modules/services/slurm.nix b/modules/services/slurm.nix
index 76652ac7..3b2a1b84 100644
--- a/modules/services/slurm.nix
+++ b/modules/services/slurm.nix
@@ -51,7 +51,8 @@ inputs:
   let
     inherit (inputs.config.nixos.system.nixpkgs) cuda;
     inherit (inputs.pkgs.cudaPackages) cuda_nvml_dev;
-    additionalInputs = inputs.lib.optionals (cuda != null) [ cuda_nvml_dev cuda_nvml_dev.lib ];
+    additionalInputs = [ inputs.pkgs.hdf5_1_10 ]
+      ++ inputs.lib.optionals (cuda != null) [ cuda_nvml_dev cuda_nvml_dev.lib ];
     additionalFlags = inputs.lib.optional (cuda != null) "-L${cuda_nvml_dev.lib}/lib/stubs";
   in
   {
@@ -138,27 +139,39 @@ inputs:
           # record more info
           JobAcctGatherType=jobacct_gather/cgroup
           AccountingStorageTRES=gres/gpu
+          AcctGatherProfileType=acct_gather_profile/hdf5
           # append to output file
           JobFileAppend=1
         '';
         extraConfigPaths =
+        [(inputs.pkgs.writeTextDir "acct_gather.conf"
+        ''
+          ProfileHDF5Dir=/var/spool/slurm-hdf5
+          ProfileHDF5Default=Task
+        '')]
+        ++ (
           let gpus = slurm.node.${inputs.config.nixos.model.hostname}.gpus or null;
-          in inputs.lib.mkIf (gpus != null)
+          in inputs.lib.optional (gpus != null)
           (
             let gpuString = builtins.concatStringsSep "\n"
               (builtins.map
                 (gpu: "Name=gpu Type=${gpu.name} Count=${builtins.toString gpu.value}")
                 (inputs.localLib.attrsToList gpus));
-            in [(inputs.pkgs.writeTextDir "gres.conf" "AutoDetect=nvml\n${gpuString}")]
-          );
+            in inputs.pkgs.writeTextDir "gres.conf" "AutoDetect=nvml\n${gpuString}"
+          )
+        );
         extraCgroupConfig =
         ''
           ConstrainCores=yes
           ConstrainRAMSpace=yes
           ConstrainSwapSpace=yes
           AllowedSwapSpace=20
+
           # this make job hang, not sure why
           # ConstrainDevices=yes
+
+          # force use cgroup v2
+          CgroupPlugin=cgroup/v2
         '';
       };
       munge = { enable = true; password = inputs.config.sops.secrets."munge.key".path; };
@@ -210,7 +223,11 @@ inputs:
     systemd =
     {
       services.slurmctld.after = [ "suid-sgid-wrappers.service" ];
-      tmpfiles.rules = [ "d /var/log/slurmctld 700 slurm slurm" ];
+      tmpfiles.rules =
+      [
+        "d /var/log/slurmctld 700 slurm slurm"
+        "d /var/spool/slurm-hdf5 700 slurm slurm"
+      ];
     };
     sops =
     {
diff --git a/modules/system/fileSystems/cluster.nix b/modules/system/fileSystems/cluster.nix
index ab4f7b26..234d7918 100644
--- a/modules/system/fileSystems/cluster.nix
+++ b/modules/system/fileSystems/cluster.nix
@@ -3,11 +3,19 @@ inputs:
   config = inputs.lib.mkMerge
   [
     # for cluster master, export NFS
-    (inputs.lib.mkIf (inputs.config.nixos.model.cluster.nodeType or null == "master")
-      { nixos.services.nfs = { root = "/"; exports = [ "/nix/persistent/home" ]; accessLimit = "192.168.178.0/24"; }; })
+    (inputs.lib.mkIf (inputs.config.nixos.model.cluster.nodeType or null == "master") { nixos.services.nfs =
+    {
+      root = "/";
+      exports = [ "/nix/persistent/home" "/nix/persistent/var/spool/slurm-hdf5" ];
+      accessLimit = "192.168.178.0/24";
+    };})
     # for cluster worker, mount nfs, disable some home manager files
     (inputs.lib.mkIf (inputs.config.nixos.model.cluster.nodeType or null == "worker")
-      { nixos.system.fileSystems.mount.nfs."192.168.178.1:/nix/persistent/home" = "/remote/home"; })
+      { nixos.system.fileSystems.mount.nfs =
+      {
+        "192.168.178.1:/nix/persistent/home" = "/remote/home";
+        "192.168.178.1:/nix/persistent/var/spool/slurm-hdf5" = "/var/spool/slurm-hdf5";
+      };})
     # change some files that home-manager manages as symlinks to direct mounts, for compatibility with the cluster setup
     { home-manager.users = builtins.listToAttrs
       (builtins.map