mirror of https://github.com/CHN-beta/nixos.git (synced 2026-01-12 05:29:23 +08:00)
Compare commits: 6efc29a7a4...hdf5-slurm (1 commit: 2343bf3d2b)
@@ -51,7 +51,8 @@ inputs:
 let
   inherit (inputs.config.nixos.system.nixpkgs) cuda;
   inherit (inputs.pkgs.cudaPackages) cuda_nvml_dev;
-  additionalInputs = inputs.lib.optionals (cuda != null) [ cuda_nvml_dev cuda_nvml_dev.lib ];
+  additionalInputs = [ inputs.pkgs.hdf5_1_10 ]
+    ++ inputs.lib.optionals (cuda != null) [ cuda_nvml_dev cuda_nvml_dev.lib ];
   additionalFlags = inputs.lib.optional (cuda != null) "-L${cuda_nvml_dev.lib}/lib/stubs";
 in
 {
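Slurm only builds its acct_gather_profile/hdf5 plugin when HDF5 is available at compile time, which is presumably why hdf5_1_10 joins the extra build inputs alongside the NVML stub used for GPU accounting. A minimal sketch of how these locals might be spliced into the Slurm derivation (the actual wiring sits outside this hunk, so the names below are assumptions):

    # sketch only: assumes additionalInputs/additionalFlags feed an
    # overrideAttrs on the stock slurm package, roughly like this
    slurmPackage = inputs.pkgs.slurm.overrideAttrs (prev:
    {
      buildInputs = (prev.buildInputs or [ ]) ++ additionalInputs;
      # extra -L search path so the linker can find the NVML stub
      NIX_LDFLAGS = builtins.toString additionalFlags;
    });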
@@ -138,27 +139,39 @@ inputs:
       # record more info
       JobAcctGatherType=jobacct_gather/cgroup
       AccountingStorageTRES=gres/gpu
+      AcctGatherProfileType=acct_gather_profile/hdf5

       # append to output file
       JobFileAppend=1
     '';
     extraConfigPaths =
+      [(inputs.pkgs.writeTextDir "acct_gather.conf"
+        ''
+          ProfileHDF5Dir=/var/spool/slurm-hdf5
+          ProfileHDF5Default=Task
+        '')]
+      ++ (
         let gpus = slurm.node.${inputs.config.nixos.model.hostname}.gpus or null;
-        in inputs.lib.mkIf (gpus != null)
+        in inputs.lib.optional (gpus != null)
         (
           let gpuString = builtins.concatStringsSep "\n" (builtins.map
             (gpu: "Name=gpu Type=${gpu.name} Count=${builtins.toString gpu.value}")
             (inputs.localLib.attrsToList gpus));
-          in [(inputs.pkgs.writeTextDir "gres.conf" "AutoDetect=nvml\n${gpuString}")]
-        );
+          in inputs.pkgs.writeTextDir "gres.conf" "AutoDetect=nvml\n${gpuString}"
+        )
+      );
     extraCgroupConfig =
       ''
         ConstrainCores=yes
         ConstrainRAMSpace=yes
         ConstrainSwapSpace=yes
         AllowedSwapSpace=20

         # this makes jobs hang, not sure why
         # ConstrainDevices=yes

         # force use of cgroup v2
         CgroupPlugin=cgroup/v2
       '';
   };
   munge = { enable = true; password = inputs.config.sops.secrets."munge.key".path; };
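The switch from inputs.lib.mkIf to inputs.lib.optional matters here: mkIf produces a module-system conditional, which cannot be concatenated onto a plain list with ++, while lib.optional simply returns [ ] or a one-element list and so composes with the new acct_gather.conf entry. For a node whose model declares, say, gpus = { "4090" = 2; } (hypothetical values), the generated gres.conf would read:

    AutoDetect=nvml
    Name=gpu Type=4090 Count=2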
@@ -210,7 +223,11 @@ inputs:
   systemd =
   {
     services.slurmctld.after = [ "suid-sgid-wrappers.service" ];
-    tmpfiles.rules = [ "d /var/log/slurmctld 700 slurm slurm" ];
+    tmpfiles.rules =
+    [
+      "d /var/log/slurmctld 700 slurm slurm"
+      "d /var/spool/slurm-hdf5 700 slurm slurm"
+    ];
   };
   sops =
   {
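The added tmpfiles rule pre-creates the profiling directory at boot, owned by the slurm user with mode 700, matching the ProfileHDF5Dir set in acct_gather.conf above. slurmd writes per-node, per-step HDF5 files into that directory; Slurm ships the sh5util tool to merge them into a single per-job profile (e.g. sh5util -j <jobid>).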
@@ -3,11 +3,19 @@ inputs:
 config = inputs.lib.mkMerge
 [
   # for cluster master, export NFS
-  (inputs.lib.mkIf (inputs.config.nixos.model.cluster.nodeType or null == "master")
-    { nixos.services.nfs = { root = "/"; exports = [ "/nix/persistent/home" ]; accessLimit = "192.168.178.0/24"; }; })
+  (inputs.lib.mkIf (inputs.config.nixos.model.cluster.nodeType or null == "master") { nixos.services.nfs =
+  {
+    root = "/";
+    exports = [ "/nix/persistent/home" "/nix/persistent/var/spool/slurm-hdf5" ];
+    accessLimit = "192.168.178.0/24";
+  };})
   # for cluster worker, mount nfs, disable some home manager files
   (inputs.lib.mkIf (inputs.config.nixos.model.cluster.nodeType or null == "worker")
-    { nixos.system.fileSystems.mount.nfs."192.168.178.1:/nix/persistent/home" = "/remote/home"; })
+    { nixos.system.fileSystems.mount.nfs =
+    {
+      "192.168.178.1:/nix/persistent/home" = "/remote/home";
+      "192.168.178.1:/nix/persistent/var/spool/slurm-hdf5" = "/var/spool/slurm-hdf5";
+    };})
   # switch some files that home-manager provides as symlinks to direct mounts, for compatibility with the cluster setup
   {
     home-manager.users = builtins.listToAttrs (builtins.map
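Exporting the master's /nix/persistent/var/spool/slurm-hdf5 over NFS and mounting it at /var/spool/slurm-hdf5 on every worker collects the profile files from all nodes in one place, so they can be merged on a single host. A minimal sketch of what the custom nixos.system.fileSystems.mount.nfs shorthand presumably expands to in stock NixOS options (an assumption; the real mapping lives in this repo's fileSystems module):

    fileSystems."/var/spool/slurm-hdf5" =
    {
      device = "192.168.178.1:/nix/persistent/var/spool/slurm-hdf5";
      fsType = "nfs";
    };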