mirror of https://github.com/CHN-beta/nixos.git
test

@@ -51,7 +51,8 @@ inputs:
 let
   inherit (inputs.config.nixos.system.nixpkgs) cuda;
   inherit (inputs.pkgs.cudaPackages) cuda_nvml_dev;
-  additionalInputs = inputs.lib.optionals (cuda != null) [ cuda_nvml_dev cuda_nvml_dev.lib ];
+  additionalInputs = [ inputs.pkgs.hdf5_1_10 ]
+    ++ inputs.lib.optionals (cuda != null) [ cuda_nvml_dev cuda_nvml_dev.lib ];
   additionalFlags = inputs.lib.optional (cuda != null) "-L${cuda_nvml_dev.lib}/lib/stubs";
 in
 {
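
Note: Slurm only compiles its acct_gather_profile/hdf5 plugin when HDF5 is available at build time, which is why hdf5_1_10 is prepended to the extra build inputs above. A minimal stand-alone sketch of the same idea against the stock NixOS module (the override wiring is an assumption; this repo routes additionalInputs through its own wrapper):

  # hypothetical stand-alone equivalent of the change above
  services.slurm.package = inputs.pkgs.slurm.overrideAttrs (prev: {
    # HDF5 present at build time enables the acct_gather_profile/hdf5 plugin
    buildInputs = prev.buildInputs ++ [ inputs.pkgs.hdf5_1_10 ];
  });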

@@ -138,27 +139,39 @@ inputs:
       # record more info
       JobAcctGatherType=jobacct_gather/cgroup
       AccountingStorageTRES=gres/gpu
+      AcctGatherProfileType=acct_gather_profile/hdf5
+
       # append to output file
       JobFileAppend=1
     '';
     extraConfigPaths =
+      [(inputs.pkgs.writeTextDir "acct_gather.conf"
+        ''
+          ProfileHDF5Dir=/var/spool/slurm-hdf5
+          ProfileHDF5Default=Task
+        '')]
+      ++ (
       let gpus = slurm.node.${inputs.config.nixos.model.hostname}.gpus or null;
-      in inputs.lib.mkIf (gpus != null)
+      in inputs.lib.optional (gpus != null)
       (
         let gpuString = builtins.concatStringsSep "\n" (builtins.map
           (gpu: "Name=gpu Type=${gpu.name} Count=${builtins.toString gpu.value}")
           (inputs.localLib.attrsToList gpus));
-        in [(inputs.pkgs.writeTextDir "gres.conf" "AutoDetect=nvml\n${gpuString}")]
-      );
+        in inputs.pkgs.writeTextDir "gres.conf" "AutoDetect=nvml\n${gpuString}"
+      )
+    );
     extraCgroupConfig =
     ''
       ConstrainCores=yes
       ConstrainRAMSpace=yes
       ConstrainSwapSpace=yes
       AllowedSwapSpace=20

       # this makes jobs hang, not sure why
       # ConstrainDevices=yes
+
+      # force the use of cgroup v2
+      CgroupPlugin=cgroup/v2
     '';
   };
   munge = { enable = true; password = inputs.config.sops.secrets."munge.key".path; };
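
Two things change in this hunk. First, profiling is switched on: AcctGatherProfileType=acct_gather_profile/hdf5 has slurmd record per-task usage samples as HDF5 files under ProfileHDF5Dir, and ProfileHDF5Default=Task enables task-level profiling for every job (the per-node files can later be merged with Slurm's sh5util). Second, the gres.conf generation is fixed: lib.mkIf yields a module-system conditional that is only resolved when option definitions are merged, so splicing it into a plain list with ++ fails, while lib.optional is an ordinary function returning a one-element or empty list. A minimal stand-alone sketch of the difference (assumes a <nixpkgs> channel on NIX_PATH):

  let lib = (import <nixpkgs> { }).lib;
  in {
    # lib.optional cond x => [ x ] if cond holds, else [ ]
    withGpu    = [ "acct_gather.conf" ] ++ lib.optional true  "gres.conf";  # [ "acct_gather.conf" "gres.conf" ]
    withoutGpu = [ "acct_gather.conf" ] ++ lib.optional false "gres.conf";  # [ "acct_gather.conf" ]
    # lib.mkIf true "gres.conf" would instead produce a set like
    # { _type = "if"; condition = true; content = "gres.conf"; },
    # which ++ cannot concatenate onto a list.
  }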

@@ -210,7 +223,11 @@ inputs:
   systemd =
   {
     services.slurmctld.after = [ "suid-sgid-wrappers.service" ];
-    tmpfiles.rules = [ "d /var/log/slurmctld 700 slurm slurm" ];
+    tmpfiles.rules =
+    [
+      "d /var/log/slurmctld 700 slurm slurm"
+      "d /var/spool/slurm-hdf5 700 slurm slurm"
+    ];
   };
   sops =
   {
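
The extra tmpfiles rule pre-creates the HDF5 spool directory at boot so the profiling plugin has somewhere to write. systemd-tmpfiles "d" rules read as type, path, mode, user, group, and an optional cleanup age; the same rule written against the stock NixOS option, with the fields spelled out:

  systemd.tmpfiles.rules = [
    # d = create directory; mode 700, owned slurm:slurm; no age field, so never cleaned up
    "d /var/spool/slurm-hdf5 700 slurm slurm"
  ];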

@@ -3,11 +3,19 @@ inputs:
   config = inputs.lib.mkMerge
   [
     # for cluster master, export NFS
-    (inputs.lib.mkIf (inputs.config.nixos.model.cluster.nodeType or null == "master")
-      { nixos.services.nfs = { root = "/"; exports = [ "/nix/persistent/home" ]; accessLimit = "192.168.178.0/24"; }; })
+    (inputs.lib.mkIf (inputs.config.nixos.model.cluster.nodeType or null == "master") { nixos.services.nfs =
+    {
+      root = "/";
+      exports = [ "/nix/persistent/home" "/nix/persistent/var/spool/slurm-hdf5" ];
+      accessLimit = "192.168.178.0/24";
+    };})
     # for cluster worker, mount nfs, disable some home manager files
     (inputs.lib.mkIf (inputs.config.nixos.model.cluster.nodeType or null == "worker")
-      { nixos.system.fileSystems.mount.nfs."192.168.178.1:/nix/persistent/home" = "/remote/home"; })
+      { nixos.system.fileSystems.mount.nfs =
+      {
+        "192.168.178.1:/nix/persistent/home" = "/remote/home";
+        "192.168.178.1:/nix/persistent/var/spool/slurm-hdf5" = "/var/spool/slurm-hdf5";
+      };})
     # change some files that home-manager would install as symlinks to direct mounts instead, for compatibility with the cluster setup
     {
       home-manager.users = builtins.listToAttrs (builtins.map
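
With profiling writing one file per node, the spool has to be shared: the master now also exports /nix/persistent/var/spool/slurm-hdf5 over NFS, and each worker mounts it at /var/spool/slurm-hdf5, so sh5util on the master can see every node's data. nixos.services.nfs and nixos.system.fileSystems.mount.nfs are this repo's own wrapper options; a rough equivalent with stock NixOS options (the export flags are assumptions) would be:

  # master: export the shared directories
  services.nfs.server.exports = ''
    /nix/persistent/home 192.168.178.0/24(rw,no_subtree_check)
    /nix/persistent/var/spool/slurm-hdf5 192.168.178.0/24(rw,no_subtree_check)
  '';
  # worker: mount the HDF5 spool where slurmd expects it
  fileSystems."/var/spool/slurm-hdf5" = {
    device = "192.168.178.1:/nix/persistent/var/spool/slurm-hdf5";
    fsType = "nfs";
  };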