mirror of
https://github.com/CHN-beta/nixos.git
synced 2026-01-12 01:29:24 +08:00
packages.info: write to output
This commit is contained in:
@@ -202,18 +202,16 @@ inputs:
|
||||
'';
|
||||
};
|
||||
extraConfig =
|
||||
let info = inputs.pkgs.localPackages.info.override
|
||||
{
|
||||
slurm = inputs.config.services.slurm.package;
|
||||
configFile = inputs.config.sops.templates."info.yaml".path;
|
||||
};
|
||||
in
|
||||
''
|
||||
PrologSlurmctld=${info}/bin/info
|
||||
EpilogSlurmctld=${info}/bin/info
|
||||
PrologSlurmctld=${inputs.config.security.wrapperDir}/slurm-info
|
||||
EpilogSlurmctld=${inputs.config.security.wrapperDir}/slurm-info
|
||||
'';
|
||||
};
|
||||
systemd.tmpfiles.rules = [ "d /var/log/slurmctld 700 slurm slurm" ];
|
||||
systemd =
|
||||
{
|
||||
services.slurmctld.after = [ "suid-sgid-wrappers.service" ];
|
||||
tmpfiles.rules = [ "d /var/log/slurmctld 700 slurm slurm" ];
|
||||
};
|
||||
sops =
|
||||
{
|
||||
secrets = { "slurm/db" = { owner = "slurm"; key = "mariadb/slurm"; }; }
|
||||
@@ -231,19 +229,28 @@ inputs:
|
||||
};
|
||||
};
|
||||
};
|
||||
# Security wrapper for the local `info` program. The wrapper is named
# `slurm-info`, is owned by slurm:slurm with mode 544, and carries
# cap_setuid and cap_setgid in its effective and permitted sets.
security.wrappers.info =
  let
    # `info` package overridden to use the configured Slurm package and to
    # read its configuration from the sops-rendered "info.yaml" template.
    infoPackage = inputs.pkgs.localPackages.info.override
    {
      slurm = inputs.config.services.slurm.package;
      configFile = inputs.config.sops.templates."info.yaml".path;
    };
  in
  {
    program = "slurm-info";
    source = "${infoPackage}/bin/info";
    owner = "slurm";
    group = "slurm";
    permissions = "544";
    capabilities = "cap_setuid,cap_setgid+ep";
  };
|
||||
nixos =
|
||||
{
|
||||
packages.packages._packages = [ inputs.pkgs.localPackages.sbatch-tui ];
|
||||
user.sharedModules = [{ home.packages =
|
||||
[
|
||||
(inputs.pkgs.writeShellScriptBin "sbatch"
|
||||
''
|
||||
if [ "$#" -eq 0 ]; then
|
||||
sbatch-tui
|
||||
else
|
||||
/run/current-system/sw/bin/sbatch "$@"
|
||||
fi
|
||||
'')
|
||||
''if [ "$#" -eq 0 ]; then sbatch-tui; else /run/current-system/sw/bin/sbatch "$@"; fi'')
|
||||
];}];
|
||||
services.mariadb = { enable = true; instances.slurm = {}; };
|
||||
};
|
||||
|
||||
@@ -8,6 +8,19 @@
|
||||
# define INFO_CONFIG_FILE "/etc/info.yaml"
|
||||
# endif
|
||||
|
||||
struct switch_user
|
||||
{
|
||||
std::uint32_t uid, gid;
|
||||
switch_user(std::uint32_t uid, std::uint32_t gid) : uid(uid), gid(gid) {}
|
||||
boost::system::error_code on_exec_setup(auto&&...)
|
||||
{
|
||||
// first set gid then set uid, otherwise failed
|
||||
if (setegid(gid) != 0 || seteuid(uid) != 0)
|
||||
return boost::system::error_code{errno, boost::system::system_category()};
|
||||
else return {};
|
||||
}
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
using namespace biu::literals;
|
||||
@@ -22,8 +35,8 @@ int main()
|
||||
std::string slurm_conf;
|
||||
std::map<std::string, std::string> context_map
|
||||
{
|
||||
{ "prolog_slurmctld", "RUN" },
|
||||
{ "epilog_slurmctld", "END" }
|
||||
{ "prolog_slurmctld", "Begin" },
|
||||
{ "epilog_slurmctld", "End" }
|
||||
};
|
||||
{
|
||||
auto config = YAML::LoadFile(INFO_CONFIG_FILE);
|
||||
@@ -50,11 +63,15 @@ int main()
|
||||
context = context_cstr;
|
||||
}
|
||||
|
||||
// 从 slurm 处查询信息
|
||||
YAML::Node info;
|
||||
std::uint32_t uid, gid;
|
||||
std::string output_file;
|
||||
// slurm 只能初始化一次,之后即使 fini 再初始化也会无法连接到数据库
|
||||
slurm_init(slurm_conf.c_str());
|
||||
|
||||
// 从 slurm 处查询信息
|
||||
{
|
||||
job_info_msg_t* job_info;
|
||||
slurm_init(slurm_conf.c_str());
|
||||
auto slurm_result = slurm_load_job(&job_info, jid, 0);
|
||||
if (slurm_result != SLURM_SUCCESS) throw std::runtime_error("slurm_load_job failed: {}"_f(slurm_strerror(slurm_result)));
|
||||
else if (job_info->record_count != 1) throw std::runtime_error("job_info->record_count != 1");
|
||||
@@ -69,158 +86,87 @@ int main()
|
||||
info["Job Name"] = null_to_empty(job_info->job_array->name);
|
||||
info["Working Directory"] = null_to_empty(job_info->job_array->work_dir);
|
||||
info["Output File"] = null_to_empty(job_info->job_array->std_out);
|
||||
output_file = null_to_empty(job_info->job_array->std_out);
|
||||
info["Partition"] = null_to_empty(job_info->job_array->partition);
|
||||
info["Submit Time"] = timepoint(job_info->job_array->submit_time);
|
||||
info["Start Time"] = timepoint(job_info->job_array->start_time);
|
||||
if (context == "epilog_slurmctld") info["End Time"] = timepoint(job_info->job_array->end_time);
|
||||
info["Nodes"] = null_to_empty(job_info->job_array->nodes);
|
||||
// not working on epilog_slurmctld
|
||||
// info["Nodes"] = null_to_empty(job_info->job_array->nodes);
|
||||
info["Nodes"] = null_to_empty(std::getenv("SLURM_JOB_NODELIST"));
|
||||
info["TREs Allocated"] = null_to_empty(job_info->job_array->tres_alloc_str);
|
||||
info["GREs Allocated"] = null_to_empty(job_info->job_array->gres_total);
|
||||
if (context == "epilog_slurmctld") info["Exit Code"] = job_info->job_array->exit_code;
|
||||
info["Status"] = get_status(job_info->job_array->job_state);
|
||||
info["Status"].SetStyle(YAML::EmitterStyle::Flow);
|
||||
info["User ID"] = job_info->job_array->user_id;
|
||||
uid = job_info->job_array->user_id;
|
||||
info["Group ID"] = job_info->job_array->group_id;
|
||||
gid = job_info->job_array->group_id;
|
||||
}
|
||||
slurm_free_job_info_msg(job_info);
|
||||
slurm_fini();
|
||||
}
|
||||
|
||||
// 从 slurmdbd 处查询信息
|
||||
// 有问题,先不用这段代码
|
||||
// if (context == "epilog_slurmctld")
|
||||
// {
|
||||
// slurm_init(slurm_conf.c_str());
|
||||
// uint16_t conn_flags = 0;
|
||||
// auto conn = slurmdb_connection_get(&conn_flags);
|
||||
// if (!conn || errno != SLURM_SUCCESS) throw std::runtime_error("slurmdb_connection_get failed.");
|
||||
if (false)
|
||||
{
|
||||
auto conn = slurmdb_connection_get(nullptr);
|
||||
if (!conn) throw std::runtime_error("slurmdb_connection_get failed.");
|
||||
|
||||
// 构造查询
|
||||
// slurmdb_job_cond_t* query = new slurmdb_job_cond_t;
|
||||
// query->step_list = slurm_list_create(slurm_destroy_selected_step);
|
||||
// slurm_selected_step_t* step = new slurm_selected_step_t;
|
||||
// step->step_id.step_het_comp = NO_VAL;
|
||||
// step->step_id.step_id = NO_VAL;
|
||||
// step->step_id.job_id = jid;
|
||||
// step->array_task_id = NO_VAL;
|
||||
// step->het_job_offset = NO_VAL;
|
||||
// step->array_bitmap = nullptr;
|
||||
// slurm_list_append(query->step_list, step);
|
||||
// // 查询
|
||||
// auto result = slurmdb_jobs_get(conn, query);
|
||||
// if (slurm_list_count(result) != 1) throw std::runtime_error("slurmdb_jobs_get failed.");
|
||||
// auto data = reinterpret_cast<slurmdb_job_rec_t*>(slurm_list_pop(result));
|
||||
// // 读取需要的信息并清理
|
||||
// slurm_list_destroy(result);
|
||||
// slurmdb_destroy_job_cond(query);
|
||||
// info["aaaa"] = data->uid;
|
||||
// slurmdb_destroy_job_rec(data);
|
||||
// from: https://github.com/ksyx/turingopt/blob/20d88df423c0722839d1f0d185708da0af7c07a7/watcher/src/main.cpp#L329
|
||||
auto query = reinterpret_cast<slurmdb_job_cond_t*>
|
||||
(std::calloc(1, sizeof(slurmdb_job_cond_t)));
|
||||
query->flags |= JOBCOND_FLAG_NO_TRUNC;
|
||||
query->db_flags = SLURMDB_JOB_FLAG_NOTSET;
|
||||
query->step_list = slurm_list_create(slurm_destroy_selected_step);
|
||||
auto step = new slurm_selected_step_t
|
||||
{nullptr, NO_VAL, NO_VAL, {jid, NO_VAL, NO_VAL}};
|
||||
slurm_list_append(query->step_list, step);
|
||||
// 查询
|
||||
auto result = slurmdb_jobs_get(conn, query);
|
||||
if (slurm_list_count(result) != 1) throw std::runtime_error("slurmdb_jobs_get failed.");
|
||||
auto data = reinterpret_cast<slurmdb_job_rec_t*>(slurm_list_pop(result));
|
||||
// 读取需要的信息并清理
|
||||
slurm_list_destroy(result);
|
||||
slurmdb_destroy_job_cond(query);
|
||||
auto null_to_empty = [](const char* str) { return str ? str : ""; };
|
||||
info["Nodes"] = null_to_empty(data->nodes);
|
||||
slurmdb_destroy_job_rec(data);
|
||||
|
||||
// auto close_result = slurmdb_connection_close(&conn);
|
||||
// if (close_result != SLURM_SUCCESS) throw std::runtime_error("slurmdb_connection_close failed.");
|
||||
// }
|
||||
auto close_result = slurmdb_connection_close(&conn);
|
||||
if (close_result != SLURM_SUCCESS) throw std::runtime_error("slurmdb_connection_close failed.");
|
||||
}
|
||||
|
||||
slurm_fini();
|
||||
|
||||
// 发送消息
|
||||
{
|
||||
TgBot::Bot bot(token);
|
||||
std::stringstream ss;
|
||||
ss << "{} {} {}\n"_f(context, info["Job Id"], info["Job Name"]);
|
||||
ss << "<b>{}</b> {} {}\n"_f(context_map[context], info["Job Id"], info["Job Name"]);
|
||||
ss << "<blockquote expandable>{}</blockquote>"_f(info);
|
||||
bot.getApi().sendMessage
|
||||
(user_map[user], ss.str(), nullptr, nullptr, nullptr, "HTML");
|
||||
}
|
||||
|
||||
// 写入消息
|
||||
if (context == "epilog_slurmctld" && !output_file.empty())
|
||||
{
|
||||
auto text = "\n--------------------\n{}\n--------------------\n"_f(info);
|
||||
boost::asio::io_context context;
|
||||
boost::system::error_code ec;
|
||||
boost::asio::writable_pipe wp{context};
|
||||
boost::process::v2::process proc
|
||||
(
|
||||
context, "/run/current-system/sw/bin/tee", { "-a", output_file.c_str() },
|
||||
boost::process::v2::process_stdio{wp, nullptr, nullptr}, switch_user(uid, gid)
|
||||
);
|
||||
boost::asio::write(wp, boost::asio::buffer(text));
|
||||
wp.close();
|
||||
proc.wait();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
struct switch_user
|
||||
{
|
||||
std::uint32_t uid, gid;
|
||||
switch_user(std::uint32_t uid, std::uint32_t gid) : uid(uid), gid(gid) {}
|
||||
boost::system::error_code on_exec_setup(auto&&...)
|
||||
{
|
||||
// first set gid then set uid, otherwise failed
|
||||
if (setegid(gid) != 0 || seteuid(uid) != 0)
|
||||
return boost::system::error_code{errno, boost::system::system_category()};
|
||||
else return {};
|
||||
}
|
||||
};
|
||||
|
||||
// int slurm_spank_job_epilog(spank_t spank, int ac, char** argv)
|
||||
// {
|
||||
// using namespace biu::literals;
|
||||
// auto [info, outfile, uid, gid] = [&]
|
||||
// {
|
||||
// std::stringstream ss;
|
||||
// std::optional<std::string> outfile;
|
||||
// ss << "------------------------------------------------------------\n";
|
||||
// std::uint32_t jid, uid = -1, gid = -1;
|
||||
// auto result = spank_get_item(spank, S_JOB_ID, &jid);
|
||||
// if (result != ESPANK_SUCCESS) ss << "error getting job id: {}\n"_f(int(result));
|
||||
// else
|
||||
// {
|
||||
// ss << "info for job {}:\n"_f(jid);
|
||||
// YAML::Node info;
|
||||
//
|
||||
// // gather info from slurmctld
|
||||
// job_info_msg_t* job_info;
|
||||
// slurm_init(nullptr);
|
||||
// auto result = slurm_load_job(&job_info, jid, 0);
|
||||
// if (result != SLURM_SUCCESS) ss << "error loading job info: {}\n"_f(slurm_strerror(result));
|
||||
// else if (job_info->record_count != 1) ss << "record_count {} != 1\n"_f(job_info->record_count);
|
||||
// else
|
||||
// {
|
||||
// auto null_to_empty = [](const char* str) { return str ? str : ""; };
|
||||
// auto timepoint = [](time_t time)
|
||||
// { return "{:%Y-%m-%d %H:%M:%S}"_f(*std::localtime(&time)); };
|
||||
// auto timespan = [](time_t time)
|
||||
// { return "{:%H:%M:%S}"_f(std::chrono::seconds(time)); };
|
||||
// auto get_status = [](int code)
|
||||
// { return std::vector{ "{}"_f(job_states(code & 0xff)), "{:#x}"_f(code) }; };
|
||||
// info["Job Id"] = job_info->job_array->job_id;
|
||||
// info["Job Name"] = null_to_empty(job_info->job_array->name);
|
||||
// info["User Id"] = job_info->job_array->user_id;
|
||||
// info["Work Directory"] = null_to_empty(job_info->job_array->work_dir);
|
||||
// info["Output File"] = null_to_empty(job_info->job_array->std_out);
|
||||
// info["Partition"] = null_to_empty(job_info->job_array->partition);
|
||||
// info["Submit Time"] = timepoint(job_info->job_array->submit_time);
|
||||
// info["Start Time"] = timepoint(job_info->job_array->start_time);
|
||||
// info["End Time"] = timepoint(job_info->job_array->end_time);
|
||||
// info["Nodes"] = null_to_empty(job_info->job_array->nodes);
|
||||
// info["TREs Allocated"] = null_to_empty(job_info->job_array->tres_alloc_str);
|
||||
// info["GREs Allocated"] = null_to_empty(job_info->job_array->gres_total);
|
||||
// info["Exit Code"] = job_info->job_array->exit_code;
|
||||
// info["Status"] = get_status(job_info->job_array->job_state);
|
||||
// info["Status"].SetStyle(YAML::EmitterStyle::Flow);
|
||||
// info["Context"] = "{}"_f(spank_context());
|
||||
// info["Remote"] = spank_remote(spank);
|
||||
// if (job_info->job_array->std_out != nullptr) outfile = job_info->job_array->std_out;
|
||||
// uid = job_info->job_array->user_id;
|
||||
// gid = job_info->job_array->group_id;
|
||||
// }
|
||||
// slurm_free_job_info_msg(job_info);
|
||||
// slurm_fini();
|
||||
//
|
||||
// ss << "------------------------------------------------------------\n" << info << '\n';
|
||||
// }
|
||||
// return std::tuple(ss.str(), outfile, uid, gid);
|
||||
// }();
|
||||
// slurm_spank_log("%s", info.c_str());
|
||||
// if (outfile)
|
||||
// {
|
||||
// try
|
||||
// {
|
||||
// boost::asio::io_context context;
|
||||
// boost::system::error_code ec;
|
||||
// boost::asio::writable_pipe wp{context};
|
||||
// boost::process::v2::process proc
|
||||
// (
|
||||
// context, "/run/current-system/sw/bin/tee", { "-a", outfile->c_str() },
|
||||
// boost::process::v2::process_stdio{wp, nullptr, nullptr}, switch_user(uid, gid)
|
||||
// );
|
||||
// boost::asio::write(wp, boost::asio::buffer(info));
|
||||
// wp.close();
|
||||
// proc.wait();
|
||||
// }
|
||||
// catch (boost::system::system_error& e) { slurm_spank_log("boost error writing to output file: %s", e.what()); }
|
||||
// catch (std::exception& e) { slurm_spank_log("error writing to output file: %s", e.what()); }
|
||||
// catch (...) { slurm_spank_log("error writing to output file"); }
|
||||
// }
|
||||
// return 0;
|
||||
// }
|
||||
|
||||
Reference in New Issue
Block a user