From b2382557a6048ea13ee022a73771314729e036d2 Mon Sep 17 00:00:00 2001 From: chn Date: Tue, 25 Feb 2025 13:04:30 +0800 Subject: [PATCH] packages.info: write to output --- modules/services/slurm.nix | 39 ++++--- packages/info/src/main.cpp | 206 ++++++++++++++----------------------- 2 files changed, 99 insertions(+), 146 deletions(-) diff --git a/modules/services/slurm.nix b/modules/services/slurm.nix index 04af0b85..76652ac7 100644 --- a/modules/services/slurm.nix +++ b/modules/services/slurm.nix @@ -202,18 +202,16 @@ inputs: ''; }; extraConfig = - let info = inputs.pkgs.localPackages.info.override - { - slurm = inputs.config.services.slurm.package; - configFile = inputs.config.sops.templates."info.yaml".path; - }; - in '' - PrologSlurmctld=${info}/bin/info - EpilogSlurmctld=${info}/bin/info + PrologSlurmctld=${inputs.config.security.wrapperDir}/slurm-info + EpilogSlurmctld=${inputs.config.security.wrapperDir}/slurm-info ''; }; - systemd.tmpfiles.rules = [ "d /var/log/slurmctld 700 slurm slurm" ]; + systemd = + { + services.slurmctld.after = [ "suid-sgid-wrappers.service" ]; + tmpfiles.rules = [ "d /var/log/slurmctld 700 slurm slurm" ]; + }; sops = { secrets = { "slurm/db" = { owner = "slurm"; key = "mariadb/slurm"; }; } @@ -231,19 +229,28 @@ inputs: }; }; }; + security.wrappers.info = + { + source = + let info = inputs.pkgs.localPackages.info.override + { + slurm = inputs.config.services.slurm.package; + configFile = inputs.config.sops.templates."info.yaml".path; + }; + in "${info}/bin/info"; + program = "slurm-info"; + owner = "slurm"; + group = "slurm"; + permissions = "544"; + capabilities = "cap_setuid,cap_setgid+ep"; + }; nixos = { packages.packages._packages = [ inputs.pkgs.localPackages.sbatch-tui ]; user.sharedModules = [{ home.packages = [ (inputs.pkgs.writeShellScriptBin "sbatch" - '' - if [ "$#" -eq 0 ]; then - sbatch-tui - else - /run/current-system/sw/bin/sbatch "$@" - fi - '') + ''if [ "$#" -eq 0 ]; then sbatch-tui; else /run/current-system/sw/bin/sbatch "$@"; fi'') ];}]; services.mariadb = { enable = true; instances.slurm = {}; }; }; diff --git a/packages/info/src/main.cpp b/packages/info/src/main.cpp index 587527e5..7a67f6c2 100644 --- a/packages/info/src/main.cpp +++ b/packages/info/src/main.cpp @@ -8,6 +8,19 @@ # define INFO_CONFIG_FILE "/etc/info.yaml" # endif +struct switch_user +{ + std::uint32_t uid, gid; + switch_user(std::uint32_t uid, std::uint32_t gid) : uid(uid), gid(gid) {} + boost::system::error_code on_exec_setup(auto&&...) + { + // first set gid then set uid, otherwise failed + if (setegid(gid) != 0 || seteuid(uid) != 0) + return boost::system::error_code{errno, boost::system::system_category()}; + else return {}; + } +}; + int main() { using namespace biu::literals; @@ -22,8 +35,8 @@ int main() std::string slurm_conf; std::map context_map { - { "prolog_slurmctld", "RUN" }, - { "epilog_slurmctld", "END" } + { "prolog_slurmctld", "Begin" }, + { "epilog_slurmctld", "End" } }; { auto config = YAML::LoadFile(INFO_CONFIG_FILE); @@ -50,11 +63,15 @@ int main() context = context_cstr; } - // 从 slurm 处查询信息 YAML::Node info; + std::uint32_t uid, gid; + std::string output_file; + // slurm 只能初始化一次,之后即使 fini 再初始化也会无法连接到数据库 + slurm_init(slurm_conf.c_str()); + + // 从 slurm 处查询信息 { job_info_msg_t* job_info; - slurm_init(slurm_conf.c_str()); auto slurm_result = slurm_load_job(&job_info, jid, 0); if (slurm_result != SLURM_SUCCESS) throw std::runtime_error("slurm_load_job failed: {}"_f(slurm_strerror(slurm_result))); else if (job_info->record_count != 1) throw std::runtime_error("job_info->record_count != 1"); @@ -69,158 +86,87 @@ int main() info["Job Name"] = null_to_empty(job_info->job_array->name); info["Working Directory"] = null_to_empty(job_info->job_array->work_dir); info["Output File"] = null_to_empty(job_info->job_array->std_out); + output_file = null_to_empty(job_info->job_array->std_out); info["Partition"] = null_to_empty(job_info->job_array->partition); info["Submit Time"] = timepoint(job_info->job_array->submit_time); info["Start Time"] = timepoint(job_info->job_array->start_time); if (context == "epilog_slurmctld") info["End Time"] = timepoint(job_info->job_array->end_time); - info["Nodes"] = null_to_empty(job_info->job_array->nodes); + // not working on epilog_slurmctld + // info["Nodes"] = null_to_empty(job_info->job_array->nodes); + info["Nodes"] = null_to_empty(std::getenv("SLURM_JOB_NODELIST")); info["TREs Allocated"] = null_to_empty(job_info->job_array->tres_alloc_str); info["GREs Allocated"] = null_to_empty(job_info->job_array->gres_total); if (context == "epilog_slurmctld") info["Exit Code"] = job_info->job_array->exit_code; info["Status"] = get_status(job_info->job_array->job_state); info["Status"].SetStyle(YAML::EmitterStyle::Flow); + info["User ID"] = job_info->job_array->user_id; + uid = job_info->job_array->user_id; + info["Group ID"] = job_info->job_array->group_id; + gid = job_info->job_array->group_id; } slurm_free_job_info_msg(job_info); - slurm_fini(); } // 从 slurmdbd 处查询信息 + // 有问题,先不用这段代码 // if (context == "epilog_slurmctld") - // { - // slurm_init(slurm_conf.c_str()); - // uint16_t conn_flags = 0; - // auto conn = slurmdb_connection_get(&conn_flags); - // if (!conn || errno != SLURM_SUCCESS) throw std::runtime_error("slurmdb_connection_get failed."); + if (false) + { + auto conn = slurmdb_connection_get(nullptr); + if (!conn) throw std::runtime_error("slurmdb_connection_get failed."); // 构造查询 - // slurmdb_job_cond_t* query = new slurmdb_job_cond_t; - // query->step_list = slurm_list_create(slurm_destroy_selected_step); - // slurm_selected_step_t* step = new slurm_selected_step_t; - // step->step_id.step_het_comp = NO_VAL; - // step->step_id.step_id = NO_VAL; - // step->step_id.job_id = jid; - // step->array_task_id = NO_VAL; - // step->het_job_offset = NO_VAL; - // step->array_bitmap = nullptr; - // slurm_list_append(query->step_list, step); - // // 查询 - // auto result = slurmdb_jobs_get(conn, query); - // if (slurm_list_count(result) != 1) throw std::runtime_error("slurmdb_jobs_get failed."); - // auto data = reinterpret_cast(slurm_list_pop(result)); - // // 读取需要的信息并清理 - // slurm_list_destroy(result); - // slurmdb_destroy_job_cond(query); - // info["aaaa"] = data->uid; - // slurmdb_destroy_job_rec(data); + // from: https://github.com/ksyx/turingopt/blob/20d88df423c0722839d1f0d185708da0af7c07a7/watcher/src/main.cpp#L329 + auto query = reinterpret_cast + (std::calloc(1, sizeof(slurmdb_job_cond_t))); + query->flags |= JOBCOND_FLAG_NO_TRUNC; + query->db_flags = SLURMDB_JOB_FLAG_NOTSET; + query->step_list = slurm_list_create(slurm_destroy_selected_step); + auto step = new slurm_selected_step_t + {nullptr, NO_VAL, NO_VAL, {jid, NO_VAL, NO_VAL}}; + slurm_list_append(query->step_list, step); + // 查询 + auto result = slurmdb_jobs_get(conn, query); + if (slurm_list_count(result) != 1) throw std::runtime_error("slurmdb_jobs_get failed."); + auto data = reinterpret_cast(slurm_list_pop(result)); + // 读取需要的信息并清理 + slurm_list_destroy(result); + slurmdb_destroy_job_cond(query); + auto null_to_empty = [](const char* str) { return str ? str : ""; }; + info["Nodes"] = null_to_empty(data->nodes); + slurmdb_destroy_job_rec(data); - // auto close_result = slurmdb_connection_close(&conn); - // if (close_result != SLURM_SUCCESS) throw std::runtime_error("slurmdb_connection_close failed."); - // } + auto close_result = slurmdb_connection_close(&conn); + if (close_result != SLURM_SUCCESS) throw std::runtime_error("slurmdb_connection_close failed."); + } + + slurm_fini(); // 发送消息 { TgBot::Bot bot(token); std::stringstream ss; - ss << "{} {} {}\n"_f(context, info["Job Id"], info["Job Name"]); + ss << "{} {} {}\n"_f(context_map[context], info["Job Id"], info["Job Name"]); ss << "
{}
"_f(info); bot.getApi().sendMessage (user_map[user], ss.str(), nullptr, nullptr, nullptr, "HTML"); } + + // 写入消息 + if (context == "epilog_slurmctld" && !output_file.empty()) + { + auto text = "\n--------------------\n{}\n--------------------\n"_f(info); + boost::asio::io_context context; + boost::system::error_code ec; + boost::asio::writable_pipe wp{context}; + boost::process::v2::process proc + ( + context, "/run/current-system/sw/bin/tee", { "-a", output_file.c_str() }, + boost::process::v2::process_stdio{wp, nullptr, nullptr}, switch_user(uid, gid) + ); + boost::asio::write(wp, boost::asio::buffer(text)); + wp.close(); + proc.wait(); + } }); } - -struct switch_user -{ - std::uint32_t uid, gid; - switch_user(std::uint32_t uid, std::uint32_t gid) : uid(uid), gid(gid) {} - boost::system::error_code on_exec_setup(auto&&...) - { - // first set gid then set uid, otherwise failed - if (setegid(gid) != 0 || seteuid(uid) != 0) - return boost::system::error_code{errno, boost::system::system_category()}; - else return {}; - } -}; - -// int slurm_spank_job_epilog(spank_t spank, int ac, char** argv) -// { -// using namespace biu::literals; -// auto [info, outfile, uid, gid] = [&] -// { -// std::stringstream ss; -// std::optional outfile; -// ss << "------------------------------------------------------------\n"; -// std::uint32_t jid, uid = -1, gid = -1; -// auto result = spank_get_item(spank, S_JOB_ID, &jid); -// if (result != ESPANK_SUCCESS) ss << "error getting job id: {}\n"_f(int(result)); -// else -// { -// ss << "info for job {}:\n"_f(jid); -// YAML::Node info; -// -// // gather info from slurmctld -// job_info_msg_t* job_info; -// slurm_init(nullptr); -// auto result = slurm_load_job(&job_info, jid, 0); -// if (result != SLURM_SUCCESS) ss << "error loading job info: {}\n"_f(slurm_strerror(result)); -// else if (job_info->record_count != 1) ss << "record_count {} != 1\n"_f(job_info->record_count); -// else -// { -// auto null_to_empty = [](const char* str) { return str ? str : ""; }; -// auto timepoint = [](time_t time) -// { return "{:%Y-%m-%d %H:%M:%S}"_f(*std::localtime(&time)); }; -// auto timespan = [](time_t time) -// { return "{:%H:%M:%S}"_f(std::chrono::seconds(time)); }; -// auto get_status = [](int code) -// { return std::vector{ "{}"_f(job_states(code & 0xff)), "{:#x}"_f(code) }; }; -// info["Job Id"] = job_info->job_array->job_id; -// info["Job Name"] = null_to_empty(job_info->job_array->name); -// info["User Id"] = job_info->job_array->user_id; -// info["Work Directory"] = null_to_empty(job_info->job_array->work_dir); -// info["Output File"] = null_to_empty(job_info->job_array->std_out); -// info["Partition"] = null_to_empty(job_info->job_array->partition); -// info["Submit Time"] = timepoint(job_info->job_array->submit_time); -// info["Start Time"] = timepoint(job_info->job_array->start_time); -// info["End Time"] = timepoint(job_info->job_array->end_time); -// info["Nodes"] = null_to_empty(job_info->job_array->nodes); -// info["TREs Allocated"] = null_to_empty(job_info->job_array->tres_alloc_str); -// info["GREs Allocated"] = null_to_empty(job_info->job_array->gres_total); -// info["Exit Code"] = job_info->job_array->exit_code; -// info["Status"] = get_status(job_info->job_array->job_state); -// info["Status"].SetStyle(YAML::EmitterStyle::Flow); -// info["Context"] = "{}"_f(spank_context()); -// info["Remote"] = spank_remote(spank); -// if (job_info->job_array->std_out != nullptr) outfile = job_info->job_array->std_out; -// uid = job_info->job_array->user_id; -// gid = job_info->job_array->group_id; -// } -// slurm_free_job_info_msg(job_info); -// slurm_fini(); -// -// ss << "------------------------------------------------------------\n" << info << '\n'; -// } -// return std::tuple(ss.str(), outfile, uid, gid); -// }(); -// slurm_spank_log("%s", info.c_str()); -// if (outfile) -// { -// try -// { -// boost::asio::io_context context; -// boost::system::error_code ec; -// boost::asio::writable_pipe wp{context}; -// boost::process::v2::process proc -// ( -// context, "/run/current-system/sw/bin/tee", { "-a", outfile->c_str() }, -// boost::process::v2::process_stdio{wp, nullptr, nullptr}, switch_user(uid, gid) -// ); -// boost::asio::write(wp, boost::asio::buffer(info)); -// wp.close(); -// proc.wait(); -// } -// catch (boost::system::system_error& e) { slurm_spank_log("boost error writing to output file: %s", e.what()); } -// catch (std::exception& e) { slurm_spank_log("error writing to output file: %s", e.what()); } -// catch (...) { slurm_spank_log("error writing to output file"); } -// } -// return 0; -// }