Repository: PBSPro/pbspro Branch: master Commit: cd7ab5edaf03 Files: 1196 Total size: 17.5 MB Directory structure: gitextract_2t7z1zdf/ ├── .clang-format ├── .github/ │ ├── PULL_REQUEST_TEMPLATE.md │ ├── checkclang │ ├── checkpep8 │ └── runchecks ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── COPYRIGHT ├── INSTALL ├── LICENSE ├── Makefile.am ├── PBS_License.txt ├── README.md ├── autogen.sh ├── azure-pipelines.yml ├── buildutils/ │ ├── Makefile.am │ └── attr_parser.py ├── ci/ │ ├── README.md │ ├── ci │ └── etc/ │ ├── build-pbs-packages.sh │ ├── ci-script-wrapper.service │ ├── configure_node.sh │ ├── container-env-setup.sh │ ├── container-init │ ├── do.sh │ ├── do_sanitize_mode.sh │ ├── docker-entrypoint │ ├── gen_ptl_json.sh │ ├── id_rsa │ ├── id_rsa.pub │ ├── install-system-packages │ ├── killit.sh │ ├── macros │ └── sudoers-overrides ├── configure.ac ├── doc/ │ ├── Makefile.am │ ├── man1/ │ │ ├── pbs_hook_attributes.7B │ │ ├── pbs_job_attributes.7B │ │ ├── pbs_login.1B │ │ ├── pbs_module.7B │ │ ├── pbs_node_attributes.7B │ │ ├── pbs_professional.7B │ │ ├── pbs_python.1B │ │ ├── pbs_queue_attributes.7B │ │ ├── pbs_ralter.1B │ │ ├── pbs_rdel.1B │ │ ├── pbs_release_nodes.1B │ │ ├── pbs_resources.7B │ │ ├── pbs_resv_attributes.7B │ │ ├── pbs_rstat.1B │ │ ├── pbs_rsub.1B │ │ ├── pbs_sched_attributes.7B │ │ ├── pbs_server_attributes.7B │ │ ├── pbsdsh.1B │ │ ├── qalter.1B │ │ ├── qdel.1B │ │ ├── qhold.1B │ │ ├── qmove.1B │ │ ├── qmsg.1B │ │ ├── qorder.1B │ │ ├── qrerun.1B │ │ ├── qrls.1B │ │ ├── qselect.1B │ │ ├── qsig.1B │ │ ├── qstat.1B │ │ └── qsub.1B │ ├── man3/ │ │ ├── pbs_alterjob.3B │ │ ├── pbs_asyrunjob.3B │ │ ├── pbs_confirmresv.3B │ │ ├── pbs_connect.3B │ │ ├── pbs_default.3B │ │ ├── pbs_deljob.3B │ │ ├── pbs_delresv.3B │ │ ├── pbs_disconnect.3B │ │ ├── pbs_geterrmsg.3B │ │ ├── pbs_holdjob.3B │ │ ├── pbs_locjob.3B │ │ ├── pbs_manager.3B │ │ ├── pbs_modify_resv.3B │ │ ├── pbs_movejob.3B │ │ ├── pbs_msgjob.3B │ │ ├── pbs_orderjob.3B │ │ ├── 
pbs_preempt_jobs.3B │ │ ├── pbs_relnodesjob.3B │ │ ├── pbs_rerunjob.3B │ │ ├── pbs_rescquery.3B │ │ ├── pbs_rescreserve.3B │ │ ├── pbs_rlsjob.3B │ │ ├── pbs_runjob.3B │ │ ├── pbs_selectjob.3B │ │ ├── pbs_selstat.3B │ │ ├── pbs_sigjob.3B │ │ ├── pbs_stagein.3B │ │ ├── pbs_statfree.3B │ │ ├── pbs_stathook.3B │ │ ├── pbs_stathost.3B │ │ ├── pbs_statjob.3B │ │ ├── pbs_statnode.3B │ │ ├── pbs_statque.3B │ │ ├── pbs_statresv.3B │ │ ├── pbs_statrsc.3B │ │ ├── pbs_statsched.3B │ │ ├── pbs_statserver.3B │ │ ├── pbs_statvnode.3B │ │ ├── pbs_submit.3B │ │ ├── pbs_submit_resv.3B │ │ ├── pbs_submitresv.3B │ │ ├── pbs_tclapi.3B │ │ ├── pbs_terminate.3B │ │ ├── rm.3B │ │ └── tm.3 │ └── man8/ │ ├── mpiexec.8B │ ├── pbs.8B │ ├── pbs.conf.8B │ ├── pbs_account.8B │ ├── pbs_attach.8B │ ├── pbs_comm.8B │ ├── pbs_dataservice.8B │ ├── pbs_ds_password.8B │ ├── pbs_hostn.8B │ ├── pbs_idled.8B │ ├── pbs_iff.8B │ ├── pbs_interactive.8B │ ├── pbs_lamboot.8B │ ├── pbs_mkdirs.8B │ ├── pbs_mom.8B │ ├── pbs_mpihp.8B │ ├── pbs_mpilam.8B │ ├── pbs_mpirun.8B │ ├── pbs_probe.8B │ ├── pbs_sched.8B │ ├── pbs_server.8B │ ├── pbs_snapshot.8B │ ├── pbs_tclsh.8B │ ├── pbs_tmrsh.8B │ ├── pbs_topologyinfo.8B │ ├── pbs_wish.8B │ ├── pbsfs.8B │ ├── pbsnodes.8B │ ├── pbsrun.8B │ ├── pbsrun_unwrap.8B │ ├── pbsrun_wrap.8B │ ├── printjob.8B │ ├── qdisable.8B │ ├── qenable.8B │ ├── qmgr.8B │ ├── qrun.8B │ ├── qstart.8B │ ├── qstop.8B │ ├── qterm.8B │ ├── tracejob.8B │ └── win_postinstall.py.8B ├── m4/ │ ├── disable_shell_pipe.m4 │ ├── disable_syslog.m4 │ ├── enable_alps.m4 │ ├── enable_ptl.m4 │ ├── pbs_decl_epoll.m4 │ ├── pbs_decl_epoll_pwait.m4 │ ├── pbs_decl_h_errno.m4 │ ├── pbs_decl_ppoll.m4 │ ├── pbs_decl_socklen_t.m4 │ ├── pbs_patch_libtool.m4 │ ├── pbs_systemd_unitdir.m4 │ ├── pbs_version.m4 │ ├── security_check.m4 │ ├── with_cjson.m4 │ ├── with_core_limit.m4 │ ├── with_database_dir.m4 │ ├── with_database_port.m4 │ ├── with_database_user.m4 │ ├── with_editline.m4 │ ├── with_expat.m4 │ ├── with_hwloc.m4 │ ├── 
with_krbauth.m4 │ ├── with_libical.m4 │ ├── with_libz.m4 │ ├── with_min_stack_limit.m4 │ ├── with_pbs_conf_file.m4 │ ├── with_pmix.m4 │ ├── with_python.m4 │ ├── with_sendmail.m4 │ ├── with_server_home.m4 │ ├── with_server_name_file.m4 │ ├── with_swig.m4 │ ├── with_tcl.m4 │ ├── with_tclatrsep.m4 │ ├── with_tmpdir.m4 │ ├── with_unsupported_dir.m4 │ └── with_xauth.m4 ├── openpbs-rpmlintrc ├── openpbs.spec.in ├── src/ │ ├── Makefile.am │ ├── cmds/ │ │ ├── Makefile.am │ │ ├── mpiexec.in │ │ ├── pbs_attach.c │ │ ├── pbs_attach_sup.c │ │ ├── pbs_dataservice.c │ │ ├── pbs_demux.c │ │ ├── pbs_ds_password.c │ │ ├── pbs_lamboot.in │ │ ├── pbs_mpihp.in │ │ ├── pbs_mpilam.in │ │ ├── pbs_mpirun.in │ │ ├── pbs_ralter.c │ │ ├── pbs_rdel.c │ │ ├── pbs_release_nodes.c │ │ ├── pbs_remsh.in │ │ ├── pbs_rstat.c │ │ ├── pbs_rsub.c │ │ ├── pbs_tmrsh.c │ │ ├── pbsdsh.c │ │ ├── pbsnodes.c │ │ ├── pbsrun.in │ │ ├── pbsrun_unwrap.in │ │ ├── pbsrun_wrap.in │ │ ├── qalter.c │ │ ├── qdel.c │ │ ├── qdisable.c │ │ ├── qenable.c │ │ ├── qhold.c │ │ ├── qmgr.c │ │ ├── qmgr_sup.c │ │ ├── qmove.c │ │ ├── qmsg.c │ │ ├── qorder.c │ │ ├── qrerun.c │ │ ├── qrls.c │ │ ├── qrun.c │ │ ├── qselect.c │ │ ├── qsig.c │ │ ├── qstart.c │ │ ├── qstat.c │ │ ├── qstop.c │ │ ├── qsub.c │ │ ├── qsub_sup.c │ │ ├── qterm.c │ │ ├── sample.qstatrc │ │ └── scripts/ │ │ ├── Makefile.am │ │ ├── limits.pbs_mom.compat │ │ ├── limits.post_services.compat │ │ ├── modulefile.in │ │ ├── pbs.csh │ │ ├── pbs.service.in │ │ ├── pbs.sh │ │ ├── pbs_bootcheck.py │ │ ├── pbs_dataservice │ │ ├── pbs_ds_password │ │ ├── pbs_habitat.in │ │ ├── pbs_init.d.in │ │ ├── pbs_poerun.in │ │ ├── pbs_postinstall.in │ │ ├── pbs_posttrans │ │ ├── pbs_preuninstall │ │ ├── pbs_reload.in │ │ ├── pbs_server │ │ ├── pbs_snapshot │ │ ├── pbs_topologyinfo │ │ ├── pbs_topologyinfo.py │ │ ├── pbsrun.ch_gm.init.in │ │ ├── pbsrun.ch_mx.init.in │ │ ├── pbsrun.gm_mpd.init.in │ │ ├── pbsrun.intelmpi.init.in │ │ ├── pbsrun.mpich2.init.in │ │ ├── 
pbsrun.mvapich1.init.in │ │ ├── pbsrun.mvapich2.init.in │ │ ├── pbsrun.mx_mpd.init.in │ │ ├── pbsrun.poe.in │ │ ├── pbsrun.poe.init.in │ │ ├── printjob │ │ └── sgiMPI.awk │ ├── hooks/ │ │ ├── Makefile.am │ │ └── cgroups/ │ │ ├── pbs_cgroups.CF │ │ ├── pbs_cgroups.HK │ │ └── pbs_cgroups.PY │ ├── iff/ │ │ ├── Makefile.am │ │ └── iff2.c │ ├── include/ │ │ ├── Long.h │ │ ├── Long_.h │ │ ├── Makefile.am │ │ ├── acct.h │ │ ├── attribute.h │ │ ├── auth.h │ │ ├── avltree.h │ │ ├── basil.h │ │ ├── batch_request.h │ │ ├── bitfield.h │ │ ├── cmds.h │ │ ├── credential.h │ │ ├── dedup_jobids.h │ │ ├── dis.h │ │ ├── grunt.h │ │ ├── hook.h │ │ ├── hook_func.h │ │ ├── ifl_internal.h │ │ ├── job.h │ │ ├── libauth.h │ │ ├── libpbs.h │ │ ├── libsec.h │ │ ├── libutil.h │ │ ├── list_link.h │ │ ├── log.h │ │ ├── mom_func.h │ │ ├── mom_hook_func.h │ │ ├── mom_server.h │ │ ├── mom_vnode.h │ │ ├── net_connect.h │ │ ├── pbs_array_list.h │ │ ├── pbs_assert.h │ │ ├── pbs_client_thread.h │ │ ├── pbs_db.h │ │ ├── pbs_ecl.h │ │ ├── pbs_entlim.h │ │ ├── pbs_error.h │ │ ├── pbs_idx.h │ │ ├── pbs_ifl.h │ │ ├── pbs_internal.h │ │ ├── pbs_json.h │ │ ├── pbs_license.h │ │ ├── pbs_mpp.h │ │ ├── pbs_nodes.h │ │ ├── pbs_python.h │ │ ├── pbs_python_private.h │ │ ├── pbs_reliable.h │ │ ├── pbs_sched.h │ │ ├── pbs_share.h │ │ ├── pbs_v1_module_common.i │ │ ├── pbs_version.h.in │ │ ├── placementsets.h │ │ ├── port_forwarding.h │ │ ├── portability.h │ │ ├── provision.h │ │ ├── qmgr.h │ │ ├── queue.h │ │ ├── range.h │ │ ├── reservation.h │ │ ├── resmon.h │ │ ├── resource.h │ │ ├── resv_node.h │ │ ├── rm.h │ │ ├── sched_cmds.h │ │ ├── server.h │ │ ├── server_limits.h │ │ ├── site_job_attr_def.h │ │ ├── site_job_attr_enum.h │ │ ├── site_qmgr_node_print.h │ │ ├── site_qmgr_que_print.h │ │ ├── site_qmgr_sched_print.h │ │ ├── site_qmgr_svr_print.h │ │ ├── site_que_attr_def.h │ │ ├── site_que_attr_enum.h │ │ ├── site_queue.h │ │ ├── site_resc_attr_def.h │ │ ├── site_resv_attr_def.h │ │ ├── site_resv_attr_enum.h │ │ 
├── site_sched_attr_def.h │ │ ├── site_sched_attr_enum.h │ │ ├── site_svr_attr_def.h │ │ ├── site_svr_attr_enum.h │ │ ├── svrfunc.h │ │ ├── ticket.h │ │ ├── tm.h │ │ ├── tm_.h │ │ ├── tpp.h │ │ ├── tracking.h │ │ ├── user.h │ │ └── work_task.h │ ├── lib/ │ │ ├── Libattr/ │ │ │ ├── Long_.c │ │ │ ├── Makefile.am │ │ │ ├── attr_atomic.c │ │ │ ├── attr_fn_acl.c │ │ │ ├── attr_fn_arst.c │ │ │ ├── attr_fn_b.c │ │ │ ├── attr_fn_c.c │ │ │ ├── attr_fn_entlim.c │ │ │ ├── attr_fn_f.c │ │ │ ├── attr_fn_hold.c │ │ │ ├── attr_fn_intr.c │ │ │ ├── attr_fn_l.c │ │ │ ├── attr_fn_ll.c │ │ │ ├── attr_fn_resc.c │ │ │ ├── attr_fn_size.c │ │ │ ├── attr_fn_str.c │ │ │ ├── attr_fn_time.c │ │ │ ├── attr_fn_unkn.c │ │ │ ├── attr_func.c │ │ │ ├── attr_node_func.c │ │ │ ├── attr_resc_func.c │ │ │ ├── master_job_attr_def.xml │ │ │ ├── master_node_attr_def.xml │ │ │ ├── master_queue_attr_def.xml │ │ │ ├── master_resc_def_all.xml │ │ │ ├── master_resv_attr_def.xml │ │ │ ├── master_sched_attr_def.xml │ │ │ ├── master_svr_attr_def.xml │ │ │ ├── resc_map.c │ │ │ ├── strToL.c │ │ │ ├── strTouL.c │ │ │ └── uLTostr.c │ │ ├── Libauth/ │ │ │ ├── Makefile.am │ │ │ ├── README.md │ │ │ ├── gss/ │ │ │ │ ├── Makefile.am │ │ │ │ └── pbs_gss.c │ │ │ └── munge/ │ │ │ ├── Makefile.am │ │ │ └── munge_supp.c │ │ ├── Libcmds/ │ │ │ ├── batch_status.c │ │ │ ├── check_job_script.c │ │ │ ├── chk_Jrange.c │ │ │ ├── ck_job_name.c │ │ │ ├── cmds_common.c │ │ │ ├── cnt2server.c │ │ │ ├── cs_error.c │ │ │ ├── cvtdate.c │ │ │ ├── err_handling.c │ │ │ ├── get_attr.c │ │ │ ├── get_dataservice_usr.c │ │ │ ├── get_server.c │ │ │ ├── isjobid.c │ │ │ ├── locate_job.c │ │ │ ├── parse_at.c │ │ │ ├── parse_depend.c │ │ │ ├── parse_destid.c │ │ │ ├── parse_equal.c │ │ │ ├── parse_jobid.c │ │ │ ├── parse_stage.c │ │ │ ├── prepare_path.c │ │ │ ├── set_attr.c │ │ │ └── set_resource.c │ │ ├── Libdb/ │ │ │ ├── Makefile.am │ │ │ └── pgsql/ │ │ │ ├── Makefile.am │ │ │ ├── db_attr.c │ │ │ ├── db_common.c │ │ │ ├── db_job.c │ │ │ ├── db_node.c 
│ │ │ ├── db_postgres.h │ │ │ ├── db_que.c │ │ │ ├── db_resv.c │ │ │ ├── db_sched.c │ │ │ ├── db_svr.c │ │ │ ├── pbs_db_env │ │ │ ├── pbs_db_schema.sql │ │ │ ├── pbs_db_utility │ │ │ ├── pbs_ds_systemd │ │ │ └── pbs_schema_upgrade │ │ ├── Libdis/ │ │ │ ├── dis.c │ │ │ ├── dis_.h │ │ │ ├── dis_helpers.c │ │ │ ├── discui_.c │ │ │ ├── discul_.c │ │ │ ├── discull_.c │ │ │ ├── disi10d_.c │ │ │ ├── disi10l_.c │ │ │ ├── disiui_.c │ │ │ ├── disp10d_.c │ │ │ ├── disp10l_.c │ │ │ ├── disrcs.c │ │ │ ├── disrd.c │ │ │ ├── disrf.c │ │ │ ├── disrfcs.c │ │ │ ├── disrfst.c │ │ │ ├── disrl.c │ │ │ ├── disrl_.c │ │ │ ├── disrsc.c │ │ │ ├── disrsi.c │ │ │ ├── disrsi_.c │ │ │ ├── disrsl.c │ │ │ ├── disrsl_.c │ │ │ ├── disrsll_.c │ │ │ ├── disrss.c │ │ │ ├── disrst.c │ │ │ ├── disruc.c │ │ │ ├── disrui.c │ │ │ ├── disrul.c │ │ │ ├── disrull.c │ │ │ ├── disrus.c │ │ │ ├── diswcs.c │ │ │ ├── diswf.c │ │ │ ├── diswl_.c │ │ │ ├── diswsi.c │ │ │ ├── diswsl.c │ │ │ ├── diswui.c │ │ │ ├── diswui_.c │ │ │ ├── diswul.c │ │ │ ├── diswull.c │ │ │ └── ps_dis.c │ │ ├── Libecl/ │ │ │ ├── ecl_verify.c │ │ │ ├── ecl_verify_datatypes.c │ │ │ ├── ecl_verify_object_name.c │ │ │ ├── ecl_verify_values.c │ │ │ └── pbs_client_thread.c │ │ ├── Libifl/ │ │ │ ├── DIS_decode.c │ │ │ ├── DIS_encode.c │ │ │ ├── Makefile.am │ │ │ ├── PBS_attr.c │ │ │ ├── advise.c │ │ │ ├── auth.c │ │ │ ├── conn_table.c │ │ │ ├── dec_DelJobList.c │ │ │ ├── dec_reply.c │ │ │ ├── enc_reply.c │ │ │ ├── entlim_parse.c │ │ │ ├── get_svrport.c │ │ │ ├── grunt_parse.c │ │ │ ├── ifl_impl.c │ │ │ ├── ifl_pointers.c │ │ │ ├── ifl_util.c │ │ │ ├── int_cred.c │ │ │ ├── int_hook.c │ │ │ ├── int_jcred.c │ │ │ ├── int_manage2.c │ │ │ ├── int_manager.c │ │ │ ├── int_modify_resv.c │ │ │ ├── int_msg2.c │ │ │ ├── int_rdrpy.c │ │ │ ├── int_sig2.c │ │ │ ├── int_status.c │ │ │ ├── int_status2.c │ │ │ ├── int_submit.c │ │ │ ├── int_submit_resv.c │ │ │ ├── int_ucred.c │ │ │ ├── list_link.c │ │ │ ├── pbsD_Preempt_Jobs.c │ │ │ ├── pbsD_alterjob.c │ │ │ ├── 
pbsD_confirmresv.c │ │ │ ├── pbsD_connect.c │ │ │ ├── pbsD_defschreply.c │ │ │ ├── pbsD_deljob.c │ │ │ ├── pbsD_deljoblist.c │ │ │ ├── pbsD_delresv.c │ │ │ ├── pbsD_holdjob.c │ │ │ ├── pbsD_locjob.c │ │ │ ├── pbsD_manager.c │ │ │ ├── pbsD_modify_resv.c │ │ │ ├── pbsD_movejob.c │ │ │ ├── pbsD_msgjob.c │ │ │ ├── pbsD_orderjo.c │ │ │ ├── pbsD_rerunjo.c │ │ │ ├── pbsD_resc.c │ │ │ ├── pbsD_rlsjob.c │ │ │ ├── pbsD_runjob.c │ │ │ ├── pbsD_selectj.c │ │ │ ├── pbsD_sigjob.c │ │ │ ├── pbsD_stagein.c │ │ │ ├── pbsD_stathook.c │ │ │ ├── pbsD_stathost.c │ │ │ ├── pbsD_statjob.c │ │ │ ├── pbsD_statnode.c │ │ │ ├── pbsD_statque.c │ │ │ ├── pbsD_statresv.c │ │ │ ├── pbsD_statrsc.c │ │ │ ├── pbsD_statsched.c │ │ │ ├── pbsD_statsrv.c │ │ │ ├── pbsD_submit.c │ │ │ ├── pbsD_submit_resv.c │ │ │ ├── pbsD_termin.c │ │ │ ├── pbs_delstatfree.c │ │ │ ├── pbs_get_attribute_errors.c │ │ │ ├── pbs_geterrmg.c │ │ │ ├── pbs_geterrno.c │ │ │ ├── pbs_ifl.i │ │ │ ├── pbs_loadconf.c │ │ │ ├── pbs_quote_parse.c │ │ │ ├── pbs_statfree.c │ │ │ ├── rm.c │ │ │ ├── strsep.c │ │ │ ├── tcp_dis.c │ │ │ ├── tm.c │ │ │ └── xml_encode_decode.c │ │ ├── Libjson/ │ │ │ ├── Makefile.am │ │ │ └── cJSON/ │ │ │ ├── Makefile.am │ │ │ └── pbs_cjson.c │ │ ├── Liblicensing/ │ │ │ ├── Makefile.am │ │ │ ├── liblicense.h │ │ │ └── license_client.c │ │ ├── Liblog/ │ │ │ ├── Makefile.am │ │ │ ├── chk_file_sec.c │ │ │ ├── log_event.c │ │ │ ├── pbs_log.c │ │ │ ├── pbs_messages.c │ │ │ └── setup_env.c │ │ ├── Libnet/ │ │ │ ├── Makefile.am │ │ │ ├── get_hostaddr.c │ │ │ ├── hnls.c │ │ │ ├── net_client.c │ │ │ ├── net_server.c │ │ │ ├── net_set_clse.c │ │ │ └── port_forwarding.c │ │ ├── Libpbs/ │ │ │ ├── Makefile.am │ │ │ └── pbs.pc.in │ │ ├── Libpython/ │ │ │ ├── Makefile.am │ │ │ ├── common_python_utils.c │ │ │ ├── module_pbs_v1.c │ │ │ ├── pbs_python_external.c │ │ │ ├── pbs_python_import_types.c │ │ │ ├── pbs_python_svr_external.c │ │ │ ├── pbs_python_svr_internal.c │ │ │ ├── pbs_python_svr_size_type.c │ │ │ └── 
shared_python_utils.c │ │ ├── Libsec/ │ │ │ ├── Makefile.am │ │ │ └── cs_standard.c │ │ ├── Libsite/ │ │ │ ├── Makefile.am │ │ │ ├── site_allow_u.c │ │ │ ├── site_alt_rte.c │ │ │ ├── site_check_u.c │ │ │ ├── site_map_usr.c │ │ │ ├── site_mom_chu.c │ │ │ ├── site_mom_ckp.c │ │ │ └── site_mom_jst.c │ │ ├── Libtpp/ │ │ │ ├── Makefile.am │ │ │ ├── tpp_client.c │ │ │ ├── tpp_em.c │ │ │ ├── tpp_internal.h │ │ │ ├── tpp_platform.c │ │ │ ├── tpp_router.c │ │ │ ├── tpp_transport.c │ │ │ └── tpp_util.c │ │ ├── Libutil/ │ │ │ ├── Makefile.am │ │ │ ├── avltree.c │ │ │ ├── daemon_protect.c │ │ │ ├── dedup_jobids.c │ │ │ ├── entlim.c │ │ │ ├── execvnode_seq_util.c │ │ │ ├── get_hostname.c │ │ │ ├── hook.c │ │ │ ├── misc_utils.c │ │ │ ├── pbs_aes_encrypt.c │ │ │ ├── pbs_array_list.c │ │ │ ├── pbs_ical.c │ │ │ ├── pbs_idx.c │ │ │ ├── pbs_secrets.c │ │ │ ├── range.c │ │ │ ├── thread_utils.c │ │ │ └── work_task.c │ │ └── Makefile.am │ ├── modules/ │ │ ├── Makefile.am │ │ └── python/ │ │ ├── Makefile.am │ │ ├── pbs/ │ │ │ ├── __init__.py │ │ │ └── v1/ │ │ │ ├── __init__.py │ │ │ ├── _attr_types.py │ │ │ ├── _base_types.py │ │ │ ├── _exc_types.py │ │ │ ├── _export_types.py │ │ │ ├── _pmi_cray.py │ │ │ ├── _pmi_none.py │ │ │ ├── _pmi_sgi.py │ │ │ ├── _pmi_types.py │ │ │ ├── _pmi_utils.py │ │ │ └── _svr_types.py │ │ ├── pbs_hooks/ │ │ │ ├── PBS_alps_inventory_check.HK │ │ │ ├── PBS_alps_inventory_check.PY │ │ │ ├── PBS_cray_atom.CF │ │ │ ├── PBS_cray_atom.HK │ │ │ ├── PBS_cray_atom.PY │ │ │ ├── PBS_power.CF │ │ │ ├── PBS_power.HK │ │ │ ├── PBS_power.PY │ │ │ ├── PBS_xeon_phi_provision.HK │ │ │ └── PBS_xeon_phi_provision.PY │ │ ├── pbs_v1_module_init.c │ │ ├── setup.cfg │ │ └── setup.py │ ├── mom_rcp/ │ │ ├── Makefile.am │ │ ├── README │ │ ├── extern.h │ │ ├── pathnames.h │ │ ├── pbs_stat.h │ │ ├── rcp.c │ │ ├── replace.c │ │ └── util.c │ ├── resmom/ │ │ ├── Makefile.am │ │ ├── catch_child.c │ │ ├── job_recov_fs.c │ │ ├── linux/ │ │ │ ├── alps.c │ │ │ ├── mom_func.c │ │ │ ├── mom_mach.c 
│ │ │ ├── mom_mach.h │ │ │ ├── mom_start.c │ │ │ └── pe_input.c │ │ ├── mock_run.c │ │ ├── mock_run.h │ │ ├── mom_comm.c │ │ ├── mom_hook_func.c │ │ ├── mom_inter.c │ │ ├── mom_main.c │ │ ├── mom_pmix.c │ │ ├── mom_pmix.h │ │ ├── mom_server.c │ │ ├── mom_updates_bundle.c │ │ ├── mom_vnode.c │ │ ├── mom_walltime.c │ │ ├── popen.c │ │ ├── prolog.c │ │ ├── renew_creds.c │ │ ├── renew_creds.h │ │ ├── requests.c │ │ ├── rm_dep.h │ │ ├── stage_func.c │ │ ├── start_exec.c │ │ └── vnode_storage.c │ ├── scheduler/ │ │ ├── Makefile.am │ │ ├── buckets.cpp │ │ ├── buckets.h │ │ ├── check.cpp │ │ ├── check.h │ │ ├── config.h │ │ ├── constant.h │ │ ├── data_types.h │ │ ├── dedtime.cpp │ │ ├── dedtime.h │ │ ├── fairshare.cpp │ │ ├── fairshare.h │ │ ├── fifo.cpp │ │ ├── fifo.h │ │ ├── get_4byte.cpp │ │ ├── globals.cpp │ │ ├── globals.h │ │ ├── job_info.cpp │ │ ├── job_info.h │ │ ├── limits.cpp │ │ ├── limits_if.h │ │ ├── list_order.awk │ │ ├── misc.cpp │ │ ├── misc.h │ │ ├── multi_threading.cpp │ │ ├── multi_threading.h │ │ ├── node_info.cpp │ │ ├── node_info.h │ │ ├── node_partition.cpp │ │ ├── node_partition.h │ │ ├── parse.cpp │ │ ├── parse.h │ │ ├── pbs_bitmap.cpp │ │ ├── pbs_bitmap.h │ │ ├── pbs_dedicated │ │ ├── pbs_holidays │ │ ├── pbs_holidays.2017 │ │ ├── pbs_resource_group │ │ ├── pbs_sched.cpp │ │ ├── pbs_sched_bare.cpp │ │ ├── pbs_sched_config │ │ ├── pbs_sched_utils.cpp │ │ ├── pbsfs.cpp │ │ ├── prev_job_info.cpp │ │ ├── prev_job_info.h │ │ ├── prime.cpp │ │ ├── prime.h │ │ ├── queue.cpp │ │ ├── queue.h │ │ ├── queue_info.cpp │ │ ├── queue_info.h │ │ ├── resource.cpp │ │ ├── resource.h │ │ ├── resource_resv.cpp │ │ ├── resource_resv.h │ │ ├── resv_info.cpp │ │ ├── resv_info.h │ │ ├── sched_exception.cpp │ │ ├── sched_ifl_wrappers.cpp │ │ ├── server_info.cpp │ │ ├── server_info.h │ │ ├── simulate.cpp │ │ ├── simulate.h │ │ ├── site_code.cpp │ │ ├── site_code.h │ │ ├── site_data.h │ │ ├── sort.cpp │ │ ├── sort.h │ │ ├── state_count.cpp │ │ └── state_count.h │ ├── 
server/ │ │ ├── Makefile.am │ │ ├── accounting.c │ │ ├── array_func.c │ │ ├── attr_recov.c │ │ ├── attr_recov_db.c │ │ ├── checkkey.c │ │ ├── daemon_info.c │ │ ├── dis_read.c │ │ ├── failover.c │ │ ├── geteusernam.c │ │ ├── hook_func.c │ │ ├── issue_request.c │ │ ├── jattr_get_set.c │ │ ├── job_func.c │ │ ├── job_recov_db.c │ │ ├── job_route.c │ │ ├── license_client.c │ │ ├── licensing_func.c │ │ ├── mom_info.c │ │ ├── nattr_get_set.c │ │ ├── node_func.c │ │ ├── node_manager.c │ │ ├── node_recov_db.c │ │ ├── pbs_comm.c │ │ ├── pbs_db_func.c │ │ ├── pbsd_init.c │ │ ├── pbsd_main.c │ │ ├── process_request.c │ │ ├── qattr_get_set.c │ │ ├── queue_func.c │ │ ├── queue_recov_db.c │ │ ├── rattr_get_set.c │ │ ├── reply_send.c │ │ ├── req_cred.c │ │ ├── req_delete.c │ │ ├── req_getcred.c │ │ ├── req_holdjob.c │ │ ├── req_jobobit.c │ │ ├── req_locate.c │ │ ├── req_manager.c │ │ ├── req_message.c │ │ ├── req_modify.c │ │ ├── req_movejob.c │ │ ├── req_preemptjob.c │ │ ├── req_quejob.c │ │ ├── req_register.c │ │ ├── req_rerun.c │ │ ├── req_rescq.c │ │ ├── req_runjob.c │ │ ├── req_select.c │ │ ├── req_shutdown.c │ │ ├── req_signal.c │ │ ├── req_stat.c │ │ ├── req_track.c │ │ ├── resc_attr.c │ │ ├── run_sched.c │ │ ├── sattr_get_set.c │ │ ├── sched_attr_get_set.c │ │ ├── sched_func.c │ │ ├── setup_resc.c │ │ ├── stat_job.c │ │ ├── svr_chk_owner.c │ │ ├── svr_connect.c │ │ ├── svr_credfunc.c │ │ ├── svr_func.c │ │ ├── svr_jobfunc.c │ │ ├── svr_mail.c │ │ ├── svr_movejob.c │ │ ├── svr_recov_db.c │ │ ├── svr_resccost.c │ │ ├── user_func.c │ │ └── vnparse.c │ ├── tools/ │ │ ├── Makefile.am │ │ ├── chk_tree.c │ │ ├── create_env_file.sh │ │ ├── hostn.c │ │ ├── pbsTclInit.c │ │ ├── pbsTkInit.c │ │ ├── pbs_ds_monitor.c │ │ ├── pbs_idled.c │ │ ├── pbs_probe.c │ │ ├── pbs_python.c │ │ ├── pbs_sleep.c │ │ ├── pbs_tclWrap.c │ │ ├── pbs_upgrade_job.c │ │ ├── printjob.c │ │ ├── rstester.c │ │ ├── site_tclWrap.c │ │ ├── tracejob.c │ │ ├── tracejob.h │ │ └── wrap_tcl.sh.in │ └── unsupported/ │ 
├── Makefile.am │ ├── NodeHealthCheck.json │ ├── NodeHealthCheck.py │ ├── README │ ├── ReliableJobStartup.py │ ├── cray_readme │ ├── load_balance.py │ ├── mom_dyn_res.py │ ├── pbs-mailer/ │ │ ├── README.md │ │ ├── debian/ │ │ │ ├── changelog │ │ │ ├── compat │ │ │ ├── conffiles │ │ │ ├── control │ │ │ ├── pbs-mailer.service │ │ │ └── rules │ │ ├── pbs_mail.json │ │ ├── pbs_mail_saver │ │ ├── pbs_mail_sender │ │ ├── release-deb.sh │ │ └── release-rpm.sh │ ├── pbs_config │ ├── pbs_jobs_at.8B │ ├── pbs_loganalyzer │ ├── pbs_output.py │ ├── pbs_rescquery.3B │ ├── pbs_rmget.8B │ ├── pbs_rmget.c │ ├── pbs_stat │ ├── rapid_inter.py │ ├── renew-test/ │ │ ├── base64.c │ │ ├── base64.h │ │ └── renew-test.c │ ├── run_pelog_shell.ini │ ├── run_pelog_shell.py │ ├── sgiICEplacement.sh │ ├── sgiICEvnode.sh │ └── sgigenvnodelist.awk ├── test/ │ ├── Makefile.am │ ├── fw/ │ │ ├── MANIFEST.in │ │ ├── Makefile.am │ │ ├── bin/ │ │ │ ├── pbs_as │ │ │ ├── pbs_benchpress │ │ │ ├── pbs_compare_results │ │ │ ├── pbs_config │ │ │ ├── pbs_cov │ │ │ ├── pbs_loganalyzer │ │ │ ├── pbs_py_spawn │ │ │ ├── pbs_snapshot │ │ │ ├── pbs_stat │ │ │ ├── pbs_swigify │ │ │ └── pbs_sys_report │ │ ├── doc/ │ │ │ ├── caveats.rst │ │ │ ├── commands.rst │ │ │ ├── conf.py │ │ │ ├── howtotest.rst │ │ │ ├── index.rst │ │ │ ├── install.rst │ │ │ ├── intro.rst │ │ │ ├── make.bat │ │ │ ├── ptl.rst │ │ │ └── tutorial.rst │ │ ├── ptl/ │ │ │ ├── __init__.py.in │ │ │ ├── lib/ │ │ │ │ ├── __init__.py │ │ │ │ ├── pbs_api_to_cli.py │ │ │ │ ├── pbs_ifl_mock.py │ │ │ │ ├── pbs_testlib.py │ │ │ │ ├── ptl_batchutils.py │ │ │ │ ├── ptl_comm.py │ │ │ │ ├── ptl_config.py │ │ │ │ ├── ptl_constants.py │ │ │ │ ├── ptl_entities.py │ │ │ │ ├── ptl_error.py │ │ │ │ ├── ptl_expect_action.py │ │ │ │ ├── ptl_fairshare.py │ │ │ │ ├── ptl_mom.py │ │ │ │ ├── ptl_object.py │ │ │ │ ├── ptl_resourceresv.py │ │ │ │ ├── ptl_sched.py │ │ │ │ ├── ptl_server.py │ │ │ │ ├── ptl_service.py │ │ │ │ ├── ptl_types.py │ │ │ │ └── ptl_wrappers.py │ │ │ └── 
utils/ │ │ │ ├── __init__.py │ │ │ ├── pbs_anonutils.py │ │ │ ├── pbs_cliutils.py │ │ │ ├── pbs_covutils.py │ │ │ ├── pbs_crayutils.py │ │ │ ├── pbs_dshutils.py │ │ │ ├── pbs_logutils.py │ │ │ ├── pbs_procutils.py │ │ │ ├── pbs_snaputils.py │ │ │ ├── pbs_testsuite.py │ │ │ ├── pbs_testusers.py │ │ │ └── plugins/ │ │ │ ├── __init__.py │ │ │ ├── ptl_report_json.py │ │ │ ├── ptl_test_data.py │ │ │ ├── ptl_test_db.py │ │ │ ├── ptl_test_info.py │ │ │ ├── ptl_test_loader.py │ │ │ ├── ptl_test_runner.py │ │ │ └── ptl_test_tags.py │ │ ├── ptl.csh │ │ ├── ptl.sh │ │ ├── ptlreport │ │ ├── requirements.txt │ │ └── setup.py.in │ ├── scripts/ │ │ └── qsub_multi.sh │ └── tests/ │ ├── Makefile.am │ ├── __init__.py │ ├── functional/ │ │ ├── __init__.py │ │ ├── pbs_Rrecord_resources_used.py │ │ ├── pbs_acct_log.py │ │ ├── pbs_accumulate_resc_used.py │ │ ├── pbs_acl_groups.py │ │ ├── pbs_acl_host_moms.py │ │ ├── pbs_acl_host_queue.py │ │ ├── pbs_acl_host_server.py │ │ ├── pbs_admin_suspend.py │ │ ├── pbs_allpart.py │ │ ├── pbs_alps_inventory_check_hook.py │ │ ├── pbs_alps_release_tunables.py │ │ ├── pbs_array_job_mail.py │ │ ├── pbs_basil_parser_err.py │ │ ├── pbs_basil_support.py │ │ ├── pbs_calendaring.py │ │ ├── pbs_cgroups_hook.py │ │ ├── pbs_check_job_attrib.py │ │ ├── pbs_checkpoint.py │ │ ├── pbs_client_response.py │ │ ├── pbs_complete_running_parent_job.py │ │ ├── pbs_conf_resv_stale_vnode.py │ │ ├── pbs_config.py │ │ ├── pbs_cpuset.py │ │ ├── pbs_cray_check_node_exclusivity.py │ │ ├── pbs_cray_hyperthread.py │ │ ├── pbs_cray_pagg_id.py │ │ ├── pbs_cray_reliable_job_startup.py │ │ ├── pbs_cray_smoketest.py │ │ ├── pbs_cray_suspend_resume.py │ │ ├── pbs_cray_vnode_per_numa.py │ │ ├── pbs_cray_vnode_pool.py │ │ ├── pbs_daemon_service_user.py │ │ ├── pbs_dup_acc_log_for_resv.py │ │ ├── pbs_eligible_time.py │ │ ├── pbs_equiv_classes.py │ │ ├── pbs_exceeded_resources_notification.py │ │ ├── pbs_execjob_susp_resume.py │ │ ├── pbs_fairshare.py │ │ ├── 
pbs_gen_nodefile_on_sister_mom.py │ │ ├── pbs_grunt.py │ │ ├── pbs_highreslog.py │ │ ├── pbs_holidays.py │ │ ├── pbs_hook_config_os_env.py │ │ ├── pbs_hook_crosslink_mom.py │ │ ├── pbs_hook_debug_input.py │ │ ├── pbs_hook_debug_nocrash.py │ │ ├── pbs_hook_exechost_periodic.py │ │ ├── pbs_hook_execjob_abort.py │ │ ├── pbs_hook_execjob_end.py │ │ ├── pbs_hook_execjob_prologue.py │ │ ├── pbs_hook_jobobit.py │ │ ├── pbs_hook_management.py │ │ ├── pbs_hook_modifyvnode_state_changes.py │ │ ├── pbs_hook_perf_stat.py │ │ ├── pbs_hook_postqueuejob.py │ │ ├── pbs_hook_set_attr.py │ │ ├── pbs_hook_set_interrupt.py │ │ ├── pbs_hook_set_jobenv.py │ │ ├── pbs_hook_set_nonexist.py │ │ ├── pbs_hook_timeout.py │ │ ├── pbs_hook_unset_res.py │ │ ├── pbs_hooksmoketest.py │ │ ├── pbs_hookswig.py │ │ ├── pbs_indirect_resources.py │ │ ├── pbs_init_script.py │ │ ├── pbs_job_array.py │ │ ├── pbs_job_array_comment.py │ │ ├── pbs_job_comment_on_resume.py │ │ ├── pbs_job_default_group.py │ │ ├── pbs_job_dependency.py │ │ ├── pbs_job_purge.py │ │ ├── pbs_job_requeue_timeout_error.py │ │ ├── pbs_job_routing.py │ │ ├── pbs_job_script.py │ │ ├── pbs_job_sort_formula.py │ │ ├── pbs_job_status_after_mom_hup.py │ │ ├── pbs_job_task.py │ │ ├── pbs_maintenance_reservations.py │ │ ├── pbs_modifyresv_hook.py │ │ ├── pbs_mom_hook_sync.py │ │ ├── pbs_mom_hooks_test.py │ │ ├── pbs_mom_job_dir.py │ │ ├── pbs_mom_local_nodename.py │ │ ├── pbs_mom_mock_run.py │ │ ├── pbs_mom_walltime.py │ │ ├── pbs_moved_job.py │ │ ├── pbs_moved_job_local.py │ │ ├── pbs_multi_sched.py │ │ ├── pbs_multiple_execjob_launch_hook.py │ │ ├── pbs_node_buckets.py │ │ ├── pbs_node_jobs_restart.py │ │ ├── pbs_node_jobs_restart_multinode.py │ │ ├── pbs_node_rampdown.py │ │ ├── pbs_node_rampdown_keep_select.py │ │ ├── pbs_node_sleep_state.py │ │ ├── pbs_nodes_json.py │ │ ├── pbs_nodes_queues.py │ │ ├── pbs_nonprint_characters.py │ │ ├── pbs_offline_vnodes.py │ │ ├── pbs_one_event_multiple_hooks.py │ │ ├── pbs_only_explicit_psets.py │ │ 
├── pbs_only_small_files_over_tpp.py │ │ ├── pbs_passing_environment_variable.py │ │ ├── pbs_pbsnodes.py │ │ ├── pbs_pbsnodes_output_trimmed.py │ │ ├── pbs_peer.py │ │ ├── pbs_periodic_constant.py │ │ ├── pbs_power_provisioning_cray.py │ │ ├── pbs_power_provisioning_sgi.py │ │ ├── pbs_preemption.py │ │ ├── pbs_printjob.py │ │ ├── pbs_provisioning.py │ │ ├── pbs_provisioning_enhancement.py │ │ ├── pbs_python_restart_settings.py │ │ ├── pbs_python_test.py │ │ ├── pbs_qdel.py │ │ ├── pbs_qmgr.py │ │ ├── pbs_qrun.py │ │ ├── pbs_qselect.py │ │ ├── pbs_qstat.py │ │ ├── pbs_qstat_2servers.py │ │ ├── pbs_qstat_count.py │ │ ├── pbs_qstat_formats.py │ │ ├── pbs_qsub_direct_write.py │ │ ├── pbs_qsub_opts_args.py │ │ ├── pbs_qsub_remove_files.py │ │ ├── pbs_qsub_script.py │ │ ├── pbs_qsub_wblock.py │ │ ├── pbs_que_resc_usage.py │ │ ├── pbs_ralter.py │ │ ├── pbs_release_limited_res_suspend.py │ │ ├── pbs_reliable_job_startup.py │ │ ├── pbs_resc_custom_perm.py │ │ ├── pbs_resc_used_single_node.py │ │ ├── pbs_reservations.py │ │ ├── pbs_resource_multichunk.py │ │ ├── pbs_resource_unset.py │ │ ├── pbs_resource_usage_log.py │ │ ├── pbs_resv_begin_hook.py │ │ ├── pbs_resv_confirm_hook.py │ │ ├── pbs_resv_end_hook.py │ │ ├── pbs_resv_start_dur_end.py │ │ ├── pbs_root_owned_script.py │ │ ├── pbs_rstat.py │ │ ├── pbs_runjob_hook.py │ │ ├── pbs_sched_attr_updates.py │ │ ├── pbs_sched_badstate.py │ │ ├── pbs_sched_fifo.py │ │ ├── pbs_sched_preempt_enforce_resumption.py │ │ ├── pbs_sched_rerun.py │ │ ├── pbs_sched_runjobwait.py │ │ ├── pbs_sched_signal.py │ │ ├── pbs_schedule_indirect_resources.py │ │ ├── pbs_server_hook_attr.py │ │ ├── pbs_server_periodic_hook.py │ │ ├── pbs_set_enforcement.py │ │ ├── pbs_sister_mom_crash.py │ │ ├── pbs_snapshot_unittest.py │ │ ├── pbs_soft_walltime.py │ │ ├── pbs_stf.py │ │ ├── pbs_strict_ordering.py │ │ ├── pbs_support_linux_hook_event_phase1_2.py │ │ ├── pbs_suspend_resume_accounting.py │ │ ├── pbs_svr_dyn_res.py │ │ ├── pbs_systemd.py │ │ ├── 
pbs_test_entity_limits.py │ │ ├── pbs_test_qorder.py │ │ ├── pbs_test_run_count.py │ │ ├── pbs_test_svr_dflt.py │ │ ├── pbs_test_tpp.py │ │ ├── pbs_trillion_jobid.py │ │ ├── pbs_two_mom_hooks_resources_used.py │ │ ├── pbs_types.py │ │ ├── pbs_unknown_resource_hook_update.py │ │ ├── pbs_unset_exectime.py │ │ ├── pbs_user_reliability.py │ │ ├── pbs_validate_job_qsub_attributes.py │ │ └── pbs_verify_log_output.py │ ├── interfaces/ │ │ ├── __init__.py │ │ ├── pbs_libpbs_so.py │ │ ├── pbs_node_partition.py │ │ ├── pbs_partition.py │ │ ├── pbs_preempt_params.py │ │ └── pbs_sched_interface_test.py │ ├── pbs_smoketest.py │ ├── performance/ │ │ ├── __init__.py │ │ ├── pbs_cgroups_stress.py │ │ ├── pbs_client_nagle_performance.py │ │ ├── pbs_equiv_classes_perf.py │ │ ├── pbs_history_cleanup_quasihang.py │ │ ├── pbs_jobperf.py │ │ ├── pbs_preemptperformance.py │ │ ├── pbs_qstat_performance.py │ │ ├── pbs_qsub_performance.py │ │ ├── pbs_rerunjob_file_transfer_perf.py │ │ ├── pbs_runjobwait_perf.py │ │ ├── pbs_sched_perf.py │ │ ├── pbs_standing_resv_quasihang.py │ │ └── test_dependency_perf.py │ ├── resilience/ │ │ ├── __init__.py │ │ └── pbs_hook_alarm_large_multinode_job.py │ ├── security/ │ │ ├── __init__.py │ │ ├── pbs_command_injection.py │ │ └── pbs_multiple_auth.py │ ├── selftest/ │ │ ├── __init__.py │ │ ├── pbs_config_sched.py │ │ ├── pbs_cycles_test.py │ │ ├── pbs_default_timeout.py │ │ ├── pbs_dshutils_tests.py │ │ ├── pbs_expect.py │ │ ├── pbs_initservices.py │ │ ├── pbs_job_cleanup.py │ │ ├── pbs_json_test_report.py │ │ ├── pbs_manager.py │ │ ├── pbs_managers_operators.py │ │ ├── pbs_param_dict.py │ │ ├── pbs_pbstestsuite.py │ │ ├── pbs_requirements_decorator.py │ │ ├── pbs_resvid_test.py │ │ ├── pbs_test_create_vnodes.py │ │ ├── pbs_test_revert_site_hooks.py │ │ ├── pbs_test_revert_to_defaults.py │ │ ├── pbs_testlogutils.py │ │ └── pbs_testparams_decorator.py │ └── upgrades/ │ └── __init__.py └── valgrind.supp ================================================ FILE 
CONTENTS ================================================ ================================================ FILE: .clang-format ================================================ --- Language: Cpp # BasedOnStyle: LLVM AccessModifierOffset: -2 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false AlignEscapedNewlines: Right AlignOperands: true AlignTrailingComments: true AllowAllArgumentsOnNextLine: true AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: All AllowShortLambdasOnASingleLine: All AllowShortIfStatementsOnASingleLine: Never AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: MultiLine BinPackArguments: true BinPackParameters: true BreakBeforeBraces: Custom BraceWrapping: AfterCaseLabel: false AfterClass: false AfterControlStatement: false AfterEnum: false AfterFunction: true AfterNamespace: false AfterObjCDeclaration: false AfterStruct: false AfterUnion: false AfterExternBlock: false BeforeCatch: false BeforeElse: false IndentBraces: false SplitEmptyFunction: true SplitEmptyRecord: true SplitEmptyNamespace: true BreakBeforeBinaryOperators: None BreakBeforeInheritanceComma: false BreakInheritanceList: BeforeColon BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeColon BreakAfterJavaFieldAnnotations: false BreakStringLiterals: true ColumnLimit: 0 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 8 Cpp11BracedListStyle: true DerivePointerAlignment: false DisableFormat: false ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true ForEachMacros: - foreach - 
Q_FOREACH - BOOST_FOREACH IncludeBlocks: Preserve IncludeCategories: - Regex: '[<"]pbs_config.h[>"]' Priority: -1 - Regex: '.*' Priority: 3 - Regex: '^"(llvm|llvm-c|clang|clang-c)/' Priority: 2 - Regex: '^(<|"(gtest|gmock|isl|json)/)' Priority: 4 IncludeIsMainRegex: '(Test)?$' IndentCaseLabels: true IndentPPDirectives: None IndentWidth: 8 IndentWrappedFunctionNames: false JavaScriptQuotes: Leave JavaScriptWrapImports: true KeepEmptyLinesAtTheStartOfBlocks: true MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCBinPackProtocolList: Auto ObjCBlockIndentWidth: 2 ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: true PenaltyBreakAssignment: 2 PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Right ReflowComments: false SortIncludes: false SortUsingDeclarations: true SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: Cpp11 StatementMacros: - Q_UNUSED - QT_REQUIRE_VERSION TabWidth: 8 UseTab: Always AlwaysBreakAfterReturnType: AllDefinitions SpaceAfterCStyleCast: true ... 
================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ #### Describe Bug or Feature #### Describe Your Change #### Link to Design Doc #### Attach Test and Valgrind Logs/Output ================================================ FILE: .github/checkclang ================================================ #!/bin/bash # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. 
# # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. export PATH=$PATH:/usr/local/bin/ checkdir="$(readlink -f $(dirname $0))" which clang-format 1>/dev/null 2>/dev/null if [ $? -ne 0 ]; then echo "Could not find clang-format command" 1>&2 exit 1 fi cd ${checkdir}/.. find . -iname *.h -o -iname *.c -o -iname *.cpp | xargs clang-format --dry-run exit $? ================================================ FILE: .github/checkpep8 ================================================ #!/bin/bash # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. 
# # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. checkdir="$(readlink -f $(dirname $0))" errors=0 which pep8 1>/dev/null 2>/dev/null if [ $? -ne 0 ]; then echo "Could not find pep8 command" 1>&2 exit 1 fi cd ${checkdir}/.. is_python_file() { name=$(basename ${1}) # special case # if .rst file then it will be considered # as a plain text file if [ "x${name##*.}" == "xrst" ]; then return 1 fi # special case # if __init__.py does not contain any code then file # command will consider it as plain text file if [ "x${name}" == "x__init__.py" ]; then return 0 fi if [ "x$(file --mime-type -b ${1})" == "xtext/x-python" ]; then return 0 fi return 1 } check_pep8() { pep8 --show-source ${1} >out_check_pep8 2>&1 return $? } for f in $(find test -type f) do if is_python_file ${f} then if ! check_pep8 ${f} then cat out_check_pep8 1>&2 rm -f out_check_pep8 errors=$((errors + 1)) fi if [ -x "${f}" ]; then echo "${f}: executable bit set" 1>&2 errors=$((errors + 1)) fi fi done if [ ${errors} -ne 0 ]; then exit 1 else exit 0 fi ================================================ FILE: .github/runchecks ================================================ #!/bin/bash # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. 
You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. declare -a listofchecks listofchecks[0]="checkpep8" listofchecks[1]="checkclang" checkdir=$(readlink -f $(dirname $0)) errors_fails=0 for check in ${listofchecks[@]} do echo -n "Running check: '${check}' ... " if [ ! -x "${checkdir}/${check}" ]; then echo "NOTFOUND" errors_fails=$((errors_fails + 1)) continue fi ${checkdir}/${check} >out 2>err if [ $? 
-ne 0 ]; then echo "FAILED" cat err errors_fails=$((errors_fails + 1)) else echo "OK" cat out fi done if [ ${errors_fails} -ne 0 ]; then exit 1 else exit 0 fi ================================================ FILE: .gitignore ================================================ # Object files *.o *.ko *.obj *.elf *.slo # Precompiled Headers *.gch *.pch # Libraries *.lib *.libs *.a *.la *.lo *.lai # module files *.mod # Shared objects (inc. Windows DLLs) *.dll *.so *.so.* *.dylib # Executables *.exe *.out *.app *.i*86 *.x86_64 *.hex # Debug files *.dSYM/ # Eclipse project files .project .cproject .pydevproject .settings/ .autotools .csettings/ .devcontainer/ # Files used by ctags tags # Files used by cscope cscope.files cscope.out #Visual Studio files *.user *.ncb *.suo .vscode/ win_configure/.vs/ # Files used by gtags GPATH GRTAGS GTAGS # Files/Directory generated by PBSTestLab ptl_test_results.html ptl_test_results.json test/fw/build/ test/fw/ptl/__init__.py test/fw/setup.py test/tests/ptl_test_results.json test/tests/*/ptl_test_results.json # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # Python Distribution / packaging .Python develop-eggs/ dist/ downloads/ eggs/ .eggs/ sdist/ *.egg-info/ .installed.cfg *.egg # pip Installer logs pip-log.txt pip-delete-this-directory.txt *.log # Build directory target/ target-*/ #PyCharm project directory .idea/ # From automake/autoconf autom4te.cache/ autoscan.log autoscan-*.log configure.scan config.status aclocal.m4 buildutils/config.guess buildutils/config.sub # Libtool libtool m4/libtool.m4 m4/ltoptions.m4 m4/ltsugar.m4 m4/ltversion.m4 m4/lt~obsolete.m4 buildutils/ltmain.sh # Build related files configure Makefile.in Makefile *.deps buildutils/pbs_mkdirs buildutils/ar-lib buildutils/compile buildutils/depcomp buildutils/install-sh buildutils/missing buildutils/py-compile buildutils/exclude_script buildutils/makedepend-sh buildutils/ylwrap # Generated binaries src/cmds/mpiexec src/cmds/nqs2pbs 
src/cmds/pbs_attach src/cmds/pbs_demux src/cmds/pbs_ds_password.bin src/cmds/pbs_lamboot src/cmds/pbs_mpihp src/cmds/pbs_mpilam src/cmds/pbs_mpirun src/cmds/pbs_rdel src/cmds/pbs_remsh src/cmds/pbs_rstat src/cmds/pbs_rsub src/cmds/pbs_ralter src/cmds/pbs_tmrsh src/cmds/pbsdsh src/cmds/pbsnodes src/cmds/pbs_release_nodes src/cmds/pbs_dataservice.bin src/cmds/pbsrun src/cmds/pbsrun_unwrap src/cmds/pbsrun_wrap src/cmds/qalter src/cmds/qdel src/cmds/qdisable src/cmds/qenable src/cmds/qhold src/cmds/qmgr src/cmds/qmove src/cmds/qmsg src/cmds/qorder src/cmds/qrerun src/cmds/qrls src/cmds/qrun src/cmds/qselect src/cmds/qsig src/cmds/qstart src/cmds/qstat src/cmds/qstop src/cmds/qsub src/cmds/qterm src/cmds/scripts/limits.pbs_mom src/cmds/scripts/limits.post_services src/cmds/scripts/pbs_habitat src/cmds/scripts/pbs_init.d src/cmds/scripts/pbs_poerun src/cmds/scripts/pbs_postinstall src/cmds/scripts/pbsrun.poe src/cmds/scripts/pbs_reload src/iff/pbs_iff src/mom_rcp/pbs_rcp src/resmom/pbs_mom src/scheduler/pbs_sched src/scheduler/pbs_sched_bare src/scheduler/pbsfs src/server/pbs_comm src/server/pbs_server.bin src/tools/pbs_ds_monitor src/tools/pbs_hostn src/tools/pbs_idled src/tools/pbs_probe src/tools/pbs_python src/tools/pbs_tclsh src/tools/pbs_upgrade_job src/tools/pbs_wish src/tools/printjob.bin src/tools/printjob_svr.bin src/tools/tracejob src/tools/wrap_tcl.sh src/tools/pbs_sleep src/unsupported/pbs_rmget src/unsupported/renew-test/renew-test # Generated source files src/include/pbs_version.h src/include/pbs_config.h src/include/pbs_config.h.in src/include/pbs_config.h.in~ src/include/job_attr_enum.h src/include/node_attr_enum.h src/include/queue_attr_enum.h src/include/resc_def_enum.h src/include/resv_attr_enum.h src/include/sched_attr_enum.h src/include/svr_attr_enum.h src/lib/Libattr/queue_attr_def.c src/lib/Libattr/resc_def_all.c src/lib/Libattr/resv_attr_def.c src/lib/Libattr/sched_attr_def.c src/lib/Libattr/svr_attr_def.c src/lib/Libattr/job_attr_def.c 
src/lib/Libattr/node_attr_def.c src/lib/Libpbs/ecl_job_attr_def.c src/lib/Libpbs/ecl_node_attr_def.c src/lib/Libpbs/ecl_queue_attr_def.c src/lib/Libpbs/ecl_resc_def_all.c src/lib/Libpbs/ecl_resv_attr_def.c src/lib/Libpbs/ecl_sched_attr_def.c src/lib/Libpbs/ecl_svr_attr_def.c src/include/stamp-h1 src/lib/Libpython/pbs_ifl.i src/lib/Libpython/pbs_ifl.py src/lib/Libpython/pbs_ifl_wrap.c src/lib/Libifl/pbs_ifl.py src/lib/Libifl/pbs_ifl_wrap.c src/include/job_attr_enum.h src/include/node_attr_enum.h src/include/queue_attr_enum.h src/include/resc_def_enum.h src/include/resv_attr_enum.h src/include/sched_attr_enum.h src/include/svr_attr_enum.h #Generated source files - Windows src/lib/Libecl/ecl_node_attr_def.c src/lib/Libecl/ecl_job_attr_def.c src/lib/Libecl/ecl_queue_attr_def.c src/lib/Libecl/ecl_resc_def_all.c src/lib/Libecl/ecl_resv_attr_def.c src/lib/Libecl/ecl_sched_attr_def.c src/lib/Libecl/ecl_svr_attr_def.c win_configure/projects/pbs_ifl.i win_configure/projects/pbs_ifl.py win_configure/projects/pbs_ifl_wrap.c #ci logs ci/logs/ ci/logs/prev_LOGS/ ci/.* ci/packages ci/ptl_ts_tree.json ci/docker-compose.json # Generated scripts src/cmds/scripts/modulefile src/cmds/scripts/pbs.service # Generated by make dist *.tar.gz src/lib/Libpbs/pbs.pc # rpm spec file *.spec # Other archive file types *.tar *.tar.bz *.tgz *.zip *.cpio *.rpm *.deb # Generated directories by autotools *.libs ================================================ FILE: CODE_OF_CONDUCT.md ================================================ #### OpenPBS Open Source Project ## **Code of Conduct** This code of conduct is a guide for members of the OpenPBS community. We are committed to providing an open and welcoming environment for the OpenPBS community. We expect that all members of the community will behave according to this code of conduct. This code of conduct is intended to explain the spirit in which we expect to communicate, not to be an exhaustive list. 
This code of conduct applies to all elements of the OpenPBS community: mailing lists, bug tracking systems, etc. Anyone who violates this code of conduct may be banned from the OpenPBS community. It is unacceptable to follow the letter but not the spirit of this code of conduct. Guidelines for code of conduct: * **Be friendly and patient.** * **Be welcoming:** We strive to be a community that welcomes and supports people of all backgrounds and identities. * **Be considerate:** Your work will be used by other people, and you in turn will depend on the work of others. Decisions you make affect everyone in the community, so please be mindful of your actions and always choose a non-confrontational approach. Remember: this is a global community and English is not everyone's primary language. * **Be respectful:** Disagreements may occur, but we cannot abide personal attacks. The health of the community depends on all members feeling comfortable and supported. If you don't agree, use discretion and be polite. * **Be careful in the words that we choose:** we are a community of professionals, and we conduct ourselves professionally. Be kind to others. Do not insult or put down other participants. Harassment and other exclusionary behavior aren’t acceptable. * **Try to understand why we disagree:** Disagreements, both social and technical, happen all the time. It is important that we resolve disagreements and differing views constructively. Different people have different perspectives on issues. Being unable to understand why someone holds a viewpoint doesn’t mean that they’re wrong. Don’t forget that it is human to err and blaming each other doesn’t get us anywhere. Instead, focus on helping to resolve issues and learning from mistakes. In addition, our open source community members are expected to abide by the **[OpenPBS Acceptable Use Policy](https://openpbs.atlassian.net/wiki/spaces/PBSPro/pages/5537837/Acceptable+Use+Policy)**. 
### Reporting Issues If you experience or witness unacceptable behavior — or have any other concerns — please report it by sending e-mail to webmaster@pbspro.org. All reports will be handled with discretion. In your report please include: * Your contact information. * Names (real, nicknames, or pseudonyms) of any individuals involved. If there are additional witnesses, please include them as well. Your account of what occurred, and if you believe the incident is ongoing. If there is a publicly available record (e.g. a mailing list archive or a public IRC logger), please include a link. * Any additional information that may be helpful. After filing a report, a representative will contact you personally, review the incident, follow up with any additional questions, and make a decision as to how to respond. If the person who is harassing you is part of the response team, they will recuse themselves from handling your incident. If the complaint originates from a member of the response team, it will be handled by a different member of the response team. We will respect confidentiality requests for the purpose of protecting victims of abuse. ### Attribution & Acknowledgements This code of conduct is based on the **[Open Code of Conduct v1.0](https://github.com/todogroup/opencodeofconduct)** from the **[TODOGroup](http://todogroup.org)**. We are thankful for their work and all the communities who have paved the way with codes of conduct. ### PBS Pro Contributor's Portal Please see the PBS Pro Contributor's Portal for the PBS Pro **[Code of Conduct](https://openpbs.atlassian.net/wiki/spaces/PBSPro/pages/5537835/Code+of+Conduct)**. Note: In May 2020, OpenPBS became the new name for the PBS Professional Open Source Project. (PBS Professional will be used to refer to the commercial version; OpenPBS to the Open Source version -- same code, easier naming.) 
As there are many parts to the project, it will take several weeks to change the name in all places, so you will continue to see references to PBS Pro (as in the above) -- stay tuned. ================================================ FILE: CONTRIBUTING.md ================================================ ### Contributing to the OpenPBS Open Source Project We're so happy that you want to contribute to OpenPBS! Please see the Contributor's Portal for details, guidelines, and how-to tutorials. Start at **[Becoming a Contributor to OpenPBS](https://pbspro.atlassian.net/wiki/spaces/DG/pages/20414474/Becoming+a+Contributor+to+PBS+Pro)**. Note: In May 2020, OpenPBS became the new name for the PBS Professional Open Source Project. (PBS Professional will be used to refer to the commercial version; OpenPBS to the Open Source version -- same code, easier naming.) As there are many parts to the project, it will take several weeks to change the name in all places, so you will continue to see references to PBS Pro (as in the above) -- stay tuned. ================================================ FILE: COPYRIGHT ================================================ Copyright (C) 1994-2021 Altair Engineering, Inc. For more information, contact Altair at www.altair.com. This file is part of both the OpenPBS software ("OpenPBS") and the PBS Professional ("PBS Pro") software. Open Source License Information: OpenPBS is free software. You can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OpenPBS is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. 
Commercial License Information: PBS Pro is commercially licensed software that shares a common core with the OpenPBS software. For a copy of the commercial license terms and conditions, go to: (http://www.pbspro.com/agreement.html) or contact the Altair Legal Department. Altair's dual-license business model allows companies, individuals, and organizations to create proprietary derivative works of OpenPBS and distribute them - whether embedded or bundled with other software - under a commercial license agreement. Use of Altair's trademarks, including but not limited to "PBS™", "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is subject to Altair's trademark licensing policies. ================================================ FILE: INSTALL ================================================ -------------------------------------------------------------------- How to install PBS using the configure script. 0. Disable SELinux. OpenPBS does not support SELinux. With SELinux enabled, initial start fails with datastore permission error. You can also define proper policy but it is out of scope of this guide. 1. Install the prerequisite packages for building PBS. For CentOS-8 systems you should configure and enable powertools repo for hwloc-devel and libedit-devel packages. 
You should run the following commands as root: dnf install -y dnf-plugins-core dnf config-manager --set-enabled powertools dnf install -y gcc make rpm-build libtool hwloc-devel \ libX11-devel libXt-devel libedit-devel libical-devel \ ncurses-devel perl postgresql-devel postgresql-contrib python3-devel tcl-devel \ tk-devel swig expat-devel openssl-devel libXext libXft \ autoconf automake gcc-c++ cjson-devel For CentOS-7 systems you should run the following command as root: yum install -y gcc make rpm-build libtool hwloc-devel \ libX11-devel libXt-devel libedit-devel libical-devel \ ncurses-devel perl postgresql-devel postgresql-contrib python3-devel tcl-devel \ tk-devel swig expat-devel openssl-devel libXext libXft \ autoconf automake gcc-c++ For openSUSE systems you should run the following command as root: zypper install gcc make rpm-build libtool hwloc-devel \ libX11-devel libXt-devel libedit-devel libical-devel \ ncurses-devel perl postgresql-devel postgresql-contrib python3-devel tcl-devel \ tk-devel swig libexpat-devel libopenssl-devel libXext-devel \ libXft-devel fontconfig autoconf automake gcc-c++ cJSON-devel For Debian systems you should run the following command as root: apt-get install gcc make libtool libhwloc-dev libx11-dev \ libxt-dev libedit-dev libical-dev ncurses-dev perl \ postgresql-server-dev-all postgresql-contrib python3-dev tcl-dev tk-dev swig \ libexpat-dev libssl-dev libxext-dev libxft-dev autoconf \ automake g++ libcjson-dev For Ubuntu-18.04 systems you should run the following command as root: apt install gcc make libtool libhwloc-dev libx11-dev \ libxt-dev libedit-dev libical-dev ncurses-dev perl \ postgresql-server-dev-all postgresql-contrib python3-dev tcl-dev tk-dev swig \ libexpat-dev libssl-dev libxext-dev libxft-dev autoconf \ automake g++ For Ubuntu-24.04 systems you should run the following command as root: apt install gcc make libtool libhwloc-dev libx11-dev \ libxt-dev libedit-dev libical-dev ncurses-dev perl \ 
postgresql-server-dev-all postgresql-contrib python3-dev tcl-dev tk-dev swig \ libexpat-dev libssl-dev libxext-dev libxft-dev autoconf \ automake g++ libcjson-dev For macOS systems using MacPorts you should run the following command as root: port install autoconf automake libtool pkgconfig \ expat hwloc libedit libical openssl postgresql14 python310 \ swig-python tcl tk xorg-libX11 xorg-libXt 2. Install the prerequisite packages for running PBS. In addition to the commands below, you should also install a text editor of your choosing (vim, emacs, gedit, etc.). For CentOS systems you should run the following command as root: yum install -y expat libedit postgresql-server postgresql-contrib python3 \ sendmail sudo tcl tk libical chkconfig cjson For openSUSE systems you should run the following command as root: zypper install expat libedit postgresql-server postgresql-contrib python3 \ sendmail sudo tcl tk libical1 libcjson1 For Debian (jessie) systems you should run the following command as root: apt-get install expat libedit2 postgresql python3 postgresql-contrib sendmail-bin \ sudo tcl tk libical1a For Debian (stretch) systems you should run the following command as root: apt-get install expat libedit2 postgresql python3 postgresql-contrib sendmail-bin \ sudo tcl tk libical2 For Debian (buster) systems you should run the following command as root: apt-get install expat libedit2 postgresql python3 postgresql-contrib sendmail-bin \ sudo tcl tk libical3 libcjson1 For Ubuntu-18.04 systems you should run the following command as root: apt install expat libedit2 postgresql python3 postgresql-contrib sendmail-bin \ sudo tcl tk libical3 postgresql-server-dev-all For Ubuntu-24.04 systems you should run the following command as root: apt install expat libedit2 postgresql python3 postgresql-contrib sendmail-bin \ sudo tcl tk libical3 postgresql-server-dev-all For macOS systems using MacPorts you should run the following command as root: port install expat libedit libical 
openssl postgresql14-server python310 \ tcl tk 3. Open a terminal as a normal (non-root) user, unpack the PBS tarball, and cd to the package directory. tar -xpvf openpbs-20.0.0.tar.gz cd openpbs-20.0.0 4. Generate the configure script and Makefiles. (See note 1 below) ./autogen.sh 5. Display the available build parameters. ./configure --help 6. Configure the build for your environment. You may utilize the parameters displayed in the previous step. (See note 2 below) For CentOS and Debian systems you should run the following command: ./configure --prefix=/opt/pbs For openSUSE systems (see note 3 below) you should run the following command: ./configure --prefix=/opt/pbs --libexecdir=/opt/pbs/libexec For macOS systems using MacPorts you should run the following commands: export CPATH=/opt/local/include/postgresql14:/opt/local/include export LIBRARY_PATH=/opt/local/lib/postgresql14:/opt/local/lib ./configure --with-swig=/opt/local --with-tcl=/opt/local If PTL needs to be installed along with PBS use the option "--enable-ptl" (see note 5 below) eg ./configure --prefix=/opt/pbs --enable-ptl 7. Build PBS by running "make". (See note 4 below) make 8. Configure sudo to allow your user account to run commands as root. Refer to the online manual pages for sudo, sudoers, and visudo. 9. Install PBS. Use sudo to run the command as root. sudo make install 10. Configure PBS by executing the post-install script. sudo /opt/pbs/libexec/pbs_postinstall 11. Edit /etc/pbs.conf to configure the PBS services that should be started. If you are installing PBS on only one system, you should change the value of PBS_START_MOM from zero to one. If you use vi as your editor, you would run: sudo vi /etc/pbs.conf 12. Some file permissions must be modified to add SUID privilege. sudo chmod 4755 /opt/pbs/sbin/pbs_iff /opt/pbs/sbin/pbs_rcp 13. Start the PBS services. sudo /etc/init.d/pbs start 14. All configured PBS services should now be running. 
Update your PATH and MANPATH variables by sourcing the appropriate PBS profile or logging out and back in. For Bourne shell (or similar) run the following: . /etc/profile.d/pbs.sh For C shell (or similar) run the following: source /etc/profile.d/pbs.csh 15. You should now be able to run PBS commands to submit and query jobs. Some examples follow. bash$ qstat -B Server Max Tot Que Run Hld Wat Trn Ext Status ---------------- ----- ----- ----- ----- ----- ----- ----- ----- ----------- host1 0 0 0 0 0 0 0 0 Active bash$ pbsnodes -a host1 Mom = host1 ntype = PBS state = free pcpus = 2 resources_available.arch = linux resources_available.host = host1 resources_available.mem = 2049248kb resources_available.ncpus = 2 resources_available.vnode = host1 resources_assigned.accelerator_memory = 0kb resources_assigned.mem = 0kb resources_assigned.naccelerators = 0 resources_assigned.ncpus = 0 resources_assigned.vmem = 0kb resv_enable = True sharing = default_shared license = l bash$ echo "sleep 60" | qsub 0.host1 bash$ qstat -a host1: Req'd Req'd Elap Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time --------------- -------- -------- ---------- ------ --- --- ------ ----- - ----- 0.host1 mike workq STDIN 2122 1 1 -- -- R 00:00 bash$ -------------------------------------------------------------------- NOTES: Note 1: If you modify configure.ac or adjust timestamps on any files that are automatically generated, you will need to regenerate them by re-running autogen.sh. Note 2: It is advisable to create a simple shell script that calls configure with the appropriate options for your environment. This ensures configure will be called with the same arguments during subsequent invocations. If you have already run configure you can regenerate all of the Makefiles by running "./config.status". The first few lines of config.status will reveal the options that were specified when configure was run. 
If you set environment variables such as CFLAGS it is best to do so as an argument to configure (e.g. ./configure CFLAGS="-O0 -g" --prefix=/opt/pbs). This will ensure consistency when config.status regenerates the Makefiles. Note 3: The openSUSE rpm package expands %_libexecdir to /opt/pbs/lib rather than /opt/pbs/libexec which causes problems for the post- install scripts. Providing the --libexecdir value to configure overrides this behavior. Note 4: You need to use a POSIX (or nearly POSIX) make. GNU make works quite well in this regard; BSD make does not. If you are having any sort of build problems, your make should be a prime suspect. Tremendous effort has been expended to provide proper dependency generation and makefiles without relying on any non-POSIX features. The build should work fine with a simple call to make, however, complicating things by using various make flags is not guaranteed to work. Don't be surprised if the first thing that make does is call configure again. Note 5: PTL gets installed in the parent directory of where PBS is installed. For example if you have given install prefix=/opt/pbs, then you can find PTL installation in the directory /opt/ptl . You may need to log out and log in from the terminal for PATH and PYTHONPATH to update. Using valgrind with PBS. ------------------------------------- Here is a set of steps to detect memory errors/leaks within PBS code. 1. Install the valgrind development package. yum install valgrind-devel (zypper for OpenSUSE). 2. Compile Python in a way that valgrind can work with it, as follows: ./configure --prefix= --without-pymalloc --with-pydebug --with-valgrind make; make install 3. Compile PBS with the special python and in debug mode as follows: ./configure --prefix= --with-python= CFLAGS="-g -DPy_DEBUG -DDEBUG -Wall -Werror" 4. Run pbs daemons under valgrind. 
a) To detect memory errors (not leaks) run pbs daemons as follows: export LD_LIBRARY_PATH=/opt/pbs/pgsql/lib:/opt/pbs/lib:$LD_LIBRARY_PATH valgrind --tool=memcheck --log-file=/tmp/val.out /opt/pbs/sbin/pbs_server.bin b) To detect memory leaks use the supplied leaks suppression file valgrind.supp, as follows: export LD_LIBRARY_PATH=/opt/pbs/pgsql/lib:/opt/pbs/lib:$LD_LIBRARY_PATH valgrind --tool=memcheck --log-file=/tmp/val.out --suppressions=./valgrind.supp --leak-check=full --track-origins=yes /opt/pbs/sbin/pbs_server.bin ================================================ FILE: LICENSE ================================================ ---------------------------------------------------- Open Source License for OpenPBS and PBS Professional ---------------------------------------------------- Copyright (C) 1994-2021 Altair Engineering, Inc. For more information, contact Altair at www.altair.com. This file is part of both the OpenPBS software ("OpenPBS") and the PBS Professional ("PBS Pro") software. Open Source License Information: OpenPBS is free software. You can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OpenPBS is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . Commercial License Information: PBS Pro is commercially licensed software that shares a common core with the OpenPBS software. For a copy of the commercial license terms and conditions, go to: (http://www.pbspro.com/agreement.html) or contact the Altair Legal Department. 
Altair's dual-license business model allows companies, individuals, and organizations to create proprietary derivative works of OpenPBS and distribute them - whether embedded or bundled with other software - under a commercial license agreement. Use of Altair's trademarks, including but not limited to "PBS™", "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is subject to Altair's trademark licensing policies. ============================================================================== GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU Affero General Public License is a free, copyleft license for software and other kinds of works, specifically designed to ensure cooperation with the community in the case of network server software. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, our General Public Licenses are intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. Developers that use our General Public Licenses protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License which gives you legal permission to copy, distribute and/or modify the software. 
A secondary benefit of defending all users' freedom is that improvements made in alternate versions of the program, if they receive widespread use, become available for other developers to incorporate. Many developers of free software are heartened and encouraged by the resulting cooperation. However, in the case of software used on network servers, this result may fail to come about. The GNU General Public License permits making a modified version and letting the public access it on a server without ever releasing its source code to the public. The GNU Affero General Public License is designed specifically to ensure that, in such cases, the modified source code becomes available to the community. It requires the operator of a network server to provide the source code of the modified version running there to the users of that server. Therefore, public use of a modified version, on a publicly accessible server, gives the public access to the source code of the modified version. An older license, called the Affero General Public License and published by Affero, was designed to accomplish similar goals. This is a different license, not a version of the Affero GPL, but Affero has released a new version of the Affero GPL which permits relicensing under this license. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU Affero General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. 
The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. 
The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. 
The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. 
You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. 
A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. 
d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. 
A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. 
Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Remote Network Interaction; Use with the GNU General Public License. Notwithstanding any other provision of this License, if you modify the Program, your modified version must prominently offer all users interacting with it remotely through a computer network (if your version supports such interaction) an opportunity to receive the Corresponding Source of your version by providing access to the Corresponding Source from a network server at no charge, through some standard or customary means of facilitating copying of software. This Corresponding Source shall include the Corresponding Source for any work covered by version 3 of the GNU General Public License that is incorporated pursuant to the following paragraph. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the work with which it is combined will remain governed by version 3 of the GNU General Public License. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU Affero General Public License from time to time. 
Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU Affero General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU Affero General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU Affero General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If your software can interact with users remotely through a computer network, you should also make sure that it provides a way for users to get its source. For example, if your program is a web application, its interface could display a "Source" link that leads users to an archive of the code. There are many ways you could offer source, and different solutions will be better for different programs; see section 13 for the specific requirements. You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU AGPL, see <https://www.gnu.org/licenses/>. ============================================================================== -------------------------------- Third Party Software Information -------------------------------- PBS Pro includes code created by other parties which is provided under the open source software license agreements chosen by the authors. All unmodified files from these and other sources retain their original copyright and license notices. _ _ _ _ _ _ src/scheduler/sort.c Copyright (c) 1992, 1993. Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. All advertising materials mentioning features or use of this software must display the following acknowledgement: This product includes software developed by the University of California, Berkeley and its contributors. 4. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. _ _ _ _ _ _ src/lib/Libwin/rcmd.c Copyright (c) 1983 Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms are permitted provided that the above copyright notice and this paragraph are duplicated in all such forms and that any documentation, advertising materials, and other materials related to such distribution and use acknowledge that the software was developed by the University of California, Berkeley. The name of the University may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. _ _ _ _ _ _ src/resmom/popen.c Copyright (c) 1988, 1993 The Regents of the University of California. All rights reserved. This code is derived from software written by Ken Arnold and published in UNIX Review, Vol. 6, No. 8. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. All advertising materials mentioning features or use of this software must display the following acknowledgement: This product includes software developed by the University of California, Berkeley and its contributors. 4. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. _ _ _ _ _ _ src/lib/Libutil/avltree.c Copyright (c) 2000 Gregory Tseytin All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer as the first lines of this file unmodified. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY Gregory Tseytin ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Gregory Tseytin BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. _ _ _ _ _ _ buildutils/depcomp buildutils/compile Copyright (C) 1999-2013 Free Software Foundation, Inc. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. As a special exception to the GNU General Public License, if you distribute this file as part of a program that contains a configuration script generated by Autoconf, you may include it under the same distribution terms that you use for the rest of that program. _ _ _ _ _ _ buildutils/install-sh Copyright (C) 1994 X Consortium Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
Except as contained in this notice, the name of the X Consortium shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization from the X Consortium. _ _ _ _ _ _ buildutils/ltmain.sh m4/libtool.m4 Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. Written by Gordon Matzigkeit, 1996 This file is part of GNU Libtool. GNU Libtool is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. As a special exception to the GNU General Public License, if you distribute this file as part of a program or library that is built using GNU Libtool, you may include this file under the same distribution terms that you use for the rest of that program. GNU Libtool is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GNU Libtool; see the file COPYING. If not, a copy can be downloaded from http://www.gnu.org/licenses/gpl.html, or obtained by writing to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. _ _ _ _ _ _ m4/lt~obsolete.m4 Copyright (C) 2004, 2005, 2007, 2009 Free Software Foundation, Inc. Written by Scott James Remnant, 2004. (see license below) m4/ltoptions.m4 Copyright (C) 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc. Written by Gary V. Vaughan, 2004 (see license below) m4/ltsugar.m4 Copyright (C) 2004, 2005, 2007, 2008 Free Software Foundation, Inc. Written by Gary V. 
Vaughan, 2004 (see license below) m4/ltversion.m4 Copyright (C) 2004 Free Software Foundation, Inc. Written by Scott James Remnant, 2004. (see license below) m4 GNU license: License: GPL-2+ or configure-same-as-package This file is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. The full text of the GNU General Public License version 2 is available on Debian systems in /usr/share/common-licenses/GPL-2. As a special exception to the GNU General Public License, if you distribute this file as part of a program that contains a configuration script generated by Autoconf, you may include it under the same distribution terms that you use for the rest of that program. _ _ _ _ _ _ buildutils/makedepend-sh Copyright (c) 1996, 1998 The NetBSD Foundation, Inc. All rights reserved. This code is derived from software contributed to The NetBSD Foundation by Lonhyn T. Jasinskyj. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. All advertising materials mentioning features or use of this software must display the following acknowledgement: This product includes software developed by the NetBSD Foundation, Inc. and its contributors. 4. 
Neither the name of The NetBSD Foundation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: Makefile.am ================================================ # # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
# # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. # ACLOCAL_AMFLAGS = -I m4 SUBDIRS = buildutils src doc test EXTRA_DIST = \ COPYRIGHT \ INSTALL \ LICENSE \ README.md \ autogen.sh \ openpbs-rpmlintrc \ openpbs.spec ================================================ FILE: PBS_License.txt ================================================ Copyright (C) 1994-2021 Altair Engineering, Inc. For more information, contact Altair at www.altair.com. This file is part of both the OpenPBS software ("OpenPBS") and the PBS Professional ("PBS Pro") software. Open Source License Information: OpenPBS is free software. You can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OpenPBS is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . Commercial License Information: PBS Pro is commercially licensed software that shares a common core with the OpenPBS software. 
For a copy of the commercial license terms and conditions, go to: (http://www.pbspro.com/agreement.html) or contact the Altair Legal Department. Altair's dual-license business model allows companies, individuals, and organizations to create proprietary derivative works of OpenPBS and distribute them - whether embedded or bundled with other software - under a commercial license agreement. Use of Altair's trademarks, including but not limited to "PBS™", "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is subject to Altair's trademark licensing policies. ================================================ FILE: README.md ================================================ ### OpenPBS Open Source Project If you are new to this project, please start at https://www.openpbs.org/ Note: In May 2020, OpenPBS became the new name for the PBS Professional Open Source Project. (PBS Professional will be used to refer to the commercial version; OpenPBS to the Open Source version -- same code, easier naming.) As there are many parts to the project, it will take several weeks to change the name in all places, so you will continue to see references to PBS Pro -- stay tuned. ### What is OpenPBS? OpenPBS® software optimizes job scheduling and workload management in high-performance computing (HPC) environments – clusters, clouds, and supercomputers – improving system efficiency and people’s productivity. Built by HPC people for HPC people, OpenPBS is fast, scalable, secure, and resilient, and supports all modern infrastructure, middleware, and applications. 
* **Scalability:** supports millions of cores with fast job dispatch and minimal latency; tested beyond 50,000 nodes * **Policy-Driven Scheduling:** meets unique site goals and SLAs by balancing job turnaround time and utilization with optimal job placement * **Resiliency:** includes automatic fail-over architecture with no single point of failure – jobs are never lost, and jobs continue to run despite failures * **Flexible Plugin Framework:** simplifies administration with enhanced visibility and extensibility; customize implementations to meet complex requirements * **Health Checks:** monitors and automatically mitigates faults with a comprehensive health check framework * **Voted #1 HPC Software** by HPC Wire readers and proven for over 20 years at thousands of sites around the globe in both the private sector and public sector ### Community and Ways to Participate OpenPBS is a community effort and there are a variety of ways to engage, from helping answer questions to benchmarking to developing new capabilities and tests. We value being aggressively open and inclusive, but also aggressively respectful and professional. See our [Code of Conduct](https://openpbs.atlassian.net/wiki/display/PBSPro/Code+of+Conduct). The best place to start is by joining the community forum. 
You may sign up or view the archives via: * [Announcements](http://community.openpbs.org/c/announcements) -- important updates relevant to the entire PBS Pro community * [Users/Site Admins](http://community.openpbs.org/c/users-site-administrators) -- general questions and discussions among end users (system admins, engineers, scientists) * [Developers](http://community.openpbs.org/c/developers) -- technical discussions among developers To dive in deeper and learn more about the project and what the community is up to, visit: * [Contributor’s portal](https://openpbs.atlassian.net/wiki) -- includes roadmaps, processes, how to articles, coding standards, release notes, etc (Uses Confluence) * [Source code](https://github.com/OpenPBS/openpbs) -- includes full source code and test framework (Uses Github) * [Issue tracking system](https://github.com/OpenPBS/openpbs/issues) -- includes bugs and feature requests and status (Uses Github). Previously, we used [JIRA](https://pbspro.atlassian.net), which contains older issues. OpenPBS is also integrated in the OpenHPC software stack. The mission of OpenHPC is to provide an integrated collection of HPC-centric components to provide full-featured HPC software stacks. OpenHPC is a Linux Foundation Collaborative Project. Learn more at: * [OpenHPC.community](http://openhpc.community) * [The Linux Foundation](http://thelinuxfoundation.org) ### Our Vision: One Scheduler for the whole HPC World There is a huge opportunity to advance the state of the art in HPC scheduling by bringing the whole HPC world together, marrying public sector innovations with private sector enterprise know-how, and retargeting the effort wasted re-implementing the same old capabilities again and again towards pushing the outside of the envelope. At the heart of this vision is fostering common standards (at least defacto standards like common software). 
To this end, Altair has made a big investment by releasing the PBS Professional technology as OpenPBS (under an Open Source license to meet the needs of the public sector), while also continuing to offer PBS Professional (under a commercial license to meet the needs of the private sector). One defacto standard that can work for the whole HPC community. ### Current Build status [![Build Status](https://travis-ci.com/OpenPBS/openpbs.svg?branch=master)](https://travis-ci.com/OpenPBS/openpbs) ================================================ FILE: autogen.sh ================================================ #!/bin/sh # # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. 
# # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. if test -d ./src/resmom; then echo "Generating configure script and Makefile templates." exec autoreconf --force --install -I m4 $* else echo "Execute `basename $0` from the top level distribution directory." fi ================================================ FILE: azure-pipelines.yml ================================================ trigger: branches: include: - master - release_* pr: branches: include: - master - release_* pool: vmImage: 'ubuntu-latest' # Changed from ubuntu-20.04 variables: - name: DOCKER_BUILDKIT value: 1 jobs: - job: runcheck displayName: 'Code Quality Checks' pool: vmImage: 'ubuntu-latest' steps: - checkout: self displayName: 'Checkout code' - bash: | sudo apt-get update sudo apt-get install -y python3-pip sudo pip3 install --upgrade pip sudo pip3 install pycodestyle pep8 flake8 clang-format # Create pep8 symlink if it doesn't exist (for compatibility) if ! command -v pep8 &> /dev/null; then sudo ln -sf $(which pycodestyle) /usr/local/bin/pep8 fi # Verify installations echo "Checking installed tools:" python3 --version pip3 --version pep8 --version || pycodestyle --version clang-format --version # Check if runchecks script exists if [ -f ".github/runchecks" ]; then chmod +x .github/runchecks ./.github/runchecks else echo "Warning: .github/runchecks script not found" # Run basic checks if script is missing echo "Running basic Python style checks..." find . 
-name "*.py" -exec pep8 {} \; || true fi displayName: 'Run code quality checks' - job: ubuntu_2004_build displayName: 'Ubuntu 20.04' dependsOn: runcheck pool: vmImage: 'ubuntu-latest' variables: OS_TYPE: "ubuntu:20.04" PKG_INSTALL_CMD: "apt-get -y update && apt-get -y upgrade && apt-get install -y python3 build-essential" DOCKER_EXTRA_ARG: "-e DEBIAN_FRONTEND=noninteractive -e LANGUAGE=C.UTF-8 -e LANG=C.UTF-8 -e LC_ALL=C.UTF-8" CI_CMD: "./ci --local" CONTAINER_NAME: "ubuntu2004-$(Build.BuildId)" steps: - checkout: self displayName: 'Checkout code' - script: | echo "Starting build for Ubuntu 20.04" echo "OS Type: $(OS_TYPE)" echo "Package Install: $(PKG_INSTALL_CMD)" echo "Docker Args: $(DOCKER_EXTRA_ARG)" echo "CI Command: $(CI_CMD)" echo "Container Name: $(CONTAINER_NAME)" displayName: 'Display build configuration' - script: | # Pull the Docker image docker pull $(OS_TYPE) # Start the container with proper init to handle zombie processes docker run -d \ $(DOCKER_EXTRA_ARG) \ -h pbs.dev.local \ --name $(CONTAINER_NAME) \ -v $(pwd):$(pwd) \ --privileged \ --init \ -w $(pwd) \ $(OS_TYPE) \ /bin/bash -c "sleep 3600" # Verify container is running docker ps | grep $(CONTAINER_NAME) displayName: 'Start Docker container' - script: | # Install packages docker exec $(CONTAINER_NAME) bash -c "$(PKG_INSTALL_CMD)" # Install additional tools for process management docker exec $(CONTAINER_NAME) bash -c "apt-get install -y procps psmisc" # Verify Python installation docker exec $(CONTAINER_NAME) python3 --version displayName: 'Install dependencies' - script: | # Monitor processes before running CI echo "=== Process monitoring before CI ===" docker exec $(CONTAINER_NAME) bash -c " echo 'Current processes:' ps aux | head -20 echo '' echo 'Checking for zombie/defunct processes:' ps aux | grep -E 'defunct|' || echo 'No zombie processes found' echo '' echo 'PBS-related processes:' ps aux | grep -E 'pbs_|openpbs' || echo 'No PBS processes found' " displayName: 'Monitor processes before 
CI' - script: | # Check if ci directory and script exist docker exec $(CONTAINER_NAME) bash -c "ls -la" docker exec $(CONTAINER_NAME) bash -c "if [ -d 'ci' ]; then ls -la ci/; else echo 'ci directory not found'; fi" # Run CI script if it exists if docker exec $(CONTAINER_NAME) bash -c "[ -f 'ci/ci' ] || [ -f './ci' ]"; then docker exec --privileged $(CONTAINER_NAME) bash -c "cd ci && $(CI_CMD)" else echo "CI script not found, running basic build test" docker exec $(CONTAINER_NAME) bash -c "python3 -c 'print(\"Python test successful\")'" fi # Check for any PBS processes and stop them properly echo "Checking for PBS processes..." docker exec $(CONTAINER_NAME) bash -c "ps aux | grep -E 'pbs_|openpbs' || echo 'No PBS processes found'" # Stop PBS services properly if they're running docker exec $(CONTAINER_NAME) bash -c " if command -v pbs_server &> /dev/null; then echo 'Stopping PBS services...' pkill -TERM pbs_server || true pkill -TERM pbs_sched || true pkill -TERM pbs_mom || true pkill -TERM pbs_ds_monitor || true sleep 2 # Force kill if still running pkill -KILL pbs_server || true pkill -KILL pbs_sched || true pkill -KILL pbs_mom || true pkill -KILL pbs_ds_monitor || true fi " || true displayName: 'Run CI tests' - script: | # Proper PBS cleanup and container shutdown echo "Cleaning up PBS processes and container..." # Stop PBS services gracefully first docker exec $(CONTAINER_NAME) bash -c " echo 'Stopping PBS services gracefully...' 
if command -v qterm &> /dev/null; then qterm -t quick || true fi # Stop individual PBS components pkill -TERM pbs_server || true pkill -TERM pbs_sched || true pkill -TERM pbs_mom || true pkill -TERM pbs_ds_monitor || true # Wait a bit for graceful shutdown sleep 3 # Force kill any remaining PBS processes pkill -KILL pbs_server || true pkill -KILL pbs_sched || true pkill -KILL pbs_mom || true pkill -KILL pbs_ds_monitor || true # Clean up any remaining zombie processes ps aux | grep -E 'defunct|' || echo 'No zombie processes found' " || true # Stop and remove container docker stop $(CONTAINER_NAME) || true docker rm $(CONTAINER_NAME) || true displayName: 'Cleanup Docker container' condition: always() - job: ubuntu_2404_build displayName: 'Ubuntu 24.04' dependsOn: runcheck pool: vmImage: 'ubuntu-latest' variables: OS_TYPE: "ubuntu:24.04" PKG_INSTALL_CMD: "apt-get -y update && apt-get -y upgrade && apt-get install -y python3 build-essential" DOCKER_EXTRA_ARG: "-e DEBIAN_FRONTEND=noninteractive -e LANGUAGE=C.UTF-8 -e LANG=C.UTF-8 -e LC_ALL=C.UTF-8" CI_CMD: "./ci --local" CONTAINER_NAME: "ubuntu2404-$(Build.BuildId)" steps: - checkout: self displayName: 'Checkout code' - script: | echo "Starting build for Ubuntu 24.04" echo "OS Type: $(OS_TYPE)" echo "Package Install: $(PKG_INSTALL_CMD)" echo "Docker Args: $(DOCKER_EXTRA_ARG)" echo "CI Command: $(CI_CMD)" echo "Container Name: $(CONTAINER_NAME)" displayName: 'Display build configuration' - script: | # Pull the Docker image docker pull $(OS_TYPE) # Start the container with proper init to handle zombie processes docker run -d \ $(DOCKER_EXTRA_ARG) \ -h pbs.dev.local \ --name $(CONTAINER_NAME) \ -v $(pwd):$(pwd) \ --privileged \ --init \ -w $(pwd) \ $(OS_TYPE) \ /bin/bash -c "sleep 3600" # Verify container is running docker ps | grep $(CONTAINER_NAME) displayName: 'Start Docker container' - script: | # Install packages docker exec $(CONTAINER_NAME) bash -c "$(PKG_INSTALL_CMD)" # Install additional tools for process 
management docker exec $(CONTAINER_NAME) bash -c "apt-get install -y procps psmisc" # Verify Python installation docker exec $(CONTAINER_NAME) python3 --version displayName: 'Install dependencies' - script: | # Monitor processes before running CI echo "=== Process monitoring before CI ===" docker exec $(CONTAINER_NAME) bash -c " echo 'Current processes:' ps aux | head -20 echo '' echo 'Checking for zombie/defunct processes:' ps aux | grep -E 'defunct|' || echo 'No zombie processes found' echo '' echo 'PBS-related processes:' ps aux | grep -E 'pbs_|openpbs' || echo 'No PBS processes found' " displayName: 'Monitor processes before CI' - script: | # Check if ci directory and script exist docker exec $(CONTAINER_NAME) bash -c "ls -la" docker exec $(CONTAINER_NAME) bash -c "if [ -d 'ci' ]; then ls -la ci/; else echo 'ci directory not found'; fi" # Run CI script if it exists if docker exec $(CONTAINER_NAME) bash -c "[ -f 'ci/ci' ] || [ -f './ci' ]"; then docker exec --privileged $(CONTAINER_NAME) bash -c "cd ci && $(CI_CMD)" else echo "CI script not found, running basic build test" docker exec $(CONTAINER_NAME) bash -c "python3 -c 'print(\"Python test successful\")'" fi # Check for any PBS processes and stop them properly echo "Checking for PBS processes..." docker exec $(CONTAINER_NAME) bash -c "ps aux | grep -E 'pbs_|openpbs' || echo 'No PBS processes found'" # Stop PBS services properly if they're running docker exec $(CONTAINER_NAME) bash -c " if command -v pbs_server &> /dev/null; then echo 'Stopping PBS services...' pkill -TERM pbs_server || true pkill -TERM pbs_sched || true pkill -TERM pbs_mom || true pkill -TERM pbs_ds_monitor || true sleep 2 # Force kill if still running pkill -KILL pbs_server || true pkill -KILL pbs_sched || true pkill -KILL pbs_mom || true pkill -KILL pbs_ds_monitor || true fi " || true displayName: 'Run CI tests' - script: | # Proper PBS cleanup and container shutdown echo "Cleaning up PBS processes and container..." 
# Stop PBS services gracefully first docker exec $(CONTAINER_NAME) bash -c " echo 'Stopping PBS services gracefully...' if command -v qterm &> /dev/null; then qterm -t quick || true fi # Stop individual PBS components pkill -TERM pbs_server || true pkill -TERM pbs_sched || true pkill -TERM pbs_mom || true pkill -TERM pbs_ds_monitor || true # Wait a bit for graceful shutdown sleep 3 # Force kill any remaining PBS processes pkill -KILL pbs_server || true pkill -KILL pbs_sched || true pkill -KILL pbs_mom || true pkill -KILL pbs_ds_monitor || true # Clean up any remaining zombie processes ps aux | grep -E 'defunct|' || echo 'No zombie processes found' " || true # Stop and remove container docker stop $(CONTAINER_NAME) || true docker rm $(CONTAINER_NAME) || true displayName: 'Cleanup Docker container' condition: always() - job: rocky_sanitize_build displayName: 'Rocky Linux 9 Sanitize' dependsOn: runcheck pool: vmImage: 'ubuntu-latest' variables: OS_TYPE: "rockylinux/rockylinux:9.2" PKG_INSTALL_CMD: "yum -y update && yum -y install python3 gcc gcc-c++ make" DOCKER_EXTRA_ARG: "-e BUILD_MODE=sanitize" CI_CMD: "./ci --local=sanitize" CONTAINER_NAME: "rocky-sanitize-$(Build.BuildId)" steps: - checkout: self displayName: 'Checkout code' - script: | echo "Starting build for Rocky Linux 9 Sanitize" echo "OS Type: $(OS_TYPE)" echo "Package Install: $(PKG_INSTALL_CMD)" echo "Docker Args: $(DOCKER_EXTRA_ARG)" echo "CI Command: $(CI_CMD)" echo "Container Name: $(CONTAINER_NAME)" displayName: 'Display build configuration' - script: | # Pull the Docker image docker pull $(OS_TYPE) # Start the container with proper init to handle zombie processes docker run -d \ $(DOCKER_EXTRA_ARG) \ -h pbs.dev.local \ --name $(CONTAINER_NAME) \ -v $(pwd):$(pwd) \ --privileged \ --init \ -w $(pwd) \ $(OS_TYPE) \ /bin/bash -c "sleep 3600" # Verify container is running docker ps | grep $(CONTAINER_NAME) displayName: 'Start Docker container' - script: | # Install packages docker exec $(CONTAINER_NAME) 
bash -c "$(PKG_INSTALL_CMD)" # Install additional tools for process management docker exec $(CONTAINER_NAME) bash -c "yum install -y procps-ng psmisc || dnf install -y procps-ng psmisc" # Verify Python installation docker exec $(CONTAINER_NAME) python3 --version displayName: 'Install dependencies' - script: | # Monitor processes before running CI echo "=== Process monitoring before CI ===" docker exec $(CONTAINER_NAME) bash -c " echo 'Current processes:' ps aux | head -20 echo '' echo 'Checking for zombie/defunct processes:' ps aux | grep -E 'defunct|' || echo 'No zombie processes found' echo '' echo 'PBS-related processes:' ps aux | grep -E 'pbs_|openpbs' || echo 'No PBS processes found' " displayName: 'Monitor processes before CI' - script: | # Check if ci directory and script exist docker exec $(CONTAINER_NAME) bash -c "ls -la" docker exec $(CONTAINER_NAME) bash -c "if [ -d 'ci' ]; then ls -la ci/; else echo 'ci directory not found'; fi" # Run CI script if it exists if docker exec $(CONTAINER_NAME) bash -c "[ -f 'ci/ci' ] || [ -f './ci' ]"; then docker exec --privileged $(CONTAINER_NAME) bash -c "cd ci && $(CI_CMD)" else echo "CI script not found, running basic build test" docker exec $(CONTAINER_NAME) bash -c "python3 -c 'print(\"Python test successful\")'" fi # Check for any PBS processes and stop them properly echo "Checking for PBS processes..." docker exec $(CONTAINER_NAME) bash -c "ps aux | grep -E 'pbs_|openpbs' || echo 'No PBS processes found'" # Stop PBS services properly if they're running docker exec $(CONTAINER_NAME) bash -c " if command -v pbs_server &> /dev/null; then echo 'Stopping PBS services...' 
pkill -TERM pbs_server || true pkill -TERM pbs_sched || true pkill -TERM pbs_mom || true pkill -TERM pbs_ds_monitor || true sleep 2 # Force kill if still running pkill -KILL pbs_server || true pkill -KILL pbs_sched || true pkill -KILL pbs_mom || true pkill -KILL pbs_ds_monitor || true fi " || true displayName: 'Run CI tests' - script: | # Proper PBS cleanup and container shutdown echo "Cleaning up PBS processes and container..." # Stop PBS services gracefully first docker exec $(CONTAINER_NAME) bash -c " echo 'Stopping PBS services gracefully...' if command -v qterm &> /dev/null; then qterm -t quick || true fi # Stop individual PBS components pkill -TERM pbs_server || true pkill -TERM pbs_sched || true pkill -TERM pbs_mom || true pkill -TERM pbs_ds_monitor || true # Wait a bit for graceful shutdown sleep 3 # Force kill any remaining PBS processes pkill -KILL pbs_server || true pkill -KILL pbs_sched || true pkill -KILL pbs_mom || true pkill -KILL pbs_ds_monitor || true # Clean up any remaining zombie processes ps aux | grep -E 'defunct|' || echo 'No zombie processes found' " || true # Stop and remove container docker stop $(CONTAINER_NAME) || true docker rm $(CONTAINER_NAME) || true displayName: 'Cleanup Docker container' condition: always() - job: rocky_kerberos_build displayName: 'Rocky Linux 9 Kerberos' dependsOn: runcheck pool: vmImage: 'ubuntu-latest' variables: OS_TYPE: "rockylinux/rockylinux:9.2" PKG_INSTALL_CMD: "yum -y update && yum -y install python3 gcc gcc-c++ make" DOCKER_EXTRA_ARG: "-e BUILD_MODE=kerberos" CI_CMD: "./ci --local" CONTAINER_NAME: "rocky-kerberos-$(Build.BuildId)" steps: - checkout: self displayName: 'Checkout code' - script: | echo "Starting build for Rocky Linux 9 Kerberos" echo "OS Type: $(OS_TYPE)" echo "Package Install: $(PKG_INSTALL_CMD)" echo "Docker Args: $(DOCKER_EXTRA_ARG)" echo "CI Command: $(CI_CMD)" echo "Container Name: $(CONTAINER_NAME)" displayName: 'Display build configuration' - script: | # Pull the Docker image docker 
pull $(OS_TYPE) # Start the container with proper init to handle zombie processes docker run -d \ $(DOCKER_EXTRA_ARG) \ -h pbs.dev.local \ --name $(CONTAINER_NAME) \ -v $(pwd):$(pwd) \ --privileged \ --init \ -w $(pwd) \ $(OS_TYPE) \ /bin/bash -c "sleep 3600" # Verify container is running docker ps | grep $(CONTAINER_NAME) displayName: 'Start Docker container' - script: | # Install packages docker exec $(CONTAINER_NAME) bash -c "$(PKG_INSTALL_CMD)" # Install additional tools for process management docker exec $(CONTAINER_NAME) bash -c "yum install -y procps-ng psmisc || dnf install -y procps-ng psmisc" # Verify Python installation docker exec $(CONTAINER_NAME) python3 --version displayName: 'Install dependencies' - script: | # Monitor processes before running CI echo "=== Process monitoring before CI ===" docker exec $(CONTAINER_NAME) bash -c " echo 'Current processes:' ps aux | head -20 echo '' echo 'Checking for zombie/defunct processes:' ps aux | grep -E 'defunct|' || echo 'No zombie processes found' echo '' echo 'PBS-related processes:' ps aux | grep -E 'pbs_|openpbs' || echo 'No PBS processes found' " displayName: 'Monitor processes before CI' - script: | # Check if ci directory and script exist docker exec $(CONTAINER_NAME) bash -c "ls -la" docker exec $(CONTAINER_NAME) bash -c "if [ -d 'ci' ]; then ls -la ci/; else echo 'ci directory not found'; fi" # Run CI script if it exists if docker exec $(CONTAINER_NAME) bash -c "[ -f 'ci/ci' ] || [ -f './ci' ]"; then docker exec --privileged $(CONTAINER_NAME) bash -c "cd ci && $(CI_CMD)" else echo "CI script not found, running basic build test" docker exec $(CONTAINER_NAME) bash -c "python3 -c 'print(\"Python test successful\")'" fi # Check for any PBS processes and stop them properly echo "Checking for PBS processes..." 
docker exec $(CONTAINER_NAME) bash -c "ps aux | grep -E 'pbs_|openpbs' || echo 'No PBS processes found'" # Stop PBS services properly if they're running docker exec $(CONTAINER_NAME) bash -c " if command -v pbs_server &> /dev/null; then echo 'Stopping PBS services...' pkill -TERM pbs_server || true pkill -TERM pbs_sched || true pkill -TERM pbs_mom || true pkill -TERM pbs_ds_monitor || true sleep 2 # Force kill if still running pkill -KILL pbs_server || true pkill -KILL pbs_sched || true pkill -KILL pbs_mom || true pkill -KILL pbs_ds_monitor || true fi " || true displayName: 'Run CI tests' - script: | # Proper PBS cleanup and container shutdown echo "Cleaning up PBS processes and container..." # Stop PBS services gracefully first docker exec $(CONTAINER_NAME) bash -c " echo 'Stopping PBS services gracefully...' if command -v qterm &> /dev/null; then qterm -t quick || true fi # Stop individual PBS components pkill -TERM pbs_server || true pkill -TERM pbs_sched || true pkill -TERM pbs_mom || true pkill -TERM pbs_ds_monitor || true # Wait a bit for graceful shutdown sleep 3 # Force kill any remaining PBS processes pkill -KILL pbs_server || true pkill -KILL pbs_sched || true pkill -KILL pbs_mom || true pkill -KILL pbs_ds_monitor || true # Clean up any remaining zombie processes ps aux | grep -E 'defunct|' || echo 'No zombie processes found' " || true # Stop and remove container docker stop $(CONTAINER_NAME) || true docker rm $(CONTAINER_NAME) || true displayName: 'Cleanup Docker container' condition: always() ================================================ FILE: buildutils/Makefile.am ================================================ # # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. 
You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. # EXTRA_DIST = \ attr_parser.py ================================================ FILE: buildutils/attr_parser.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. 
You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. 
""" attr_parser.py will parse xml files also called master attribute files containing all the members of both server and ecl files,and will generate two corresponding files one for server and one for ecl """ import getopt import os import pdb import re import string import sys import enum import xml.dom.minidom import xml.parsers.expat list_ecl = [] list_svr = [] list_defs = [] global attr_type global newattr class PropType(enum.Enum): ''' BOTH - Write information for this tag to all the output files SERVER - Write information for this tag to the SERVER file only ECL - Write information for this tag to the ECL file only ''' BOTH = 0 SERVER = 1 ECL = 2 class switch(object): """ This class provides the functionality which is equivalent to switch/case statements in C. It only needs to be defined once. """ def __init__(self, value): self.value = value self.fall = False def __iter__(self): """Return the match method once, then stop""" yield self.match def match(self, *args): """Indicate whether or not to enter a case suite""" if self.fall or not args: return True elif self.value in args: # changed for v1.5, see below self.fall = True return True else: return False def fileappend(prop_type, line): ''' Selects files to append line to dependig on prop_type prop_type - BOTH, SERVER, ECL line - The string line to append to the file(s) ''' global attr_type if prop_type == PropType.SERVER: if attr_type == PropType.SERVER or attr_type == PropType.BOTH: list_svr.append(line) elif prop_type == PropType.ECL: if attr_type == PropType.ECL or attr_type == PropType.BOTH: list_ecl.append(line) elif prop_type == PropType.BOTH: if attr_type == PropType.SERVER or attr_type == PropType.BOTH: list_svr.append(line) if attr_type == PropType.ECL or attr_type == PropType.BOTH: list_ecl.append(line) return None def getText(svr_file, ecl_file, defines_file): ''' getText function - (writes the data stored in lists to file) svr_file - the server side output file ecl_file - the output file to be 
used by the ECL layer defines_file - the output file containing the macro definitions for the index positions ''' buff = "".join(list_svr) for line in buff: svr_file.write(line) buff = "".join(list_ecl) for line in buff: ecl_file.write(line) buff = "".join(list_defs) for line in buff: defines_file.write(line) def do_head(node): ''' Processes the head element of the node passed ''' alist = node.getElementsByTagName('head') for a in alist: list_svr.append("/*Disclaimer: This is a machine generated file.*/" + '\n') list_svr.append("/*For modifying any attribute change corresponding " "XML file */" + '\n') list_ecl.append("/*Disclaimer: This is a machine generated file.*/" + '\n') list_ecl.append("/*For modifying any attribute change corresponding " "XML file */" + '\n') blist = a.getElementsByTagName('SVR') blist_ecl = a.getElementsByTagName('ECL') for s in blist: text1 = s.childNodes[0].nodeValue text1 = text1.strip(' \t') list_svr.append(text1) for e in blist_ecl: text2 = e.childNodes[0].nodeValue text2 = text2.strip(' \t') list_ecl.append(text2) def do_index(attr): ''' Processes the member_index attribute attr ''' li = None li = attr.getElementsByTagName('member_index') if li: for v in li: buf = v.childNodes[0].nodeValue list_defs.append("\n\t" + buf + ",") def do_member(attr, p_flag, tag_name): ''' Processes the member identified by tage_name attr - the attribute definition node p_flag - property flag - SVR, ECL, BOTH tag_name - the tag_name string to process ''' global newattr buf = None comma = ',' if newattr: comma = '' newattr = False li = attr.getElementsByTagName(tag_name) if li: svr = li[0].getElementsByTagName('SVR') if svr: value = svr for v in value: buf = v.childNodes[0].nodeValue fileappend(PropType.SERVER, comma + '\n' + '\t' + '\t' + buf) ecl = li[0].getElementsByTagName('ECL') if ecl: value = ecl for v in value: buf = v.childNodes[0].nodeValue fileappend(PropType.ECL, comma + '\n' + '\t' + '\t' + buf) value = li for v in value: buf = 
v.childNodes[0].nodeValue if buf: s = buf.strip('\n \t') if s: fileappend(p_flag, comma + '\n' + '\t' + '\t' + buf) def process(master_file, svr_file, ecl_file, defines_file): ''' process the master xml file and produce the outputs files as requested master_file - the Master XML files to process svr_file - the server side output file ecl_file - the output file to be used by the ECL layer defines_file - the output file containing the macro definitions for the index positions ''' from xml.dom import minidom global attr_type global newattr newattr = False doc = minidom.parse(master_file) nodes = doc.getElementsByTagName('data') for node in nodes: do_head(node) at_list = node.getElementsByTagName('attributes') for attr in at_list: attr_type = PropType.BOTH newattr = True flag_name = attr.getAttribute('flag') if flag_name == 'SVR': attr_type = PropType.SERVER if flag_name == 'ECL': attr_type = PropType.ECL inc_name = attr.getAttribute('include') if inc_name: fileappend(PropType.SERVER, '\n' + inc_name) mem_list = attr.childNodes[0].nodeValue mem_list = mem_list.strip(' \t') fileappend(PropType.BOTH, mem_list) macro_name = attr.getAttribute('macro') if macro_name: fileappend(PropType.BOTH, '\n' + macro_name + "\n") do_index(attr) fileappend(PropType.BOTH, '\t{') do_member(attr, PropType.BOTH, 'member_name') do_member(attr, PropType.SERVER, 'member_at_decode') do_member(attr, PropType.SERVER, 'member_at_encode') do_member(attr, PropType.SERVER, 'member_at_set') do_member(attr, PropType.SERVER, 'member_at_comp') do_member(attr, PropType.SERVER, 'member_at_free') do_member(attr, PropType.SERVER, 'member_at_action') do_member(attr, PropType.BOTH, 'member_at_flags') do_member(attr, PropType.BOTH, 'member_at_type') do_member(attr, PropType.SERVER, 'member_at_parent') do_member(attr, PropType.ECL, 'member_verify_function') do_member(attr, PropType.SERVER, 'member_at_entlim') do_member(attr, PropType.SERVER, 'member_at_struct') fileappend(PropType.BOTH, '\n\t}') 
fileappend(PropType.BOTH, ",") if macro_name: fileappend(PropType.BOTH, '\n#else') fileappend(PropType.BOTH, '\n\t{\n\t\t"noop"\n\t},') fileappend(PropType.BOTH, '\n#endif') tail_list = node.getElementsByTagName('tail') for t in tail_list: tail_value = t.childNodes[0].nodeValue if tail_value is None: pass fileappend(PropType.BOTH, '\n') tail_both = t.getElementsByTagName('both') tail_svr = t.getElementsByTagName('SVR') tail_ecl = t.getElementsByTagName('ECL') for tb in tail_both: b = tb.childNodes[0].nodeValue b = b.strip(' \t') list_ecl.append(b) list_svr.append(b) for ts in tail_svr: s = ts.childNodes[0].nodeValue s = s.strip(' \t') list_svr.append(s) for te in tail_ecl: e = te.childNodes[0].nodeValue e = e.strip(' \t') list_ecl.append(e) getText(svr_file, ecl_file, defines_file) def main(argv): ''' Opens files,and calls appropriate functions based on Object values. ''' global SVR_FILENAME global ECL_FILENAME global DEFINES_FILENAME global MASTER_FILENAME SVR_FILENAME = "/dev/null" ECL_FILENAME = "/dev/null" DEFINES_FILENAME = "/dev/null" MASTER_FILENAME = "/dev/null" if len(sys.argv) == 2: usage() sys.exit(1) try: opts, args = getopt.getopt( argv, "m:s:e:d:h", ["master=", "svr=", "ecl=", "attr=", "help=", "defines="]) except getopt.error as err: print(str(err)) usage() sys.exit(1) for opt, arg in opts: if opt in ('-h', "--help"): usage() sys.exit(1) elif opt in ("-m", "--master"): MASTER_FILENAME = arg elif opt in ("-s", "--svr"): SVR_FILENAME = arg elif opt in ("-d", "--defines"): DEFINES_FILENAME = arg elif opt in ("-e", "--ecl"): ECL_FILENAME = arg else: print("Invalid Option!") sys.exit(1) # Error conditions are checked here. 
if (MASTER_FILENAME is None or not os.path.isfile(MASTER_FILENAME) or not os.path.getsize(MASTER_FILENAME) > 0): print("Master file not found or data is not present in File") sys.exit(1) try: master_file = open(MASTER_FILENAME, encoding='utf-8') except IOError as err: print(str(err)) print('Cannot open master file ' + MASTER_FILENAME) sys.exit(1) try: svr_file = open(SVR_FILENAME, 'w', encoding='utf-8') except IOError as err: print(str(err)) print('Cannot open ferver file ' + SVR_FILENAME) sys.exit(1) try: defines_file = open(DEFINES_FILENAME, 'w', encoding='utf-8') except IOError as err: print(str(err)) print('Cannot open defines file ' + DEFINES_FILENAME) sys.exit(1) try: ecl_file = open(ECL_FILENAME, 'w', encoding='utf-8') except IOError as err: print(str(err)) print('Cannot open ecl file ' + ECL_FILENAME) sys.exit(1) process(master_file, svr_file, ecl_file, defines_file) master_file.close() svr_file.close() ecl_file.close() def usage(): """ Usage (depicts the usage of the script) """ print("usage: prog -m -s " "-e -d ") if __name__ == "__main__": main(sys.argv[1:]) ================================================ FILE: ci/README.md ================================================ Instant-CI is a developer tool which aims at providing continous integration to the developers locally on their development systems. Users can build, install PBS and run PTL tests with a single command. For this, the user need not worry about any underlying dependencies. It also supports build and test history in the form of logs. Dependencies for this tool are: * python3.5 or above * docker (17.12.0+) * docker-compose ***How to setup:*** Simply invoke the following command: ` ./ci` ***CLI interface for ci:*** * **./ci :** This is the primary command for ci. It starts the container (if not already running), builds PBS dependencies. Will configure(if required), make and install PBS. If the tests option are given it will run PTL with the same. It does not take any argument. 
```bash ./ci ``` * **./ci --params:** The params option can be used to run ci with a custom configuration. Following parameters can be set. | os | nodes | configure | tests | > os: used to set OS platform of the container (single node)
> nodes: used to define multi-node configuration for container
> configure: will hold the value of configure options for PBS
> tests: holds the pbs_benchpress arguments for PTL; if set to empty, PTL tests are skipped
```bash
# When the params command is called without any arguments it will display the currently set "configuration" and then proceed to run ci
# as in the following example.
./ci --params
# or
./ci -p
# The following command is an example of how to provide a custom configure option for PBS. Everything to the right of the first '=' after configure will
# be taken as it is and given as an argument to the configure file in PBS. The same convention follows for other configuration options as well
./ci --params 'configure=CFLAGS=" -O2 -Wall -Werror" --prefix=/tmp/pbs --enable-ptl'
# You can also pass multiple parameters with this option, for example
./ci -p 'configure=--enable-ptl --prefix=/opt/pbs' -p 'tests=-t SmokeTest.test_basic'
# The following are examples of how to define a custom test case for pbs_benchpress.
# NOTE: The string is passed to the pbs_benchpress command, therefore one can use all available options of pbs_benchpress here.
# By default the test option is set to '-t SmokeTest'
./ci --params 'tests=-f pbs_smoketest.py'
./ci --params 'tests=--tags=smoke'
# If you wish to not run any PTL tests then use the below command. This will set tests as empty thus not invoking PTL.
./ci --params 'tests='
# Below is an example of setting the container operating system. This will setup a single container running PBS server.
# NOTE: ci uses cached images to increase performance. These cached images are saved on the local system
# with the suffix '-ci-pbs'. If you do not wish to use the cached image(s), delete them using `docker rmi <image name>`.
# OS platform can be defined by any image from docker-hub
./ci --params 'os=centos:7'
# Following is an example of how to define a multi node setup for PBS.
# You can define multiple 'mom' or 'comm' nodes but only one 'server' node
./ci --params 'nodes=mom=centos:7;server=ubuntu:16.04;comm=ubuntu:18.04;mom=centos:8'
```
* **./ci --build-pkgs:** Invoke this command to build PBS packages. By default it will build packages for the platform the ci container is started for.
Optionally accepts argument for other platform. The packages can be found in 'ci/packages' folder. ```bash # Below command builds package for the platform ci was started/currently running on. ./ci --build-pkgs # or ./ci -b ``` * **./ci --delete:** This will delete any containers created by this tool and take a backup of logs. The current logs can be found in the "logs" folder in the ci folder. The backup of previous sessions logs can be can be found in the ci/logs/session-{date}-{timestamp} folder. ```bash # If you want to delete the container simply invoke this command. ./ci --delete # or ./ci -d ``` * **./ci --local:** This will build, install PBS, and run smoke tests on the local machine. This option can not be combined with other options. It does not take configurations from params but runs with predefined params(as run in travis). ```bash # The command to run ./ci --local #or ./ci -l # Optionally one can run the sanitize version (works only on centos:7) with the following argument ./ci --local sanitize ``` ================================================ FILE: ci/ci ================================================ #!/usr/bin/env python3 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. 
# # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. import argparse import configparser import copy import fileinput import json import os import platform import re import shlex import shutil import subprocess import sys import textwrap import threading import time from argparse import RawTextHelpFormatter from string import Template ci_dirname = '' default_platform = '' MACROS = {} def read_macros(): for line in open(os.path.join(ci_dirname, 'etc', 'macros')): var, value = line.split('=') MACROS[var] = value.replace('\n', '') requirements_template = Template('''num_servers=${num_servers} num_moms=${num_moms} num_comms=${num_comms} no_mom_on_server=${no_mom_on_server} no_comm_on_server=${no_comm_on_server} no_comm_on_mom=${no_comm_on_mom} ''') service_template_prist = Template('''{ "image": "${image}", "volumes": [ "../:/pbssrc", "./:/src", "./logs:/logs", "./etc:/workspace/etc" ], "entrypoint": "/workspace/etc/container-init", "environment": [ "NODE_TYPE=${node_type}", "LANG=en_US.utf-8" ], "networks": { "ci.local": { } }, "domainname": "ci.local", "container_name": "${hostname}", "hostname": "${hostname}", "user": "root", "privileged": true, "stdin_open": true, "tty": true 
}''') def log_error(msg): print("ERROR ::: " + str(msg)) def log_info(msg): t = time.localtime() current_time = time.strftime("%H:%M:%S", t) print(current_time + " ---> " + str(msg)) def log_warning(msg): print("WARNING ::: " + str(msg)) def get_services_list(): _ps = subprocess.run( ["docker-compose", "-f", "docker-compose.json", "ps", "--filter", "status=running", "--services"], stdout=subprocess.PIPE) _p = str((_ps.stdout).decode('utf-8')) return [x for x in _p.splitlines() if len(x) > 0] def get_compose_file_services_list(): compose_file = os.path.join(ci_dirname, 'docker-compose.json') with open(compose_file) as f: compose_file = json.loads(f.read()) return list(compose_file['services'].keys()) def run_cmd(cmd, return_output=False): ''' Run a terminal command, and if needed return output of the command. ''' cmd = shlex.split(cmd) try: a = subprocess.Popen(cmd, stdout=subprocess.PIPE) out, err = a.communicate() if a.returncode != 0: log_error("command failed") log_error(str(err)) else: if return_output: return str(out) except Exception as e: log_error("The command failed.") log_error(e) def run_docker_cmd(run_cmd, run_on='all'): ''' Runs a docker command and on failure redirects user to the container terminal ''' services = get_services_list() services.sort(reverse=True) # we want server cmds to run first for service in services: cmd = "docker-compose -f docker-compose.json exec " cmd += service + " bash -c \'" + run_cmd + "\'" if run_on != 'all' and service.find(run_on) == -1: log_info('Skipping on ' + service + ' as command only to be run on ' + run_on) continue try: log_info(cmd) docker_cmd = shlex.split(cmd) a = subprocess.Popen(docker_cmd) a.communicate() if a.returncode != 0: _msg = "docker cmd returned with non zero exit code," _msg += "redirecting you to container terminal" log_error(_msg) _docker_cmd = "docker-compose -f docker-compose.json exec " _docker_cmd += service + " bash -c \'cd /pbssrc && /bin/bash\'" docker_cmd = shlex.split(_docker_cmd) 
        subprocess.run(docker_cmd)
        os._exit(1)
    except Exception as e:
        log_error("Failed\n:")
        log_error(e)


def write_to_file(file_path, value):
    '''
    Overwrite file_path with the given string value (file is created
    if it does not exist).
    '''
    with open(file_path, "w+") as f:
        f.write(value)


def read_from_file(file_path):
    '''
    Return the full contents of file_path as a string; creates an
    empty file first when it does not exist yet.
    '''
    if not os.path.isfile(file_path):
        open(file_path, 'a').close()
    with open(file_path, 'r+') as f:
        val = f.read()
    return val


def commit_docker_image():
    '''
    Watch for readiness of ci containers to commit a new image
    '''
    images_to_commit = {}
    time_spent = 0
    services = get_services_list()
    service_count = len(services)
    # overall readiness timeout: one hour
    timeout = 1 * 60 * 60
    while service_count > 0:
        # Do not want to check constantly as it increases cpu load
        time.sleep(15)
        time_spent = time_spent + 15
        if time_spent > timeout:
            log_error("build is taking too long, timed out")
            sys.exit(1)
        # the status file accumulates the names of services that finished
        status = read_from_file(os.path.join(
            ci_dirname, MACROS['CONFIG_DIR'], MACROS['STATUS_FILE']))
        for service in services:
            if str(status).find(service) != -1:
                # NOTE(review): removes from `services` while iterating it —
                # may skip the element after the removed one on this pass;
                # it is re-scanned on the next poll, so progress still happens.
                services.remove(service)
                service_count -= 1
                # service name is '<type>-<image-with-dashes>-<count>';
                # recover 'image:tag' from the middle portion
                image = (service.split('-', 1)[1][:-2]).replace('-', ':')
                image = image.replace("_", ".")
                images_to_commit[image] = service
    for key in images_to_commit:
        try:
            build_id = 'docker-compose -f docker-compose.json ps -q ' + \
                images_to_commit[key]
            build_id = run_cmd(build_id, True)
            # run_cmd(..., True) returns the repr of a bytes object ("b'...'"),
            # so the payload sits between the single quotes
            build_id = build_id.split("'")[1]
            build_id = build_id[:12]
            image_name = (str(key).replace(':', '-')
                          ).replace('.', '_') + '-ci-pbs'
            # shortening the build id to 12 characters as is displayed by
            # 'docker ps' unlike 'docker-compose ps' which shows full id
            cmd = 'docker commit '+build_id+' '+image_name+':latest'
            log_info(cmd)
            run_cmd(cmd)
        except Exception as e:
            log_error(e)
    try:
        # clean up untagged (dangling) layers left behind by the commit
        bad_images = "docker images -qa -f'dangling=true'"
        bad_images = run_cmd(bad_images, True)
        if bad_images != "b''":
            bad_images = (bad_images.split("'")[1]).replace("\\n", " ")
            print("The following untagged images will be removed -> " +
                  bad_images)
            cmd = 'docker rmi ' + bad_images
            run_cmd(cmd)
    except Exception as e:
        log_warning(
            "could not remove bad (dangling) images, \
please remove manually")
        print(e)
    return True


def create_ts_tree_json():
    '''
    Generate the PTL testsuite info json (ptl_ts_tree.json) inside the
    server container for the currently configured benchpress options.
    '''
    benchpress_opt = os.path.join(
        ci_dirname, MACROS['CONFIG_DIR'], MACROS['BENCHPRESS_OPT_FILE'])
    benchpress_value = read_from_file(benchpress_opt)
    try:
        cmd = '/src/etc/gen_ptl_json.sh "' + benchpress_value + '"'
        run_docker_cmd(cmd, run_on='server')
    except Exception:
        log_error('Failed to generate testsuite info json')
        sys.exit(1)


def get_node_config(node_image=default_platform):
    '''
    Calculate the required node configuration for given
    requirements decorator and return node config
    '''
    json_data = {}
    max_servers_needed = 1
    max_moms_needed = 1
    max_comms_needed = 1
    no_mom_on_server_flag = False
    no_comm_on_mom_flag = True
    no_comm_on_server_flag = False
    try:
        with open(os.path.join(ci_dirname, 'ptl_ts_tree.json')) as f:
            json_data = json.load(f)
    except Exception:
        log_error('Could not find ptl tree json file')
    # take the maximum of every requirement across all selected test cases
    for ts in json_data.values():
        for tclist in ts['tclist'].values():
            max_moms_needed = max(
                tclist['requirements']['num_moms'], max_moms_needed)
            max_servers_needed = max(
                tclist['requirements']['num_servers'], max_servers_needed)
            max_comms_needed = max(
                tclist['requirements']['num_comms'], max_comms_needed)
            no_mom_on_server_flag = tclist['requirements']['no_mom_on_server']\
                or no_mom_on_server_flag
            no_comm_on_server_flag = tclist['requirements']['no_comm_on_server']\
                or no_comm_on_server_flag
            no_comm_on_mom_flag = tclist['requirements']['no_comm_on_mom']\
                or no_comm_on_mom_flag
    # Create a bash readable requirements decorator file
    write_to_file(os.path.join(ci_dirname, MACROS['CONFIG_DIR'],
                               MACROS['REQUIREMENT_DECORATOR_FILE']),
                  requirements_template.substitute(
                      num_servers=max_servers_needed,
                      num_moms=max_moms_needed,
                      num_comms=max_comms_needed,
                      no_mom_on_server=no_mom_on_server_flag,
                      no_comm_on_server=no_comm_on_server_flag,
                      no_comm_on_mom=no_comm_on_mom_flag))
    server_nodes = []
    mom_nodes = []
    comm_nodes = []
    # get required number of servers and moms
    for _ in range(max_servers_needed):
        server_nodes.append(node_image)
    if not no_mom_on_server_flag:
        # server containers double as moms; only add extra dedicated moms
        max_moms_needed = max(max_moms_needed, max_servers_needed)
        if max_moms_needed > max_servers_needed:
            for _ in range(max_moms_needed - max_servers_needed):
                mom_nodes.append(node_image)
    else:
        for _ in range(max_moms_needed):
            mom_nodes.append(node_image)
    only_moms = len(mom_nodes)
    # get required num of comms
    if no_comm_on_mom_flag and no_comm_on_server_flag:
        for _ in range(max_comms_needed):
            comm_nodes.append(node_image)
    elif no_comm_on_mom_flag and not no_comm_on_server_flag:
        if max_comms_needed > max_servers_needed:
            for _ in range(max_comms_needed-max_servers_needed):
                comm_nodes.append(node_image)
    else:
        if max_comms_needed > only_moms:
            for _ in range(max_comms_needed - only_moms):
                comm_nodes.append(node_image)
    # prefix each node with its role and join into a 'role=image;...' string
    mom_nodes = ['mom=' + x for x in mom_nodes]
    server_nodes = ['server=' + x for x in server_nodes]
    comm_nodes = ['comm=' + x for x in comm_nodes]
    node_images = ";".join(server_nodes + mom_nodes + comm_nodes)
    return node_images


def tail_build_log():
    '''
    Continuously print new last lines of the server container's build log
    until the status file reports the server service as done.
    '''
    server_name = ''
    build_log_path = get_services_list()
    for i in build_log_path:
        if i.find('server') != -1:
            build_log_path = i
            server_name = i
    build_log_path = os.path.join(
        ci_dirname, 'logs', 'build-' + build_log_path)
    prev = ''
    next = ''
    with open(build_log_path, 'rb') as f:
        while True:
            # seek backwards from EOF to the start of the last full line
            f.seek(-2, os.SEEK_END)
            while f.read(1) != b'\n':
                f.seek(-2, os.SEEK_CUR)
            next = f.readline().decode()
            if next != prev:
                print(next, end='')
                prev = next
            else:
                # no new output; check whether the server finished its build
                status = os.path.join(
                    ci_dirname, MACROS['CONFIG_DIR'], MACROS['STATUS_FILE'])
                status = read_from_file(status)
                if status.find(server_name) != -1:
                    return


def check_for_existing_image(val=default_platform):
    '''
    This function will check whether an existing image with the post-fix
    of '-ci-pbs' exists or not for the given docker image.
    '''
    if val.find('-ci-pbs') == -1:
        search_str = val.replace(":", "-")
        search_str = search_str.replace(".", '_')
        search_str += '-ci-pbs'
        cmd = 'docker images -q ' + search_str
        search_result = run_cmd(cmd, True)
        # run_cmd(..., True) yields a bytes repr; "b''" means no image found
        if search_result != "b''":
            return True, search_str
        # NOTE(review): when no committed image exists this path falls
        # through and returns None, yet callers unpack a 2-tuple — verify
    else:
        return False, val


def get_current_setup():
    '''
    Returns the node config for currently running ci containers
    '''
    compose_file = os.path.join(ci_dirname, 'docker-compose.json')
    node_config = ''
    with open(compose_file) as f:
        compose_file = json.loads(f.read())
    for service in compose_file['services']:
        image = compose_file["services"][service]['image']
        if image[-7:] == '-ci-pbs':
            # strip the '-ci-pbs' suffix and turn the LAST '-' back into ':'
            # (reverse, replace first '-', reverse again)
            image = image[:-7][::-1].replace('-', ':', 1)[::-1]
        # environment[0] is the 'NODE_TYPE=<role>' entry of the service
        node_type = compose_file["services"][service]['environment'][0]
        node_type = node_type.split('=')[1]
        node_config += node_type + '=' + image + ';'
    # drop the trailing ';'
    node_config = node_config[:-1]
    return node_config


def load_conf():
    '''
    Load and return the ci configuration json (configure/tests options).
    '''
    conf_file = os.path.join(
        ci_dirname, MACROS['CONFIG_DIR'], MACROS['CONF_JSON_FILE'])
    with open(conf_file) as f:
        conf_file = json.loads(f.read())
    return conf_file


def show_set_opts():
    '''
    Print the currently configured ci options plus the OS/role list
    derived from the docker-compose services.
    '''
    conf_opts = load_conf()
    os_file_list = get_compose_file_services_list()
    # service names look like '<role>-<image_with_dashes>-<n>';
    # rebuild the readable 'role=image:tag' form
    os_file_list = [(x.split('-', 1)[0] + '=' + x.split('-', 1)[1][:-2]
                     ).replace('-', ':').replace('_', '.')
                    for x in os_file_list]
    os_file_list.sort()
    conf_opts['OS'] = os_file_list
    print(json.dumps(conf_opts, indent=2, sort_keys=True))


def create_param_file():
    '''
    Create param file with necessary node configuration
    for multi node PTL tests.
    '''
    moms = []
    comms = []
    include_server_mom = False
    include_server_comm = False
    include_mom_comm = False
    reqs = read_from_file(os.path.join(
        ci_dirname, MACROS['CONFIG_DIR'],
        MACROS['REQUIREMENT_DECORATOR_FILE']))
    if reqs.find('no_mom_on_server=False') != -1:
        include_server_mom = True
    if reqs.find('no_comm_on_server=False') != -1:
        include_server_comm = True
    if reqs.find('no_comm_on_mom=False') != -1:
        include_mom_comm = True
    for service in get_services_list():
        service = service+'.ci.local'
        if service.find('server') != -1:
            if include_server_mom:
                moms.append(service)
            if include_server_comm:
                comms.append(service)
        if service.find('mom') != -1:
            moms.append(service)
            if include_mom_comm:
                comms.append(service)
        if service.find('comm') != -1:
            comms.append(service)
    write_str = ''
    if len(moms) != 0:
        write_str = 'moms=' + ':'.join(moms) + '\n'
    if len(comms) != 0:
        write_str += 'comms=' + ':'.join(comms)
    param_path = os.path.join(
        ci_dirname, MACROS['CONFIG_DIR'], MACROS['PARAM_FILE'])
    write_to_file(param_path, write_str)


def unpack_node_string(nodes):
    '''
    Helper function to expand abbreviated node config
    '''
    # NOTE(review): mutates `nodes` while iterating it; a '3*mom=img' entry
    # immediately following another expanded entry could be skipped — verify
    for x in nodes:
        if x.find('*') != -1:
            num = x.split('*')[0]
            try:
                num = int(num)
            except Exception:
                log_error('invalid string provided for "nodes" configuration')
                sys.exit(1)
            val = x.split('*')[1]
            nodes.remove(x)
            for _ in range(num):
                nodes.append(val)
    return ';'.join(nodes)


def build_compose_file(nodes):
    '''
    Build docker-compose file for given node config in function parameter
    '''
    compose_template = {
        "version": "3.5",
        "networks": {
            "ci.local": {
                "name": "ci.local"
            }
        },
        "services": {}
    }
    if nodes.find("*") != -1:
        nodes = unpack_node_string(nodes.split(';'))
    count = 0
    server = ''
    for n in nodes.split(';'):
        count = count + 1
        node_key, node_val = n.split('=')
        # warn only for platforms explicitly named on the command line
        if (node_val not in MACROS['SUPPORTED_PLATFORMS'].split(',')
                and ''.join(sys.argv).find(node_val) != -1):
            log_warning("Given platform '" + node_val +
                        "' is not supported by" +
                        " ci, will result in unexpected behaviour")
            log_warning("Supported platforms are " +
                        MACROS['SUPPORTED_PLATFORMS'])
        node_name = node_key + '-' + \
            (node_val.replace(':', '-')).replace('.', '_') + '-' + str(count)
        image_value = node_val
        # prefer a previously committed '-ci-pbs' image when one exists
        _, image_value = check_for_existing_image(node_val)
        service_template = json.loads(service_template_prist.substitute(
            image=image_value, node_type=node_key, hostname=node_name))
        if node_key == 'server':
            server = node_name
        compose_template['services'][node_name] = service_template
    # every service learns the server's hostname via its environment
    for service in compose_template['services']:
        compose_template['services'][service]['environment'].append(
            "SERVER="+server)
    f = open(os.path.join(ci_dirname, 'docker-compose.json'), 'w')
    json.dump(compose_template, f, indent=2, sort_keys=True)
    f.close()
    log_info("Configured nodes for ci")


def ensure_ci_running():
    '''
    Check for running ci container; if not start ci container.
    '''
    try:
        service_count = len(get_services_list())
        if service_count == 0:
            log_info("No running service found")
            try:
                log_info('Attempting to start container')
                os.chdir(ci_dirname)
                subprocess.run(["docker-compose", "-f",
                                "docker-compose.json",
                                "down", "--remove-orphans"],
                               stdout=subprocess.DEVNULL)
                # reset the build status file before starting fresh
                if os.path.exists(os.path.join(ci_dirname,
                                               MACROS['CONFIG_DIR'],
                                               MACROS['STATUS_FILE'])):
                    os.remove(os.path.join(
                        ci_dirname, MACROS['CONFIG_DIR'],
                        MACROS['STATUS_FILE']))
                write_to_file(os.path.join(
                    ci_dirname, MACROS['CONFIG_DIR'],
                    MACROS['STATUS_FILE']), '')
                subprocess.run(
                    ["docker-compose", "-f", "docker-compose.json",
                     "up", "-d"])
                log_info('Waiting for container build to complete ')
                build_log_path = os.path.join(ci_dirname, 'logs')
                log_info("Build logs can be found in " + build_log_path)
                # wait for build to complete and commit newly built container
                tail_build_log()
                commit_docker_image()
            except Exception as e:
                log_error(e)
        else:
            log_info("running container found")
            return 0
    except Exception:
        # NOTE(review): 'e' is not bound in this handler — logging it would
        # raise NameError; verify against upstream
        log_error(e)


def check_prerequisites():
    '''
    This function will check whether docker docker-compose
    commands are available.
    Also check docker version is minimum required.
    '''
    cmd = "where" if platform.system() == "Windows" else "which"
    try:
        subprocess.run([cmd, "docker"], stdout=subprocess.DEVNULL)
    except Exception:
        log_error("docker not found in PATH")
        sys.exit(1)

    def version_tuple(s: str):
        # turn 'X.Y.Z' into a comparable tuple of ints
        return tuple(int(x) for x in s.split("."))
    try:
        version = subprocess.run(
            ["docker", "--version"], stdout=subprocess.PIPE)
        version = re.findall(r'\s*([\d.]+)', version.stdout.decode('utf-8'))
        req_version = MACROS['REQ_DOCKER_VERSION']
        if version_tuple(version[0]) < version_tuple(req_version):
            print(version[0])
            print("Docker version less than minimum required " + req_version)
            sys.exit(1)
    except Exception:
        log_error("Failed to get docker version")
        sys.exit(1)
    try:
        subprocess.run([cmd, "docker-compose"], stdout=subprocess.DEVNULL)
    except Exception:
        log_error("docker-compose not found in PATH")
        sys.exit(1)


def is_restart_required():
    '''
    This function checks if the number of nodes currently running
    meet requirement for the given test case.
    If not builds new docker-compose file and
    returns bool value to restart ci.
    '''
    create_ts_tree_json()
    current_file_services_list = get_compose_file_services_list()
    current_node_image = current_file_services_list[0].split(
        '-', 1)[1][:-2].replace('-', ':')
    node_config = get_node_config(node_image=current_node_image)
    potential_list = []
    for val in node_config.split(';'):
        val = val.replace('=', '-')
        val = val.replace(':', '-')
        potential_list.append(val)
    current_file_services_list = [i[:-2] for i in current_file_services_list]
    # compare without platform names
    current_file_services_list = [
        i.split('-', 1)[0] for i in current_file_services_list]
    potential_list = [i.split('-', 1)[0] for i in potential_list]
    potential_list.sort()
    current_file_services_list.sort()
    if current_file_services_list != potential_list:
        build_compose_file(node_config)
        return True
    else:
        return False


def setup_config_dir():
    '''
    Initializes config directory and files for ci
    '''
    command_path = os.path.join(ci_dirname, MACROS['CONFIG_DIR'])
    if not os.path.exists(command_path):
        os.mkdir(command_path)
    target_path = os.path.join(command_path, MACROS['CONF_JSON_FILE'])
    if not os.path.exists(target_path):
        value = '{ "configure": "--prefix=/opt/pbs '
        value += '--enable-ptl", "tests" : "-t SmokeTest" }'
        write_to_file(target_path, value)
    target_path = os.path.join(command_path, MACROS['CONFIGURE_OPT_FILE'])
    if not os.path.exists(target_path):
        value = "--prefix=/opt/pbs --enable-ptl"
        write_to_file(target_path, value)
    target_path = os.path.join(command_path, MACROS['BENCHPRESS_OPT_FILE'])
    if not os.path.exists(target_path):
        value = "-t SmokeTest"
        write_to_file(target_path, value)
    target_path = os.path.join(ci_dirname, 'docker-compose.json')
    if not os.path.exists(target_path):
        build_compose_file('server=' + default_platform)
        run_cmd('docker-compose -f docker-compose.json down --remove-orphans')


def delete_ci():
    '''
    Takes backup of logs and deletes running containers.
    '''
    services = get_services_list()
    if len(services) != 0:
        build_compose_file(nodes=get_current_setup())
        cmd = '/src/etc/killit.sh backup'
        run_docker_cmd(cmd, run_on='server')
        log_warning('Removed logs file')
        log_info('backup files can be found in ' + build_log_path)
    else:
        log_info('No running container found, nothing to backup')
    try:
        os.chdir(ci_dirname)
        run_cmd(
            "docker-compose -f docker-compose.json down --remove-orphans")
        log_info(
            "done delete container and services")
    except Exception as e:
        # NOTE(review): string + exception concatenation raises TypeError
        # unless 'e' is str()-converted — verify
        log_error("Failed to destroy container and services: " + e)


def parse_params(params_list):
    '''
    Update given params
    '''
    # 'called' is the sentinel argparse const for -b/--build-pkgs
    if params_list[0] != 'called':
        container_running = False
        conf_opts = load_conf()
        for set_opts in params_list:
            key, value = (set_opts).split('=', 1)
            service_count = len(get_services_list())
            if service_count > 0:
                container_running = True
            if key.lower() == 'nodes':
                if container_running:
                    log_warning(
                        "Deleting existing containers first,\
 find backup in logs folder")
                    delete_ci()
                build_compose_file(value)
            elif key.lower() == 'os':
                if container_running:
                    log_warning(
                        "Deleting existing containers first, \
find backup in logs folder")
                    delete_ci()
                node_string = value.replace('"', '')
                node_string = 'server=' + node_string
                build_compose_file(node_string)
            else:
                if key in conf_opts:
                    conf_opts[key] = value
                    f = open(os.path.join(
                        ci_dirname, MACROS['CONFIG_DIR'],
                        MACROS['CONF_JSON_FILE']), 'w')
                    json.dump(conf_opts, f, indent=2, sort_keys=True)
                    f.close()
                else:
                    log_error("Unrecognised key in parameter: '" +
                              key + "' , nothing updated")
                    sys.exit(1)


def run_ci_local(local):
    '''
    Run ci locally on host without spawning containers
    '''
    os.chdir(ci_dirname)
    # using subprocess.run instead of run_cmd function
    # so we dont supress stdout and stderr
    if local == 'normal':
        exit_code = subprocess.run("./etc/do.sh")
        sys.exit(exit_code.returncode)
    if local == 'sanitize':
        exit_code = subprocess.run("./etc/do_sanitize_mode.sh")
        sys.exit(exit_code.returncode)


def run_ci(build_pkgs=False):
    '''
    Run PBS configure, install PBS and run PTL tests,
    if build_pkgs is set to True it will instead run
    package build script only
    '''
    # Display Current options
    log_info("Running ci with the following options")
    show_set_opts()
    if len(get_services_list()) > 0:
        build_compose_file(get_current_setup())
    ret = ensure_ci_running()
    if ret == 1:
        log_error(
            "container build failed, build logs can be found in " +
            build_log_path)
        sys.exit(1)
    command_path = os.path.join(ci_dirname, MACROS['CONFIG_DIR'])
    conf_opts = load_conf()
    if build_pkgs:
        build_cmd = '/src/etc/build-pbs-packages.sh'
        log_info('The package build logs can be found in logs/pkglogs')
        run_docker_cmd(build_cmd + ' | tee /logs/pkglogs', run_on='server')
        sys.exit(0)
    if conf_opts['tests'] != '':
        target_path = os.path.join(command_path,
                                   MACROS['BENCHPRESS_OPT_FILE'])
        write_to_file(target_path, conf_opts['tests'])
        # node requirements may have changed with the new test selection
        if is_restart_required():
            delete_ci()
            ensure_ci_running()
    target_path = os.path.join(command_path, MACROS['CONFIGURE_OPT_FILE'])
    if conf_opts['configure'] != read_from_file(target_path):
        # configure options changed: re-run configure inside the containers
        write_to_file(target_path, conf_opts['configure'])
        cmd = ' export ONLY_CONFIGURE=1 && /src/etc/do.sh 2>&1 \
| tee -a /logs/build-$(hostname -s) '
        run_docker_cmd(cmd)
    cmd = ' export ONLY_REBUILD=1 && /src/etc/do.sh 2>&1 \
| tee -a /logs/build-$(hostname -s) '
    run_docker_cmd(cmd)
    cmd = ' export ONLY_INSTALL=1 && /src/etc/do.sh 2>&1 \
| tee -a /logs/build-$(hostname -s) '
    run_docker_cmd(cmd)
    target_path = os.path.join(command_path, MACROS['BENCHPRESS_OPT_FILE'])
    if conf_opts['tests'] == '':
        write_to_file(target_path, conf_opts['tests'])
        log_warning("No tests assigned, skipping PTL run")
    else:
        create_param_file()
        write_to_file(target_path, conf_opts['tests'])
        cmd = 'export RUN_TESTS=1 && export ONLY_TEST=1 && /src/etc/do.sh '
        run_docker_cmd(cmd, run_on='server')


if __name__ == "__main__":
    # ci lives in <repo>/ci; derive that directory from this script's path
    ci_dirname = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    ci_dirname = os.path.join(ci_dirname, 'ci')
    os.chdir(ci_dirname)
    read_macros()
    _help = '''
    Examples of using arguments.
    ./ci -p 'OS=centos:7'
    ./ci -p 'tests=-t SmokeTest'
    ./ci -p 'configure=CFLAGS="-g -O2" --enable-ptl'
    ./ci -p 'nodes=mom=centos:7;server=ubuntu:16.04'
    ./ci -d or ./ci --delete
    ./ci -b or ./ci --build
    ./ci -l or ./ci --local
    Note: Set tests as empty if you dont want to run PTL'
    '''
    _help += 'Supported platforms are ' + MACROS['SUPPORTED_PLATFORMS']
    ap = argparse.ArgumentParser(prog='ci',
                                 description='Runs the ci tool for pbs',
                                 formatter_class=argparse.
                                 RawTextHelpFormatter,
                                 epilog=textwrap.dedent(_help),
                                 conflict_handler='resolve')
    _help = 'set configuration values for os | nodes | configure | tests'
    ap.add_argument('-p', '--params', nargs='+', action='append',
                    help=_help, metavar='param')
    _help = 'destroy pbs container'
    ap.add_argument('-d', '--delete', action='store_true', help=_help)
    _help = 'build packages for the current platform.'
    ap.add_argument('-b', '--build-pkgs', nargs='?',
                    const='called', help=_help)
    _help = 'Simply run the tests locally, without spawning any containers.'
    _help += '\ntype can be one of normal (default) or sanitize'
    ap.add_argument('-l', '--local', nargs='?', const='normal',
                    help=_help, metavar='type')
    args = ap.parse_args()
    build_pkgs = False
    default_platform = MACROS['DEFAULT_PLATFORM']
    build_log_path = os.path.join(ci_dirname, 'logs')
    # detect a --local invocation in all its spellings; local runs skip
    # config-dir setup and the docker prerequisite checks
    not_local_run = sys.argv.count('-l') == 0 \
        and sys.argv.count('--local') == 0 \
        and sys.argv.count('-l=sanitize') == 0\
        and sys.argv.count('--local=sanitize') == 0 \
        and sys.argv.count('-l=normal') == 0 \
        and sys.argv.count('--local=normal') == 0
    if not_local_run:
        setup_config_dir()
        check_prerequisites()
    if (not args.delete) and not_local_run and (args.params is None):
        ret = ensure_ci_running()
        if ret == 1:
            log_error(
                "container build failed, build logs can be found in " +
                build_log_path)
            sys.exit(1)
    try:
        if args.params is not None:
            for p in args.params:
                parse_params(p)
        if args.build_pkgs is not None:
            build_pkgs = True
        if args.delete is True:
            confirm = input(
                'Are you sure you want to delete containers (Y/N)?: ')
            if confirm[0].lower() == 'n':
                sys.exit(0)
            elif confirm[0].lower() == 'y':
                delete_ci()
            else:
                log_error("Invalid option provided")
            sys.exit(0)
        if args.local is not None:
            run_ci_local(args.local)
    except Exception as e:
        ap.print_help()
        log_error(e)
    run_ci(build_pkgs)



================================================
FILE: ci/etc/build-pbs-packages.sh
================================================
#! /bin/bash -xe
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
# # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. . /etc/os-release pbsdir=/pbssrc rpm_dir=/root/rpmbuild rm -rf /src/packages mkdir -p /src/packages mkdir -p ${rpm_dir}/{BUILD,RPMS,SOURCES,SPECS,SRPMS} if [ "x${ID}" == "xcentos" -a "x${VERSION_ID}" == "x8" ]; then export LANG="C.utf8" swig_opt="--with-swig=/usr/local" if [ ! 
-f /tmp/swig/swig/configure ]; then # source install swig dnf -y install gcc-c++ byacc pcre-devel mkdir -p /tmp/swig/ cd /tmp/swig git clone https://github.com/swig/swig --branch rel-4.0.0 --single-branch cd swig ./autogen.sh ./configure make -j8 make install cd ${PBS_DIR} fi fi cp -r $pbsdir /tmp/pbs cd /tmp/pbs ./autogen.sh mkdir -p target cd target ../configure --prefix=/opt/pbs --enable-ptl ${swig_opt} make dist cp *.tar.gz ${rpm_dir}/SOURCES cp ../*-rpmlintrc ${rpm_dir}/SOURCES cp *.spec ${rpm_dir}/SPECS cflags="-g -O2 -Wall -Werror" cxxflags="-g -O2 -Wall -Werror" if [ "x${ID}" == "xdebian" -o "x${ID}" == "xubuntu" ]; then CFLAGS="${cflags} -Wno-unused-result" CXXFLAGS="${cxxflags} -Wno-unused-result" rpmbuild -ba --nodeps *.spec --with ptl else if [ "x${ID}" == "xcentos" -a "x${VERSION_ID}" == "x8" ]; then CFLAGS="${cflags}" CXXFLAGS="${cxxflags}" rpmbuild -ba *.spec --with ptl -D "_with_swig ${swig_opt}" else CFLAGS="${cflags}" CXXFLAGS="${cxxflags}" rpmbuild -ba *.spec --with ptl fi fi cp ${pbsdir}/README.md /src/packages/ cp ${pbsdir}/LICENSE /src/packages/ cp ${pbsdir}/COPYRIGHT /src/packages/ mv ${rpm_dir}/RPMS/*/*pbs* /src/packages/ mv ${rpm_dir}/SRPMS/*pbs* /src/packages/ cd /src/packages rm -rf /tmp/pbs if [ "x${ID}" == "xdebian" -o "x${ID}" == "xubuntu" ]; then _target_arch=$(dpkg --print-architecture) fakeroot alien --to-deb --scripts --target=${_target_arch} *-debuginfo*.rpm -g _dir=$(/bin/ls -1d *debuginfo* | grep -vE '(rpm|orig)') mv ${_dir}/opt/pbs/usr/ ${_dir}/ rm -rf ${_dir}/opt ( cd ${_dir} dpkg-buildpackage -d -b -us -uc ) rm -rf ${_dir} ${_dir}.orig *debuginfo*.buildinfo *debuginfo*.changes *debuginfo*.rpm fakeroot alien --to-deb --scripts --target=${_target_arch} *.rpm rm -f *.rpm fi ================================================ FILE: ci/etc/ci-script-wrapper.service ================================================ # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. 
# # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. [Unit] Description=Run ci docker entrypoint script at startup after all systemd services are loaded After=getty.target [Service] Type=forking RemainAfterExit=yes EnvironmentFile=/.env-file ExecStart=/src/etc/docker-entrypoint TimeoutStartSec=0 [Install] WantedBy=default.target ================================================ FILE: ci/etc/configure_node.sh ================================================ #! /bin/bash -x # Copyright (C) 1994-2021 Altair Engineering, Inc. 
# For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. . /src/etc/macros if [ -f /src/${CONFIG_DIR}/${REQUIREMENT_DECORATOR_FILE} ]; then . 
/src/${CONFIG_DIR}/${REQUIREMENT_DECORATOR_FILE} fi if [ "x${NODE_TYPE}" == "xmom" ]; then sed -i "s@PBS_SERVER=.*@PBS_SERVER=${SERVER}@" /etc/pbs.conf sed -i "s@PBS_START_SERVER=.*@PBS_START_SERVER=0@" /etc/pbs.conf ssh -t root@${SERVER} " /opt/pbs/bin/qmgr -c 'c n $(hostname -s)'" if [ "x${no_comm_on_mom}" == "xTrue" ]; then sed -i "s@PBS_START_COMM=.*@PBS_START_COMM=0@" /etc/pbs.conf else sed -i "s@PBS_START_COMM=.*@PBS_START_COMM=1@" /etc/pbs.conf fi sed -i "s@PBS_START_SCHED=.*@PBS_START_SCHED=0@" /etc/pbs.conf fi if [ "x${NODE_TYPE}" == "xserver" ]; then sed -i "s@PBS_SERVER=.*@PBS_SERVER=$(hostname)@" /etc/pbs.conf if [ "x${no_comm_on_server}" == "xTrue" ]; then sed -i "s@PBS_START_COMM=.*@PBS_START_COMM=0@" /etc/pbs.conf else sed -i "s@PBS_START_COMM=.*@PBS_START_COMM=1@" /etc/pbs.conf fi if [ "x${no_mom_on_server}" == "xTrue" ]; then sed -i "s@PBS_START_MOM=.*@PBS_START_MOM=0@" /etc/pbs.conf else sed -i "s@PBS_START_MOM=.*@PBS_START_MOM=1@" /etc/pbs.conf fi sed -i "s@PBS_START_SERVER=.*@PBS_START_SERVER=1@" /etc/pbs.conf sed -i "s@PBS_START_SCHED=.*@PBS_START_SCHED=1@" /etc/pbs.conf fi if [ "x${NODE_TYPE}" == "xcomm" ]; then sed -i "s@PBS_START_COMM=.*@PBS_START_COMM=1@" /etc/pbs.conf sed -i "s@PBS_SERVER=.*@PBS_SERVER=${SERVER}@" /etc/pbs.conf sed -i "s@PBS_START_MOM=.*@PBS_START_MOM=0@" /etc/pbs.conf sed -i "s@PBS_START_SERVER=.*@PBS_START_SERVER=0@" /etc/pbs.conf sed -i "s@PBS_START_SCHED=.*@PBS_START_SCHED=0@" /etc/pbs.conf fi ================================================ FILE: ci/etc/container-env-setup.sh ================================================ # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. 
You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. 
# Shell environment shared by every CI container; installed as a
# /etc/profile.d snippet and appended to each user's shell rc files.
export container=docker
export TERM=xterm
# Debian/Ubuntu: keep package installs from prompting interactively.
if [ -e /etc/debian_version ]; then
	export DEBIAN_FRONTEND=noninteractive
fi
export LOGNAME=${LOGNAME:-"$(id -un)"}
export USER=${USER:-"$(id -un)"}
# PTL tests assume UTC for both the system and PBS timezone ids.
export TZ=UTC
export PBS_TZID=UTC
# Prepend /usr/local/{bin,sbin} and de-duplicate PATH entries while
# preserving first-seen order (awk keeps only the first occurrence).
export PATH="$(printf "%s" "/usr/local/bin:/usr/local/sbin:${PATH}" | awk -v RS=: -v ORS=: '!($0 in a) {a[$0]; print}')"
export DOMAIN=$(hostname -d)
# Perl module search path for the QA tooling installed under ~/AUTO.
export PERL5LIB=${HOME}/AUTO/lib/perl5/site_perl
export PERL5LIB=${PERL5LIB}:${HOME}/AUTO/lib/site_perl
export PERL5LIB=${PERL5LIB}:${HOME}/AUTO/share/perl5
export PERL5LIB=${PERL5LIB}:${HOME}/AUTO/share/perl
# Verbose PTL output and stack traces for easier CI debugging.
export PBS_TEST_DEBUG=1
export PBS_TEST_VERBOSE=1
export PBS_PRINT_STACK_TRACE=1
export MAIL="${MAIL:-"/var/mail/$(id -un)"}"


================================================
FILE: ci/etc/container-init
================================================
#!/bin/bash -x
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
#
# Commercial License Information:
#
# PBS Pro is commercially licensed software that shares a common core with
# the OpenPBS software.
# For a copy of the commercial license terms and
# conditions, go to: (http://www.pbspro.com/agreement.html) or contact the
# Altair Legal Department.
#
# Altair's dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of OpenPBS and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair's trademarks, including but not limited to "PBS™",
# "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is
# subject to Altair's trademark licensing policies.

# Install the base system packages, mirroring all output into the
# per-host build log.
/workspace/etc/install-system-packages 2>&1 | tee -a /logs/build-$(hostname -s)

# Snapshot the container environment so later shells/services can
# re-source it.
touch /.env-file
set >/.env-file

# If the container was granted CAP_SYS_ADMIN, boot systemd/init as PID 1.
# BUGFIX: the original `grep -Eq '*cap_sys_admin*'` used shell-glob syntax,
# which is not a valid extended regular expression (a leading '*' has no
# preceding atom); match the capability name as a plain literal instead.
if capsh --print | grep -q 'cap_sys_admin'; then
	if [ -x "/usr/lib/systemd/systemd" ]; then
		exec /usr/lib/systemd/systemd --system
	elif [ -x "/lib/systemd/systemd" ]; then
		exec /lib/systemd/systemd --system
	elif [ -x "/usr/sbin/init" ]; then
		exec /usr/sbin/init
	elif [ -x "/sbin/init" ]; then
		exec /sbin/init
	else
		echo "Couldn't start container in systemd mode, starting in default mode"
	fi
fi


================================================
FILE: ci/etc/do.sh
================================================
#!/bin/bash -xe
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. if [ $(id -u) -ne 0 ]; then echo "This script must be run by root user" exit 1 fi if [ -f /src/ci ]; then IS_CI_BUILD=1 FIRST_TIME_BUILD=$1 . /src/etc/macros config_dir=/src/${CONFIG_DIR} chmod -R 755 ${config_dir} logdir=/logs chmod -R 755 ${logdir} PBS_DIR=/pbssrc else PBS_DIR=$(readlink -f $0 | awk -F'/ci/' '{print $1}') fi cd ${PBS_DIR} . /etc/os-release # Extract major version number MAJOR_VERSION="${VERSION_ID%%.*}" SPEC_FILE=$(/bin/ls -1 ${PBS_DIR}/*.spec) REQ_FILE=${PBS_DIR}/test/fw/requirements.txt if [ ! -r ${SPEC_FILE} -o ! 
-r ${REQ_FILE} ]; then echo "Couldn't find pbs spec file or ptl requirements file" exit 1 fi if [ "x${IS_CI_BUILD}" != "x1" ] || [ "x${FIRST_TIME_BUILD}" == "x1" -a "x${IS_CI_BUILD}" == "x1" ]; then if [ "x${ID}" == "xcentos" -a "x${VERSION_ID}" == "x7" ]; then yum clean all yum -y install yum-utils epel-release rpmdevtools yum -y install python3-pip sudo which net-tools man-db time.x86_64 \ expat libedit postgresql-server postgresql-contrib python3 \ sendmail sudo tcl tk libical libasan llvm git rpmdev-setuptree yum-builddep -y ${SPEC_FILE} yum -y install $(rpmspec --requires -q ${SPEC_FILE} | awk '{print $1}' | sort -u | grep -vE '^(/bin/)?(ba)?sh$') pip3 install --trusted-host pypi.org --trusted-host files.pythonhosted.org -r ${REQ_FILE} if [ "x${BUILD_MODE}" == "xkerberos" ]; then yum -y install krb5-libs krb5-devel libcom_err libcom_err-devel fi yum -y install cmake3 rm -rf cJSON git clone https://github.com/DaveGamble/cJSON.git cd cJSON; mkdir build; cd build; cmake3 .. -DCMAKE_INSTALL_PREFIX=/usr; make; make install; cd ../../ elif [ "x${ID}" == "xcentos" -a "x${VERSION_ID}" == "x8" ]; then export LANG="C.utf8" sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* dnf -y clean all dnf -y install 'dnf-command(config-manager)' dnf -y config-manager --set-enabled powertools dnf -y install epel-release dnf -y install python3-pip sudo which net-tools man-db time.x86_64 \ expat libedit postgresql-server postgresql-contrib python3 \ sendmail sudo tcl tk libical libasan llvm git dnf -y builddep ${SPEC_FILE} dnf -y install $(rpmspec --requires -q ${SPEC_FILE} | awk '{print $1}' | sort -u | grep -vE '^(/bin/)?(ba)?sh$') pip3 install --trusted-host pypi.org --trusted-host files.pythonhosted.org -r ${REQ_FILE} if [ "x${BUILD_MODE}" == "xkerberos" ]; then dnf -y install krb5-libs krb5-devel libcom_err libcom_err-devel fi elif [ "x${ID}" == "xrocky" -a 
"x${MAJOR_VERSION}" == "x9" ]; then export LANG="C.utf8" dnf -y clean all yum -y install yum-utils dnf -y install 'dnf-command(config-manager)' dnf config-manager --set-enabled crb dnf -y install epel-release dnf -y install python3-pip sudo which net-tools man-db time.x86_64 procps \ expat libedit postgresql-server postgresql-contrib python3 \ sendmail sudo tcl tk libical libasan llvm git chkconfig dnf -y builddep ${SPEC_FILE} dnf -y install $(rpmspec --requires -q ${SPEC_FILE} | awk '{print $1}' | sort -u | grep -vE '^(/bin/)?(ba)?sh$') pip3 install --trusted-host pypi.org --trusted-host files.pythonhosted.org -r ${REQ_FILE} if [ "x${BUILD_MODE}" == "xkerberos" ]; then dnf -y install krb5-libs krb5-devel libcom_err libcom_err-devel fi elif [ "x${ID}" == "xopensuse" -o "x${ID}" == "xopensuse-leap" ]; then zypper -n ref zypper -n install rpmdevtools python3-pip sudo which net-tools man time.x86_64 git rpmdev-setuptree zypper -n install --force-resolution $(rpmspec --buildrequires -q ${SPEC_FILE} | sort -u | grep -vE '^(/bin/)?(ba)?sh$') zypper -n install --force-resolution $(rpmspec --requires -q ${SPEC_FILE} | sort -u | grep -vE '^(/bin/)?(ba)?sh$') pip3 install --trusted-host pypi.org --trusted-host files.pythonhosted.org -r ${REQ_FILE} elif [ "x${ID}" == "xdebian" ]; then if [ "x${DEBIAN_FRONTEND}" == "x" ]; then export DEBIAN_FRONTEND=noninteractive fi apt-get -y update apt-get install -y build-essential dpkg-dev autoconf libtool rpm alien libssl-dev \ libxt-dev libpq-dev libexpat1-dev libedit-dev libncurses5-dev \ libical-dev libhwloc-dev pkg-config tcl-dev tk-dev python3-dev \ swig expat postgresql postgresql-contrib python3-pip sudo \ man-db git elfutils libcjson-dev pip3 install --trusted-host pypi.org --trusted-host files.pythonhosted.org -r ${REQ_FILE} elif [ "x${ID}" == "xubuntu" ]; then if [ "x${DEBIAN_FRONTEND}" == "x" ]; then export DEBIAN_FRONTEND=noninteractive fi apt-get -y update apt-get install -y build-essential dpkg-dev autoconf libtool rpm 
alien libssl-dev \ libxt-dev libpq-dev libexpat1-dev libedit-dev libncurses5-dev \ libical-dev libhwloc-dev pkg-config tcl-dev tk-dev python3-dev \ swig expat postgresql python3-pip sudo man-db git elfutils libcjson-dev if [[ $(printf '%s\n' "24.04" "$VERSION_ID" | sort -V | head -n1) == "24.04" ]]; then apt-get -y install python3-nose python3-bs4 python3-defusedxml python3-pexpect else pip3 install --trusted-host pypi.org --trusted-host files.pythonhosted.org -r ${REQ_FILE} fi else echo "Unknown platform..." exit 1 fi fi if [ "x${FIRST_TIME_BUILD}" == "x1" -a "x${IS_CI_BUILD}" == "x1" ]; then echo "### First time build is complete ###" echo "READY:$(hostname -s)" >>${config_dir}/${STATUS_FILE} exit 0 fi if [ "x${ID}" == "xcentos" -a "x${VERSION_ID}" == "x8" ]; then export LANG="C.utf8" swig_opt="--with-swig=/usr/local" if [ ! -f /tmp/swig/swig/configure ]; then # source install swig dnf -y install gcc-c++ byacc pcre-devel mkdir -p /tmp/swig/ cd /tmp/swig git clone https://github.com/swig/swig --branch rel-4.0.0 --single-branch cd swig ./autogen.sh ./configure make -j8 make install cd ${PBS_DIR} fi fi if [ "x${ONLY_INSTALL_DEPS}" == "x1" ]; then exit 0 fi _targetdirname=target-${ID}-$(hostname -s) if [ "x${ONLY_INSTALL}" != "x1" -a "x${ONLY_REBUILD}" != "x1" -a "x${ONLY_TEST}" != "x1" ]; then rm -rf ${_targetdirname} fi mkdir -p ${_targetdirname} [[ -f Makefile ]] && make distclean || true if [ ! -f ./${SPEC_FILE} ]; then git config --global --add safe.directory ${PBS_DIR} git checkout ${SPEC_FILE} fi if [ ! 
-f ./configure ]; then ./autogen.sh fi if [ "x${ONLY_REBUILD}" != "x1" -a "x${ONLY_INSTALL}" != "x1" -a "x${ONLY_TEST}" != "x1" ]; then _cflags="-g -O2 -Wall -Werror" if [ "x${ID}" == "xubuntu" ]; then _cflags="${_cflags} -Wno-unused-result" fi cd ${_targetdirname} if [ -f /src/ci ]; then if [ -f ${config_dir}/${CONFIGURE_OPT_FILE} ]; then PYTHON_CODE=$(cat < 1: if re.search(r"CFLAGS=(\"|\').*(\"|\')",x) != None: print(re.search(r"CFLAGS=(\"|\').*(\"|\')",x).group(0).split('\'')[1]) else: if re.search(r"CFLAGS=(\"|\').*(\"|\')",x) != None: print(re.search(r"CFLAGS=(\"|\').*(\"|\')",x).group(0).split('"')[1]) END ) _cflags="$(python3 -c "$PYTHON_CODE")" PYTHON_CODE=$(cat < 1: if re.search(r"CFLAGS=(\"|\').*(\"|\')",x) != None: print(re.search(r"CFLAGS=(\"|\').*(\"|\')",x).group(0).split('\'')[1]) else: if re.search(r"CFLAGS=(\"|\').*(\"|\')",x) != None: print(re.search(r"CFLAGS=(\"|\').*(\"|\')",x).group(0).split('"')[1]) END ) _cflags="$(python3 -c "$PYTHON_CODE")" PYTHON_CODE=$(cat < 1: if re.search(r"CFLAGS=(\"|\').*(\"|\')",x) != None: print(re.search(r"CFLAGS=(\"|\').*(\"|\')",x).group(0).split('\'')[1]) else: if re.search(r"CFLAGS=(\"|\').*(\"|\')",x) != None: print(re.search(r"CFLAGS=(\"|\').*(\"|\')",x).group(0).split('"')[1]) END ) _cflags="$(python3 -c "$PYTHON_CODE")" PYTHON_CODE=$(cat <. # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. 
# # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. PBS_DIR=$(readlink -f $0 | awk -F'/ci/' '{print $1}') cd ${PBS_DIR} [ -f /sys/fs/selinux/enforce ] && echo 0 > /sys/fs/selinux/enforce yum clean all yum -y update yum -y install yum-utils epel-release rpmdevtools libasan llvm dnf config-manager --set-enabled crb rpmdev-setuptree yum -y install python3-pip sudo which net-tools man-db time.x86_64 procps yum-builddep -y ./*.spec yum -y install cmake3 git rm -rf cJSON git clone https://github.com/DaveGamble/cJSON.git cd cJSON; mkdir build; cd build; cmake3 .. -DCMAKE_INSTALL_PREFIX=/usr; make; make install; cd ../../ ./autogen.sh rm -rf target-sanitize mkdir -p target-sanitize cd target-sanitize ../configure make dist cp -fv *.tar.gz /root/rpmbuild/SOURCES/ CFLAGS="-g -O2 -Wall -Werror -fsanitize=address -fno-omit-frame-pointer" CXXFLAGS="-g -O2 -Wall -Werror -fsanitize=address -fno-omit-frame-pointer" rpmbuild -bb --with ptl *.spec yum -y install /root/rpmbuild/RPMS/x86_64/*-server-??.*.x86_64.rpm yum -y install /root/rpmbuild/RPMS/x86_64/*-debuginfo-??.*.x86_64.rpm yum -y install /root/rpmbuild/RPMS/x86_64/*-ptl-??.*.x86_64.rpm sed -i "s@PBS_START_MOM=0@PBS_START_MOM=1@" /etc/pbs.conf /etc/init.d/pbs start set +e . /etc/profile.d/ptl.sh set -e pbs_config --make-ug cd /opt/ptl/tests/ # Ignore address sanitizer link order because of # importing pbs python modules (like pbs and pbs_ifl) in ptl. # The problem is that original Python bin is not compiled with ASAN. # This will not affect pbs service as it has its own env. export ASAN_OPTIONS=verify_asan_link_order=0 pbs_benchpress --tags=smoke ================================================ FILE: ci/etc/docker-entrypoint ================================================ #!/bin/bash -ex # Copyright (C) 1994-2021 Altair Engineering, Inc. 
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
#
# Commercial License Information:
#
# PBS Pro is commercially licensed software that shares a common core with
# the OpenPBS software. For a copy of the commercial license terms and
# conditions, go to: (http://www.pbspro.com/agreement.html) or contact the
# Altair Legal Department.
#
# Altair's dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of OpenPBS and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair's trademarks, including but not limited to "PBS™",
# "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is
# subject to Altair's trademark licensing policies.

workdir=/src/etc
logdir=/logs

cd /pbssrc
# Run the first-time build (argument 1 == FIRST_TIME_BUILD), mirroring
# all output into the per-host build log.
${workdir}/do.sh 1 2>&1 | tee -a ${logdir}/build-$(hostname -s)
# BUGFIX: after a pipeline, $? holds the status of the LAST command (tee),
# so a do.sh failure was silently reported as success.  Check the first
# element of the pipeline instead.
if [ ${PIPESTATUS[0]} -ne 0 ]; then
	exit 1
else
	exit 0
fi


================================================
FILE: ci/etc/gen_ptl_json.sh
================================================
#!/bin/bash -x
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
# # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. 
cleanup() { cd ${etcdir} rm -rf ./tmpptl } etcdir=$(dirname $(readlink -f "$0")) cidir=/pbssrc/ci cd ${etcdir} mkdir tmpptl workdir=${etcdir}/tmpptl cd ${workdir} mkdir -p ptlsrc /bin/cp -rf ${cidir}/../test/* ptlsrc/ if [ -f ptlsrc/fw/setup.py.in ]; then sed "s;@PBS_VERSION@;1.0.0;g" ptlsrc/fw/setup.py.in >ptlsrc/fw/setup.py sed "s;@PBS_VERSION@;1.0.0;g" ptlsrc/fw/ptl/__init__.py.in >ptlsrc/fw/ptl/__init__.py fi cd ${workdir}/ptlsrc mkdir ../tp __python="$(grep -rE '^#!/usr/bin/(python|env python)[23]' fw/bin/pbs_benchpress | awk -F[/" "] '{print $NF}')" ${__python} -m pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org --prefix $(pwd)/tp -r fw/requirements.txt fw/. cd tests PYTHONPATH=../tp/lib/$(/bin/ls -1 ../tp/lib)/site-packages ${__python} ../tp/bin/pbs_benchpress $1 --gen-ts-tree ret=$? if [ ${ret} -ne 0 ]; then echo "Failed to generate ptl json" cleanup exit $ret else mv ptl_ts_tree.json ${cidir} fi cleanup ================================================ FILE: ci/etc/id_rsa ================================================ -----BEGIN OPENSSH PRIVATE KEY----- b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAABlwAAAAdzc2gtcn NhAAAAAwEAAQAAAYEAt6kNw0C2ZMybkld0sucLkpaMuwn6SXB6+9scN3ZMTSFRSMxa85MT ee8sOsiyrkIjv85nAWdYsGjLBKgr43IlV2qBCxZO2YsTryl52E6pVBbVuizBj8m6sO+3hM hUBEbIrqvplrxf19y2HlNsygSlNFfMb3ptIIvTGGez+o8ZTAI3wXcFqxNxi8flo77yp6UH x31zIDOJCfN98W1GYXVwXiowfkoKkROvbH9B/HsLTjuxkHzFCGwGNzEClr3ayJSmYyJu0P nfjBPeZrL7Dxt1RwSfqI8j1kp4VhLCeEyFYS5pi8CypLgtvL37gLdqEGpBjcf4J/AyjDZJ cDgzTI+ZrTP/ldhnVMy84B8TAC53swauaec1JKDtc+FNSN28GY/0VTcyH7Pwt9gRESWFsV zrN4lwRWZivwndi3mj3zUcge3LQ6pBpjTEGiYIgNNJd5mjDZM9ieB4lC7+MTmq9Yg0Dzm4 u6uanAP5t2up6F5jck/7sLiAX4+fQ8vLZOAqsZdhAAAFgPS9UiD0vVIgAAAAB3NzaC1yc2 EAAAGBALepDcNAtmTMm5JXdLLnC5KWjLsJ+klwevvbHDd2TE0hUUjMWvOTE3nvLDrIsq5C I7/OZwFnWLBoywSoK+NyJVdqgQsWTtmLE68pedhOqVQW1boswY/JurDvt4TIVARGyK6r6Z a8X9fcth5TbMoEpTRXzG96bSCL0xhns/qPGUwCN8F3BasTcYvH5aO+8qelB8d9cyAziQnz 
ffFtRmF1cF4qMH5KCpETr2x/Qfx7C047sZB8xQhsBjcxApa92siUpmMibtD534wT3may+w 8bdUcEn6iPI9ZKeFYSwnhMhWEuaYvAsqS4Lby9+4C3ahBqQY3H+CfwMow2SXA4M0yPma0z /5XYZ1TMvOAfEwAud7MGrmnnNSSg7XPhTUjdvBmP9FU3Mh+z8LfYERElhbFc6zeJcEVmYr 8J3Yt5o981HIHty0OqQaY0xBomCIDTSXeZow2TPYngeJQu/jE5qvWINA85uLurmpwD+bdr qeheY3JP+7C4gF+Pn0PLy2TgKrGXYQAAAAMBAAEAAAGAJVEQHtATPz/jjESAzajsTQiR55 8LX8ie9HV8sjgzIKjYXzZGdJ85odja38bPp2CA6wQBIePhvVZNidCxujEDLVPSjHIn60O6 6ChBPZYeCZvqKT3WxmRyrmjGnRAnIgdP103O1HXJ845A4sCIpjNzbcM5Ip15dtdyOM85Xn uc5Di/I2wPlscIlyIyoqa1nyKFBh+TOMO/4Gm8+UT+u+akwj1IRSC+LOQXDLB+s9I8ZdTz KyxuzFtGmAg5Qm+o+IBbRbvTzpdx2UHkiFw8+VQn8fwHuzfR+Od48D1kFBCk5yGcAMTQP3 g4AV8vp/UAVU3f4stYWh7okxXE7dKY+YTb1qHbjadNp9KqJUY3d+LO2F2vT7QBD4eIDS22 1emtqfaiLXXWDG1vZHXq3wx5MlvnwFE4gSY9yxF0FsSwi3s0j8zEYjszKQBiAoPLkxmqDq 2/WcmT9GhKd5FsMQEy0W8lBePtRYw85BRfhZH7Lzh0gGZ+3ZYss4qQS2vAzqWWiuhRAAAA wHsVos2ccAcgMeTVYmo3JNgahAF0orP+NPxFLgZrK7Z0nwjICpKfaR6D3lWiFvlhUH33iv wr3gCAFTNL7zblbJXTebA5dvw8kFmUuXhe7/uRGNjn2l38j0t+aHMXDVafo7Dm1chh6pa8 AyP5/OR9sVXsFVrkQ3+iVQHJBpsXDYlI7q5j51CrNb7wgr8l8HhWyDLDTg0irmzfrvPJ43 H7URIgDIDuX7mbSnYoDDtP2azdpaZyG1IZlbFkCNyaQtjycwAAAMEA8fq3kVuTqntNXqTE 3H7CnKSwR8w7yE/VGaVs7jLRvPyHpC3umUiKWjO/ebLMKBKdS3fQ0I72MB2BdeQbmuYTBY 2FwRQOAopjketAZDrZWhjmzRgSsSRofl3N/cqya6L+0RcAfwR/2OGM9E1QzEIyPcH8khVo BK2I+xRpU5s1b5SXw5TOge9PXWgEWvRRtFRgbgOJ5WfPiLabKMm9skVx8BiNFzsVJxsEnb WdwJKwnT+2a7gIOnM+DvFQiLyEr8QTAAAAwQDCTUpPyB8cqP0cCOFFH46im7ryV3ROZLlj hj5dVKpXPyA5iHEQbPTx+VXOLSM1MysNRFPWlisE2OCES897kPgD5cypatnC1aa+sztOeD fuuEN4wZXjDo97DhIaO6YtfzhXI5Y/CEOKWQmrWlEQEf4HEGoK2kQka5KeOPKQTACLcLqi ATLFxSEDr6wyEwHA0EGh7WjH1zEpFDDY9pUCAwmyETD/OriqfCbRPhGrTTQVrcadG/Sc72 V5hjFzgl3J3TsAAAALcm9vdEBwYnMuY2k= -----END OPENSSH PRIVATE KEY----- ================================================ FILE: ci/etc/id_rsa.pub ================================================ ssh-rsa 
AAAAB3NzaC1yc2EAAAADAQABAAABgQC3qQ3DQLZkzJuSV3Sy5wuSloy7CfpJcHr72xw3dkxNIVFIzFrzkxN57yw6yLKuQiO/zmcBZ1iwaMsEqCvjciVXaoELFk7ZixOvKXnYTqlUFtW6LMGPybqw77eEyFQERsiuq+mWvF/X3LYeU2zKBKU0V8xvem0gi9MYZ7P6jxlMAjfBdwWrE3GLx+WjvvKnpQfHfXMgM4kJ833xbUZhdXBeKjB+SgqRE69sf0H8ewtOO7GQfMUIbAY3MQKWvdrIlKZjIm7Q+d+ME95msvsPG3VHBJ+ojyPWSnhWEsJ4TIVhLmmLwLKkuC28vfuAt2oQakGNx/gn8DKMNklwODNMj5mtM/+V2GdUzLzgHxMALnezBq5p5zUkoO1z4U1I3bwZj/RVNzIfs/C32BERJYWxXOs3iXBFZmK/Cd2LeaPfNRyB7ctDqkGmNMQaJgiA00l3maMNkz2J4HiULv4xOar1iDQPObi7q5qcA/m3a6noXmNyT/uwuIBfj59Dy8tk4Cqxl2E= root@pbs.ci ================================================ FILE: ci/etc/install-system-packages ================================================ #!/bin/bash -x # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. 
# # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. groupadd -g 1900 tstgrp00 groupadd -g 1901 tstgrp01 groupadd -g 1902 tstgrp02 groupadd -g 1903 tstgrp03 groupadd -g 1904 tstgrp04 groupadd -g 1905 tstgrp05 groupadd -g 1906 tstgrp06 groupadd -g 1907 tstgrp07 groupadd -g 901 pbs groupadd -g 1146 agt useradd -m -s /bin/bash -u 4357 -g tstgrp00 -G tstgrp00 pbsadmin useradd -m -s /bin/bash -u 9000 -g tstgrp00 -G tstgrp00 pbsbuild useradd -m -s /bin/bash -u 884 -g tstgrp00 -G tstgrp00 pbsdata useradd -m -s /bin/bash -u 4367 -g tstgrp00 -G tstgrp00 pbsmgr useradd -m -s /bin/bash -u 4373 -g tstgrp00 -G tstgrp00 pbsnonroot useradd -m -s /bin/bash -u 4356 -g tstgrp00 -G tstgrp00 pbsoper useradd -m -s /bin/bash -u 4358 -g tstgrp00 -G tstgrp00 pbsother useradd -m -s /bin/bash -u 4371 -g tstgrp00 -G tstgrp00 pbsroot useradd -m -s /bin/bash -u 4355 -g tstgrp00 -G tstgrp02,tstgrp00 pbstest useradd -m -s /bin/bash -u 4359 -g tstgrp00 -G tstgrp00 pbsuser useradd -m -s /bin/bash -u 4361 -g tstgrp00 -G tstgrp01,tstgrp02,tstgrp00 pbsuser1 useradd -m -s /bin/bash -u 4362 -g tstgrp00 -G tstgrp01,tstgrp03,tstgrp00 pbsuser2 useradd -m -s /bin/bash -u 4363 -g tstgrp00 -G tstgrp01,tstgrp04,tstgrp00 pbsuser3 useradd -m -s /bin/bash -u 4364 -g tstgrp01 -G tstgrp04,tstgrp05,tstgrp01 pbsuser4 useradd -m -s /bin/bash -u 4365 -g tstgrp02 -G tstgrp04,tstgrp06,tstgrp02 pbsuser5 useradd -m -s /bin/bash -u 4366 -g tstgrp03 -G tstgrp04,tstgrp07,tstgrp03 pbsuser6 useradd -m -s /bin/bash -u 4368 -g tstgrp01 -G tstgrp01 pbsuser7 useradd -m -s /bin/bash -u 11000 -g tstgrp00 -G tstgrp00 tstusr00 useradd -m -s 
/bin/bash -u 11001 -g tstgrp00 -G tstgrp00 tstusr01 chmod g+x,o+x /home/* . /etc/os-release if [ "x${ID}" == "xcentos" -a "x${VERSION_ID}" == "x8" ]; then sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* dnf -y clean all dnf -y install 'dnf-command(config-manager)' dnf -y config-manager --set-enabled powertools dnf -y install epel-release dnf -y update dnf -y install git gcc make m4 autoconf automake libtool rpm-build rpmdevtools \ hwloc-devel libX11-devel libXt-devel libXext-devel libXft-devel \ libedit-devel libical-devel cmake glibc-common yum-utils \ ncurses-devel postgresql-devel python3-devel tcl-devel tk-devel swig \ expat-devel openssl-devel libXext libXft expat libedit glibc-static \ postgresql-server python3 tcl tk libical perl tar sendmail sudo perl-Env \ perl-Switch gcc-c++ doxygen elfutils bison flex glibc-langpack-en \ which net-tools man-db time csh lsof tzdata file \ expect perl-App-cpanminus cpan initscripts \ systemd systemd-sysv libcap rsyslog \ openssh-clients openssh-server valgrind-devel valgrind libasan \ llvm bc gzip gdb rsync wget curl ccache bind-utils vim iputils pam-devel dnf -y clean all rpmdev-setuptree __systemd_paths='/etc/systemd/system /usr/lib/systemd/system' elif [ "x${ID}" == "xcentos" -a "x${VERSION_ID}" == "x7" ]; then yum -y clean all rpm --import https://package.perforce.com/perforce.pubkey && { echo [perforce] echo name=Perforce echo baseurl=http://package.perforce.com/yum/rhel/7/x86_64 echo enabled=1 echo gpgcheck=1 } >/etc/yum.repos.d/perforce.repo yum -y install epel-release yum -y update yum -y install git gcc make m4 autoconf automake libtool rpm-build rpmdevtools \ hwloc-devel libX11-devel libXt-devel libXext-devel libXft-devel \ libedit-devel libical-devel cmake glibc-common yum-utils \ ncurses-devel postgresql-devel python3-devel tcl-devel tk-devel swig \ expat-devel openssl-devel libXext 
libXft expat libedit glibc-static \ postgresql-server python3 tcl tk libical perl tar sendmail sudo perl-Env \ perl-Switch gcc-c++ doxygen elfutils bison flex postgresql-contrib \ which net-tools man-db time csh lsof tzdata file glibc-langpack-en \ expect perl-App-cpanminus cpan \ systemd systemd-sysv libcap rsyslog \ openssh-clients openssh-server valgrind-devel valgrind libasan pam-devel \ llvm bc gzip gdb rsync wget curl ccache bind-utils vim iputils python2-pip helix-cli yum -y clean all rpmdev-setuptree __systemd_paths='/etc/systemd/system /usr/lib/systemd/system' elif [ "x${ID}" == "xopensuse" -o "x${ID}" == "xopensuse-leap" ]; then __on="$(grep -oP '(?<=^NAME=").*(?=")' /etc/os-release)" __ov="$(grep -oP '(?<=^VERSION=").*(?=")' /etc/os-release)" zypper -n addrepo -ceKfG "https://download.opensuse.org/repositories/devel:tools/${__on// /_}_${__ov// /_}/devel:tools.repo" zypper -n addrepo -ceKfG "https://download.opensuse.org/repositories/devel:languages:perl/${__on// /_}_${__ov// /_}/devel:languages:perl.repo" zypper -n addrepo -ceKfG "http://package.perforce.com/yum/rhel/7/x86_64" p4 zypper -n clean -mMa zypper -n refresh -fbd zypper --no-gpg-checks -n update --force-resolution zypper --no-gpg-checks -n install --force-resolution git m4 \ gcc make autoconf automake libtool rpm-build rpmdevtools helix-cli hwloc-devel \ libX11-devel libXt-devel libedit-devel libical-devel cmake ncurses-devel \ postgresql-devel python3-devel tcl-devel tk-devel swig libexpat-devel \ libopenssl-devel libXext-devel libXft-devel expat libedit fontconfig net-tools-deprecated net-tools \ timezone python3-xml glibc-devel-static postgresql-server python3 python3-pip tcl tk \ perl tar sendmail sudo gcc-c++ doxygen elfutils bison flex \ which net-tools net-tools-deprecated man time tcsh lsof file vim \ expect perl-App-cpanminus perl-Parse-PMFile hostname bind-utils \ systemd systemd-sysvinit libcap-progs iputils rsyslog openssh pam-devel \ valgrind-devel valgrind llvm gdb rsync wget 
ccache bc gzip python-pip zypper -n clean -mMa zypper -n rr devel_tools rpmdev-setuptree __systemd_paths='/etc/systemd/system /usr/lib/systemd/system' elif [ "x${ID}" == "xubuntu" ]; then if [ "x${DEBIAN_FRONTEND}" == "x" ]; then export DEBIAN_FRONTEND=noninteractive fi apt -y update apt -y upgrade apt -y install git build-essential gcc g++ make dpkg-dev m4 \ autoconf automake libtool rpm alien elfutils dh-make \ libhwloc-dev libx11-dev libxt-dev libedit-dev libical-dev cmake \ libncurses-dev libpq-dev python3-dev tcl-dev tk-dev swig libexpat1-dev \ libssl-dev libxext-dev libxft-dev pkg-config expat postgresql perl tar \ sendmail sendmail-bin sudo doxygen bison flex fakeroot libnuma1 \ net-tools man time csh lsof curl gzip iputils-ping \ expect cpanminus locales-all dnsutils tzdata vim bc file \ systemd systemd-sysv sysvinit-utils libcap2-bin rsyslog libpam-dev \ openssh-server openssh-client valgrind llvm gdb rsync wget ccache \ python3 python3-pip cpanminus if [ "x${ID}" == "xubuntu" -a "x${VERSION_ID}" == "x16.04" ]; then wget -qO - https://package.perforce.com/perforce.pubkey | apt-key add - && echo 'deb http://package.perforce.com/apt/ubuntu/ xenial release' >/etc/apt/sources.list.d/perforce.list else wget -qO - https://package.perforce.com/perforce.pubkey | apt-key add - && echo 'deb http://package.perforce.com/apt/ubuntu/ bionic release' >/etc/apt/sources.list.d/perforce.list fi apt -y update apt -y install helix-cli __systemd_paths='/etc/systemd/system /lib/systemd/system' apt -y autoremove apt -y clean rm -rf /var/lib/apt/list/* mkdir -p /root/rpmbuild/SOURCES fi # Install pip, requests and sh python modules set -ex && python -m pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org requests sh && rm -rf ~/.cache /tmp/* # QALib deps modules cpanm -n --no-wget --no-lwp --curl \ IO::Pty IPC::Run IPC::Cmd Class::Accessor Module::Build Pod::Usage \ Getopt::Long DateTime Date::Parse Proc::ProcessTable Test::More \ Unix::Process Time::HiRes 
File::FcntlLock File::Remote find ${__systemd_paths} -path '*.wants/*' \ -not -name '*journald*' \ -not -name '*systemd-tmpfiles*' \ -not -name '*systemd-user-sessions*' \ -not -name '*getty*' \ -not -name '*dbus*' \ -exec rm -fv {} \; cp /workspace/etc/ci-script-wrapper.service /etc/systemd/system systemctl set-default multi-user.target systemctl enable sshd || systemctl enable ssh systemctl enable sendmail if [ "x${ID}" != "xubuntu" -a "x${VERSION_ID}" != "x16.04" ]; then systemctl disable sm-client systemctl mask sm-client fi systemctl enable rsyslog systemctl disable getty@.service systemctl unmask getty.target systemctl unmask console-getty systemctl enable getty.target systemctl enable console-getty systemctl enable ci-script-wrapper cp /workspace/etc/container-env-setup.sh /etc/profile.d/0container-env-setup.sh cp /workspace/etc/sudoers-overrides /etc/sudoers.d/container-overrides echo '' >/etc/security/limits.conf rm -f /etc/security/limits.d/*.conf rm -rf ~/.ssh mkdir --mode=700 ~/.ssh cp /workspace/etc/id_rsa* ~/.ssh/ chmod 0600 ~/.ssh/id_rsa chmod 0644 ~/.ssh/id_rsa.pub cp ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys chmod 0600 ~/.ssh/authorized_keys echo 'root:pbs' | chpasswd cat /etc/profile.d/0container-env-setup.sh >>/root/.profile cat /etc/profile.d/0container-env-setup.sh >>/root/.bash_profile cat /etc/profile.d/0container-env-setup.sh >>/root/.bashrc for user in $(awk -F: '/^(pbs|tst)/ {print $1}' /etc/passwd); do rm -rf /home/${user}/.ssh cp -rfp ~/.ssh /home/${user}/ chown -R ${user}: /home/${user}/.ssh echo "${user}:pbs" | chpasswd cat /etc/profile.d/0container-env-setup.sh >>/home/${user}/.profile cat /etc/profile.d/0container-env-setup.sh >>/home/${user}/.bash_profile cat /etc/profile.d/0container-env-setup.sh >>/home/${user}/.bashrc chown ${user}: /home/${user}/.bashrc /home/${user}/.profile /home/${user}/.bash_profile done echo 'Host *' >>/etc/ssh/ssh_config echo ' StrictHostKeyChecking no' >>/etc/ssh/ssh_config echo ' ConnectionAttempts 3' 
>>/etc/ssh/ssh_config echo ' IdentityFile ~/.ssh/id_rsa' >>/etc/ssh/ssh_config echo ' PreferredAuthentications publickey,password' >>/etc/ssh/ssh_config echo 'PermitRootLogin yes' >>/etc/ssh/sshd_config echo 'UseDNS no' >>/etc/ssh/sshd_config sed -i 's/AcceptEnv/# AcceptEnv/g' /etc/ssh/sshd_config ssh-keygen -A rm -f /var/run/*.pid /run/nologin rm -rf ~/.cache ~/.cpanm /var/{log,cache} /tmp /var/tmp /run/*.pid /var/run/*.pid mkdir -p --mode=0755 /var/{log,cache} mkdir -p --mode=1777 /tmp /var/tmp ================================================ FILE: ci/etc/killit.sh ================================================ #!/bin/bash -x # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. 
# # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. killit() { if [ -z "$1" ]; then return 0 fi pid=$(ps -ef 2>/dev/null | grep $1 | grep -v grep | awk '{print $2}') if [ ! -z "${pid}" ]; then echo "kill -TERM ${pid}" kill -TERM ${pid} 2>/dev/null else return 0 fi sleep 10 pid=$(ps -ef 2>/dev/null | grep $1 | grep -v grep | awk '{print $2}') if [ ! -z "${pid}" ]; then echo "kill -KILL ${pid}" kill -KILL ${pid} 2>/dev/null fi } kill_pbs_process() { ps -eaf 2>/dev/null | grep pbs_ | grep -v grep | wc -l if [ $ret -gt 0 ]; then killit pbs_server killit pbs_mom killit pbs_comm killit pbs_sched killit pbs_ds_monitor killit /opt/pbs/pgsql/bin/postgres killit pbs_benchpress ps_count=$(ps -eaf 2>/dev/null | grep pbs_ | grep -v grep | wc -l) if [ ${ps_count} -eq 0 ]; then return 0 else return 1 fi fi } . /etc/os-release if [ "x$1" == "xbackup" ]; then time_stamp=$(date -u "+%Y-%m-%d-%H%M%S") folder=session-${time_stamp} mkdir -p /logs/${folder} cp /logs/build-* /logs/${folder} cp /logs/logfile* /logs/${folder} cp /logs/result* /logs/${folder} cp /src/.config_dir/.conf.json /logs/${folder}/conf.json cp /src/docker-compose.json /logs/${folder}/ rm -rf /logs/build-* rm -rf /logs/logfile* rm -rf /logs/result* rm -rf /pbssrc/target-* exit 0 fi clean=${1} echo "Trying to stop all process via init.d" /etc/init.d/pbs stop ret=$? 
if [ ${ret} -ne 0 ]; then echo "failed graceful stop" echo "force kill all processes" kill_pbs_process else echo "checking for running ptl" benchpress_count=$(ps -ef 2>/dev/null | grep $1 | grep -v grep | wc -l) if [ ${benchpress_count} -gt 0 ]; then killit pbs_benchpress else echo "No running ptl tests found" fi fi if [ "XX${clean}" == "XXclean" ]; then cd /pbssrc/target-${ID} && make uninstall rm -rf /etc/init.d/pbs rm -rf /etc/pbs.conf rm -rf /var/spool/pbs rm -rf /opt/ptl rm -rf /opt/pbs fi ================================================ FILE: ci/etc/macros ================================================ CONFIG_DIR=.config_dir STATUS_FILE=status PARAM_FILE=params REQUIREMENT_DECORATOR_FILE=requirements_decorator CONFIGURE_OPT_FILE=configure_opt BENCHPRESS_OPT_FILE=benchpress_opt CONF_JSON_FILE=conf.json REQ_DOCKER_VERSION=17.12.0 DEFAULT_PLATFORM=centos:8 SUPPORTED_PLATFORMS=centos:7,centos:8,ubuntu:16.04,ubuntu:18.04 ================================================ FILE: ci/etc/sudoers-overrides ================================================ Defaults syslog = local7 Defaults always_set_home Defaults !requiretty Defaults !env_reset Defaults !secure_path Defaults env_keep = "*" ALL ALL=(ALL) NOPASSWD: ALL ================================================ FILE: configure.ac ================================================ # # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. 
# # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. # AC_PREREQ([2.63]) # Use PBS_VERSION to override the version statically defined here. For example: # ./configure PBS_VERSION=20.0.0 --prefix=/opt/pbs AC_INIT([OpenPBS], [23.06.06], [pbssupport@altair.com], [openpbs], [http://www.openpbs.org/]) AC_CONFIG_HEADERS([src/include/pbs_config.h]) AC_CONFIG_SRCDIR([src/cmds/qmgr.c]) AC_CONFIG_AUX_DIR([buildutils]) AC_CONFIG_MACRO_DIR([m4]) AC_CANONICAL_TARGET([]) os_id=`grep ^ID= /etc/os-release | sed -n 's/.*"\(.*\)"/\1/p'` AS_CASE([$os_id], [opensuse-tumbleweed], m4_define([am_init_string], [-Wall foreign subdir-objects]), [*], m4_define([am_init_string], [-Wall foreign])) AM_INIT_AUTOMAKE(am_init_string) AC_USE_SYSTEM_EXTENSIONS # Checks for programs. 
AC_PROG_AWK
AC_PROG_YACC
AC_PROG_SED
AC_PROG_CC
AC_PROG_LEX
AC_PROG_INSTALL
AC_PROG_LN_S
AC_PROG_CXX
AC_SUBST([AM_CXXFLAGS], [--std=c++11])

# Automake macros
#AM_PROG_AR macro is defined with automake version >= 1.12
m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
AM_PROG_CC_C_O

# Initialize libtool
AM_PROG_LIBTOOL
LT_INIT([shared static])

# Checks for libraries.
AC_CHECK_LIB([c], [xdr_int], [],
	AC_CHECK_LIB(nsl, xdr_int)
)
AC_CHECK_LIB([c], [ruserok], [],
	AC_CHECK_LIB(socket, ruserok)
)
AC_CHECK_LIB([c], [crypt], [],
	AC_CHECK_LIB(crypt, crypt)
)
dnl Typo fix: description previously read "Defined whe posix_openpt is
dnl available"; now matches the HAVE_MALLOC_INFO wording below.
AC_CHECK_LIB([c], [posix_openpt],
	AC_DEFINE([HAVE_POSIX_OPENPT], [], [Defined when posix_openpt is available])
)
AC_CHECK_LIB(dl, dlopen)
AC_CHECK_LIB([kvm], [kvm_open])
AC_CHECK_LIB([socket], [socket],
	[socket_lib="-lsocket -lnsl"] AC_SUBST(socket_lib),
	[socket_lib=""] AC_SUBST(socket_lib),
	[-lnsl]
)
AC_CHECK_LIB([c], [malloc_info],
	AC_DEFINE([HAVE_MALLOC_INFO], [], [Defined when malloc_info is available])
)

# Check for X Window System
AC_PATH_XTRA

# Checks for optional header files.
AC_CHECK_HEADERS([ \
	com_err.h \
	gssapi.h \
	krb5.h \
	libpq-fe.h \
	mach/mach.h \
	nlist.h \
	sys/eventfd.h \
	sys/systeminfo.h \
])

# Checks for required header files.
AC_CHECK_HEADERS([ \ stdio.h \ alloca.h \ arpa/inet.h \ assert.h \ ctype.h \ dirent.h \ dlfcn.h \ execinfo.h \ fcntl.h \ float.h \ fstab.h \ ftw.h \ grp.h \ libgen.h \ limits.h \ math.h \ memory.h \ netdb.h \ netinet/in.h \ netinet/in_systm.h \ netinet/ip.h \ netinet/tcp.h \ openssl/aes.h \ openssl/bio.h \ openssl/err.h \ openssl/evp.h \ openssl/ssl.h \ paths.h \ poll.h \ pthread.h \ pwd.h \ regex.h \ signal.h \ stdbool.h \ stddef.h \ stdint.h \ stdio.h \ stdlib.h \ string.h \ strings.h \ syslog.h \ sys/fcntl.h \ sys/file.h \ sys/ioctl.h \ sys/mman.h \ sys/mount.h \ sys/param.h \ sys/poll.h \ sys/quota.h \ sys/resource.h \ sys/select.h \ sys/signal.h \ sys/socket.h \ sys/stat.h \ sys/statvfs.h \ sys/time.h \ sys/timeb.h \ sys/times.h \ sys/types.h \ sys/uio.h \ sys/un.h \ sys/user.h \ sys/utsname.h \ sys/wait.h \ termios.h \ time.h \ unistd.h \ utime.h \ X11/Intrinsic.h \ X11/X.h \ X11/Xlib.h \ zlib.h \ ],, AC_MSG_ERROR([Required header file is missing.]) \ ) # Checks for typedefs, structures, and compiler characteristics. #AC_CHECK_HEADER_STDBOOL macro is defined with autoconf version >= 2.67 m4_ifdef([AC_CHECK_HEADER_STDBOOL], [AC_CHECK_HEADER_STDBOOL]) AC_TYPE_UID_T AC_TYPE_MODE_T AC_TYPE_OFF_T AC_TYPE_PID_T AC_C_RESTRICT AC_TYPE_SIZE_T AC_TYPE_SSIZE_T AC_CHECK_MEMBERS([struct stat.st_blksize]) AC_TYPE_UINT16_T AC_TYPE_UINT32_T AC_TYPE_UINT64_T AC_TYPE_UINT8_T AC_CHECK_TYPES([ptrdiff_t]) # Checks for library functions. 
AC_FUNC_ALLOCA AC_FUNC_CHOWN AC_FUNC_ERROR_AT_LINE AC_FUNC_FORK AC_FUNC_GETGROUPS AC_FUNC_GETMNTENT AC_FUNC_LSTAT_FOLLOWS_SLASHED_SYMLINK AC_FUNC_MKTIME AC_FUNC_MMAP AC_FUNC_STRERROR_R AC_FUNC_STRTOD AC_CHECK_FUNCS([ \ alarm \ atexit \ bzero \ dup2 \ endpwent \ floor \ ftruncate \ getcwd \ gethostbyaddr \ gethostbyname \ gethostname \ getmntent \ getpagesize \ gettimeofday \ hasmntopt \ inet_ntoa \ localtime_r \ memchr \ memmove \ memset \ mkdir \ munmap \ pathconf \ poll \ pstat_getdynamic \ putenv \ realpath \ regcomp \ rmdir \ select \ setresuid \ setresgid \ getpwuid \ initgroups \ seteuid \ setegid \ strerror_r \ socket \ strcasecmp \ strchr \ strcspn \ strdup \ strerror \ strncasecmp \ strpbrk \ strrchr \ strspn \ strstr \ strtol \ strtoul \ strtoull \ sysinfo \ uname \ utime \ ]) PKG_PROG_PKG_CONFIG m4_ifdef([PKG_INSTALLDIR], [PKG_INSTALLDIR], [ pkgconfigdir=/usr/lib64/pkgconfig AC_SUBST([pkgconfigdir]) ]) # PBS macros (order matters for some of these) PBS_AC_PBS_VERSION PBS_AC_DECL_H_ERRNO PBS_AC_DECL_SOCKLEN_T PBS_AC_DECL_EPOLL PBS_AC_DECL_EPOLL_PWAIT PBS_AC_DECL_PPOLL PBS_AC_WITH_SERVER_HOME PBS_AC_WITH_SERVER_NAME_FILE PBS_AC_WITH_DATABASE_DIR PBS_AC_WITH_DATABASE_USER PBS_AC_WITH_DATABASE_PORT PBS_AC_WITH_PBS_CONF_FILE PBS_AC_WITH_TMP_DIR PBS_AC_WITH_UNSUPPORTED_DIR PBS_AC_WITH_CORE_LIMIT PBS_AC_WITH_PYTHON PBS_AC_WITH_EXPAT PBS_AC_WITH_EDITLINE PBS_AC_WITH_HWLOC PBS_AC_WITH_LIBICAL PBS_AC_WITH_PMIX PBS_AC_WITH_SENDMAIL PBS_AC_WITH_SWIG PBS_AC_WITH_TCL PBS_AC_WITH_TCLATRSEP PBS_AC_WITH_XAUTH PBS_AC_WITH_KRBAUTH PBS_AC_WITH_MIN_STACK_LIMIT PBS_AC_DISABLE_SHELL_PIPE PBS_AC_DISABLE_SYSLOG PBS_AC_SECURITY PBS_AC_ENABLE_ALPS PBS_AC_WITH_LIBZ PBS_AC_ENABLE_PTL PBS_AC_SYSTEMD_UNITDIR PBS_AC_PATCH_LIBTOOL PBS_AC_WITH_CJSON AC_CONFIG_FILES([ openpbs.spec Makefile buildutils/Makefile doc/Makefile test/Makefile test/fw/Makefile test/tests/Makefile test/fw/setup.py test/fw/ptl/__init__.py src/Makefile src/cmds/Makefile src/cmds/mpiexec src/cmds/pbs_lamboot 
src/cmds/pbs_mpihp src/cmds/pbs_mpilam src/cmds/pbs_mpirun src/cmds/pbs_remsh src/cmds/pbsrun_unwrap src/cmds/pbsrun_wrap src/cmds/pbsrun src/cmds/scripts/Makefile src/cmds/scripts/modulefile src/cmds/scripts/pbs_habitat src/cmds/scripts/pbs_init.d src/cmds/scripts/pbs_reload src/cmds/scripts/pbs_poerun src/cmds/scripts/pbs_postinstall src/cmds/scripts/pbs.service src/cmds/scripts/pbsrun.poe src/hooks/Makefile src/iff/Makefile src/include/Makefile src/include/pbs_version.h src/lib/Libattr/Makefile src/lib/Libdb/Makefile src/lib/Libdb/pgsql/Makefile src/lib/Libifl/Makefile src/lib/Liblog/Makefile src/lib/Libnet/Makefile src/lib/Libpbs/Makefile src/lib/Libpbs/pbs.pc src/lib/Libpython/Makefile src/lib/Libsec/Makefile src/lib/Libsite/Makefile src/lib/Libtpp/Makefile src/lib/Libutil/Makefile src/lib/Libauth/Makefile src/lib/Libauth/gss/Makefile src/lib/Libauth/munge/Makefile src/lib/Liblicensing/Makefile src/lib/Libjson/Makefile src/lib/Libjson/cJSON/Makefile src/lib/Makefile src/modules/Makefile src/modules/python/Makefile src/mom_rcp/Makefile src/resmom/Makefile src/scheduler/Makefile src/server/Makefile src/tools/Makefile src/tools/wrap_tcl.sh src/unsupported/Makefile ]) AC_OUTPUT ================================================ FILE: doc/Makefile.am ================================================ # # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. 
# # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. 
# notrans_dist_man1_MANS = \ man1/pbsdsh.1B \ man1/pbs_login.1B \ man1/pbs_python.1B \ man1/pbs_ralter.1B \ man1/pbs_rdel.1B \ man1/pbs_release_nodes.1B \ man1/pbs_rstat.1B \ man1/pbs_rsub.1B \ man1/qalter.1B \ man1/qdel.1B \ man1/qhold.1B \ man1/qmove.1B \ man1/qmsg.1B \ man1/qorder.1B \ man1/qrerun.1B \ man1/qrls.1B \ man1/qselect.1B \ man1/qsig.1B \ man1/qstat.1B \ man1/qsub.1B notrans_dist_man3_MANS = \ man3/pbs_alterjob.3B \ man3/pbs_asyrunjob.3B \ man3/pbs_confirmresv.3B \ man3/pbs_connect.3B \ man3/pbs_default.3B \ man3/pbs_deljob.3B \ man3/pbs_delresv.3B \ man3/pbs_disconnect.3B \ man3/pbs_geterrmsg.3B \ man3/pbs_holdjob.3B \ man3/pbs_locjob.3B \ man3/pbs_manager.3B \ man3/pbs_modify_resv.3B \ man3/pbs_movejob.3B \ man3/pbs_msgjob.3B \ man3/pbs_orderjob.3B \ man3/pbs_preempt_jobs.3B \ man3/pbs_rerunjob.3B \ man3/pbs_rescreserve.3B \ man3/pbs_relnodesjob.3B \ man3/pbs_rlsjob.3B \ man3/pbs_runjob.3B \ man3/pbs_selectjob.3B \ man3/pbs_selstat.3B \ man3/pbs_sigjob.3B \ man3/pbs_stagein.3B \ man3/pbs_statfree.3B \ man3/pbs_stathook.3B \ man3/pbs_stathost.3B \ man3/pbs_statjob.3B \ man3/pbs_statnode.3B \ man3/pbs_statque.3B \ man3/pbs_statresv.3B \ man3/pbs_statrsc.3B \ man3/pbs_statsched.3B \ man3/pbs_statserver.3B \ man3/pbs_statvnode.3B \ man3/pbs_submit.3B \ man3/pbs_submit_resv.3B \ man3/pbs_tclapi.3B \ man3/pbs_terminate.3B \ man3/rm.3B \ man3/tm.3 noinst_man3_MANS = \ man3/pbs_rescquery.3B \ man3/pbs_submitresv.3B notrans_dist_man7_MANS = \ man1/pbs_hook_attributes.7B \ man1/pbs_job_attributes.7B \ man1/pbs_module.7B \ man1/pbs_node_attributes.7B \ man1/pbs_professional.7B \ man1/pbs_queue_attributes.7B \ man1/pbs_resources.7B \ man1/pbs_resv_attributes.7B \ man1/pbs_sched_attributes.7B \ man1/pbs_server_attributes.7B notrans_dist_man8_MANS = \ man8/mpiexec.8B \ man8/pbs.8B \ man8/pbs_account.8B \ man8/pbs_attach.8B \ man8/pbs_comm.8B \ man8/pbs.conf.8B \ man8/pbs_dataservice.8B \ man8/pbs_ds_password.8B \ man8/pbsfs.8B \ man8/pbs_hostn.8B \ 
man8/pbs_idled.8B \ man8/pbs_iff.8B \ man8/pbs_interactive.8B \ man8/pbs_lamboot.8B \ man8/pbs_mkdirs.8B \ man8/pbs_mom.8B \ man8/pbs_mpihp.8B \ man8/pbs_mpilam.8B \ man8/pbs_mpirun.8B \ man8/pbsnodes.8B \ man8/pbs_probe.8B \ man8/pbsrun.8B \ man8/pbsrun_unwrap.8B \ man8/pbsrun_wrap.8B \ man8/pbs_sched.8B \ man8/pbs_server.8B \ man8/pbs_snapshot.8B \ man8/pbs_tclsh.8B \ man8/pbs_tmrsh.8B \ man8/pbs_topologyinfo.8B \ man8/pbs_wish.8B \ man8/printjob.8B \ man8/qdisable.8B \ man8/qenable.8B \ man8/qmgr.8B \ man8/qrun.8B \ man8/qstart.8B \ man8/qstop.8B \ man8/qterm.8B \ man8/tracejob.8B \ man8/win_postinstall.py.8B ================================================ FILE: doc/man1/pbs_hook_attributes.7B ================================================ .\" .\" Copyright (C) 1994-2021 Altair Engineering, Inc. .\" For more information, contact Altair at www.altair.com. .\" .\" This file is part of both the OpenPBS software ("OpenPBS") .\" and the PBS Professional ("PBS Pro") software. .\" .\" Open Source License Information: .\" .\" OpenPBS is free software. You can redistribute it and/or modify it under .\" the terms of the GNU Affero General Public License as published by the .\" Free Software Foundation, either version 3 of the License, or (at your .\" option) any later version. .\" .\" OpenPBS is distributed in the hope that it will be useful, but WITHOUT .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public .\" License for more details. .\" .\" You should have received a copy of the GNU Affero General Public License .\" along with this program. If not, see . .\" .\" Commercial License Information: .\" .\" PBS Pro is commercially licensed software that shares a common core with .\" the OpenPBS software. For a copy of the commercial license terms and .\" conditions, go to: (http://www.pbspro.com/agreement.html) or contact the .\" Altair Legal Department. 
.\" .\" Altair's dual-license business model allows companies, individuals, and .\" organizations to create proprietary derivative works of OpenPBS and .\" distribute them - whether embedded or bundled with other software - .\" under a commercial license agreement. .\" .\" Use of Altair's trademarks, including but not limited to "PBS™", .\" "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is .\" subject to Altair's trademark licensing policies. .\" .TH pbs_hook_attributes 7B "6 May 2020" Local "PBS Professional" .SH NAME .B pbs_hook_attributes \- attributes of PBS hooks .SH DESCRIPTION .LP Hook attributes can be set, unset, and viewed using the .B qmgr command. See the .B qmgr(1B) man page. An unset hook attribute takes the default value for that attribute. Under UNIX/Linux, root privilege is required in order to operate on hooks. Under Windows, this must be done from the installation account. For domained environments, the installation account must be a lo cal account that is a member of the local Administrators group on the local computer. For standalone environments, the ins tallation account must be a local account that is a member of the local Administrators group on the local computer. .IP "alarm=" Specifies the number of seconds to allow a hook to run before the hook times out. .br Set by administrator. .br Valid values: >0 .br Format: Integer .br Default value: 30 .IP "debug" Specifies whether or not the hook produces debugging files under PBS_HOME/server_priv/hooks/tmp or PBS_HOME/mom_priv/hooks/tmp. Files are named hook___.in, .data, and .out. When this is set to .I true, the hook leaves debugging files. .br Set by administrator. .br Format: Boolean .br Default value: False .IP "enabled" Determines whether or not a hook is run when its triggering event occurs. If a hook's .I enabled attribute is .I True, the hook is run. .br Set by administrator. .br Format: Boolean .br Default: True .IP "event" List of events that trigger the hook. 
Can be operated on with the "=", "+=", and "-=" operators. The .I provision event cannot be combined with any other events. .br Valid events: .RS 11 .nf "exechost_periodic" "exechost_startup" "execjob_attach" "execjob_begin" "execjob_end" "execjob_epilogue" "execjob_launch" "execjob_postsuspend" "execjob_preresume" "execjob_preterm" "execjob_prologue" "modifyjob" "movejob" "periodic" "provision" "queuejob" "resvsub" "runjob" "" (meaning no event) .fi .RE .IP .br Set by administrator. .br Format: string array .br Default value: "" (meaning none, i.e. the hook is not triggered) .IP "fail_action" Specifies the action to be taken when hook fails due to alarm call or unhandled exception, or to an internal error such as not enough disk space or memory. Can also specify a subsequent action to be taken when hook runs successfully. Value can be either "none" or one or more of "offline_vnodes", "clear_vnodes_upon_recovery", and "scheduler_restart_cycle". If this attribute is set to multiple values, scheduler restart happens last. .br .I offline_vnodes .RS 11 After unsuccessful hook execution, offlines the vnodes managed by the MoM executing the hook. Only available for execjob_prologue, exechost_startup and execjob_begin hooks. .RE .IP .I clear_vnodes_upon_recovery .RS 11 After successful hook execution, clears vnodes previously offlined via offline_vnodes fail action. Only available for exechost_startup hooks. .RE .IP .I scheduler_restart_cycle .RS 11 After unsuccessful hook execution, restarts scheduling cycle. Only available for execjob_begin and execjob_prologue hooks. .RE .IP .br Set by administrator. .br Format: string_array .br Default value: "none" .IP "freq" Number of seconds between periodic or exechost_periodic triggers. .br Set by administrator. .br Format: integer .br Default: 120 seconds .IP "order" Indicates relative order of hook execution, for hooks of the same type sharing a trigger. Hooks with lower .I order values execute before those with higher values. 
Does not apply to periodic or exechost_periodic hooks. .br Set by administrator. .br Valid values: .RS 8 Built-in hooks: .I [-1000, 2000] .br Site hooks: .I [1, 1000] .RE .IP Format: Integer .br Default value: 1 .IP "Type" The type of the hook. Cannot be set for a built-in hook. .br Valid values: "pbs", "site" .br .I pbs .RS 11 Hook is built in. .RE .IP .I site .RS 11 Hook is custom (site-defined). .RE .IP .br Set by administrator. .br Format: String .br Default value: "site" .IP "user" Specifies who executes the hook. .br Valid values: "pbsadmin", "pbsuser" .br .I "pbsadmin" .RS 11 The hook executes as root. .RE .IP .I "pbsuser" .RS 11 The hook executes as the triggering job's owner. .RE .IP .br Set by administrator. .br Format: String .br Default value: "pbsadmin" .SH SEE ALSO qmgr(1B), pbs_module(7B), pbs_stathook(3B) ================================================ FILE: doc/man1/pbs_job_attributes.7B ================================================ .\" .\" Copyright (C) 1994-2021 Altair Engineering, Inc. .\" For more information, contact Altair at www.altair.com. .\" .\" This file is part of both the OpenPBS software ("OpenPBS") .\" and the PBS Professional ("PBS Pro") software. .\" .\" Open Source License Information: .\" .\" OpenPBS is free software. You can redistribute it and/or modify it under .\" the terms of the GNU Affero General Public License as published by the .\" Free Software Foundation, either version 3 of the License, or (at your .\" option) any later version. .\" .\" OpenPBS is distributed in the hope that it will be useful, but WITHOUT .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public .\" License for more details. .\" .\" You should have received a copy of the GNU Affero General Public License .\" along with this program. If not, see . 
.\" .\" Commercial License Information: .\" .\" PBS Pro is commercially licensed software that shares a common core with .\" the OpenPBS software. For a copy of the commercial license terms and .\" conditions, go to: (http://www.pbspro.com/agreement.html) or contact the .\" Altair Legal Department. .\" .\" Altair's dual-license business model allows companies, individuals, and .\" organizations to create proprietary derivative works of OpenPBS and .\" distribute them - whether embedded or bundled with other software - .\" under a commercial license agreement. .\" .\" Use of Altair's trademarks, including but not limited to "PBS™", .\" "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is .\" subject to Altair's trademark licensing policies. .\" .TH pbs_job_attributes 7B "4 March 2021" Local "PBS Professional" .SH NAME .B pbs_job_attributes \- attributes of PBS jobs .SH DESCRIPTION Each PBS job has attributes that characterize that job. .IP "Account_Name" 8 String used for accounting purposes. Can be used for fairshare. .br Can be read and set by user, Operator, Manager. .br Format: .I String; can contain any character. .br Python type: .I str .br Default: No default .IP accounting_id 8 Accounting ID for tracking accounting data not produced by PBS. Readable by all. .br Format: .I String .br Python type: .I str .IP accrue_type 8 Indicates what kind of time the job is accruing. .br Readable by Manager only. .br Format: .I Integer .br Python type: .I int .br Valid values: .RS 11 One of 0 (initial_time), 1 (ineligible_time), 2 (eligible_time), or 3 (run_time). .RE .IP Default: .I 2 (eligible_time) .IP alt_id 8 For a few systems, the session ID is insufficient to track which processes belong to the job. Where a different identifier is required, it is recorded in this attribute. If set, it will also be recorded in the end-of-job accounting record. .br On Windows, holds PBS home directory. .br Readable by all; settable by None. 
.br Format: .I String; may contain white spaces. .br Python type: .I str .br Default: No default .IP "argument_list" 8 Job executable's argument list. Shown if job is submitted with "-- []". .br Can be read and set by user, Operator, Manager. .br Format: .I JSDL=encoded string .RS 11 .I <1st arg> .br .I <2nd arg> .br .I .RE .IP Example: .RS 11 If arguments are "A B": A B .RE .IP Python type: .I str .br Default: No default .IP array 8 Indicates whether this is a job array. Set to .I True if this is an array job. .br Can be read and set by user. Can be read by Manager and Operator. .br Format: .I Boolean .br Python type: .I bool .br Default: .I False .IP array_id 8 Applies only to subjobs. Array identifier of subjob. Readable by all; set by PBS. .br Format: .I String .br Python type: .I str .br Default: No default .IP array_index 8 Applies only to subjobs. Index number of subjob. Readable by all; set by PBS. .br Format: .I String .br Python type: .I int .br Default: No default .IP array_indices_remaining 8 Applies only to job arrays. List of indices of subjobs still queued. Readable by all; set by PBS. .br Format: .I String .br Syntax: Range or list of ranges, e.g. 500, 552, 596-1000 .br Python type: .I str .br Default: No default .IP array_indices_submitted 8 Applies only to job arrays. Complete list of indices of subjobs given at submission time. .br Can be read and set by user. Can be read by Manager and Operator. .br Format: .I String .br Syntax: Given as range, e.g. 1-100 .br Python type: .I pbs.range .br Default: No default .IP array_state_count 8 Applies only to job arrays. Lists number of subjobs in each state. .br Readable by all; set by PBS. .br Format: .I String .br Python type: .I pbs.state_count .br Default: No default .IP "block" 8 Specifies whether qsub will wait for the job to complete and return the exit value of the job. 
.br For X11 forwarding jobs, and jobs with .I interactive and .I block attributes set to .I True, the job's exit status is not returned. .br When .I block is .I True, qsub waits for the job to finish. .br Can be read and set by user. Can be read by Manager and Operator. .br Format: .I Boolean .br Python type: .I int .br Default: .I False .IP "Checkpoint" 8 Determines when the job will be checkpointed. An .I $action script is required to checkpoint the job. See the .I pbs_mom(8B) man page. .br Can be read and set by user, Operator, Manager. .br Format: .I String, containing description of interval at which to checkpoint. .br Python type: .I pbs.checkpoint .br Valid values: .RS .IP c 3 Checkpoint at intervals, measured in CPU time, set on the job's execution queue. If no interval set at queue, job is not checkpointed. .IP "c=" 3 Checkpoint at intervals of the specified number of minutes of job CPU time. This value must be greater than zero. If the interval specified is less than that set on the job's execution queue, the queue's interval is used. .br Format: .I Integer .IP w 3 Checkpoint at intervals, measured in walltime, set on job's execution queue. If no interval set at queue, job is not checkpointed. .IP "w=" 3 Checkpoint at intervals of the specified number of minutes of job walltime. This value must be greater than zero. If the interval specified is less that that set on job's execution queue, the queue's interval is used. .br Format: .I Integer .IP n 3 No checkpointing. .IP s 3 Checkpoint only when the server is shut down. .IP u 3 Unset. Defaults to behavior when .I interval argument is set to .I s. .LP Default: .I u .RE .IP comment 8 Comment about job. Informational only. .br Can be read by user. Can be read and set by Operator, Manager. 
.br Format: .I String .br Python type: .I str .br Default: No default .IP create_resv_from_job 8 When this attribute is .I True, when this job is run, immediately creates and confirms a job-specific start reservation on the same resources as the job (including resources inherited by the job), and places the job in the job-specific reservation's queue. Sets the job's .I create_resv_from_job attribute to .I True. Sets the job-specific reservation's .I reserve_job attribute to the ID of the job from which the reservation was created. The new reservation's duration and start time are the same as the job's walltime and start time. If the job is peer scheduled, the job-specific reservation is created in the pulling complex. .br Readable and settable by all. .br Format: .I Boolean .br Python type: .I bool .br Default: .I False .IP ctime 8 Timestamp; time at which the job was created. .br Readable by all; set by PBS. .br Format: .I Integer .br Syntax: Timestamp. .RS 11 Printed by .B qstat in human-readable format. Output by hooks as seconds since epoch. .RE .IP Python type: .I int .br Default: No default .IP depend 8 Specifies inter-job dependencies. .br No limit on number of dependencies. .br Can be read and set by user, Operator, Manager. .br Format: .I String .br Syntax: .RS 11 ":[, ...][,:[, ...] ...]" .br Must be enclosed in double quotes if it contains commas. .RE .IP Example: "before:123,456" .br Python type: .I pbs.depend .br Valid values: .RS .IP "after:" 3 This job may run at any point after all jobs in .I job ID list have started execution. .IP "afterok:" 3 This job may run only after all jobs in .I job ID list have terminated with no errors. .IP "afternotok:" 3 This job may run only after all jobs in .I job ID list have terminated with errors. .IP "afterany:" This job can run after all jobs in .I job ID list have finished execution, with or without errors. This job will not run if a job in the .I job ID list was deleted without ever having been run. 
.IP "before:" 3 Jobs in .I job ID list may start once this job has started. .IP "beforeok:" 3 Jobs in .I job ID list may start once this job terminates without errors. .IP "beforenotok:" 3 If this job terminates execution with errors, jobs in .I job ID list may begin. .IP "beforeany:" 3 Jobs in .I job ID list may begin execution once this job terminates execution, with or without errors. .IP "on:" 3 This job may run after .I count dependencies on other jobs have been satisfied. This type is used with one of the .I before types listed. .I Count is an integer greater than .I 0. .RE .IP Default: No dependencies .IP egroup 8 If the job is queued, this attribute is set to the group name under which the job is to be run. .br Readable by Manager only. .br Format: .I String .br Python type: .I str .br Default: No default .IP eligible_time 8 The amount of wall clock wait time a job has accrued while the job is blocked waiting for resources. For a job currently accruing .I eligible_time, if we were to add enough of the right type of resources, the job would start immediately. .br Viewable via .B qstat -f. .br Readable by job owner, Manager and Operator. Settable by Operator or Manager. .br Format: .I Duration .br Python type: .I pbs.duration .br Default: .I Zero .IP "Error_Path" 8 The final path name for the file containing the job's standard error stream. See the .B qsub and .B qalter commands. .br Can be read and set by user, Operator, Manager. .br Format: .I String .br Python type: .I str .br Syntax: .I [:] .br Valid values: .RS .IP "" 3 Path is relative to the current working directory of command executing on current host. .IP "" 3 Path is absolute path on current host where command is executing. .IP ":" 3 Path is relative to user's home directory on specified host. .IP ":" 3 Path is absolute path on named host. .IP "No path" 3 Path is current working directory where qsub is executed. .RE .IP Default: Default path is current working directory where qsub is run. 
If the output path is specified, but does not include a filename, the default filename is .I .ER. If the path name is not specified, the default filename is .I .e. .IP estimated 8 List of estimated values for job. Used to report job's .I exec_vnode, start_time, and .I soft_walltime. Can be set in a hook or via qalter, but PBS will overwrite the values. .br Format: Format of reported element .br Syntax: .RS 11 .I estimated.=[, estimated.= ...] .RE .IP Python type: .I pbs.pbs_resource .br Syntax: .RS 11 estimated.= .br where is a resource .RE .IP Reported values: .RS .IP "exec_vnode" 3 The estimated vnodes used by this job. .br Readable by all; settable by Manager and Operator. .br Format: .I String .br Python type: .I pbs.exec_vnode .br Default: Unset .IP "soft_walltime" 3 The estimated soft walltime for this job. Calculated when a job exceeds its soft_walltime resource. .br Readable by all; settable by Manager. .br Format: .I Duration .br Python type: .I pbs.duration .br Default: Unset .IP "start_time" 3 The estimated start time for this job. .br Readable by all; settable by Manager and Operator. .br Format: .I start_time is printed by qstat in human-readable format; .I start_time output in hooks as seconds since epoch. .br Python type: .I int .br Default: Unset .RE .IP .IP etime 8 Timestamp; time when job became eligible to run, i.e. was enqueued in an execution queue and was in the "Q" state. Reset when a job moves queues, or is held then released. Not affected by qaltering. .br Readable by all; set by PBS. .br Format: .I Integer .br Syntax: .RS 11 Printed by qstat in human-readable format. Output in hooks as seconds since epoch. .RE .IP Python type: .I int .br Default: No default .IP euser 8 If the job is queued, this attribute is set to the user name under which the job is to be run. .br Readable by Manager only; set by PBS. .br Format: .I String .br Python type: .I str .br Default: No default .IP "executable" 8 JSDL-encoded listing of job's executable. 
Shown if job is submitted with "-- []". .br Can be read and set by user, Operator, Manager. .br Format: .I JSDL-encoded string .br Syntax: .br Example: .RS 11 If the executable is ping, the string is ping .RE .IP Python type: .I str .br Default: No default .IP "Execution_Time" 8 Timestamp; time after which the job may execute. Before this time, the job remains queued in the (W)ait state. Can be set when stage-in fails and PBS moves job start time out 30 minutes to allow user to fix problem. .br Can be read and set by user, Operator, Manager. .br Format: .I Datetime .br Syntax: .I [[CCwYY]MMDDhhmm[.ss] .br Python type: .I int .br Default: Unset (no delay) .IP exec_host 8 If the job is running, this is set to the name of the host or hosts on which the job is executing. .br Can be read by user, Operator, Manager. .br Format: .I String .br Syntax: .RS 11 .I /N[*C][+...], .br where .I N is task slot number, starting with 0, on that vnode, and .I C is the number of CPUs allocated to the job. .I *C does not appear if .I C has a value of .I 1. .RE .IP Python type: .I pbs.exec_host .br Default: No default .IP exec_vnode 8 List of chunks for the job. Each chunk shows the name of the vnode(s) from which it is taken, along with the host-level, consumable resources allocated from that vnode, and any AOE provisioned on this vnode for this job. .br If a vnode is allocated to the job but no resources from the vnode are used by the job, the vnode name appears alone. .br If a chunk is split across vnodes, the name of each vnode and its resources appear inside one pair of parentheses, joined with a plus sign ("+"). .br Can be read by user. Can be read and set by Manager, Operator. .br Format: .I String .br Syntax: .RS 11 Each chunk is enclosed in parentheses, and chunks are connected by plus signs. 
.RE .IP Example: .RS 11 For a job which requested two chunks that were satisfied by resources from three vnodes, .I exec_vnode is .br (vnodeA:ncpus=N:mem=X)+(vnodeB:ncpus=P:mem=Y+vnodeC:mem=Z). .br For a job which requested one chunk and exclusive use of a 2-vnode host, where the chunk was satisfied by resources from one vnode, .I exec_vnode is .br (vnodeA:ncpus=N:mem=X)+(vnodeB). .RE .IP Python type: .I pbs.exec_vnode .br Default: No default .IP Exit_status 8 Exit status of job. Set to zero for successful execution. If any subjob of an array job has non-zero exit status, the array job has non-zero exit status. .br Readable by all; set by PBS. .br Format: .I Integer .br Python type: .I int .br Default: No default .IP "forward_x11_cookie" 8 Contains the X authorization cookie. .br Readable by all; set by PBS. .br Format: .I String .br Python type: .I int .br Default: No default .IP "forward_x11_port" 8 Contains the number of the port being listened to by the port forwarder on the submission host. .br Readable by all; set by PBS. .br Format: .I Integer .br Python type: .I int .br Default: No default .IP "group_list" 8 A list of group names used to determine the group under which the job runs. When a job runs, the server selects a group name from the list according to the following ordered set of rules: .RS .IP "1." 3 Select the group name for which the associated host name matches the name of the server host. .IP "2." 3 Select the group name which has no associated host name. .IP "3." 3 Use the login group for the user name under which the job will be run. .RE .IP .br Can be read and set by user, Operator, Manager. .br Format: .I String .br Syntax: .RS 11 .I [@] [,[@] ...] .br Must be enclosed in double quotes if it contains commas. .RE .IP Python type: .I pbs.group_list .br Default: No default .IP hashname 8 No longer used. .IP "Hold_Types" 8 The set of holds currently applied to the job. 
If the set is not null, the job will not be scheduled for execution and is said to be in the .I held state. The .I held state takes precedence over the .I wait state. .br Can be read and set by user, Operator, Manager. .br Format: .I String, made up of the letters .I 'n', 'o', 'p', 's', 'u' .br Hold types: .RS .IP n 3 No hold .IP o 3 Other hold .IP p 3 Bad password .IP s 3 System hold .IP u 3 User hold .RE .IP Python type: .I pbs.hold_types .br Default: .I n (no hold) .RE .IP "interactive" 8 Specifies whether the job is interactive. .br When both this attribute and the .I block attribute are .I True, no exit status is returned. For X11 forwarding jobs, the job's exit status is not returned. .br Cannot be set using a PBS directive. .br Job arrays cannot be interactive. .br Can be set, but not altered, by unprivileged user. Can be read by Operator, Manager. .br Format: .I Boolean .br Python type: .I int .br Default: .I False .IP "jobdir" 8 Path of the job's staging and execution directory on the primary execution host. Either user's home, or private sandbox. Depends on value of .I sandbox attribute. Viewable via .B qstat -f. .br Readable by all; set by PBS. .br Format: .I String .br Python type: .I str .br Default: No default .IP "Job_Name" 8 The job name. See the .B qsub and .B qalter commands. .br Can be read and set by user, Operator, Manager. .br Format: .I String up to 236 characters, first character must be alphabetic or numeric .br Python type: .I str .br Default: Base name of job script, or STDIN .IP "Job_Owner" 8 The login name on the submitting host of the user who submitted the batch job. .br Readable by all; set by PBS. .br Format: .I String .br Python type: .I str .br Default: No default .IP "job_state" 8 The state of the job. .br Readable by all. Can be set indirectly by all. .br Format: .I Character .br Job states: .RS .IP B 3 .I Begun. Job arrays only. The job array has begun execution. 
.br Python type: PBS job state constant .I pbs.JOB_STATE_BEGUN .IP E 3 .I Exiting. The job has finished, with or without errors, and PBS is cleaning up post-execution. .br Python type: PBS job state constant .I pbs.JOB_STATE_EXITING .IP F 3 .I Finished. Job is finished. Job has completed execution, job failed during execution, or job was deleted. .br Python type: PBS job state constant .I pbs.JOB_STATE_FINISHED .IP H 3 .I Held. The job is held. .br Python type: PBS job state constant .I pbs.JOB_STATE_HELD .IP M 3 .I Moved. Job has been moved to another server. .br Python type: PBS job state constant .I pbs.JOB_STATE_MOVED .IP Q 3 .I Queued. The job resides in an execution or routing queue pending execution or routing. It is not in .B held or .B waiting state. .br Python type: PBS job state constant .I pbs.JOB_STATE_QUEUED .IP R 3 .I Running. The job is in a execution queue and is running. .br Python type: PBS job state constant .I pbs.JOB_STATE_RUNNING .IP S 3 .I Suspended. The job was executing and has been suspended. The job does not use CPU cycles or walltime. .br Python type: PBS job state constant .I pbs.JOB_STATE_SUSPEND .IP T 3 .I Transiting. The job is being routed or moved to a new destination. .br Python type: PBS job state constant .I pbs.JOB_STATE_TRANSIT .IP U 3 .I User suspended. The job was running on a workstation configured for cycle harvesting and the keyboard/mouse is currently busy. The job is suspended until the workstation has been idle for a configured amount of time. .br Python type: PBS job state constant .I pbs.JOB_STATE_SUSPEND_USERACTIVE .IP W 3 .I Waiting. The .I Execution_Time attribute contains a time in the future. Can be set when stage-in fails and PBS moves job start time out 30 minutes to allow user to fix problem. .br Python type: PBS job state constant .I pbs.JOB_STATE_WAIITING .IP X 3 .I Expired. Subjobs only. Subjob is finished (expired.) 
.br Python type: PBS job state constant .I pbs.JOB_STATE_EXPIRED .LP .RE .IP "Join_Path" 8 Specifies whether the job's standard error and standard output streams are to be merged and placed in the file specified in the .I Output_Path job attribute. .br When set to .I True, the job's standard error and standard output streams are merged. .br Can be read and set by user, Operator, Manager. .br Format: .I String .br Behavior: .RS .IP eo 3 Standard output and standard error are merged, intermixed, into a single stream, which becomes standard error. .IP oe 3 Standard output and standard error are merged, intermixed, into a single stream, which becomes standard output. .IP n 3 Standard output and standard error are not merged. .RE .IP Python type: .I pbs.join_path .br Default: .I False .IP "Keep_Files" 8 Specifies whether the standard output and/or standard error streams are retained on the execution host in the job's staging and execution directory after the job has executed. Otherwise these files are returned to the submission host. .I Keep_Files overrides the .I Output_Path and .I Error_Path attributes. .br Readable and settable by all. .br Format: .I String .br Python type: .I pbs.keep_files .br Valid values: Can be one of the following: .RS .IP o 3 The standard output stream is retained. The filename is: .I .o .IP e 3 The standard error stream is retained. The filename is: .I .e .IP "eo, oe" 3 Both standard output and standard error streams are retained. .IP d 3 Output and error are written directly to their final destination .IP n 3 Neither stream is retained. Files are returned to submission host. .RE .IP Default: .I n .RS 11 (neither stream is retained, and files are returned to submission host.) .RE .IP "Mail_Points" 8 Specifies state changes for which the server sends mail about the job. .br Can be read and set by user, Operator, Manager. 
.br Format: .I String .br Python type: .I pbs.mail_points .br Valid values: Combination of .I a, b, and .I e, with optional .I j, or .I n by itself. .RS .IP a 3 Mail is sent when job is aborted .IP b 3 Mail is sent at beginning of job .IP e 3 Mail is sent when job ends .IP j 3 Mail is sent for subjobs. Must be combined with one or more of .I a, b, and .I e options. .IP n 3 No mail is sent. Cannot be combined with other options. .RE .IP Default: .I a .IP "Mail_Users" 8 The set of users to whom mail is sent when the job makes state changes specified in the .I Mail_Points job attribute. .br Can be read and set by user, Operator, Manager. .br Format: .I String .br Syntax: "@[,@]" .br Python type: .I pbs.email_list .br Default: Job owner only .IP "max_run_subjobs" 8 Sets a limit on the number of subjobs that can be running at one time. Can be set using .B qsub -J [%] or .B qalter -Wmax_run_subjobs= . .IP mtime 8 Timestamp; the time that the job was last modified, changed state, or changed locations. .br Format: .I Integer. .br Syntax: Timestamp. .RS 11 Printed by qstat in human-readable format; output in hooks as seconds since epoch. .RE .IP Python type: .I int .br Default: No default .IP "no_stdio_sockets" 8 .B Not used. .IP "Output_Path" 8 The final path name for the file containing the job's standard output stream. See the .B qsub and .B qalter commands. .br Can be read and set by user, Operator, Manager. .br Format: .I String .br Python type: .I str .br Syntax: .I [:] .br Valid values: .RS .IP "" 3 Path is relative to the current working directory of command executing on current host. .IP "" 3 Path is absolute path on current host where command is executing. .IP ":" 3 Path is relative to user's home directory on specified host. .IP ":" 3 Path is absolute path on named host. .IP "No path" 3 Path is current working directory where qsub is executed. .RE .IP Default: .RS 11 Default path is current working directory where qsub is run. 
.br If the output path is specified, but does not include a filename, the default filename is .I .OU. .br If the path name is not specified, the default filename is .I .o. .RE .IP "pcap_accelerator" 8 Power attribute. Power cap for an accleerator. Corresponds to Cray .I capmc set_power_cap --accel setting. See .I capmc documentation. .br Readable and settable by all. .br Format: .I Integer .br Units: .I Watts .br Python type: .I int .br Default: Unset .IP "pcap_node" 8 Power attribute. Power cap for a node. Corresponds to Cray .I capmc set_power_cap --node setting. See .I capmc documentation. .br Readable and settable by all. .br Format: .I Integer .br Units: .I Watts .br Python type: .I int .br Default: Unset .IP "pgov" 8 Power attribute. Cray ALPS reservation setting for CPU throttling corresponding to .I p-governor. See BASIL 1.4 documentation. We do not recommend using this attribute. .br Readable and settable by all. .br Format: .I String .br Python type: .I str .br Default: Unset .IP "Priority" 8 The scheduling priority for the job. Higher value indicates higher priority. .br Can be read and set by user, Operator, Manager. .br Format: .I Integer .br Syntax: .I [+|-]nnnnn .br Valid values: [-1024, +1023] inclusive .br Python type: .I int .br Default: Unset .IP "project" 8 The job's project. A project is a way to tag jobs. Each job can belong to at most one project. .br Readable and settable by user, Operator, Manager. .br Format: .I String .RS 11 Can contain any characters except for the following: Slash ("/"), left bracket ("["), right bracket ("]"), double quote ("""), semicolon (";"), colon (":"), vertical bar ("|"), left angle bracket ("<"), right angle bracket (">"), plus ("+"), comma (","), question mark ("?"), and asterisk ("*"). .RE .IP Python type: .I str .br Default: "_pbs_project_default" .IP "pset" 8 .B Deprecated. Name of the placement set used by the job. .br Can be read by user, Operator. Can be read and set by Manager. 
.br Format: .I String .br Python type: .I str .br Default: No default .IP "pstate" 8 Power attribute. Cray ALPS reservation setting for CPU frequency corresponding to .I p-state. See BASIL 1.4 documentation. .br Readable and settable by all. .br Format: .I String .br Units: .I Hertz .br Python type: .I str .br Default: Unset .IP qtime 8 Timestamp; the time that the job entered the current queue. .br Readable by all; settable only by PBS. .br Format: .I Integer .br Syntax: Timestamp. .RS 11 Printed by qstat in human-readable format; output in hooks as seconds since epoch. .RE .IP Python type: .I int .br Default: No default .IP queue 8 The name of the queue in which the job currently resides. .br Readable by all; settable only by PBS. .br Format: .I String .br Python type: .I pbs.queue .br Default: No default .IP queue_rank 8 A number indicating the job's position within the queue. Only used internally by PBS. .br Readable by Manager only. .br Format: .I Integer .br Python type: .I int .br Default: No default .IP queue_type 8 The type of queue in which the job is currently residing. .br Readable by Manager only. .br Format: .I Character .br Valid values: One of .I E or .I R .RS .IP E 3 Execution queue .br Python type: .RS 3 PBS job state constant .I pbs.QUEUE_TYPE_EXECUTION .RE .IP R 3 Routing queue .br Python type: .RS 3 PBS job state constant .I pbs.QUEUE_TYPE_EXECUTION .RE .RE .IP Default: No default .IP "release_nodes_on_stageout" 8 Controls whether job vnodes are released when stageout begins. .br Cannot be used with vnodes tied to Cray X* series systems. .br When cgroups is enabled and this is used with some but not all vnodes from one MoM, resources on those vnodes that are part of a cgroup are not released until the entire cgroup is released. .br The job's .I stageout attribute must be set for the .I release_nodes_on_stageout attribute to take effect. 
.br When set to .I True, all of the job's vnodes not on the primary execution host are released when stageout begins. .br When set to .I False, job's vnodes are released when the job finishes and MoM cleans up the job. .br Readable and settable by all. .br Format: .I Boolean .br Python type: .I bool .br Default: .I False .IP "Remove_Files" 8 Specifies whether standard output and/or standard error files are automatically removed upon job completion. .br Readable and settable by all. .br Format: .I String .br Python type: .I str .br Valid values: "e", "o", "eo", "oe", or unset .RS .IP e 3 Standard error is removed upon job completion. .IP o 3 Standard output is removed upon job completion. .IP "eo, oe" 3 Standard output and standard error are removed upon job completion. .IP unset 3 Neither is removed. .RE .IP Default: Unset .IP "Rerunable" 8 Specifies whether the job can be rerun. Does not affect how a job is treated if the job could not begin execution. .br Job arrays are required to be rerunnable and are rerunnable by default. .br Readable and settable by all. .br Format: .I Character .br Syntax: One of .I y or .I n .br Python type: .I bool .br Default: y (job is rerunnable) .IP "Resource_List" 8 The list of resources required by the job. List is a set of .I = strings. The meaning of name and value is dependent upon defined resources. Each value establishes the limit of usage of that resource. If not set, the value for a resource may be determined by a queue or server default established by the administrator. .br Readable and settable by all. .br Format: .I String .br Syntax: .RS 11 .I Resource_List.=[, Resource_List.=, ...] .RE .IP Python type: .I pbs.pbs_resource .br Syntax: .RS 11 Resource_List[""]= .br where is any built-in or custom resource .RE .IP Default: No default .IP "resources_released" 8 Listed by vnode, consumable resources that were released when the job was suspended. 
Populated only when .I restrict_res_to_release_on_suspend server attribute is set. .br Readable by all. Set by server. .br Format: .I String .br Syntax: .RS 11 .I (:=: .I =:...)+ .I (:=:...) .RE .IP Python type: .I str .br Default: No default .IP "resource_release_list" 8 Sum of each consumable resource requested by the job that was released when the job was suspended. Populated only when .I restrict_res_to_release_on_suspend server attribute is set. .br Readable by Manager and Operator. Set by server. .br Format: .I String .br Syntax: .RS 11 .I resource_released_list.=, .I resource_released_list.=, ... .RE .IP Python type: .I pbs.pbs_resource .br Default: No default .IP "resources_used" 8 The amount of each resource used by the job. .br Readable by all; set by PBS. .br Format: .I String .br Syntax: .RS 11 List of .I resources_used.=,resources_used.= pairs. .RE .IP Example: resources_used.mem=2mb .br Python type: .I pbs.pbs_resource .br Syntax: .RS 11 resources_used[""]= .br where is any built-in or custom resource .RE .IP Default: No default .IP run_count 8 The number of times the server thinks the job has been executed. .br The .I run_count attribute starts at zero. Job is held after 21 tries. .br Can be set via qsub, qalter, or a hook. .br Can be read and set by Manager and Operator. .br Format: .I Integer; must be greater than or equal to zero. .br Python type: .I int .br Default: .I Zero .IP "run_version" 8 Used internally by PBS to track the instance of the job. .br Set by PBS. Visible to Manager only. .br Format: .I Integer .br Python type: .I int .br Default: No default .IP "sandbox" 8 Specifies type of location PBS uses for job staging and execution. .br User-settable via .B qsub -Wsandbox= or via a PBS directive. .br See the $jobdir_root MoM configuration option in .B pbs_mom.8B. .br Can be read and set by user, Operator, Manager. 
.br Format: .I String .br Valid values: .I PRIVATE, HOME, unset .br .RS .IP PRIVATE 3 When set to PRIVATE, PBS creates job-specific staging and execution directories under the directory specified in the .I $jobdir_root MoM configuration option. .IP "HOME or unset" 3 PBS uses the job owner's home directory for staging and execution. .RE .IP Python type: .I str .br Default: Unset .IP schedselect 8 The union of the select specification of the job, and the queue and server defaults for resources in a chunk. .br Can be read by PBS Manager only. .br Format: .I String .br Python type: .I pbs.select .br Default: No default .IP sched_hint 8 .B No longer used. .IP security_context 8 Contains security context of job submitter. Set by PBS to the security context of the job submitter at the time of job submission. If not present when a request is submitted, an error occurs, a server message is logged, and the request is rejected. .br Readable by all; set by PBS. .br Format: .I String in SELinux format .br Default: Unset .IP server 8 The name of the server which is currently managing the job. When the secondary server is running during failover, shows the name of the primary server. After a job is moved to another server, either via qmove or peer scheduling, shows the name of the new server. .br Readable by all; set by PBS. .br Format: .I String .br Python type: .I pbs.server .br Default: No default .IP session_id 8 If the job is running, this is set to the session ID of the first executing task. .br Readable by all; set by PBS. .br Format: .I Integer .br Python type: .I int .br Default: No default .IP "Shell_Path_List" 8 One or more absolute paths to the program(s) to process the job's script file. .br Can be read and set by user, Operator, Manager. .br Format: .I String .br Syntax: .RS 11 "[@][,[@]...]" .br Must be enclosed in double quotes if it contains commas. 
.RE .IP Python type: .I pbs.path_list .br Default: User's login shell on execution host .IP stagein 8 The list of files to be staged in prior to job execution. .br Can be read and set by user, Operator, Manager. .br Format: .I String .br Syntax: .RS 11 "@: [, @:, ...]" .RE .IP Python type: .I pbs.staging_list .br Default: No default .IP stageout 8 The list of files to be staged out after job execution. .br Can be read and set by user, Operator, Manager. .br Format: .I String .br Syntax: .RS 11 "@: [, @:, ...]" .RE .IP Python type: .I pbs.staging_list .br Default: No default .IP Stageout_status 8 Status of stageout. If stageout succeeded, this is set to 1. If stageout failed, this is set to 0. Displayed only if set. If stageout fails for any subjob of an array job, the value of .I Stageout_status is zero for the array job. Available only for finished jobs. .br Readable by all; set by PBS. .br Format: .I Integer .br Python type: .I int .br Default: No default .IP stime 8 Timestamp; time when the job started execution. Changes when job is restarted. .br Readable by all; set by PBS. .br Format: .I Integer .br Syntax: Timestamp. .RS 11 Printed by qstat in human-readable format; output in hooks as seconds since epoch. .RE .IP Python type: .I int .br Default: No default .IP "Submit_arguments" 8 Job submission arguments given on the .B qsub command line. Available for all jobs. .br Can be read and set by user, Operator, Manager. .br Format: .I String .br Python type: .I str .br Default: No default .IP "substate" 8 The substate of the job. The substate is used internally by PBS. .br Readable by all; set by PBS. .br Format: .I Integer .br Python type: .I int .br Default: No default .IP sw_index 8 .B No longer used. .IP "tobjob_ineligible" 8 Allows administrators to mark this job as ineligible to be a top job. .br When .I True , this job is not eligible to be the top job. .br Can be read and set by Manager. 
.br Format: .I Boolean .br Python type: .I bool .br Default: .I False .IP umask 8 The initial umask of the job is set to the value of this attribute when the job is created. This may be changed by umask commands in the shell initialization files such as .profile or .cshrc. .br Can be read and set by user, Operator, Manager. .br Format: .I Decimal integer .br Python type: .I int .br Default: .I 077 .IP "User_List" 8 The list of users which determines the user name under which the job is run on a given host. No length limit. .br When a job is to be executed, the server selects a user name from the list according to the following ordered set of rules: .RS .IP 1. 3 Select the user name from the list for which the associated host name matches the name of the server. .IP 2. 3 Select the user name which has no associated host name; the wild card name. .IP 3. 3 Use the value of .I Job_Owner as the user name. .RE .IP Readable and settable by all. .br Format: .I String .br Syntax: .RS 11 "@ [,@...]" .br Must be enclosed in double quotes if it contains commas. May be up to 256 characters in length. .RE .IP Python type: .I pbs.user_list .br Default: Value of .I Job_Owner job attribute .IP "Variable_List" 8 List of environment variables set in the job's execution environment. See the qsub(1B) command. .br Readable and settable by all. .br Format: .I String .br Syntax: .RS 11 "= [,=...]" .br Must be enclosed in double quotes if it contains commas. .RE .IP Python type: .I pbs.pbs_resource .br Syntax: .RS 11 Variable_List[""]= .RE .IP Default: No default .SH SEE ALSO qsub(1B), qalter(1B), qhold(1B), qrls(1B), pbs_resources(7B) ================================================ FILE: doc/man1/pbs_login.1B ================================================ .\" .\" Copyright (C) 1994-2021 Altair Engineering, Inc. .\" For more information, contact Altair at www.altair.com. .\" .\" This file is part of both the OpenPBS software ("OpenPBS") .\" and the PBS Professional ("PBS Pro") software. 
.\" .\" Open Source License Information: .\" .\" OpenPBS is free software. You can redistribute it and/or modify it under .\" the terms of the GNU Affero General Public License as published by the .\" Free Software Foundation, either version 3 of the License, or (at your .\" option) any later version. .\" .\" OpenPBS is distributed in the hope that it will be useful, but WITHOUT .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public .\" License for more details. .\" .\" You should have received a copy of the GNU Affero General Public License .\" along with this program. If not, see . .\" .\" Commercial License Information: .\" .\" PBS Pro is commercially licensed software that shares a common core with .\" the OpenPBS software. For a copy of the commercial license terms and .\" conditions, go to: (http://www.pbspro.com/agreement.html) or contact the .\" Altair Legal Department. .\" .\" Altair's dual-license business model allows companies, individuals, and .\" organizations to create proprietary derivative works of OpenPBS and .\" distribute them - whether embedded or bundled with other software - .\" under a commercial license agreement. .\" .\" Use of Altair's trademarks, including but not limited to "PBS™", .\" "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is .\" subject to Altair's trademark licensing policies. .\" .TH pbs_login 1B "15 July 2020" Local "PBS Professional" .SH NAME .B pbs_login - cache encrypted user password for authentication .SH Synopsis .B pbs_login .br .B pbs_login -m .br echo | .B pbs_login -p .SH Description The .B pbs_login command encrypts the password and caches it locally where it can be used by daemons for authorization. Job submitters must run this command at each submission host each time their password changes. 
On Windows, the .B win_postinstall script calls .B pbs_login to store the PBS service account password so that the account user can be authenticated by daemons. .SH Required Privilege Can be run by any user. .SH Options to pbs_login .IP "(no options)" 8 Queries user for password. .IP "-m " 8 This option is intended to be used only by the PBS service account, which is the account that is used to execute .B pbs_mom via the Service Control Manager on Windows. This option is used during installation when invoked by the .B win_postinstall script, or by the administrator when the PBS service account password has changed. Stores PBS service account password in the mom_priv directory. .IP "-p" 8 Caches user password on client host. Intended to be run by job submitter at client host. Allows job submitter to be authenticated by daemons. ================================================ FILE: doc/man1/pbs_module.7B ================================================ .\" .\" Copyright (C) 1994-2021 Altair Engineering, Inc. .\" For more information, contact Altair at www.altair.com. .\" .\" This file is part of both the OpenPBS software ("OpenPBS") .\" and the PBS Professional ("PBS Pro") software. .\" .\" Open Source License Information: .\" .\" OpenPBS is free software. You can redistribute it and/or modify it under .\" the terms of the GNU Affero General Public License as published by the .\" Free Software Foundation, either version 3 of the License, or (at your .\" option) any later version. .\" .\" OpenPBS is distributed in the hope that it will be useful, but WITHOUT .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public .\" License for more details. .\" .\" You should have received a copy of the GNU Affero General Public License .\" along with this program. If not, see . 
.\" .\" Commercial License Information: .\" .\" PBS Pro is commercially licensed software that shares a common core with .\" the OpenPBS software. For a copy of the commercial license terms and .\" conditions, go to: (http://www.pbspro.com/agreement.html) or contact the .\" Altair Legal Department. .\" .\" Altair's dual-license business model allows companies, individuals, and .\" organizations to create proprietary derivative works of OpenPBS and .\" distribute them - whether embedded or bundled with other software - .\" under a commercial license agreement. .\" .\" Use of Altair's trademarks, including but not limited to "PBS™", .\" "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is .\" subject to Altair's trademark licensing policies. .\" .TH pbs_module 7B "6 April 2020" Local "PBS Professional" .SH NAME .B pbs_module \- Python interface to PBS and PBS hook environment .SH DESCRIPTION The .I pbs module provides an interface to PBS and the hook environment. The interface is made up of Python objects, which have attributes and methods. You can operate on these objects using Python code. .SH PBS MODULE OBJECTS .IP pbs.acl Represents a PBS ACL type. .IP pbs.args Represents a space-separated list of PBS arguments to commands like qsub, qdel. .IP pbs.argv[] Represents a list of argument strings to be passed to the program .IP pbs.BadAttributeValueError Raised when setting the attribute value of a pbs.* object to an invalid value. .IP pbs.BadAttributeValueTypeError Raised when setting the attribute value of a pbs.* object to an invalid value type. .IP pbs.BadResourceValueError Raised when setting the resource value of a pbs.* object to an invalid value. .IP pbs.BadResourceValueTypeError Raised when setting the resource value of a pbs.* object to an invalid value type. .IP pbs.checkpoint Represents a job's .I Checkpoint attribute. .IP pbs.depend Represents a job's .I depend attribute. .IP pbs.duration Represents a time interval. 
.IP pbs.email_list Represents the set of users to whom mail may be sent. .IP pbs.env[] Dictionary of environment variables. .IP pbs.event Represents a PBS event. .IP pbs.EventIncompatibleError Raised when referencing a non-existent attribute in pbs.event(). .IP pbs.EXECHOST_PERIODIC The .I exechost_periodic event type. .IP pbs.EXECHOST_STARTUP The .I exechost_startup event type. .IP pbs.EXECJOB_ATTACH The .I execjob_attach event type. .IP pbs.EXECJOB_BEGIN The .I execjob_begin event type. .IP pbs.EXECJOB_END The .I execjob_end event type. .IP pbs.EXECJOB_EPILOGUE The .I execjob_epilogue event type. .IP pbs.EXECJOB_LAUNCH The .I execjob_launch event type. .IP pbs.EXECJOB_PRETERM The .I execjob_preterm event type. .IP pbs.EXECJOB_PROLOGUE The .I execjob_prologue event type. .IP pbs.exec_host Represents a job's .I exec_host attribute. .IP pbs.exec_vnode Represents a job's .I exec_vnode attribute. .IP pbs.group_list Represents a list of group names. .IP pbs.hold_types Represents a job's .I Hold_Types attribute. .IP pbs.hook_config_filename Contains path to hook's configuration file. .IP pbs.job Represents a PBS job. .IP pbs.job_list[] Represents a list of pbs.job objects. .IP pbs.job_sort_formula Represents the server's .I job_sort_formula attribute. .IP pbs.JOB_STATE_BEGUN Represents the job array state of having started. .IP pbs.JOB_STATE_EXITING Represents the job state of exiting. .IP pbs.JOB_STATE_EXPIRED Represents the subjob state of expiring. .IP pbs.JOB_STATE_FINISHED Represents the job state of finished. .IP pbs.JOB_STATE_HELD Represents the job state of held. .IP pbs.JOB_STATE_MOVED Represents the job state of moved. .IP pbs.JOB_STATE_QUEUED Represents the job state of queued. .IP pbs.JOB_STATE_RUNNING Represents the job state of running. .IP pbs.JOB_STATE_SUSPEND Represents the job state of suspended. .IP pbs.JOB_STATE_SUSPEND_USERACTIVE Represents the job state of suspended due to user activity. 
.IP pbs.JOB_STATE_TRANSIT Represents the job state of transiting. .IP pbs.JOB_STATE_WAITING Represents the job state of waiting. .IP pbs.join_path Represents a job's .I Join_Path attribute. .IP pbs.keep_files Represents a job's .I Keep_Files attribute. .IP pbs.license_count Represents a set of licensing-related counters. .IP pbs.LOG_DEBUG Log level 004. .IP pbs.LOG_ERROR Log level 004. .IP pbs.LOG_WARNING Log level 004. .IP pbs.mail_points Represents a job's .I Mail_Points attribute. .IP pbs.MODIFYJOB The .I modifyjob event type. .IP pbs.MOVEJOB The .I movejob event type. .IP pbs.ND_BUSY Represents .I busy vnode state. .IP pbs.ND_DEFAULT_EXCL Represents .I default_excl sharing vnode attribute value .IP pbs.ND_DEFAULT_SHARED Represents .I default_shared sharing vnode attribute value. .IP pbs.ND_DOWN Represents .I down vnode state .IP pbs.ND_FORCE_EXCL Represents .I force_excl sharing vnode attribute value. .IP pbs.ND_FREE Represents .I free vnode state. .IP pbs.ND_GLOBUS PBS no longer supports Globus. The Globus functionality has been .B removed from PBS. Represents .I globus vnode .I ntype. .IP pbs.ND_IGNORE_EXCL Represents .I ignore_excl sharing vnode attribute value. .IP pbs.ND_JOBBUSY Represents .I job-busy vnode state. .IP pbs.ND_JOB_EXCLUSIVE Represents .I job-exclusive vnode state. .IP pbs.ND_OFFLINE Represents .I offline vnode state. .IP pbs.ND_PBS Represents .I pbs vnode .I ntype. .IP pbs.ND_PROV Represents .I provisioning vnode state. .IP pbs.ND_RESV_EXCLUSIVE Represents .I resv-exclusive vnode state. .IP pbs.ND_STALE Represents .I stale vnode state. .IP pbs.ND_STATE_UNKNOWN Represents .I state-unknown, down vnode state. .IP pbs.ND_UNRESOLVABLE Represents the .I unresolvable vnode state. .IP pbs.ND_WAIT_PROV Represents .I wait-provisioning vnode state. .IP pbs.node_group_key Represents the server or queue .I node_group_key attribute. .IP pbs.path_list Represents a list of pathnames. .IP pbs.pbs_conf[] Dictionary of entries in pbs.conf. 
.IP pbs.pid Represents the process ID of a process belonging to a job. .IP pbs.place Represents the .I place job submission specification. .IP pbs.progname Path of job shell or executable. .IP pbs.QTYPE_EXECUTION The .I execution queue type. .IP pbs.QTYPE_ROUTE The .I route queue type. .IP pbs.queue Represents a PBS queue. .IP pbs.QUEUEJOB The .I queuejob event type. .IP pbs.range Represents a range of numbers referring to array indices. .IP pbs.resv Represents a PBS reservation. .IP pbs.RESVSUB The .I resvsub event type. .IP pbs.RESV_STATE_BEING_DELETED Represents the reservation state RESV_BEING_DELETED. .IP pbs.RESV_STATE_CONFIRMED Represents the reservation state RESV_CONFIRMED. .IP pbs.RESV_STATE_DEGRADED Represents the reservation state RESV_DEGRADED. .IP pbs.RESV_STATE_DELETED Represents the reservation state RESV_DELETED. .IP pbs.RESV_STATE_DELETING_JOBS Represents the reservation state RESV_DELETING_JOBS. .IP pbs.RESV_STATE_FINISHED Represents the reservation state RESV_FINISHED. .IP pbs.RESV_STATE_NONE Represents the reservation state RESV_NONE. .IP pbs.RESV_STATE_RUNNING Represents the reservation state RESV_RUNNING. .IP pbs.RESV_STATE_TIME_TO_RUN Represents the reservation state RESV_TIME_TO_RUN. .IP pbs.RESV_STATE_UNCONFIRMED Represents the reservation state RESV_UNCONFIRMED. .IP pbs.RESV_STATE_WAIT Represents the reservation state RESV_WAIT. .IP pbs.route_destinations Represents a queue's .I route_destinations attribute. .IP pbs.RUNJOB The .I runjob event type. .IP pbs.select Represents the .I select job submission specification. .IP pbs.server Represents the local PBS server. .IP pbs.size Represents a PBS .I size type. .IP pbs.software Represents a site-dependent software specification resource. .IP pbs.staging_list Represents a list of file stagein or stageout parameters. .IP pbs.state_count Represents a set of job-related state counters. .IP pbs.SV_STATE_ACTIVE Represents the server state "Scheduling". 
.IP pbs.SV_STATE_HOT Represents the server state "Hot_Start". .IP pbs.SV_STATE_IDLE Represents the server state "Idle". .IP pbs.SV_STATE_SHUTDEL Represents the server state "Terminating, Delayed". .IP pbs.SV_STATE_SHUTIMM Represents the server state "Terminating". .IP pbs.SV_STATE_SHUTSIG Represents the server state "Terminating", when a signal has been caught. .IP pbs.UnsetAttributeNameError Raised when referencing a non-existent name of a pbs.* object. .IP pbs.UnsetResourceNameError Raised when referencing a non-existent name of a pbs.* object. .IP pbs.user_list Represents a list of user names. .IP pbs.vchunk Represents a resource chunk assigned to a job. .IP pbs.version Represents PBS version information. .IP pbs.vnode Represents a PBS vnode. .IP pbs.vnode_list[] Represents a list of PBS vnodes. .IP SystemExit Raised when accepting or rejecting an action. .LP .SH PBS MODULE GLOBAL METHODS .IP pbs.acl("[+|-]][,...]") Creates an object representing a PBS ACL, using the given string parameter. Instantiation of these objects requires a formatted input string. .IP pbs.args("") where .I are space-separated arguments to a command such as .B qsub or .B qdel. Creates an object representing the arguments to the command. Example: .RS 10 pbs.args("-Wsuppress_email=N -r y") .RE .IP Instantiation of these objects requires a formatted input string. .IP pbs.checkpoint("") where .I must be one of "n", "s", "c", "c=mmm", "w", or "w=mmm" Creates an object representing the job's .I Checkpoint attribute, using the given string. Instantiation of these objects requires a formatted input string. .IP pbs.depend("") .I must be of format ":[,...]", or "on:", and where .I is one of "after", "afterok", "afterany", "before", "beforeok", and "beforenotok". Creates a PBS dependency specification object representing the job's .I depend attribute, using the given .I . Instantiation of these objects requires a formatted input string. 
.IP pbs.duration("[[hours:]minutes:]seconds[.milliseconds]") Creates a time specification duration instance, returning the equivalent number of seconds from the given time string. Represents an interval or elapsed time in number of seconds. Duration objects can be specified using either a time or an integer. See the "pbs.duration()" creation method. .IP pbs.duration() Creates an integer duration instance using the specified number of seconds. A .I pbs.duration instance can be operated on by any of the Python .I int functions. When performing arithmetic operations on a .I pbs.duration type, ensure the resulting value is a .I pbs.duration() type, before assigning to a job member that expects such a type. .IP pbs.email_list("[,...]") Creates an object representing a mail list. Instantiation of these objects requires a formatted input string. .IP pbs.exec_host("host/N[*C][+...]") Create an object representing the .I exec_host job attribute, using the given host and resource specification. Instantiation of these objects requires a formatted input string. .IP pbs.exec_vnode("[+...]") .I is () Creates an object representing the .I exec_vnode job attribute, using the given vnode and resource specification. When the .B qrun -H command is used, or when the scheduler runs a job, the .I pbs.job.exec_vnode object contains the vnode specification for the job. Instantiation of these objects requires a formatted input string. .br Example: .br pbs.exec_vnode("(vnodeA:ncpus=N:mem=X)+(nodeB:ncpus=P:mem=Y+nodeC:mem=Z)") .br This object is managed and accessed via the .I str() or .I repr() functions. .br Example: .br Python> ev = pbs.server().job("10").exec_vnode .br Python> str(ev) .br "(vnodeA:ncpus=2:mem=200m)+(vnodeB:ncpus=5:mem=1g)" .IP pbs.get_hook_config_file() Returns the path to the hook's configuration file, or None if there is no configuration file. 
For example: .RS 10 configfilepath = pbs.get_hook_config_file() .RE .IP pbs.get_local_nodename() This returns a Python str whose value is the name of the local natural vnode. If you want to refer to the vnode object representing the current host, you can pass this vnode name as the key to .I pbs.event().vnode_list[]. For example: .RS 10 Vn = pbs.event().vnode_list[pbs.get_local_nodename()] .RE .IP pbs.get_pbs_conf() This method returns a dictionary of values which represent entries in the pbs.conf file. The method reads the file on the host where a hook runs, so pre-execution event hooks get the entries on the server host, and execution event hooks get the entries on the execution host where the hook runs. The method reads /etc/pbs.conf on the host where pbs_python runs. Example: .RS 10 pbs_conf = pbs.get_pbs_conf() .br pbs.logmsg(pbs.LOG_DEBUG, "pbs home is %s" % (pbs_conf['PBS_HOME'])) .RE .IP If you HUP pbs_mom (Linux/UNIX), pbs.get_pbs_conf returns the reloaded contents of the pbs.conf file. .IP pbs.group_list("[@][,[@]...]") Creates an object representing a PBS group list. To use a group list object: .br pbs.job.group_list = pbs.group_list(....) .br Instantiation of these objects requires a formatted input string. .IP pbs.hold_types("") where .I is one of "u", "o", "s", or "n". Creates an object representing the .I Hold_Types job attribute. Instantiation of these objects requires a formatted input string. .IP pbs.job_sort_formula("") where .I is a string containing a math formula. Creates an object representing the .I job_sort_formula server attribute. Instantiation of these objects requires a formatted input string. .IP pbs.join_path({"oe"|"eo"|"n"}) Creates an object representing the .I Join_Path job attribute. Instantiation of these objects requires a formatted input string. .IP pbs.keep_files("") where .I is one of "o", "e", "oe", "eo". Creates an object representing the .I Keep_Files job attribute. 
Instantiation of these objects requires a formatted input string. .IP pbs.license_count("Avail_Global:Avail_Local:Used:High_Use:") Instantiates an object representing a .I license_count attribute. Instantiation of these objects requires a formatted input string. .IP pbs.logjobmsg(job_ID,message) where .I job_ID must be an existing or previously existing job ID and where .I message is an arbitrary string. This puts a custom string in the PBS Server log. The .B tracejob command can be used to print out the job-related messages logged by a hook script. Messages are logged at log event class .I pbs.LOG_DEBUG. .IP pbs.logmsg(log_event_class,message) where .I message is an arbitrary string, and where .I log_event_class can be one of the message log event class constants: .br pbs.LOG_WARNING .br pbs.LOG_ERROR .br pbs.LOG_DEBUG .br This puts a custom string in the daemon log. .IP pbs.mail_points("") where .I is "a", "b", and/or "e", or "n". Creates an object representing a .I Mail_Points attribute. Instantiation of these objects requires a formatted input string. .IP pbs.node_group_key("") Creates an object representing the resource to be used for node grouping, using the specified resource. .IP pbs.path_list("[@][,@...]") Creates an object representing a PBS pathname list. To use a path list object: .br pbs.job.Shell_Path_List = pbs.path_list(....) .br Instantiation of these objects requires a formatted input string. .IP pbs.env() Creates an empty environment variable list. For example, to create an empty environment variable list: .RS 10 pbs.event().env = pbs.pbs_env() .RE .IP pbs.place("[arrangement]:[sharing]:[group]") .I arrangement can be "pack", "scatter", "free", "vscatter" .br .I sharing can be "shared", "excl", "exclhost" .br .I group can be of the form "group=" .br .I [arrangement], [sharing], and .I [group] can be given in any order or combination. .br Creates a place object representing the job's place specification. 
Instantiation of these objects requires a formatted input string. .br Example: .br pl = pbs.place("pack:excl") .br s = repr(pl) (or s = `pl`) .br letter = pl[0] (assigns 'p' to letter) .br s = s + ":group=host" (append to string) .br pl = pbs.place(s) (update original pl) .IP pbs.range("-:") Creates a PBS object representing a range of values. .br Example: .br pbs.range("1-30:3") .br Instantiation of these objects requires a formatted input string. .IP pbs.reboot([]) This stops hook execution, so that remaining lines in the hook script are not executed, and starts the tasks that would normally begin after the hook is finished, such as flagging the current host to be rebooted. The MoM logs show the following: .RS 10 requested for host to be rebooted .RE .IP We recommend that before calling pbs.reboot(), you set any vnodes managed by this MoM offline, and requeue the current job, if this hook is not an exechost_periodic hook. For example: .RS 10 for v in pbs.event().vnode_list.keys(): .br \ \ \ pbs.event().vnode_list[v].state = pbs.ND_OFFLINE .br \ \ \ pbs.event().vnode_list[v].comment = "MoM host rebooting" .br pbs.event().job.rerun() .br pbs.reboot() .RE .IP The effect of the call to pbs.reboot() is not instantaneous. The reboot happens after the hook executes, and after any of the other actions such as pbs.event().job.rerun(), pbs.event().delete(), and pbs.event().vnode_list[] take effect. A hook with its user attribute set to pbsuser cannot successfully invoke pbs.reboot(), even if the owner is a PBS Manager or Operator. If this is attempted, the host is not rebooted, and the following message appears at log event class PBSEVENT_DEBUG2 in the MoM logs: .RS 10 ; Not allowed to issue reboot if run as user. .RE .IP The is an optional argument. It is a Python str which is executed instead of the reboot command that is the default for the system. 
For example: .RS 10 pbs.reboot("/usr/local/bin/my_reboot -s 10 -c 'going down in 10'") .RE .IP The specified is executed in a shell on Linux/UNIX or via cmd on Windows. .IP pbs.route_destinations("[,,...]") where .I is queue_name[@server_host[:port]] .br Creates an object that represents a .I route_destinations routing queue attribute. Instantiation of these objects requires a formatted input string. .IP pbs.select("[N:]res=val[:res=val][+[N:]res=val[:res=val]...]") Creates a .I select object representing the job's select specification. Instantiation of these objects requires a formatted input string. Example: .br sel = pbs.select("2:ncpus=1:mem=5gb+3:ncpus=2:mem=5gb") .br s = repr(sel) (or s = `sel`) .br letter = s[3] (assigns 'c' to letter) .br s = s + "+5:scratch=10gb" (append to string) .br sel = pbs.select(s) (reset the value of sel) .br .IP pbs.size() Creates a PBS .I size object using the given integer value, storing the value as the number of bytes. Size objects can be specified using either an integer or a string. See the "pbs.size("")" creation method. .IP pbs.size("") Creates a PBS .I size object out of the given string specification. The size of a word is the word size on the execution host. .I Size objects can be specified using either an integer or a string. To operate on .I pbs.size instances, use the "+" and "-" operators. To compare .I pbs.size instances, use the "==", "!=", ">", "<", ">=", and "<=" operators. Example: the sizes are normalized to the smaller of the 2 suffixes. In this case, "10gb" becomes "10240mb" and is added to "10mb": .br sz = pbs.size("10gb") .br sz = sz + 10mb .br 10250mb .br Example: the following returns .I True because .I sz is greater than 100 bytes: .br if sz > 100: .br \ \ \ \ gt100 = True .br .IP pbs.software("") Creates an object representing a site-dependent software resource. Instantiation of these objects requires a formatted input string. 
.IP pbs.staging_list("[,,...]") where .I is @: Creates an object representing a job file staging parameters list. To use a staging list object: .br pbs.job.stagein = pbs.staging_list(....) .br Instantiation of these objects requires a formatted input string. .IP pbs.state_count("Transit:Queued:Held:Running:Exiting:Begun:) Instantiates an object representing a .I state_count attribute. Instantiation of these objects requires a formatted input string. .IP pbs.user_list("[@][,@...]") Creates an object representing a PBS user list. To use a user list object: .br pbs.job.User_List = pbs.user_list(....) .br Instantiation of these objects requires a formatted input string. .IP pbs.version("") Creates an object representing the PBS version string. Instantiation of these objects requires a formatted input string. .SH ATTRIBUTES AND RESOURCES .br Hooks can read Server, Queue, or reservation resources. Hooks can read vnode or job attributes and resources. Hooks can modify .IP The resources requested by a job .br The resources used by a job .br The attributes of a job .br The resource arguments to pbs_rsub .br Vnode attributes and resources .br The shell or program to be executed in a job .br The arguments to the shell or program to be executed in a job .br The environment of the job .LP Custom and built-in PBS resources are represented in Python dictionaries, where the resource names are the dictionary keys. Built-in resources are listed in pbs_resources(7B). You reference a resource through a vnode, the Server, the event that triggered the hook, or the current job, for example: .IP pbs.server().resources_available["< resource name>"] .br pbs.event().job.Resource_List["< resource name>"] .br pbs.event().vnode_list[].resources_available["< resource name >"] .LP The resource name must be in quotes. 
Example: Get the number of CPUs: .IP ncpus = Resource_List["ncpus"] .LP An instance R of a job resource can be set as follows: .IP R[""] = .LP For example: .IP pbs.event().job().Resource_List["mem"] = 8gb .LP .SH EXCEPTIONS .IP pbs.BadAttributeValueError Raised when setting the attribute value of a pbs.* object to an invalid value. .IP pbs.BadAttributeValueTypeError Raised when setting the attribute value of a pbs.* object to an invalid value type. .IP pbs.BadResourceValueError Raised when setting the resource value of a pbs.* object to an invalid value. .IP pbs.BadResourceValueTypeError Raised when setting the resource value of a pbs.* object to an invalid value type. .IP pbs.EventIncompatibleError Raised when referencing a non-existent attribute in pbs.event(). .IP pbs.UnsetAttributeNameError Raised when referencing a non-existent name of an attribute. .IP pbs.UnsetResourceNameError Raised when referencing a non-existent name of a resource. .IP SystemExit Raised when accepting or rejecting an action. .LP If a hook encounters an unhandled exception: .IP PBS rejects the corresponding action, and an error message is printed to stderr. .br A message is printed to the daemon log. .LP .SH SEE ALSO pbs_hook_attributes(7B), pbs_resources(7B), qmgr(1B) ================================================ FILE: doc/man1/pbs_node_attributes.7B ================================================ .\" .\" Copyright (C) 1994-2021 Altair Engineering, Inc. .\" For more information, contact Altair at www.altair.com. .\" .\" This file is part of both the OpenPBS software ("OpenPBS") .\" and the PBS Professional ("PBS Pro") software. .\" .\" Open Source License Information: .\" .\" OpenPBS is free software. You can redistribute it and/or modify it under .\" the terms of the GNU Affero General Public License as published by the .\" Free Software Foundation, either version 3 of the License, or (at your .\" option) any later version. 
.\" .\" OpenPBS is distributed in the hope that it will be useful, but WITHOUT .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public .\" License for more details. .\" .\" You should have received a copy of the GNU Affero General Public License .\" along with this program. If not, see . .\" .\" Commercial License Information: .\" .\" PBS Pro is commercially licensed software that shares a common core with .\" the OpenPBS software. For a copy of the commercial license terms and .\" conditions, go to: (http://www.pbspro.com/agreement.html) or contact the .\" Altair Legal Department. .\" .\" Altair's dual-license business model allows companies, individuals, and .\" organizations to create proprietary derivative works of OpenPBS and .\" distribute them - whether embedded or bundled with other software - .\" under a commercial license agreement. .\" .\" Use of Altair's trademarks, including but not limited to "PBS™", .\" "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is .\" subject to Altair's trademark licensing policies. .\" .TH pbs_node_attributes 7B "17 July 2020" Local "PBS Professional" .SH NAME .B pbs_node_attributes \- attributes of PBS vnodes .SH DESCRIPTION Vnodes have the following attributes: .IP comment 8 Information about this vnode. This attribute may be set by the manager to any string to inform users of any information relating to the node. If this attribute is not explicitly set, the PBS server will use the attribute to pass information about the node status, specifically why the node is down. If the attribute is explicitly set by the manager, it will not be modified by the server. .br Readable by all; settable by Manager. .br Format: .I String .br Python type: .I str .br Default: No default .IP current_aoe 8 The AOE currently instantiated on this vnode. Cannot be set on server's host. .br Readable by all; settable by Manager. 
.br Format: .I String .br Python type: .I str .br Default: .I Unset .IP current_eoe 8 Current value of eoe on this vnode. We do not recommend setting this attribute manually. .br Readable by all; settable by Manager (not recommended). .br Format: .I String .br Python type: .I str .br Default: .I Unset .IP in_multivnode_host 8 Specifies whether a vnode is part of a multi-vnoded host. Used internally. Do not set. .br Readable and settable by Manager (not recommended). .br Format: .I Integer .br Python type: .I int .br Behavior: .RS .IP 1 3 Part of a multi-vnode host .IP Unset 3 Not part of a multi-vnode host .RE .IP Default: .I Unset .IP jobs 8 List of jobs running on this vnode. .br Readable by all; set by PBS. .br Format: .I String .br Syntax: .I /, ... .br Python type: .I int .br .IP last_state_change_time 8 Records the most recent time that this node changed state. .br Format: .RS 11 Timestamp .br Printed by qstat in human-readable Date format. .br Output in hooks as seconds since epoch. .RE .IP .IP last_used_time 8 Records the most recent time that this node finished being used for a job or reservation. Set at creation or reboot time. Updated when node is released early from a running job. Reset when node is ramped up. .br Format: .RS 11 Timestamp .br Printed by qstat in human-readable Date format. .br Output in hooks as seconds since epoch. .RE .IP license 8 .br Indicates whether this vnode is licensed. .br Readable by all; set by PBS. .br Format: .I Character .br Python type: .I str .br Valid values: .RS .IP l 3 This vnode is licensed. .RE .IP Default: .I Unset .IP license_info 8 Number of licenses assigned to this vnode. .br Readable by all; set by PBS. .br Format: .I Integer .br Python type: .I int .br Default: .I Unset .IP lictype 8 No longer used. .IP maintenance_jobs 8 List of jobs that were running on this vnode, but have been suspended via the .I admin-suspend signal to qsig. .br Readable by Manager; set by PBS. 
.br Format: .I string_array .br Python type: .I str .br Default: No default .IP Mom Hostname of host on which MoM daemon runs. .br Readable by all. Can be explicitly set by Manager only via .B qmgr, and only at vnode creation. The server can set this to the FQDN of the host on which MoM runs, if the vnode name is the same as the hostname. .br Format: .I String .br Python type: .I str .br Default: Value of .I vnode resource (vnode name) .IP name 8 The name of this vnode. .br Readable by all; settable by Manager. .br Format: .I String .br Python type: .I str .br Default: No default .IP no_multinode_jobs 8 Controls whether jobs which request more than one chunk are allowed to execute on this vnode. Used for cycle harvesting. .br Readable by all; settable by Manager. .br Format: .I Boolean .br Python type: .I bool .br Behavior: .RS .IP True 3 Jobs requesting more than one chunk are not allowed to execute on this vnode. .RE .IP Default: .I False .IP ntype 8 The type of this vnode. .br Readable by all; settable by Manager. .br Format: .I String .br Valid values: .RS .IP PBS 3 Normal vnode .br Python type: .I pbs.ND_PBS .br Default: .I PBS .RE .IP partition 8 Name of partition to which this vnode is assigned. A vnode can be assigned to at most one partition. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Python type: .I str .br Default: No default .IP pbs_version 8 The version of PBS for this MoM. .br Readable by all; set by PBS. .br Format: .I String .br Python type: .I str .br Default: No default .IP pcpus 8 .B Deprecated. The number of physical CPUs on this vnode. This is set to the number of CPUs available when MoM starts. For a multiple-vnode MoM, only the natural vnode has .I pcpus. .br Readable by all; set by PBS. .br Format: .I Integer .br Python type: .I int .br Default: .I Number of CPUs on startup .IP pnames The list of resources being used for placement sets. Not used for scheduling; advisory only. 
.br Readable by all; settable by Manager. .br Format: .I String .br Syntax: .I Comma-separated list of resource names .br Python type: .I str .br Default: No default .IP Port 8 Port number on which MoM daemon listens. .br Can be explicitly set only via .B qmgr, and only at vnode creation. Readable and settable by Operator and Manager. .br Format: .I Integer .br Python type: .I int .br Default: .I 15002 .IP poweroff_eligible 8 Enables powering this vnode up and down by PBS. .br Readable by all; settable by Manager. .br Format: .I Boolean .br Python type: .I bool .br Values: .RS 11 .IP True PBS can power this vnode on and off. .IP False PBS cannot power this vnode on and off. .RE .IP Default: .I False .IP power_provisioning 8 Enables use of power profiles by this vnode. .br Readable by all; settable by Manager. .br Format: .I Boolean .br Python type: .I bool .br Behavior: .RS .IP True 3 Power provisioning is enabled at this vnode. .IP False 3 Power provisioning is disabled at this vnode. .RE .IP Default: .I False .IP Priority 8 The priority of this vnode compared with other vnodes. .br Readable by all; settable by Operator and Manager. .br Format: .I Integer .br Python type: .I int .br Valid values: .I -1024 to +1023 .br Default: No default .IP provision_enable Controls whether this vnode can be provisioned. Cannot be set on server's host. .br Format: .I Boolean .br Python type: .I bool .br Behavior: .RS .IP True 3 This vnode may be provisioned. .IP False 3 This vnode may not be provisioned. .RE .IP Default: .I False .IP queue 8 .B Deprecated. The queue with which this vnode is associated. Each vnode can be associated with at most 1 queue. Queues can be associated with multiple vnodes. Any jobs in a queue that has associated vnodes can run only on those vnodes. If a vnode has an associated queue, only jobs in that queue can run on that vnode. .br Readable by all; settable by Manager. 
.br Format: .I String .br Python type: .I pbs.queue .br Behavior: .RS .IP "" 3 Only jobs in specified queue may run on this vnode. .IP Unset 3 Any job in any queue that does not have associated vnodes can run on this vnode. .RE .IP Default: No default .IP resources_assigned 8 The total amount of each resource allocated to jobs and started reservations running on this vnode. .br Readable by all; set by PBS. .br Format: String .br Syntax: .RS 11 .I resources_assigned.=[,resources_assigned.= .RE .IP Python type: .I pbs.pbs_resource .br Syntax: .RS 11 resources_assigned[''] = < val> .br where .I resource name is any built-in or custom resource .RE .IP Default: No default .IP resources_available 8 The list of resource and amounts available on this vnode. If not explicitly set, the amount shown is that reported by the pbs_mom running on this vnode. If a resource value is explicitly set, that value is retained across restarts. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Syntax: .RS 11 .I resources_available.=, .I resources_available. = , ... .RE .IP Python type: .I pbs.pbs_resource .br Syntax: .RS 11 resources_available[''] = < val> .br where .I resource name is any built-in or custom resource .RE .IP Default: No default .IP resv 8 List of advance and standing reservations pending on this vnode. .br Readable by all; set by PBS. .br Format: .I String .br Syntax: .RS 11 .I [, , ...] .br (Comma-separated list of reservation IDs) .RE .IP Python type: .I str .br Example: resv = R142.examplemachine, R143.examplemachine .br Default: No default .IP resv_enable 8 Controls whether the vnode can be used for advance and standing reservations. Reservations are incompatible with cycle harvesting. .br Readable by all; settable by Manager. .br Format: .I Boolean .br Python type: .I bool .br Behavior: .RS 11 When set to .I True, this vnode can be used for reservations. 
Existing reservations are honored when this attribute is changed from .I True to .I False. .RE .IP Default: .I True .IP sharing 8 Specifies whether more than one job at a time can use the resources of the vnode or the vnode's host. Either (1) the vnode or host is allocated exclusively to one job, or (2) the vnode's or host's unused resources are available to other jobs. .br Can be set using .I pbs_mom -s insert only. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Python type: .I int .br Valid values: .RS .IP default_shared 3 Defaults to .I shared .br Python type: .I pbs.ND_DEFAULT_SHARED .IP default_excl 3 Defaults to .I exclusive .br Python type: .I pbs.ND_DEFAULT_EXCL .IP default_exclhost 3 Entire host is assigned to the job unless the job's sharing request specifies otherwise .br Python type: .I pbs.ND_DEFAULT_EXCLHOST .IP ignore_excl 3 Overrides any job .I place=excl setting .br Python type: .I pbs.ND_IGNORE_EXCL .IP force_excl 3 Overrides any job .I place=shared setting .br Python type: .I pbs.ND_FORCE_EXCL .IP force_exclhost 3 The entire host is assigned to the job, regardless of the job's sharing request .br Python type: .I pbs.ND_FORCE_EXCLHOST .IP Unset 3 Defaults to .I shared .RE .IP Behavior of a vnode or host is determined by a combination of the .I sharing attribute and a job's placement directive, defined as follows: .nf | Vnode Behavior | Host Behavior | when place= | when place= | | sharing value | unset shared excl |exclhost !=exclhost ---------------------------------------------------------------- not set | shared shared excl | excl depends on place default_shared | shared shared excl | excl depends on place default_excl | excl shared excl | excl depends on place default_exclhost | excl shared excl | excl depends on place ignore_excl | shared shared shared| shared not exclusive force_excl | excl excl excl | excl not exclusive force_exclhost | excl excl excl | excl excl .fi Example: : sharing=force_excl .br 
Default value: .I default_shared .IP state 8 Shows or sets the state of the vnode. .br Readable by all. All states are set by PBS; Operator and Manager can set .I state to .I offline. .br Format: .I String .br Syntax: .I [, , ...] .br (Comma-separated list of one or more states) .br Python type: .I int .br Valid values: .RS .IP busy 3 Vnode is reporting load average greater than allowed max. Can combine with .I offline .IP down 3 Node is not responding to queries from the server. Cannot be combined with .I free, provisioning. .IP free 3 Vnode is up and capable of accepting additional job(s). Cannot be combined with other states. .IP job-busy 3 All CPUs on the vnode are allocated to jobs. Can combine with: .I offline, resv-exclusive .IP job-exclusive 3 Entire vnode is exclusively allocated to one job at the job's request. Can combine: .I offline, resv-exclusive .IP offline 3 Jobs are not to be assigned to this vnode. Can combine: .I busy, job-busy, job-exclusive, resv-exclusive .IP provisioning 3 Vnode is being provisioned. Cannot be combined with other states. .IP resv-exclusive 3 Running reservation has requested exclusive use of vnode. Can combine with: .I job-exclusive, offline .IP stale 3 Vnode was previously reported to server, but is no longer reported to server. Cannot combine with .I free, provisioning. .IP state-unknown 3 The server has never been able to contact the vnode. Either MoM is not running on the vnode, the vnode hardware is down, or there is a network problem. .IP unresolvable 3 The server cannot resolve the name of the vnode. .IP wait-provisioning 3 Vnode needs to be provisioned, but can't: limit reached for concurrently provisioning vnodes. See the .I max_concurrent_provision server attribute. .RE .IP Default: No default .IP topology_info Contains information intended to be used in hooks. .br Visible in and usable by hooks only. Invisible to Manager, Operator, User. 
.br Format: .I XML String .br Python type: .I str .br Default value: .I Unset .IP vnode_pool 8 Cray only. Allows just one MoM, instead of all, to report inventory upon startup, allowing faster startup and less network communication between server and non-reporting MoMs. On each Cray, all MoMs must have same setting for this attribute. Can be set only at vnode creation; valid only on login nodes running a MoM. Not supported on non-Cray machines. .br Readable by all; settable by Manager. .br Format: .I Integer .br Python type: .I int .br Behavior: .RS .IP ">0" 3 Only one MoM per Cray reports inventory. .IP Unset 3 Each MoM reports inventory separately. .RE .IP Default: .I 0 (Unset) .SH SEE ALSO qmgr(1B) ================================================ FILE: doc/man1/pbs_professional.7B ================================================ .\" .\" Copyright (C) 1994-2021 Altair Engineering, Inc. .\" For more information, contact Altair at www.altair.com. .\" .\" This file is part of both the OpenPBS software ("OpenPBS") .\" and the PBS Professional ("PBS Pro") software. .\" .\" Open Source License Information: .\" .\" OpenPBS is free software. You can redistribute it and/or modify it under .\" the terms of the GNU Affero General Public License as published by the .\" Free Software Foundation, either version 3 of the License, or (at your .\" option) any later version. .\" .\" OpenPBS is distributed in the hope that it will be useful, but WITHOUT .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public .\" License for more details. .\" .\" You should have received a copy of the GNU Affero General Public License .\" along with this program. If not, see . .\" .\" Commercial License Information: .\" .\" PBS Pro is commercially licensed software that shares a common core with .\" the OpenPBS software. 
For a copy of the commercial license terms and .\" conditions, go to: (http://www.pbspro.com/agreement.html) or contact the .\" Altair Legal Department. .\" .\" Altair's dual-license business model allows companies, individuals, and .\" organizations to create proprietary derivative works of OpenPBS and .\" distribute them - whether embedded or bundled with other software - .\" under a commercial license agreement. .\" .\" Use of Altair's trademarks, including but not limited to "PBS™", .\" "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is .\" subject to Altair's trademark licensing policies. .\" .TH pbs_professional 7B "6 May 2020" Local "PBS Professional" .SH NAME .B PBS Professional \- The PBS Professional workload management system .SH DESCRIPTION PBS Professional is an HPC workload manager and job scheduler. PBS schedules jobs onto systems with the required resources, according to specified policy. PBS runs on most major platforms. See .B www.pbsworks.com and .B https://pbspro.atlassian.net/wiki/spaces/PBSPro/overview. .B Primary Commands .br .IP "init.d/pbs" 8 Starts, stops or restarts PBS daemons on the local machine. This command is typically placed in /etc/init.d so that PBS starts up automatically. See the .B pbs.8B man page. .br .IP "qmgr" 8 Administrator's interface for configuring and managing PBS. See the .B qmgr.8B man page. .IP "qstat" 8 Administrator's and job submitter's tool for checking server, queue, and job status. See the .B qstat.1B man page. .IP "qsub" 8 Job submitter's tool for submitting jobs to PBS. See the .B qsub.1B man page. .LP .SH SEE ALSO .br pbs_mom(8B), pbs_server(8B), pbs_sched(8B), pbs_comm(8B) ================================================ FILE: doc/man1/pbs_python.1B ================================================ .\" .\" Copyright (C) 1994-2021 Altair Engineering, Inc. .\" For more information, contact Altair at www.altair.com. 
.\" .\" This file is part of both the OpenPBS software ("OpenPBS") .\" and the PBS Professional ("PBS Pro") software. .\" .\" Open Source License Information: .\" .\" OpenPBS is free software. You can redistribute it and/or modify it under .\" the terms of the GNU Affero General Public License as published by the .\" Free Software Foundation, either version 3 of the License, or (at your .\" option) any later version. .\" .\" OpenPBS is distributed in the hope that it will be useful, but WITHOUT .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public .\" License for more details. .\" .\" You should have received a copy of the GNU Affero General Public License .\" along with this program. If not, see . .\" .\" Commercial License Information: .\" .\" PBS Pro is commercially licensed software that shares a common core with .\" the OpenPBS software. For a copy of the commercial license terms and .\" conditions, go to: (http://www.pbspro.com/agreement.html) or contact the .\" Altair Legal Department. .\" .\" Altair's dual-license business model allows companies, individuals, and .\" organizations to create proprietary derivative works of OpenPBS and .\" distribute them - whether embedded or bundled with other software - .\" under a commercial license agreement. .\" .\" Use of Altair's trademarks, including but not limited to "PBS™", .\" "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is .\" subject to Altair's trademark licensing policies. .\" .TH pbs_python 1B "6 May 2020" Local "PBS Professional" .SH NAME .B pbs_python \- Python interpreter for debugging a hook script from the command line .SH SYNOPSIS .B pbs_python --hook [-e ] [-i ] .RS 11 [-L ] [-l ] [-o ] .br [-r ] [-s ] [] .RE .B pbs_python .B pbs_python --version .SH DESCRIPTION The PBS Python interpreter, .B pbs_python, is a wrapper for Python. 
You can use the .B pbs_python wrapper that is shipped with PBS to debug hooks. Either: .RS 5 Use the .I --hook option to .B pbs_python to run .B pbs_python as a wrapper to Python, employing the .B pbs_python options. With the .I --hook option, you cannot use the standard Python options. The rest of this man page covers how to use .B pbs_python with the .I --hook option. Do not use the .I --hook option, so .B pbs_python runs the Python interpreter, with the standard Python options, and without access to the .B pbs_python options. .RE .B Debugging Hooks .br You can get each hook to write out debugging files, and then modify the files and use them as debugging input to .B pbs_python. Alternatively, you can write them yourself. Debugging files can contain information about the event, about the site, and about what the hook changed. You can use these as inputs to a hook when debugging. .SH Options to pbs_python .IP "--hook" 6 This option is a switch. When you use this option, you can use the PBS Python module (via "import pbs"), and the other options described here are available. When you use this option, you cannot use the standard Python options. This option is useful for debugging. When you do not use this option, you cannot use the other options listed here, but you can use the standard Python options. .IP "-e " 6 Sets the mask that determines which event types are logged by .B pbs_python. To see only debug messages, set the value to 0xd80. To see all messages, set the value to 0xffff. .br The .B pbs_python interpreter uses the same set of mask values that are used for the .I $logevent entry in the .B pbs_mom configuration file. See the pbs_mom.8B man page. Available only when .I --hook option is used. .IP "-i " 6 Text file containing data to populate pbs.event() objects. Each line specifies an attribute value or a resource value. 
Syntax of each input line is one of the following: .RS 10 .= .br .[]= .RE .IP Where .RS 10 is a PBS object name which can refer to its sub-objects. Examples: "pbs.event()", "pbs.event().job", "pbs.event().vnode_list[""]". .RE .IP Example input file: .RS 10 .br pbs.event().hook_name=proto .br pbs.event().hook_type=site .br pbs.event().type=queuejob .br pbs.event().requestor=user1 .br pbs.event().requestor_host=host1 .br pbs.event().hook_alarm=40 .br pbs.event().job.id=72 .br pbs.event().job.Job_Name=job1 .br pbs.event().job.Resource_List[ncpus]=5 .br pbs.event().job.Resource_List[mem]=6mb .br pbs.event().vnode_list["host1"].resources_available["ncpus"] = 5 .br pbs.event().vnode_list["host1"].resources_available["mem"] = 300gb .RE .IP Available only when .I --hook option is used. .IP "-L " 6 Directory holding the log file where pbs.logmsg() and pbs.logjobmsg() write their output. Default is current working directory where .B pbs_python is executed. Available only when .I --hook option is used. .IP "-l " 6 Log file where pbs.logmsg() and pbs.logjobmsg() write their output. Default file name is current date in .I yyyymmdd format. Available only when .I --hook option is used. .IP "-o " 6 The hook execution record contains the changes made after executing the hook script, such as the attributes and resources set in any pbs.event() jobs and reservations, whether an action was accepted or rejected, and any pbs.reject() messages. .br Example hook execution record: .RS 10 .br pbs.event().job.Job_Name=job2 .br pbs.event().job.Resource_List[file]=60gb .br pbs.event().job.Resource_List[ncpus]=5 .br pbs.event().job.Resource_List[mem]=20gb .br pbs.event().job.Account_Name=account2 .br pbs.event().reject=True .br pbs.event().reject_msg=No way! .RE .IP Without this option, output goes to stdout. .IP Available only when .I --hook option is used. .IP "-r " 6 File/path name containing a resource definition specifying a custom resource whose Python type is .I pbs.resource. 
.br Format: .br .I type= [flag=] .br This file has the same format as the PBS_HOME/server_priv/resourcedef file. Available only when .I --hook option is used. .IP "-s " 6 The site data file can contain any relevant information about the server, queues, vnodes, and jobs at the server. This file can be written by a hook or by the administrator. .br When the hook writes it, this file contains the values that populate the server, queues, vnodes, reservations, and jobs, with all attributes and resources for which there are values. .br The site data file is named .I hook___.data. Available only when .I --hook option is used. .IP "--version" 6 The .B pbs_python command prints its version information and exits. This option can only be used alone. .SH ARGUMENTS .IP "" 6 The hook script to execute. We recommend importing the PBS Python module at the start of the script: .RS 9 import pbs .RE .IP If you do not specify .I Python script, you can perform interactive debugging. If you type the following: .RS 9 pbs_python --hook -i hook.input .RE .IP The interpreter displays a prompt: .RS 9 >> .RE .IP You can type your Python lines at the prompt: .RS 9 >>import pbs .br >> e=pbs.event().job .br >> print e.id .br .br ... .RE ================================================ FILE: doc/man1/pbs_queue_attributes.7B ================================================ .\" .\" Copyright (C) 1994-2021 Altair Engineering, Inc. .\" For more information, contact Altair at www.altair.com. .\" .\" This file is part of both the OpenPBS software ("OpenPBS") .\" and the PBS Professional ("PBS Pro") software. .\" .\" Open Source License Information: .\" .\" OpenPBS is free software. You can redistribute it and/or modify it under .\" the terms of the GNU Affero General Public License as published by the .\" Free Software Foundation, either version 3 of the License, or (at your .\" option) any later version. 
.\" .\" OpenPBS is distributed in the hope that it will be useful, but WITHOUT .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public .\" License for more details. .\" .\" You should have received a copy of the GNU Affero General Public License .\" along with this program. If not, see . .\" .\" Commercial License Information: .\" .\" PBS Pro is commercially licensed software that shares a common core with .\" the OpenPBS software. For a copy of the commercial license terms and .\" conditions, go to: (http://www.pbspro.com/agreement.html) or contact the .\" Altair Legal Department. .\" .\" Altair's dual-license business model allows companies, individuals, and .\" organizations to create proprietary derivative works of OpenPBS and .\" distribute them - whether embedded or bundled with other software - .\" under a commercial license agreement. .\" .\" Use of Altair's trademarks, including but not limited to "PBS™", .\" "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is .\" subject to Altair's trademark licensing policies. .\" .TH pbs_queue_attributes 7B "6 May 2020" Local "PBS Professional" .SH NAME pbs_queue_attributes \- Attributes of PBS queues .SH DESCRIPTION Queues have the following attributes: .IP acl_group_enable 8 Controls whether group access to the queue obeys the access control list defined in the .I acl_groups queue attribute. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Boolean .br Python type: .I bool .br Behavior: .RS .IP True 3 Group access to the queue is limited according to the group access control list. .IP False 3 All groups are allowed access. .RE .IP Default: .I False .IP acl_groups 8 List of groups which are allowed or denied access to this queue. The groups in the list are groups on the server host, not submitting hosts. 
List is evaluated left-to-right; first match in list is used. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Syntax: "[+|-][, ...]" .br Python type: .I pbs.acl .br Default: No default .IP acl_host_enable 8 Controls whether host access to the queue obeys the access control list defined in the .I acl_hosts queue attribute. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Boolean .br Python type: .I bool .br Behavior: .RS .IP True 3 Host access to the queue is limited according to the host access control list. .IP False 3 All hosts are allowed access. .RE .IP Default: .I False .IP acl_hosts 8 List of hosts from which jobs may be submitted to this queue. List is evaluated left-to-right; first match in list is used. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Syntax: "[+|-][, ...]" .br Python type: .I pbs.acl .br Default: No default .IP acl_user_enable 8 Controls whether user access to the queue obeys the access control list defined in the .I acl_users queue attribute. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Boolean .br Python type: .I bool .br Behavior: .RS .IP True 3 User access to the queue is limited according to the user access control list. .IP False 3 All users are allowed access. .RE .IP Default: .I False .IP acl_users 8 List of users which are allowed or denied access to this queue. List is evaluated left-to-right; first match in list is used. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Syntax: "[+|-][, ...]" .br Python type: .I pbs.acl .br Default: No default .IP alt_router 8 No longer used. .IP backfill_depth 8 Modifies backfilling behavior for this queue. 
Sets the number of jobs to be backfilled around in this queue. Overrides .I backfill_depth server attribute. .br Recommendation: set this to less than .I 100. .br Applies to execution queues. .br Readable by all; settable by all. .br Format: .I Integer .br Valid values: Must be >=0 .br Behavior: .RS .IP ">= 0" 3 PBS backfills around the specified number of jobs. .IP "Unset" 3 Backfill depth is set to .I 1. .RE .IP Python type: .I int .br Default: Unset (backfill depth is 1) .IP checkpoint_min 8 Minimum number of minutes of CPU time or walltime allowed between checkpoints of a job. If a user specifies a time less than this value, this value is used instead. The value given in .I checkpoint_min is used for both CPU minutes and walltime minutes. .br Applies to execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Integer .br Python type: .I pbs.duration .br Default: No default .IP default_chunk 8 The list of resources which will be inserted into each chunk of a job's select specification if the corresponding resource is not specified by the user. This provides a means for a site to be sure a given resource is properly accounted for even if not specified by the user. .br Applies to execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Syntax: .RS 11 .nf .I default_chunk.= .I [, default_chunk.=, ...] .fi .RE .IP Python type: .I pbs.pbs_resource .br Syntax: .RS 11 default_chunk[""]= .br where .I resource name is any built-in or custom resource .RE .IP Default: No default .IP enabled 8 Specifies whether this queue accepts new jobs. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Boolean .br Python type: .I bool .br Behavior: .RS .IP True 3 This queue is enabled. This queue accepts new jobs; new jobs can be enqueued. .IP False 3 This queue does not accept new jobs. 
.RE .IP Default: .I False (disabled) .IP from_route_only 8 Specifies whether this queue accepts jobs only from routing queues, or from both execution and routing queues. .br Applies to routing and execution queues. .br Readable by all; settable by Manager. .br Format: .I Boolean .br Python type: .I bool .br Behavior: .RS .IP True 3 This queue accepts jobs only from routing queues. .IP False 3 This queue accepts jobs from both execution and routing queues, as well as directly from submitter. .RE .IP Default: .I False .IP hasnodes 8 Indicates whether vnodes are associated with this queue. .br Applies to execution queues. .br Readable by all; set by PBS. .br Format: .I Boolean .br Python type: .I bool .br Behavior: .RS 11 When .I True, there are vnodes associated with this queue. .RE .IP Default: .I False .IP kill_delay 8 The time delay between sending SIGTERM and SIGKILL when a qdel command is issued against a running job. .br Applies to execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Integer .br Units: .I Seconds .br Python type: .I pbs.duration .br Valid values: Must be >= 0 .br Default: .I 10 seconds .IP max_array_size 8 The maximum number of subjobs that are allowed in an array job. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Integer .br Python type: .I int .br Default: No default .IP max_group_res 8 Old limit attribute. Incompatible with new limit attributes. The maximum amount of the specified resource that any single group may consume in a complex. .br Applies to execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Syntax: .I max_group_res.= .br Python type: .I pbs.pbs_resource .br Syntax: .RS 11 max_group_res[""]= .br where .I resource name is any built-in or custom resource .RE .IP Valid values: Any PBS resource, e.g. 
"ncpus", "mem", "pmem" .br Example: .I set server max_group_res.ncpus=6 .br Default: No default .IP max_group_res_soft 8 Old limit attribute. Incompatible with new limit attributes. The soft limit on the amount of the specified resource that any single group may consume in a complex. If a group is consuming more than this amount of the specified resource, their jobs are eligible to be preempted by jobs from groups who are not over their soft limit. .br Applies to execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Syntax: .I max_group_res_soft.= .br Python type: .I pbs.pbs_resource .br Syntax: .RS 11 max_group_res_soft[""]= .br where .I resource name is any built-in or custom resource .RE .IP Valid values: Any PBS resource, e.g. "ncpus", "mem", "pmem" .br Example: .I set queue workq max_group_res_soft.ncpus=3 .br Default: No default .IP max_group_run 8 Old limit attribute. Incompatible with new limit attributes. The maximum number of jobs owned by a group that are allowed to be running from this queue at one time. .br Applies to execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Integer .br Python type: .I int .br Default: No default .IP max_group_run_soft 8 Old limit attribute. Incompatible with new limit attributes. The maximum number of jobs owned by users in a single group that are allowed to be running from this queue at one time. If a group has more than this number of jobs running, their jobs are eligible to be preempted by jobs from groups who are not over their soft limit. .br Applies to execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Integer .br Python type: .I int .br Default: No default .IP max_queuable 8 Old limit attribute. Incompatible with new limit attributes. The maximum number of jobs allowed to reside in this queue at any given time. .br Applies to routing and execution queues. 
.br Readable by all; settable by Operator and Manager. .br Format: .I Integer .br Python type: .I int .br Default: No default (no limit) .IP max_queued 8 Limit attribute. The maximum number of jobs allowed to be queued in or running from this queue. Can be specified for projects, users, groups, or all. Cannot be used with old limit attributes. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br .br Format: .I Limit specification. See .B FORMATS. .br Python type: .I pbs.pbs_resource .br Syntax: .RS 11 max_queued[""]= .br where .I resource name is any built-in or custom resource .RE .IP Default: No default .IP max_queued_res 8 Limit attribute. The maximum amount of the specified resource allowed to be allocated to jobs queued in or running from this queue. Can be specified for projects, users, groups, or all. Cannot be used with old limit attributes. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Limit specification. See .B FORMATS. .br Syntax: .I max_queued_res.= .br Python type: .I pbs.pbs_resource .br Syntax: .RS 11 max_queued_res[""]= .br where .I resource name is any built-in or custom resource .RE .IP Valid values: Any PBS resource, e.g. "ncpus", "mem", "pmem" .br Example: .I set queue workq max_queued_res.ncpus=4 .br Default: No default .IP max_run 8 Limit attribute. The maximum number of jobs allowed to be running from this queue. Can be specified for projects, users, groups, or all. Cannot be used with old limit attributes. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Limit specification. See .B FORMATS. .br Python type: .I pbs.pbs_resource .br Syntax: .RS 11 max_run[""]= .br where .I resource name is any built-in or custom resource .RE .IP Default: No default .IP max_run_res 8 Limit attribute. 
The maximum amount of the specified resource allowed to be allocated to jobs running from this queue. Can be specified for projects, users, groups, or all. Cannot be used with old limit attributes. .br Applies to execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Limit specification. See .B FORMATS. .br Syntax: .I max_run_res.= .br Python type: .I pbs.pbs_resource .br Syntax: .RS 11 max_run_res[""]= .br where .I resource name is any built-in or custom resource .RE .IP Valid values: Any PBS resource, e.g. "ncpus", "mem", "pmem" .br Example: .I set queue workq max_run_res.ncpus=4 .br Default: No default .IP max_run_res_soft 8 Limit attribute. Soft limit on the amount of the specified resource allowed to be allocated to jobs running from this queue. Can be specified for projects, users, groups, or all. Cannot be used with old limit attributes. .br Applies to execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Limit specification. See .B FORMATS. .br Syntax: .I max_run_res_soft.= .br Python type: .I pbs.pbs_resource .br Syntax: .RS 11 max_run_res_soft[""]= .br where .I resource name is any built-in or custom resource .RE .IP Valid values: Any PBS resource, e.g. "ncpus", "mem", "pmem" .br Example: .I set queue workq max_run_res_soft.ncpus=2 .br Default: No default .IP max_run_soft 8 Limit attribute. Soft limit on the number of jobs allowed to be running from this queue. Can be specified for projects, users, groups, or all. Cannot be used with old limit attributes. .br Applies to execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Limit specification. See .B FORMATS. .br Python type: .I pbs.pbs_resource .br Syntax: .RS 11 max_run_soft[""]= .br where .I resource name is any built-in or custom resource .RE .IP Default: No default .IP max_running 8 Old limit attribute. Incompatible with new limit attributes. 
For an execution queue, this is the largest number of jobs allowed to be running at any given time. For a routing queue, this is the largest number of jobs allowed to be transiting from this queue at any given time. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Integer .br Python type: .I int .br Default: No default .IP max_user_res 8 Old limit attribute. Incompatible with new limit attributes. The maximum amount of the specified resource that any single user may consume. .br Applies to execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Syntax: .I max_user_res.= .br Python type: .I pbs.pbs_resource .br Syntax: .RS 11 max_user_res[""]= .br where .I resource name is any built-in or custom resource .RE .IP Valid values: Any PBS resource, e.g. "ncpus", "mem", "pmem" .br Example: .I set queue workq max_user_res.ncpus=2 .br Default: No default .IP max_user_res_soft 8 Old limit attribute. Incompatible with new limit attributes. The soft limit on the amount of the specified resource that any single user may consume. If a user is consuming more than this amount of the specified resource, their jobs are eligible to be preempted by jobs from users who are not over their soft limit. .br Applies to execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Syntax: .I max_user_res_soft.= .br Python type: .I pbs.pbs_resource .br Syntax: .RS 11 max_user_res_soft[""]= .br where .I resource name is any built-in or custom resource .RE .IP Valid values: Any PBS resource, e.g. "ncpus", "mem", "pmem" .br Example: .I set queue workq max_user_res_soft.ncpus=2 .br Default: No default .IP max_user_run 8 Old limit attribute. Incompatible with new limit attributes. The maximum number of jobs owned by a single user that are allowed to be running from this queue at one time. .br Applies to execution queues. 
.br Readable by all; settable by Operator and Manager. .br Format: .I Integer .br Python type: .I int .br Default: No default .IP max_user_run_soft 8 Old limit attribute. Incompatible with new limit attributes. The soft limit on the number of jobs owned by any single user that are allowed to be running from this queue at one time. If a user has more than this number of jobs running, their jobs are eligible to be preempted by jobs from users who are not over their soft limit. .br Applies to execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Integer .br Python type: .I int .br Default: No default .IP node_group_key 8 Specifies the resources to use for placement sets. Overrides server's .I node_group_key attribute. Specified resources must be of type .I string_array. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I string_array .br Syntax: .RS 11 .I Comma-separated list of resource names. .br When specifying multiple resources, enclose value in double quotes. .RE .IP Python type: .I pbs.node_group_key .br Example: .RS 11 Qmgr> set queue workq node_group_key= .RE .IP Default: No default .IP partition 8 Name of partition to which this queue is assigned. Cannot be set for routing queue. An execution queue cannot be changed to a routing queue while this attribute is set. .br Applies to execution queues. .br Readable by all; settable by Manager. .br Format: .I String .br Python type: .I str .br Default: No default .IP Priority 8 The priority of this queue compared to other queues of the same type in this PBS complex. Priority can define a queue as an express queue. See .I preempt_queue_prio in the pbs_sched(8B) man page. Used for execution queues only; the value of .I Priority has no meaning for routing queues. .br Applies to execution queues. .br Readable by all; settable by Operator and Manager. 
.br Format: .I Integer .br Valid values: -1024 to 1023 .br Python type: .I int .br Default: No default .IP queued_jobs_threshold 8 Limit attribute. The maximum number of jobs allowed to be queued in this queue. Can be specified for projects, users, groups, or all. Cannot be used with old limit attributes. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Limit specification; See .B LIMITS .br Python type: .I pbs.pbs_resource .br Syntax: .RS 11 queued_jobs_threshold[""]= .br where .I resource name is any built-in or custom resource .RE .IP Default: No default .IP queued_jobs_threshold_res 8 Limit attribute. The maximum amount of the specified resource allowed to be allocated to jobs queued in this queue. Can be specified for projects, users, groups, or all. Cannot be used with old limit attributes. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I limit specification. See .B LIMITS .br Syntax: .I queued_jobs_threshold_res.= .br Python type: .I pbs.pbs_resource .br Syntax: .RS 11 queued_jobs_threshold_res_[""]= .br where .I resource name is any built-in or custom resource .RE .IP Valid values: Any PBS resource, e.g. "ncpus", "mem", "pmem" .br Example: .I set queue workq queued_jobs_threshold_res.ncpus=8 .br Default: No default .IP queue_type 8 The type of the queue. This attribute must be explicitly set at queue creation. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Python type: .RS 11 PBS queue type constant: .I pbs.QUEUETYPE_EXECUTION or .I pbs.QUEUETYPE_ROUTE .RE .IP Valid values: "e", "execution", "r", "route" .br Default: No default .IP require_cred 8 Specifies the credential type required. All jobs submitted to the named queue without the specified credential will be rejected. .br Applies to routing and execution queues. 
.br Readable by all; settable by Manager. .br Format: .I String .br Python type: .I str .br Valid values: .I krb5 or .I dce .br Default: Unset .IP require_cred_enable 8 Specifies whether the credential authentication method specified in the .I require_cred queue attribute is required for this queue. .br Applies to routing and execution queues. .br Readable by all; settable by Manager. .br Format: .I Boolean .br Python type: .I bool .br Behavior: .RS 11 When set to .I True, the credential authentication method is required. .RE .IP Default: .I False .IP resources_assigned 8 The total for each kind of resource allocated to running and exiting jobs in this queue. .br Applies to execution queues. .br Readable by all; set by PBS. .br Format: .I String .br Syntax: .RS 11 .nf .I resources_assigned.= .I resources_assigned.= ... .fi .RE .IP Python type: .I pbs.pbs_resource .br Syntax: .RS 11 resources_assigned[""]= .br where .I resource name is any built-in or custom resource .RE .IP Default value: No default .IP resources_available 8 The list of resources and amounts available to jobs running in this queue. The sum of the resource of each type used by all jobs running from this queue cannot exceed the total amount listed here. See the .I qmgr(1B) man page. .br Applies to execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Syntax: .RS 11 .nf .I resources_available.= .I resources_available.= ... .fi .RE .IP Python type: .I pbs.pbs_resource .br Syntax: .RS 11 resources_available[""]= .br where .I resource name is any built-in or custom resource .RE .IP Default value: No default .IP resources_default 8 The list of default resource values which are set as limits for a job residing in this queue and for which the job did not specify a limit. 
If not explicitly set, the default limit for a job is determined by the first of the following attributes which is set: server's .I resources_default, queue's .I resources_max, server's .I resources_max. If none of these is set, the job gets unlimited resource usage. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Syntax: .RS 11 .nf .I resources_default.=, .I resources_default.=, ... .fi .RE .IP Python type: .I pbs.pbs_resource .br Syntax: .RS 11 resources_default[""]= .br where .I resource name is any built-in or custom resource .RE .IP Default value: No default .IP resources_max 8 The maximum amount of each resource that can be requested by a single job in this queue. The queue value supersedes any server wide maximum limit. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Syntax: .RS 11 .nf .I resources_max.=, .I resources_max.=, ... .fi .RE .IP Python type: .I pbs.pbs_resource .br Syntax: .RS 11 resources_max[""]= .br where .I resource name is any built-in or custom resource .RE .IP Default value: No default (infinite usage) .IP resources_min 8 The minimum amount of each resource that can be requested by a single job in this queue. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. .br Format: .I String .br Syntax: .RS 11 .nf .I resources_min.=, .I resources_min.=, ... .fi .RE .IP Python type: .I pbs.pbs_resource .br Syntax: .RS 11 resources_min[""]= .br where .I resource name is any built-in or custom resource .RE .IP Default value: No default (zero usage) .IP route_destinations 8 The list of destinations to which jobs may be routed. .br Must be set to at least one valid destination. .br Applies to routing queues. .br Readable by all; settable by Manager. 
.br Format: .I String .br Syntax: .RS 11 List of comma-separated strings: .br .I [@[:]] .RE .IP Python type: .I pbs.route_destinations .br Example: .I Q1,Q2@remote,Q3@remote:15501 .br Default: No default .IP route_held_jobs 8 Specifies whether jobs in the .I held state can be routed from this queue. .br Applies to routing queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Boolean .br Python type: .I bool .br Behavior: .RS 11 When .I True, jobs with a hold can be routed from this queue. .RE .IP Default: .I False .IP route_lifetime 8 The maximum time a job is allowed to reside in this routing queue. If a job cannot be routed in this amount of time, the job is aborted. .br Applies to routing queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Integer .br Units: .I Seconds .br Python type: .I pbs.duration .br Behavior: .RS .IP >0 3 Jobs can reside for specified number of seconds .IP "0 or unset" 3 Jobs can reside for infinite time .RE .IP Default: Unset .IP route_retry_time 8 Time delay between routing retries. Typically used when the network between servers is down. .br Applies to routing queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Integer .br Units: .I Seconds .br Python type: .I pbs.duration .br Default: .I 30 seconds .IP route_waiting_jobs 8 Specifies whether jobs whose .I Execution_Time attribute value is in the future can be routed from this queue. .br Applies to routing queues. .br Readable by all; settable by Operator and Manager. .br Format: .I Boolean .br Python type: .I bool Behavior: .RS 11 When .I True, jobs with a future .I Execution_Time can be routed from this queue. .RE .IP Default: .I False .IP started 8 If this is an execution queue, specifies whether jobs in this queue can be scheduled for execution, or if this is a routing queue, whether jobs can be routed. .br Applies to routing and execution queues. .br Readable by all; settable by Operator and Manager. 
.br Format: .I Boolean .br Python type: .I bool .br Behavior: When .I True, jobs in this queue can run or be routed. .br Default: .I False .IP state_count 8 The total number of jobs in each state currently residing in this queue. .br Applies to routing and execution queues. .br Readable by all; set by PBS. .br Format: .I String .br Syntax: .I transiting=,exiting=, ... .br Python type: .I pbs.state_count .br Default: No default .IP total_jobs 8 The number of jobs currently residing in this queue. .br Applies to routing and execution queues. .br Readable by all; set by PBS. .br Format: .I Integer .br Python type: .I int .br Default: No default .SH FORMATS .IP "Limit specification" 8 Limit attributes can be set, added to, or removed from. Format for setting a limit specification: .RS 11 .nf set server = "=[, =] ..." .fi .RE .IP Format for adding to a limit specification: .RS 11 .nf set server += "=[, =] ..." .fi .RE .IP Format for removing from a limit specification: .RS 11 .nf set server -= "=[, [=] ..." .br or .br set server -= "[, ] ..." .fi .RE .IP Where .I limit specification is .RS 11 o:PBS_ALL Overall limit .br u:PBS_GENERIC Generic users .br u: A specific user .br g:PBS_GENERIC Generic groups .br g: A specific group .br p:PBS_GENERIC Generic projects .br p: A specific project .RE .IP The .I limit specification can contain spaces anywhere except after the colon (":"). .br If there are comma-separated .I limit specifications, the entire string must be enclosed in double quotes. .br A username, groupname, or project name containing spaces must be enclosed in quotes. .br If a username, groupname, or project name is quoted using double quotes, and the entire string requires quotes, the outer enclosing quotes must be single quotes. Similarly, if the inner quotes are single quotes, the outer quotes must be double quotes. .br .I PBS_ALL is a keyword which indicates that this limit applies to the usage total. 
.br .I PBS_GENERIC is a keyword which indicates that this limit applies to generic users, groups, or projects. .br When removing a limit, the .I limit value does not need to be specified. .br For example, to set the .I max_queued limit on QueueA to 5 for total usage, and to limit user bill to 3: .RS 11 s q QueueA max_queued = "[o:PBS_ALL=5], [u:bill =3]" .RE .IP Examples of setting, adding, and removing: .br .RS 11 set server max_run="[u:PBS_GENERIC=2], [g:group1=10], [o:PBS_ALL = 100]" .br set server max_run+="[u:user1=3], [g:PBS_GENERIC=8]" .br set server max_run-="[u:user2], [g:group3]" .br set server max_run_res.ncpus="[u:PBS_GENERIC=2], [g:group1=8], [o:PBS_ALL = 64]" .RE .IP .SH Incompatible Limit Attributes The old and new limit attributes are incompatible. If any of one kind is set, none of the other kind can be set. All of one kind must be unset in order to set any of the other kind. .br .SH SEE ALSO qmgr(1B) ================================================ FILE: doc/man1/pbs_ralter.1B ================================================ .\" .\" Copyright (C) 1994-2021 Altair Engineering, Inc. .\" For more information, contact Altair at www.altair.com. .\" .\" This file is part of both the OpenPBS software ("OpenPBS") .\" and the PBS Professional ("PBS Pro") software. .\" .\" Open Source License Information: .\" .\" OpenPBS is free software. You can redistribute it and/or modify it under .\" the terms of the GNU Affero General Public License as published by the .\" Free Software Foundation, either version 3 of the License, or (at your .\" option) any later version. .\" .\" OpenPBS is distributed in the hope that it will be useful, but WITHOUT .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public .\" License for more details. .\" .\" You should have received a copy of the GNU Affero General Public License .\" along with this program. If not, see . 
.\" .\" Commercial License Information: .\" .\" PBS Pro is commercially licensed software that shares a common core with .\" the OpenPBS software. For a copy of the commercial license terms and .\" conditions, go to: (http://www.pbspro.com/agreement.html) or contact the .\" Altair Legal Department. .\" .\" Altair's dual-license business model allows companies, individuals, and .\" organizations to create proprietary derivative works of OpenPBS and .\" distribute them - whether embedded or bundled with other software - .\" under a commercial license agreement. .\" .\" Use of Altair's trademarks, including but not limited to "PBS™", .\" "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is .\" subject to Altair's trademark licensing policies. .\" .TH pbs_ralter 1B "28 February 2021" Local "PBS Professional" .SH NAME .B pbs_ralter \- modify an existing reservation .SH SYNOPSIS .B pbs_ralter [-D ] [-E ] [-G ] [-I ] [-l select=" 10 (Lowercase L) Specifies new select specification for reservation. New specification must be a subset of the same chunks requested by the original reservation. If reservation is started, cannot be used to release chunks where reservation jobs are running. If reservation is started and degraded, you must release all unavailable chunks in order to alter the reservation select specification. .IP "-m " 10 Specifies the set of events that cause mail to be sent to the list of users specified in the .I -M option. Format: .I String .br Syntax: Either of .RS 13 1) any combination of "a", "b", "c" or "e", or .br 2) the single character "n" .RE .IP .nf Suboptions to -m Option: Character Meaning -------------------------------------------------------------- a Notify if reservation is terminated for any reason b Notify when the reservation period begins c Notify when the reservation is confirmed e Notify when the reservation period ends n Send no mail. Cannot be used with any of a, b, c or e. 
.fi Default: No default; if not specified, mail events are unchanged .IP "-M " 10 The list of users to whom mail is sent whenever the reservation transitions to one of the states specified in the .I -m option. Format: .I [@][,[@]...] Default: No default; if not specified, user list is unchanged .IP "-N " 10 Specifies a name for the reservation. Format: .RS 13 String up to 15 characters in length. It must consist of printable, non-white space characters with the first character alphabetic. .RE .IP Default: No default; if not specified, reservation name is unchanged .IP "-R " 10 Specifies reservation's new start time. This option can be used either when the reservation is not running or there are no jobs are submitted to the reservation. You cannot use this option when a reservation is not empty and has started running. The specifications for providing the time are the same as for pbs_rsub: .br If the day, .I DD, is not specified, it defaults to today if the time .I hhmm is in the future. Otherwise, the day is set to tomorrow. For example, if you alter a reservation with the specification -R 1110 at 11:15 a.m., it is interpreted as being for 11:10 a.m. tomorrow. If the month portion, .I MM, is not specified, it defaults to the current month, provided that the specified day .I DD, is in the future. Otherwise, the month is set to next month. Similar rules apply to the two other optional, left-side components. Format: .I Datetime .IP "-U " 10 Comma-separated list of users who are and are not allowed to submit jobs to this reservation. Sets reservation's .I Authorized_Users attribute to .I auth user list. .br This list becomes the .I acl_users attribute for the reservation's queue. .br More specific entries should be listed before more general, because the list is read left-to-right, and the first match determines access. The reservation creator's username is automatically added to this list, whether or not the reservation creator specifies this list. 
.br If both the .I Authorized_Users and .I Authorized_Groups reservation attributes are set, a user must belong to both in order to be able to submit jobs to this reservation. .br See the .I Authorized_Users reservation attribute in the pbs_resv_attributes(7B) man page. .br Syntax: .I [+|-][@][,[+|-][@]...] .br Default: no default; user list is unchanged .br .IP "-Wforce" 10 Enforces changes made to the reservation start time, end time, or duration, regardless of the actions of the scheduler. Can be used only by the PBS Administrator. Note that with this option you can force PBS to oversubscribe resources, in which case you (the administrator) may need to manage them yourself. Cannot be used to change the start time of a reservation in which jobs are running. .IP "--version" 10 The .B pbs_ralter command returns its PBS version information and exits. This option can only be used alone. .SH OPERANDS The pbs_ralter command takes a reservation ID. .br For an advance or job-specific reservation this has the form: .RS 4 .I "R[.][@]" .RE For a standing reservation this has the form: .RS 4 .I "S[.][@]" .RE For a maintenance reservation this has the form: .RS 4 .I "M[.][@]" .RE .I @ specifies a reservation at a server other than the default server. ================================================ FILE: doc/man1/pbs_rdel.1B ================================================ .\" .\" Copyright (C) 1994-2021 Altair Engineering, Inc. .\" For more information, contact Altair at www.altair.com. .\" .\" This file is part of both the OpenPBS software ("OpenPBS") .\" and the PBS Professional ("PBS Pro") software. .\" .\" Open Source License Information: .\" .\" OpenPBS is free software. You can redistribute it and/or modify it under .\" the terms of the GNU Affero General Public License as published by the .\" Free Software Foundation, either version 3 of the License, or (at your .\" option) any later version. 
.\" .\" OpenPBS is distributed in the hope that it will be useful, but WITHOUT .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public .\" License for more details. .\" .\" You should have received a copy of the GNU Affero General Public License .\" along with this program. If not, see . .\" .\" Commercial License Information: .\" .\" PBS Pro is commercially licensed software that shares a common core with .\" the OpenPBS software. For a copy of the commercial license terms and .\" conditions, go to: (http://www.pbspro.com/agreement.html) or contact the .\" Altair Legal Department. .\" .\" Altair's dual-license business model allows companies, individuals, and .\" organizations to create proprietary derivative works of OpenPBS and .\" distribute them - whether embedded or bundled with other software - .\" under a commercial license agreement. .\" .\" Use of Altair's trademarks, including but not limited to "PBS™", .\" "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is .\" subject to Altair's trademark licensing policies. .\" .TH pbs_rdel 1B "6 May 2020" Local "PBS Professional" .SH NAME .B pbs_rdel \- delete a PBS reservation .SH SYNOPSIS .B pbs_rdel [,...] .br .B pbs_rdel --version .SH DESCRIPTION The .B pbs_rdel command deletes reservations in the order specified. This command deletes the specified reservations, whether or not they are running, all jobs in the reservations, and the reservation queues. .B Required Privilege .br A reservation may be deleted by its owner, a PBS Operator, or a PBS Manager. .SH OPTIONS .IP "--version" 10 The .B pbs_rdel command returns its PBS version information and exits. This option can only be used alone. .SH OPERANDS The pbs_rdel command accepts one or more .I reservation ID operands. 
.br For an advance or job-specific reservation this has the form: .RS 4 .I "R[.][@]" .RE For a standing reservation this has the form: .RS 4 .I "S[.][@]" .RE For a maintenance reservation this has the form: .RS 4 .I "M[.][@]" .RE .I @ specifies a reservation at a server other than the default server. .SH EXIT STATUS .IP "Zero" 10 Upon success .IP "Greater than zero" 10 Upon failure to process any operand .SH SEE ALSO pbs_rsub(1B), pbs_rstat(1B), pbs_resv_attributes(7B) ================================================ FILE: doc/man1/pbs_release_nodes.1B ================================================ .\" .\" Copyright (C) 1994-2021 Altair Engineering, Inc. .\" For more information, contact Altair at www.altair.com. .\" .\" This file is part of both the OpenPBS software ("OpenPBS") .\" and the PBS Professional ("PBS Pro") software. .\" .\" Open Source License Information: .\" .\" OpenPBS is free software. You can redistribute it and/or modify it under .\" the terms of the GNU Affero General Public License as published by the .\" Free Software Foundation, either version 3 of the License, or (at your .\" option) any later version. .\" .\" OpenPBS is distributed in the hope that it will be useful, but WITHOUT .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public .\" License for more details. .\" .\" You should have received a copy of the GNU Affero General Public License .\" along with this program. If not, see . .\" .\" Commercial License Information: .\" .\" PBS Pro is commercially licensed software that shares a common core with .\" the OpenPBS software. For a copy of the commercial license terms and .\" conditions, go to: (http://www.pbspro.com/agreement.html) or contact the .\" Altair Legal Department. 
.\" .\" Altair's dual-license business model allows companies, individuals, and .\" organizations to create proprietary derivative works of OpenPBS and .\" distribute them - whether embedded or bundled with other software - .\" under a commercial license agreement. .\" .\" Use of Altair's trademarks, including but not limited to "PBS™", .\" "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is .\" subject to Altair's trademark licensing policies. .\" .TH pbs_release_nodes 1B "6 May 2020" Local "PBS Professional" .SH NAME .B pbs_release_nodes \- release vnodes assigned to a PBS job .SH SYNOPSIS .B pbs_release_nodes [-j ] .RS 18 .br [-k ( | .br )] .br [ [] ...] .RE .br .B pbs_release_nodes [-j ] -a .br .B pbs_release_nodes --version .SH DESCRIPTION You can use the .B pbs_release_nodes command to release no-longer-needed sister hosts or vnodes assigned to a running job, before the job would normally release them. These vnodes are then available for use by other jobs. You can specify the names of sister vnodes to be released, or you can release all sister vnodes not on the primary execution host that are assigned to a running job via the .I -a option. PBS can keep the number of sister hosts you specify, or PBS can release all sister vnodes except for the ones you specify via a select statement. Can be used on jobs and subjobs, but not on job arrays or ranges of subjobs. .B Caveats and Restrictions .br You can release only sister hosts or vnodes that are not on the primary execution host. You cannot release vnodes on the primary execution host. The job must be running (in the .I R state). The pbs_release_nodes command is not supported on vnodes tied to Cray X* series systems (vnodes whose .I vntype has the "cray_" prefix). If cgroups support is enabled, and pbs_release_nodes is called to release some but not all the vnodes managed by a MoM, resources on those vnodes that are part of a cgroup are not released until the entire cgroup is released.
You cannot release a partial host. If you try to release some but not all of a host, the job's .I exec_vnode attribute shows the new, smaller list of vnodes, but the pbsnodes command will reveal that the host is still allocated to the job. If you specify release of a vnode on which a job process is running, that process is terminated when the vnode is released. .B Required Privilege .br This command can be run by the job owner, the PBS Manager, Operator, and Administrator, as well as root on Linux and Admin on Windows. .SH Options to pbs_release_nodes .IP "-a" 10 Releases all job vnodes not on the primary execution host. Cannot be used .I -k option, or with list of vnode names. .IP "-j " 10 Specifies the job ID for the job or subjob whose vnode(s) are to be released. .IP "-k | " 10 Use .I keep number to specify how many sister hosts to keep. Use .I keep selection to specify which sister vnodes to keep. The .I keep selection is a select statement beginning with "select=" specifying which vnodes to keep. The primary execution host and its vnodes are not released. For example, to release all sister hosts except 8: .br .B \ \ \ pbs_release_nodes -k 8 .br To release all sister vnodes except for 4 of the ones marked with "bigmem": .br .B \ \ \ pbs_release_nodes -k select=4:bigmem=true Cannot be used with .I -a option or with vnode list argument. .IP "(no options)" 10 With no options, pbs_release_nodes uses the value of the .I PBS_JOBID environment variable as the job ID of the job whose vnodes are to be released. .IP "--version" 10 The pbs_release_nodes command returns its PBS version information and exits. This option can only be used alone. .SH Operands for pbs_release_nodes The pbs_release_nodes command can take as an operand a list of vnodes. Format: .br .I [ [] ...] .br Cannot be used with the .I -a option. .SH Usage This command can be run at the command line, or called inside a job script, where it can use the value of the .I PBS_JOBID environment variable. 
You can release any vnode that appears in the job's .I exec_vnode attribute that is not on the primary execution host. You can release a particular set of a job's vnodes, or you can release all of a job's non-primary-execution-host vnodes. To release specific vnodes: .br .B \ \ \ pbs_release_nodes [-j ] [] ...] To release all of a job's vnodes that are not on the primary execution host: .br .B \ \ \ pbs_release_nodes [-j ] -a To release all except a specified number of vnodes: .br .B \ \ \ pbs_release_nodes -k To release all vnodes except for those in a select specification: .br .B \ \ \ pbs_release_nodes -k %s' __s += [_s % ('all', 1, 'Show All')] __s += [_s % ('pass', 2, 'Show only "Passed"')] __s += [_s % ('skip', 3, 'Show only "Skipped"')] __s += [_s % ('fail', 4, 'Show only "Failed"')] __s += [_s % ('error', 5, 'Show only "Error"')] __s += [_s % ('timedout', 6, 'Show only "TimedOut"')] __s += [''] self.__dbobj[_TESTRESULT_TN].write('\n'.join(__s)) self.__dbobj[_TESTRESULT_TN].flush() def __write_test_data(self, data): if _TESTRESULT_TN not in self.__dbobj.keys(): self.__dbobj[_TESTRESULT_TN] = open(self.dbpath, 'w+') self.__write_test_html_header(data) d = {} d['suite'] = data['suite'] d['testcase'] = data['testcase'] d['status'] = data['status'] d['status_data'] = data['status_data'] d['duration'] = str(data['duration']) self.__dbobj[_TESTRESULT_TN].seek(0, os.SEEK_END) self.__dbobj[_TESTRESULT_TN].seek( self.__dbobj[_TESTRESULT_TN].tell() - 27, os.SEEK_SET) t = self.__dbobj[_TESTRESULT_TN].readline().strip() line = '' if t != '[': line += ',\n' else: line += '\n' line += str(d) + '\n];' self.__dbobj[_TESTRESULT_TN].seek(0, os.SEEK_END) self.__dbobj[_TESTRESULT_TN].seek( self.__dbobj[_TESTRESULT_TN].tell() - 26, os.SEEK_SET) self.__dbobj[_TESTRESULT_TN].write(line) self.__dbobj[_TESTRESULT_TN].flush() self.__index += 1 def write(self, data, logfile=None): if len(data) == 0: return if 'testdata' in data.keys(): self.__write_test_data(data['testdata']) def 
def close(self, result=None):
    """Flush, terminate, and close every open HTML result file."""
    # NOTE(review): this is the body of HTMLDb.close — the enclosing
    # class header lies outside this extract.
    for v in self.__dbobj.values():
        v.write('\n')
        v.flush()
        v.close()


class JSONDb(DBType):
    """
    JSON type database
    """

    def __init__(self, dbtype, dbpath, dbaccess):
        super(JSONDb, self).__init__(dbtype, dbpath, dbaccess)
        if self.dbtype != 'json':
            _msg = 'db type does not match with my type(json)'
            raise PTLDbError(rc=1, rv=False, msg=_msg)
        if not self.dbpath:
            _msg = 'Db path require!'
            raise PTLDbError(rc=1, rv=False, msg=_msg)
        elif not self.dbpath.endswith('.json'):
            # BUG FIX: the original used self.dbpath.rstrip('.db'), but
            # str.rstrip strips any trailing run of the *characters*
            # '.', 'd', 'b' (e.g. "results_bd" -> "results_"), not the
            # literal ".db" suffix.  Remove the exact suffix instead.
            if self.dbpath.endswith('.db'):
                self.dbpath = self.dbpath[:-len('.db')]
            self.dbpath += '.json'
        self.jdata = {}
        # Record the exact command line that produced this result set;
        # it is embedded in the JSON document by PTLJsonData.
        self.__cmd = [os.path.basename(sys.argv[0])]
        self.__cmd += sys.argv[1:]
        self.__cmd = ' '.join(self.__cmd)
        self.res_data = PTLJsonData(command=self.__cmd)

    def __write_test_data(self, data):
        # Merge the new test record into the accumulated document and
        # rewrite the whole file so it is valid JSON after every test.
        prev_data = copy.deepcopy(self.jdata)
        self.jdata = self.res_data.get_json(data=data, prev_data=prev_data)
        with open(self.dbpath, 'w') as fd:
            json.dump(self.jdata, fd, indent=2)
            fd.write("\n")

    def write(self, data, logfile=None):
        """Record one test result; empty payloads are ignored."""
        if len(data) == 0:
            return
        if 'testdata' in data.keys():
            self.__write_test_data(data['testdata'])

    def close(self, result=None):
        """Stamp overall run times into the document and do a final write."""
        if result is not None and self.jdata:
            dur = str(result.stop - result.start)
            self.jdata['result']['start'] = str(result.start)
            self.jdata['result']["end"] = str(result.stop)
            self.jdata['result']['duration'] = dur
            with open(self.dbpath, 'w') as fd:
                json.dump(self.jdata, fd, indent=2)
                fd.write("\n")


class PTLTestDb(Plugin):
    """
    PTL Test Database Plugin
    """
    name = 'PTLTestDb'
    score = sys.maxsize - 5
    logger = logging.getLogger(__name__)

    def __init__(self):
        Plugin.__init__(self)
        self.__dbconn = None
        self.__dbtype = None
        self.__dbpath = None
        self.__dbaccess = None
        # Map of user-selectable backend names to their DBType classes.
        self.__dbmapping = {'file': FileDb,
                            'html': HTMLDb,
                            'json': JSONDb,
                            'sqlite': SQLiteDb,
                            'pgsql': PostgreSQLDb}
        self.__du = DshUtils()

    def options(self, parser, env):
        """
        Register command line options
        """
        pass

    def set_data(self, dbtype, dbpath, dbaccess):
        """
        Set the data
        """
        self.__dbtype = dbtype
        self.__dbpath = dbpath
self.__dbaccess = dbaccess

# NOTE(review): the defs below are methods of PTLTestDb (its class
# statement appears earlier in the file).

def configure(self, options, config):
    """
    Configure the plugin and system, based on selected options

    :param options: Configuration options for ``plugin`` and ``system``
    """
    # Idempotent: a live connection means we are already configured.
    if self.__dbconn is not None:
        return
    if self.__dbtype is None:
        self.__dbtype = 'json'
    if self.__dbtype not in self.__dbmapping.keys():
        self.logger.error('Invalid db type: %s' % self.__dbtype)
        sys.exit(1)
    try:
        self.__dbconn = self.__dbmapping[self.__dbtype](self.__dbtype,
                                                        self.__dbpath,
                                                        self.__dbaccess)
    except PTLDbError as e:
        self.logger.error(str(e) + '\n')
        sys.exit(1)
    self.enabled = True

def __create_data(self, test, err=None, status=None):
    """
    Build the per-test result dictionary written to the backend.

    :param test: nose test (or context) wrapper object
    :param err: sys.exc_info()-style tuple for failures/errors, or None
    :param status: result label such as 'PASS', 'FAIL', 'ERROR'
    :returns: dict with 'testdata' (and optionally 'metrics_data') keys,
              or {} when the wrapper carries no recognizable test
    """
    if hasattr(test, 'test'):
        _test = test.test
        sn = _test.__class__.__name__
    elif hasattr(test, 'context'):
        test = _test = test.context
        sn = test.__name__
    else:
        return {}
    testdata = {}
    data = {}
    cur_time = datetime.datetime.now()
    if (hasattr(_test, 'server') and
            (getattr(_test, 'server', None) is not None)):
        testdata['pbs_version'] = _test.server.attributes['pbs_version']
        testdata['hostname'] = _test.server.hostname
    else:
        testdata['pbs_version'] = 'unknown'
        testdata['hostname'] = 'unknown'
    testdata['machinfo'] = self.__get_machine_info(_test)
    testdata['testparam'] = getattr(_test, 'param', None)
    testdata['suite'] = sn
    testdata['suitedoc'] = str(_test.__class__.__doc__)
    testdata['file'] = _test.__module__.replace('.', '/') + '.py'
    testdata['module'] = _test.__module__
    testdata['testcase'] = getattr(_test, '_testMethodName', '')
    testdata['testdoc'] = getattr(_test, '_testMethodDoc', '')
    testdata['start_time'] = getattr(test, 'start_time', cur_time)
    testdata['end_time'] = getattr(test, 'end_time', cur_time)
    testdata['duration'] = getattr(test, 'duration', 0)
    testdata['tags'] = getattr(_test, TAGKEY, [])
    testdata['requirements'] = getattr(_test, 'requirements',
                                       default_requirements)
    measurements_dic = getattr(_test, 'measurements', {})
    if measurements_dic:
        testdata['measurements'] = measurements_dic
    additional_data_dic = getattr(_test, 'additional_data', {})
    if additional_data_dic:
        testdata['additional_data'] = additional_data_dic
    if err is not None:
        if isclass(err[0]) and issubclass(err[0], SkipTest):
            testdata['status'] = 'SKIP'
            testdata['status_data'] = 'Reason = %s' % (err[1])
        else:
            # A timeout is reported as an error by the runner; relabel it.
            if isclass(err[0]) and issubclass(err[0], TimeOut):
                status = 'TIMEDOUT'
            testdata['status'] = status
            testdata['status_data'] = getattr(test, 'err_in_string', '')
    else:
        testdata['status'] = status
        testdata['status_data'] = ''
    data['testdata'] = testdata
    md = getattr(_test, 'metrics_data', {})
    if len(md) > 0:
        data['metrics_data'] = md
    return data

def __get_machine_info(self, test):
    """
    Helper function to return machines dictionary with details

    :param: test
    :test type: object

    returns dictionary with machines information
    """
    mpinfo = {
        'servers': [],
        'moms': [],
        'comms': [],
        'clients': []
    }
    minstall_type = {
        'servers': 'server',
        'moms': 'execution',
        'comms': 'communication',
        'clients': 'client'
    }
    for name in mpinfo:
        mlist = None
        if (hasattr(test, name) and
                (getattr(test, name, None) is not None)):
            mlist = getattr(test, name).values()
        if mlist:
            for mc in mlist:
                mpinfo[name].append(mc)
    machines = {}
    for k, v in mpinfo.items():
        for _v in v:
            hst = _v.hostname
            if hst not in machines:
                machines[hst] = {}
                mshort = machines[hst]
                mshort['platform'] = _v.get_uname(hostname=hst)
                mshort['os_info'] = _v.get_os_info(hostname=hst)
            machines[hst]['pbs_install_type'] = minstall_type[k]
            # A MoM or comm co-located with a server host is a server.
            if ((k == 'moms' or k == 'comms') and
                    hst in mpinfo['servers']):
                machines[hst]['pbs_install_type'] = 'server'
    return machines

def addError(self, test, err):
    self.__dbconn.write(self.__create_data(test, err, 'ERROR'))

def addFailure(self, test, err):
    self.__dbconn.write(self.__create_data(test, err, 'FAIL'))

def addSuccess(self, test):
    self.__dbconn.write(self.__create_data(test, None, 'PASS'))

def finalize(self, result):
    self.__dbconn.close(result)
    self.__dbconn = None
    self.__dbaccess = None

def process_output(self, info=None, dbout=None, dbtype=None,
                   dbaccess=None, name=None, logtype=None, summary=False):
    """
    Send analyzed log information to either the screen or to a database
    file.

    :param info: A dictionary of log analysis metrics.
    :type info: Dictionary
    :param dbout: The name of the database file to send output to
    :type dbout: str or None
    :param dbtype: Type of database
    :param dbaccess: Path to a file that defines db options
                     (PostgreSQL only)
    :param name: The name of the log file being analyzed
    :type name: str or None
    :param logtype: The log type, one of ``accounting``, ``schedsummary``,
                    ``scheduler``, ``server``, or ``mom``
    :param summary: If True output summary only
    """
    # BUG FIX: the original declared ``info={}``.  A mutable default is
    # shared across calls, and this method both stores ``info`` and
    # mutates it (``del info['matches']`` below), so state could leak
    # between invocations.  Use the None-sentinel idiom instead.
    if info is None:
        info = {}
    if dbout is not None:
        try:
            self.set_data(dbtype, dbout, dbaccess)
            self.configure(None, None)
            data = {'metrics_data': {logtype: info}}
            self.__dbconn.write(data, os.path.basename(name))
            self.finalize(None)
        except Exception as e:
            sys.stderr.write(str(traceback.print_exc()))
            sys.stderr.write('Error processing output ' + str(e))
        return
    if lu.CFC in info:
        freq_info = info[lu.CFC]
    elif 'summary' in info and lu.CFC in info['summary']:
        freq_info = info['summary'][lu.CFC]
    else:
        freq_info = None
    if 'matches' in info:
        for m in info['matches']:
            print(m, end=' ')
        del info['matches']
    if freq_info is not None:
        # Print each (start, end) -> count bucket; the end timestamp is
        # shortened to time-of-day when it falls on the same date.
        for ((l, m), n) in freq_info:
            b = time.strftime("%m/%d/%y %H:%M:%S", time.localtime(l))
            e = time.strftime("%m/%d/%y %H:%M:%S", time.localtime(m))
            print(b + ' -', end=' ')
            if b[:8] != e[:8]:
                print(e, end=' ')
            else:
                print(e[9:], end=' ')
            print(': ' + str(n))
        return
    if lu.EST in info:
        einfo = info[lu.EST]
        m = []
        for j in einfo[lu.EJ]:
            m.append('Job ' + j[lu.JID] + '\n\testimated:')
            if lu.Eat in j:
                for estimate in j[lu.Eat]:
                    m.append('\t\t' + str(time.ctime(estimate)))
            if lu.JST in j:
                m.append('\tstarted:\n')
                m.append('\t\t' + str(time.ctime(j[lu.JST])))
                m.append('\testimate range: ' + str(j[lu.ESTR]))
                m.append('\tstart to estimated: ' + str(j[lu.ESTA]))
            if lu.NEST in j:
                m.append('\tnumber of estimates: ' + str(j[lu.NEST]))
            if lu.NJD in j:
                m.append('\tnumber of drifts: ' + str(j[lu.NJD]))
            if lu.JDD in j:
                m.append('\tdrift duration: ' + str(j[lu.JDD]))
            m.append('\n')
        if lu.ESTS in einfo:
            m.append('\nsummary: ')
            for k, v in sorted(einfo[lu.ESTS].items()):
                if 'duration' in k:
                    m.append('\t' + k + ': ' +
                             str(PbsTypeDuration(int(v))))
                else:
                    m.append('\t' + k + ': ' + str(v))
        print("\n".join(m))
        return
    sorted_info = sorted(info.items())
    for (k, v) in sorted_info:
        if summary and k != 'summary':
            continue
        print(str(k) + ": ", end=' ')
        if isinstance(v, dict):
            sorted_v = sorted(v.items())
            for (k, val) in sorted_v:
                print(str(k) + '=' + str(val) + ' ')
            print()
        else:
            print(str(v))
    print('')

# ================================================
# FILE: test/fw/ptl/utils/plugins/ptl_test_info.py
# ================================================
# coding: utf-8
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
#
# Commercial License Information:
#
# PBS Pro is commercially licensed software that shares a common core with
# the OpenPBS software. For a copy of the commercial license terms and
# conditions, go to: (http://www.pbspro.com/agreement.html) or contact the
# Altair Legal Department.
# # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. import sys import logging import unittest from nose.plugins.base import Plugin from ptl.utils.pbs_testsuite import PBSTestSuite from ptl.utils.plugins.ptl_test_tags import TAGKEY from ptl.utils.pbs_testsuite import REQUIREMENTS_KEY from ptl.utils.pbs_testsuite import default_requirements from copy import deepcopy log = logging.getLogger('nose.plugins.PTLTestInfo') def get_effective_reqs(ts_reqs=None, tc_reqs=None): """ get effective requirements at test case """ tc_effective_reqs = {} if (tc_reqs is None and ts_reqs is None): tc_effective_reqs = deepcopy(default_requirements) else: tc_effective_reqs = deepcopy(default_requirements) tc_effective_reqs.update(ts_reqs) tc_effective_reqs.update(tc_reqs) return tc_effective_reqs class FakeRunner(object): def __init__(self, config): self.config = config def run(self, test): self.config.plugins.finalize(None) sys.exit(0) class PTLTestInfo(Plugin): """ Load test cases from given parameter """ name = 'PTLTestInfo' score = sys.maxsize - 2 logger = logging.getLogger(__name__) def __init__(self): self.list_test = None self.showinfo = None self.verbose = None self.gen_ts_tree = None self.suites = [] self._tree = {} self.total_suite = 0 self.total_case = 0 self.__ts_tree = {} self.__tags_tree = {'NoTags': {}} def options(self, parser, env): """ Register command line options """ pass def set_data(self, suites, list_test, showinfo, verbose, gen_ts_tree): """ Set the data required for running the tests :param suites: Test suites to run :param list_test: List of test to run :param gen_ts_tree: 
        Generate test suite tree
        """
        self.suites = suites.split(',')
        self.list_test = list_test
        self.showinfo = showinfo
        self.verbose = verbose
        self.gen_ts_tree = gen_ts_tree

    def configure(self, options, config):
        """
        Configure the plugin and system, based on selected options

        :param options: Options to configure plugin and system
        """
        self.config = config
        self.enabled = True

    def prepareTestRunner(self, runner):
        # Substitute the real runner with FakeRunner so information
        # queries only finalize plugins and exit.
        return FakeRunner(config=self.config)

    def wantClass(self, cls):
        """
        Is the class wanted?
        """
        if not issubclass(cls, unittest.TestCase) or cls is PBSTestSuite \
                or cls is unittest.TestCase:
            return False
        # Record the class (keyed by name) and walk up the first base so
        # parent suites also land in the tree.
        self._tree.setdefault(cls.__name__, cls)
        if len(cls.__bases__) > 0:
            self.wantClass(cls.__bases__[0])

    def _get_hierarchy(self, cls, level=0):
        # Depth-first render of cls and its subclasses, one extra space
        # of indent per level.
        delim = ' ' * level
        msg = [delim + cls.__name__]
        try:
            subclses = cls.__subclasses__()
        except TypeError:
            pass
        else:
            for subcls in subclses:
                msg.extend(self._get_hierarchy(subcls, level + 1))
        return msg

    def _print_suite_info(self, suite):
        # Write a human-readable summary of one suite (file, module,
        # tags, docstrings, test cases) to stdout.
        w = sys.stdout
        self.total_suite += 1
        if self.list_test:
            w.write('\n\n')
        w.write('Test Suite: %s\n\n' % suite.__name__)
        w.write(' file: %s.py\n\n' % suite.__module__.replace('.', '/'))
        w.write(' module: %s\n\n' % suite.__module__)
        tags = getattr(suite, TAGKEY, None)
        if tags is not None:
            w.write(' Tags: %s\n\n' % (', '.join(tags)))
        w.write(' Suite Doc: \n')
        for l in str(suite.__doc__).split('\n'):
            w.write(' %s\n' % l)
        dcl = suite.__dict__
        cases = []
        for k in dcl.keys():
            if k.startswith('test_'):
                # Rebinds k from attribute name to the attribute itself.
                k = getattr(suite, k)
                try:
                    k.__name__
                except BaseException:
                    # not a test case, ignore
                    continue
                self.total_case += 1
                cases.append('\t%s\n' % (k.__name__))
                if self.verbose:
                    tags = getattr(k, TAGKEY, None)
                    if tags is not None:
                        cases.append('\n\t Tags: %s\n\n' % (', '.join(tags)))
                    doc = k.__doc__
                    if doc is not None:
                        cases.append('\t Test Case Doc: \n')
                        for l in str(doc).split('\n'):
                            cases.append('\t%s\n' % (l))
        if len(cases) > 0:
            w.write(' Test Cases: \n')
            w.writelines(cases)
        if self.list_test or self.showinfo:
            lines = self._get_hierarchy(suite, 1)[1:]
            if len(lines) > 0:
                w.write('\n Test suite hierarchy:\n')
                for l in lines:
                    w.write(l + '\n')

    def _gen_ts_tree(self, suite):
        # Accumulate this suite's metadata into the suite tree
        # (self.__ts_tree) and the per-tag tree (self.__tags_tree).
        n = suite.__name__
        tsd = {}
        tsd['doc'] = str(suite.__doc__)
        tstags = getattr(suite, TAGKEY, [])
        numnodes = 1
        for tag in tstags:
            if 'numnodes' in tag:
                numnodes = tag.split('=')[1].strip()
                break
        tsd['tags'] = tstags if len(tstags) > 0 else "None"
        tsd['numnodes'] = str(numnodes)
        tsd['file'] = suite.__module__.replace('.', '/') + '.py'
        tsd['module'] = suite.__module__
        dcl = suite.__dict__
        tcs = {}
        ts_req = getattr(suite, REQUIREMENTS_KEY, {})
        for k in dcl.keys():
            if k.startswith('test_'):
                tcd = {}
                tc = getattr(suite, k)
                try:
                    tc.__name__
                except BaseException:
                    # not a test case, ignore
                    continue
                tcd['doc'] = str(tc.__doc__)
                tc_req = getattr(tc, REQUIREMENTS_KEY, {})
                tcd['requirements'] = get_effective_reqs(ts_req, tc_req)
                numnodes = 1
                # Case tags are the union of suite tags and case tags.
                tctags = sorted(set(tstags + getattr(tc, TAGKEY, [])))
                for tag in tctags:
                    if 'numnodes' in tag:
                        numnodes = tag.split('=')[1].strip()
                        break
                tcd['tags'] = tctags if len(tctags) > 0 else "None"
                tcd['numnodes'] = str(numnodes)
                tcs[k] = deepcopy(tcd)
                if len(tctags) > 0:
                    for tag in tctags:
                        if tag not in self.__tags_tree.keys():
                            self.__tags_tree[tag] = {}
                        if n not in self.__tags_tree[tag].keys():
                            self.__tags_tree[tag][n] = deepcopy(tsd)
                        if 'tclist' not in self.__tags_tree[tag][n].keys():
                            self.__tags_tree[tag][n]['tclist'] = {}
                        self.__tags_tree[tag][n]['tclist'][k] = deepcopy(tcd)
                else:
                    if n not in self.__tags_tree['NoTags'].keys():
                        self.__tags_tree['NoTags'][n] = deepcopy(tsd)
                    if 'tclist' not in self.__tags_tree['NoTags'][n].keys():
                        self.__tags_tree['NoTags'][n]['tclist'] = {}
                    self.__tags_tree['NoTags'][n]['tclist'][k] = deepcopy(tcd)
        if len(tcs.keys()) > 0:
            self.__ts_tree[n] = deepcopy(tsd)
            self.__ts_tree[n]['tclist'] = tcs

    def finalize(self, result):
        # When no explicit suites were requested (or when generating the
        # tree), operate on every discovered suite.
        if (self.list_test and not self.suites) or self.gen_ts_tree:
            suites = list(self._tree.keys())
        else:
            suites = self.suites
        suites.sort()
        unknown = []
        if self.gen_ts_tree:
            func = self._gen_ts_tree
        else:
            func = self._print_suite_info
        for k in suites:
            try:
                # NOTE(review): eval() on a user-supplied suite name —
                # acceptable for a test tool, but worth confirming the
                # names are trusted CLI input only.
                suite = eval(k, globals(), self._tree)
            except BaseException:
                unknown.append(k)
                continue
            func(suite)
        if self.list_test:
            w = sys.stdout
            w.write('\n\n')
            w.write('Total number of Test Suites: %d\n' % (self.total_suite))
            w.write('Total number of Test Cases: %d\n' % (self.total_case))
        elif self.gen_ts_tree:
            tsdata = ''
            tagsdata = ''
            try:
                import json
                tsdata = json.dumps(self.__ts_tree, indent=4)
                tagsdata = json.dumps(self.__tags_tree, indent=4)
            except ImportError:
                # Fall back to simplejson, then to a crude repr-to-JSON
                # string conversion if neither library is present.
                try:
                    import simplejson
                    tsdata = simplejson.dumps(self.__ts_tree, indent=4)
                    tagsdata = simplejson.dumps(self.__tags_tree, indent=4)
                except ImportError:
                    _pre = str(self.__ts_tree).replace('"', '\\"')
                    tsdata = _pre.replace('\'', '"')
                    _pre = str(self.__tags_tree).replace('"', '\\"')
                    tagsdata = _pre.replace('\'', '"')
            f = open('ptl_ts_tree.json', 'w+')
            f.write(tsdata)
            f.close()
            f = open('ptl_tags_tree.json', 'w+')
            f.write(tagsdata)
            f.close()
        if len(unknown) > 0:
            self.logger.error('Unknown testsuite(s): %s' %
                              (','.join(unknown)))



================================================
FILE: test/fw/ptl/utils/plugins/ptl_test_loader.py
================================================
# coding: utf-8
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
#
# Commercial License Information:
#
# PBS Pro is commercially licensed software that shares a common core with
# the OpenPBS software. For a copy of the commercial license terms and
# conditions, go to: (http://www.pbspro.com/agreement.html) or contact the
# Altair Legal Department.
#
# Altair's dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of OpenPBS and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair's trademarks, including but not limited to "PBS™",
# "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is
# subject to Altair's trademark licensing policies.


import os
import sys
import logging
import copy

from nose.plugins.base import Plugin
from ptl.utils.pbs_testsuite import PBSTestSuite
from ptl.utils.pbs_dshutils import DshUtils


class PTLTestLoader(Plugin):

    """
    Load test cases from given parameter
    """
    name = 'PTLTestLoader'
    score = sys.maxsize - 1
    logger = logging.getLogger(__name__)

    def __init__(self):
        Plugin.__init__(self)
        self.suites_list = []
        self.excludes = []
        self.follow = False
        # Sentinel keys for "bare" suite / case names (no dot notation).
        self._only_ts = '__only__ts__'
        self._only_tc = '__only__tc__'
        self._test_marker = 'test_'
        self._tests_list = {self._only_ts: [], self._only_tc: []}
        self._excludes_list = {self._only_ts: [], self._only_tc: []}
        # Copy consumed during loading; leftovers are "unknown" tests.
        self.__tests_list_copy = {self._only_ts: [], self._only_tc: []}
        self.__allowed_cls = []
        self.__allowed_method = []
        self.testfiles = None

    def options(self, parser, env):
        """
        Register command line options
        """
        pass

    def set_data(self, testgroup, suites, excludes, follow, testfiles=None):
        """
        Set the data required for loading test data

        :param testgroup: Test group
        :param suites: Test suites to load
        :param excludes: Tests to exclude while running
        :param testfiles: Flag to check if test is run by filename
        """
        # A readable testgroup file wins over the suites string; only
        # its first line is read (comma-separated suite names).
        if os.access(str(testgroup), os.R_OK):
            f = open(testgroup, 'r')
            self.suites_list.extend(f.readline().strip().split(','))
            f.close()
        elif suites is not None:
            self.suites_list.extend(suites.split(','))
        if excludes is not None:
            self.excludes.extend(excludes.split(','))
        self.follow = follow
        self.testfiles = testfiles

    def configure(self, options, config):
        """
        Configure the ``plugin`` and ``system``, based on selected options
        """
        # Normalize the requested names into the include dicts:
        # "Suite.case" -> per-suite case list, "test_x" -> bare cases,
        # anything else -> bare suites.
        tl = self._tests_list
        tlc = self.__tests_list_copy
        for _is in self.suites_list:
            if '.' in _is:
                suite, case = _is.split('.')
                if case in tl[self._only_tc]:
                    tl[self._only_tc].remove(case)
                    tlc[self._only_tc].remove(case)
                if suite in tl.keys():
                    if case not in tl[suite]:
                        tl[suite].append(case)
                        tlc[suite].append(case)
                else:
                    tl.setdefault(suite, [case])
                    tlc.setdefault(suite, [case])
            elif _is.startswith(self._test_marker):
                if _is not in tl[self._only_tc]:
                    tl[self._only_tc].append(_is)
                    tlc[self._only_tc].append(_is)
            else:
                if _is not in tl[self._only_ts]:
                    tl[self._only_ts].append(_is)
                    tlc[self._only_ts].append(_is)
        # A suite entry left with no cases means "the whole suite".
        for k, v in tl.items():
            if k in (self._only_ts, self._only_tc):
                continue
            if len(v) == 0:
                tl[self._only_ts].append(k)
                tlc[self._only_ts].append(k)
        for name in tl[self._only_ts]:
            if name in tl.keys():
                del tl[name]
                del tlc[name]
        # Same normalization for the exclusion list.
        extl = self._excludes_list
        for _is in self.excludes:
            if '.' in _is:
                suite, case = _is.split('.')
                if case in extl[self._only_tc]:
                    extl[self._only_tc].remove(case)
                if suite in extl.keys():
                    if case not in extl[suite]:
                        extl[suite].append(case)
                else:
                    extl.setdefault(suite, [case])
            elif _is.startswith(self._test_marker):
                if _is not in extl[self._only_tc]:
                    extl[self._only_tc].append(_is)
            else:
                if _is not in extl[self._only_ts]:
                    extl[self._only_ts].append(_is)
        for k, v in extl.items():
            if k in (self._only_ts, self._only_tc):
                continue
            if len(v) == 0:
                extl[self._only_ts].append(k)
        for name in extl[self._only_ts]:
            if name in extl.keys():
                del extl[name]
        self.logger.debug('included_tests:%s' % (str(self._tests_list)))
        self.logger.debug('included_tests(copy):%s' %
                          (str(self.__tests_list_copy)))
        self.logger.debug('excluded_tests:%s' % (str(self._excludes_list)))
        self.enabled = len(self.suites_list) > 0
        del self.suites_list
        del self.excludes

    def check_unknown(self):
        """
        Check for unknown test suite and test case
        """
        # Anything still in the "copy" lists was never matched during
        # loading: report it and abort.
        self.logger.debug('check_unknown called')
        tests_list_copy = copy.deepcopy(self.__tests_list_copy)
        only_ts = tests_list_copy.pop(self._only_ts)
        only_tc = tests_list_copy.pop(self._only_tc)
        msg = []
        if len(tests_list_copy) > 0:
            for k, v in tests_list_copy.items():
                msg.extend(map(lambda x: k + '.' + x, v))
        if len(only_tc) > 0:
            msg.extend(only_tc)
        if len(msg) > 0:
            _msg = ['unknown testcase(s): %s' % (','.join(msg))]
            msg = _msg
        if len(only_ts) > 0:
            msg += ['unknown testsuite(s): %s' % (','.join(only_ts))]
        if len(msg) > 0:
            for l in msg:
                self.logger.error(l)
            sys.exit(1)

    def prepareTestLoader(self, loader):
        """
        Prepare test loader
        """
        # Wrap loadTestsFromNames so the PTL test directories are added
        # to the search path and unknown names are reported afterwards.
        old_loadTestsFromNames = loader.loadTestsFromNames

        def check_loadTestsFromNames(names, module=None):
            tests_dir = names
            if not self.testfiles:
                ptl_test_dir = __file__
                ptl_test_dir = os.path.join(ptl_test_dir.split('ptl')[0],
                                            "ptl", "tests")
                user_test_dir = os.environ.get("PTL_TESTS_DIR", None)
                if user_test_dir and os.path.isdir(user_test_dir):
                    tests_dir += [user_test_dir]
                if os.path.isdir(ptl_test_dir):
                    tests_dir += [ptl_test_dir]
            rv = old_loadTestsFromNames(tests_dir, module)
            self.check_unknown()
            return rv
        loader.loadTestsFromNames = check_loadTestsFromNames
        return loader

    def check_follow(self, cls, method=None):
        # Decide whether cls (and optionally one of its methods) is
        # selected by the include/exclude lists; with self.follow set,
        # the decision recurses up the first base class.
        cname = cls.__name__
        if not issubclass(cls, PBSTestSuite):
            return False
        if cname == 'PBSTestSuite':
            if 'PBSTestSuite' not in self._tests_list[self._only_ts]:
                return False
        if cname in self._excludes_list[self._only_ts]:
            return False
        if cname in self._tests_list[self._only_ts]:
            if cname in self.__tests_list_copy[self._only_ts]:
                self.__tests_list_copy[self._only_ts].remove(cname)
            return True
        if ((cname in self._tests_list.keys()) and (method is None)):
            return True
        if method is not None:
            mname = method.__name__
            if not mname.startswith(self._test_marker):
                return False
            if mname in self._excludes_list[self._only_tc]:
                return False
            if ((cname in self._excludes_list.keys()) and
                    (mname in self._excludes_list[cname])):
                return False
            if ((cname in self._tests_list.keys()) and
                    (mname in self._tests_list[cname])):
                if cname in self.__tests_list_copy.keys():
                    if mname in self.__tests_list_copy[cname]:
                        self.__tests_list_copy[cname].remove(mname)
                        if len(self.__tests_list_copy[cname]) == 0:
                            del self.__tests_list_copy[cname]
                return True
            if mname in self._tests_list[self._only_tc]:
                if mname in self.__tests_list_copy[self._only_tc]:
                    self.__tests_list_copy[self._only_tc].remove(mname)
                return True
        if self.follow:
            return self.check_follow(cls.__bases__[0], method)
        else:
            return False

    def is_already_allowed(self, cls, method=None):
        """
        :param method: Method to check
        :returns: True if method is already allowed else False
        """
        name = cls.__name__
        if method is not None:
            name += '.' + method.__name__
            if name in self.__allowed_method:
                return True
            else:
                self.__allowed_method.append(name)
                return False
        else:
            if name in self.__allowed_cls:
                return True
            else:
                self.__allowed_cls.append(name)
                return False

    def wantClass(self, cls):
        """
        Is the class wanted?
        """
        has_test = False
        for t in dir(cls):
            if t.startswith(self._test_marker):
                has_test = True
                break
        if not has_test:
            return False
        rv = self.check_follow(cls)
        # NOTE(review): returns None (not True) when wanted — nose
        # treats None as "no opinion"; presumably intentional, confirm.
        if rv and not self.is_already_allowed(cls):
            self.logger.debug('wantClass:%s' % (str(cls)))
        else:
            return False

    def wantFunction(self, function):
        """
        Is the function wanted?
        """
        return self.wantMethod(function)

    def wantMethod(self, method):
        """
        Is the method wanted?
        """
        try:
            cls = method.__self__.__class__
        except AttributeError:
            return False
        if not method.__name__.startswith(self._test_marker):
            return False
        rv = self.check_follow(cls, method)
        if rv and not self.is_already_allowed(cls, method):
            self.logger.debug('wantMethod:%s' % (str(method)))
        else:
            return False



================================================
FILE: test/fw/ptl/utils/plugins/ptl_test_runner.py
================================================
# coding: utf-8
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software.
You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. 
import datetime
import logging
import fnmatch
import os
import platform
import pwd
import re
import signal
import socket
import sys
import time
import tempfile
import unittest
from threading import Timer
from logging import StreamHandler
from traceback import format_exception
from types import ModuleType

from nose.core import TextTestRunner
from nose.plugins.base import Plugin
from nose.plugins.skip import SkipTest
from nose.suite import ContextSuite
from nose.util import isclass

import ptl
from ptl.lib.pbs_testlib import PBSInitServices
from ptl.utils.pbs_covutils import LcovUtils
from ptl.utils.pbs_dshutils import DshUtils
from ptl.utils.pbs_dshutils import TimeOut
from ptl.utils.pbs_testsuite import (MINIMUM_TESTCASE_TIMEOUT,
                                     REQUIREMENTS_KEY, TIMEOUT_KEY)
from ptl.utils.plugins.ptl_test_info import get_effective_reqs
from ptl.utils.pbs_testusers import PBS_ALL_USERS, PBS_USERS, PbsUser
from ptl.lib.ptl_constants import (PTL_TRUE, PTL_FALSE)
from io import StringIO

log = logging.getLogger('nose.plugins.PTLTestRunner')


class TCThresholdReached(Exception):
    """
    Raise this exception to tell that tc-failure-threshold reached
    """


class TestLogCaptureHandler(StreamHandler):
    """
    Log handler for capturing logs which test case print
    using logging module
    """

    def __init__(self):
        self.buffer = StringIO()
        StreamHandler.__init__(self, self.buffer)
        # DEBUG2 is a PTL-specific logging level added elsewhere in the
        # framework (not a stdlib logging constant).
        self.setLevel(logging.DEBUG2)
        fmt = '%(asctime)-15s %(levelname)-8s %(message)s'
        self.setFormatter(logging.Formatter(fmt))

    def get_logs(self):
        # Return everything captured since the buffer was last reset.
        return self.buffer.getvalue()


class _PtlTestResult(unittest.TestResult):
    """
    Ptl custom test result
    """
    separator1 = '=' * 70
    separator2 = '___m_oo_m___'
    logger = logging.getLogger(__name__)

    def __init__(self, stream, descriptions, verbosity, config=None):
        unittest.TestResult.__init__(self)
        self.stream = stream
        self.showAll = verbosity > 1
        self.dots = verbosity == 1
        self.descriptions = descriptions
        self.errorClasses = {}
        self.config = config
        self.success = []
        self.skipped = []
        self.timedout = []
        self.handler = TestLogCaptureHandler()
        self.start = datetime.datetime.now()
        self.stop = datetime.datetime.now()

    def getDescription(self, test):
        """
        Get the test result description
        """
        if hasattr(test, 'test'):
            return str(test.test)
        elif isinstance(test.context, ModuleType):
            tmn = getattr(test.context, '_testMethodName', 'unknown')
            return '%s (%s)' % (tmn, test.context.__name__)
        elif isinstance(test, ContextSuite):
            tmn = getattr(test.context, '_testMethodName', 'unknown')
            return '%s (%s.%s)' % (tmn,
                                   test.context.__module__,
                                   test.context.__name__)
        else:
            return str(test)

    def getTestDoc(self, test):
        """
        Get test document
        """
        if hasattr(test, 'test'):
            if hasattr(test.test, '_testMethodDoc'):
                return test.test._testMethodDoc
            else:
                return None
        else:
            if hasattr(test, '_testMethodDoc'):
                return test._testMethodDoc
            else:
                return None

    def clear_stop(self):
        # Reset the stop flag so a subsequent run can proceed.
        self.shouldStop = False

    def startTest(self, test):
        """
        Start the test

        :param test: Test to start
        :type test: str
        """
        # (Re)attach the capture handler and reset its buffer so each
        # test's captured log starts empty.
        ptl_logger = logging.getLogger('ptl')
        if self.handler not in ptl_logger.handlers:
            ptl_logger.addHandler(self.handler)
        self.handler.buffer.truncate(0)
        self.handler.buffer.seek(0)
        unittest.TestResult.startTest(self, test)
        test.start_time = datetime.datetime.now()
        if self.showAll:
            self.logger.info('test name: ' + self.getDescription(test) +
                             '...')
            self.logger.info('test start time: ' + test.start_time.ctime())
            tdoc = self.getTestDoc(test)
            if tdoc is not None:
                tdoc = '\n' + tdoc
            self.logger.info('test docstring: %s' % (tdoc))

    def addSuccess(self, test):
        """
        Add success to the test result
        """
        self.success.append(test)
        unittest.TestResult.addSuccess(self, test)
        if self.showAll:
            self.logger.info('ok\n')
        elif self.dots:
            self.logger.info('.')

    def _addError(self, test, err):
        unittest.TestResult.addError(self, test, err)
        if self.showAll:
            self.logger.info('ERROR\n')
        elif self.dots:
            self.logger.info('E')

    def addError(self, test, err):
        """
        Add error to the test result

        :param test: Test for which to add error
        :type test: str
        :param error: Error message to add
        :type error: str
        """
        # Threshold exceptions are swallowed; skips and timeouts are
        # routed to their dedicated collectors.
        if isclass(err[0]) and issubclass(err[0], TCThresholdReached):
            return
        if isclass(err[0]) and issubclass(err[0], SkipTest):
            self.addSkip(test, err[1])
            return
        if isclass(err[0]) and issubclass(err[0], TimeOut):
            self.addTimedOut(test, err)
            return
        # Plugin-registered error classes get their own storage/label.
        for cls, (storage, label, isfail) in self.errorClasses.items():
            if isclass(err[0]) and issubclass(err[0], cls):
                if isfail:
                    test.passed = False
                storage.append((test, err))
                if self.showAll:
                    self.logger.info(label + '\n')
                elif self.dots:
                    self.logger.info(label[0])
                return
        test.passed = False
        self._addError(test, err)

    def addFailure(self, test, err):
        """
        Indicate failure
        """
        unittest.TestResult.addFailure(self, test, err)
        if self.showAll:
            self.logger.info('FAILED\n')
        elif self.dots:
            self.logger.info('F')

    def addSkip(self, test, reason):
        """
        Indicate skipping of test

        :param test: Test to skip
        :type test: str
        :param reason: Reason fot the skip
        :type reason: str
        """
        self.skipped.append((test, reason))
        if self.showAll:
            self.logger.info('SKIPPED')
        elif self.dots:
            self.logger.info('S')

    def addTimedOut(self, test, err):
        """
        Indicate timeout

        :param test: Test for which timeout happened
        :type test: str
        :param err: Error for timeout
        :type err: str
        """
        self.timedout.append((test, self._exc_info_to_string(err, test)))
        if self.showAll:
            self.logger.info('TIMEDOUT')
        elif self.dots:
            self.logger.info('T')

    def printErrors(self):
        """
        Print the errors
        """
        _blank_line = False
        if ((len(self.errors) > 0) or (len(self.failures) > 0) or
                (len(self.timedout) > 0)):
            if self.dots or self.showAll:
                self.logger.info('')
                _blank_line = True
            self.printErrorList('ERROR', self.errors)
            self.printErrorList('FAILED', self.failures)
            self.printErrorList('TIMEDOUT', self.timedout)
        for cls in self.errorClasses.keys():
            storage, label, isfail = self.errorClasses[cls]
            if isfail:
                if not _blank_line:
                    self.logger.info('')
                    _blank_line = True
                self.printErrorList(label, storage)
        self.config.plugins.report(self.stream)

    def printErrorList(self, flavour, errors):
        """
        Print the error list

        :param errors: Errors to print
        """
        for test, err in errors:
            self.logger.info(self.separator1)
            self.logger.info('%s: %s\n' % (flavour,
                                           self.getDescription(test)))
            self.logger.info(self.separator2)
            self.logger.info('%s\n' % err)

    def printLabel(self, label, err=None):
        """
        Print the label for the error

        :param label: Label to print
        :type label: str
        :param err: Error for which label to be printed
        :type err: str
        """
        if self.showAll:
            message = [label]
            if err:
                try:
                    detail = str(err[1])
                except BaseException:
                    detail = None
                if detail:
                    message.append(detail)
            self.logger.info(': '.join(message))
        elif self.dots:
            self.logger.info(label[:1])

    def wasSuccessful(self):
        """
        Check whether the test successful or not

        :returns: True if no ``errors`` or no ``failures`` or no
                  ``timeout`` else return False
        """
        if self.errors or self.failures or self.timedout:
            return False
        for cls in self.errorClasses.keys():
            storage, _, isfail = self.errorClasses[cls]
            if not isfail:
                continue
            if storage:
                return False
        return True

    def printSummary(self):
        """
        Called by the test runner to print the final summary of test
        run results.

        :param start: Time at which test begins
        :param stop: Time at which test ends
        """
        self.printErrors()
        msg = ['=' * 80]
        ef = []
        error = 0
        fail = 0
        skip = 0
        timedout = 0
        success = len(self.success)
        if len(self.failures) > 0:
            for failedtest in self.failures:
                fail += 1
                msg += ['failed: ' + self.getDescription(failedtest[0])]
                ef.append(failedtest)
        if len(self.errors) > 0:
            for errtest in self.errors:
                error += 1
                msg += ['error: ' + self.getDescription(errtest[0])]
                ef.append(errtest)
        if len(self.skipped) > 0:
            for skiptest, reason in self.skipped:
                skip += 1
                _msg = 'skipped: ' + str(skiptest).strip()
                _msg += ' reason: ' + str(reason).strip()
                msg += [_msg]
        if len(self.timedout) > 0:
            for tdtest in self.timedout:
                timedout += 1
                msg += ['timedout: ' + self.getDescription(tdtest[0])]
                ef.append(tdtest)
        # Collect unique case and suite names from errors/failures/
        # timeouts for the summary footer.
        cases = []
        suites = []
        for _ef in ef:
            if hasattr(_ef[0], 'test'):
                cname = _ef[0].test.__class__.__name__
                tname = getattr(_ef[0].test, '_testMethodName', 'unknown')
                cases.append(cname + '.' + tname)
                suites.append(cname)
        cases = sorted(list(set(cases)))
        suites = sorted(list(set(suites)))
        if len(cases) > 0:
            _msg = 'Test cases with failures: '
            _msg += ','.join(cases)
            msg += [_msg]
        if len(suites) > 0:
            _msg = 'Test suites with failures: '
            _msg += ','.join(suites)
            msg += [_msg]
        runned = success + fail + error + skip + timedout
        _msg = 'run: ' + str(runned)
        _msg += ', succeeded: ' + str(success)
        _msg += ', failed: ' + str(fail)
        _msg += ', errors: ' + str(error)
        _msg += ', skipped: ' + str(skip)
        _msg += ', timedout: ' + str(timedout)
        msg += [_msg]
        msg += ['Tests run in ' + str(self.stop - self.start)]
        self.logger.info('\n'.join(msg))


class SystemInfo:
    """
    used to get system's ram size and disk size information.
    :system_ram: Available ram(in GB) of the test running machine
    :system_disk: Available disk size(in GB) of the test running machine
    """
    logger = logging.getLogger(__name__)

    def get_system_info(self, hostname=None):
        # Populate system_total_ram / system_ram / system_disk /
        # system_disk_used_percent attributes for the given host.
        du = DshUtils()
        # getting RAM size in gb
        mem_info = du.cat(hostname, "/proc/meminfo")
        if mem_info['rc'] != 0:
            _msg = 'failed to get content of /proc/meminfo of host: '
            # NOTE(review): concatenation raises TypeError when
            # hostname is None (the default) — confirm callers always
            # pass a hostname on the error path.
            self.logger.error(_msg + hostname)
        else:
            got_mem_available = False
            # Values in /proc/meminfo are in kB; divide by 2**20 for GB.
            for i in mem_info['out']:
                if "MemTotal" in i:
                    self.system_total_ram = float(i.split()[1]) / (2**20)
                elif "MemAvailable" in i:
                    mem_available = float(i.split()[1]) / (2**20)
                    got_mem_available = True
                    break
                elif "MemFree" in i:
                    mem_free = float(i.split()[1]) / (2**20)
                elif "Buffers" in i:
                    buffers = float(i.split()[1]) / (2**20)
                elif i.startswith("Cached"):
                    # startswith avoids matching the "SwapCached" line.
                    cached = float(i.split()[1]) / (2**20)
            if got_mem_available:
                self.system_ram = mem_available
            else:
                # NOTE(review): assumes MemFree/Buffers/Cached all
                # appeared before this point; a meminfo missing any of
                # them would raise UnboundLocalError — confirm.
                self.system_ram = mem_free + buffers + cached
        # getting disk size in gb
        pbs_conf = du.parse_pbs_config(hostname)
        pbs_home_info = du.run_cmd(hostname,
                                   cmd=['df', '-k', pbs_conf['PBS_HOME']])
        if pbs_home_info['rc'] != 0:
            _msg = 'failed to get output of df -k command of host: '
            self.logger.error(_msg + hostname)
        else:
            # df -k: row 1 is the data row; column 3 is available kB,
            # column 4 is the use percentage.
            disk_info = pbs_home_info['out']
            disk_size = disk_info[1].split()
            self.system_disk = float(disk_size[3]) / (2**20)
            self.system_disk_used_percent = float(disk_size[4].rstrip('%'))


class PtlTextTestRunner(TextTestRunner):

    """
    Test runner that uses ``PtlTestResult`` to enable errorClasses,
    as well as providing hooks for plugins to override or replace the test
    output stream, results, and the test case itself.
    """
    # Class attribute read by other plugins to know which repetition is
    # currently executing (1-based).
    cur_repeat_count = 1

    def __init__(self, stream=sys.stdout, descriptions=True, verbosity=3,
                 config=None, repeat_count=1, repeat_delay=0):
        self.logger = logging.getLogger(__name__)
        self.result = None
        self.repeat_count = repeat_count
        self.repeat_delay = repeat_delay
        TextTestRunner.__init__(self, stream, descriptions, verbosity,
                                config)

    def _makeResult(self):
        return _PtlTestResult(self.stream, self.descriptions,
                              self.verbosity, self.config)

    def run(self, test):
        """
        Overrides to provide plugin hooks and defer all output to the test
        result class.
        """
        do_exit = False
        wrapper = self.config.plugins.prepareTest(test)
        if wrapper is not None:
            test = wrapper
        wrapped = self.config.plugins.setOutputStream(self.stream)
        if wrapped is not None:
            self.stream = wrapped
        self.result = result = self._makeResult()
        self.result.start = datetime.datetime.now()
        try:
            # Run the whole collection repeat_count times, pausing
            # repeat_delay seconds between repetitions.
            for i in range(self.repeat_count):
                PtlTextTestRunner.cur_repeat_count = i + 1
                if i != 0:
                    time.sleep(self.repeat_delay)
                test(result)
            if self.repeat_count > 1:
                self.logger.info("==========================================")
                self.logger.info("All Tests are repeated %d times" %
                                 self.repeat_count)
                self.logger.info("==========================================")
        except KeyboardInterrupt:
            do_exit = True
        self.result.stop = datetime.datetime.now()
        result.printSummary()
        self.config.plugins.finalize(result)
        if do_exit:
            sys.exit(1)
        return result


class PTLTestRunner(Plugin):

    """
    PTL Test Runner Plugin
    """
    name = 'PTLTestRunner'
    score = sys.maxsize - 4
    logger = logging.getLogger(__name__)
    timeout = None

    def __init__(self):
        Plugin.__init__(self)
        self.param = None
        self.repeat_count = 1
        self.repeat_delay = 0
        self.use_cur_setup = False
        self.lcov_bin = None
        self.lcov_data = None
        self.lcov_out = None
        self.lcov_utils = None
        self.lcov_nosrc = None
        self.lcov_baseurl = None
        self.genhtml_bin = None
        self.config = None
        self.result = None
        self.tc_failure_threshold = None
        self.cumulative_tc_failure_threshold = None
        self.__failed_tc_count = 0
        self.__tf_count =
0 self.__failed_tc_count_msg = False self._test_marker = 'test_' self.hardware_report_timer = None def options(self, parser, env): """ Register command line options """ pass def set_data(self, paramfile, testparam, repeat_count, repeat_delay, lcov_bin, lcov_data, lcov_out, genhtml_bin, lcov_nosrc, lcov_baseurl, tc_failure_threshold, cumulative_tc_failure_threshold, use_cur_setup): if paramfile is not None: _pf = open(paramfile, 'r') _params_from_file = _pf.readlines() _pf.close() _nparams = [] for l in range(len(_params_from_file)): if _params_from_file[l].startswith('#'): continue else: _nparams.append(_params_from_file[l]) _f = ','.join([l.strip('\r\n') for l in _nparams]) if testparam is not None: testparam += ',' + _f else: testparam = _f self.param = testparam self.repeat_count = repeat_count self.repeat_delay = repeat_delay self.use_cur_setup = use_cur_setup self.lcov_bin = lcov_bin self.lcov_data = lcov_data self.lcov_out = lcov_out self.genhtml_bin = genhtml_bin self.lcov_nosrc = lcov_nosrc self.lcov_baseurl = lcov_baseurl self.tc_failure_threshold = tc_failure_threshold self.cumulative_tc_failure_threshold = cumulative_tc_failure_threshold def configure(self, options, config): """ Configure the plugin and system, based on selected options """ self.config = config self.enabled = True self.param_dict = self.__get_param_dictionary() def prepareTestRunner(self, runner): """ Prepare test runner """ return PtlTextTestRunner(verbosity=3, config=self.config, repeat_count=self.repeat_count, repeat_delay=self.repeat_delay) def prepareTestResult(self, result): """ Prepare test result """ self.result = result def startContext(self, context): context.param = self.param context.use_cur_setup = self.use_cur_setup context.start_time = datetime.datetime.now() if isclass(context) and issubclass(context, unittest.TestCase): self.result.logger.info(self.result.separator1) self.result.logger.info('suite name: ' + context.__name__) doc = context.__doc__ if doc is not None: 
self.result.logger.info('suite docstring: \n' + doc + '\n') self.result.logger.info(self.result.separator1) self.__failed_tc_count = 0 self.__failed_tc_count_msg = False def __get_timeout(self, test): _test = None if hasattr(test, 'test'): _test = test.test elif hasattr(test, 'context'): _test = test.context if _test is None: return MINIMUM_TESTCASE_TIMEOUT dflt_timeout = int(getattr(_test, 'conf', {}).get('default-testcase-timeout', MINIMUM_TESTCASE_TIMEOUT)) tc_timeout = int(getattr(getattr(_test, getattr(_test, '_testMethodName', ''), None), TIMEOUT_KEY, 0)) return max([dflt_timeout, tc_timeout]) def __set_test_end_data(self, test, err=None): if self.hardware_report_timer is not None: self.hardware_report_timer.cancel() if not hasattr(test, 'start_time'): test = test.context if err is not None: is_skip = issubclass(err[0], SkipTest) is_tctr = issubclass(err[0], TCThresholdReached) if not (is_skip or is_tctr): self.__failed_tc_count += 1 self.__tf_count += 1 try: test.err_in_string = self.result._exc_info_to_string(err, test) except BaseException: etype, value, tb = err test.err_in_string = ''.join(format_exception(etype, value, tb)) else: test.err_in_string = 'None' test.end_time = datetime.datetime.now() test.duration = test.end_time - test.start_time test.captured_logs = self.result.handler.get_logs() def __get_param_dictionary(self): """ Method to convert data in param into dictionary of cluster information """ def get_bool(v): if v is None or v == '': return False if v in PTL_TRUE: return True if v in PTL_FALSE: return False raise ValueError("Need boolean value, not %s" % v) tparam_contents = {} nomomlist = [] shortname = (socket.gethostname()).split('.', 1)[0] for key in ['servers', 'moms', 'comms', 'clients', 'nomom']: tparam_contents[key] = [] tparam_contents['mom_on_server'] = False tparam_contents['no_mom_on_server'] = False tparam_contents['no_comm_on_server'] = False tparam_contents['no_comm_on_mom'] = False if self.param is not None: for h in 
self.param.split(','): if '=' in h: k, v = h.split('=', 1) hosts = [x.split('@')[0] for x in v.split(':')] if (k == 'server' or k == 'servers'): tparam_contents['servers'].extend(hosts) elif (k == 'mom' or k == 'moms'): tparam_contents['moms'].extend(hosts) elif k == 'comms': tparam_contents['comms'] = hosts elif k == 'client': tparam_contents['clients'] = hosts elif k == 'nomom': nomomlist = hosts elif k == 'mom_on_server': tparam_contents['mom_on_server'] = get_bool(v) elif k == 'no_mom_on_server': tparam_contents['no_mom_on_server'] = get_bool(v) elif k == 'no_comm_on_mom': tparam_contents['no_comm_on_mom'] = get_bool(v) for pkey in ['servers', 'moms', 'comms', 'clients']: if not tparam_contents[pkey]: tparam_contents[pkey] = set([shortname]) else: tparam_contents[pkey] = set(tparam_contents[pkey]) if nomomlist: tparam_contents['nomom'] = set(nomomlist) return tparam_contents @staticmethod def __are_requirements_matching(param_dic=None, test=None): """ Validates test requirements against test cluster information returns True on match or error message otherwise None :param param_dic: dictionary of cluster information from data passed to param list :param_dic type: dic :param test: test object :test type: object :returns True or error message or None """ logger = logging.getLogger(__name__) ts_requirements = {} tc_requirements = {} param_count = {} _servers = set(param_dic['servers']) _moms = set(param_dic['moms']) _comms = set(param_dic['comms']) _nomom = set(param_dic['nomom']) _mom_on_server = param_dic['mom_on_server'] _no_mom_on_server = param_dic['no_mom_on_server'] _no_comm_on_mom = param_dic['no_comm_on_mom'] _no_comm_on_server = param_dic['no_comm_on_server'] shortname = (socket.gethostname()).split('.', 1)[0] if test is None: return None test_name = getattr(test.test, '_testMethodName', None) if test_name is not None: method = getattr(test.test, test_name, None) if method is not None: tc_requirements = getattr(method, REQUIREMENTS_KEY, {}) cls = 
method.__self__.__class__ ts_requirements = getattr(cls, REQUIREMENTS_KEY, {}) if not tc_requirements: if not ts_requirements: return None eff_tc_req = get_effective_reqs(ts_requirements, tc_requirements) setattr(test.test, 'requirements', eff_tc_req) for key in ['servers', 'moms', 'comms', 'clients']: param_count['num_' + key] = len(param_dic[key]) for pk in param_count: if param_count[pk] < eff_tc_req[pk]: _msg = 'available ' + pk + " (" _msg += str(param_count[pk]) + ") is less than required " + pk _msg += " (" + str(eff_tc_req[pk]) + ")" logger.error(_msg) return _msg if hasattr(test, 'test'): _test = test.test elif hasattr(test, 'context'): _test = test.context else: return None name = 'moms' if (hasattr(_test, name) and (getattr(_test, name, None) is not None)): for mc in getattr(_test, name).values(): platform = mc.platform if platform not in ['linux', 'shasta', 'cray'] and mc.hostname in _moms: _moms.remove(mc.hostname) for hostname in _moms: si = SystemInfo() si.get_system_info(hostname) available_sys_ram = getattr(si, 'system_ram', None) if available_sys_ram is None: _msg = 'failed to get ram info on host: ' + hostname logger.error(_msg) return _msg elif eff_tc_req['min_mom_ram'] >= available_sys_ram: _msg = hostname + ': available ram (' + str(available_sys_ram) _msg += ') is less than the minimum required ram (' _msg += str(eff_tc_req['min_mom_ram']) _msg += ') for test execution' logger.error(_msg) return _msg available_sys_disk = getattr(si, 'system_disk', None) if available_sys_disk is None: _msg = 'failed to get disk info on host: ' + hostname logger.error(_msg) return _msg elif eff_tc_req['min_mom_disk'] >= available_sys_disk: _msg = hostname + ': available disk space (' _msg += str(available_sys_disk) _msg += ') is less than the minimum required disk space (' _msg += str(eff_tc_req['min_mom_disk']) _msg += ') for test execution' logger.error(_msg) return _msg for hostname in param_dic['servers']: si = SystemInfo() si.get_system_info(hostname) 
available_sys_ram = getattr(si, 'system_ram', None) if available_sys_ram is None: _msg = 'failed to get ram info on host: ' + hostname logger.error(_msg) return _msg elif eff_tc_req['min_server_ram'] >= available_sys_ram: _msg = hostname + ': available ram (' + str(available_sys_ram) _msg += ') is less than the minimum required ram (' _msg += str(eff_tc_req['min_server_ram']) _msg += ') for test execution' logger.error(_msg) return _msg available_sys_disk = getattr(si, 'system_disk', None) if available_sys_disk is None: _msg = 'failed to get disk info on host: ' + hostname logger.error(_msg) return _msg elif eff_tc_req['min_server_disk'] >= available_sys_disk: _msg = hostname + ': available disk space (' _msg += str(available_sys_disk) _msg += ') is less than the minimum required disk space (' _msg += str(eff_tc_req['min_server_disk']) _msg += ') for test execution' logger.error(_msg) return _msg if _moms & _servers: if eff_tc_req['no_mom_on_server'] or \ (_nomom - _servers) or \ _no_mom_on_server: _msg = 'no mom on server' logger.error(_msg) return _msg else: if eff_tc_req['mom_on_server'] or \ _mom_on_server: _msg = 'mom on server' logger.error(_msg) return _msg if _comms & _servers: if eff_tc_req['no_comm_on_server'] or _no_comm_on_server: _msg = 'no comm on server' logger.error(_msg) return _msg comm_mom_list = _moms & _comms if comm_mom_list and shortname in comm_mom_list: # Excluding the server hostname for flag 'no_comm_on_mom' comm_mom_list.remove(shortname) if comm_mom_list: if eff_tc_req['no_comm_on_mom']: _msg = 'no comm on mom' logger.error(_msg) return _msg else: if not eff_tc_req['no_comm_on_mom']: _msg = 'no comm on server' logger.error(_msg) return _msg def check_hardware_status_and_core_files(self, test): """ function checks hardware status and core files every 5 minutes """ du = DshUtils() systems = list(self.param_dict['servers']) systems.extend(self.param_dict['moms']) systems.extend(self.param_dict['comms']) systems = list(set(systems)) if 
hasattr(test, 'test'): _test = test.test elif hasattr(test, 'context'): _test = test.context else: return None for name in ['servers', 'moms', 'comms', 'clients']: mlist = None if (hasattr(_test, name) and (getattr(_test, name, None) is not None)): mlist = getattr(_test, name).values() if mlist: for mc in mlist: platform = mc.platform if ((platform not in ['linux', 'shasta', 'cray']) and (mc.hostname in systems)): systems.remove(mc.hostname) self.hardware_report_timer = Timer( 300, self.check_hardware_status_and_core_files, args=(test,)) self.hardware_report_timer.start() for hostname in systems: hr = SystemInfo() hr.get_system_info(hostname) # monitors disk used_disk_percent = getattr(hr, 'system_disk_used_percent', None) if used_disk_percent is None: _msg = hostname _msg += ": unable to get disk info" self.hardware_report_timer.cancel() raise SkipTest(_msg) elif 70 <= used_disk_percent < 95: _msg = hostname + ": disk usage is at " _msg += str(used_disk_percent) + "%" _msg += ", disk cleanup is recommended." 
self.logger.warning(_msg) elif used_disk_percent >= 95: _msg = hostname + ":disk usage > 95%, skipping the test(s)" self.hardware_report_timer.cancel() raise SkipTest(_msg) # checks for core files pbs_conf = du.parse_pbs_config(hostname) mom_priv_path = os.path.join(pbs_conf["PBS_HOME"], "mom_priv") if du.isdir(hostname=hostname, path=mom_priv_path): mom_priv_files = du.listdir( hostname=hostname, path=mom_priv_path, sudo=True, fullpath=False) if fnmatch.filter(mom_priv_files, "core*"): _msg = hostname + ": core files found in " _msg += mom_priv_path self.logger.warning(_msg) server_priv_path = os.path.join( pbs_conf["PBS_HOME"], "server_priv") if du.isdir(hostname=hostname, path=server_priv_path): server_priv_files = du.listdir( hostname=hostname, path=server_priv_path, sudo=True, fullpath=False) if fnmatch.filter(server_priv_files, "core*"): _msg = hostname + ": core files found in " _msg += server_priv_path self.logger.warning(_msg) sched_priv_path = os.path.join(pbs_conf["PBS_HOME"], "sched_priv") if du.isdir(hostname=hostname, path=sched_priv_path): sched_priv_files = du.listdir( hostname=hostname, path=sched_priv_path, sudo=True, fullpath=False) if fnmatch.filter(sched_priv_files, "core*"): _msg = hostname + ": core files found in " _msg += sched_priv_path self.logger.warning(_msg) for u in PBS_ALL_USERS: user_home_files = du.listdir(hostname=hostname, path=u.home, sudo=True, fullpath=False, runas=u.name) if user_home_files and fnmatch.filter( user_home_files, "core*"): _msg = hostname + ": user-" + str(u) _msg += ": core files found in " self.logger.warning(_msg + u.home) def startTest(self, test): """ Start the test """ if ((self.cumulative_tc_failure_threshold != 0) and (self.__tf_count >= self.cumulative_tc_failure_threshold)): _msg = 'Total testcases failure count exceeded cumulative' _msg += ' testcase failure threshold ' _msg += '(%d)' % self.cumulative_tc_failure_threshold self.logger.error(_msg) raise KeyboardInterrupt if ((self.tc_failure_threshold 
!= 0) and (self.__failed_tc_count >= self.tc_failure_threshold)): if self.__failed_tc_count_msg: raise TCThresholdReached _msg = 'Testcases failure for this testsuite count exceeded' _msg += ' testcase failure threshold ' _msg += '(%d)' % self.tc_failure_threshold self.logger.error(_msg) self.__failed_tc_count_msg = True raise TCThresholdReached rv = None rv = self.__are_requirements_matching(self.param_dict, test) if rv is not None: # Below method call is needed in order to get the test case # details in the output and to have the skipped test count # included in total run count of the test run self.result.startTest(test) raise SkipTest(rv) # function report hardware status and core files self.check_hardware_status_and_core_files(test) def timeout_handler(signum, frame): raise TimeOut('Timed out after %s second' % timeout) if PTLTestRunner.timeout is None: timeout = self.__get_timeout(test) old_handler = signal.signal(signal.SIGALRM, timeout_handler) setattr(test, 'old_sigalrm_handler', old_handler) signal.alarm(timeout) def stopTest(self, test): """ Stop the test """ old_sigalrm_handler = getattr(test, 'old_sigalrm_handler', None) if old_sigalrm_handler is not None: signal.signal(signal.SIGALRM, old_sigalrm_handler) signal.alarm(0) def addError(self, test, err): """ Add error """ if isclass(err[0]) and issubclass(err[0], TCThresholdReached): return True self.__set_test_end_data(test, err) def addFailure(self, test, err): """ Add failure """ self.__set_test_end_data(test, err) def addSuccess(self, test): """ Add success """ self.__set_test_end_data(test) def _cleanup(self): self.logger.info('Cleaning up temporary files') du = DshUtils() hosts = set(self.param_dict['moms']).union( set(self.param_dict['servers'])) for user in PBS_USERS: self.logger.debug('Cleaning %s\'s home directory' % (str(user))) runas = PbsUser.get_user(user) for host in hosts: ret = du.run_cmd(host, cmd=['printenv', 'HOME'], sudo=True, runas=runas, logerr=False, as_script=False, 
level=logging.DEBUG) if ret['rc'] == 0: path = ret['out'][0].strip() else: return None ftd = [] files = du.listdir(host, path=path, runas=user, level=logging.DEBUG) bn = os.path.basename ftd.extend([f for f in files if bn(f).startswith('PtlPbs')]) ftd.extend([f for f in files if bn(f).startswith('STDIN')]) if len(ftd) > 1000: for i in range(0, len(ftd), 1000): j = i + 1000 du.rm(host, path=ftd[i:j], runas=user, force=True, level=logging.DEBUG) root_dir = os.sep dirlist = set([os.path.join(root_dir, 'tmp'), os.path.join(root_dir, 'var', 'tmp')]) # get tmp dir from the environment for envname in 'TMPDIR', 'TEMP', 'TMP': dirname = os.getenv(envname) if dirname: dirlist.add(dirname) p = re.compile(r'^pbs\.\d+') for tmpdir in dirlist: # list the contents of each tmp dir and # get the file list to be deleted self.logger.info('Cleaning up ' + tmpdir + ' dir') ftd = [] files = du.listdir(path=tmpdir) bn = os.path.basename ftd.extend([f for f in files if bn(f).startswith('PtlPbs')]) ftd.extend([f for f in files if bn(f).startswith('STDIN')]) ftd.extend([f for f in files if bn(f).startswith('pbsscrpt')]) ftd.extend([f for f in files if bn(f).startswith('pbs.conf.')]) ftd.extend([f for f in files if p.match(bn(f))]) for f in ftd: du.rm(path=f, sudo=True, recursive=True, force=True, level=logging.DEBUG) for f in du.tmpfilelist: du.rm(path=f, sudo=True, force=True, level=logging.DEBUG) del du.tmpfilelist[:] tmpdir = tempfile.gettempdir() os.chdir(tmpdir) tmppath = os.path.join(tmpdir, 'dejagnutemp%s' % os.getpid()) if du.isdir(path=tmppath): du.rm(path=tmppath, recursive=True, sudo=True, force=True, level=logging.DEBUG) def begin(self): command = sys.argv command[0] = os.path.basename(command[0]) self.logger.info('input command: ' + ' '.join(command)) self.logger.info('param: ' + str(self.param)) self.logger.info('ptl version: ' + str(ptl.__version__)) _m = 'platform: ' + ' '.join(platform.uname()).strip() self.logger.info(_m) self.logger.info('python version: ' + 
str(platform.python_version())) self.logger.info('user: ' + pwd.getpwuid(os.getuid())[0]) self.logger.info('-' * 80) if self.lcov_data is not None: self.lcov_utils = LcovUtils(cov_bin=self.lcov_bin, html_bin=self.genhtml_bin, cov_out=self.lcov_out, data_dir=self.lcov_data, html_nosrc=self.lcov_nosrc, html_baseurl=self.lcov_baseurl) # Initialize coverage analysis self.lcov_utils.zero_coverage() # The following 'dance' is done due to some oddities on lcov's # part, according to this the lcov readme file at # http://ltp.sourceforge.net/coverage/lcov/readme.php that reads: # # Note that this step only works after the application has # been started and stopped at least once. Otherwise lcov will # abort with an error mentioning that there are no data/.gcda # files. self.lcov_utils.initialize_coverage(name='PTLTestCov') PBSInitServices().restart() self._cleanup() def finalize(self, result): if self.lcov_data is not None: # See note above that briefly explains the 'dance' needed to get # reliable coverage data PBSInitServices().restart() self.lcov_utils.capture_coverage(name='PTLTestCov') exclude = ['"*work/gSOAP/*"', '"*/pbs/doc/*"', 'lex.yy.c', 'pbs_ifl_wrap.c', 'usr/include/*', 'unsupported/*'] self.lcov_utils.merge_coverage_traces(name='PTLTestCov', exclude=exclude) self.lcov_utils.generate_html() self.lcov_utils.change_baseurl() self.logger.info('\n'.join(self.lcov_utils.summarize_coverage())) self._cleanup() ================================================ FILE: test/fw/ptl/utils/plugins/ptl_test_tags.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. 
You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. 
import sys
import logging
import unittest
from nose.plugins.base import Plugin
import collections
try:
    from collections.abc import Callable  # Python 3.10+
except ImportError:
    from collections import Callable  # For Python versions before 3.10

log = logging.getLogger('nose.plugins.PTLTestTags')

# Attribute name under which the accumulated tag list is stored on a
# decorated class, function or method.
TAGKEY = '__PTL_TAGS_LIST__'


def tags(*args, **kwargs):
    """
    Decorator that adds tags to classes or functions or methods.

    Positional args become boolean tags (set to True on the object);
    keyword args become valued tags stored as ``name=value``.
    """
    def wrap_obj(obj):
        tagobj = getattr(obj, TAGKEY, [])
        for name in args:
            tagobj.append(name)
            PTLTestTags.tags_list.append(name)
            setattr(obj, name, True)
        for name, value in kwargs.items():
            tagobj.append('%s=%s' % (name, value))
            PTLTestTags.tags_list.append(name)
            setattr(obj, name, value)
        setattr(obj, TAGKEY, sorted(set(tagobj)))
        return obj
    return wrap_obj


def get_tag_value(method, cls, tag_name, default=False):
    """
    Look up a tag on a ``method/function``. If the tag isn't found
    there, look it up on the method's class, if any.
    """
    # Sentinel distinguishes "attribute absent" from falsy tag values.
    Missing = object()
    value = getattr(method, tag_name, Missing)
    if value is Missing and cls is not None:
        value = getattr(cls, tag_name, Missing)
    if value is Missing:
        return default
    return value


class EvalHelper(object):

    """
    Object that can act as context dictionary for eval and looks up
    names as attributes on a method/function and its class.
    """

    def __init__(self, method, cls):
        self.method = method
        self.cls = cls

    def __getitem__(self, name):
        return get_tag_value(self.method, self.cls, name)


class FakeRunner(object):

    # Stand-in runner returned when only a tag/test listing was requested;
    # run() prints the listing and exits instead of executing tests.

    def __init__(self, matched, tags_list, list_tags, verbose):
        self.matched = matched
        self.tags_list = tags_list
        self.list_tags = list_tags
        self.verbose = verbose

    def run(self, test):
        if self.list_tags:
            # --list-tags mode: print every known tag name, one per line
            print(('\n'.join(sorted(set(self.tags_list)))))
            sys.exit(0)
        suites = sorted(set(self.matched.keys()))
        if not self.verbose:
            print(('\n'.join(suites)))
        else:
            for k in suites:
                v = sorted(set(self.matched[k]))
                for _v in v:
                    print((k + '.'
                           + _v))
        sys.exit(0)


class PTLTestTags(Plugin):

    """
    Load test cases from given parameter
    """
    name = 'PTLTestTags'
    score = sys.maxsize - 3
    logger = logging.getLogger(__name__)
    # Class-level list of every tag name registered via the @tags decorator.
    tags_list = []

    def __init__(self):
        Plugin.__init__(self)
        self.tags_to_check = []
        self.tags = []
        self.eval_tags = []
        self.tags_info = False
        self.list_tags = False
        self.verbose = False
        self.matched = {}
        self._test_marker = 'test_'

    def options(self, parser, env):
        """
        Register command line options
        """
        pass

    def set_data(self, tags, eval_tags, tags_info=False, list_tags=False,
                 verbose=False):
        self.tags.extend(tags)
        self.eval_tags.extend(eval_tags)
        self.tags_info = tags_info
        self.list_tags = list_tags
        self.verbose = verbose

    def configure(self, options, config):
        """
        Configure the plugin and system, based on selected options.

        attr and eval_attr may each be lists. self.attribs will be a list
        of lists of tuples. In that list, each list is a group of
        attributes, all of which must match for the rule to match.
        """
        self.tags_to_check = []
        for tag in self.eval_tags:
            # eval tags are matched by evaluating the expression with the
            # method/class attributes as the lookup namespace (EvalHelper)
            def eval_in_context(expr, obj, cls):
                return eval(expr, None, EvalHelper(obj, cls))
            self.tags_to_check.append([(tag, eval_in_context)])
        for tags in self.tags:
            tag_group = []
            for tag in tags.strip().split(','):
                if not tag:
                    continue
                items = tag.split('=', 1)
                if len(items) > 1:
                    # "name=value" form
                    key, value = items
                else:
                    # bare "name" means True; negated "!name" means False
                    key = items[0]
                    if key[0] == '!':
                        key = key[1:]
                        value = False
                    else:
                        value = True
                tag_group.append((key, value))
            self.tags_to_check.append(tag_group)
        if (len(self.tags_to_check) > 0) or self.list_tags:
            self.enabled = True

    def is_tags_matching(self, method, cls=None):
        """
        Verify whether a method has the required tags

        The method is considered a match if it matches all tags
        for any tag group.
""" any_matched = False for group in self.tags_to_check: group_matched = True for key, value in group: tag_value = get_tag_value(method, cls, key) if isinstance(value, Callable): if not value(key, method, cls): group_matched = False break elif value is True: if not bool(tag_value): group_matched = False break elif value is False: if bool(tag_value): group_matched = False break elif type(tag_value) in (list, tuple): value = str(value).lower() if value not in [str(x).lower() for x in tag_value]: group_matched = False break else: if ((value != tag_value) and (str(value).lower() != str(tag_value).lower())): group_matched = False break any_matched = any_matched or group_matched if not any_matched: return False def prepareTestRunner(self, runner): """ Prepare test runner """ if (self.tags_info or self.list_tags): return FakeRunner(self.matched, self.tags_list, self.list_tags, self.verbose) def wantClass(self, cls): """ Accept the class if its subclass of TestCase and has at-least one test case """ if not issubclass(cls, unittest.TestCase): return False has_test = False for t in dir(cls): if t.startswith(self._test_marker): has_test = True break if not has_test: return False def wantFunction(self, function): """ Accept the function if its tags match. """ return False def wantMethod(self, method): """ Accept the method if its tags match. """ try: cls = method.__self__.__class__ except AttributeError: return False if not method.__name__.startswith(self._test_marker): return False rv = self.is_tags_matching(method, cls) if rv is None: cname = cls.__name__ if cname not in self.matched.keys(): self.matched[cname] = [] self.matched[cname].append(method.__name__) return rv ================================================ FILE: test/fw/ptl.csh ================================================ #!/usr/bin/csh # # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. 
# # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. 
# This file will set path variables in case of ptl installation

# First try to locate an installed *-ptl package via the distribution's
# package manager (dpkg on Debian-like systems, rpm elsewhere).
if ( -f /etc/debian_version ) then
	set __ptlpkgname=`dpkg -W -f='${binary:Package}\n' | grep -E '*-ptl$'`
	if ( "x${__ptlpkgname}" != "x" ) then
		set ptl_prefix_lib=`dpkg -L ${__ptlpkgname} | grep -m 1 lib$`
	endif
else
	set __ptlpkgname=`rpm -qa | grep -E '*-ptl-[[:digit:]]'`
	if ( "x${__ptlpkgname}" != "x" ) then
		set ptl_prefix_lib=`rpm -ql ${__ptlpkgname} | grep -m 1 lib$`
	endif
endif
if ( $?ptl_prefix_lib ) then
	# Package found: derive bin/ and site-packages paths from its lib dir.
	set python_dir=`/bin/ls -1 ${ptl_prefix_lib}`
	set prefix=`dirname ${ptl_prefix_lib}`
	setenv PATH ${prefix}/bin/:${PATH}
	if ( $?PYTHONPATH ) then
		setenv PYTHONPATH ${prefix}/lib/${python_dir}/site-packages/:$PYTHONPATH
	else
		setenv PYTHONPATH ${prefix}/lib/${python_dir}/site-packages/
	endif
	unset python_dir
	unset prefix
	unset ptl_prefix_lib
else
	# No package: fall back to the PTL tree next to PBS_EXEC from pbs.conf.
	if ( $?PBS_CONF_FILE ) then
		set conf = "$PBS_CONF_FILE"
	else
		set conf = /etc/pbs.conf
	endif
	if ( -r "${conf}" ) then
		# we only need PBS_EXEC from pbs.conf
		set __PBS_EXEC=`grep '^[[:space:]]*PBS_EXEC=' "$conf" | tail -1 | sed 's/^[[:space:]]*PBS_EXEC=\([^[:space:]]*\)[[:space:]]*/\1/'`
		if ( "X${__PBS_EXEC}" != "X" ) then
			# Define PATH and PYTHONPATH for the users
			set PTL_PREFIX=`dirname ${__PBS_EXEC}`/ptl
			set python_dir=`/bin/ls -1 ${PTL_PREFIX}/lib`/site-packages
			if ( $?PATH && -d ${PTL_PREFIX}/bin ) then
				setenv PATH "${PATH}:${PTL_PREFIX}/bin"
			endif
			if ( -d "${PTL_PREFIX}/lib/${python_dir}" ) then
				if ( $?PYTHONPATH ) then
					setenv PYTHONPATH "${PYTHONPATH}:${PTL_PREFIX}/lib/${python_dir}"
				else
					setenv PYTHONPATH "${PTL_PREFIX}/lib/${python_dir}"
				endif
			endif
		endif
	endif
	unset __PBS_EXEC
	unset PTL_PREFIX
	unset conf
	unset python_dir
endif


================================================
FILE: test/fw/ptl.sh
================================================
#!/usr/bin/sh
#
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
# # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. 
# This file will set path variables in case of ptl installation
# Locate the installed PTL package: Debian-style systems are queried via
# dpkg, all other distros via rpm.  The package name ends in "-ptl".
if [ -f /etc/debian_version ]; then
	__ptlpkgname=$(dpkg -W -f='${binary:Package}\n' 2>/dev/null | grep -E '*-ptl$')
	if [ "x${__ptlpkgname}" != "x" ]; then
		# First entry of the package file list ending in "lib" is the library root
		ptl_prefix_lib=$(dpkg -L ${__ptlpkgname} 2>/dev/null | grep -m 1 lib$ 2>/dev/null)
	fi
else
	__ptlpkgname=$(rpm -qa 2>/dev/null | grep -E '*-ptl-[[:digit:]]')
	if [ "x${__ptlpkgname}" != "x" ]; then
		ptl_prefix_lib=$(rpm -ql ${__ptlpkgname} 2>/dev/null | grep -m 1 lib$ 2>/dev/null)
	fi
fi
# If a native package was found, derive PATH/PYTHONPATH from its install
# prefix; otherwise fall back to the PTL tree next to PBS_EXEC from pbs.conf.
if [ "x${ptl_prefix_lib}" != "x" ]; then
	# The single entry under lib/ is the pythonX.Y directory name
	python_dir=$( /bin/ls -1 ${ptl_prefix_lib} )
	prefix=$( dirname ${ptl_prefix_lib} )
	export PATH=${prefix}/bin/:${PATH}
	# Append existing PYTHONPATH only when it is already set
	export PYTHONPATH=${prefix}/lib/${python_dir}/site-packages${PYTHONPATH:+:$PYTHONPATH}
	unset python_dir
	unset prefix
	unset ptl_prefix_lib
else
	conf="${PBS_CONF_FILE:-/etc/pbs.conf}"
	if [ -r "${conf}" ]; then
		# we only need PBS_EXEC from pbs.conf
		__PBS_EXEC=$( grep '^[[:space:]]*PBS_EXEC=' "$conf" | tail -1 | sed 's/^[[:space:]]*PBS_EXEC=\([^[:space:]]*\)[[:space:]]*/\1/' )
		if [ "X${__PBS_EXEC}" != "X" ]; then
			# Define PATH and PYTHONPATH for the users
			PTL_PREFIX=$( dirname ${__PBS_EXEC} )/ptl
			python_dir=$( /bin/ls -1 ${PTL_PREFIX}/lib )/site-packages
			[ -d "${PTL_PREFIX}/bin" ] && export PATH="${PATH}:${PTL_PREFIX}/bin"
			[ -d "${PTL_PREFIX}/lib/${python_dir}" ] && export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}${PTL_PREFIX}/lib/${python_dir}"
			# Also pick up the PBS-bundled python modules when present
			[ -d "${__PBS_EXEC}/lib/python/altair" ] && export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}${__PBS_EXEC}/lib/python/altair"
			[ -d "${__PBS_EXEC}/lib64/python/altair" ] && export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}${__PBS_EXEC}/lib64/python/altair"
		fi
		# Scrub helper variables so they do not leak into the user's shell
		unset __PBS_EXEC
		unset PTL_PREFIX
		unset conf
		unset python_dir
	fi
fi
================================================ FILE: test/fw/ptlreport ================================================
#!/bin/bash
# coding: utf-8
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# PBS Pro is commercially licensed software that shares a common core with
# the OpenPBS software. For a copy of the commercial license terms and
# conditions, go to: (http://www.pbspro.com/agreement.html) or contact the
# Altair Legal Department.
#
# Altair's dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of OpenPBS and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair's trademarks, including but not limited to "PBS™",
# "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is
# subject to Altair's trademark licensing policies.
# Parse a PTL test output log and report per-status counts and details.
prog="`basename $0`"

# Print the command synopsis and option summary.
usage() {
	echo -en "${prog}\n"
	echo -en "\tParses the PTL test output file and reports\n"
	echo -en "\tvarious counts like total, passed, failed, error-ed,\n"
	echo -en "\tskipped and timedout test cases from file.\n\n"
	echo -en "Usage:\n\t${prog} [OPTIONS]\n\n"
	echo -en "OPTIONS:\n"
	echo -en "\t-t | --total\t- Print total number of test cases\n"
	echo -en "\t-p | --passes\t- Print passed test cases\n"
	echo -en "\t-f | --fails\t- Print failed test cases\n"
	echo -en "\t-e | --errors\t- Print error-ed test cases\n"
	echo -en "\t-s | --skipped\t- Print skipped test cases\n"
	echo -en "\t-T | --timedout\t- Print timedout test cases\n"
	echo -en "\t-r | --runtime\t- Print total runtime of tests\n"
	echo -en "\t-S | --summary\t- Print summary of tests\n"
	echo -en "\t-v | --verbose\t- Print verbose output, can be supplied multiple times to increase verbosity\n\n"
}

# args: <enabled:0|1> <count> <status-label>
# Prints the count (or, with -v, the matching test names / tracebacks)
# for one status category.  Reads ${ptl_test_log} and ${verbose} globals.
print_info() {
	if [ ${1} -eq 1 ]
	then
		# Emit a blank separator line between sections after the first one
		[ ${_space} -eq 1 ] && echo " " || _space=1
		if [ ${verbose} -ge 1 ]
		then
			# Nothing in this category: print the zero count and stop
			[ ${2} -le 0 ] && echo "${3^} test(s): ${2}" && return
			# -v (or any verbosity for "skipped"): list the test names
			[ ${verbose} -eq 1 -o "${3}" == "skipped" ] && echo "${2} test(s) ${3}:" && \
				sed -n "/^${3}: \(.*\)$/p" "${ptl_test_log}" | awk '{ $1 = "\t"; print $0 }' && return
			if [ ${verbose} -gt 1 ]
			then
				# -vv: additionally dump the traceback that follows each entry.
				# Spaces in names are temporarily mapped to '@' so the shell
				# for-loop iterates one record at a time.
				lines=`sed -n "/^${3}: \(.*\)$/p" "${ptl_test_log}" | awk '{ $1 = ""; gsub(/ /, "@", $0); print $0 }'`
				for line in ${lines}
				do
					line=${3^^}":"`echo ${line} | tr '@' ' '`
					echo ${line}
					sed -n "/${line}/,\${N;/^\n$/{P;q};P;D}" "${ptl_test_log}" | \
						awk 'NR > 3 { sub(/.*Traceback/, "Traceback", $0); print " "$0}'
				done
			fi
		else
			# Non-verbose: bare count only
			echo ${2}
		fi
	fi
}

# Need at least the log file and one option
if [ $# -le 1 ]
then
	usage
	exit 1
fi

ptl_test_log=$1
if [ ! -r "${ptl_test_log}" ]
then
	# BUGFIX: error message read "does't"
	echo "${prog}: ${ptl_test_log} doesn't exist or doesn't have read permission!"
	exit 1
fi

# Option flags (0 = off, 1 = requested); verbose counts -v occurrences
total=0
passes=0
fails=0
errors=0
skipped=0
timedout=0
summary=0
verbose=0
runtime=0
_space=0

shift
while [ "$1" != "" ]; do
	case $1 in
		-p | --passes) passes=1; shift;;
		-f | --fails) fails=1; shift;;
		-e | --errors) errors=1; shift;;
		-s | --skipped) skipped=1; shift ;;
		-T | --timedout) timedout=1; shift;;
		-v | --verbose) verbose=$((${verbose} + 1)); shift ;;
		-t | --total) total=1; shift;;
		-S | --summary) summary=1; shift;;
		-r | --runtime) runtime=1; shift;;
		-h | --help ) usage; exit 0;;
		* ) echo -en "Unknown Option: $1\n\n"; usage; exit 1;;
	esac
done

# The summary line looks like "run: N, failures: N, errors: N, ..."; pull
# out every second comma/colon-separated field to get the six counts.
summary_line=`sed -n '/^run:.*: [0-9]*$/p' "${ptl_test_log}"`
read total_ct pass_ct fail_ct err_ct skip_ct timedout_ct <<< \
	`echo ${summary_line} | awk -F '[,:]' \
	'ORS=" " { for (i=2; i<=NF; i=i+2) { gsub(/^[ \t]+/, "", $i); print $i } }'`

if [ ${total} -eq 1 ]
then
	[ ${_space} -eq 1 ] && echo " " || _space=1
	[ ${verbose} -ge 1 ] && echo "Total test(s): ${total_ct}" || echo ${total_ct}
fi
if [ ${passes} -eq 1 ]
then
	[ ${_space} -eq 1 ] && echo " " || _space=1
	[ ${verbose} -ge 1 ] && echo "Passed test(s): ${pass_ct}" || echo ${pass_ct}
fi
print_info ${fails} ${fail_ct} "failed"
print_info ${errors} ${err_ct} "error"
print_info ${skipped} ${skip_ct} "skipped"
print_info ${timedout} ${timedout_ct} "timedout"
if [ ${summary} -eq 1 ]
then
	[ ${_space} -eq 1 ] && echo " " || _space=1
	[ ${verbose} -ge 1 ] && echo -en "Summary: \n\t "
	echo ${summary_line}
fi
if [ ${runtime} -eq 1 ]
then
	# "Tests run in HH:MM:SS.ms" -> keep everything before the fraction
	test_run_output=`sed -n '/^Tests run in [\.:0-9]*$/p' "${ptl_test_log}" | awk -F. '{print $1}'`
	[ ${_space} -eq 1 ] && echo " " || _space=1
	[ ${verbose} -ge 1 ] && echo ${test_run_output} || echo ${test_run_output} | awk '{ print $NF }'
fi
exit 0
================================================ FILE: test/fw/requirements.txt ================================================
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
# # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. nose beautifulsoup4 pexpect defusedxml ================================================ FILE: test/fw/setup.py.in ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. 
You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. 
from setuptools import setup, find_packages import os os.chdir(os.path.dirname(os.path.abspath(os.path.abspath(__file__)))) def get_reqs(): install_requires = open('requirements.txt').readlines() return [r.strip() for r in install_requires] def get_scripts(): return ['bin/%s' % (x) for x in os.listdir('bin')] setup( name='PbsTestLab', version='@PBS_VERSION@', packages=find_packages(), scripts=get_scripts(), include_package_data=True, license='AGPLv3 with exceptions', description='PBS Testing and Benchmarking Framework', long_description=open(os.path.abspath('./doc/intro.rst')).read(), install_requires=get_reqs(), keywords='PbsTestLab ptl pbs', zip_safe=False, classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Other Environment', 'Intended Audience :: Developers', 'License :: AGPLv3 with exceptions', 'Operating System :: POSIX :: Linux', 'Programming Language :: Python :: 3.6', 'Topic :: Software Development :: Testing', 'Topic :: Software Development :: Quality Assurance', ] ) ================================================ FILE: test/scripts/qsub_multi.sh ================================================ #!/bin/bash # Used to achieve faster job submission of large number of jobs for performance testing if [ $# -lt 2 ]; then echo "syntax: $0 " exit 1 fi function submit_jobs { njobs=$1 echo "New thread submitting jobs=$njobs" for i in $(seq 1 $njobs) do qsub -- /bin/date > /dev/null done } if [ "$1" = "submit" ]; then njobs=$2 submit_jobs $njobs exit 0 fi nthreads=$1 njobs=$2 echo "parameters supplied: nthreads=$nthreads, njobs=$njobs" start_time=`date +%s%3N` for i in $(seq 1 $nthreads) do setsid $0 submit $njobs & done wait end_time=`date +%s%3N` diff=`bc -l <<< "scale=3; ($end_time - $start_time) / 1000"` total_jobs=`bc -l <<< "$njobs * $nthreads"` perf=`bc -l <<< "scale=3; $total_jobs / $diff"` echo "Time(ms) started=$start_time, ended=$end_time" echo "Total jobs submitted=$total_jobs, time taken(secs.ms)=$diff, jobs/sec=$perf" 
================================================ FILE: test/tests/Makefile.am ================================================ # # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. 
#
if ENABLEPTL
# Install destination for all PTL test suites shipped with the build
ptl_testsdir = ${ptl_prefix}/tests
# Top-level test modules are installed as _PYTHON (byte-compiled on install)
dist_ptl_tests_PYTHON = $(wildcard $(srcdir)/*.py)
# One <category>dir / _DATA pair per test category; files installed verbatim
ptl_testfunctionaldir = $(ptl_testsdir)/functional
dist_ptl_testfunctional_DATA = $(wildcard $(srcdir)/functional/*.py)
ptl_testinterfacesdir = $(ptl_testsdir)/interfaces
dist_ptl_testinterfaces_DATA = $(wildcard $(srcdir)/interfaces/*.py)
ptl_testperformancedir = $(ptl_testsdir)/performance
dist_ptl_testperformance_DATA = $(wildcard $(srcdir)/performance/*.py)
ptl_testresiliencedir = $(ptl_testsdir)/resilience
dist_ptl_testresilience_DATA = $(wildcard $(srcdir)/resilience/*.py)
ptl_testsecuritydir = $(ptl_testsdir)/security
dist_ptl_testsecurity_DATA = $(wildcard $(srcdir)/security/*.py)
ptl_testselftestdir = $(ptl_testsdir)/selftest
dist_ptl_testselftest_DATA = $(wildcard $(srcdir)/selftest/*.py)
ptl_testupgradesdir = $(ptl_testsdir)/upgrades
dist_ptl_testupgrades_DATA = $(wildcard $(srcdir)/upgrades/*.py)
endif
================================================ FILE: test/tests/__init__.py ================================================
# coding: utf-8
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
# # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. ================================================ FILE: test/tests/functional/__init__.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. 
For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. from ptl.utils.pbs_testsuite import * class TestFunctional(PBSTestSuite): """ Base test suite for Functional tests """ pass ================================================ FILE: test/tests/functional/pbs_Rrecord_resources_used.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. 
For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. from tests.functional import * import re @requirements(num_moms=2) class Test_Rrecord_with_resources_used(TestFunctional): """ This test suite tests whether the 'R' record in accounting logs has information on resources_used in the following scenarios. a) The node the job was running on goes down and node_fail_requeue timeout is hit. b) It is rerun using qrerun . c) It is rerun using qrerun -Wforce . 
d) mom is restarted without any options or with the '-r' option """ def setUp(self): TestFunctional.setUp(self) if len(self.moms) != 2: self.skipTest('test requires two MoMs as input, ' + 'use -p moms=:') self.server.set_op_mode(PTL_CLI) # PBSTestSuite returns the moms passed in as parameters as dictionary # of hostname and MoM object self.momA = self.moms.values()[0] self.momB = self.moms.values()[1] self.hostA = self.momA.shortname self.hostB = self.momB.shortname a = {'resources_available.ncpus': 4} self.server.manager(MGR_CMD_SET, NODE, a, id=self.hostA) self.server.manager(MGR_CMD_SET, NODE, a, id=self.hostB) self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'}) def common(self, is_nonrerunnable, restart_mom): # Set node_fail_requeue=5 on server a = {ATTR_nodefailrq: 5} self.server.manager(MGR_CMD_SET, SERVER, a) # Job script test = [] test += ['#PBS -N RequeueTest\n'] test += ['#PBS -l ncpus=1\n'] test += ['echo Starting test at `date`\n'] test += ['sleep 1000\n'] test1 = [] test1 += ['#PBS -N RequeueTest\n'] test1 += ['#PBS -lselect=1:ncpus=1 -l place=scatter\n'] test1 += ['echo Starting test at `date`\n'] test1 += ['sleep 1000\n'] # Submit three jobs J1,J2,J3[] j1 = Job(TEST_USER, attrs={ATTR_k: 'oe'}) j1.create_script(body=test) jid1 = self.server.submit(j1) if is_nonrerunnable is True: j2 = Job(TEST_USER, attrs={ATTR_r: 'n', ATTR_k: 'oe'}) else: j2 = Job(TEST_USER, attrs={ATTR_r: 'y', ATTR_k: 'oe'}) j2.create_script(body=test1) jid2 = self.server.submit(j2) j3 = Job(TEST_USER, attrs={ATTR_J: '1-6', ATTR_k: 'oe'}) j3.create_script(body=test) jid3 = self.server.submit(j3) subjobs = self.server.status(JOB, id=jid3, extend='t') jid3s1 = subjobs[1]['id'] # Wait for the jobs to start running. self.server.expect(JOB, {ATTR_substate: '42'}, jid1) self.server.expect(JOB, {ATTR_substate: '42'}, jid2) self.server.expect(JOB, {ATTR_substate: '42'}, jid3s1) # Verify that accounting logs have Resource_List. 
value self.server.accounting_match( msg='.*Resource_List.*', id=jid1, regexp=True) self.server.accounting_match( msg='.*Resource_List.*', id=jid2, regexp=True) self.server.accounting_match( msg='.*Resource_List.*', id=jid3s1, regexp=True) # Bring both moms down using kill -9 self.momA.signal('-KILL') self.momB.signal('-KILL') # Verify that both nodes are reported to be down. self.server.expect(NODE, {ATTR_NODE_state: ( MATCH_RE, '.*down.*')}, id=self.hostA) self.server.expect(NODE, {ATTR_NODE_state: ( MATCH_RE, '.*down.*')}, id=self.hostB) self.server.expect(JOB, {ATTR_state: 'Q'}, jid1) self.server.expect(JOB, {ATTR_state: 'Q'}, jid3s1) if is_nonrerunnable is False: # All rerunnable jobs - all should be in 'Q' state. self.server.expect(JOB, {ATTR_state: 'Q'}, jid2) else: # Job2 is non-rerunnable. self.server.expect(JOB, {ATTR_state: 'F'}, jid2, extend='x') # tracejob should show "Job requeued, execution node down" self.server.tracejob_match( msg='Job requeued, execution node .* down', id=jid1, regexp=True) if is_nonrerunnable is False: e = True else: e = False msg = 'Job requeued, execution node .* down' self.server.tracejob_match(msg=msg, id=jid2, regexp=True, existence=e) self.server.tracejob_match( msg='Job requeued, execution node .* down', id=jid3s1, regexp=True) self.server.accounting_match( msg='.*Resource_List.*', id=jid1, regexp=True) self.server.accounting_match( msg='.*Resource_List.*', id=jid2, regexp=True) self.server.accounting_match( msg='.*Resource_List.*', id=jid3s1, regexp=True) self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'}) if restart_mom == 's': # Start mom without any option self.momA.start() self.momB.start() elif restart_mom == 'r': # Start mom with -r option self.momA.start(args=['-r']) self.momB.start(args=['-r']) return jid1, jid2, jid3s1 def test_Rrecord_with_nodefailrequeue(self): """ Scenario: The node on which the job was running goes down and node_fail_requeue time-out is hit. 
Expected outcome: Server should record last known resource usage in the 'R' record. """ jid1, jid2, jid3s1 = self.common(False, False) self.server.accounting_match( msg='.*R;' + jid1 + '.*resources_used.*', id=jid1, regexp=True) self.server.accounting_match( msg='.*R;' + jid2 + '.*resources_used.*', id=jid2, regexp=True) self.server.accounting_match( msg='.*R;' + re.escape(jid3s1) + '.*resources_used.*', id=jid3s1, regexp=True) def test_Rrecord_when_mom_restarted_with_r(self): """ Scenario: The node on which the job was running goes down and node_fail_requeue time-out is hit and mom is restarted with '-r' Expected outcome: Server should record last known resource usage in the 'R' record. """ jid1, jid2, jid3s1 = self.common(False, 'r') self.server.accounting_match( msg='.*R;' + jid1 + '.*resources_used.*run_count=1', id=jid1, regexp=True) self.server.accounting_match( msg='.*R;' + jid2 + '.*resources_used.*run_count=1', id=jid2, regexp=True) self.server.accounting_match( msg='.*R;' + re.escape(jid3s1) + '.*resources_used.*run_count=1', id=jid3s1, regexp=True) def test_Rrecord_for_nonrerunnable_jobs(self): """ Scenario: One non-rerunnable job. The node on which the job was running goes down and node_fail_requeue time-out is hit. Expected outcome: Server should record last known resource usage in the 'R' record only for rerunnable jobs. """ a = {ATTR_JobHistoryEnable: 1} self.server.manager(MGR_CMD_SET, SERVER, a) jid1, jid2, jid3s1 = self.common(True, 'r') self.server.accounting_match( msg='.*R;' + jid1 + '.*resources_used.*run_count=1', id=jid1, regexp=True) self.server.accounting_match( msg='.*R;' + jid2 + '.*resources_used.*run_count=1', id=jid2, regexp=True, existence=False, max_attempts=5) self.server.accounting_match( msg='.*R;' + re.escape(jid3s1) + '.*resources_used.*run_count=1', id=jid3s1, regexp=True) def test_Rrecord_when_mom_restarted_without_r(self): """ Scenario: Mom restarted without '-r' option and jobs are requeued using qrerun. 
Expected outcome: Server should record last known resource usage in the 'R' record for both. """ jid1, jid2, jid3s1 = self.common(False, 's') self.server.accounting_match( msg='.*R;' + jid1 + '.*resources_used.*run_count=1', id=jid1, regexp=True) self.server.accounting_match( msg='.*R;' + jid2 + '.*resources_used.*run_count=1', id=jid2, regexp=True) self.server.accounting_match( msg='.*R;' + re.escape(jid3s1) + '.*resources_used.*run_count=1', id=jid3s1, regexp=True) # Verify that the jobs are in 'Q' state. self.server.expect(JOB, {ATTR_state: 'Q'}, jid1) self.server.expect(JOB, {ATTR_state: 'Q'}, jid2) self.server.expect(JOB, {ATTR_state: 'Q'}, jid3s1) self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'}) self.server.expect(JOB, {ATTR_substate: '42'}, jid1) self.server.expect(JOB, {ATTR_substate: '42'}, jid2) self.server.expect(JOB, {ATTR_substate: '42'}, jid3s1) # qrerun the jobs and wait for them to start running. self.server.rerunjob(jobid=jid1) self.server.rerunjob(jobid=jid2) self.server.rerunjob(jobid=jid3s1) # Confirm that the 'R' record is generated and the run_count is 2. self.server.accounting_match( msg='.*R;' + jid1 + '.*resources_used.*run_count=2', id=jid1, regexp=True) self.server.accounting_match( msg='.*R;' + jid2 + '.*resources_used.*run_count=2', id=jid2, regexp=True) self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'}) def test_Rrecord_with_multiple_reruns(self): """ Scenario: Job is rerun multiple times. Expected outcome: Server should record last known resource usage every time the job is rerun. """ dflt_q = self.server.default_queue # As user submit three jobs. 
        # (continuation of the R-record rerun test started above)
        # Build a rerunnable sleep job script shared by J1/J2/J3.
        test = []
        test += ['#PBS -N RequeueTest\n']
        test += ['#PBS -l ncpus=1\n']
        test += ['echo Starting test at `date`\n']
        test += ['sleep 1000\n']
        j1 = Job(TEST_USER)
        j1.create_script(body=test)
        j1.set_attributes({ATTR_r: 'y', ATTR_l + '.ncpus': 2})
        jid1 = self.server.submit(j1)
        j2 = Job(TEST_USER)
        j2.create_script(body=test)
        j2.set_attributes({ATTR_r: 'n', ATTR_l + '.ncpus': 2})
        jid2 = self.server.submit(j2)
        j3 = Job(TEST_USER)
        j3.create_script(body=test)
        j3.set_attributes({ATTR_J: '1-4', ATTR_k: 'oe'})
        jid3 = self.server.submit(j3)
        subjobs = self.server.status(JOB, id=jid3, extend='t')
        # subjobs[0] is the parent array job; [1] is the first subjob
        jid3s1 = subjobs[1]['id']
        # Verify that the jobs have started running.
        self.server.expect(JOB, {ATTR_substate: '42', 'run_count': 1}, jid1)
        self.server.expect(JOB, {ATTR_substate: '42', 'run_count': 1}, jid2)
        self.server.expect(JOB, {ATTR_state: 'B'}, jid3)
        self.server.expect(JOB, {ATTR_substate: '42', 'run_count': 1}, jid3s1)
        # Verify that the accounting logs have Resource_List. but no
        # R records.
        # NOTE(review): jid1/jid2 are embedded unescaped in these regex
        # patterns, so the '.' in the job id matches any character; harmless
        # here, but re.escape() (as used for jid3s1) would be stricter.
        self.server.accounting_match(
            msg='.*Resource_List.*', id=jid1, regexp=True)
        msg = '.*R;' + jid1 + '.*resources_used.*'
        self.server.accounting_match(msg=msg, id=jid1, regexp=True,
                                     existence=False)
        self.server.accounting_match(
            msg='.*Resource_List.*', id=jid2, regexp=True)
        msg = '.*R;' + jid2 + '.*resources_used.*'
        self.server.accounting_match(msg=msg, id=jid2, regexp=True,
                                     existence=False)
        self.server.accounting_match(
            msg='.*Resource_List.*', id=jid3s1, regexp=True)
        self.server.accounting_match(msg='.*R;' + re.escape(jid3s1) +
                                     '.*resources_used.*', id=jid3s1,
                                     regexp=True, existence=False)
        # sleep for 5 seconds so the jobs use some resources.
        time.sleep(5)
        self.server.rerunjob(jid1)
        self.server.rerunjob(jid3s1)
        # Verify that the accounting logs have R logs with last known resource
        # usage. No R logs for J2.
        self.server.accounting_match(
            msg='.*R;' + jid1 +
            '.*Exit_status=-11.*.*resources_used.*.*run_count=1.*',
            id=jid1, regexp=True)
        msg = '.*R;' + jid2 + '.*resources_used.*'
        self.server.accounting_match(msg=msg, id=jid2, regexp=True,
                                     existence=False)
        self.server.accounting_match(msg='.*R;' + re.escape(
            jid3s1) + '.*Exit_status=-11.*.*resources_used.*.*run_count=1.*',
            id=jid3s1, regexp=True)
        # sleep for 5 seconds so the jobs use some resources.
        time.sleep(5)
        self.server.rerunjob(jid1)
        self.server.rerunjob(jid3s1)
        # Verify that the accounting logs show R records with last known
        # resource usage; run_count should be 2 for J1 (second rerun) and
        # remain 1 in the matched record for the subjob.
        # No R logs in accounting for J2.
        self.server.accounting_match(
            msg='.*R;' + jid1 +
            '.*Exit_status=-11.*.*resources_used.*.*run_count=2.*',
            id=jid1, regexp=True)
        msg = '.*R;' + jid2 + '.*resources_used.*'
        self.server.accounting_match(msg=msg, id=jid2, regexp=True,
                                     existence=False)
        self.server.accounting_match(
            msg='.*R;' + re.escape(jid3s1) +
            '.*Exit_status=-11.*.*resources_used.*.*run_count=1.*',
            id=jid3s1, regexp=True)

    def test_Rrecord_with_multiple_reruns_case2(self):
        """
        Scenario: Jobs submitted with select cput and ncpus.
        Job is rerun multiple times.
        Expected outcome: Server should record last known resource
        usage that has cputime.
        """
        dflt_q = self.server.default_queue
        # Busy-loop shell script so the job accumulates cput.
        script = []
        script += ['i=0;\n']
        script += ['while [ $i -ne 0 ] || sleep 0.125;\n']
        script += ['do i=$(((i+1) % 10000 ));\n']
        script += ['done\n']
        j1 = Job(TEST_USER)
        j1.create_script(body=script)
        j1.set_attributes(
            {ATTR_l + '.cput': 160, ATTR_l + '.ncpus': 3, ATTR_k: 'oe'})
        jid1 = self.server.submit(j1)
        j2 = Job(TEST_USER)
        j2.create_script(body=script)
        j2.set_attributes(
            {ATTR_l + '.cput': 180, ATTR_l + '.ncpus': 3, ATTR_k: 'oe'})
        jid2 = self.server.submit(j2)
        # Verify that the jobs have started running.
        self.server.expect(JOB, {ATTR_substate: '42', 'run_count': 1}, jid1)
        self.server.expect(JOB, {ATTR_substate: '42', 'run_count': 1}, jid2)
        # Verify that the accounting logs have Resource_List. but no
        # R records.
        self.server.accounting_match(
            msg='.*Resource_List.*', id=jid1, regexp=True)
        msg = '.*R;' + jid1 + '.*resources_used.*'
        self.server.accounting_match(msg=msg, id=jid1, regexp=True,
                                     existence=False)
        self.server.accounting_match(
            msg='.*Resource_List.*', id=jid2, regexp=True)
        msg = '.*R;' + jid2 + '.*resources_used.*'
        self.server.accounting_match(msg=msg, id=jid2, regexp=True,
                                     existence=False)
        time.sleep(5)
        jids = self.server.select()
        self.server.rerunjob(jids)
        # Verify that the accounting logs have an R record with last known
        # resource usage; run_count in this first R record is 1 for J1/J2
        # (run_count=2 is checked after the second rerun below).
        self.server.accounting_match(
            msg='.*R;' + jid1 +
            '.*.*resources_used.cput=[0-9]*:[0-9]*:[0-9]*.*.*run_count=1.*',
            id=jid1, regexp=True)
        self.server.accounting_match(
            msg='.*R;' + jid2 +
            '.*.*resources_used.cput=[0-9]*:[0-9]*:[0-9]*.*.*run_count=1.*',
            id=jid2, regexp=True)
        time.sleep(5)
        jids = self.server.select()
        self.server.rerunjob(jids)
        # After the second rerun, an R record with run_count=2 must exist.
        self.server.accounting_match(
            msg='.*R;' + jid1 +
            '.*.*resources_used.cput=[0-9]*:[0-9]*:[0-9]*.*.*run_count=2.*',
            id=jid1, regexp=True)
        self.server.accounting_match(
            msg='.*R;' + jid2 +
            '.*.*resources_used.cput=[0-9]*:[0-9]*:[0-9]*.*.*run_count=2.*',
            id=jid2, regexp=True)

    def test_Rrecord_job_rerun_forcefully(self):
        """
        Scenario: Job is forcefully rerun.
        Expected outcome: server should record last known resource usage
        in the R record.
        """
        dflt_q = self.server.default_queue
        # Rerunnable sleep job script shared by J1/J2/J3.
        test = []
        test += ['#PBS -N RequeueTest\n']
        test += ['#PBS -l ncpus=1\n']
        test += ['echo Starting test at `date`\n']
        test += ['sleep 1000\n']
        j1 = Job(TEST_USER)
        j1.create_script(body=test)
        j1.set_attributes({ATTR_r: 'y', ATTR_l + '.ncpus': 2})
        jid1 = self.server.submit(j1)
        j2 = Job(TEST_USER)
        j2.create_script(body=test)
        j2.set_attributes({ATTR_r: 'n', ATTR_l + '.ncpus': 2})
        jid2 = self.server.submit(j2)
        j3 = Job(TEST_USER)
        j3.create_script(body=test)
        j3.set_attributes({ATTR_J: '1-4', ATTR_k: 'oe'})
        jid3 = self.server.submit(j3)
        subjobs = self.server.status(JOB, id=jid3, extend='t')
        jid3s1 = subjobs[1]['id']
        # Verify that the jobs have started running.
        self.server.expect(JOB, {ATTR_substate: '42', 'run_count': 1}, jid1)
        self.server.expect(JOB, {ATTR_substate: '42', 'run_count': 1}, jid2)
        self.server.expect(JOB, {ATTR_state: 'B'}, jid3)
        self.server.expect(JOB, {ATTR_substate: '42', 'run_count': 1}, jid3s1)
        # Verify that the accounting logs have Resource_List. but no
        # R records.
        self.server.accounting_match(
            msg='.*Resource_List.*', id=jid1, regexp=True)
        msg = '.*R;' + jid1 + '.*resources_used.*'
        self.server.accounting_match(msg=msg, id=jid1, regexp=True,
                                     existence=False)
        self.server.accounting_match(
            msg='.*Resource_List.*', id=jid2, regexp=True)
        msg = '.*R;' + jid2 + '.*resources_used.*'
        self.server.accounting_match(msg=msg, id=jid2, regexp=True,
                                     existence=False)
        self.server.accounting_match(
            msg='.*Resource_List.*', id=jid3s1, regexp=True)
        self.server.accounting_match(msg='.*R;' + re.escape(jid3s1) +
                                     '.*resources_used.*', id=jid3s1,
                                     regexp=True, existence=False)
        time.sleep(5)
        # 'T' also selects subjobs; rerun everything with force.
        jids = self.server.select(extend='T')
        self.server.rerunjob(jids, extend='force')
        # Verify that the accounting logs have R record with last known
        # resource usage and run_count should be 1 for J1 and J2
        self.server.accounting_match(
            msg='.*R;' + jid1 +
            '.*Exit_status=-11.*.*resources_used.*.*run_count=1.*',
            id=jid1, regexp=True)
        self.server.accounting_match(
            msg='.*R;' + jid2 +
            '.*Exit_status=-11.*.*resources_used.*.*run_count=1.*',
            id=jid2, regexp=True)
        self.server.accounting_match(msg='.*R;' + re.escape(
            jid3s1) + '.*Exit_status=-11.*.*resources_used.*.*run_count=1.*',
            id=jid3s1, regexp=True)
        time.sleep(5)
        jids = self.server.select(extend='T')
        self.server.rerunjob(jids, extend='force')
        # Verify that the accounting logs have R record with last known
        # usage and run_count should be 2 for J1 and J2.
        self.server.accounting_match(
            msg='.*R;' + jid1 +
            '.*Exit_status=-11.*.*resources_used.*.*run_count=2.*',
            id=jid1, regexp=True)
        self.server.accounting_match(
            msg='.*R;' + jid2 +
            '.*Exit_status=-11.*.*resources_used.*.*run_count=2.*',
            id=jid2, regexp=True)
        self.server.accounting_match(msg='.*R;' + re.escape(
            jid3s1) + '.*Exit_status=-11.*.*resources_used.*.*run_count=2.*',
            id=jid3s1, regexp=True)

    def tearDown(self):
        TestFunctional.tearDown(self)


================================================
FILE: test/tests/functional/pbs_acct_log.py
================================================
# coding: utf-8
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# PBS Pro is commercially licensed software that shares a common core with
# the OpenPBS software. For a copy of the commercial license terms and
# conditions, go to: (http://www.pbspro.com/agreement.html) or contact the
# Altair Legal Department.
#
# Altair's dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of OpenPBS and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair's trademarks, including but not limited to "PBS™",
# "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is
# subject to Altair's trademark licensing policies.


from tests.functional import *


class TestAcctLog(TestFunctional):
    """
    Tests dealing with the PBS accounting logs
    """

    def setUp(self):
        TestFunctional.setUp(self)
        # Host-level custom string resource used by the truncation tests.
        a = {'type': 'string', 'flag': 'h'}
        self.server.manager(MGR_CMD_CREATE, RSC, a, id='foo_str')

    def test_long_resource_end(self):
        """
        Test to see if a very long string resource is neither truncated
        in the job's resources_used attr or the accounting log at job end
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'job_history_enable': 'True'})
        # Create a very long string - the truncation was 2048 characters
        # 4096 is plenty big to show it
        hstr = '1'*4096
        hook_body = "import pbs\n"
        hook_body += "e = pbs.event()\n"
        hook_body += "hstr=\'" + hstr + "\'\n"
        hook_body += "e.job.resources_used[\"foo_str\"] = hstr\n"
        a = {'event': 'execjob_epilogue', 'enabled': 'True'}
        self.server.create_import_hook("ep", a, hook_body)
        J = Job()
        J.set_sleep_time(1)
        jid = self.server.submit(J)
        # Make sure the resources_used value hasn't been truncated
        self.server.expect(JOB, {'job_state': 'F'}, id=jid, extend='x')
        self.server.expect(
            JOB, {'resources_used.foo_str': hstr}, extend='x',
            max_attempts=1)
        # Make sure the accounting log hasn't been truncated
        log_match = 'resources_used.foo_str=' + hstr
        self.server.accounting_match(
            "E;%s;.*%s.*" % (jid, log_match), regexp=True)
        # Make sure the server log hasn't been truncated
        log_match = 'resources_used.foo_str=' + hstr
        self.server.log_match("%s;.*%s.*" % (jid, log_match), regexp=True)

    def test_long_resource_reque(self):
        """
        Test to see if a very long string value is truncated in the 'R'
        requeue accounting record
        """
        # Create a very long string - the truncation was 2048 characters
        # 4096 is plenty big to show it
        hstr = ""
        for i in range(4096):
            hstr += "1"
        hook_body = "import pbs\n"
        hook_body += "e = pbs.event()\n"
        hook_body += "hstr=\'" + hstr + "\'\n"
        hook_body += "e.job.resources_used[\"foo_str\"] = hstr\n"
        a = {'event': 'execjob_prologue', 'enabled': 'True'}
        self.server.create_import_hook("pr", a, hook_body)
        J = Job()
        jid = self.server.submit(J)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        self.server.rerunjob(jid)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid)
        # Make sure the accounting log hasn't been truncated
        acctlog_match = 'resources_used.foo_str=' + hstr
        self.server.accounting_match(
            "R;%s;.*%s.*" % (jid, acctlog_match), regexp=True)

    def test_queue_record(self):
        """
        Test the correct data is being printed in the queue record
        """
        # NOTE(review): 't' is assigned but never used in this test.
        t = time.time()
        a = {ATTR_g: TEST_USER.groups[0], ATTR_project: 'foo',
             ATTR_A: 'bar', ATTR_N: 'baz', ATTR_l + '.walltime': '1:00:00'}
        j1 = Job(TEST_USER, a)
        jid1 = self.server.submit(j1)
        (_, line) = self.server.accounting_match(';Q;' + jid1)
        # Check for euser
        self.assertIn('user=' + str(TEST_USER), line)
        # Check for egroup
        self.assertIn('group=' + str(TEST_USER.groups[0]), line)
        # Check for project
        self.assertIn('project=foo', line)
        # Check for account name
        self.assertIn('account=\"bar\"', line)
        # Check for job name
        self.assertIn('jobname=baz', line)
        # Check for queue
        self.assertIn('queue=workq', line)
        # Check for the existence of times
        self.assertIn('etime=', line)
        self.assertIn('ctime=', line)
        self.assertIn('qtime=', line)
        self.assertNotIn('start=', line)
        # Check for walltime
        self.assertIn('Resource_List.walltime=01:00:00', line)
        j2 = Job(TEST_USER, {ATTR_J: '1-2', ATTR_depend: 'afterok:' + jid1})
        jid2 = self.server.submit(j2)
        (_, line) = self.server.accounting_match(';Q;' + jid2)
        self.assertIn('array_indices=1-2', line)
        self.assertIn('depend=afterok:' + jid1, line)
        r = Reservation()
        rid1 = self.server.submit(r)
        a = {'reserve_state': (MATCH_RE, "RESV_CONFIRMED|2")}
        self.server.expect(RESV, a, id=rid1)
        j3 = Job(TEST_USER, {ATTR_queue: rid1.split('.')[0]})
        jid3 = self.server.submit(j3)
        (_, line) = self.server.accounting_match(';Q;' + jid3)
        self.assertIn('resvID=' + rid1, line)

    def test_queue_record_hook(self):
        """
        Test that changes made in a queuejob hook are reflected in the
        Q record
        """
        qj_hook = """
import pbs
pbs.event().job.project = 'foo'
pbs.event().accept()
"""
        qj_attrs = {'event': 'queuejob', 'enabled': 'True'}
        self.server.create_import_hook('qj', qj_attrs, qj_hook)
        j = Job()
        jid1 = self.server.submit(j)
        (_, line) = self.server.accounting_match(';Q;' + jid1)
        self.assertIn('project=foo', line)

    def test_alter_record(self):
        """
        Test the accounting log alter record
        """
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        j1 = Job(TEST_USER1)
        jid1 = self.server.submit(j1)
        # Basic test for existence of record for Resource_List
        self.server.alterjob(jid1, {ATTR_l + '.walltime': '1:00:00'})
        self.server.accounting_match(';a;' + jid1 +
                                     ';Resource_List.walltime=01:00:00')
        # Check for default value when unsetting
        self.server.manager(MGR_CMD_SET, SERVER,
                            {ATTR_rescdflt + '.walltime': '30:00'})
        self.server.alterjob(jid1, {ATTR_l + '.walltime': ''})
        self.server.accounting_match(';a;' + jid1 +
                                     ';Resource_List.walltime=00:30:00')
        self.server.alterjob(jid1, {ATTR_l + '.software': 'foo'})
        self.server.accounting_match(';a;' + jid1 +
                                     ';Resource_List.software=foo')
        # Check for UNSET record when value is unset
        self.server.alterjob(jid1, {ATTR_l + '.software': '\"\"'})
        self.server.accounting_match(';a;' + jid1 +
                                     ';Resource_List.software=UNSET')
        # Check for non-resource attribute
        self.server.alterjob(jid1, {ATTR_p: 150})
        self.server.accounting_match(';a;' + jid1 + ';Priority=150')
        self.server.alterjob(jid1, {ATTR_g: str(TSTGRP1)})
        self.server.accounting_match(';a;' + jid1 +
                                     ';group_list=' + str(TSTGRP1))
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        # Check that scheduler's alters are not logged
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.accounting_match(
            ';a;' + jid1 + ';comment', existence=False, max_attempts=2)

    def test_alter_record_hooks(self):
        """
        Test that when hooks set attributes, an 'a' record is logged
        """
        mj_hook = """
import pbs
pbs.event().job.comment = 'foo'
pbs.event().accept()
"""
        mj_attrs = {'event': 'modifyjob', 'enabled': 'True'}
        rj_hook = """
import pbs
pbs.event().job.project = 'abc'
pbs.event().reject('foo')
"""
        rj_attrs = {'event': 'runjob', 'enabled': 'True'}
        self.server.create_import_hook('mj', mj_attrs, mj_hook)
        self.server.create_import_hook('rj', rj_attrs, rj_hook)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        j1 = Job()
        jid1 = self.server.submit(j1)
        self.server.alterjob(jid1, {ATTR_p: 150})
        (_, line) = self.server.accounting_match(';a;' + jid1 + ';')
        self.assertIn('Priority=150', line)
        self.assertIn('comment=foo', line)
        try:
            self.server.runjob(jid1)
        except PbsRunError:
            # runjob hook is rejecting the run request
            pass
        self.server.accounting_match(';a;' + jid1 + ';project=abc')

    def test_alter_record_queuejob_hook(self):
        """
        Test that when a queuejob hook set an attribute, an 'a' record is
        logged.
        """
        qj_hook = """
import pbs
e1 = pbs.event()
e1.job.project = 'abc'
e2 = pbs.event()
e2.accept()
"""
        qj_attrs = {'event': 'queuejob', 'enabled': 'True'}
        self.server.create_import_hook('qj', qj_attrs, qj_hook)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        j1 = Job(TEST_USER, {'Resource_List.walltime': 42})
        j1.set_sleep_time(1)
        jid1 = self.server.submit(j1)
        self.server.alterjob(jid1, {ATTR_p: 150})
        (_, line) = self.server.accounting_match(';a;' + jid1 + ';')
        self.assertIn('Priority=150', line)
        (_, line) = self.server.accounting_match(';Q;' + jid1 + ';')
        self.assertIn('project=abc', line)
        self.server.runjob(jid1)
        (_, line) = self.server.accounting_match(';E;' + jid1 + ';')
        self.assertIn('project=abc', line)

    def test_alter_record_modifyjob_hook(self):
        """
        Test that when a modifyjob hook set attributes, an 'a' record is
        logged.
        """
        mj_hook = """
import pbs
e1 = pbs.event()
e1.job.comment = 'foo'
e1.job.project = 'abc'
e2 = pbs.event()
e2.accept()
"""
        mj_attrs = {'event': 'modifyjob', 'enabled': 'True'}
        self.server.create_import_hook('mj', mj_attrs, mj_hook)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        j1 = Job(TEST_USER, {'Resource_List.walltime': 42})
        j1.set_sleep_time(1)
        jid1 = self.server.submit(j1)
        self.server.alterjob(jid1, {ATTR_p: 150})
        (_, line) = self.server.accounting_match(';a;' + jid1 + ';')
        self.assertIn('Priority=150', line)
        self.assertIn('comment=foo', line)
        self.assertIn('project=abc', line)
        self.server.runjob(jid1)
        (_, line) = self.server.accounting_match(';E;' + jid1 + ';')
        # self.assertIn('comment=foo', line)  # Doesn't exist in E
        self.assertIn('project=abc', line)

    def test_alter_record_runjob_hook(self):
        """
        Test that when a runjob hook set attributes, an 'a' record is
        logged.
        """
        # Diagnostic hook: logs the event/job identities, changes nothing.
        info_hook = """
import pbs
e1 = pbs.event()
pbs.logmsg(pbs.LOG_ERROR,
           f"HOOK:e1:{hex(id(e1))}"
           f" job.id:{e1.job.id}"
           f" job.project:{e1.job.project}"
           f" comment:{e1.job.comment}"
           f" hex(id(job)):{hex(id(e1.job))}")
e1.accept()
"""
        qj_attrs = {'event': 'queuejob', 'enabled': 'True'}
        mj_attrs = {'event': 'modifyjob', 'enabled': 'True'}
        self.server.create_import_hook('qj', qj_attrs, info_hook)
        self.server.create_import_hook('mj', mj_attrs, info_hook)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        j1 = Job(TEST_USER, {'Resource_List.walltime': 42})
        j1.set_sleep_time(1)
        jid1 = self.server.submit(j1)
        self.server.alterjob(jid1, {ATTR_p: 150})
        # The runjob hook is created after submit so the job id can be
        # interpolated into the hook script.
        rj_hook = """
import pbs
e1 = pbs.event()
e1.job.Output_Path = '/tmp/job-%s-output'
e1.job.Error_Path = '/tmp/job-%s-error'
e2 = pbs.event()
e2.accept()
""" % (jid1, jid1)
        rj_attrs = {'event': 'runjob', 'enabled': 'True'}
        self.server.create_import_hook('rj', rj_attrs, rj_hook)
        self.server.runjob(jid1)
        (_, line) = self.server.accounting_match(';a;' + jid1 + ';')
        self.assertIn('Priority=150', line)
        (_, line) = self.server.accounting_match(';a;' + jid1 + ';Output_Path')
        self.assertIn('Output_Path=/tmp/job-%s-output' % jid1, line)
        (_, line) = self.server.accounting_match(';a;' + jid1 + ';Error_Path')
        self.assertIn('Error_Path=/tmp/job-%s-error' % jid1, line)

    def test_multiple_alter_record_hooks(self):
        """
        Test that when hooks set attributes, an 'a' record is logged.
        """
        mj_hook_00 = """
import pbs
e1 = pbs.event()
e1.job.comment = 'foo'
e1.job.project = "aaa"
e2 = pbs.event()
e2.accept()
"""
        mj_hook_01 = """
import pbs
e1 = pbs.event()
e1.job.comment = 'foo2'
e1.job.project = "bbb"
e2 = pbs.event()
e2.accept()
"""
        mj_attrs_00 = {'event': 'modifyjob', 'order': '1', 'enabled': 'True'}
        mj_attrs_01 = {'event': 'modifyjob', 'order': '2', 'enabled': 'True'}
        rj_hook = """
import pbs
e1 = pbs.event()
e1.job.project = 'abc'
e2 = pbs.event()
e2.reject('bar')
"""
        rj_attrs = {'event': 'runjob', 'enabled': 'True'}
        self.server.create_import_hook('mj01', mj_attrs_01, mj_hook_01)
        # create out of order.
        self.server.create_import_hook('mj00', mj_attrs_00, mj_hook_00)
        self.server.create_import_hook('rj', rj_attrs, rj_hook)
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        j1 = Job(TEST_USER, {'Resource_List.walltime': 42})
        jid1 = self.server.submit(j1)
        self.server.alterjob(jid1, {ATTR_p: 150})
        (_, line) = self.server.accounting_match(';a;' + jid1 + ';')
        self.assertIn('Priority=150', line)
        # The order-2 hook runs last, so its values win.
        self.assertIn('comment=foo2', line)
        self.assertIn('project=bbb', line)
        try:
            self.server.runjob(jid1)
        except PbsRunError:
            # runjob hook is rejecting the run request
            pass
        self.server.accounting_match(f';a;{jid1};project=abc')

    def test_queue_record_multiple_hook_00(self):
        """
        Test that changes made in a queuejob hooks are reflected in
        the Q record
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'scheduling': 'False',
                             'job_history_enable': 'True',
                             })
        qj_hook_00 = """
import pbs
e1 = pbs.event()
e1.job.project = 'foo00'
pbs.logmsg(pbs.LOG_ERROR,
           f"HOOK:e1:{hex(id(e1))}"
           f" job.id:{e1.job.id}"
           f" job.project:{e1.job.project}"
           f" Resource_List:{e1.job.Resource_List}"
           f" hex(id(job)):{hex(id(e1.job))}")
e1.accept()
"""
        qj_hook_01 = """
import pbs
e1 = pbs.event()
e1.job.project = str(e1.job.project) + '_foo01'
e1.accept()
"""
        qj_attrs = {'event': 'queuejob', 'enabled': 'True'}
        self.server.create_import_hook('qj00', qj_attrs, qj_hook_00)
        # FIXME set hook attr to priority order
        self.server.create_import_hook('qj01', qj_attrs, qj_hook_01)
        j = Job(TEST_USER, {'Resource_List.walltime': 42})
        j.set_sleep_time(1)
        jid1 = self.server.submit(j)
        self.server.alterjob(jid1, {ATTR_p: 150})
        (_, line) = self.server.accounting_match(';Q;' + jid1)
        self.assertIn('project=foo00_foo01', line)
        (_, line) = self.server.accounting_match(';a;' + jid1)
        self.assertIn('Priority=150', line)
        self.server.runjob(jid1)
        self.server.expect(JOB, {'job_state': 'F'}, extend='x', id=jid1)
        (_, line) = self.server.accounting_match(';E;' + jid1)
        self.assertIn('project=foo00_foo01', line)

    def test_queue_record_multiple_hook_01(self):
        """
        Test that changes made in a modifyjob hook are reflected in
        the E record
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'scheduling': 'False',
                             'job_history_enable': 'True',
                             })
        mj_hook_00 = """
import pbs
e1 = pbs.event()
e1.job.project = 'foo02'
pbs.logmsg(pbs.LOG_ERROR,
           f"HOOKQ0:e1:{hex(id(e1))}"
           f" job.id:{e1.job.id}"
           f" job.project:{e1.job.project}"
           f" hex(id(job)):{hex(id(e1.job))}")
e1.accept()
"""
        mj_hook_01 = """
import pbs
e1 = pbs.event()
e1.job.project = str(e1.job.project) + '_foo03'
pbs.logmsg(pbs.LOG_ERROR,
           f"HOOKQ0:e1:{hex(id(e1))}"
           f" job.id:{e1.job.id}"
           f" job.project:{e1.job.project}"
           f" hex(id(job)):{hex(id(e1.job))}")
e1.accept()
"""
        mj_attrs_00 = {'event': 'modifyjob', 'order': 1, 'enabled': 'True'}
        mj_attrs_01 = {'event': 'modifyjob', 'order': 2, 'enabled': 'True'}
        self.server.create_import_hook('mj_00', mj_attrs_00, mj_hook_00)
        self.server.create_import_hook('mj_01', mj_attrs_01, mj_hook_01)
        j = Job(TEST_USER, {'Resource_List.walltime': 42})
        j.set_sleep_time(1)
        jid1 = self.server.submit(j)
        self.server.alterjob(jid1, {ATTR_p: 150})
        (_, line) = self.server.accounting_match(';a;' + jid1)
        self.assertIn('Priority=150', line)
        self.assertIn('project=foo02_foo03', line)
        self.server.runjob(jid1)
        self.server.expect(JOB, {'job_state': 'F'}, extend='x', id=jid1)
        (_, line) = self.server.accounting_match(';E;' + jid1)
        self.assertIn('project=foo02_foo03', line)

    def test_queue_record_multiple_hook_02(self):
        """
        Test that changes made in a queuejob then modifyjob are stacking
        using job_o in the modifyjob hook.
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'scheduling': 'False',
                             'job_history_enable': 'True',
                             })
        qj_hook_00 = """
import pbs
e1 = pbs.event()
e1.job.project = 'foo00'
e1 = pbs.event()
pbs.logmsg(pbs.LOG_ERROR,
           f"HOOKQ0:e1:{hex(id(e1))}"
           f" job.id:{e1.job.id}"
           f" job.project:{e1.job.project}"
           f" Resource_List:{e1.job.Resource_List}"
           f" hex(id(job)):{hex(id(e1.job))}")
e1.accept()
"""
        qj_attrs = {'event': 'queuejob', 'enabled': 'True'}
        self.server.create_import_hook('qj00', qj_attrs, qj_hook_00)
        mj_hook_00 = """
import pbs
e1 = pbs.event()
pbs.logmsg(pbs.LOG_ERROR,
           f"HOOKM0a:e1:{hex(id(e1))}"
           f" job.id:{e1.job.id}"
           f" job.project:{e1.job.project}"
           f" job_o.id:{e1.job_o.id}"
           f" job_o.project:{e1.job_o.project}"
           f" hex(id(job)):{hex(id(e1.job))}")
e1.job.project = str(e1.job_o.project) + '_foo01'
pbs.logmsg(pbs.LOG_ERROR,
           f"HOOKM0b:e1:{hex(id(e1))}"
           f" job.id:{e1.job.id}"
           f" job.project:{e1.job.project}"
           f" job_o.id:{e1.job_o.id}"
           f" job_o.project:{e1.job_o.project}"
           f" hex(id(job)):{hex(id(e1.job))}")
e1.accept()
"""
        mj_attrs = {'event': 'modifyjob', 'enabled': 'True'}
        # FIXME: there is a problem here when you enable the modifyjob hook.
        # the modifyjob hook doesn't get the change from the queuejob.
        self.server.create_import_hook('mj_00', mj_attrs, mj_hook_00)
        j = Job(TEST_USER, {'Resource_List.walltime': 1})
        j.set_sleep_time(1)
        jid1 = self.server.submit(j)
        self.server.alterjob(jid1, {ATTR_p: 150})
        self.server.runjob(jid1)
        self.server.expect(JOB, {'job_state': 'F'}, extend='x', id=jid1)
        (_, line) = self.server.accounting_match(';Q;' + jid1)
        self.assertIn('project=foo00', line)
        (_, line) = self.server.accounting_match(';a;' + jid1)
        self.assertIn('Priority=150', line)
        self.assertIn('project=foo00_foo01', line)
        (_, line) = self.server.accounting_match(';E;' + jid1)
        self.assertIn('project=foo00_foo01', line)

    def test_queue_record_multiple_hook_03(self):
        """
        Test that changes made in a queuejob then modifyjob are stacking
        using job_o in the first modifyjob hook, but not in the second.
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'scheduling': 'False',
                             'job_history_enable': 'True',
                             })
        qj_hook_00 = """
import pbs
e1 = pbs.event()
e1.job.project = 'foo00'
e1.accept()
"""
        qj_hook_01 = """
import pbs
e1 = pbs.event()
e1.job.project = str(e1.job.project) + '_foo01'
pbs.logmsg(pbs.LOG_ERROR,
           f"HOOKQ1:e1:{hex(id(e1))}"
           f" job.id:{e1.job.id}"
           f" job.project:{e1.job.project}"
           f" Resource_List:{e1.job.Resource_List}"
           f" hex(id(job)):{hex(id(e1.job))}")
e1.accept()
"""
        qj_attrs = {'event': 'queuejob', 'enabled': 'True'}
        self.server.create_import_hook('qj00', qj_attrs, qj_hook_00)
        self.server.create_import_hook('qj01', qj_attrs, qj_hook_01)
        mj_hook_00 = """
import pbs
e1 = pbs.event()
pbs.logmsg(pbs.LOG_ERROR,
           f"HOOKM0a:e1:{hex(id(e1))}"
           f" job.id:{e1.job.id}"
           f" job.project:{e1.job.project}"
           f" job_o.id:{e1.job_o.id}"
           f" job_o.project:{e1.job_o.project}"
           f" hex(id(job)):{hex(id(e1.job))}")
e1.job.project = str(e1.job_o.project) + '_foo02'
pbs.logmsg(pbs.LOG_ERROR,
           f"HOOKM0b:e1:{hex(id(e1))}"
           f" jobid:{e1.job.id}"
           f" project:{e1.job.project}"
           f" job_o.id:{e1.job_o.id}"
           f" job_o.project:{e1.job_o.project}"
           f" hex(id(job)):{hex(id(e1.job))}")
e1.accept()
"""
        mj_hook_01 = """
import pbs
e1 = pbs.event()
e1.job.project = str(e1.job.project) + '_foo03'
pbs.logmsg(pbs.LOG_ERROR,
           f"HOOKM1:e1:{hex(id(e1))}"
           f" job.id:{e1.job.id}"
           f" job.project:{e1.job.project}"
           f" job_o.id:{e1.job_o.id}"
           f" job_o.project:{e1.job_o.project}"
           f" hex(id(job)):{hex(id(e1.job))}")
e1.accept()
"""
        mj_attrs_00 = {'event': 'modifyjob', 'order': 1, 'enabled': 'True'}
        mj_attrs_01 = {'event': 'modifyjob', 'order': 2, 'enabled': 'True'}
        self.server.create_import_hook('mj_00', mj_attrs_00, mj_hook_00)
        self.server.create_import_hook('mj_01', mj_attrs_01, mj_hook_01)
        j = Job(TEST_USER, {'Resource_List.walltime': 1})
        j.set_sleep_time(1)
        jid1 = self.server.submit(j)
        self.server.alterjob(jid1, {ATTR_p: 150})
        (_, line) = self.server.accounting_match(';Q;' + jid1)
        self.assertIn('project=foo00_foo01', line)
        self.server.runjob(jid1)
        self.server.expect(JOB, {'job_state': 'F'}, extend='x', id=jid1)
        (_, line) = self.server.accounting_match(';a;' + jid1)
        self.assertIn('Priority=150', line)
        self.assertIn('project=foo00_foo01_foo02_foo03', line)
        (_, line) = self.server.accounting_match(';E;' + jid1)
        self.assertIn('project=foo00_foo01_foo02_foo03', line)


================================================
FILE: test/tests/functional/pbs_accumulate_resc_used.py
================================================
# coding: utf-8
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
# # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. from tests.functional import * import ast @requirements(num_moms=3) class TestPbsAccumulateRescUsed(TestFunctional): """ This tests the feature in PBS that enables mom hooks to accumulate resources_used values for resources beside cput, cpupercent, and mem. This includes accumulation of custom resources. The mom hooks supported this feature are: exechost_periodic, execjob_prologue, and execjob_epilogue. PRE: Have a cluster of PBS with 3 mom hosts, with an exechost_startup that adds custom resources. POST: When a job ends, accounting_logs reflect the aggregated resources_used values. And with job_history_enable=true, one can do a 'qstat -x -f ' to obtain information of a previous job. 
""" # Class variables def setUp(self): TestFunctional.setUp(self) self.logger.info("len moms = %d" % (len(self.moms))) if len(self.moms) != 3: usage_string = 'test requires 3 MoMs as input, ' + \ 'use -p moms=::' self.skip_test(usage_string) # PBSTestSuite returns the moms passed in as parameters as dictionary # of hostname and MoM object self.momA = self.moms.values()[0] self.momB = self.moms.values()[1] self.momC = self.moms.values()[2] self.momA.delete_vnode_defs() self.momB.delete_vnode_defs() self.momC.delete_vnode_defs() self.hostA = self.momA.shortname self.hostB = self.momB.shortname self.hostC = self.momC.shortname rc = self.server.manager(MGR_CMD_DELETE, NODE, None, "") self.assertEqual(rc, 0) rc = self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostA) self.assertEqual(rc, 0) rc = self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostB) self.assertEqual(rc, 0) rc = self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostC) self.assertEqual(rc, 0) # Give the moms a chance to contact the server. self.server.expect(NODE, {'state': 'free'}, id=self.hostA) self.server.expect(NODE, {'state': 'free'}, id=self.hostB) self.server.expect(NODE, {'state': 'free'}, id=self.hostC) # First set some custom resources via exechost_startup hook. 
startup_hook_body = """ import pbs e=pbs.event() localnode=pbs.get_local_nodename() e.vnode_list[localnode].resources_available['foo_i'] = 7 e.vnode_list[localnode].resources_available['foo_f'] = 5.0 e.vnode_list[localnode].resources_available['foo_str'] = "seventyseven" """ hook_name = "start" a = {'event': "exechost_startup", 'enabled': 'True'} rv = self.server.create_import_hook( hook_name, a, startup_hook_body, overwrite=True) self.assertTrue(rv) self.momA.signal("-HUP") self.momB.signal("-HUP") self.momC.signal("-HUP") a = {'job_history_enable': 'True'} self.server.manager(MGR_CMD_SET, SERVER, a) # Next set some custom resources via qmgr -c 'create resource' attr = {} attr['type'] = 'string' attr['flag'] = 'h' r = 'foo_str2' rc = self.server.manager( MGR_CMD_CREATE, RSC, attr, id=r, runas=ROOT_USER, logerr=False) self.assertEqual(rc, 0) # Ensure the new resource is seen by all moms. momlist = [self.momA, self.momB, self.momC] for m in momlist: m.log_match("resourcedef;copy hook-related file") attr['type'] = 'string' attr['flag'] = 'h' r = 'foo_str3' rc = self.server.manager( MGR_CMD_CREATE, RSC, attr, id=r, runas=ROOT_USER, logerr=False) self.assertEqual(rc, 0) # Ensure the new resource is seen by all moms. for m in momlist: m.log_match("resourcedef;copy hook-related file") attr['type'] = 'string' attr['flag'] = 'h' r = 'foo_str4' rc = self.server.manager( MGR_CMD_CREATE, RSC, attr, id=r, runas=ROOT_USER, logerr=False) self.assertEqual(rc, 0) # Ensure the new resource is seen by all moms. for m in momlist: m.log_match("resourcedef;copy hook-related file") attr['type'] = 'string_array' attr['flag'] = 'h' r = 'stra' rc = self.server.manager( MGR_CMD_CREATE, RSC, attr, id=r, runas=ROOT_USER, logerr=False) self.assertEqual(rc, 0) # Give the moms a chance to receive the updated resource. # Ensure the new resource is seen by all moms. 
for m in momlist: m.log_match("resourcedef;copy hook-related file") def test_epilogue(self): """ Test accumulatinon of resources of a multinode job from an exechost_epilogue hook. """ self.logger.info("test_epilogue") hook_body = """ import pbs e=pbs.event() pbs.logmsg(pbs.LOG_DEBUG, "executed epilogue hook") if e.job.in_ms_mom(): e.job.resources_used["vmem"] = pbs.size("9gb") e.job.resources_used["foo_i"] = 9 e.job.resources_used["foo_f"] = 0.09 e.job.resources_used["foo_str"] = '{"seven":7}' e.job.resources_used["cput"] = 10 e.job.resources_used["stra"] = '"glad,elated","happy"' e.job.resources_used["foo_str3"] = \ \"\"\"{"a":6,"b":"some value #$%^&*@","c":54.4,"d":"32.5gb"}\"\"\" e.job.resources_used["foo_str2"] = "seven" e.job.resources_used["foo_str4"] = "eight" else: e.job.resources_used["vmem"] = pbs.size("10gb") e.job.resources_used["foo_i"] = 10 e.job.resources_used["foo_f"] = 0.10 e.job.resources_used["foo_str"] = '{"eight":8,"nine":9}' e.job.resources_used["foo_str2"] = '{"seven":7}' e.job.resources_used["cput"] = 20 e.job.resources_used["stra"] = '"cucumbers,bananas"' e.job.resources_used["foo_str3"] = \"\"\""vn1":4,"vn2":5,"vn3":6\"\"\" """ hook_name = "epi" a = {'event': "execjob_epilogue", 'enabled': 'True', 'order': 999} rv = self.server.create_import_hook( hook_name, a, hook_body, overwrite=True) self.assertTrue(rv) a = {'Resource_List.select': '3:ncpus=1', 'Resource_List.walltime': 10, 'Resource_List.place': "scatter"} j = Job(TEST_USER) j.set_attributes(a) j.set_sleep_time("10") jid = self.server.submit(j) # The results should show results for custom resources 'foo_i', # 'foo_f', 'foo_str', 'foo_str3', and bultin resources 'vmem', # 'cput', and should be accumulating based # on the hook script, where MS defines 1 value, while the 2 sister # Moms define the same value. For 'string' type, it will be a # union of all values obtained from sister moms and local mom, and # the result will be in JSON-format. # # foo_str is for testing normal values. 
# foo_str2 is for testing non-JSON format value received from MS. # foo_str3 is for testing non-JSON format value received from a sister # mom. # foo_str4 is for testing MS-only set values. # # For string_array type resource 'stra', it is not accumulated but # will be set to last seen value from a mom epilogue hook. self.server.expect(JOB, { 'job_state': 'F', 'resources_used.foo_f': '0.29', 'resources_used.foo_i': '29', 'resources_used.foo_str4': "eight", 'resources_used.stra': "\"glad,elated\",\"happy\"", 'resources_used.vmem': '29gb', 'resources_used.cput': '00:00:50', 'resources_used.ncpus': '3'}, extend='x', offset=10, attrop=PTL_AND, id=jid) foo_str_dict_in = {"eight": 8, "seven": 7, "nine": 9} qstat = self.server.status( JOB, 'resources_used.foo_str', id=jid, extend='x') foo_str_dict_out_str = eval(qstat[0]['resources_used.foo_str']) foo_str_dict_out = eval(foo_str_dict_out_str) self.assertTrue(foo_str_dict_in == foo_str_dict_out) # resources_used.foo_str3 must not be set since a sister value is not # of JSON-format. self.server.expect(JOB, 'resources_used.foo_str3', op=UNSET, extend='x', id=jid) self.momA.log_match( "Job %s resources_used.foo_str3 cannot be " % (jid,) + "accumulated: value '\"vn1\":4,\"vn2\":5,\"vn3\":6' " + "from mom %s not JSON-format" % (self.hostB,)) # resources_used.foo_str2 must not be set. 
    def test_prologue(self):
        """
        Test accumulation of resources of a multinode job from an
        exechost_prologue hook.
        On cpuset systems don't check for cput because the pbs_cgroups
        hook will be enabled and will overwrite the cput value set in the
        prologue hook.
        """
        # Detect whether any mom manages a cpuset; if so, skip cput checks.
        has_cpuset = False
        for mom in self.moms.values():
            if mom.is_cpuset_mom():
                has_cpuset = True
        self.logger.info("test_prologue")
        # Hook: mother superior (MS) reports one set of values, every
        # sister mom reports another; the server accumulates them.
        hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "executed prologue hook")
if e.job.in_ms_mom():
    e.job.resources_used["vmem"] = pbs.size("11gb")
    e.job.resources_used["foo_i"] = 11
    e.job.resources_used["foo_f"] = 0.11
    e.job.resources_used["foo_str"] = '{"seven":7}'
    e.job.resources_used["cput"] = 11
    e.job.resources_used["stra"] = '"glad,elated","happy"'
    e.job.resources_used["foo_str3"] = \
\"\"\"{"a":6,"b":"some value #$%^&*@","c":54.4,"d":"32.5gb"}\"\"\"
    e.job.resources_used["foo_str2"] = "seven"
    e.job.resources_used["foo_str4"] = "eight"
else:
    e.job.resources_used["vmem"] = pbs.size("12gb")
    e.job.resources_used["foo_i"] = 12
    e.job.resources_used["foo_f"] = 0.12
    e.job.resources_used["foo_str"] = '{"eight":8,"nine":9}'
    e.job.resources_used["foo_str2"] = '{"seven":7}'
    e.job.resources_used["cput"] = 12
    e.job.resources_used["stra"] = '"cucumbers,bananas"'
    e.job.resources_used["foo_str3"] = \"\"\""vn1":4,"vn2":5,"vn3":6\"\"\"
"""
        hook_name = "prolo"
        a = {'event': "execjob_prologue", 'enabled': 'True'}
        rv = self.server.create_import_hook(
            hook_name, a, hook_body, overwrite=True)
        self.assertTrue(rv)

        a = {'Resource_List.select': '3:ncpus=1',
             'Resource_List.walltime': 10,
             'Resource_List.place': 'scatter'}
        j = Job(TEST_USER)
        j.set_attributes(a)

        # The pbsdsh call is what allows a first task to get spawned on
        # on a sister mom, causing the execjob_prologue hook to execute.
        j.create_script(
            "pbsdsh -n 1 hostname\n" +
            "pbsdsh -n 2 hostname\n" +
            "sleep 10\n")
        jid = self.server.submit(j)

        # The results should show results for custom resources 'foo_i',
        # 'foo_f', 'foo_str', 'foo_str3', and builtin resources 'vmem',
        # 'cput', and should be accumulating based
        # on the hook script, where MS defines 1 value, while the 2 sister
        # Moms define the same value. For 'string' type, it will be a
        # union of all values obtained from sister moms and local mom, and
        # the result will be in JSON-format.
        #
        # foo_str is for testing normal values.
        # foo_str2 is for testing non-JSON format value received from MS.
        # foo_str3 is for testing non-JSON format value received from a sister
        # mom.
        # foo_str4 is for testing MS-only set values.
        #
        # For string_array type resource 'stra', it is not accumulated but
        # will be set to last seen value from a mom prologue hook.
        a = {
            'job_state': 'F',
            'resources_used.foo_f': '0.35',
            'resources_used.foo_i': '35',
            'resources_used.foo_str4': "eight",
            'resources_used.stra': "\"glad,elated\",\"happy\"",
            'resources_used.vmem': '35gb',
            'resources_used.ncpus': '3'}
        if not has_cpuset:
            a['resources_used.cput'] = '00:00:35'
        self.server.expect(JOB, a, extend='x', offset=10, attrop=PTL_AND,
                           id=jid)

        # The accumulated string value comes back as a quoted string;
        # unwrap it twice into a dict for comparison.
        foo_str_dict_in = {"eight": 8, "seven": 7, "nine": 9}
        qstat = self.server.status(
            JOB, 'resources_used.foo_str', id=jid, extend='x')
        foo_str_dict_out_str = eval(qstat[0]['resources_used.foo_str'])
        foo_str_dict_out = eval(foo_str_dict_out_str)
        self.assertTrue(foo_str_dict_in == foo_str_dict_out)

        # resources_used.foo_str3 must not be set since a sister value is
        # not of JSON-format.
        self.server.expect(JOB, 'resources_used.foo_str3', op=UNSET,
                           extend='x', id=jid)
        self.momA.log_match(
            "Job %s resources_used.foo_str3 cannot be " % (jid,) +
            "accumulated: value '\"vn1\":4,\"vn2\":5,\"vn3\":6' " +
            "from mom %s not JSON-format" % (self.hostB,))
        self.momA.log_match(
            "Job %s resources_used.foo_str3 cannot be " % (jid,) +
            "accumulated: value '\"vn1\":4,\"vn2\":5,\"vn3\":6' " +
            "from mom %s not JSON-format" % (self.hostC,))

        # Ensure resources_used.foo_str3 is not set since it has a
        # non-JSON format value.
        self.server.expect(JOB, 'resources_used.foo_str3', op=UNSET,
                           extend='x', id=jid)

        # resources_used.foo_str2 must not be set.
        self.server.expect(JOB, 'resources_used.foo_str2', op=UNSET, id=jid)
        self.momA.log_match(
            "Job %s resources_used.foo_str2 cannot be " % (jid,) +
            "accumulated: value 'seven' from " +
            "mom %s not JSON-format" % (self.hostA,))

        # Match accounting_logs entry
        acctlog_match = 'resources_used.foo_f=0.35'
        self.server.accounting_match(
            "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100)
        acctlog_match = 'resources_used.foo_i=35'
        self.server.accounting_match(
            "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100)
        acctlog_match = "resources_used.foo_str='%s'" % (foo_str_dict_out_str,)
        self.server.accounting_match(
            "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100)
        acctlog_match = 'resources_used.vmem=35gb'
        self.server.accounting_match(
            "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100)
        if not has_cpuset:
            acctlog_match = 'resources_used.cput=00:00:35'
            self.server.accounting_match(
                "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100)
        # resources_used.foo_str2 should not be reported in accounting_logs.
        acctlog_match = 'resources_used.foo_str2='
        self.server.accounting_match("E;%s;.*%s.*" % (jid, acctlog_match),
                                     regexp=True, n=100, existence=False)
        acctlog_match = 'resources_used.ncpus=3'
        self.server.accounting_match(
            "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100)
        # resources_used.foo_str3 must not show up in accounting_logs
        acctlog_match = 'resources_used.foo_str3='
        self.server.accounting_match("E;%s;.*%s.*" % (jid, acctlog_match),
                                     regexp=True, n=100, existence=False)
        acctlog_match = 'resources_used.foo_str4=eight'
        self.server.accounting_match(
            "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100)
        acctlog_match = r'resources_used.stra=\"glad\,elated\"\,\"happy\"'
        self.server.accounting_match(
            "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100)
acctlog_match = 'resources_used.foo_str2=' self.server.accounting_match("E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100, existence=False) acctlog_match = 'resources_used.ncpus=3' self.server.accounting_match( "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100) # resources_used.foo_str3 must not show up in accounting_logs acctlog_match = 'resources_used.foo_str3=' self.server.accounting_match("E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100, existence=False) acctlog_match = 'resources_used.foo_str4=eight' self.server.accounting_match( "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100) acctlog_match = r'resources_used.stra=\"glad\,elated\"\,\"happy\"' self.server.accounting_match( "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100) def test_periodic(self): """ Test accumulatinon of resources from an exechost_periodic hook. """ self.logger.info("test_periodic") hook_body = """ import pbs e=pbs.event() pbs.logmsg(pbs.LOG_DEBUG, "executed periodic hook") i = 0 l = [] for v in pbs.server().vnodes(): pbs.logmsg(pbs.LOG_DEBUG, "node %s" % (v.name,)) l.append(v.name) local_node=pbs.get_local_nodename() for jk in e.job_list.keys(): if local_node == l[0]: e.job_list[jk].resources_used["vmem"] = pbs.size("11gb") e.job_list[jk].resources_used["foo_i"] = 11 e.job_list[jk].resources_used["foo_f"] = 0.11 e.job_list[jk].resources_used["foo_str"] = '{"seven":7}' e.job_list[jk].resources_used["cput"] = 11 e.job_list[jk].resources_used["stra"] = '"glad,elated","happy"' e.job_list[jk].resources_used["foo_str3"] = \ \"\"\"{"a":6,"b":"some value #$%^&*@","c":54.4,"d":"32.5gb"}\"\"\" e.job_list[jk].resources_used["foo_str2"] = "seven" elif local_node == l[1]: e.job_list[jk].resources_used["vmem"] = pbs.size("12gb") e.job_list[jk].resources_used["foo_i"] = 12 e.job_list[jk].resources_used["foo_f"] = 0.12 e.job_list[jk].resources_used["foo_str"] = '{"eight":8}' e.job_list[jk].resources_used["cput"] = 12 e.job_list[jk].resources_used["stra"] = 
'"cucumbers,bananas"' e.job_list[jk].resources_used["foo_str2"] = '{"seven":7}' e.job_list[jk].resources_used["foo_str3"] = \ \"\"\"{"vn1":4,"vn2":5,"vn3":6}\"\"\" else: e.job_list[jk].resources_used["vmem"] = pbs.size("13gb") e.job_list[jk].resources_used["foo_i"] = 13 e.job_list[jk].resources_used["foo_f"] = 0.13 e.job_list[jk].resources_used["foo_str"] = '{"nine":9}' e.job_list[jk].resources_used["foo_str2"] = '{"seven":7}' e.job_list[jk].resources_used["cput"] = 13 e.job_list[jk].resources_used["stra"] = '"cucumbers,bananas"' e.job_list[jk].resources_used["foo_str3"] = \ \"\"\"{"vn1":4,"vn2":5,"vn3":6}\"\"\" """ hook_name = "period" a = {'event': "exechost_periodic", 'enabled': 'True', 'freq': 15} rv = self.server.create_import_hook( hook_name, a, hook_body, overwrite=True) self.assertTrue(rv) a = {'resources_available.ncpus': '2'} self.server.manager(MGR_CMD_SET, NODE, a, self.hostA) self.server.manager(MGR_CMD_SET, NODE, a, self.hostB) self.server.manager(MGR_CMD_SET, NODE, a, self.hostC) a = {'Resource_List.select': '3:ncpus=1', 'Resource_List.place': 'scatter'} j = Job(TEST_USER) j.set_attributes(a) j.set_sleep_time("35") jid1 = self.server.submit(j) jid2 = self.server.submit(j) for jid in [jid1, jid2]: # The results should show results for custom resources 'foo_i', # 'foo_f', 'foo_str', 'foo_str3', and bultin resources 'vmem', # 'cput', and should be accumulating based # on the hook script, where MS defines 1 value, while the 2 sister # Moms define the same value. For 'string' type, it will be a # union of all values obtained from sister moms and local mom, and # the result will be in JSON-format. # foo_str is for testing normal values. # foo_str2 is for testing non-JSON format value received from MS. # foo_str3 is for testing non-JSON format value received from a # sister mom. 
# self.server.expect(JOB, { 'job_state': 'F', 'resources_used.foo_f': '0.36', 'resources_used.foo_i': '36', 'resources_used.stra': "\"glad,elated\",\"happy\"", 'resources_used.vmem': '36gb', 'resources_used.cput': '00:00:36', 'resources_used.ncpus': '3'}, extend='x', offset=35, attrop=PTL_AND, id=jid) foo_str_dict_in = {"eight": 8, "seven": 7, "nine": 9} qstat = self.server.status( JOB, 'resources_used.foo_str', id=jid, extend='x') foo_str_dict_out_str = eval(qstat[0]['resources_used.foo_str']) foo_str_dict_out = eval(foo_str_dict_out_str) self.assertTrue(foo_str_dict_in == foo_str_dict_out) foo_str3_dict_in = {"a": 6, "b": "some value #$%^&*@", "c": 54.4, "d": "32.5gb", "vn1": 4, "vn2": 5, "vn3": 6} qstat = self.server.status( JOB, 'resources_used.foo_str3', id=jid, extend='x') foo_str3_dict_out_str = eval(qstat[0]['resources_used.foo_str3']) foo_str3_dict_out = eval(foo_str3_dict_out_str) self.assertTrue(foo_str3_dict_in == foo_str3_dict_out) # resources_used.foo_str2 must be unset since its value is not of # JSON-format. 
self.server.expect(JOB, 'resources_used.foo_str2', op=UNSET, extend='x', id=jid) # Match accounting_logs entry acctlog_match = 'resources_used.foo_f=0.36' self.server.accounting_match( "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100) acctlog_match = 'resources_used.foo_i=36' self.server.accounting_match( "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100) acctlog_match = "resources_used.foo_str='%s'" % ( foo_str_dict_out_str,) self.server.accounting_match( "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100) acctlog_match = 'resources_used.vmem=36gb' self.server.accounting_match( "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100) acctlog_match = 'resources_used.cput=00:00:36' self.server.accounting_match( "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100) # resources_used.foo_str2 must not show in accounting_logs acctlog_match = 'resources_used.foo_str2=', self.server.accounting_match("E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100, existence=False) acctlog_match = 'resources_used.ncpus=3' self.server.accounting_match( "E;%s;.*%s.*" % (jid, acctlog_match), regexp=True, n=100) acctlog_match = "resources_used.foo_str3='%s'" % ( foo_str3_dict_out_str.replace('.', r'\.'). 
    def test_resource_bool(self):
        """
        To test that boolean values are not getting aggregated.
        """
        # Create a boolean type resource
        attr = {}
        attr['type'] = 'boolean'
        self.server.manager(
            MGR_CMD_CREATE, RSC, attr, id='foo_bool', runas=ROOT_USER,
            logerr=False)

        # MS reports True while sisters report False; the final value must
        # be the MS value, not an aggregate.
        hook_body = """
import pbs
e=pbs.event()
j=e.job
if j.in_ms_mom():
    j.resources_used["foo_bool"] = True
else:
    j.resources_used["foo_bool"] = False
"""
        hook_name = "epi_bool"
        a = {'event': "execjob_epilogue", 'enabled': "True"}
        self.server.create_import_hook(
            hook_name, a, hook_body, overwrite=True)

        a = {'Resource_List.select': '3:ncpus=1',
             'Resource_List.walltime': 10,
             'Resource_List.place': 'scatter'}
        j = Job(TEST_USER)
        j.set_attributes(a)
        j.set_sleep_time("5")
        jid = self.server.submit(j)

        # foo_bool is True
        a = {'resources_used.foo_bool': "True", 'job_state': 'F'}
        self.server.expect(JOB, a, extend='x', offset=5, attrop=PTL_AND,
                           id=jid)

    def test_resource_invisible(self):
        """
        Test that value aggregation is same for invisible resources.
        """
        # Set float and string_array to be invisible resources
        attr = {}
        attr['flag'] = 'ih'
        self.server.manager(
            MGR_CMD_SET, RSC, attr, id='foo_f', runas=ROOT_USER)
        self.server.manager(
            MGR_CMD_SET, RSC, attr, id='foo_str', runas=ROOT_USER)

        hook_body = """
import pbs
e=pbs.event()
j = e.job
if j.in_ms_mom():
    j.resources_used["foo_f"] = 2.114
    j.resources_used["foo_str"] = '{"one":1,"two":2}'
else:
    j.resources_used["foo_f"] = 3.246
    j.resources_used["foo_str"] = '{"two":2, "three":3}'
"""
        hook_name = "epi_invis"
        a = {'event': "execjob_epilogue", 'enabled': 'True'}
        self.server.create_import_hook(
            hook_name, a, hook_body, overwrite=True)

        a = {'Resource_List.select': '3:ncpus=1',
             'Resource_List.walltime': 10,
             'Resource_List.place': 'scatter'}
        j = Job(TEST_USER)
        j.set_attributes(a)
        j.set_sleep_time("5")
        jid = self.server.submit(j)

        # Verify that values are accumulated for float and string array:
        # 2.114 (MS) + 2 * 3.246 (sisters) = 8.606
        a = {'resources_used.foo_f': '8.606'}
        self.server.expect(JOB, a, extend='x', offset=5, id=jid)
        # The string value accumulates into the union of the dicts.
        foo_str_dict_in = {"one": 1, "two": 2, "three": 3}
        qstat = self.server.status(
            JOB, 'resources_used.foo_str', id=jid, extend='x')
        foo_str_dict_out_str = eval(qstat[0]['resources_used.foo_str'])
        foo_str_dict_out = eval(foo_str_dict_out_str)
        self.assertEqual(foo_str_dict_in, foo_str_dict_out)
    def test_reservation(self):
        """
        Test that a job inside a reservation works the same.
        NOTE: Due to the reservation duration and the job duration both
        being equal, this test found 2 race conditions. KEEP the durations
        equal to each other.
        """
        # Create non-host level resources from qmgr
        attr = {}
        attr['type'] = 'size'
        self.server.manager(
            MGR_CMD_CREATE, RSC, attr, id='foo_i2', runas=ROOT_USER)

        # Ensure the new resource is seen by all moms.
        momlist = [self.momA, self.momB, self.momC]
        for m in momlist:
            m.log_match("resourcedef;copy hook-related file")

        attr['type'] = 'float'
        self.server.manager(
            MGR_CMD_CREATE, RSC, attr, id='foo_f2', runas=ROOT_USER)

        # Ensure the new resource is seen by all moms.
        for m in momlist:
            m.log_match("resourcedef;copy hook-related file")

        attr['type'] = 'string_array'
        self.server.manager(
            MGR_CMD_CREATE, RSC, attr, id='stra2', runas=ROOT_USER)

        # Ensure the new resource is seen by all moms.
        for m in momlist:
            m.log_match("resourcedef;copy hook-related file")

        # Create an epilogue hook: every mom reports the same values,
        # which the server accumulates across the 3 vnodes.
        hook_body = """
import pbs
e = pbs.event()
j = e.job
pbs.logmsg(pbs.LOG_DEBUG, "executed epilogue hook")
j.resources_used["foo_i"] = 2
j.resources_used["foo_i2"] = pbs.size(1000)
j.resources_used["foo_f"] = 1.02
j.resources_used["foo_f2"] = 2.01
j.resources_used["stra"] = '"happy"'
j.resources_used["stra2"] = '"glad"'
"""
        # Create and import hook
        a = {'event': "execjob_epilogue", 'enabled': 'True'}
        self.server.create_import_hook(
            "epi", a, hook_body, overwrite=True)

        # Submit a reservation
        a = {'Resource_List.select': '3:ncpus=1',
             'Resource_List.place': 'scatter',
             'reserve_start': time.time() + 10,
             'reserve_end': time.time() + 30, }
        r = Reservation(TEST_USER, a)
        rid = self.server.submit(r)
        a = {'reserve_state': (MATCH_RE, "RESV_CONFIRMED|2")}
        self.server.expect(RESV, a, id=rid)
        rname = rid.split('.')

        # Submit a job inside reservation
        a = {'Resource_List.select': '3:ncpus=1', ATTR_queue: rname[0]}
        j = Job(TEST_USER)
        j.set_attributes(a)
        j.set_sleep_time(20)
        jid = self.server.submit(j)

        # Verify the resource values (each value accumulated 3x)
        a = {'resources_used.foo_i': '6',
             'resources_used.foo_i2': '3kb',
             'resources_used.foo_f': '3.06',
             'resources_used.foo_f2': '6.03',
             'resources_used.stra': "\"happy\"",
             'resources_used.stra2': "\"glad\"",
             'job_state': 'F'}
        self.server.expect(JOB, a, extend='x', attrop=PTL_AND, offset=30,
                           interval=1, id=jid)

        # Below is commented out due to a problem with history jobs
        # disappearing after a server restart when the reservation is
        # in state BD during restart.
        # Once that bug is fixed, this test code should be uncommented
        # and run.
        # Restart server and verifies that the values are still the same
        # self.server.restart()
        # self.server.expect(JOB, a, extend='x', id=jid)
# Restart server and verifies that the values are still the same # self.server.restart() # self.server.expect(JOB, a, extend='x', id=jid) def test_server_restart(self): """ Test that resource accumulation will not get impacted if server is restarted during job execution On cpuset systems don't check for cput because the pbs_cgroups hook will be enabled and will overwrite the cput value set in the prologue hook """ has_cpuset = False for mom in self.moms.values(): if mom.is_cpuset_mom(): has_cpuset = True # Create a prologue hook hook_body = """ import pbs e=pbs.event() pbs.logmsg(pbs.LOG_DEBUG, "executed prologue hook") if e.job.in_ms_mom(): e.job.resources_used["vmem"] = pbs.size("11gb") e.job.resources_used["foo_i"] = 11 e.job.resources_used["foo_f"] = 0.11 e.job.resources_used["foo_str"] = '{"seven":7}' e.job.resources_used["cput"] = 11 e.job.resources_used["stra"] = '"glad,elated","happy"' e.job.resources_used["foo_str4"] = "eight" else: e.job.resources_used["vmem"] = pbs.size("12gb") e.job.resources_used["foo_i"] = 12 e.job.resources_used["foo_f"] = 0.12 e.job.resources_used["foo_str"] = '{"eight":8,"nine":9}' e.job.resources_used["cput"] = 12 e.job.resources_used["stra"] = '"cucumbers,bananas"' """ hook_name = "prolo" a = {'event': "execjob_prologue", 'enabled': 'True'} self.server.create_import_hook( hook_name, a, hook_body, overwrite=True) a = {'Resource_List.select': '3:ncpus=1', 'Resource_List.walltime': 20, 'Resource_List.place': 'scatter'} j = Job(TEST_USER) j.set_attributes(a) # The pbsdsh call is what allows a first task to get spawned on # on a sister mom, causing the execjob_prologue hook to execute. j.create_script( "pbsdsh -n 1 hostname\n" + "pbsdsh -n 2 hostname\n" + "sleep 10\n") jid = self.server.submit(j) # Once the job is started running restart server self.server.expect(JOB, {'job_state': "R", "substate": 42}, id=jid) self.server.restart() # Job will be requeued and rerun. 
    def test_mom_down(self):
        """
        Test that resource accumulation is not impacted due to mom restart.
        """
        # Set node_fail_requeue to requeue job
        self.server.manager(MGR_CMD_SET, SERVER, {'node_fail_requeue': 10})

        hook_body = """
import pbs
e = pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "executed periodic hook")
for jj in e.job_list.keys():
    e.job_list[jj].resources_used["foo_i"] = 1
    e.job_list[jj].resources_used["foo_str"] = '{"happy":"true"}'
    e.job_list[jj].resources_used["stra"] = '"one","two"'
"""
        a = {'event': "exechost_periodic", 'enabled': 'True', 'freq': 10}
        self.server.create_import_hook(
            "period", a, hook_body, overwrite=True)

        a = {'Resource_List.select': '3:ncpus=1',
             'Resource_List.walltime': 300,
             'Resource_List.place': 'scatter'}
        j = Job(TEST_USER)
        j.set_attributes(a)
        jid1 = self.server.submit(j)

        # Submit a job that can never run
        a = {'Resource_List.select': '5:ncpus=1',
             'Resource_List.place': 'scatter'}
        j.set_attributes(a)
        j.set_sleep_time("300")
        jid2 = self.server.submit(j)

        # Wait for 10s approx for hook to get executed
        # verify the resources_used.foo_i
        self.server.expect(JOB, {'resources_used.foo_i': '3'}, offset=10,
                           id=jid1, interval=1)
        # The queued job must not accumulate anything.
        self.server.expect(JOB, "resources_used.foo_i", op=UNSET, id=jid2)

        # Bring sister mom down
        self.momB.stop()

        # Wait for 20 more seconds for periodic hook to run
        # more than once and verify that value is still 3
        self.server.expect(JOB, {'resources_used.foo_i': '3'}, offset=20,
                           id=jid1, interval=1)

        # Wait for job to be requeued by node_fail_requeue
        self.server.rerunjob(jid1, runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid1)

        # Verify that resources_used.foo_i is unset
        self.server.expect(JOB, "resources_used.foo_i", op=UNSET, id=jid1)

        # Bring sister mom up
        self.momB.start()
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1, interval=1)

        # Verify that value of foo_i for job1 is set back
        self.server.expect(JOB, {'resources_used.foo_i': '3'}, offset=10,
                           id=jid1, interval=1)
    def test_job_rerun(self):
        """
        Test that resource accumulates once when job is rerun.
        """
        hook_body = """
import pbs
e = pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "executed periodic hook")
for jj in e.job_list.keys():
    e.job_list[jj].resources_used["foo_f"] = 1.01
    e.job_list[jj].resources_used["cput"] = 10
"""
        a = {'event': "exechost_periodic", 'enabled': 'True', 'freq': 10}
        self.server.create_import_hook(
            "period", a, hook_body, overwrite=True)

        a = {'Resource_List.select': '3:ncpus=1',
             'Resource_List.place': 'scatter'}
        j = Job(TEST_USER)
        j.set_attributes(a)
        jid1 = self.server.submit(j)
        self.server.expect(JOB, {'job_state': "R", "substate": 42}, id=jid1)

        # Wait for 10s approx for hook to get executed
        # Verify the resources_used.foo_f (3 moms x 1.01 / 10)
        a = {'resources_used.foo_f': '3.03',
             'resources_used.cput': 30}
        self.server.expect(JOB, a, offset=10, id=jid1, attrop=PTL_AND,
                           interval=1)

        # Rerun the job
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        self.server.rerunjob(jobid=jid1, runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid1)

        # Verify that foo_f is unset
        self.server.expect(JOB, 'Resource_List.foo_f', op=UNSET, id=jid1)

        # turn the scheduling on
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})
        self.server.expect(JOB, {'job_state': "R", "substate": 42},
                           attrop=PTL_AND, id=jid1)

        # Validate that resources_used.foo_f is reset
        self.server.expect(JOB, a, offset=10, id=jid1, attrop=PTL_AND,
                           interval=1)
self.server.expect(JOB, {'job_state': "R", "substate": 42}, attrop=PTL_AND, id=jid1) # Validate that resources_used.foo_f is reset self.server.expect(JOB, a, offset=10, id=jid1, attrop=PTL_AND, interval=1) def test_job_array(self): """ Test that resource accumulation for subjobs also work """ hook_body = """ import pbs e=pbs.event() pbs.logmsg(pbs.LOG_DEBUG, "executed epilogue hook") if e.job.in_ms_mom(): e.job.resources_used["vmem"] = pbs.size("9gb") e.job.resources_used["foo_i"] = 9 e.job.resources_used["foo_f"] = 0.09 e.job.resources_used["foo_str"] = '{"seven":7}' e.job.resources_used["cput"] = 10 e.job.resources_used["stra"] = '"glad,elated","happy"' else: e.job.resources_used["vmem"] = pbs.size("10gb") e.job.resources_used["foo_i"] = 10 e.job.resources_used["foo_f"] = 0.10 e.job.resources_used["foo_str"] = '{"eight":8,"nine":9}' e.job.resources_used["cput"] = 20 e.job.resources_used["stra"] = '"cucumbers,bananas"' """ a = {'event': "execjob_epilogue", 'enabled': 'True'} self.server.create_import_hook( "test", a, hook_body, overwrite=True) a = {'Resource_List.select': '3:ncpus=1', 'Resource_List.walltime': 10, 'Resource_List.place': 'scatter'} j = Job(TEST_USER, attrs={ATTR_J: '1-2'}) j.set_attributes(a) j.set_sleep_time("5") jid = self.server.submit(j) # Verify that once subjobs are over values are # set for each subjob in the accounting logs subjob1 = str.replace(jid, '[]', '[1]') acctlog_match = 'resources_used.foo_f=0.29' # Below code is commented due to a PTL issue # s = self.server.accounting_match( # "E;%s;.*%s.*" % (subjob1, acctlog_match), regexp=True, n=100) # self.assertTrue(s) acctlog_match = 'resources_used.foo_i=29' # s = self.server.accounting_match( # "E;%s;.*%s.*" % (subjob1, acctlog_match), regexp=True, n=100) # self.assertTrue(s) foo_str_dict_in = {"eight": 8, "seven": 7, "nine": 9} acctlog_match = "resources_used.foo_str='%s'" % (foo_str_dict_in,) # s = self.server.accounting_match( # "E;%s;.*%s.*" % (subjob1, acctlog_match), regexp=True, 
    def test_epi_pro(self):
        """
        Test that epilogue and prologue changing same and different
        resources. Values of the same resource would get overwritten by
        the last hook.
        On cpuset systems don't check for cput because the pbs_cgroups
        hook will be enabled and will overwrite the cput value set in the
        prologue hook.
        """
        # Detect whether any mom manages a cpuset; if so, skip cput checks.
        has_cpuset = False
        for mom in self.moms.values():
            if mom.is_cpuset_mom():
                has_cpuset = True

        hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "In prologue hook")
e.job.resources_used["foo_i"] = 10
e.job.resources_used["foo_f"] = 0.10
"""
        a = {'event': "execjob_prologue", 'enabled': 'True'}
        self.server.create_import_hook(
            "pro", a, hook_body, overwrite=True)

        # Verify the copy message in the logs to avoid
        # race conditions
        momlist = [self.momA, self.momB, self.momC]
        for m in momlist:
            m.log_match("pro.PY;copy hook-related file")

        hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "In epilogue hook")
e.job.resources_used["foo_f"] = 0.20
e.job.resources_used["cput"] = 10
"""
        a = {'event': "execjob_epilogue", 'enabled': 'True'}
        self.server.create_import_hook(
            "epi", a, hook_body, overwrite=True)

        # Verify the copy message in the logs to avoid
        # race conditions
        for m in momlist:
            m.log_match("epi.PY;copy hook-related file")

        a = {'Resource_List.select': '3:ncpus=1',
             'Resource_List.place': 'scatter'}
        j = Job(TEST_USER)
        j.set_attributes(a)
        j.create_script(
            "pbsdsh -n 1 hostname\n" +
            "pbsdsh -n 2 hostname\n" +
            "sleep 5\n")
        jid = self.server.submit(j)

        # Verify the resources_used once the job is over:
        # foo_f was reported by both hooks, the epilogue value wins.
        b = {
            'resources_used.foo_i': '30',
            'resources_used.foo_f': '0.6',
            'job_state': 'F'}
        if not has_cpuset:
            b['resources_used.cput'] = '30'
        self.server.expect(JOB, b, extend='x', id=jid, offset=5, interval=1)

        # Submit another job
        j1 = Job(TEST_USER)
        j1.set_attributes(a)
        j1.create_script(
            "pbsdsh -n 1 hostname\n" +
            "pbsdsh -n 2 hostname\n" +
            "sleep 300\n")
        jid1 = self.server.submit(j1)

        # Verify that prologue hook has set the values
        self.server.expect(JOB, {
            'job_state': 'R',
            'resources_used.foo_i': '30',
            'resources_used.foo_f': '0.3'},
            attrop=PTL_AND, id=jid1, interval=2)

        # Force delete the job
        self.server.deljob(id=jid1, wait=True, attr_W="force")

        # Verify values are accumulated by prologue hook only
        self.server.expect(JOB, {
            'resources_used.foo_i': '30',
            'resources_used.foo_f': '0.3'},
            attrop=PTL_AND, extend='x', id=jid1)
    def test_server_restart2(self):
        """
        Test that server restart during hook execution has no impact.
        """
        # The trailing sleep keeps the epilogue hook running while the
        # server is restarted below.
        hook_body = """
import pbs
import time
e = pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "executed epilogue hook")
if e.job.in_ms_mom():
    e.job.resources_used["vmem"] = pbs.size("9gb")
    e.job.resources_used["foo_i"] = 9
    e.job.resources_used["foo_f"] = 0.09
    e.job.resources_used["foo_str"] = '{"seven":7}'
    e.job.resources_used["cput"] = 10
else:
    e.job.resources_used["vmem"] = pbs.size("10gb")
    e.job.resources_used["foo_i"] = 10
    e.job.resources_used["foo_f"] = 0.10
    e.job.resources_used["foo_str"] = '{"eight":8,"nine":9}'
    e.job.resources_used["cput"] = 20
time.sleep(15)
"""
        a = {'event': "execjob_epilogue", 'enabled': 'True'}
        self.server.create_import_hook(
            "epi", a, hook_body, overwrite=True)

        # Submit a job
        a = {'Resource_List.select': '3:ncpus=1',
             'Resource_List.walltime': 10,
             'Resource_List.place': "scatter",
             'Keep_Files': 'oe'}
        j = Job(TEST_USER)
        j.set_attributes(a)
        j.set_sleep_time("5")
        jid = self.server.submit(j)

        # Verify the resource values
        a = {'resources_used.foo_i': 29,
             'resources_used.foo_f': 0.29}
        a_dict = {'eight': 8, 'seven': 7, 'nine': 9}
        self.server.expect(JOB, a, extend='x', attrop=PTL_AND, offset=5,
                           id=jid, interval=1)
        # check for dictionary resource
        job_status = self.server.status(JOB, id=jid, extend='x')
        job_str_resource = dict(job_status[0])['resources_used.foo_str']
        job_str_resource = ast.literal_eval(ast.literal_eval(job_str_resource))
        self.assertEqual(job_str_resource, a_dict)

        # Restart server while hook is still executing
        self.server.restart()

        # Verify that values again
        self.server.expect(JOB, a, extend='x', attrop=PTL_AND, id=jid)
        # check for dictionary resource
        job_status = self.server.status(JOB, id=jid, extend='x')
        job_str_resource = dict(job_status[0])['resources_used.foo_str']
        job_str_resource = ast.literal_eval(ast.literal_eval(job_str_resource))
        self.assertEqual(job_str_resource, a_dict)
    def test_mom_down2(self):
        """
        Test that when mom is down values are still accumulated for
        resources.
        """
        hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "executed epilogue hook")
if e.job.in_ms_mom():
    e.job.resources_used["vmem"] = pbs.size("9gb")
    e.job.resources_used["foo_i"] = 9
    e.job.resources_used["foo_f"] = 0.09
    e.job.resources_used["foo_str"] = '{"seven":7}'
    e.job.resources_used["cput"] = 10
    e.job.resources_used["stra"] = '"glad,elated","happy"'
else:
    e.job.resources_used["vmem"] = pbs.size("10gb")
    e.job.resources_used["foo_i"] = 10
    e.job.resources_used["foo_f"] = 0.10
    e.job.resources_used["foo_str"] = '{"eight":8,"nine":9}'
    e.job.resources_used["cput"] = 20
    e.job.resources_used["stra"] = '"cucumbers,bananas"'
"""
        a = {'event': "execjob_epilogue", 'enabled': 'True'}
        self.server.create_import_hook(
            "epi", a, hook_body, overwrite=True)

        # Submit a job
        a = {'Resource_List.select': '3:ncpus=1',
             'Resource_List.walltime': 40,
             'Resource_List.place': "scatter"}
        j = Job(TEST_USER)
        j.set_attributes(a)
        jid = self.server.submit(j)

        # Verify job is running
        self.server.expect(JOB, {'job_state': "R"}, id=jid)

        # Bring sister mom down
        self.momB.stop()

        # Wait for job to end
        # Validate that the values are being set
        # with 2 moms only (MS + one sister: 9 + 10, 0.09 + 0.10)
        self.server.expect(JOB, {'job_state': 'F',
                                 'resources_used.foo_i': '19',
                                 'resources_used.foo_f': '0.19'},
                           offset=10, id=jid, interval=1, extend='x',
                           attrop=PTL_AND)
        a_dict = {'eight': 8, 'nine': 9, 'seven': 7}
        # check for dictionary resource
        job_status = self.server.status(JOB, id=jid, extend='x')
        job_str_resource = dict(job_status[0])['resources_used.foo_str']
        job_str_resource = ast.literal_eval(ast.literal_eval(job_str_resource))
        self.assertEqual(job_str_resource, a_dict)

        # Bring the mom back up
        self.momB.start()
{'job_state': "R"}, id=jid) # Bring sister mom down self.momB.stop() # Wait for job to end # Validate that the values are being set # with 2 moms only self.server.expect(JOB, {'job_state': 'F', 'resources_used.foo_i': '19', 'resources_used.foo_f': '0.19'}, offset=10, id=jid, interval=1, extend='x', attrop=PTL_AND) a_dict = {'eight': 8, 'nine': 9, 'seven': 7} # check for dictionary resource job_status = self.server.status(JOB, id=jid, extend='x') job_str_resource = dict(job_status[0])['resources_used.foo_str'] job_str_resource = ast.literal_eval(ast.literal_eval(job_str_resource)) self.assertEqual(job_str_resource, a_dict) # Bring the mom back up self.momB.start() def test_finished_walltime(self): """ If used resources are modified from hook, this test makes sure that mem used resources are merged and once the job ends, the walltime is not zero. """ hook_body = """ import pbs e = pbs.event() if e.type == pbs.EXECHOST_PERIODIC: for jobid in e.job_list: e.job_list[jobid].resources_used["mem"] = pbs.size('1024kb') else: e.job.resources_used["mem"] = pbs.size('1024kb') """ hook_name = "multinode_used" attr = {'event': 'exechost_periodic,execjob_epilogue,execjob_end', 'freq': '3', 'enabled': 'True'} rv = self.server.create_import_hook(hook_name, attr, hook_body) self.assertTrue(rv) sleeptime = 30 a = {'Resource_List.select': '3:ncpus=1', 'Resource_List.walltime': sleeptime, 'Resource_List.place': "scatter"} j = Job(TEST_USER) j.set_attributes(a) j.set_sleep_time(f"{sleeptime}") jid = self.server.submit(j) self.server.expect(JOB, { 'job_state': 'R', 'resources_used.mem': '3072kb'}, attrop=PTL_AND, offset=sleeptime/2, id=jid) self.server.expect(JOB, { 'job_state': 'F', 'resources_used.mem': '3072kb', 'resources_used.walltime': sleeptime}, op=GE, extend='x', offset=sleeptime/2, attrop=PTL_AND, id=jid) ================================================ FILE: test/tests/functional/pbs_acl_groups.py ================================================ # coding: utf-8 # Copyright (C) 
1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# PBS Pro is commercially licensed software that shares a common core with
# the OpenPBS software. For a copy of the commercial license terms and
# conditions, go to: (http://www.pbspro.com/agreement.html) or contact the
# Altair Legal Department.
#
# Altair's dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of OpenPBS and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair's trademarks, including but not limited to "PBS™",
# "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is
# subject to Altair's trademark licensing policies.
from tests.functional import * class Test_acl_groups(TestFunctional): """ Test to check acl_groups and acl_resv_groups considers secondary group """ def test_acl_grp_queue(self): """ Set acl_groups on a queue and submit a job with a user for whom the set group is a secondary group """ a = {'queue_type': 'execution', 'started': 't', 'enabled': 't', 'acl_group_enable': 't', 'acl_groups': TSTGRP1} self.server.manager(MGR_CMD_CREATE, QUEUE, a, id='workq2') a = {'queue': 'workq2'} j = Job(TEST_USER1, attrs=a) # If 'Unauthorized Request' is found in error message the test would # fail as user was not able to submit job as a secondary group member try: jid = self.server.submit(j) except PbsSubmitError as e: self.assertFalse('Unauthorized Request' in e.msg[0]) def test_acl_resv_groups(self): """ Set acl_resv_groups on server and submit a reservation from a user for whom the set group is a secondary group """ self.server.manager(MGR_CMD_SET, SERVER, { 'acl_resv_group_enable': 'true'}) self.server.manager(MGR_CMD_SET, SERVER, {'acl_resv_groups': TSTGRP1}) # If 'Requestor's group not authorized' is found in error message the # test would fail as user was not able to submit reservation # as a secondary group member try: r = Reservation(TEST_USER1) rstart = int(time.time()) + 10 rend = int(time.time()) + 360 a = {'reserve_start': rstart, 'reserve_end': rend} r.set_attributes(a) rid = self.server.submit(r) except PbsSubmitError as e: self.assertFalse( 'Requestor\'s group not authorized' in e.msg[0]) ================================================ FILE: test/tests/functional/pbs_acl_host_moms.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. 
from tests.functional import *


class Test_acl_host_moms(TestFunctional):
    """
    This test suite is for testing the server attribute
    acl_host_moms_enable and this test requires two moms.
    """

    def setUp(self):
        """
        Determine the remote host and set acl_host_enable = True
        """
        TestFunctional.setUp(self)
        usage_string = 'test requires a MoM and a client as input, ' + \
            ' use -p moms=<mom>,client=<client>'
        # PBSTestSuite returns the moms passed in as parameters as a
        # dictionary of hostname and MoM object
        self.momA = self.moms.values()[0]
        self.momA.delete_vnode_defs()
        self.hostA = self.momA.shortname
        if not self.du.is_localhost(self.server.client):
            # acl_hosts expects FQDN
            self.hostB = socket.getfqdn(self.server.client)
        else:
            self.skip_test(usage_string)
        self.remote_host = None
        if not self.du.is_localhost(self.hostA):
            self.remote_host = self.hostA
        else:
            self.skip_test(usage_string)
        self.assertTrue(self.remote_host)
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'acl_hosts': self.hostB})
        self.server.manager(MGR_CMD_SET, SERVER, {'acl_host_enable': True})
        self.pbsnodes_cmd = os.path.join(
            self.server.pbs_conf['PBS_EXEC'], 'bin', 'pbsnodes') + ' -av'
        self.qstat_cmd = os.path.join(
            self.server.pbs_conf['PBS_EXEC'], 'bin', 'qstat')

    def test_acl_host_moms_enable(self):
        """
        Set acl_host_moms_enable = True and check whether or not the
        remote host is able run pbsnodes and qstat.
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'acl_host_moms_enable': True})
        ret = self.du.run_cmd(self.remote_host, cmd=self.pbsnodes_cmd)
        self.assertEqual(ret['rc'], 0)
        ret = self.du.run_cmd(self.remote_host, cmd=self.qstat_cmd)
        self.assertEqual(ret['rc'], 0)

    def test_acl_host_moms_disable(self):
        """
        Set acl_host_moms_enable = False and check whether or not the
        remote host is forbidden to run pbsnodes and qstat.
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'acl_host_moms_enable': False})
        ret = self.du.run_cmd(self.remote_host, cmd=self.pbsnodes_cmd)
        self.assertNotEqual(ret['rc'], 0)
        ret = self.du.run_cmd(self.remote_host, cmd=self.qstat_cmd)
        self.assertNotEqual(ret['rc'], 0)

    def test_acl_host_moms_hooks_and_jobs(self):
        """
        Use hooks to test whether remote host is able to run pbs.server()
        and check whether the job that is submitted goes to the 'R' state.
        """
        hook_name = "hook_acl_host_moms_t"
        hook_body = """
import pbs
e = pbs.event()
svr = pbs.server().server_state
e.accept()
"""
        # Best-effort cleanup of a pre-existing hook with the same name
        try:
            self.server.manager(MGR_CMD_DELETE, HOOK, None, hook_name)
        except Exception:
            pass
        a = {'event': 'execjob_begin', 'enabled': 'True'}
        self.server.create_import_hook(hook_name, a, hook_body,
                                       overwrite=True)
        # With acl_host_moms_enable off, the mom's pbs.server() call in
        # the begin hook is refused and the job ends up held
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'acl_host_moms_enable': False})
        j = Job()
        j.set_sleep_time(10)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'H'}, id=jid)
        # With it on, the hook succeeds and the job runs
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'acl_host_moms_enable': True})
        j = Job()
        j.set_sleep_time(10)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)

    def test_acl_host_mom_queue_access(self):
        """
        Test that remote host cannot submit jobs to queue where
        acl_host_enable is True and acl_host_moms_enable is set on server,
        but remote host is not added in acl_hosts.
        """
        queue_n = 'tempq'
        queue_params = {'queue_type': 'Execution', 'enabled': 'True',
                        'started': 'True', 'acl_host_enable': 'True'}
        # Use queue_n consistently instead of repeating the literal
        self.server.manager(MGR_CMD_CREATE, QUEUE, queue_params, id=queue_n)
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'acl_host_moms_enable': True})
        # Setting acl_host_enable on queue overrides acl_host_moms_enable
        # on server and requires acl_hosts to include remote host's name.
        self.server.manager(MGR_CMD_SET, SERVER, {'flatuid': True})
        # Setting flatuid lets us submit jobs on server as a remote
        # host without creating a separate user account there.
        j = Job(attrs={ATTR_queue: queue_n})
        j.set_sleep_time(10)
        # Submission must be refused (assertRaises replaces the old
        # cannot_submit flag; the dead qsub_cmd_on_queue local is removed)
        with self.assertRaises(PbsSubmitError):
            self.server.submit(j)
# # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. from tests.functional import * class Test_acl_host_queue(TestFunctional): """ This test suite is for testing the queue attributes acl_host_enable and acl_hosts. """ def test_acl_host_enable_refuse(self): """ Set acl_host_enable = True on queue and check whether or not the submit is refused. """ a = {"acl_host_enable": True, "acl_hosts": "foo"} self.server.manager(MGR_CMD_SET, QUEUE, a, self.server.default_queue) j = Job(TEST_USER) try: self.server.submit(j) except PbsSubmitError as e: error_msg = "qsub: Access from host not allowed, or unknown host" self.assertEquals(e.msg[0], error_msg) else: self.fail("Queue is violating acl_hosts") def test_acl_host_enable_allow(self): """ Set acl_host_enable = True along with acl_hosts and check whether or not a job can be submitted. """ a = {"acl_host_enable": True, "acl_hosts": self.server.hostname} self.server.manager(MGR_CMD_SET, QUEUE, a, self.server.default_queue) j = Job(TEST_USER) jid = self.server.submit(j) self.logger.info('Job submitted successfully: ' + jid) ================================================ FILE: test/tests/functional/pbs_acl_host_server.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. 
from tests.functional import *


class Test_acl_host_server(TestFunctional):
    """
    This test suite is for testing the subnets in server's attribute
    acl_hosts. This test requires remote client.
    """

    def setUp(self):
        """
        Determine the server ip and remote host
        """
        TestFunctional.setUp(self)
        usage_string = 'test requires a remote client as input,' + \
            ' use -p client=<client>'
        self.serverip = socket.gethostbyname(self.server.hostname)
        if not self.du.is_localhost(self.server.client):
            self.remote_host = socket.getfqdn(self.server.client)
        else:
            self.skip_test(usage_string)
        self.assertTrue(self.remote_host)
        self.pbsnodes_cmd = os.path.join(
            self.server.pbs_conf['PBS_EXEC'], 'bin',
            'pbsnodes') + ' -av' + ' -s ' + self.server.hostname

    def test_acl_subnet_enable_allow(self):
        """
        Set acl_host_enable = True, subnet to server ip with the mask
        255.255.0.0 or 16 and check whether or not the remote host is
        able to run pbsnodes. It should allow.
        """
        # Both the dotted-quad and the CIDR spelling of the /16 mask
        # must admit the remote host
        for mask in ("255.255.0.0", "16"):
            a = {"acl_host_enable": True,
                 "acl_hosts": self.serverip + "/" + mask}
            self.server.manager(MGR_CMD_SET, SERVER, a)
            ret = self.du.run_cmd(self.remote_host, cmd=self.pbsnodes_cmd)
            self.assertEqual(ret['rc'], 0)

    def test_acl_subnet_enable_refuse(self):
        """
        Set acl_host_enable = True, subnet to server ip with the mask
        255.255.255.255 or 32 and check whether or not the remote host
        is able to run pbsnodes. It should refuse.
        """
        # A host-only (/32) mask must shut the remote host out
        for mask in ("255.255.255.255", "32"):
            a = {"acl_host_enable": True,
                 "acl_hosts": self.serverip + "/" + mask}
            self.server.manager(MGR_CMD_SET, SERVER, a)
            ret = self.du.run_cmd(self.remote_host, cmd=self.pbsnodes_cmd)
            self.assertNotEqual(ret['rc'], 0)

    def tearDown(self):
        """
        Unset the acl attributes so tearDown can process on remote host.
        """
        self.server.manager(MGR_CMD_UNSET, SERVER,
                            ["acl_host_enable", "acl_hosts"])
import time

from tests.functional import *


class TestAdminSuspend(TestFunctional):
    """
    Test the admin-suspend/admin-resume feature for node maintenance
    """
    # NOTE(review): this chunk of the file is cut off inside test_resume;
    # that trailing method is not reproduced here.

    def setUp(self):
        TestFunctional.setUp(self)
        a = {'resources_available.ncpus': 4, 'resources_available.mem': '4gb'}
        self.mom.create_vnodes(a, 1)

    def test_basic(self):
        """
        Test basic admin-suspend functionality
        """
        j1 = Job(TEST_USER)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid1)
        j2 = Job(TEST_USER)
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid2)
        vnode = self.mom.shortname + '[0]'
        # admin-suspend job 1
        self.server.sigjob(jid1, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'S'}, id=jid1)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        self.server.expect(NODE, {'maintenance_jobs': jid1})
        # admin-suspend job 2
        self.server.sigjob(jid2, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'S'}, id=jid2)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        self.server.expect(NODE, {'maintenance_jobs': jid1 + "," + jid2})
        # admin-resume job 1; the node must stay in state maintenance
        self.server.sigjob(jid1, 'admin-resume', runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        self.server.expect(NODE, {'maintenance_jobs': jid2})
        # admin-resume job 2; the node must return to state free
        self.server.sigjob(jid2, 'admin-resume', runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid2)
        self.server.expect(NODE, {'state': 'free'}, id=vnode)

    def test_basic_ja(self):
        """
        Test basic admin-suspend functionality for job arrays
        """
        jA = Job(TEST_USER)
        jA.set_attributes({'Resource_List.select': '1:ncpus=1',
                           ATTR_J: '1-2'})
        jidA = self.server.submit(jA)
        self.server.expect(JOB, {'job_state': 'B'}, id=jidA)
        subjobs = self.server.status(JOB, id=jidA, extend='t')
        # subjobs[0] is the array itself; we need the subjobs
        jid1 = subjobs[1]['id']
        jid2 = subjobs[2]['id']
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid1)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid2)
        vnode = self.mom.shortname + '[0]'
        # admin-suspend subjob 1
        self.server.sigjob(jid1, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'S'}, id=jid1)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        self.server.expect(NODE, {'maintenance_jobs': jid1})
        # admin-suspend subjob 2
        self.server.sigjob(jid2, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'S'}, id=jid2)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        self.server.expect(NODE, {'maintenance_jobs': jid1 + "," + jid2})
        # admin-resume subjob 1; the node must stay in state maintenance
        self.server.sigjob(jid1, 'admin-resume', runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        self.server.expect(NODE, {'maintenance_jobs': jid2})
        # admin-resume subjob 2; the node must return to state free
        self.server.sigjob(jid2, 'admin-resume', runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid2)
        self.server.expect(NODE, {'state': 'free'}, id=vnode)

    def test_basic_restart(self):
        """
        Test basic admin-suspend functionality with server restart.
        The restart will test if the node recovers properly in maintenance
        """
        j1 = Job(TEST_USER)
        jid = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42},
                           attrop=PTL_AND, id=jid)
        vnode = self.mom.shortname + '[0]'
        # admin-suspend the job
        self.server.sigjob(jid, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'S'}, id=jid)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        self.server.expect(NODE, {'maintenance_jobs': jid})
        self.server.restart()
        # The maintenance state must survive the restart
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        self.server.expect(NODE, {'maintenance_jobs': jid})
        # Checking licenses to avoid failure at resume since PBS licenses
        # might not be available and as a result resume fails
        rv = self.is_server_licensed(self.server)
        _msg = 'No license found on server %s' % (self.server.shortname)
        self.assertTrue(rv, _msg)
        # admin-resume the job
        self.server.sigjob(jid, 'admin-resume', runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.server.expect(NODE, {'state': 'free'}, id=vnode)

    def test_cmd_perm(self):
        """
        Test permissions on admin-suspend, admin-resume, maintenance_jobs
        and the maintenance node state.
        """
        vnode = self.mom.shortname + '[0]'
        # The maintenance node state must not be settable directly
        try:
            self.server.manager(
                MGR_CMD_SET, NODE, {'state': 'maintenance'},
                id=vnode, runas=ROOT_USER)
        except PbsManagerError as e:
            self.assertTrue('Illegal value for node state' in e.msg[0])
        self.server.expect(NODE, {'state': 'free'}, id=vnode)
        # The 'maintenance_jobs' attribute must not be settable either
        try:
            self.server.manager(
                MGR_CMD_SET, NODE, {'maintenance_jobs': 'foo'},
                id=vnode, runas=ROOT_USER)
        except PbsManagerError as e:
            self.assertTrue(
                'Cannot set attribute, read only or insufficient permission'
                in e.msg[0])
        self.server.expect(NODE, 'maintenance_jobs', op=UNSET, id=vnode)
        # Regular users must not be able to admin-suspend jobs
        j = Job(TEST_USER)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42},
                           attrop=PTL_AND, id=jid)
        try:
            self.server.sigjob(jid, 'admin-suspend', runas=TEST_USER)
        except PbsSignalError as e:
            self.assertTrue('Unauthorized Request' in e.msg[0])
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid)
        # Regular users must not be able to admin-resume jobs
        self.server.sigjob(jid, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'S'}, id=jid)
        try:
            self.server.sigjob(jid, 'admin-resume', runas=TEST_USER)
        except PbsSignalError as e:
            self.assertTrue('Unauthorized Request' in e.msg[0])
        self.server.expect(JOB, {'job_state': 'S'}, id=jid)

    def test_wrong_state1(self):
        """
        Test using wrong resume signal is correctly rejected
        """
        j1 = Job(TEST_USER)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid1)
        # Plain suspend, then try the admin-resume counterpart
        self.server.sigjob(jid1, "suspend", runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'S'}, id=jid1)
        try:
            self.server.sigjob(jid1, "admin-resume", runas=ROOT_USER)
        except PbsSignalError as e:
            self.assertTrue(
                'Job can not be resumed with the requested resume signal'
                in e.msg[0])
        self.server.expect(JOB, {'job_state': 'S'}, id=jid1)

    def test_wrong_state2(self):
        """
        Test using wrong resume signal is correctly rejected
        """
        j1 = Job(TEST_USER)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid1)
        # Admin-suspend, then try the plain resume counterpart
        self.server.sigjob(jid1, "admin-suspend", runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'S'}, id=jid1)
        self.server.expect(JOB, {'substate': 43}, id=jid1)
        try:
            self.server.sigjob(jid1, "resume", runas=ROOT_USER)
        except PbsSignalError as e:
            self.assertTrue(
                'Job can not be resumed with the requested resume signal'
                in e.msg[0])
        # If resume had worked, the job would be in substate 45
        self.server.expect(JOB, {'substate': 43}, id=jid1)

    def test_deljob(self):
        """
        Test whether a node leaves the maintenance state when an
        admin-suspended job is deleted
        """
        j = Job(TEST_USER)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid)
        vnode = self.mom.shortname + '[0]'
        self.server.sigjob(jid, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        self.server.deljob(jid, wait=True)
        self.server.expect(NODE, {'state': 'free'}, id=vnode)

    def test_deljob_force(self):
        """
        Test whether a node leaves the maintenance state when an
        admin-suspended job is deleted with -Wforce
        """
        j = Job(TEST_USER)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid)
        vnode = self.mom.shortname + '[0]'
        self.server.sigjob(jid, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        self.server.deljob(jid, extend='force', wait=True)
        self.server.expect(NODE, {'state': 'free'}, id=vnode)

    def test_rerunjob(self):
        """
        Test whether a node leaves the maintenance state when an
        admin-suspended job is requeued
        """
        j = Job(TEST_USER)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid)
        vnode = self.mom.shortname + '[0]'
        self.server.sigjob(jid, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        self.server.rerunjob(jid, extend='force')
        # Job eventually goes to R state after being requeued for short time
        self.server.expect(JOB, {'job_state': 'R'}, id=jid)
        self.server.expect(NODE, {'state': 'free'}, id=vnode)

    def test_multivnode(self):
        """
        Submit a job to multiple vnodes. Send an admin-suspend signal
        and see all nodes go into maintenance
        """
        a = {'resources_available.ncpus': 4, 'resources_available.mem': '4gb'}
        self.mom.create_vnodes(a, 3, usenatvnode=True)
        j = Job(TEST_USER)
        j.set_attributes({'Resource_List.select': '3:ncpus=1',
                          'Resource_List.place': 'vscatter'})
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid)
        self.server.sigjob(jid, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(NODE, {'state=maintenance': 3})
        self.server.expect(JOB, {'job_state': 'S'}, id=jid)
        self.server.sigjob(jid, 'admin-resume', runas=ROOT_USER)
        self.server.expect(NODE, {'state=free': 3})

    def test_multivnode2(self):
        """
        Submit a job to multiple vnodes. Send an admin-suspend signal
        and see all nodes go into maintenance. Submit a single node job
        to one of the nodes. Resume the multinode job and see the single
        node job's node still in maintenance
        """
        a = {'resources_available.ncpus': 4, 'resources_available.mem': '4gb'}
        self.mom.create_vnodes(a, 3, usenatvnode=True)
        # Submit multinode job 1
        j1 = Job(TEST_USER)
        j1.set_attributes({'Resource_List.select': '3:ncpus=1',
                           'Resource_List.place': 'vscatter'})
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid1)
        vnode = self.mom.shortname + '[0]'
        # Submit job 2 to a specific node
        j2 = Job(TEST_USER)
        j2.set_attributes(
            {'Resource_List.select': '1:ncpus=1:vnode=' + vnode})
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid2)
        # admin-suspend job 1 and see all three nodes go into maintenance
        self.server.sigjob(jid1, 'admin-suspend')
        self.server.expect(JOB, {'job_state': 'S'}, id=jid1)
        self.server.expect(NODE, {'state=maintenance': 3})
        # admin-suspend job 2
        self.server.sigjob(jid2, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'S'}, id=jid2)
        # admin-resume job 1 and see one node stay in maintenance
        self.server.sigjob(jid1, 'admin-resume', runas=ROOT_USER)
        self.server.expect(NODE, {'state=free': 2})
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)

    def test_multivnode_excl(self):
        """
        Submit an excl job to multiple vnodes. Send an admin-suspend
        signal and see all nodes go into maintenance
        """
        a = {'resources_available.ncpus': 4, 'resources_available.mem': '4gb'}
        self.mom.create_vnodes(a, 3, usenatvnode=True)
        j = Job(TEST_USER)
        j.set_attributes({'Resource_List.select': '3:ncpus=1',
                          'Resource_List.place': 'vscatter:excl'})
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid)
        self.server.expect(NODE, {'state=job-exclusive': 3})
        self.server.sigjob(jid, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(NODE, {'state=maintenance': 3})
        self.server.expect(JOB, {'job_state': 'S'}, id=jid)
        self.server.sigjob(jid, 'admin-resume', runas=ROOT_USER)
        self.server.expect(NODE, {'state=job-exclusive': 3})

    def test_degraded_resv(self):
        """
        Test if a reservation goes into the degraded state after
        its node is put into maintenance
        """
        # Submit a reservation and see it confirmed
        r = Reservation(TEST_USER)
        r.set_attributes({'Resource_List.select': '1:ncpus=1',
                          'reserve_start': time.time() + 3600,
                          'reserve_end': time.time() + 7200})
        rid = self.server.submit(r)
        a = {'reserve_state': (MATCH_RE, 'RESV_CONFIRMED|2')}
        self.server.expect(RESV, a, rid)
        # Submit a job and see it run
        j = Job(TEST_USER)
        j.set_attributes({'Resource_List.select': '1:ncpus=1',
                          'Resource_List.walltime': 120})
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid)
        vnode = self.mom.shortname + '[0]'
        # admin-suspend the job
        self.server.sigjob(jid, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        # See the reservation in degraded state
        a = {'reserve_state': (MATCH_RE, 'RESV_DEGRADED|10')}
        self.server.expect(RESV, a, rid)

    def test_resv_jobend(self):
        """
        Test if a node goes back to free state when reservation ends
        and admin-suspended job is killed
        """
        # Submit a reservation and see it confirmed
        r = Reservation(TEST_USER)
        r.set_attributes({'Resource_List.select': '1:ncpus=1',
                          'reserve_start': time.time() + 30,
                          'reserve_end': time.time() + 60})
        rid = self.server.submit(r)
        a = {'reserve_state': (MATCH_RE, 'RESV_CONFIRMED|2')}
        self.server.expect(RESV, a, id=rid)
        # Submit a job into the reservation's queue
        j = Job(TEST_USER)
        rque = rid.split(".")
        j.set_attributes({'queue': rque[0]})
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid)
        # Wait for the reservation to start
        a = {'reserve_state': (MATCH_RE, 'RESV_RUNNING|3')}
        self.server.expect(RESV, a, rid, offset=30)
        # The job is running as well
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42},
                           id=jid, max_attempts=30)
        vnode = self.mom.shortname + '[0]'
        # admin-suspend the job
        self.server.sigjob(jid, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        # Submit another job outside of the reservation
        j = Job(TEST_USER)
        jid2 = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid2)
        # When the reservation ends the job gets deleted and the node
        # state goes back to free
        self.server.expect(JOB, 'queue', op=UNSET, id=jid, offset=120)
        self.server.expect(NODE, {'state': 'free'}, id=vnode)
        # job2 starts running
        self.server.expect(JOB, {'job_state': 'R'}, id=jid2,
                           max_attempts=60)

    def test_que(self):
        """
        Test to check that job gets suspended on non-default queue
        """
        # create a high priority workq2 and a routeq
        a = {'queue_type': 'execution', 'started': 't', 'enabled': 't',
             'priority': 150}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, id='workq2')
        a = {'queue_type': 'route', 'started': 't', 'enabled': 't',
             'route_destinations': 'workq2'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, id='route')
        # submit a normal job
        j = Job(TEST_USER)
        j.set_attributes({'Resource_List.select': '1:ncpus=3'})
        jid1 = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid1)
        # submit a high priority job; make sure job1 is suspended
        j = Job(TEST_USER)
        j.set_attributes({'Resource_List.select': '1:ncpus=3',
                          'queue': 'route'})
        jid2 = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid2)
        self.server.expect(JOB, {'job_state': 'S'}, id=jid1)
        # The scheduler suspension above must not put the node into
        # maintenance
        vnode = self.mom.shortname + '[0]'
        self.server.expect(
            NODE, {'state': (MATCH_RE, 'free|job-exclusive')}, id=vnode)
        # admin-suspend job2
        self.server.sigjob(jid2, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        self.server.expect(JOB, {'job_state=S': 2})
        # Releasing job1 will fail and not change node state
        rv = self.server.sigjob(jid1, 'resume', runas=ROOT_USER,
                                logerr='True')
        self.assertFalse(rv)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        # deleting job1 will not change node state either
        self.server.deljob(jid1, wait=True)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vnode)
        # admin-resume job2
        self.server.sigjob(jid2, 'admin-resume', runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid2)
        self.server.expect(NODE, {'state': 'free'}, id=vnode)
        # plain suspend leaves the node out of maintenance
        self.server.sigjob(jid2, 'suspend', runas=ROOT_USER)
        self.server.expect(JOB, {'job_state': 'S'}, id=jid2)
        self.server.expect(
            NODE, {'state': (MATCH_RE, 'free|job-exclusive')}, id=vnode)
nodes to maintenance state self.server.sigjob(jid1, 'admin-suspend', runas=ROOT_USER) self.server.expect(NODE, {'state=maintenance': 3}) self.server.sigjob(jid2, 'admin-suspend', runas=ROOT_USER) self.server.expect(JOB, {'job_state=S': 2}) self.server.expect(JOB, {'job_state': 'R'}, id=jid3) # submit a new job and it will be queued j = Job(TEST_USER) jid4 = self.server.submit(j) self.server.expect(JOB, {'job_state': 'Q'}, id=jid4) # List all maintenance_jobs self.server.expect(NODE, {'maintenance_jobs': jid1 + "," + jid2}) # resume 1 job that will not change node state self.server.sigjob(jid1, 'admin-resume', runas=ROOT_USER) self.server.expect(NODE, {'state=maintenance': 3}) self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid1) self.server.expect(JOB, {'job_state': 'S'}, id=jid2) self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid3) # resume the remaining job self.server.sigjob(jid2, 'admin-resume', runas=ROOT_USER) self.server.expect(NODE, {'state=free': 3}) self.server.expect(JOB, {'job_state=R': 4}) def test_admin_resume_loop(self): """ Test that running admin-resume in a loop will have no impact on PBS """ # submit a job j = Job(TEST_USER) j.set_sleep_time(300) jid1 = self.server.submit(j) self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid1) vnode = self.mom.shortname + '[0]' # admin suspend and resume job in a loop for x in range(15): self.server.sigjob(jid1, 'admin-suspend', runas=ROOT_USER) self.server.expect(JOB, {'job_state': 'S'}, id=jid1) self.server.expect(NODE, {'state': 'maintenance'}, id=vnode) # sleep for sometime time.sleep(3) # resume the job self.server.sigjob(jid1, 'admin-resume', runas=ROOT_USER) self.server.expect(JOB, {'job_state': 'R'}, id=jid1) self.server.expect(NODE, {'state': 'free'}, id=vnode) def test_custom_res(self): """ Test that job will not run on a node in maintenance state if explicitly asking for a resource on that node """ # create multiple vnodes a = {'resources_available.ncpus': 
4, 'resources_available.mem': '4gb'} self.mom.create_vnodes(a, 3, usenatvnode=True) # create a node level resource self.server.manager( MGR_CMD_CREATE, RSC, {'type': 'float', 'flag': 'nh'}, id="foo", runas=ROOT_USER) vnode = self.mom.shortname + '[1]' # set foo on vn[1] self.server.manager( MGR_CMD_SET, NODE, {'resources_available.foo': 5}, id=vnode, runas=ROOT_USER) # set foo in sched_config self.scheduler.add_resource('foo') # submit a few jobs j = Job(TEST_USER) j.set_attributes({'Resource_List.select': 'vnode=' + vnode}) jid1 = self.server.submit(j) self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid1) # admin suspend the job to put the node to maintenance self.server.sigjob(jid1, 'admin-suspend', runas=ROOT_USER) self.server.expect(JOB, {'job_state': 'S'}, id=jid1) self.server.expect(NODE, {'state': 'maintenance'}, id=vnode) # submit other jobs asking for specific resources on vn[1] j = Job(TEST_USER) j.set_attributes({'Resource_List.foo': '2'}) jid2 = self.server.submit(j) self.server.expect(JOB, {'job_state': 'Q'}, id=jid2) # submit more jobs. 
They should be running j = Job(TEST_USER) jid3 = self.server.submit(j) jid4 = self.server.submit(j) self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid3) self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid4) # verify that vn[1] is still in maintenance and # job3 and job4 not running on vn[1] self.server.expect(NODE, {'state': 'maintenance'}, id=vnode) try: self.server.expect(JOB, {'exec_vnode': (MATCH_RE, vnode)}, id=jid3, max_attempts=20) self.server.expect(JOB, {'exec_vnode': (MATCH_RE, vnode)}, id=jid4, max_attempts=20) except Exception as e: self.assertFalse(e.rv) msg = "jid3 and jid4 not running on " + vnode + " as expected" self.logger.info(msg) def test_list_jobs_1(self): """ Test to list and set maintenance_jobs as various users """ # This test is run with CLI mode only _m = self.server.get_op_mode() if _m != PTL_CLI: self.skipTest("Not all commands can be run with API mode") # submit a few jobs j = Job(TEST_USER) jid1 = self.server.submit(j) jid2 = self.server.submit(j) jid3 = self.server.submit(j) # verify that all are running self.server.expect(JOB, {'job_state=R': 3, 'substate=42': 3}) # admin-suspend 2 of them self.server.sigjob(jid2, 'admin-suspend', runas=ROOT_USER) self.server.sigjob(jid3, 'admin-suspend', runas=ROOT_USER) vnode = self.mom.shortname + '[0]' # node state is in maintenance self.server.expect(NODE, {'state': 'maintenance'}, id=vnode) # list maintenance_jobs as root self.server.expect(NODE, {'maintenance_jobs': jid2 + "," + jid3}, runas=ROOT_USER) # list maintenance jobs as user self.server.expect(NODE, {'maintenance_jobs': jid2 + "," + jid3}, runas=TEST_USER) # set an operator self.server.manager(MGR_CMD_SET, SERVER, {'operators': 'pbsoper@*'}) # List all jobs in maintenance mode as operator self.server.expect( NODE, {'maintenance_jobs': jid2 + "," + jid3}, runas='pbsoper') # set maintenance_jobs as root try: self.server.manager(MGR_CMD_SET, NODE, {'maintenance_jobs': jid1}, id=vnode, runas=ROOT_USER) 
except PbsManagerError as e: self.assertFalse(e.rv) msg = "Cannot set attribute, read only" +\ " or insufficient permission maintenance_jobs" self.assertTrue(msg in e.msg[0]) # Set maintenance_jobs as operator try: self.server.manager(MGR_CMD_SET, NODE, {'maintenance_jobs': jid1}, id=vnode, runas='pbsoper') except PbsManagerError as e: self.assertFalse(e.rv) msg = "Cannot set attribute, read only" +\ " or insufficient permission maintenance_jobs" self.assertTrue(msg in e.msg[0]) # Set maintenance_jobs as user try: self.server.manager(MGR_CMD_SET, NODE, {'maintenance_jobs': jid1}, id=vnode, runas=TEST_USER) except PbsManagerError as e: self.assertFalse(e.rv) self.assertTrue("Unauthorized Request" in e.msg[0]) def test_list_jobs_2(self): """ Test to list maintenance_jobs when no job is admin-suspended """ # Submit a few jobs j = Job(TEST_USER) jid1 = self.server.submit(j) jid2 = self.server.submit(j) jid3 = self.server.submit(j) # verify that all are running self.server.expect(JOB, {'job_state=R': 3, 'substate=42': 3}) vnode = self.mom.shortname + '[0]' # list maintenance_jobs. 
It should be empty self.server.expect(NODE, 'maintenance_jobs', op=UNSET, id=vnode) # Regular suspend a job self.server.sigjob(jid2, 'suspend', runas=ROOT_USER) # List maintenance_jobs again self.server.expect(NODE, 'maintenance_jobs', op=UNSET, id=vnode) def test_preempt_order(self): """ Test that scheduler preempt_order has no impact on admin-suspend """ # create a high priority queue a = {'queue_type': 'e', 'enabled': 't', 'started': 't', 'priority': 150} self.server.manager(MGR_CMD_CREATE, QUEUE, a, id="highp") # set preempt_order to R self.server.manager(MGR_CMD_SET, SCHED, {'preempt_order': 'R'}, runas=ROOT_USER) vnode = self.mom.shortname + '[0]' # submit a job j = Job(TEST_USER) j.set_attributes({'Resource_List.select': 'vnode=' + vnode}) jid1 = self.server.submit(j) self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid1) # submit a high priority job j = Job(TEST_USER) j.set_attributes({'queue': 'highp', 'Resource_List.select': '1:ncpus=4:vnode=' + vnode}) jid2 = self.server.submit(j) # job2 is running and job1 is requeued self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid2) self.server.expect(JOB, {'job_state': 'Q'}, id=jid1) # admin-suspend job1. It will fail try: self.server.sigjob(jid1, 'admin-suspend', logerr=False) except Exception as e: self.assertFalse(e.rv) # admin suspend job2 self.server.sigjob(jid2, 'admin-suspend') self.server.expect(JOB, {'job_state': 'S'}, id=jid2) self.server.expect(NODE, {'state': 'maintenance'}, id=vnode) # admin-resume job2. node state will become job-busy. 
self.server.sigjob(jid2, 'admin-resume') self.server.expect(NODE, {'state': 'job-busy'}, id=vnode) self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid2) self.server.expect(JOB, {'job_state': 'Q'}, id=jid1) def test_hook(self): """ List maintenance_jobs via hook """ # Create and import a hook hook_name = "test" hook_body = """ import pbs vn = pbs.server().vnode('vn[0]') pbs.logmsg(pbs.LOG_DEBUG,\ "list of maintenance_jobs are %s" % vn.maintenance_jobs) """ a = {'resources_available.ncpus': 4, 'resources_available.mem': '4gb'} self.mom.create_vnodes(a, 1, vname='vn') a = {'event': 'exechost_periodic', 'enabled': 'True', 'freq': 5} self.server.create_import_hook(hook_name, a, hook_body) # submit few jobs j = Job(TEST_USER) jid1 = self.server.submit(j) jid2 = self.server.submit(j) self.server.expect(JOB, {'job_state=R': 2}) # wait for the periodic hook time.sleep(5) # look for the log message self.mom.log_match("list of maintenance_jobs are None") # admin-suspend jobs self.server.sigjob(jid1, 'admin-suspend') self.server.sigjob(jid2, 'admin-suspend') # wait for periodic hook and check mom_log time.sleep(5) self.mom.log_match("list of maintenance_jobs are %s" % ((jid1 + "," + jid2),)) # admin-resume job1 self.server.sigjob(jid1, 'admin-resume') # wait for periodic hook and check mom_log time.sleep(5) self.mom.log_match( "list of maintenance_jobs are %s" % (jid2,)) def test_offline(self): """ Test that if a node is put to offline and removed from maintenance state it remains offlined """ # submit a job and admin-suspend it j1 = Job(TEST_USER) jid1 = self.server.submit(j1) j2 = Job(TEST_USER) jid2 = self.server.submit(j2) self.server.expect(JOB, {'job_state': "R", 'substate': 42}, id=jid1) self.server.expect(JOB, {'job_state': "R", 'substate': 42}, id=jid2) self.server.sigjob(jid1, 'admin-suspend') self.server.sigjob(jid2, 'admin-suspend') vnode = self.mom.shortname + '[0]' # node state is in maintenance self.server.expect(NODE, {'state': 'maintenance'}, 
id=vnode) # submit another job. It will be queued j3 = Job(TEST_USER) jid3 = self.server.submit(j3) self.server.expect(JOB, {'job_state': 'Q'}, id=jid3) # mark the node as offline too self.server.manager(MGR_CMD_SET, NODE, {'state': 'offline'}, id=vnode) # delete job1 as user and resume job2 self.server.deljob(jid1, wait=True, runas=TEST_USER) self.server.sigjob(jid2, 'admin-resume') # verify that node state is offline and # job3 is still queued self.server.expect(NODE, {'state': 'offline'}, id=vnode) self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid2) self.server.expect(JOB, {'job_state': 'Q'}, id=jid3) ================================================ FILE: test/tests/functional/pbs_allpart.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. 
# # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. from tests.functional import * class TestSchedAllPart(TestFunctional): """ Test the scheduler's allpart optimization """ def setUp(self): TestFunctional.setUp(self) a = {'resources_available.ncpus': 1, 'resources_available.mem': '1gb'} self.mom.create_vnodes(a, 2, usenatvnode=True) def test_free_nodes(self): """ Test that if there aren't enough free nodes available, it is reported """ self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'}) a = {'Resource_List.select': '2:ncpus=1'} j1 = Job(TEST_USER, a) jid1 = self.server.submit(j1) j2 = Job(TEST_USER, a) jid2 = self.server.submit(j2) self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'}) self.server.expect(JOB, {'job_state': 'R'}, id=jid1) a = {'job_state': 'Q', 'comment': 'Not Running: Not enough free nodes available'} self.server.expect(JOB, a, id=jid2) def test_vscatter(self): """ Test that we determine we can't run a job when there aren't enough free nodes available due to vscatter """ self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'}) a = {'Resource_List.select': '1:ncpus=1'} j1 = Job(TEST_USER, a) jid1 = self.server.submit(j1) a = {'Resource_List.select': '2:ncpus=1', 'Resource_List.place': 'vscatter'} j2 = Job(TEST_USER, a) jid2 = self.server.submit(j2) self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'}) self.server.expect(JOB, {'job_state': 'R'}, id=jid1) a = {'job_state': 'Q', 'comment': 'Not Running: Not enough free nodes available'} self.server.expect(JOB, a, id=jid2) def test_vscatter2(self): """ Test that we can 
determine a job can never run if it is requesting more nodes than is in the complex via vscatter """ self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'}) a = {'Resource_List.select': '3:ncpus=1', 'Resource_List.place': 'vscatter'} j = Job(TEST_USER, a) jid = self.server.submit(j) self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'}) a = {'job_state': 'Q', 'comment': 'Can Never Run: Not enough total nodes available'} self.server.expect(JOB, a, id=jid) def test_rassn(self): """ Test rassn resource (ncpus) is unavailable and the comment is shown with a RAT line """ self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'}) a = {'Resource_List.select': '1:ncpus=1'} j1 = Job(TEST_USER, a) jid1 = self.server.submit(j1) a = {'Resource_List.select': '2:ncpus=1'} j2 = Job(TEST_USER, a) jid2 = self.server.submit(j2) self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'}) self.server.expect(JOB, {'job_state': 'R'}, id=jid1) m = 'Not Running: Insufficient amount of resource: ncpus ' + \ '(R: 2 A: 1 T: 2)' a = {'job_state': 'Q', 'comment': m} self.server.expect(JOB, a, id=jid2) def test_nonexistent_non_consumable(self): """ Test that a nonexistent non-consumable value is caught as 'Never Run' """ a = {'Resource_List.select': '1:ncpus=1:vnode=foo'} j = Job(TEST_USER, a) jid = self.server.submit(j) m = r'Can Never Run: Insufficient amount of resource: vnode \(foo !=' a = {'job_state': 'Q', 'comment': (MATCH_RE, m)} self.server.expect(JOB, a, id=jid) def test_too_many_ncpus(self): """ test that a job is marked as can never run if it requests more cpus than are available on the entire complex """ a = {'Resource_List.select': '3:ncpus=1'} j = Job(TEST_USER, a) jid = self.server.submit(j) m = 'Can Never Run: Insufficient amount of resource: ncpus ' + \ '(R: 3 A: 2 T: 2)' a = {'job_state': 'Q', 'comment': m} self.server.expect(JOB, a, id=jid) ================================================ FILE: 
test/tests/functional/pbs_alps_inventory_check_hook.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. import os from tests.functional import * @tags('cray', 'mom') class TestAlpsInventoryCheckHook(TestFunctional): """ PBS mom appears not to periodically automatically re-query the node inventory on Cray. 
""" def setUp(self): self.platform = DshUtils().get_platform() if self.platform != 'cray' and self.platform != 'craysim': self.skipTest("This is not a cray platform") TestFunctional.setUp(self) with open("/etc/xthostname") as xthost_file: self.crayhostname = xthost_file.readline().rstrip() self.server.manager(MGR_CMD_SET, PBS_HOOK, {'enabled': 'true', 'freq': 3}, id='PBS_alps_inventory_check') def delete_cray_compute_node(self): """ Deletes the cray compute node from pbs node list """ vnl = self.server.filter( VNODE, {'resources_available.vntype': 'cray_compute'}) vlist = vnl["resources_available.vntype=cray_compute"] self.server.manager(MGR_CMD_DELETE, NODE, id=vlist[0]) def test_apstat_cmd(self): """ Test the log when apstat is not present in the expected/default location, it indicates a Cray system issue. """ now = time.time() if self.platform == "craysim": if os.path.exists("/opt/cray/alps/default/bin/stat"): # The file to be renamed is conflicting with existing file self.skipTest("Conflict in the testcase settings") os.rename( "/opt/cray/alps/default/bin/apstat", "/opt/cray/alps/default/bin/stat") try: self.mom.log_match( "ALPS Inventory Check: apstat command can not " + "be found at /opt/cray/alps/default/bin/apstat", starttime=now, max_attempts=10, interval=2) finally: os.rename( "/opt/cray/alps/default/bin/stat", "/opt/cray/alps/default/bin/apstat") else: self.skipTest("This test can be run on a simulator") def test_xthostname(self): """ Test when hook attempts to read the /etc/xthostname file to determine Cray hostname, but the hostname file is missing. 
""" now = time.time() if self.platform == "craysim": if os.path.exists("/etc/xt"): # The file to be renamed is conflicting with existing file self.skipTest("Conflict in the testcase settings") os.rename("/etc/xthostname", "/etc/xt") try: self.mom.log_match( "/etc/xthostname file found on this host", starttime=now, max_attempts=10, interval=2) finally: os.rename("/etc/xt", "/etc/xthostname") else: self.skipTest("This test can be run on a simulator") def test_start_of_hook(self): """ Test log at the start of hook processing. """ now = time.time() self.mom.log_match( "Processing ALPS inventory for crayhost %s" % self.crayhostname, starttime=now, max_attempts=10, interval=2) def test_cray_login_nodes(self): """ Test log when no nodes with vntype 'cray_login' are present. """ now = time.time() mc = self.mom.parse_config() save = mc["$alps_client"] del mc["$alps_client"] self.mom.apply_config(mc) self.host = self.mom.shortname try: self.server.manager(MGR_CMD_DELETE, NODE, None, "") self.server.manager(MGR_CMD_CREATE, NODE, id=self.host) self.mom.log_match( "ALPS Inventory Check: No eligible " + "login nodes to perform inventory check", starttime=now, max_attempts=10, interval=2) finally: mc["$alps_client"] = save self.mom.apply_config(mc, False) def test_pbs_home_path(self): """ Test log when mom_priv directory is not in the expected/default location (PBS_HOME), indicating a PBS installation issue. 
""" if self.platform == "craysim": now = time.time() pbs_conf = self.du.parse_pbs_config(self.server.shortname) save = pbs_conf['PBS_HOME'] self.du.set_pbs_config( self.server.shortname, confs={ 'PBS_HOME': ''}) try: self.delete_cray_compute_node() self.mom.log_match( "ALPS Inventory Check: Internal error in retrieving " + "path to mom_priv", starttime=now, max_attempts=10, interval=2) finally: self.du.set_pbs_config( self.server.shortname, confs={ 'PBS_HOME': save}) else: self.skipTest("This test can be run on a simulator") def test_alps_and_pbs_are_in_sync(self): """ Test log when both PBS and ALPS are in sync i.e. they report the same number of compute nodes in the Cray cluster. """ now = time.time() self.mom.log_match( "ALPS Inventory Check: PBS and ALPS are in sync", starttime=now, max_attempts=10, interval=2) def test_nodes_out_of_sync(self): """ Test the log when PBS and ALPS are out of sync """ now = time.time() self.delete_cray_compute_node() self.mom.log_match( "ALPS Inventory Check: Compute " + "nodes defined in ALPS, but not in PBS", starttime=now, max_attempts=10, interval=2) def test_failure_in_refreshing_nodes(self): """ Test log when the Hook is unable to HUP the Mom and successfully refresh nodes. 
""" if self.platform == "craysim": now = time.time() pbs_conf = self.du.parse_pbs_config(self.server.shortname) save = pbs_conf['PBS_HOME'] self.du.set_pbs_config( self.server.shortname, confs={'PBS_HOME': 'xyz'}) try: self.delete_cray_compute_node() self.mom.log_match( "ALPS Inventory Check: Failure in refreshing nodes on " + "login node (%s)" % self.mom.hostname, starttime=now, max_attempts=10, interval=2) finally: self.du.set_pbs_config( self.server.shortname, confs={ 'PBS_HOME': save}) else: self.skipTest("This test can be run on cray a simulator") ================================================ FILE: test/tests/functional/pbs_alps_release_tunables.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. 
#
# Altair's dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of OpenPBS and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair's trademarks, including but not limited to "PBS™",
# "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is
# subject to Altair's trademark licensing policies.

from tests.functional import *
import math
from ptl.utils.pbs_logutils import PBSLogUtils


@tags('cray')
class TestCrayAlpsReleaseTunables(TestFunctional):

    """
    Set of tests to verify alps release tunables namely
    alps_release_wait_time and alps_release_jitter
    """

    def setUp(self):
        machine = self.du.get_platform()
        if not machine == 'cray':
            self.skipTest("Test suite only meant to run on a Cray")
        TestFunctional.setUp(self)

    @staticmethod
    def get_epoch(msg):
        """Return the epoch timestamp of a PBS log line."""
        # Since its a log message split on ';' to get timestamp
        a = PBSLogUtils.convert_date_time(msg.split(';')[0])
        return a

    def test_alps_release_wait_time(self):
        """
        Set alps_release_wait_time to a higher value and then notice that
        subsequest reservation cancellation requests are made at least
        after the set interval.
        """
        # assigning a random value to alps_release_wait_time that is
        # measurable using mom log messages
        arwt = 4.298
        self.mom.add_config({'$alps_release_wait_time': arwt})
        # submit a job and then delete it after it starts running
        start_time = time.time()
        j1 = Job(TEST_USER)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        time.sleep(2)
        self.server.delete(jid1)
        # Look for a message that confirms that reservation is deleted
        self.mom.log_match("%s;ALPS reservation cancelled" % jid1,
                           starttime=start_time)
        # Now that we know that reservation is cleared we should
        # check for time difference between each cancellation request
        out = self.mom.log_match("%s;Canceling ALPS reservation *" % jid1,
                                 n='ALL', regexp=True, allmatch=True)
        # We found something, Let's first check there are atleast 2 such
        # log messages, If not then that means reservation was cancelled
        # in the first attempt itself, at that point right thing to do is
        # to either run it again or find out a way to delay the reservation
        # cancellation at ALPS level itself.
        if len(out) >= 2:
            # variable 'out' is a list of tuples and every second element
            # in a tuple is the matched log message
            time_prev = self.get_epoch(out[0][1])
            for data in out[1:]:
                time_current = self.get_epoch(data[1])
                fail_msg = "alps_release_wait_time not working"
                # Consecutive cancel attempts must be spaced by at least
                # the configured wait time (floored: log timestamps are
                # whole seconds)
                self.assertGreaterEqual(time_current - time_prev,
                                        math.floor(arwt),
                                        msg=fail_msg)
                time_prev = time_current
        else:
            self.skipTest("Reservation cancelled without retry, Try again!")

    def test_alps_release_jitter(self):
        """
        Set alps_release_jitter to a higher value and then notice that
        subsequest reservation cancellation requests are made by adding
        a random time interval (less than jitter) to
        alps_release_wait_time.
        """
        # assigning a random value to alps_release_jitter that is
        # measurable using mom log messages
        arj = 2.198
        arwt = 1
        # upper bound for the gap between two cancel attempts:
        # wait time plus the (ceiled) jitter
        max_delay = (arwt + math.ceil(arj))
        self.mom.add_config({'$alps_release_jitter': arj})
        self.mom.add_config({'$alps_release_wait_time': arwt})
        # There is no good way to test jitter and it is a random number
        # less than value set in alps_release_jitter. So in this case
        # we can probably try deleting a reservation a few times.
        n = retry = 5
        for _ in range(n):
            # submit a job and then delete it after it starts running
            start_time = time.time()
            j1 = Job(TEST_USER)
            jid1 = self.server.submit(j1)
            self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
            time.sleep(2)
            self.server.delete(jid1)
            # Look for a message that confirms that reservation is deleted
            self.mom.log_match("%s;ALPS reservation cancelled" % jid1,
                               starttime=start_time)
            # Now that we know that reservation is cleared we should
            # check for time difference between each cancellation request
            out = self.mom.log_match("%s;Canceling ALPS reservation *" % jid1,
                                     n='ALL', regexp=True, allmatch=True)
            # We found something, Let's first check there are atleast 2 such
            # log messages, If not then that means reservation was cancelled
            # in the first attempt itself, at that point right thing to do is
            # to either run it again or find out a way to delay the
            # reservation cancellation at ALPS level itself.
            if len(out) >= 2:
                # NOTE(review): 'retry' only counts iterations that saw a
                # real retry; it never short-circuits the loop — confirm
                # this is intended
                retry -= 1
                # variable 'out' is a list of tuples and every second element
                # in a tuple is the matched log message
                time_prev = self.get_epoch(out[0][1])
                for data in out[1:]:
                    time_current = self.get_epoch(data[1])
                    self.assertLessEqual(time_current - time_prev, max_delay,
                                         msg="alps_release_jitter not working")
                    time_prev = time_current
        # retry still 5 means no iteration ever observed a second cancel
        # attempt, so jitter could not be measured at all
        if retry == 5:
            self.skipTest("Reservation cancelled without retry, Try again!")


================================================
FILE: test/tests/functional/pbs_array_job_mail.py
================================================
# coding: utf-8
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
#
# Commercial License Information:
#
# PBS Pro is commercially licensed software that shares a common core with
# the OpenPBS software. For a copy of the commercial license terms and
# conditions, go to: (http://www.pbspro.com/agreement.html) or contact the
# Altair Legal Department.
# # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. from tests.functional import * import os class Test_array_job_email(TestFunctional): """ This test suite is for testing arrayjob e-mailing (parent job and subjob) """ def test_emails(self): """ Run arrayjob with -m jabe and test if the e-mails are received """ self.server.manager(MGR_CMD_SET, SERVER, {'job_history_enable': 'true'}) mailfile = os.path.join("/var/mail", str(TEST_USER)) if not os.path.isfile(mailfile): self.skip_test("Mail file '%s' does not exist or " "mail is not setup. " "Hence this step would be skipped. " "Please check manually." 
% mailfile) J = Job(TEST_USER, attrs={ATTR_m: 'jabe', ATTR_J: '1-2'}) J.set_sleep_time(1) parent_jid = self.server.submit(J) self.server.expect(JOB, {'job_state': 'F'}, parent_jid, extend='x', max_attempts=15, interval=2) subjob_jid = parent_jid.replace("[]", "[1]") emails = [("PBS Job Id: " + parent_jid, "Begun execution"), ("PBS Job Id: " + parent_jid, "Execution terminated"), ("PBS Job Id: " + subjob_jid, "Begun execution"), ("PBS Job Id: " + subjob_jid, "Execution terminated")] for (jobid, msg) in emails: emailpass = 0 for j in range(5): time.sleep(5) ret = self.du.tail(filename=mailfile, sudo=True, option="-n 600") maillog = [x.strip() for x in ret['out']] for i in range(0, len(maillog) - 2): if jobid == maillog[i] and msg == maillog[i + 2]: emailpass = 1 break if emailpass: break self.assertTrue(emailpass, "Message '" + jobid + " " + msg + "' not found in " + mailfile) def test_qsub_errors_j_mailpoint(self): """ Try to submit 'qsub -m j' and test possible errors """ J = Job(TEST_USER, attrs={ATTR_m: 'j'}) error_msg = "mail option 'j' can not be used without array job" try: self.server.submit(J) except PbsSubmitError as e: self.assertTrue(error_msg in e.msg[0]) J = Job(TEST_USER, attrs={ATTR_m: 'j', ATTR_J: '1-2'}) error_msg = "illegal -m value" try: self.server.submit(J) except PbsSubmitError as e: self.assertTrue(error_msg in e.msg[0]) def test_email_non_existent_user(self): """ Verify when a job array is submitted with a valid and invalid mail recipients and all file stageout attempts fails then email should get delivered to valid recipient and no email would be sent to invalid recipient. 
""" non_existent_user = PbsAttribute.random_str(length=5) non_existent_mailfile = os.path.join(os.sep, "var", "mail", non_existent_user) pbsuser_mailfile = os.path.join(os.sep, "var", "mail", str(TEST_USER)) # Check mail file should exist for existent user if not os.path.isfile(pbsuser_mailfile): msg = "Skipping this test as Mail file '%s' " % pbsuser_mailfile msg += "does not exist or mail is not setup." self.skip_test(msg) # Check non existent user mail file should not exist self.assertFalse(os.path.isfile(non_existent_mailfile)) src_file = PbsAttribute.random_str(length=5) stageout_path = os.path.join(os.sep, '1', src_file) dest_file = stageout_path + '1' if not os.path.isdir(stageout_path) and os.path.exists(src_file): os.remove(src_file) # Submit job with invalid stageout path usermail_list = str(TEST_USER) + "," + non_existent_user set_attrib = {ATTR_stageout: stageout_path + '@' + self.mom.shortname + ':' + dest_file, ATTR_M: usermail_list, ATTR_J: '1-2', ATTR_S: '/bin/bash'} j = Job() j.set_attributes(set_attrib) j.set_sleep_time(1) jid = self.server.submit(j) subjid = j.create_subjob_id(jid, 1) self.server.expect(JOB, 'queue', op=UNSET, id=jid) # Check stageout file should not be present self.assertFalse(os.path.exists(dest_file)) exp_msg = "PBS Job Id: " + subjid err_msg = "%s msg not found in pbsuser's mail log" % exp_msg email_pass = 0 for i in range(5): time.sleep(5) # Check if mail is deliverd to valid user mail file ret = self.du.tail(filename=pbsuser_mailfile, runas=TEST_USER, option="-n 50") maillog = [x.strip() for x in ret['out']] if exp_msg in maillog: email_pass = 1 break self.assertTrue(email_pass, err_msg) # Verify there should not be any email for invalid user self.assertFalse(os.path.isfile(non_existent_mailfile)) ================================================ FILE: test/tests/functional/pbs_basil_parser_err.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. 
# For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. from tests.functional import * @tags('cray', 'mom') class TestBasilParserErrors(TestFunctional): """ Test the BASIL parser error messages """ def setUp(self): TestFunctional.setUp(self) momA = self.moms.values()[0] if not momA.is_cray(): self.skipTest("%s: not a cray mom." 
% (momA.shortname)) def test_basil_errors(self): """ Check for the non existence of BASIL errors in mom logs """ self.mom.log_match("PERMANENT BASIL error from SYNTAX", max_attempts=10, interval=1, existence=False) self.mom.log_match("Error in BASIL response", max_attempts=10, interval=1, existence=False) ================================================ FILE: test/tests/functional/pbs_basil_support.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. 
#
# Use of Altair's trademarks, including but not limited to "PBS™",
# "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is
# subject to Altair's trademark licensing policies.

from tests.functional import *
from string import Template
import os
import defusedxml.ElementTree as ET


@tags('cray', 'mom')
class TestBasilQuery(TestFunctional):
    """
    This test suite is for testing the support for BASIL 1.7/1.4
    basil query.Test if query is made with correct BASIL version,
    and that vnodes are getting created as per the query response.
    """
    # BASIL protocol versions to probe, newest first.
    basil_version = ['1.7', '1.4', '1.3']
    # Highest BASIL version the platform answered SUCCESS for (set in setUp).
    available_version = ""

    @staticmethod
    def init_inventory_node():
        """
        Return a fresh vnode-attribute dict with every expected key
        initialized to the empty string (comp_node() skips empty values).
        """
        node = {}
        node['vnode'] = ""
        node['arch'] = ""
        node['current_aoe'] = ""
        node['host'] = ""
        node['hbmem'] = ""
        node['mem'] = ""
        node['ncpus'] = ""
        node['PBScrayhost'] = ""
        node['PBScraynid'] = ""
        node['vntype'] = ""
        node['accelerator_memory'] = ""
        node['accelerator_model'] = ""
        node['naccelerators'] = ""
        return node

    def reset_nodes(self, hostA):
        """
        Delete all server nodes, restart PBS and re-create the node
        for hostA so vnodes are rebuilt from a fresh inventory query.
        """
        # Remove all nodes
        self.server.manager(MGR_CMD_DELETE, NODE, None, "")
        # Restart PBS
        self.server.restart()
        # Create node
        self.server.manager(MGR_CMD_CREATE, NODE, None, hostA)
        # Wait for 3 seconds for changes to take effect
        time.sleep(3)

    def setUp(self):
        TestFunctional.setUp(self)
        # Speed up the inventory-check hook for the duration of the test.
        self.server.manager(MGR_CMD_SET, PBS_HOOK,
                            {'enabled': 'true', 'freq': 10},
                            id='PBS_alps_inventory_check')
        momA = self.moms.values()[0]
        if not momA.is_cray():
            self.skipTest("%s: not a cray mom."
                          % (momA.shortname))
        mom_config = momA.parse_config()
        if '$alps_client' not in mom_config:
            self.skipTest("alps_client not set in mom config.")
        # One vnode per compute node keeps XML-to-vnode comparison simple.
        if '$vnode_per_numa_node' in mom_config:
            momA.unset_mom_config('$vnode_per_numa_node', False)
        momA.add_config({'$logevent': '0xffffffff'})
        # check if required BASIL version available on the machine.
        for ver in self.basil_version:
            xml_out = self.query_alps(ver, 'QUERY', 'ENGINE')
            xml_tree = ET.parse(xml_out)
            os.remove(xml_out)
            response = xml_tree.find(".//ResponseData")
            status = response.attrib['status']
            if status == "SUCCESS":
                self.available_version = ver
                break
        if self.available_version == "":
            self.skipTest("No supported basil version found on the platform.")
        # Reset nodes
        self.reset_nodes(momA.shortname)

    def query_alps(self, ver, method, qtype):
        """
        Send a query to ALPS of a certain type and return the
        xml output file.

        :param ver: BASIL protocol version, e.g. '1.7'
        :param method: BASIL method, e.g. 'QUERY'
        :param qtype: query type, e.g. 'ENGINE', 'SYSTEM', 'INVENTORY'
        :returns: path to a temp file holding the raw XML response;
                  caller is responsible for removing it.
        """
        basil_protocol = 'protocol="%s"' % (ver)
        basil_method = 'method="%s"' % (method)
        basil_qtype = 'type="%s"' % (qtype)
        # NOTE(review): the Template body below looks truncated -- the
        # original BASIL XML request markup appears to have been lost
        # (likely stripped during text extraction). Verify against the
        # upstream file before relying on this literal.
        queryt = Template('\n')
        query = queryt.substitute(ver=basil_protocol, method=basil_method,
                                  qtype=basil_qtype)
        mom_config = self.mom.parse_config()
        alps_client = mom_config['$alps_client']
        fn = self.du.create_temp_file(body=query)
        xout = self.du.create_temp_file()
        # Feed the request to the alps_client binary, capture stdout as XML.
        self.du.run_cmd(cmd="%s < %s > %s" % (alps_client, fn, xout),
                        as_script=True)
        os.remove(fn)
        return xout

    def comp_node(self, vnode):
        """
        Check if compute node is found in pbsnodes -av output.
        If so check if the vnode attribute has the correct values.

        :param vnode: dict of expected attribute values (from
                      init_inventory_node); empty-string values are skipped.
        """
        name = vnode['vnode']
        try:
            pbs_node = self.server.status(NODE, id=name)[0]
        except PbsStatusError:
            self.assertFalse(pbs_node is None,
                             "Cray compute node %s doesn't exist on pbs server"
                             % (name))
        for rsc, xval in vnode.items():
            # current_aoe is a plain attribute; everything else is a
            # resources_available.* resource.
            if rsc != 'current_aoe':
                resource = 'resources_available.' + rsc
            else:
                resource = rsc
            if xval != "":
                if resource in pbs_node:
                    rval = pbs_node[resource]
                    if rval == xval:
                        self.logger.info(
                            "%s: node has %s=%s" % (name, rsc, rval))
                        self.assertTrue(True)
                    else:
                        # NOTE(review): assertFalse on a non-empty string
                        # always fails, so this acts as self.fail(msg).
                        self.assertFalse("%s: node has %s=%s but XML %s=%s"
                                         % (name, resource, rval, rsc, xval))
                else:
                    self.assertFalse(
                        "%s\t: node has no resource %s" % (name, rsc))

    def get_knl_vnodes(self):
        """
        Query ALPS (BASIL 1.7 SYSTEM) and return a dict mapping nid
        strings to KNL info ({'current_aoe': ..., 'hbmem': ...}) for
        batch/up nodes that report KNL attributes.
        """
        xml_out = self.query_alps('1.7', 'QUERY', 'SYSTEM')
        tree = ET.parse(xml_out)
        os.remove(xml_out)
        root = tree.getroot()
        knl_vnodes = {}
        knl_info = {}
        # If node has the KNL processor then add them
        # to knl_vnodes dictionary
        for node in root.getiterator('Nodes'):
            # XML values
            role = node.attrib["role"]
            state = node.attrib["state"]
            numa_cfg = node.attrib["numa_cfg"]
            hbm_size_mb = node.attrib["hbm_size_mb"]
            hbm_cache_pct = node.attrib["hbm_cache_pct"]
            if role == 'batch' and state == 'up' and numa_cfg != ""\
                    and hbm_size_mb != "" and hbm_cache_pct != "":
                # derived values from XML
                knl_info['current_aoe'] = numa_cfg + '_' + hbm_cache_pct
                knl_info['hbmem'] = hbm_size_mb + 'mb'
                # Element text is a comma-separated list of nids and
                # nid ranges, e.g. "12,20-24".
                nid_ranges = node.text.strip()
                nid_range_list = list(nid_ranges.split(','))
                while len(nid_range_list) > 0:
                    nid_range = nid_range_list.pop()
                    nid1 = nid_range.split('-')
                    if len(nid1) == 2:
                        # range of nodes
                        r1 = int(nid1[0])
                        r2 = int(nid1[1]) + 1
                        for node_id in range(r1, r2):
                            # associate each nid with it's knl information
                            knl_vnodes['%d' % node_id] = knl_info
                    else:
                        # single node
                        node_id = int(nid1[0])
                        knl_vnodes['%d' % node_id] = knl_info
        return knl_vnodes

    def retklist(self):
        """
        Return a list of KNL vnodes, empty list if there are
        no KNL vnodes.
        """
        klist = []
        # Find the list of KNL vnodes (those with current_aoe set)
        kvnl = self.server.filter(VNODE, {'current_aoe': (NE, "")})
        if len(kvnl) == 0:
            self.skipTest(reason='No KNL vnodes present')
        else:
            klist = list(kvnl.values())[0]
            self.logger.info("KNL vnode list: %s" % (klist))
        return klist

    def set_provisioning(self):
        """
        Set provisioning enabled and aoe resource on Xeon Phi nodes.
        """
        # Check for provisioning setup: provisioning cannot be enabled on
        # a node that also hosts the server.
        momA = self.moms.values()[0].shortname
        serverA = self.servers.values()[0].shortname
        msg = ("Provide a mom not present on server host while invoking"
               " the test: -p moms=")
        if momA == serverA:
            self.skipTest(reason=msg)
        nodelist = self.server.status(NODE, 'current_aoe')
        for node in nodelist:
            a = {'provision_enable': 'true',
                 'resources_available.aoe': '%s' % node['current_aoe']}
            self.server.manager(MGR_CMD_SET, NODE, a, id=node['id'])

    def unset_provisioning(self):
        """
        Unset provisioning attribute and aoe resource on Xeon Phi nodes.
        """
        nodelist = self.server.status(NODE, 'current_aoe')
        for node in nodelist:
            a = ['provision_enable', 'resources_available.aoe']
            self.server.manager(MGR_CMD_UNSET, NODE, a, id=node['id'])

    def request_current_aoe(self):
        """
        Get the value of current_aoe set on the XeonPhi vnodes
        (value from the first vnode reporting current_aoe).
        """
        aoe_val = self.server.status(NODE, 'current_aoe')
        req_aoe = aoe_val[0]['current_aoe']
        return req_aoe

    def test_InventoryQueryVersion(self):
        """
        Test if BASIL version is set to required BASIL version
        on cray/simulator platform.
        """
        # HUP mom to force a fresh engine query.
        self.mom.signal('-HUP')
        # NOTE(review): this literal looks truncated -- the original log
        # pattern (likely an XML fragment with angle brackets) appears to
        # have been lost during text extraction. Verify upstream.
        engine_query_log = "" % (self.basil_version[1])
        self.mom.log_match(engine_query_log, n='ALL', max_attempts=3)
        if self.available_version == '1.7':
            msg = 'This Cray system supports the BASIL 1.7 protocol'
            self.mom.log_match(msg, n='ALL', max_attempts=3)
            # Even on a 1.7 system mom reports the basilversion as 1.4.
            basil_version_log = 'alps_engine_query;The basilversion is' \
                ' set to 1.4'
        else:
            basil_version_log = 'alps_engine_query;The basilversion is' \
                ' set to ' + self.available_version
        self.mom.log_match(basil_version_log, max_attempts=3)

    def test_InventoryVnodes(self):
        """
        This test validates the vnode created using alps BASIL 1.4 & 1.7
        inventory query response.
        """
        knl_vnodes = {}
        # Parse inventory query response and fetch node information.
        xml_out = self.query_alps('1.4', 'QUERY', 'INVENTORY')
        xml_tree = ET.parse(xml_out)
        os.remove(xml_out)
        inventory_1_4_el = xml_tree.find(".//Inventory")
        hn = inventory_1_4_el.attrib["mpp_host"]
        if self.available_version == '1.7':
            knl_vnodes = self.get_knl_vnodes()
        # Fill vnode structure using BASIL response
        for node in inventory_1_4_el.getiterator('Node'):
            role = node.attrib["role"]
            if role == 'BATCH':
                # XML values
                node_id = node.attrib["node_id"]
                cu_el = node.findall('.//ComputeUnit')
                mem_el = node.findall('.//Memory')
                ac_el = node.findall('.//Accelerator')
                page_size_kb = mem_el[0].attrib["page_size_kb"]
                page_count = mem_el[0].attrib["page_count"]
                vnode = self.init_inventory_node()
                vnode['arch'] = node.attrib['architecture']
                vnode['vnode'] = hn + '_' + node_id
                vnode['vntype'] = "cray_compute"
                # Total memory = page size * page count * memory elements.
                vnode['mem'] = str(int(page_size_kb) *
                                   int(page_count) * len(mem_el)) + "kb"
                vnode['host'] = vnode['vnode']
                vnode['PBScraynid'] = node_id
                vnode['PBScrayhost'] = hn
                vnode['ncpus'] = str(len(cu_el))
                if ac_el:
                    vnode['naccelerators'] = str(len(ac_el))
                    vnode['accelerator_memory'] = str(
                        ac_el[0].attrib['memory_mb']) + "mb"
                    vnode['accelerator_model'] = ac_el[0].attrib['family']
                # Overlay KNL-specific attributes when the nid is KNL.
                if node_id in knl_vnodes:
                    vnode['hbmem'] = knl_vnodes[node_id]['hbmem']
                    vnode['current_aoe'] = knl_vnodes[node_id]['current_aoe']
                    vnode['vnode'] = hn + '_' + node_id
                # Compare xml vnode with pbs node.
                self.logger.info("Validating vnode:%s" % (vnode['vnode']))
                self.comp_node(vnode)

    def test_cray_login_node(self):
        """
        This test validates that cray mom node resources value remain
        unchanged before and after adding $alps_client in mom config.
        """
        mom_id = self.mom.shortname
        try:
            # Snapshot node attributes, drop $alps_client, rebuild nodes,
            # then snapshot again for comparison.
            cray_login_node = self.server.status(NODE, id=mom_id)[0]
            self.mom.unset_mom_config('$alps_client', False)
            self.reset_nodes(mom_id)
            pbs_node = self.server.status(NODE, id=mom_id)[0]
        except PbsStatusError:
            self.assertFalse(True, "Mom node %s doesn't exist on pbs server"
                             % (mom_id))
        # List of resources to be ignored while comparing.
        ignr_rsc = ['license', 'last_state_change_time']
        for rsc, val in pbs_node.items():
            if rsc in ignr_rsc:
                continue
            self.assertTrue(rsc in cray_login_node,
                            ("%s\t: login node has no rsc %s")
                            % (mom_id, rsc))
            rval = cray_login_node[rsc]
            self.assertEqual(rval, val,
                             ("%s\t: pbs node has %s=%s but login "
                              "node has %s=%s")
                             % (mom_id, rsc, val, rsc, rval))

    def test_hbmemm_rsc(self):
        """
        Create a job that requests enough HBMEM.
        Submit the job to the Server.
        Check if the job is in the 'R' state and if the job runs on a
        KNL vnode.
        Delete the job.
        """
        knl_vnodes = self.get_knl_vnodes()
        if len(knl_vnodes) == 0:
            self.skipTest(reason='No KNL vnodes present')
        else:
            self.logger.info("KNL vnode list: %s" % (knl_vnodes))
        hbm_req = 4192
        a = {'Resource_List.select': '1:hbmem=%dmb' % hbm_req}
        job = Job(TEST_USER, attrs=a)
        job_id = self.server.submit(job)
        self.server.expect(JOB, {'job_state': 'R'}, id=job_id)
        # Check that exec_vnode is a KNL vnode.
        self.server.status(JOB, 'exec_vnode', id=job_id)
        evnode = list(job.execvnode()[0].keys())[0]
        # vnode names are "<host>_<nid>"; extract the nid part.
        nid = evnode.split('_')[1]
        if nid in knl_vnodes.keys():
            self.logger.info("exec_vnode %s is a KNL vnode." % (evnode))
            rv = 1
        else:
            self.logger.info("exec_vnode %s is not a KNL vnode." % (evnode))
            rv = 0
        self.assertTrue(rv == 1)
        nodes = self.server.status(NODE)
        for n in nodes:
            v_name = n['id']
            if v_name == evnode:
                hbm_assig = n['resources_assigned.hbmem']
                hbm_int = int(re.search(r'\d+', hbm_assig).group())
                # Requested mb vs. assigned value reported in kb.
                hbm_in_kb = hbm_req * 1024
                self.logger.info(
                    "vnode name=%s -- hbm assigned=%s -- hbm requested=%dkb"
                    % (v_name, hbm_assig, hbm_in_kb))
                if hbm_int == hbm_in_kb:
                    self.logger.info(
                        "The requested hbmem of %s mb has been assigned."
                        % (str(hbm_req)))
                    self.assertTrue(True)
                else:
                    self.logger.info(
                        "The assigned hbmem of %s, on %s, does not match "
                        "requested hbmem of %d mb"
                        % (hbm_assig, v_name, hbm_req))
                    self.assertTrue(False)

    def test_job_request_insufficent_hbmemm_rsc(self):
        """
        Submit a job request that requests more than available HBMEM.
        Check if the job is in the 'Q' state with valid comment.
        Delete the job
        """
        # Find the list of KNL vnodes
        knl_vnodes = self.get_knl_vnodes()
        if len(knl_vnodes) == 0:
            self.skipTest(reason='No KNL vnodes present')
        else:
            self.logger.info("KNL vnode list: %s" % (knl_vnodes))
        # 18000mb exceeds the HBMEM any single KNL node can offer.
        hbm_req = 18000
        a = {'Resource_List.select': '1:hbmem=%dmb' % hbm_req}
        job = Job(TEST_USER, attrs=a)
        job_id = self.server.submit(job)
        # Check that job is in Q state with valid comment
        job_comment = "Not Running: Insufficient amount of resource: hbmem"
        self.server.expect(JOB, {'job_state': 'Q',
                                 'comment': (MATCH_RE, job_comment)},
                           attrop=PTL_AND, id=job_id)

    def test_job_request_knl(self):
        """
        Create a job that requests aoe should run on a KNL vnode.
        Submit the job to the Server.
        Check if the job runs on a KNL vnode and if the job is in
        the 'R' state.
        """
        if self.du.platform == 'craysim':
            self.skipTest(reason='Test is not applicable for Craysim')
        # Find the list of KNL vnodes
        klist = self.retklist()
        # Set provisioning attributes on KNL vnode.
        self.set_provisioning()
        # Submit job that request aoe
        req_aoe = self.request_current_aoe()
        job = Job(TEST_USER)
        job.create_script(
            "#PBS -joe -o localhost:/tmp -lselect=1:ncpus=1:aoe=%s\n"
            % req_aoe +
            " cd /tmp\n"
            "aprun -B sleep 10\n"
            "sleep 10")
        job_id = self.server.submit(job)
        self.server.expect(JOB, {'job_state': 'R'}, id=job_id)
        # Check that exec_vnode is a KNL vnode.
        self.server.status(JOB, 'exec_vnode', id=job_id)
        evnode = job.get_vnodes()[0]
        self.assertIn(evnode, klist,
                      "exec_vnode %s is not a KNL vnode." % (evnode))
        self.logger.info("exec_vnode %s is a KNL vnode." % (evnode))
        # Unset provisioning attributes.
        self.unset_provisioning()

    def test_job_request_subchunk(self):
        """
        Test job request consist of subchunks with and without
        aoe resource.
        """
        if self.du.platform == 'craysim':
            self.skipTest(reason='Test is not applicable for craysim')
        # Find the list of KNL vnodes
        klist = self.retklist()
        # Set provisioning attributes.
        self.set_provisioning()
        # Submit job that request sub-chunk with and without aoe resources
        req_aoe = self.request_current_aoe()
        job = Job(TEST_USER)
        job.create_script(
            "#PBS -joe -o localhost:/tmp -lplace=scatter "
            "-lselect=1:ncpus=1:aoe=%s+1:ncpus=1\n" % req_aoe +
            " cd /tmp\n"
            "aprun -B sleep 10\n"
            "sleep 10")
        job_id = self.server.submit(job)
        self.server.expect(JOB, {'job_state': 'R'}, id=job_id)
        # First chunk must land on a KNL vnode, second on a non-KNL one.
        self.server.status(JOB, 'exec_vnode', id=job_id)
        evnode = job.get_vnodes()
        self.assertIn(evnode[0], klist,
                      "exec_vnode %s is not a KNL vnode." % (evnode[0]))
        self.logger.info("exec_vnode %s is a KNL vnode." % (evnode[0]))
        self.assertNotIn(evnode[1], klist, "exec_vnode %s is a KNL"
                         " vnode." % (evnode[1]))
        self.logger.info("exec_vnode %s is not a KNL vnode." % (evnode[1]))
        # Unset provisioning attributes.
        self.unset_provisioning()

    def test_pbs_alps_in_sync(self):
        """
        Check for the presence of message indicating
        PBS and ALPS are in sync.
        """
        # Determine if BASIL 1.7 is supported.
        try:
            rv = self.mom.log_match(
                "This Cray system supports the BASIL 1.7 protocol.",
                n='ALL', max_attempts=10)
        except PtlLogMatchError:
            self.skipTest(
                reason='Test not applicable for system not having BASIL 1.7')
        # Determine if KNL vnodes are present.
        knl_vnodes = self.get_knl_vnodes()
        if len(knl_vnodes) == 0:
            self.skipTest(reason='No KNL vnodes present')
        else:
            self.logger.info("KNL vnode list: %s" % (knl_vnodes))
        # Check for PBS ALPS Inventory Hook message.
        now = time.time()
        rv = self.mom.log_match("ALPS Inventory Check: PBS and ALPS"
                                " are in sync", starttime=now, interval=5)
        self.assertTrue(rv)

    def test_knl_batch_to_interactive(self):
        """
        Change the mode of any two KNL nodes to interactive.
        Then check if the PBS_alps_inventory_check hook picks up on
        the change and nodes are marked as stale.
        Restore changes to hook and mode of KNL nodes.
        """
        if self.du.platform == 'craysim':
            self.skipTest(reason='xtprocadmin cmd is not on cray simulator')
        # Find the list of KNL vnodes
        klist = self.retklist()
        # Change mode of two KNL nodes (first and last) to interactive
        if len(klist) >= 2:
            k1 = klist[0]
            k2 = klist[len(klist) - 1]
            # nid is the numeric part of the vnode name.
            knl1 = re.search(r'\d+', k1).group()
            knl2 = re.search(r'\d+', k2).group()
            cmd = ['xtprocadmin', '-k', 'm', 'interactive', '-n', knl1]
            ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                         logerr=True)
            self.assertEqual(ret['rc'], 0)
            cmd = ['xtprocadmin', '-k', 'm', 'interactive', '-n', knl2]
            ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                         logerr=True)
            self.assertEqual(ret['rc'], 0)
            # Do Mom HUP
            self.mom.signal('-HUP')
            # Check that the nodes are now stale.
            self.server.expect(VNODE, {'state': 'Stale'}, id=k1,
                               max_attempts=10, interval=5)
            self.server.expect(VNODE, {'state': 'Stale'}, id=k2)
            # Change nodes back to batch mode
            cmd = ['xtprocadmin', '-k', 'm', 'batch']
            ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                         logerr=True)
            self.assertEqual(ret['rc'], 0)
            # Do Mom HUP
            self.mom.signal('-HUP')
            # Check that the nodes are now free.
            self.server.expect(VNODE, {'state': 'free'}, id=k1,
                               max_attempts=10, interval=5)
            self.server.expect(VNODE, {'state': 'free'}, id=k2)

    def test_job_run_on_knl_node(self):
        """
        Change the mode of KNL nodes to batch.
        Then check if the PBS_alps_inventory_check hook picks up on
        the change.
        Submit job and confirm job should be in R state
        """
        if self.du.platform == 'craysim':
            self.skipTest(reason='xtprocadmin cmd is not on cray simulator')
        # Find the list of KNL vnodes
        klist = self.retklist()
        # Change mode of all nodes to interactive
        cmd = ['xtprocadmin', '-k', 'm', 'interactive']
        ret = self.server.du.run_cmd(self.server.hostname, cmd, logerr=True)
        self.assertEqual(ret['rc'], 0)
        # Change mode of two KNL nodes back to batch; only those two can
        # then run batch work.
        if len(klist) >= 2:
            k1 = klist[0]
            k2 = klist[len(klist) - 1]
            knl1 = re.search(r'\d+', k1).group()
            knl2 = re.search(r'\d+', k2).group()
            cmd = ['xtprocadmin', '-k', 'm', 'batch', '-n', knl1]
            ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                         logerr=True)
            self.assertEqual(ret['rc'], 0)
            cmd = ['xtprocadmin', '-k', 'm', 'batch', '-n', knl2]
            ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                         logerr=True)
            self.assertEqual(ret['rc'], 0)
            # Do Mom HUP
            self.mom.signal('-HUP')
            # Check that the nodes are Free.
            self.server.expect(VNODE, {'state': 'free'}, id=k1,
                               max_attempts=10, interval=5)
            self.server.expect(VNODE, {'state': 'free'}, id=k2)
            # Submit few jobs
            a = {'Resource_List.select': '1:vntype=cray_compute'}
            job = Job(TEST_USER, attrs=a)
            job_id = self.server.submit(job)
            self.server.expect(JOB, {'job_state': 'R'}, id=job_id)
            # Check that exec_vnode is a KNL vnode.
            self.server.status(JOB, 'exec_vnode', id=job_id)
            evnode = job.get_vnodes()[0]
            self.assertIn(evnode, klist,
                          "exec_vnode %s is not a KNL vnode." % (evnode))
            self.logger.info("exec_vnode %s is a KNL vnode." % (evnode))
            job2 = Job(TEST_USER, attrs=a)
            job_id2 = self.server.submit(job2)
            self.server.expect(JOB, {'job_state': 'R'}, id=job_id2)
            # Check that exec_vnode is a KNL vnode.
            self.server.status(JOB, 'exec_vnode', id=job_id2)
            evnode = job2.get_vnodes()[0]
            self.assertIn(evnode, klist,
                          "exec_vnode %s is not a KNL vnode." % (evnode))
            self.logger.info("exec_vnode %s is a KNL vnode." % (evnode))
            # Third job must queue: both batch-mode KNL nodes are busy.
            job3 = Job(TEST_USER, attrs=a)
            job_id3 = self.server.submit(job3)
            self.server.expect(JOB, {'job_state': 'Q'}, id=job_id3)
            # Delete the Job1.
            self.server.delete(job_id, wait=True)
            # Verify Job3 should start running
            self.server.expect(JOB, {'job_state': 'R'}, id=job_id3)
            # Check that exec_vnode is a KNL vnode.
            self.server.status(JOB, 'exec_vnode', id=job_id3)
            evnode = job3.get_vnodes()[0]
            self.assertIn(evnode, klist,
                          "exec_vnode %s is not a KNL vnode." % (evnode))
            self.logger.info("exec_vnode %s is a KNL vnode." % (evnode))

    def test_validate_pbs_xeon_phi_provision_hook(self):
        """
        Verify the default attribute of pbs_hook PBS_xeon_phi_provision
        hook.
        """
        if self.du.platform != 'cray':
            self.skipTest(reason='pbs_hook PBS_xeon_phi_provision is not'
                          ' available on non-cray machine')
        # Expected factory defaults of the hook.
        attr = {'type': 'pbs', 'enabled': 'false', 'event': 'provision',
                'alarm': 1800, 'order': 1, 'debug': 'false',
                'user': 'pbsadmin', 'fail_action': 'none'}
        self.server.manager(MGR_CMD_LIST, PBS_HOOK, attr,
                            id='PBS_xeon_phi_provision')
        # Modify, verify the modification took effect...
        self.server.manager(MGR_CMD_SET, PBS_HOOK,
                            {'enabled': 'true', 'alarm': 1000},
                            id='PBS_xeon_phi_provision')
        self.server.manager(MGR_CMD_LIST, PBS_HOOK,
                            {'enabled': 'true', 'alarm': 1000},
                            id='PBS_xeon_phi_provision')
        # Reset pbs_hook value to default PBS_xeon_phi_provision hook
        self.server.manager(MGR_CMD_SET, PBS_HOOK,
                            {'enabled': 'false', 'alarm': 1800},
                            id='PBS_xeon_phi_provision')
        self.server.manager(MGR_CMD_LIST, PBS_HOOK, attr,
                            id='PBS_xeon_phi_provision')

    def tearDown(self):
        """
        Undo test-wide changes: restore batch mode on all nodes,
        restore the inventory-check hook frequency, and HUP mom.
        """
        TestFunctional.tearDown(self)
        if self.du.platform == 'cray':
            # Change all nodes back to batch mode and restart PBS
            cmd = ['xtprocadmin', '-k', 'm', 'batch']
            self.logger.info(cmd)
            ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                         logerr=True)
            self.assertEqual(ret['rc'], 0)
            # Restore hook freq to 300
            self.server.manager(MGR_CMD_SET, PBS_HOOK,
                                {'enabled': 'true', 'freq': 300},
                                id='PBS_alps_inventory_check')
            # Do Mom HUP
            self.mom.signal('-HUP')
================================================ FILE: test/tests/functional/pbs_calendaring.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. 
import time

from tests.functional import *
from ptl.utils.pbs_logutils import PBSLogUtils


class TestCalendaring(TestFunctional):
    """
    This test suite tests if PBS scheduler calendars events correctly
    """

    def test_topjob_start_time(self):
        """
        In this test we test that the top job which gets added to the
        calendar has estimated start time correctly set for future when
        job history is enabled and opt_backfill_fuzzy is turned off.
        """
        # Strict ordering forces the scheduler to calendar (backfill
        # around) jobs it cannot run immediately.
        self.scheduler.set_sched_config({'strict_ordering': 'true all'})
        # Single-cpu node so the second array subjob and the normal job
        # must wait behind the running subjob.
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        a = {'backfill_depth': '2', 'job_history_enable': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, a)

        # Turn opt_backfill_fuzzy off because we want to check if the job can
        # run after performing every end event in calendaring code instead
        # of rounding it off to next time boundary (default is 60 seconds)
        a = {'opt_backfill_fuzzy': 'off'}
        self.server.manager(MGR_CMD_SET, SCHED, a)

        # Array job: 6 subjobs, each needing the node's only cpu for 30s.
        res_req = {'Resource_List.select': '1:ncpus=1',
                   'Resource_List.walltime': 30,
                   'array_indices_submitted': '1-6'}
        j1 = Job(TEST_USER, attrs=res_req)
        j1.set_sleep_time(30)
        jid1 = self.server.submit(j1)
        j1_sub1 = j1.create_subjob_id(jid1, 1)
        j1_sub2 = j1.create_subjob_id(jid1, 2)
        # A normal job that must queue behind the array subjobs.
        res_req = {'Resource_List.select': '1:ncpus=1',
                   'Resource_List.walltime': 30}
        j2 = Job(TEST_USER, attrs=res_req)
        jid2 = self.server.submit(j2)
        # Subjob 1 finishes (X with history enabled), subjob 2 runs,
        # and the normal job waits.
        self.server.expect(JOB, {'job_state': 'X'}, j1_sub1, interval=1)
        self.server.expect(JOB, {'job_state': 'R'}, j1_sub2)
        self.server.expect(JOB, {'job_state': 'Q'}, jid2)
        job1 = self.server.status(JOB, id=jid1)
        job2 = self.server.status(JOB, id=jid2)
        time_now = int(time.time())

        # get estimated start time of both the jobs
        self.assertIn('estimated.start_time', job1[0])
        est_val1 = job1[0]['estimated.start_time']
        self.assertIn('estimated.start_time', job2[0])
        est_val2 = job2[0]['estimated.start_time']
        est1 = time.strptime(est_val1, "%a %b %d %H:%M:%S %Y")
        est2 = time.strptime(est_val2, "%a %b %d %H:%M:%S %Y")
        est_epoch1 = int(time.mktime(est1))
        est_epoch2 = int(time.mktime(est2))

        # since only one subjob of array parent can become topjob
        # second job must start 30 seconds after that because
        # walltime of array job is 30 seconds.
        self.assertEqual(est_epoch2, est_epoch1 + 30)
        # Also make sure that since second subjob from array is running
        # Third subjob should set estimated.start_time in future.
        self.assertGreater(est_epoch1, time_now)

    def test_topjob_start_time_of_subjob(self):
        """
        In this test we test that the subjob which gets added to the
        calendar as top job and it has estimated start time correctly set
        when opt_backfill_fuzzy is turned off.
        """
        self.scheduler.set_sched_config({'strict_ordering': 'true all'})
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        a = {'backfill_depth': '2'}
        self.server.manager(MGR_CMD_SET, SERVER, a)

        # Turn opt_backfill_fuzzy off because we want to check if the job can
        # run after performing every end event in calendaring code instead
        # of rounding it off to next time boundary (default is 60 seconds)
        a = {'opt_backfill_fuzzy': 'off'}
        self.server.manager(MGR_CMD_SET, SCHED, a)

        res_req = {'Resource_List.select': '1:ncpus=1',
                   'Resource_List.walltime': 20,
                   'array_indices_submitted': '1-6'}
        j = Job(TEST_USER, attrs=res_req)
        j.set_sleep_time(10)
        jid = self.server.submit(j)
        j1_sub1 = j.create_subjob_id(jid, 1)
        j1_sub2 = j.create_subjob_id(jid, 2)
        self.server.expect(JOB, {'job_state': 'X'}, j1_sub1, interval=1)
        self.server.expect(JOB, {'job_state': 'R'}, j1_sub2)
        job_arr = self.server.status(JOB, id=jid)
        # check estimated start time is set on job array
        self.assertIn('estimated.start_time', job_arr[0])
        # The scheduler must not have logged a start-time calculation
        # failure for the calendared subjob.
        errmsg = jid + ";Error in calculation of start time of top job"
        self.scheduler.log_match(errmsg, existence=False, max_attempts=10)

    def test_topjob_fail(self):
        """
        Test that when we fail to add a job to the calendar it doesn't take
        up a topjob slot.
        The server's backfill_depth is 1 by default, so we just need to
        submit a job that can never run and a job that can.  The can never
        run job will fail to be added to the calendar and the second job
        will be.
        """
        # We need two nodes to create the situation where a job can never run.
        # We need to create this situation in such a way that the scheduler
        # doesn't detect it.  If the scheduler detects that a job can't run,
        # it won't try and add it to the calendar.  To do this, we ask for
        # 1 node with 2 cpus.  There are 2 nodes with 1 cpu each.
        attrs = {'resources_available.ncpus': 1}
        self.mom.create_vnodes(attrib=attrs, num=2, sharednode=False)
        self.scheduler.set_sched_config({'strict_ordering': 'True ALL'})

        # Submit job to eat up all the resources
        attrs = {'Resource_List.select': '2:ncpus=1',
                 'Resource_List.walltime': '1:00:00'}
        j1 = Job(TEST_USER, attrs)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)

        # Hold scheduling so both waiting jobs are considered in the
        # same scheduling cycle.
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})

        # submit job that can never run.
        attrs['Resource_List.select'] = '1:ncpus=2'
        j2 = Job(TEST_USER, attrs)
        jid2 = self.server.submit(j2)

        # submit a job that can run, but just not now
        attrs['Resource_List.select'] = '1:ncpus=1'
        j3 = Job(TEST_USER, attrs)
        jid3 = self.server.submit(j3)

        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'True'})

        # jid2 fails calendaring; jid3 takes the (single) topjob slot.
        msg = jid2 + ';Error in calculation of start time of top job'
        self.scheduler.log_match(msg)
        msg = jid3 + ';Job is a top job and will run at'
        self.scheduler.log_match(msg)

    def test_topjob_bucket(self):
        """
        In this test we test that a bucket job will be calendared to start
        at the end of the last job on a node
        """
        self.scheduler.set_sched_config({'strict_ordering': 'true all'})
        a = {'resources_available.ncpus': 2}
        self.mom.create_vnodes(a, 1)
        # Two running jobs: 30s and 45s walltimes fill both cpus.
        res_req = {'Resource_List.select': '1:ncpus=1',
                   'Resource_List.walltime': 30}
        j1 = Job(TEST_USER, attrs=res_req)
        j1.set_sleep_time(30)
        jid1 = self.server.submit(j1)
        res_req = {'Resource_List.select': '1:ncpus=1',
                   'Resource_List.walltime': 45}
        j2 = Job(TEST_USER, attrs=res_req)
        j2.set_sleep_time(45)
        jid2 = self.server.submit(j2)
        # place=excl makes j3 a bucket job; it needs the whole node, so
        # it must wait for the longer-running job (j2) to finish.
        res_req = {'Resource_List.select': '1:ncpus=1',
                   'Resource_List.place': 'excl'}
        j3 = Job(TEST_USER, attrs=res_req)
        jid3 = self.server.submit(j3)
        self.server.expect(JOB, {'job_state': 'R'}, jid1)
        self.server.expect(JOB, {'job_state': 'R'}, jid2)
        self.server.expect(JOB, {'job_state': 'Q'}, jid3)
        job1 = self.server.status(JOB, id=jid1)
        job2 = self.server.status(JOB, id=jid2)
        job3 = self.server.status(JOB, id=jid3)
        # j3's estimate must match j2's start time + its 45s walltime
        # (within 1s of clock skew).
        end_time = time.mktime(time.strptime(job2[0]['stime'], '%c')) + 45
        est_time = job3[0]['estimated.start_time']
        est_time = time.mktime(time.strptime(est_time, '%c'))
        self.assertAlmostEqual(end_time, est_time, delta=1)

    def test_zero_resource_pushes_topjob(self):
        """
        This test case tests the scenario where a job that requests zero
        instance of a resource as the last resource in the select statement
        pushes the start time of top jobs
        """
        attrs = {'resources_available.ncpus': 4}
        self.mom.create_vnodes(attrib=attrs, num=5, sharednode=False)
        # Host-level consumable resource; requesting ngpus=0 is the
        # trigger condition under test.
        attr = {ATTR_RESC_TYPE: 'long', ATTR_RESC_FLAG: 'hn'}
        self.server.manager(MGR_CMD_CREATE, RSC, attr, id='ngpus')
        # Append ngpus (and a dummy 'zz') to the scheduler's resources
        # line, keeping the existing entries and the closing quote.
        resources = self.scheduler.sched_config['resources']
        resources = resources[:-1] + ', ngpus, zz\"'
        a = {'job_sort_key': '"job_priority HIGH ALL"',
             'resources': resources,
             'strict_ordering': 'True ALL'}
        self.scheduler.set_sched_config(a)
        # Two jobs consume 2 vnodes each, leaving one vnode free.
        a = {'Resource_List.select': '2:ncpus=4',
             'Resource_List.walltime': '1:00:00',
             'Resource_List.place': 'vscatter'}
        j = Job(TEST_USER)
        j.set_attributes(a)
        jid1 = self.server.submit(j)
        j = Job(TEST_USER)
        j.set_attributes(a)
        jid2 = self.server.submit(j)
        # High-priority whole-cluster job becomes the top job.
        a = {'Resource_List.select': '5:ncpus=4',
             'Resource_List.walltime': '1:00:00',
             ATTR_p: "1000",
             'Resource_List.place': 'vscatter'}
        j = Job(TEST_USER)
        j.set_attributes(a)
        jid3 = self.server.submit(j)
        # jid4 (no ngpus request) and jid5 (ngpus=0) are identical in
        # effect; both would outlast the top job's start and must be
        # held back the same way.
        a = {'Resource_List.select': '1:ncpus=4',
             'Resource_List.walltime': '24:00:01',
             'Resource_List.place': 'vscatter'}
        j = Job(TEST_USER)
        j.set_attributes(a)
        jid4 = self.server.submit(j)
        a = {'Resource_List.select': '1:ncpus=4:ngpus=0',
             'Resource_List.walltime': '24:00:01',
             'Resource_List.place': 'vscatter'}
        j = Job(TEST_USER)
        j.set_attributes(a)
        jid5 = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2)
        self.server.expect(JOB, {ATTR_state: 'Q'}, id=jid3)
        c = "Not Running: Job would conflict with reservation or top job"
        self.server.expect(JOB, {ATTR_state: 'Q', ATTR_comment: c}, id=jid4)
        self.server.expect(JOB, {ATTR_state: 'Q', ATTR_comment: c}, id=jid5)

    def test_zero_resource_job_conflict_resv(self):
        """
        This test case tests the scenario where a job that requests zero
        instance of a resource as the last resource in the select statement
        pushes the start time of reservations
        """
        attrs = {'resources_available.ncpus': 4}
        self.mom.create_vnodes(attrib=attrs, num=5, sharednode=False)
        attr = {ATTR_RESC_TYPE: 'long', ATTR_RESC_FLAG: 'hn'}
        self.server.manager(MGR_CMD_CREATE, RSC, attr, id='ngpus')
        # Append ngpus (and a dummy 'zz') to the scheduler's resources
        # line, keeping the existing entries and the closing quote.
        resources = self.scheduler.sched_config['resources']
        resources = resources[:-1] + ', ngpus, zz\"'
        a = {'job_sort_key': '"job_priority HIGH ALL"',
             'resources': resources,
             'strict_ordering': 'True ALL'}
        self.scheduler.set_sched_config(a)
        a = {'Resource_List.select': '2:ncpus=4',
             'Resource_List.walltime': '1:00:00',
             'Resource_List.place': 'vscatter'}
        j = Job(TEST_USER)
        j.set_attributes(a)
        jid1 = self.server.submit(j)
        j = Job(TEST_USER)
        j.set_attributes(a)
        jid2 = self.server.submit(j)
        # Whole-cluster reservation starting just after the running
        # jobs' walltimes end.
        now = int(time.time())
        a = {'Resource_List.select': '5:ncpus=4',
             'reserve_start': now + 3610,
             'reserve_end': now + 6610,
             'Resource_List.place': 'vscatter'}
        r = Reservation(TEST_USER)
        r.set_attributes(a)
        rid = self.server.submit(r)
        exp = {'reserve_state': (MATCH_RE, "RESV_CONFIRMED|2")}
        self.server.expect(RESV, exp, id=rid)
        # jid3 (no ngpus) and jid4 (ngpus=0) both run past the
        # reservation start and must both be held back the same way.
        a = {'Resource_List.select': '1:ncpus=4',
             'Resource_List.walltime': '24:00:01',
             'Resource_List.place': 'vscatter'}
        j = Job(TEST_USER)
        j.set_attributes(a)
        jid3 = self.server.submit(j)
        a = {'Resource_List.select': '1:ncpus=4:ngpus=0',
             'Resource_List.walltime': '24:00:01',
             'Resource_List.place': 'vscatter'}
        j = Job(TEST_USER)
        j.set_attributes(a)
        jid4 = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2)
        c = "Not Running: Job would conflict with reservation or top job"
        self.server.expect(JOB, {ATTR_state: 'Q', ATTR_comment: c}, id=jid3)
        self.server.expect(JOB, {ATTR_state: 'Q', ATTR_comment: c}, id=jid4)

    def test_topjob_stale_estimates_clearing_on_clear_attr_set(self):
        """
        In this test we test that former top job with stale estimate gets
        the estimate cleared once the server attribute
        clear_topjob_estimates_enable is set to True
        """
        self.scheduler.set_sched_config({'strict_ordering': 'true all'})
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        a = {'backfill_depth': '2'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        # Frequent cycles so the topjob status change is picked up fast.
        a = {'scheduler_iteration': '5'}
        self.server.manager(MGR_CMD_SET, SCHED, a)
        res_req = {'Resource_List.select': '1:ncpus=1',
                   'Resource_List.walltime': 300}
        j1 = Job(TEST_USER, attrs=res_req)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, jid1)
        j2 = Job(TEST_USER, attrs=res_req)
        jid2 = self.server.submit(j2)
        # j2 waits behind j1 and gets calendared with estimates.
        job2 = self.server.status(JOB, id=jid2)
        self.assertIn('estimated.start_time', job2[0])
        self.assertIn('estimated.exec_vnode', job2[0])
        self.server.expect(JOB, {'topjob': True}, jid2, max_attempts=5)
        # Disabling backfill removes j2's topjob status; without the
        # clear attribute set, the stale estimates remain.
        a = {'backfill_depth': '0'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        time.sleep(6)
        job2 = self.server.status(JOB, id=jid2)
        self.assertIn('estimated.start_time', job2[0])
        self.assertIn('estimated.exec_vnode', job2[0])
        self.server.expect(JOB, {'topjob': False}, jid2, max_attempts=5)
        # Turning the attribute on now clears the stale estimates.
        a = {'clear_topjob_estimates_enable': True}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.server.expect(JOB, 'estimated.start_time', id=jid2, op=UNSET,
                           interval=1, max_attempts=10)
        self.server.expect(JOB, 'estimated.exec_vnode', id=jid2, op=UNSET,
                           interval=1, max_attempts=10)

    def test_topjob_estimates_clearing_enabled(self):
        """
        In this test we test that the top job which gets added to the
        calendar with valid estimate has estimate cleared once it loses
        top job status. The clearing needs to have the server attribute
        clear_topjob_estimates_enable set to true. Also, the job's topjob
        attribute is set accordingly.
        """
        self.scheduler.set_sched_config({'strict_ordering': 'true all'})
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        # Clearing enabled from the start, unlike the stale-estimate test.
        a = {'backfill_depth': '2', 'clear_topjob_estimates_enable': True}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        a = {'scheduler_iteration': '5'}
        self.server.manager(MGR_CMD_SET, SCHED, a)
        res_req = {'Resource_List.select': '1:ncpus=1',
                   'Resource_List.walltime': 300}
        j1 = Job(TEST_USER, attrs=res_req)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, jid1)
        j2 = Job(TEST_USER, attrs=res_req)
        jid2 = self.server.submit(j2)
        job2 = self.server.status(JOB, id=jid2)
        self.assertIn('estimated.start_time', job2[0])
        self.assertIn('estimated.exec_vnode', job2[0])
        self.server.expect(JOB, {'topjob': True}, jid2, max_attempts=5)
        # Losing topjob status must clear the estimates immediately.
        a = {'backfill_depth': '0'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        time.sleep(6)
        job2 = self.server.status(JOB, id=jid2)
        self.assertNotIn('estimated.start_time', job2[0])
        self.assertNotIn('estimated.exec_vnode', job2[0])
        self.server.expect(JOB, {'topjob': False}, jid2, max_attempts=5)

    def test_topjob_estimates_clearing_disabled(self):
        """
        In this test we test that the top job which gets added to the
        calendar with valid estimate has not estimate cleared if it loses
        top job status. The clearing is prevented by
        clear_topjob_estimates_enable set to false/unset. Also, the job's
        topjob attribute is set accordingly.
        """
        self.scheduler.set_sched_config({'strict_ordering': 'true all'})
        a = {'resources_available.ncpus': 1}
        self.server.manager(MGR_CMD_SET, NODE, a, self.mom.shortname)
        # Clearing explicitly disabled: estimates must survive the loss
        # of topjob status.
        a = {'backfill_depth': '2', 'clear_topjob_estimates_enable': False}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        a = {'scheduler_iteration': '5'}
        self.server.manager(MGR_CMD_SET, SCHED, a)
        res_req = {'Resource_List.select': '1:ncpus=1',
                   'Resource_List.walltime': 300}
        j1 = Job(TEST_USER, attrs=res_req)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, jid1)
        j2 = Job(TEST_USER, attrs=res_req)
        jid2 = self.server.submit(j2)
        job2 = self.server.status(JOB, id=jid2)
        self.assertIn('estimated.start_time', job2[0])
        self.assertIn('estimated.exec_vnode', job2[0])
        self.server.expect(JOB, {'topjob': True}, jid2, max_attempts=5)
        a = {'backfill_depth': '0'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        time.sleep(6)
        job2 = self.server.status(JOB, id=jid2)
        self.assertIn('estimated.start_time', job2[0])
        self.assertIn('estimated.exec_vnode', job2[0])
        self.server.expect(JOB, {'topjob': False}, jid2, max_attempts=5)


================================================
FILE: test/tests/functional/pbs_cgroups_hook.py
================================================
# coding: utf-8
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. import glob from tests.functional import * # # FUNCTION convert_size # def convert_size(value, units='b'): """ Convert a string containing a size specification (e.g. "1m") to a string using different units (e.g. "1024k"). This function only interprets a decimal number at the start of the string, stopping at any unrecognized character and ignoring the rest of the string. When down-converting (e.g. MB to KB), all calculations involve integers and the result returned is exact. When up-converting (e.g. KB to MB) floating point numbers are involved. The result is rounded up. For example: 1023MB -> GB yields 1g 1024MB -> GB yields 1g 1025MB -> GB yields 2g <-- This value was rounded up Pattern matching or conversion may result in exceptions. 
""" logs = {'b': 0, 'k': 10, 'm': 20, 'g': 30, 't': 40, 'p': 50, 'e': 60, 'z': 70, 'y': 80} try: new = units[0].lower() if new not in logs: raise ValueError('Invalid unit value') result = re.match(r'([-+]?\d+)([bkmgtpezy]?)', str(value).lower()) if not result: raise ValueError('Unrecognized value') val, old = result.groups() if int(val) < 0: raise ValueError('Value may not be negative') if old not in logs: old = 'b' factor = logs[old] - logs[new] val = float(val) val *= 2 ** factor if (val - int(val)) > 0.0: val += 1.0 val = int(val) return str(val) + units.lower() except Exception: return None def have_swap(): """ Returns 1 if swap space is not 0 otherwise returns 0 """ tt = 0 with open(os.path.join(os.sep, 'proc', 'meminfo'), 'r') as fd: for line in fd: entry = line.split() if ((entry[0] == 'SwapFree:') and (entry[1] != '0')): tt = 1 return tt def systemd_escape(buf): """ Escape strings for usage in system unit names Some distros don't provide the systemd-escape command """ if not isinstance(buf, str): raise ValueError('Not a basetype string') ret = '' for i, char in enumerate(buf): if i < 1 and char == '.': if (sys.version_info[0] < 3): ret += '\\x' + '.'.encode('hex') else: ret += '\\x' + b'.'.hex() elif char.isalnum() or char in '_.': ret += char elif char == '/': ret += '-' else: # Will turn non-ASCII into UTF-8 hex sequence on both Py2/3 if (sys.version_info[0] < 3): hexval = char.encode('hex') else: hexval = char.encode('utf-8').hex() for j in range(0, len(hexval), 2): ret += '\\x' + hexval[j:j + 2] return ret def count_items(items): """ Given a comma-separated string of numerical items of either singular value or a range of values (-), return the actual number of items. 
For example, items="4-6,9,12-15" count(items) = 8 since items expands to "4,5,6,9,12,13,14,15" """ ct = 0 if items is None: return ct for i in items.split(','): j = i.split('-') if len(j) == 2: ct += len(range(int(j[0]), int(j[1]))) + 1 else: ct += 1 return ct @tags('mom', 'multi_node') class TestCgroupsHook(TestFunctional): """ This test suite targets Linux Cgroups hook functionality. """ def is_memsw_enabled(self, host, mem_path): """ Check if system has swapcontrol enabled, then return true else return false """ if not mem_path: self.logger.info("memory controller not enabled on this host") return 'false' # List all files and check if memsw files exists if self.du.isfile(hostname=host, path=mem_path + os.path.sep + "memory.memsw.usage_in_bytes"): self.logger.info("memsw swap accounting is enabled on this host") return 'true' else: self.logger.info("memsw swap accounting not enabled on this host") return 'false' def setUp(self): self.hook_name = 'pbs_cgroups' # Cleanup previous pbs_cgroup hook so as to not interfere with test c_hook = self.server.filter(HOOK, {'enabled': True}, id=self.hook_name) if c_hook: self.server.manager(MGR_CMD_DELETE, HOOK, id=self.hook_name) a = {'resources_available.ncpus': (EQ, 0), 'state': 'free'} no_cpu_vnodes = self.server.filter(VNODE, a, attrop=PTL_AND) if no_cpu_vnodes: # TestFunctional.setUp() would error out if leftover setup # has no cpus vnodes. Best to cleanup vnodes altogether. self.logger.info("Deleting the existing vnodes") self.mom.delete_vnode_defs() self.mom.restart() for mom in self.moms.values(): if mom.is_cpuset_mom(): mom.revert_to_default = False TestFunctional.setUp(self) # Some of the tests requires 2 or 3 nodes. 
# Setting the default values when no mom is specified self.vntypename = [] self.iscray = False self.noprefix = False self.tempfile = [] self.moms_list = [] self.hosts_list = [] self.nodes_list = [] self.paths = {} for cnt in range(0, len(self.moms)): mom = self.moms.values()[cnt] if mom.is_cray(): self.iscray = True host = mom.shortname # Check if mom has needed cgroup mounted, otherwise skip test self.paths[host] = self.get_paths(host) if not self.paths[host]['cpuset']: self.skipTest('cpuset subsystem not mounted') self.logger.info("%s: cgroup cpuset is mounted" % host) if self.iscray: node = self.get_hostname(host) else: node = host vntype = self.get_vntype(host) if vntype is None: vntype = "no_cgroups" self.logger.info("vntype value is %s" % vntype) self.logger.info("Deleting the existing vnodes on %s" % host) mom.delete_vnode_defs() # Restart MoM time.sleep(2) time_before_restart = int(time.time()) time.sleep(2) mom.restart() # Make sure that MoM has restarted far enough before reconfiguring # as that sends a HUP and may otherwise interfere with the restart # We send either a HELLO or a restart to server -- wait for that mom.log_match("sent to server", starttime=time_before_restart, n='ALL') self.logger.info("increase log level for mom and \ set polling intervals") c = {'$logevent': '0xffffffff', '$clienthost': self.server.name, '$min_check_poll': 8, '$max_check_poll': 12} mom.add_config(c) self.moms_list.append(mom) self.hosts_list.append(host) self.nodes_list.append(node) self.vntypename.append(vntype) # Setting self.mom defaults to primary mom as some of # library methods assume that self.mom = self.moms_list[0] host = self.moms_list[0].shortname # Delete ALL vnodes # Re-creation moved to the end *after* we correctly set up the hook self.server.manager(MGR_CMD_DELETE, NODE, None, "") self.serverA = self.servers.values()[0].name self.mem = 'true' if not self.paths[host]['memory']: self.mem = 'false' self.swapctl = self.is_memsw_enabled(host, 
self.paths[host]['memsw']) self.server.set_op_mode(PTL_CLI) self.server.cleanup_jobs() if not self.iscray: self.remove_vntype() self.eatmem_script = """ import sys import time MB = 2 ** 20 iterations = 1 chunkSizeMb = 1 sleeptime = 0 if (len(sys.argv) > 1): iterations = int(sys.argv[1]) if (len(sys.argv) > 2): chunkSizeMb = int(sys.argv[2]) if (len(sys.argv) > 3): sleeptime = int(sys.argv[3]) if (iterations < 1): print('Iteration count must be greater than zero.') exit(1) if (chunkSizeMb < 1): print('Chunk size must be greater than zero.') exit(1) totalSizeMb = chunkSizeMb * iterations print('Allocating %d chunk(s) of size %dMB. (%dMB total)' % (iterations, chunkSizeMb, totalSizeMb)) buf = '' for i in range(iterations): print('allocating %dMB' % ((i + 1) * chunkSizeMb)) buf += ('#' * MB * chunkSizeMb) if sleeptime > 0: time.sleep(sleeptime) """ self.eatmem_script2 = """ import sys import time MB = 2 ** 20 iterations1 = 1 chunkSizeMb1 = 1 sleeptime1 = 0 if (len(sys.argv) > 1): iterations1 = int(sys.argv[1]) if (len(sys.argv) > 2): chunkSizeMb1 = int(sys.argv[2]) if (len(sys.argv) > 3): sleeptime1 = int(sys.argv[3]) if (iterations1 < 1): print('Iteration count must be greater than zero.') exit(1) if (chunkSizeMb1 < 1): print('Chunk size must be greater than zero.') exit(1) totalSizeMb1 = chunkSizeMb1 * iterations1 print('Allocating %d chunk(s) of size %dMB. 
(%dMB total)' % (iterations1, chunkSizeMb1, totalSizeMb1)) start_time1 = time.time() buf = '' for i in range(iterations1): print('allocating %dMB' % ((i + 1) * chunkSizeMb1)) buf += ('#' * MB * chunkSizeMb1) end_time1 = time.time() if sleeptime1 > 0 and (end_time1 - start_time1) < sleeptime1 : time.sleep(sleeptime1 - end_time1 + start_time1) if len(sys.argv) <= 4: exit(0) iterations2 = 1 chunkSizeMb2 = 1 sleeptime2 = 0 if (len(sys.argv) > 4): iterations2 = int(sys.argv[4]) if (len(sys.argv) > 5): chunkSizeMb2 = int(sys.argv[5]) if (len(sys.argv) > 6): sleeptime2 = int(sys.argv[6]) if (iterations2 < 1): print('Iteration count must be greater than zero.') exit(1) if (chunkSizeMb2 < 1): print('Chunk size must be greater than zero.') exit(1) totalSizeMb2 = chunkSizeMb2 * iterations2 print('Allocating %d chunk(s) of size %dMB. (%dMB total)' % (iterations2, chunkSizeMb2, totalSizeMb2)) start_time2 = time.time() # Do not reinitialize buf!! for i in range(iterations2): print('allocating %dMB' % ((i + 1) * chunkSizeMb2)) buf += ('#' * MB * chunkSizeMb2) end_time2 = time.time() if sleeptime2 > 0 and (end_time2 - start_time2) < sleeptime2 : time.sleep(sleeptime2 - end_time2 + start_time2) """ self.eatmem_job1 = \ '#PBS -joe\n' \ '#PBS -S /bin/bash\n' \ 'sleep 10\n' \ 'python_path=`which python 2>/dev/null`\n' \ 'python3_path=`which python3 2>/dev/null`\n' \ 'python2_path=`which python2 2>/dev/null`\n' \ 'if [ -z "$python_path" ]; then\n' \ ' if [ -n "$python3_path" ]; then\n' \ ' python_path=$python3_path\n' \ ' else\n' \ ' python_path=$python2_path\n' \ ' fi\n' \ 'fi\n' \ 'if [ -z "$python_path" ]; then\n' \ ' echo Exiting -- no python found\n' \ ' exit 1\n' \ 'fi\n' \ '$python_path - 80 10 10 </dev/null`\n' \ 'python3_path=`which python3 2>/dev/null`\n' \ 'python2_path=`which python2 2>/dev/null`\n' \ 'if [ -z "$python_path" ]; then\n' \ ' if [ -n "$python3_path" ]; then\n' \ ' python_path=$python3_path\n' \ ' else\n' \ ' python_path=$python2_path\n' \ ' fi\n' \ 'fi\n' \ 
'if [ -z "$python_path" ]; then\n' \ ' echo Exiting -- no python found\n' \ ' exit 1\n' \ 'fi\n' \ 'let i=0; while [ $i -lt 400000 ]; do let i+=1 ; done\n' \ '$python_path - 200 2 10 </dev/null`\n' \ 'python3_path=`which python3 2>/dev/null`\n' \ 'python2_path=`which python2 2>/dev/null`\n' \ 'if [ -z "$python_path" ]; then\n' \ ' if [ -n "$python3_path" ]; then\n' \ ' python_path=$python3_path\n' \ ' else\n' \ ' python_path=$python2_path\n' \ ' fi\n' \ 'fi\n' \ 'if [ -z "$python_path" ]; then\n' \ ' echo Exiting -- no python found\n' \ ' exit 1\n' \ 'fi\n' \ 'timeout 8 md5sum .slice/-.slice # (and needs to be passed through systemd_escape) # 2) / # # Some older hooks used either depending on the OS platform # which was the reason to support a list in the first place # # If you need to add paths to make the tests support older hooks, # put the least likely paths at the end of the list, to avoid # changing test timings too much. # jobdirs = [os.path.join(basedir, 'pbs_jobs.service/jobid', jobid)] for jdir in jobdirs: if self.du.isdir(hostname=host, path=jdir, sudo=True): return jdir return None def find_main_cpath(self, cdir, host=None): if host is None: host = self.hosts_list[0] rc = self.du.isdir(host, path=cdir) if rc: paths = ['pbs_jobs.service/jobid', 'pbs.service/jobid', 'pbs.slice', 'pbs'] for p in paths: cpath = os.path.join(cdir, p) rc = self.du.isdir(host, path=cpath) if rc: return cpath return None def load_hook(self, filename, mom_checks=True): """ Import and enable a hook pointed to by the URL specified. 
""" try: with open(filename, 'r') as fd: script = fd.read() except IOError: self.assertTrue(False, 'Failed to open hook file %s' % filename) events = ['execjob_begin', 'execjob_launch', 'execjob_attach', 'execjob_epilogue', 'execjob_end', 'exechost_startup', 'exechost_periodic', 'execjob_resize', 'execjob_abort'] # Alarm timeout should be set really large because some tests will # create a lot of simultaneous jobs on a single (slow) MoM # Shipped default is 90 seconds, which is reasonable for real hosts, # but not for containers or VMs sharing a host a = {'enabled': 'True', 'freq': '10', 'alarm': 120, 'event': events} # Sometimes the deletion of the old hook is still pending failed = True for _ in range(5): try: self.server.create_import_hook(self.hook_name, a, script, overwrite=True, level=logging.DEBUG) except Exception: time.sleep(2) else: failed = False break if failed: self.skipTest('pbs_cgroups_hook: failed to load hook') # Add the configuration self.load_default_config(mom_checks=mom_checks) def load_config(self, cfg, mom_checks=True): """ Create a hook configuration file with the provided contents. """ fn = self.du.create_temp_file(hostname=self.serverA, body=cfg) self.tempfile.append(fn) self.logger.info('Current config: %s' % cfg) a = {'content-type': 'application/x-config', 'content-encoding': 'default', 'input-file': fn} # In tests that use this, make sure that other hook CF # copies from setup, node creations, MoM restarts etc. # are all finished, so that we don't match a CF copy # message in the logs from someone else! 
time.sleep(5) just_before_import = int(time.time()) time.sleep(2) self.server.manager(MGR_CMD_IMPORT, HOOK, a, self.hook_name) if mom_checks: self.moms_list[0].log_match('pbs_cgroups.CF;' 'copy hook-related ' 'file request received', starttime=just_before_import, n='ALL') pbs_home = self.server.pbs_conf['PBS_HOME'] svr_conf = os.path.join( os.sep, pbs_home, 'server_priv', 'hooks', 'pbs_cgroups.CF') pbs_home = self.mom.pbs_conf['PBS_HOME'] mom_conf = os.path.join( os.sep, pbs_home, 'mom_priv', 'hooks', 'pbs_cgroups.CF') if mom_checks: # reload config if server and mom cfg differ up to count times count = 5 while (count > 0): r1 = self.du.run_cmd(cmd=['cat', svr_conf], sudo=True, hosts=self.serverA) r2 = self.du.run_cmd(cmd=['cat', mom_conf], sudo=True, hosts=self.mom.shortname) if r1['out'] != r2['out']: self.logger.info('server & mom pbs_cgroups.CF differ') time.sleep(2) just_before_import = int(time.time()) time.sleep(2) self.server.manager(MGR_CMD_IMPORT, HOOK, a, self.hook_name) self.moms_list[0].log_match('pbs_cgroups.CF;' 'copy hook-related ' 'file request received', starttime=just_before_import, n='ALL') else: self.logger.info('server & mom pbs_cgroups.CF match') break time.sleep(1) count -= 1 self.assertGreater(count, 0, "pbs_cgroups.CF failed to load") # A HUP of each mom ensures update to hook config file is # seen by the exechost_startup hook. 
        # Tail of the config-reload helper: HUP every MoM so the
        # exechost_startup hook re-reads the just-imported config file.
        # sleep/timestamp/sleep pattern guards against log-timestamp
        # granularity and small clock differences between hosts.
        time.sleep(2)
        stime = int(time.time())
        time.sleep(2)
        for mom in self.moms_list:
            mom.signal('-HUP')
            mom.log_match('hook_perf_stat;label=hook_exechost_startup_'
                          'pbs_cgroups_.* profile_stop',
                          regexp=True, starttime=stime, existence=True,
                          interval=1, n='ALL')

    def load_default_config(self, mom_checks=True):
        """
        Load the default pbs_cgroups hook config file shipped under
        PBS_EXEC and import it into the server.

        :param mom_checks: when True, also verify via the first MoM's
                           log that the config file was propagated
        """
        self.config_file = os.path.join(self.server.pbs_conf['PBS_EXEC'],
                                        'lib', 'python', 'altair',
                                        'pbs_hooks', 'pbs_cgroups.CF')
        # bracket the import with sleeps so the log_match start time is
        # safely between old and new log entries
        time.sleep(2)
        now = int(time.time())
        time.sleep(2)
        a = {'content-type': 'application/x-config',
             'content-encoding': 'default',
             'input-file': self.config_file}
        self.server.manager(MGR_CMD_IMPORT, HOOK, a, self.hook_name)
        if not mom_checks:
            return
        self.moms_list[0].log_match('pbs_cgroups.CF;copy hook-related '
                                    'file request received',
                                    starttime=now, n='ALL')

    def set_vntype(self, host, typestring='myvntype'):
        """
        Set the vnode type for the local mom by writing the string into
        mom_priv/vntype on the given host.

        :param host: host on which to install the vntype file
        :param typestring: vntype value to write
        """
        # NOTE(review): uses the server's PBS_HOME to build a mom-side
        # path; assumes server and mom share the same PBS_HOME layout
        # (remove_vntype uses mom.pbs_conf instead) -- confirm
        pbs_home = self.server.pbs_conf['PBS_HOME']
        vntype_file = os.path.join(pbs_home, 'mom_priv', 'vntype')
        self.logger.info('Setting vntype to %s in %s on mom %s' %
                         (typestring, vntype_file, host))
        localhost = socket.gethostname()
        fn = self.du.create_temp_file(hostname=localhost, body=typestring)
        self.tempfile.append(fn)
        ret = self.du.run_copy(hosts=host, src=fn, dest=vntype_file,
                               sudo=True, uid='root', gid='root',
                               mode=0o644)
        if ret['rc'] != 0:
            self.skipTest('pbs_cgroups_hook: failed to set vntype')

    def remove_vntype(self):
        """
        Unset the vnode type on the moms.
        """
        cmd1 = 'hostname -s'
        rv1 = self.du.run_cmd(hosts=host, cmd=cmd1)
        host2 = self.get_hostname(host)
        # double quotes: form used inside the cgroup hook config file;
        # single quotes (mlog): form echoed by the hook in its log lines
        hostlist = '"' + host2 + '"'
        moms = [hostlist]
        mlog = ["'" + host2 + "'"]
        # if shortname and hostname is not same then construct a
        # list including both to be passed to cgroups hook
        if (str(rv1['out'][0]) != host2):
            moms.append('"' + str(rv1['out'][0]) + '"')
            mlog.append("'" + str(rv1['out'][0]) + "'")
        if len(moms) > 1:
            mom1 = ','.join(moms)
            log1 = ', '.join(mlog)
        else:
            # single-entry case: same value the list already holds
            mom1 = '"' + host2 + '"'
            log1 = "'" + host2 + "'"
        return mom1, log1

    @requirements(num_moms=2)
    def test_cgroup_vntype_excluded(self):
        """
        Test to verify that cgroups are not enforced on nodes
        that have an exclude vntype file set, while a second mom
        with a different vntype still gets a cgroup.
        """
        name = 'CGROUP8'
        if self.vntypename[0] == 'no_cgroups':
            self.logger.info('Adding vntype %s to mom %s ' %
                             (self.vntypename[0], self.moms_list[0]))
            self.set_vntype(typestring=self.vntypename[0],
                            host=self.hosts_list[0])
        # put vntype[0] in the hook's exclude_vntypes list
        a = self.cfg1 % ('', '"' + self.vntypename[0] + '"', '', '',
                         self.mem, self.swapctl)
        self.load_config(a)
        for m in self.moms.values():
            m.restart()
        a = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' %
             self.hosts_list[0], ATTR_N: name}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        # sleep/timestamp/sleep: guard against log-timestamp granularity
        time.sleep(2)
        stime = int(time.time())
        time.sleep(2)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, ATTR_o, jid)
        o = j.attributes[ATTR_o]
        self.tempfile.append(o)
        self.logger.info('memory subsystem is at location %s' %
                         self.paths[self.hosts_list[0]]['memory'])
        # excluded vnode type => no per-job memory cgroup directory
        cpath = self.get_cgroup_job_dir('memory', jid, self.hosts_list[0])
        self.assertFalse(self.is_dir(cpath, self.hosts_list[0]))
        self.moms_list[0].log_match(
            "%s is in the excluded vnode type list: ['%s']" %
            (self.vntypename[0], self.vntypename[0]),
            starttime=stime, n='ALL')
        self.logger.info('vntypes on both hosts are: %s and %s' %
                         (self.vntypename[0], self.vntypename[1]))
        if self.vntypename[1] == self.vntypename[0]:
            self.logger.info('Skipping the second part of this test '
                             'since hostB also has same vntype value')
            return
        # second mom has a different vntype: cgroup must be created
        a = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' %
             self.hosts_list[1], ATTR_N: name}
        j1 = Job(TEST_USER, attrs=a)
        j1.create_script(self.sleep600_job)
        jid2 = self.server.submit(j1)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid2)
        self.server.status(JOB, ATTR_o, jid2)
        o = j1.attributes[ATTR_o]
        self.tempfile.append(o)
        cpath = self.get_cgroup_job_dir('memory', jid2, self.hosts_list[1])
        self.assertTrue(self.is_dir(cpath, self.hosts_list[1]))

    @requirements(num_moms=2)
    def test_cgroup_host_excluded(self):
        """
        Test to verify that cgroups are not enforced on nodes
        that have the exclude_hosts set, while a non-excluded
        second host still gets a cgroup.
        """
        name = 'CGROUP9'
        mom, log = self.get_host_names(self.hosts_list[0])
        # exclude host[0] in the hook config (first %s slot)
        self.load_config(self.cfg1 % ('%s' % mom, '', '', '',
                                      self.mem, self.swapctl))
        for m in self.moms.values():
            m.restart()
        a = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' %
             self.hosts_list[0], ATTR_N: name}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        # sleep/timestamp/sleep: guard against log-timestamp granularity
        time.sleep(2)
        stime = int(time.time())
        time.sleep(2)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, ATTR_o, jid)
        o = j.attributes[ATTR_o]
        self.tempfile.append(o)
        # excluded host => no per-job memory cgroup directory
        cpath = self.get_cgroup_job_dir('memory', jid, self.hosts_list[0])
        self.assertFalse(self.is_dir(cpath, self.hosts_list[0]))
        host = self.get_hostname(self.hosts_list[0])
        self.moms_list[0].log_match('%s is in the excluded host list: [%s]' %
                                    (host, log),
                                    starttime=stime, n='ALL')
        self.server.delete(jid, wait=True)
        # non-excluded host must still create the cgroup
        a = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' %
             self.hosts_list[1], ATTR_N: name}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        jid2 = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid2)
        self.server.status(JOB, ATTR_o, jid2)
        o = j.attributes[ATTR_o]
        self.tempfile.append(o)
        cpath = self.get_cgroup_job_dir('memory', jid2, self.hosts_list[1])
        self.assertTrue(self.is_dir(cpath, self.hosts_list[1]))

    @requirements(num_moms=2)
    def test_cgroup_exclude_vntype_mem(self):
        """
        Test to verify that the memory subsystem is not enforced on
        nodes whose vntype is in the hook's per-subsystem exclude
        list, while a mom with a different vntype still gets one.
        """
        name = 'CGROUP12'
        if self.vntypename[0] == 'no_cgroups':
            self.logger.info('Adding vntype %s to mom %s' %
                             (self.vntypename[0], self.moms_list[0]))
            # NOTE(review): literal 'no_cgroups' here matches the
            # surrounding condition, but vntypename[0] is used elsewhere
            self.set_vntype(typestring='no_cgroups',
                            host=self.hosts_list[0])
        # exclude the vntype for the memory subsystem specifically
        self.load_config(self.cfg3 % ('', 'false', '', self.mem,
                                      '"' + self.vntypename[0] + '"',
                                      self.swapctl,
                                      '"' + self.vntypename[0] + '"'))
        for m in self.moms.values():
            m.restart()
        a = {'Resource_List.select': '1:ncpus=1:mem=100mb:host=%s' %
             self.hosts_list[0], ATTR_N: name}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        # sleep/timestamp/sleep: guard against log-timestamp granularity
        time.sleep(2)
        stime = int(time.time())
        time.sleep(2)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, ATTR_o, jid)
        o = j.attributes[ATTR_o]
        self.tempfile.append(o)
        self.moms_list[0].log_match('cgroup excluded for subsystem memory '
                                    'on vnode type %s' % self.vntypename[0],
                                    starttime=stime, n='ALL')
        self.logger.info('vntype values for each hosts are: %s and %s' %
                         (self.vntypename[0], self.vntypename[1]))
        if self.vntypename[0] == self.vntypename[1]:
            self.logger.info('Skipping the second part of this test '
                             'since hostB also has same vntype value')
            return
        # second mom's vntype is not excluded: memory cgroup expected
        a = {'Resource_List.select': '1:ncpus=1:mem=100mb:host=%s' %
             self.hosts_list[1], ATTR_N: name}
        j1 = Job(TEST_USER, attrs=a)
        j1.create_script(self.sleep600_job)
        jid2 = self.server.submit(j1)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid2)
        self.server.status(JOB, ATTR_o, jid2)
        o = j1.attributes[ATTR_o]
        self.tempfile.append(o)
        cpath = self.get_cgroup_job_dir('memory', jid2, self.hosts_list[1])
        self.assertTrue(self.is_dir(cpath, self.hosts_list[1]))

    def test_cgroup_periodic_update_check_values(self):
        """
        Test to verify that cgroups are reporting usage for cput and
        mem (and vmem when swap control is enabled) via the
        exechost_periodic event.
        """
        if not self.paths[self.hosts_list[0]]['memory']:
            self.skipTest('Test requires memory subystem mounted')
        name = 'CGROUP13'
        # run the periodic hook every 2 seconds
        conf = {'freq': 2}
        self.server.manager(MGR_CMD_SET, HOOK, conf, self.hook_name)
        self.load_config(self.cfg3 % ('', 'false', '', self.mem, '',
                                      self.swapctl, ''))
        a = {'Resource_List.select': '1:ncpus=1:mem=500mb:host=%s' %
             self.hosts_list[0], ATTR_N: name}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.eatmem_job3)
        time.sleep(2)
        stime = int(time.time())
        time.sleep(2)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, ATTR_o, jid)
        o = j.attributes[ATTR_o]
        self.tempfile.append(o)
        # Scouring the logs for initial values takes too long
        resc_list = ['resources_used.mem']
        if self.swapctl == 'true':
            resc_list.append('resources_used.vmem')
        # early usage should still be small (< 300000kb)
        qstat = self.server.status(JOB, resc_list, id=jid)
        mem = convert_size(qstat[0]['resources_used.mem'], 'kb')
        match = re.match(r'(\d+)kb', mem)
        self.assertFalse(match is None)
        usage = int(match.groups()[0])
        self.assertGreater(300000, usage)
        if self.swapctl == 'true':
            vmem = convert_size(qstat[0]['resources_used.vmem'], 'kb')
            match = re.match(r'(\d+)kb', vmem)
            self.assertFalse(match is None)
            usage = int(match.groups()[0])
            self.assertGreater(300000, usage)
        # the periodic event must not have raised a TypeError
        err_msg = "Unexpected error in pbs_cgroups " + \
                  "handling exechost_periodic event: TypeError"
        self.moms_list[0].log_match(err_msg, max_attempts=3, interval=1,
                                    n='ALL', starttime=stime,
                                    existence=False)
        # Allow some time to pass for values to be updated
        # sleep 2s: make sure no old log lines will match 'begin' time
        time.sleep(2)
        begin = int(time.time())
        # sleep 2s to allow for small time differences and rounding errors
        time.sleep(2)
        self.logger.info('Waiting for periodic hook to update usage data.')
        # loop to check if cput, mem, vmem are expected values
        cput_usage = 0.0
        mem_usage = 0
        vmem_usage = 0
        # Faster systems might expect to see the usage you finally expect
        # recorder after 8-10 seconds; on TH it can take up to a minute
        time.sleep(8)
        for count in range(30):
            time.sleep(2)
            if self.paths[self.hosts_list[0]]['cpuacct'] \
                    and cput_usage <= 1.0:
                # Match last line from the bottom
                line = self.moms_list[0].log_match(
                    '%s;update_job_usage: CPU usage:' % jid,
                    starttime=begin, n='ALL')
                match = re.search(r'CPU usage: ([0-9.]+) secs', line[1])
                cput_usage = float(match.groups()[0])
                self.logger.info("Found cput_usage: %ss" % str(cput_usage))
            if (self.paths[self.hosts_list[0]]['memory']
                    and mem_usage <= 400000):
                # Match last line from the bottom
                line = self.moms_list[0].log_match(
                    '%s;update_job_usage: Memory usage: mem=' % jid,
                    starttime=begin, n='ALL')
                match = re.search(r'mem=(\d+)kb', line[1])
                mem_usage = int(match.groups()[0])
                self.logger.info("Found mem_usage: %skb" % str(mem_usage))
            if self.swapctl == 'true' and vmem_usage <= 400000:
                # Match last line from the bottom
                line = self.moms_list[0].log_match(
                    '%s;update_job_usage: Memory usage: vmem=' % jid,
                    starttime=begin, n='ALL')
                match = re.search(r'vmem=(\d+)kb', line[1])
                vmem_usage = int(match.groups()[0])
                self.logger.info("Found vmem_usage: %skb" % str(vmem_usage))
            if cput_usage > 1.0 and mem_usage > 400000:
                if self.swapctl == 'true':
                    if vmem_usage > 400000:
                        break
                else:
                    break
            # try to make next loop match the _next_ updates
            # note: we might still be unlucky and just match an old update,
            # but not next time: the loop's sleep will make 'begin' advance
            begin = int(time.time())
        self.assertGreater(cput_usage, 1.0)
        self.assertGreater(mem_usage, 400000)
        if self.swapctl == 'true':
            self.assertGreater(vmem_usage, 400000)

    def test_cgroup_cpuset_and_memory(self):
        """
        Test to verify that the job cgroup is created correctly
        Check to see that cpuset.cpus=0, cpuset.mems=0 and that
        memory.limit_in_bytes = 314572800
        """
        if not self.paths[self.hosts_list[0]]['memory']:
            self.skipTest('Test requires memory subystem mounted')
        name = 'CGROUP1'
        self.load_config(self.cfg3 % ('', 'false', '', self.mem, '',
                                      self.swapctl, ''))
        # This test expects the job to land
        # on CPU 0.
        # The previous test may have qdel -Wforce its jobs, and then it takes
        # some time for MoM to run the execjob_epilogue and execjob_end
        # *after* the job has disappeared on the server.
        # So wait a while before restarting MoM
        time.sleep(10)
        # Restart mom for changes made by cgroups hook to take effect
        self.mom.restart()
        a = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' %
             self.hosts_list[0], ATTR_N: name, ATTR_k: 'oe'}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, [ATTR_o, 'exec_host'], jid)
        fna = self.get_cgroup_job_dir('cpuset', jid, self.hosts_list[0])
        self.assertFalse(fna is None, 'No job directory for cpuset subsystem')
        fnma = self.get_cgroup_job_dir('memory', jid, self.hosts_list[0])
        self.assertFalse(fnma is None, 'No job directory for memory subsystem')
        # cpuset_mem_script dumps CpuIDs/MemorySocket/MemoryLimit for
        # the given cpuset and memory cgroup directories
        memscr = self.du.run_cmd(cmd=[self.cpuset_mem_script % (fna, fnma)],
                                 as_script=True, hosts=self.mom.shortname)
        memscr_out = memscr['out']
        self.logger.info('memscr_out:\n%s' % memscr_out)
        self.assertTrue('CpuIDs=0' in memscr_out)
        self.logger.info('CpuIDs check passed')
        self.assertTrue('MemorySocket=0' in memscr_out)
        self.logger.info('MemorySocket check passed')
        if self.mem == 'true':
            # 314572800 bytes == the requested 300mb
            self.assertTrue('MemoryLimit=314572800' in memscr_out)
            self.logger.info('MemoryLimit check passed')

    def test_cgroup_cpuset_and_memsw(self):
        """
        Test to verify that the job cgroup is created correctly
        using the default memory and vmem
        Check to see that cpuset.cpus=0, cpuset.mems=0 and that
        memory.limit_in_bytes = 100663296
        memory.memsw.limit_in_bytes = 201326592
        If there is too little swap, the latter could be smaller
        """
        if not self.paths[self.hosts_list[0]]['memory']:
            self.skipTest('Test requires memory subystem mounted')
        name = 'CGROUP2'
        self.load_config(self.cfg3 % ('', 'false', '', self.mem, '',
                                      self.swapctl, ''))
        # no mem/vmem request: hook defaults apply
        a = {'Resource_List.select': '1:ncpus=1:host=%s' %
             self.hosts_list[0],
             ATTR_N: name}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, [ATTR_o, 'exec_host'], jid)
        fn = self.get_cgroup_job_dir('cpuset', jid, self.hosts_list[0])
        fnm = self.get_cgroup_job_dir('memory', jid, self.hosts_list[0])
        scr = self.du.run_cmd(cmd=[self.cpuset_mem_script % (fn, fnm)],
                              as_script=True, hosts=self.mom.shortname)
        scr_out = scr['out']
        self.logger.info('scr_out:\n%s' % scr_out)
        self.assertTrue('CpuIDs=0' in scr_out)
        self.logger.info('CpuIDs check passed')
        self.assertTrue('MemorySocket=0' in scr_out)
        self.logger.info('MemorySocket check passed')
        if self.mem == 'true':
            self.assertTrue('MemoryLimit=100663296' in scr_out)
            self.logger.info('MemoryLimit check passed')
        if self.swapctl == 'true':
            # Get total phys+swap memory available
            # NOTE(review): reads limits from the 'jobid' directory under
            # pbs_jobs.service -- presumably the hook's parent cgroup for
            # jobs; confirm against the hook's layout
            mem_base = os.path.join(self.paths[self.hosts_list[0]]
                                    ['memory'], 'pbs_jobs.service', 'jobid')
            vmem_avail = os.path.join(mem_base,
                                      'memory.memsw.limit_in_bytes')
            result = self.du.cat(hostname=self.mom.hostname,
                                 filename=vmem_avail, sudo=True)
            vmem_avail_in_bytes = None
            try:
                vmem_avail_in_bytes = int(result['out'][0])
            except Exception:
                # None will be seen as a failure, nothing to do
                pass
            self.logger.info("total available memsw: %d"
                             % vmem_avail_in_bytes)
            self.assertTrue(vmem_avail_in_bytes is not None,
                            "Unable to read total memsw available")
            mem_avail = os.path.join(mem_base, 'memory.limit_in_bytes')
            result = self.du.cat(hostname=self.mom.hostname,
                                 filename=mem_avail, sudo=True)
            mem_avail_in_bytes = None
            try:
                mem_avail_in_bytes = int(result['out'][0])
            except Exception:
                # None will be seen as a failure, nothing to do
                pass
            self.logger.info("total available mem: %d" % mem_avail_in_bytes)
            self.assertTrue(mem_avail_in_bytes is not None,
                            "Unable to read total mem available")
            # expected memsw limit is mem default plus available swap,
            # capped at the vmem default
            swap_avail_in_bytes = vmem_avail_in_bytes - mem_avail_in_bytes
            MemswLimitExpected = (100663296
                                  + min(100663296, swap_avail_in_bytes))
            self.assertTrue(('MemswLimit=%d' % MemswLimitExpected)
                            in scr_out)
            self.logger.info('MemswLimit check passed')

    def test_cgroup_prefix_and_devices(self):
        """
        Test to verify that the cgroup prefix is set to "sbp" and
        that the devices subsystem exists with the correct devices
        allowed
        """
        if not self.paths[self.hosts_list[0]]['devices']:
            self.skipTest('Skipping test since no devices subsystem defined')
        name = 'CGROUP3'
        self.load_config(self.cfg2)
        # Restart mom for changes made by cgroups hook to take effect
        self.mom.restart()
        # Make sure to run on the MoM just restarted
        a = {ATTR_N: name}
        a['Resource_List.select'] = \
            '1:ncpus=1:mem=300mb:host=%s' % self.hosts_list[0]
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(600)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, [ATTR_o, 'exec_host'], jid)
        devd = self.paths[self.hosts_list[0]]['devices']
        scr = self.du.run_cmd(
            cmd=[self.check_dirs_script % (jid, devd)],
            as_script=True, hosts=self.mom.shortname)
        scr_out = scr['out']
        self.logger.info('scr_out:\n%s' % scr_out)
        # the config file named entries must be translated to major/minor
        # containers will make them different!!
# self.du.run_cmd returns a list of one-line strings # the console awk command produces major and minor on separate lines console_results = \ self.du.run_cmd(cmd=['ls -al /dev/console' '| awk \'BEGIN {FS=" |,"} ' '{print $5} {print $7}\''], as_script=True, hosts=self.hosts_list[0]) (console_major, console_minor) = console_results['out'] # only one line here tty0_major_results = \ self.du.run_cmd(cmd=['ls -al /dev/tty0' '| awk \'BEGIN {FS=" |,"} ' '{print $5}\''], as_script=True, hosts=self.hosts_list[0]) tty0_major = tty0_major_results['out'][0] check_devices = ['b *:* rwm', 'c %s:%s rwm' % (console_major, console_minor), 'c %s:* rwm' % (tty0_major), 'c 1:* rwm', 'c 10:* rwm'] for device in check_devices: self.assertTrue(device in scr_out, '"%s" not found in: %s' % (device, scr_out)) self.logger.info('device_list check passed') def test_devices_and_gpu_discovery(self): """ Test to verify that if the device subsystem is enabled and discover_gpus is true, _discover_gpus is called The GPU tests should in theory make this redundant, but they require a test harness that has GPUs. This test will allow to see if the GPU discovery is at least called even when the test harness has no GPUs. """ if not self.paths[self.hosts_list[0]]['devices']: self.skipTest('Skipping test since no devices subsystem defined') name = 'CGROUP3' time.sleep(2) begin = int(time.time()) time.sleep(2) self.load_config(self.cfg14 % ('true', 'true')) # These will throw an exception if the routines that should not # have been called were called. 
# n='ALL' is needed because the cgroup hook is so verbose # that 50 lines will not suffice self.moms_list[0].log_match('_discover_devices', starttime=begin, existence=True, max_attempts=2, interval=1, n='ALL') self.moms_list[0].log_match('NVIDIA SMI', starttime=begin, existence=True, max_attempts=2, interval=1, n='ALL') self.logger.info('devices_and_gpu_discovery check passed') def test_suppress_devices_discovery(self): """ Test to verify that if the device subsystem is turned off, neither _discover_devices nor _discover_gpus is called """ if not self.paths[self.hosts_list[0]]['devices']: self.skipTest('Skipping test since no devices subsystem defined') name = 'CGROUP3' time.sleep(2) begin = int(time.time()) time.sleep(2) self.load_config(self.cfg14 % ('true', 'false')) # These will throw an exception if the routines that should not # have been called were called. # n='ALL' is needed because the cgroup hook is so verbose # that 50 lines will not suffice self.moms_list[0].log_match('_discover_devices', starttime=begin, existence=False, max_attempts=2, interval=1, n='ALL') self.moms_list[0].log_match('_discover_gpus', starttime=begin, existence=False, max_attempts=2, interval=1, n='ALL') self.logger.info('suppress_devices_discovery check passed') def test_suppress_gpu_discovery(self): """ Test to verify that if the device subsystem is enabled and discover_gpus is false, nvidia-smi is not called discover_gpus is called but just returns {} """ if not self.paths[self.hosts_list[0]]['devices']: self.skipTest('Skipping test since no devices subsystem defined') name = 'CGROUP3' time.sleep(2) begin = int(time.time()) time.sleep(2) self.load_config(self.cfg14 % ('false', 'true')) # These will throw an exception if the routines that should not # have been called were called. 
# n='ALL' is needed because the cgroup hook is so verbose # that 50 lines will not suffice self.moms_list[0].log_match('_discover_devices', starttime=begin, existence=True, max_attempts=2, interval=1, n='ALL') self.moms_list[0].log_match('NVIDIA SMI', starttime=begin, existence=False, max_attempts=2, interval=1, n='ALL') self.logger.info('suppress_gpu_discovery check passed') def test_cgroup_cpuset(self): """ Test to verify that 2 jobs are not assigned the same cpus """ pcpus = 0 with open('/proc/cpuinfo', 'r') as desc: for line in desc: if re.match('^processor', line): pcpus += 1 if pcpus < 2: self.skipTest('Test requires at least two physical CPUs') name = 'CGROUP4' # since we do not configure vnodes ourselves wait for the setup # of this test to propagate all hooks etc. # otherwise the load_config tests to see if it's all done # might get confused # occasional trouble seen on TH2 self.load_config(self.cfg3 % ('', 'false', '', self.mem, '', self.swapctl, '')) # Submit two jobs a = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' % self.hosts_list[0], ATTR_N: name + 'a'} j1 = Job(TEST_USER, attrs=a) j1.create_script(self.sleep600_job) jid1 = self.server.submit(j1) b = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' % self.hosts_list[0], ATTR_N: name + 'b'} j2 = Job(TEST_USER, attrs=b) j2.create_script(self.sleep600_job) jid2 = self.server.submit(j2) a = {'job_state': 'R'} # Make sure they are both running self.server.expect(JOB, a, jid1) self.server.expect(JOB, a, jid2) # cpuset paths for both jobs fn1 = self.get_cgroup_job_dir('cpuset', jid1, self.hosts_list[0]) fn2 = self.get_cgroup_job_dir('cpuset', jid2, self.hosts_list[0]) # Capture the output of cpuset_mem_script for both jobs scr1 = self.du.run_cmd(cmd=[self.cpuset_mem_script % (fn1, None)], as_script=True, hosts=self.hosts_list[0]) scr1_out = scr1['out'] self.logger.info('scr1_out:\n%s' % scr1_out) scr2 = self.du.run_cmd(cmd=[self.cpuset_mem_script % (fn2, None)], as_script=True, 
                               hosts=self.hosts_list[0])
        scr2_out = scr2['out']
        self.logger.info('scr2_out:\n%s' % scr2_out)
        # Ensure the CPU ID for each job differs
        cpuid1 = None
        for kv in scr1_out:
            if 'CpuIDs=' in kv:
                cpuid1 = kv
                break
        self.assertNotEqual(cpuid1, None, 'Could not read first CPU ID.')
        cpuid2 = None
        for kv in scr2_out:
            if 'CpuIDs=' in kv:
                cpuid2 = kv
                break
        self.assertNotEqual(cpuid2, None, 'Could not read second CPU ID.')
        self.logger.info("cpuid1 = %s and cpuid2 = %s" % (cpuid1, cpuid2))
        self.assertNotEqual(cpuid1, cpuid2,
                            'Processes should be assigned to different CPUs')
        self.logger.info('CpuIDs check passed')

    @timeout(1800)
    def test_cgroup_cpuset_ncpus_are_cores(self):
        """
        Test to verify that correct number of jobs run on a
        hyperthread enabled system when ncpus_are_cores is set
        to true.
        """
        # Check that system has hyperthreading enabled and has
        # at least two threads ("pcpus")
        # WARNING: do not assume that physical CPUs are numbered from 0
        # and that all processors from a physical ID are contiguous
        # count the number of different physical IDs with a set!
        pcpus = 0
        sibs = 0
        cores = 0
        pval = 0
        phys_set = set()
        with open('/proc/cpuinfo', 'r') as desc:
            for line in desc:
                if re.match('^processor', line):
                    pcpus += 1
                sibs_match = re.search(r'siblings : ([0-9]+)', line)
                cores_match = re.search(r'cpu cores : ([0-9]+)', line)
                phys_match = re.search(r'physical id : ([0-9]+)', line)
                if sibs_match:
                    sibs = int(sibs_match.groups()[0])
                if cores_match:
                    cores = int(cores_match.groups()[0])
                if phys_match:
                    pval = int(phys_match.groups()[0])
                    phys_set.add(pval)
        phys = len(phys_set)
        if (sibs == 0 or cores == 0):
            self.skipTest('Insufficient information about the processors.')
        if pcpus < 2:
            self.skipTest('This test requires at least two processors.')
        # siblings == cores means no hyperthreading
        if sibs / cores == 1:
            self.skipTest('This test requires hyperthreading to be enabled.')
        name = 'CGROUP18'
        self.load_config(self.cfg8 % ('', '', self.mem, '',
                                      self.swapctl, ''))
        # Make sure to restart MOM
        # HUP is not enough to get rid of earlier
        # per socket vnodes created when vnode_per_numa_node=True
        self.mom.restart()
        # Submit M jobs N cpus wide, where M is the amount of physical
        # processors and N is number of 'cpu cores' per M. Expect them to run.
        njobs = phys
        if njobs > 100:
            self.skipTest("too many jobs (%d) to submit" % njobs)
        a = {'Resource_List.select': '1:ncpus=%s:mem=300mb:host=%s' %
             (cores, self.hosts_list[0]), ATTR_N: name + 'a'}
        for _ in range(njobs):
            j = Job(TEST_USER, attrs=a)
            # make sure this stays around for an hour
            # (or until deleted in teardown)
            j.set_sleep_time(3600)
            jid = self.server.submit(j)
            a1 = {'job_state': 'R'}
            # give the scheduler, server and MoM some time
            # it's not a luxury on containers with few CPU resources
            time.sleep(2)
            self.server.expect(JOB, a1, jid)
        # Submit another job, expect in Q state -- this one with only 1 CPU
        b = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' %
             self.hosts_list[0], ATTR_N: name + 'b'}
        j2 = Job(TEST_USER, attrs=b)
        jid2 = self.server.submit(j2)
        b1 = {'job_state': 'Q'}
        # Make sure to give the scheduler ample time here:
        # we want to make sure jid2 doesn't run because it can't,
        # not because the scheduler has not yet gotten to it
        time.sleep(30)
        self.server.expect(JOB, b1, jid2)

    def test_cgroup_enforce_memory(self):
        """
        Test to verify that the job is killed when it tries to use
        more memory than it requested
        """
        if not self.paths[self.hosts_list[0]]['memory'] or not self.mem:
            self.skipTest('Test requires memory subystem mounted')
        name = 'CGROUP5'
        self.load_config(self.cfg3b % ('false'))
        a = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' %
             self.hosts_list[0], ATTR_N: name}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.eatmem_job1)
        # sleep/timestamp/sleep: guard against log-timestamp granularity
        time.sleep(2)
        stime = int(time.time())
        time.sleep(2)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, ATTR_o, jid)
        o = j.attributes[ATTR_o]
        self.tempfile.append(o)
        # mem and vmem limit will both be set, and either could be detected
        self.mom.log_match('%s;Cgroup mem(ory|sw) limit exceeded' % jid,
                           regexp=True, n='ALL', starttime=stime)

    def test_cgroup_enforce_memsw(self):
        """
        Test to verify that the job is killed when it tries to use
        more vmem than it
        requested
        """
        if not self.paths[self.hosts_list[0]]['memory']:
            self.skipTest('Test requires memory subystem mounted')
        # run the test if swap space is available
        if not self.mem or not self.swapctl:
            # NOTE(review): adjacent literals concatenate to
            # 'memswswap' with no separating space -- cosmetic only,
            # left unchanged here since it is a runtime string
            self.skipTest('Test requires memory controller with memsw'
                          'swap accounting enabled')
        if have_swap() == 0:
            self.skipTest('no swap space available on the local host')
        # Get the grandparent directory
        fn = self.paths[self.hosts_list[0]]['memory']
        fn = os.path.join(fn, 'memory.memsw.limit_in_bytes')
        if not self.is_file(fn, self.hosts_list[0]):
            self.skipTest('vmem resource not present on node')
        self.load_config(self.cfg3 % ('', 'false', '', self.mem, '',
                                      self.swapctl, ''))
        name = 'CGROUP6'
        # Make sure output file is gone, otherwise wait and read
        # may pick up stale copy of earlier test
        self.du.rm(runas=TEST_USER, path='~/' + name + '.*',
                   as_script=True)
        a = {
            'Resource_List.select':
            '1:ncpus=1:mem=400mb:vmem=420mb:host=%s' % self.hosts_list[0],
            ATTR_N: name}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.eatmem_job1)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, [ATTR_o, 'exec_host'], jid)
        # stdout path is '<host>:<path>'; exec_host is '<host>/<idx>...'
        filename = j.attributes[ATTR_o]
        ehost = j.attributes['exec_host']
        tmp_file = filename.split(':')[1]
        tmp_host = ehost.split('/')[0]
        tmp_out = self.wait_and_read_file(filename=tmp_file, host=tmp_host)
        self.tempfile.append(tmp_file)
        # accept any of the three possible kill indications in the
        # job's joined stdout/stderr
        success = False
        foundstr = ''
        if tmp_out == []:
            success = False
        else:
            joined_out = '\n'.join(tmp_out)
            if 'Cgroup memsw limit exceeded' in joined_out:
                success = True
                foundstr = 'Cgroup memsw limit exceeded'
            elif 'Cgroup mem limit exceeded' in joined_out:
                success = True
                foundstr = 'Cgroup mem limit exceeded'
            elif 'MemoryError' in joined_out:
                success = True
                foundstr = 'MemoryError'
        self.assertTrue(success, 'No Cgroup memory/memsw limit exceeded '
                        'or MemoryError found in joined stdout/stderr')
        self.logger.info('Joined stdout/stderr contained expected string: '
                         + foundstr)

    def test_cgroup_diag_messages(self):
        """
        Test to verify that job that exceeded resources has the
        diag_message set correctly.
        """
        if not self.paths[self.hosts_list[0]]['memory']:
            self.skipTest('Test requires memory subystem mounted')
        # run the test if swap space is available
        if not self.mem or not self.swapctl:
            self.skipTest('Test requires memory controller with memsw'
                          'swap accounting enabled')
        if have_swap() == 0:
            self.skipTest('no swap space available on the local host')
        # Get the grandparent directory
        fn = self.paths[self.hosts_list[0]]['memory']
        fn = os.path.join(fn, 'memory.memsw.limit_in_bytes')
        if not self.is_file(fn, self.hosts_list[0]):
            self.skipTest('vmem resource not present on node')
        # Make sure job history is enabled to see when job is gone
        a = {'job_history_enable': 'True'}
        rc = self.server.manager(MGR_CMD_SET, SERVER, a)
        self.assertEqual(rc, 0)
        self.load_config(self.cfg3 % ('', 'false', '', self.mem, '',
                                      self.swapctl, ''))
        a = {
            'Resource_List.select':
            '1:ncpus=1:mem=400mb:vmem=420mb:host=%s' % self.hosts_list[0]}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.eatmem_job1)
        jid = self.server.submit(j)
        # wait for the job to finish (history state 'F')
        a = {'job_state': 'F'}
        self.server.expect(JOB, a, jid, extend='x', offset=10)
        resc = ['resources_used.diag_messages']
        s = self.server.status(JOB, resc, id=jid, extend='x')
        # diag_messages is JSON keyed by mom shortname; strip the
        # quoting added around the attribute value
        dmsg = s[0]['resources_used.diag_messages'].replace("'", "")
        json_exceeded = json.loads(dmsg)
        msg = json_exceeded[self.mom.shortname]
        self.assertEqual(msg, 'Cgroup mem limit exceeded, '
                         'Cgroup memsw limit exceeded')

    def cgroup_offline_node(self, name, vnpernuma=False):
        """
        Per vnode_per_numa_node config setting, return True if able
        to verify that the node is offlined when it can't clean up
        the cgroup and brought back online once the cgroup is
        cleaned up.
""" # Make sure job history is enabled to see when job is gone a = {'job_history_enable': 'True'} rc = self.server.manager(MGR_CMD_SET, SERVER, a) self.assertEqual(rc, 0) self.server.expect(SERVER, {'job_history_enable': 'True'}) if 'freezer' not in self.paths[self.hosts_list[0]]: self.skipTest('Freezer cgroup is not mounted') # Get the grandparent directory fdir = self.paths[self.hosts_list[0]]['freezer'] if not self.is_dir(fdir, self.hosts_list[0]): self.skipTest('Freezer cgroup is not found') # Configure the hook self.load_config(self.cfg3 % ('', vnpernuma, '', self.mem, '', self.swapctl, '')) a = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' % self.hosts_list[0], 'Resource_List.walltime': 600, ATTR_N: name} j = Job(TEST_USER, attrs=a) j.create_script(self.sleep600_job) jid = self.server.submit(j) a = {'job_state': 'R'} self.server.expect(JOB, a, jid) job_status = self.server.status(JOB, id=jid) filename = j.attributes[ATTR_o] tmp_file = filename.split(':')[1] self.tempfile.append(tmp_file) self.logger.info("Added %s to temp files to clean up" % tmp_file) self.logger.info("Job session ID is apparently %s" % str(j.attributes['session_id'])) # Query the pids in the cgroup jdir = self.get_cgroup_job_dir('cpuset', jid, self.hosts_list[0]) tasks_file = os.path.join(jdir, 'tasks') time.sleep(2) ret = self.du.cat(self.hosts_list[0], tasks_file, sudo=True) tasks = ret['out'] if len(tasks) < 2: self.skipTest('pbs_cgroups_hook: only one task in cgroup') self.logger.info('Tasks: %s' % tasks) self.assertTrue(tasks, 'No tasks in cpuset cgroup for job') # Make dir in freezer subsystem under directory where we # have delegate control from systemd fdir_pbs = os.path.join(fdir, 'pbs_jobs.service', 'PtlPbs') if not self.du.isdir(self.hosts_list[0], fdir_pbs): self.du.mkdir(hostname=self.hosts_list[0], path=fdir_pbs, mode=0o755, sudo=True) # Write PIDs into the tasks file for the freezer cgroup # All except the top job process -- it remains thawed to # let the job exit 
task_file = os.path.join(fdir_pbs, 'tasks') success = True body = '' for pidstr in tasks: if pidstr.strip() == j.attributes['session_id']: self.logger.info('Skipping top job process ' + pidstr) else: cmd = ['echo ' + pidstr + ' >>' + task_file] ret = self.du.run_cmd(hosts=self.hosts_list[0], cmd=cmd, sudo=True, as_script=True) if ret['rc'] != 0: success = False self.logger.info('Failed to put %s into %s on %s' % (pidstr, task_file, self.hosts_list[0])) self.logger.info('rc = %d', ret['rc']) self.logger.info('stdout = %s', ret['out']) self.logger.info('stderr = %s', ret['err']) if not success: self.skipTest('pbs_cgroups_hook: Failed to copy freezer tasks') # Freeze the cgroup freezer_file = os.path.join(fdir_pbs, 'freezer.state') state = 'FROZEN' fn = self.du.create_temp_file(body=state) self.tempfile.append(fn) ret = self.du.run_copy(self.hosts_list[0], src=fn, dest=freezer_file, sudo=True, uid='root', gid='root', mode=0o644) if ret['rc'] != 0: self.skipTest('pbs_cgroups_hook: Failed to copy ' 'freezer state FROZEN') confirmed_frozen = False for count in range(30): ret = self.du.cat(hostname=self.hosts_list[0], filename=freezer_file, sudo=True) if ret['rc'] != 0: self.logger.info("Cannot confirm freezer state" "sleeping 30 seconds instead") time.sleep(30) break if ret['out'][0] == 'FROZEN': self.logger.info("job processes reported as FROZEN") confirmed_frozen = True break else: self.logger.info("freezer state reported as " + ret['out'][0]) time.sleep(1) if not confirmed_frozen: self.logger.info("Freezer did not work; skip test after cleanup") # Catch any exception so we can thaw the cgroup or the jobs # will remain frozen and impact subsequent tests passed = True # Now delete the job try: self.server.delete(id=jid) except Exception as exc: passed = False self.logger.info('Job could not be deleted') if confirmed_frozen: # The cgroup hook should fail to clean up the cgroups # because of the freeze, and offline node # Note that when vnode per numa node is enabled, 
this # will take longer: the execjob_epilogue will first mark # the per-socket vnode offline, but only the exechost_periodic # will mark the natural node offline try: self.server.expect(NODE, {'state': (MATCH_RE, 'offline')}, id=self.nodes_list[0], offset=10, interval=3) except Exception as exc: passed = False self.logger.info('Node never went offline') # Thaw the cgroup state = 'THAWED' fn = self.du.create_temp_file(body=state) self.tempfile.append(fn) ret = self.du.run_copy(self.hosts_list[0], src=fn, dest=freezer_file, sudo=True, uid='root', gid='root', mode=0o644) if ret['rc'] != 0: # Skip the test at the end when this happens, # but still attempt to clean up! confirmed_frozen = False # First confirm the processes were thawed for count in range(30): ret = self.du.cat(hostname=self.hosts_list[0], filename=freezer_file, sudo=True) if ret['rc'] != 0: self.logger.info("Cannot confirm freezer state" "sleeping 30 seconds instead") time.sleep(30) break if ret['out'][0] == 'THAWED': self.logger.info("job processes reported as THAWED") break else: self.logger.info("freezer state reported as " + ret['out'][0]) time.sleep(1) # once the freezer is thawed, all the processes should receive # the cgroup hook's kill signal and disappear; # confirm they're gone before deleting freezer freezer_tasks = os.path.join(fdir_pbs, 'tasks') for count in range(30): ret = self.du.cat(hostname=self.hosts_list[0], filename=freezer_tasks, sudo=True) if ret['rc'] != 0: self.logger.info("Cannot confirm freezer tasks" "sleeping 30 seconds instead") time.sleep(30) break if ret['out'] == [] or ret['out'][0] == '': self.logger.info("Processes in thawed freezer are gone") break else: self.logger.info("tasks still in thawed freezer: " + str(ret['out'])) time.sleep(1) cmd = ["rmdir", fdir_pbs] self.logger.info("Removing %s" % fdir_pbs) self.du.run_cmd(self.hosts_list[0], cmd=cmd, sudo=True) # Due to orphaned jobs node is not coming back to free state # workaround is to recreate the nodes. 
Orphaned jobs will # get cleaned up in tearDown hence not doing it here # try deleting the job once more, to ensure that the node isn't # busy try: self.server.delete(id=jid) except Exception as exc: pass bs = {'job_state': 'F'} self.server.expect(JOB, bs, jid, extend='x', offset=1) if not confirmed_frozen: self.cgroup_recreate_nodes() self.skipTest('Could not confirm freeze/thaw worked') return passed def cgroup_recreate_nodes(self): """ Since the job delete action was purposefully bent out of shape, node state might stay busy for some time retry until it works -- this is for the sanity of the next test """ for count in range(30): try: self.server.manager(MGR_CMD_DELETE, NODE, None, "") self.logger.info('Managed to delete nodes') break except Exception: self.logger.info('Failed to delete nodes (still busy?)') time.sleep(1) for host in self.hosts_list: try: self.server.manager(MGR_CMD_CREATE, NODE, id=host) except Exception: # the delete might have failed and then the create will, # but still confirm the node goes back to free state pass self.server.expect(NODE, {'state': 'free'}, id=host, interval=3) def test_cgroup_offline_node_preserve_comment(self): """ Test to verify that offlined node that is bring back online preserves custom comment. """ a = {'comment': "foo bar"} self.server.manager(MGR_CMD_SET, NODE, a, self.hosts_list[0]) name = 'CGROUP7.1' vn_per_numa = 'false' rv = self.cgroup_offline_node(name, vn_per_numa) self.assertTrue(rv) a = {'comment': "foo bar"} self.server.expect(NODE, a, id=self.hosts_list[0]) self.cgroup_recreate_nodes() def test_cgroup_offline_node(self): """ Test to verify that the node is offlined when it can't clean up the cgroup and brought back online once the cgroup is cleaned up. 
        vnode_per_numa_node = false
        """
        name = 'CGROUP7.1'
        vn_per_numa = 'false'
        rv = self.cgroup_offline_node(name, vn_per_numa)
        self.assertTrue(rv)
        self.cgroup_recreate_nodes()

    def test_cgroup_offline_node_vnpernuma(self):
        """
        Test to verify that the node is offlined when it can't clean up
        the cgroup and brought back online once the cgroup is cleaned up.
        vnode_per_numa_node = true
        """
        # vnode_per_numa_node needs hugepage info to size the vnodes
        with open(os.path.join(os.sep, 'proc', 'meminfo'), 'r') as fd:
            meminfo = fd.read()
        if 'Hugepagesize' not in meminfo:
            self.skipTest('Hugepagesize not in meminfo')
        name = 'CGROUP7.2'
        vn_per_numa = 'true'
        rv = self.cgroup_offline_node(name, vn_per_numa)
        self.assertTrue(rv)
        self.cgroup_recreate_nodes()

    @requirements(num_moms=2)
    def test_cgroup_cpuset_host_excluded(self):
        """
        Test to verify that cgroups subsystems are not enforced on nodes
        that have the exclude_hosts set but are enforced on other systems
        """
        name = 'CGROUP10'
        mom, _ = self.get_host_names(self.hosts_list[0])
        # cfg1 with momA placed in exclude_hosts for the cpuset subsystem
        self.load_config(self.cfg1 % ('', '', '', '%s' % mom,
                                      self.mem, self.swapctl))
        a = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' %
             self.hosts_list[0], ATTR_N: name}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        # sleep on either side so stime is safely between config load
        # and job submission for the log_match below
        time.sleep(2)
        stime = int(time.time())
        time.sleep(2)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, ATTR_o, jid)
        o = j.attributes[ATTR_o]
        self.tempfile.append(o)
        hostn = self.get_hostname(self.hosts_list[0])
        # momA must log the exclusion and create no cpuset job dir
        self.moms_list[0].log_match('cgroup excluded for subsystem cpuset '
                                    'on host %s' % hostn, starttime=stime,
                                    n='ALL')
        cpath = self.get_cgroup_job_dir('cpuset', jid, self.hosts_list[0])
        self.assertFalse(self.is_dir(cpath, self.hosts_list[0]))
        # Now try a job on momB
        a = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' %
             self.hosts_list[1], ATTR_N: name}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        jid2 = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid2)
        # momB is not excluded, so the cpuset job dir must exist there
        cpath = self.get_cgroup_job_dir('cpuset', jid2, self.hosts_list[1])
        self.logger.info('Checking for %s on %s' % (cpath,
                                                    self.moms_list[1]))
        self.assertTrue(self.is_dir(cpath, self.hosts_list[1]))

    @requirements(num_moms=2)
    def test_cgroup_run_on_host(self):
        """
        Test to verify that the cgroup hook only runs on nodes
        in the run_only_on_hosts
        """
        name = 'CGROUP11'
        mom, log = self.get_host_names(self.hosts_list[0])
        # cfg1 with only momA listed in run_only_on_hosts
        self.load_config(self.cfg1 % ('', '', '%s' % mom, '',
                                      self.mem, self.swapctl))
        a = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' %
             self.hosts_list[1], ATTR_N: name}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        # bracket stime with sleeps for a reliable log_match window
        time.sleep(2)
        stime = int(time.time())
        time.sleep(2)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, ATTR_o, jid)
        o = j.attributes[ATTR_o]
        self.tempfile.append(o)
        # NOTE(review): 'hostn' is unused in this test
        hostn = self.get_hostname(self.hosts_list[1])
        # momB is not in run_only_on_hosts: hook disables itself there
        self.moms_list[1].log_match(
            'set enabled to False based on run_only_on_hosts',
            starttime=stime, n='ALL')
        cpath = self.get_cgroup_job_dir('memory', jid, self.hosts_list[1])
        self.assertFalse(self.is_dir(cpath, self.hosts_list[1]))
        # A job on momA (listed host) must get a memory cgroup
        a = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' %
             self.hosts_list[0], ATTR_N: name}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        jid2 = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid2)
        self.server.status(JOB, ATTR_o, jid2)
        o = j.attributes[ATTR_o]
        self.tempfile.append(o)
        cpath = self.get_cgroup_job_dir('memory', jid2, self.hosts_list[0])
        self.assertTrue(self.is_dir(cpath, self.hosts_list[0]))

    def test_cgroup_qstat_resources(self):
        """
        Test to verify that cgroups are reporting usage for
        mem, and vmem in qstat
        """
        name = 'CGROUP14'
        self.load_config(self.cfg3 % ('', 'false', '', self.mem, '',
                                      self.swapctl, ''))
        a = {'Resource_List.select': '1:ncpus=1:mem=500mb', ATTR_N: name}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.eatmem_job2)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, [ATTR_o, 'exec_host'], jid)
        o = j.attributes[ATTR_o]
        self.tempfile.append(o)
        # NOTE(review): 'host' is unused in this test
        host = j.attributes['exec_host']
        self.logger.info('OUTPUT: %s' % o)
        resc_list = ['resources_used.cput']
        resc_list += ['resources_used.mem']
        resc_list += ['resources_used.vmem']
        # First snapshot of usage as reported to the server
        qstat1 = self.server.status(JOB, resc_list, id=jid)
        for q in qstat1:
            self.logger.info('Q1: %s' % q)
        cput1 = qstat1[0]['resources_used.cput']
        mem1 = qstat1[0]['resources_used.mem']
        vmem1 = qstat1[0]['resources_used.vmem']
        self.logger.info('Waiting 35 seconds for CPU time to accumulate')
        time.sleep(35)
        # Second snapshot; usage must have moved if the hook polls
        qstat2 = self.server.status(JOB, resc_list, id=jid)
        for q in qstat2:
            self.logger.info('Q2: %s' % q)
        cput2 = qstat2[0]['resources_used.cput']
        mem2 = qstat2[0]['resources_used.mem']
        vmem2 = qstat2[0]['resources_used.vmem']
        self.assertNotEqual(cput1, cput2)
        self.assertNotEqual(mem1, mem2)
        # Check vmem only if system has swap control
        if self.swapctl == 'true':
            self.assertNotEqual(vmem1, vmem2)

    def test_cgroup_reserve_mem(self):
        """
        Test to verify that the mom reserve memory for OS
        when there is a reserve mem request in the config.
        Install cfg3 and then cfg4 and measure difference between
        the amount of available memory and memsw.
        For example, on a system with 1GB of physical memory and
        1GB of active swap. With cfg3 in place, we should see
        1GB - 50MB = 950MB of available memory and
        2GB - (50MB + 45MB) = 1905MB of available vmem.
        With cfg4 in place, we should see
        1GB - 100MB = 900MB of available memory and
        2GB - (100MB + 90MB) = 1810MB of available vmem.
        When we calculate the differences we get:
        mem: 950MB - 900MB = 50MB = 51200KB
        vmem: 1905MB - 1810MB = 95MB = 97280KB
        """
        if not self.paths[self.hosts_list[0]]['memory']:
            self.skipTest('Test requires memory subystem mounted')
        self.load_config(self.cfg3 % ('', 'false', '', self.mem, '',
                                      self.swapctl, ''))
        self.server.expect(NODE, {'state': 'free'}, id=self.nodes_list[0],
                           interval=3, offset=10)
        # Baseline available vmem/mem under cfg3 (smaller reserve)
        if self.swapctl == 'true':
            vmem = self.server.status(NODE, 'resources_available.vmem',
                                      id=self.nodes_list[0])
            self.logger.info('vmem: %s' % str(vmem))
            vmem1 = PbsTypeSize(vmem[0]['resources_available.vmem'])
            self.logger.info('Vmem-1: %s' % vmem1.value)
        mem = self.server.status(NODE, 'resources_available.mem',
                                 id=self.nodes_list[0])
        mem1 = PbsTypeSize(mem[0]['resources_available.mem'])
        self.logger.info('Mem-1: %s' % mem1.value)
        # Switch to cfg4 (larger reserve) and measure again
        self.load_config(self.cfg4 % (self.mem, self.swapctl))
        self.server.expect(NODE, {'state': 'free'}, id=self.nodes_list[0],
                           interval=3, offset=10)
        if self.swapctl == 'true':
            vmem = self.server.status(NODE, 'resources_available.vmem',
                                      id=self.nodes_list[0])
            vmem2 = PbsTypeSize(vmem[0]['resources_available.vmem'])
            self.logger.info('Vmem-2: %s' % vmem2.value)
            vmem_resv = vmem1 - vmem2
            # normalize the PbsTypeSize difference to bytes
            if (vmem_resv.unit == 'b'):
                vmem_resv_bytes = vmem_resv.value
            elif (vmem_resv.unit == 'kb'):
                vmem_resv_bytes = vmem_resv.value * 1024
            elif (vmem_resv.unit == 'mb'):
                vmem_resv_bytes = vmem_resv.value * 1024 * 1024
            self.logger.info('Vmem resv diff in bytes: %s'
                             % vmem_resv_bytes)
            # rounding differences may make diff slighly smaller than we
            # expect
            # accept 1MB deviation as irrelevant
            # Note: since we don't know if there is swap, memsw reserved
            # increase might not have been heeded. Change this to a higher
            # value (cfr. above) only on test harnesses that have enough
            # swap
            self.assertGreaterEqual(vmem_resv_bytes, (51200 - 1024) * 1024)
        mem = self.server.status(NODE, 'resources_available.mem',
                                 id=self.nodes_list[0])
        mem2 = PbsTypeSize(mem[0]['resources_available.mem'])
        self.logger.info('Mem-2: %s' % mem2.value)
        mem_resv = mem1 - mem2
        # normalize the PbsTypeSize difference to bytes
        if (mem_resv.unit == 'b'):
            mem_resv_bytes = mem_resv.value
        elif (mem_resv.unit == 'kb'):
            mem_resv_bytes = mem_resv.value * 1024
        elif (mem_resv.unit == 'mb'):
            mem_resv_bytes = mem_resv.value * 1024 * 1024
        self.logger.info('Mem resv diff in bytes: %s' % mem_resv_bytes)
        # rounding differences may make diff slighly smaller than we expect
        # accept 1MB deviation as irrelevant
        self.assertGreaterEqual(mem_resv_bytes, (51200 - 1024) * 1024)

    @requirements(num_moms=2)
    def test_cgroup_multi_node(self):
        """
        Test multi-node jobs with cgroups

        A scatter-placed two-chunk job must get a memory cgroup on each
        execution host, and both must be removed when the job ends.
        """
        name = 'CGROUP16'
        self.load_config(self.cfg6 % (self.mem, self.swapctl))
        a = {'Resource_List.select': '2:ncpus=1:mem=100mb',
             'Resource_List.place': 'scatter', ATTR_N: name}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep30_job)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, 'exec_host', jid)
        ehost = j.attributes['exec_host']
        # exec_host looks like hostA/idx+hostB/idx; keep host names only
        tmp_host = ehost.split('+')
        ehost1 = tmp_host[0].split('/')[0]
        ehjd1 = self.get_cgroup_job_dir('memory', jid, ehost1)
        self.assertTrue(self.is_dir(ehjd1, ehost1),
                        'Missing memory subdirectory: %s' % ehjd1)
        ehost2 = tmp_host[1].split('/')[0]
        ehjd2 = self.get_cgroup_job_dir('memory', jid, ehost2)
        self.assertTrue(self.is_dir(ehjd2, ehost2),
                        'Missing memory subdirectory: %s' % ehjd2)
        # Wait for job to finish and make sure that cgroup directories
        # has been cleaned up by the hook
        self.server.expect(JOB, 'queue', op=UNSET, offset=30, interval=1,
                           id=jid)
        self.assertFalse(self.is_dir(ehjd1, ehost1),
                         'Directory still present: %s' % ehjd1)
        self.assertFalse(self.is_dir(ehjd2, ehost2),
                         'Directory still present: %s' % ehjd2)

    def test_cgroup_job_array(self):
        """
        Test that cgroups are created for subjobs like a regular job
        """
        if not self.paths[self.hosts_list[0]]['memory']:
            self.skipTest('Test requires memory subystem mounted')
        name = 'CGROUP17'
        self.load_config(self.cfg1 % ('', '', '', '',
                                      self.mem, self.swapctl))
        # pack:excl so only one subjob runs at a time and subjob 4 queues
        a = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' %
             self.hosts_list[0], ATTR_N: name, ATTR_J: '1-4',
             'Resource_List.place': 'pack:excl'}
        j = Job(TEST_USER, attrs=a)
        j.set_sleep_time(60)
        jid = self.server.submit(j)
        a = {'job_state': 'B'}
        self.server.expect(JOB, a, jid)
        # Get subjob ID
        subj1 = jid.replace('[]', '[1]')
        self.server.expect(JOB, {'job_state': 'R'}, subj1)
        rv = self.server.status(JOB, ['exec_host'], subj1)
        ehost = rv[0].get('exec_host')
        ehost1 = ehost.split('/')[0]
        # Verify that cgroups files created for subjobs
        # but not for parent job array
        cpath = self.get_cgroup_job_dir('memory', subj1, ehost1)
        self.assertTrue(self.is_dir(cpath, ehost1))
        cpath = self.get_cgroup_job_dir('memory', jid, ehost1)
        self.assertFalse(self.is_dir(cpath, ehost1))
        # Verify that subjob4 is queued and no cgroups
        # files are created for queued subjob
        subj4 = jid.replace('[]', '[4]')
        self.server.expect(JOB, {'job_state': 'Q'}, id=subj4)
        cpath = self.get_cgroup_job_dir('memory', subj4, ehost1)
        self.assertFalse(self.is_dir(cpath, self.hosts_list[0]))
        # Delete subjob1 and verify that cgroups files are cleaned up
        self.server.delete(id=subj1)
        self.server.expect(JOB, {'job_state': 'X'}, subj1)
        cpath = self.get_cgroup_job_dir('memory', subj1, ehost1)
        self.assertFalse(self.is_dir(cpath, ehost1))
        # Verify if subjob2 is running
        subj2 = jid.replace('[]', '[2]')
        self.server.expect(JOB, {'job_state': 'R'}, id=subj2)
        # Force delete the subjob and verify cgroups
        # files are cleaned up
        self.server.delete(id=subj2, extend='force')
        self.server.expect(JOB, {'job_state': 'X'}, subj2)
        # Adding extra sleep for file to clean up
        # since qdel -Wforce changed state of subjob
        # without waiting for MoM
        # retry 10 times (for 20 seconds max. in total)
        # if the directory is still there...
        cpath = self.get_cgroup_job_dir('memory', subj2, ehost1)
        for trial in range(0, 10):
            time.sleep(2)
            if not self.is_dir(cpath, ehost1):
                # we're done
                break
        self.assertFalse(self.is_dir(cpath, ehost1))

    @requirements(num_moms=2)
    def test_cgroup_cleanup(self):
        """
        Test that cgroups files are cleaned up after qdel
        """
        self.load_config(self.cfg1 % ('', '', '', '',
                                      self.mem, self.swapctl))
        a = {'Resource_List.select': '2:ncpus=1:mem=100mb',
             'Resource_List.place': 'scatter'}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, ['exec_host'], jid)
        ehost = j.attributes['exec_host']
        # split exec_host into the two execution host names
        tmp_host = ehost.split('+')
        ehost1 = tmp_host[0].split('/')[0]
        ehost2 = tmp_host[1].split('/')[0]
        ehjd1 = self.get_cgroup_job_dir('cpuset', jid, ehost1)
        self.assertTrue(self.is_dir(ehjd1, ehost1))
        ehjd2 = self.get_cgroup_job_dir('cpuset', jid, ehost2)
        self.assertTrue(self.is_dir(ehjd2, ehost2))
        # qdel with wait=True: directories must be gone once job is out
        self.server.delete(id=jid, wait=True)
        self.assertFalse(self.is_dir(ehjd1, ehost1))
        self.assertFalse(self.is_dir(ehjd2, ehost2))

    def test_cgroup_execjob_end_should_delete_cgroup(self):
        """
        Test to verify that if execjob_epilogue hook failed to run or to
        clean up cgroup files for a job, execjob_end hook should clean
        them up
        """
        self.load_config(self.cfg4 % (self.mem, self.swapctl))
        # remove epilogue and periodic from the list of events
        attr = {'enabled': 'True',
                'event': ['execjob_begin', 'execjob_launch',
                          'execjob_attach', 'execjob_end',
                          'exechost_startup']}
        self.server.manager(MGR_CMD_SET, HOOK, attr, self.hook_name)
        self.server.expect(NODE, {'state': 'free'}, id=self.nodes_list[0])
        j = Job(TEST_USER)
        j.set_sleep_time(1)
        jid = self.server.submit(j)
        # wait for job to finish
        self.server.expect(JOB, 'queue', id=jid, op=UNSET, interval=1,
                           offset=1)
        # verify that cgroup files for this job are gone even if
        # epilogue and
periodic events are disabled for subsys, path in self.paths[self.hosts_list[0]].items(): # only check under subsystems that are enabled enabled_subsys = ['cpuacct', 'cpuset', 'memory', 'memsw'] if (any([x in subsys for x in enabled_subsys])): continue if path: # Following code only works with recent hooks # and default cgroup_prefix # change the path if testing with older hooks # see comments in get_cgroup_job_dir() filename = os.path.join(path, 'pbs_jobs.service', 'jobid', str(jid)) self.logger.info('Checking that file %s should not exist' % filename) self.assertFalse(self.du.isfile(self.hosts_list[0], filename)) @skipOnCray def test_cgroup_assign_resources_mem_only_vnode(self): """ Test to verify that job requesting mem larger than any single vnode works properly """ if not self.paths[self.hosts_list[0]]['memory']: self.skipTest('Test requires memory subystem mounted') # vnode_per_numa_node enabled, so we get per-socket vnodes self.load_config(self.cfg3 % ('', 'true', '', self.mem, '', self.swapctl, '')) self.server.expect(NODE, {ATTR_NODE_state: 'free'}, id=self.hosts_list[0]+'[0]') socket1_found = False nodestat = self.server.status(NODE) total_kb = 0 for node in nodestat: if (self.mom.shortname + '[') not in node['id']: self.logger.info('Skipping vnode %s' % node['id']) else: if node['id'] == self.mom.shortname + '[0]': self.logger.info('Found socket 0, vnode %s' % node['id']) if node['id'] == self.mom.shortname + '[1]': socket1_found = True self.logger.info('Found socket 1, vnode %s ' '(multi socket!)' % node['id']) # PbsTypeSize value is in kb node_kb = PbsTypeSize(node['resources_available.mem']).value self.logger.info('Vnode %s memory: %skb' % (node['id'], node_kb)) total_kb += node_kb total_mb = int(total_kb / 1024) self.logger.info("Total memory on first MoM: %smb" % total_mb) if not socket1_found: self.skipTest('Test requires more than one NUMA node ' '(i.e. 
"socket") on first host') memreq_mb = total_mb - 2 a = {'Resource_List.select': '1:ncpus=1:host=%s:mem=%smb' % (self.mom.shortname, str(memreq_mb))} j1 = Job(TEST_USER, attrs=a) j1.create_script('date') jid1 = self.server.submit(j1) # Job should finish and thus dequeued self.server.expect(JOB, 'queue', id=jid1, op=UNSET, interval=1, offset=1) a = {'Resource_List.select': '1:ncpus=1:host=%s:mem=%smb' % (self.mom.shortname, str(memreq_mb + 1024))} j3 = Job(TEST_USER, attrs=a) j3.create_script('date') jid3 = self.server.submit(j3) # Will either start with "Can Never Run" or "Not Running" # Don't match only one a = {'job_state': 'Q', 'comment': (MATCH_RE, '.*: Insufficient amount of resource: mem.*')} self.server.expect(JOB, a, attrop=PTL_AND, id=jid3, offset=10, interval=1) @timeout(1800) def test_cgroup_cpuset_exclude_cpu(self): """ Confirm that exclude_cpus reduces resources_available.ncpus """ # Fetch the unmodified value of resources_available.ncpus self.load_config(self.cfg5 % ('false', '', 'false', 'false', 'false', self.mem, self.swapctl)) self.server.expect(NODE, {'state': 'free'}, id=self.nodes_list[0], interval=1) result = self.server.status(NODE, 'resources_available.ncpus', id=self.nodes_list[0]) orig_ncpus = int(result[0]['resources_available.ncpus']) self.assertGreater(orig_ncpus, 0) self.logger.info('Original value of ncpus: %d' % orig_ncpus) if orig_ncpus < 2: self.skipTest('Node must have at least two CPUs') # Now exclude CPU zero self.load_config(self.cfg5 % ('false', '0', 'false', 'false', 'false', self.mem, self.swapctl)) self.server.expect(NODE, {'state': 'free'}, id=self.nodes_list[0], interval=1) result = self.server.status(NODE, 'resources_available.ncpus', id=self.nodes_list[0]) new_ncpus = int(result[0]['resources_available.ncpus']) self.assertGreater(new_ncpus, 0) self.logger.info('New value with one CPU excluded: %d' % new_ncpus) self.assertEqual((new_ncpus + 1), orig_ncpus) # Repeat the process with vnode_per_numa_node set to true vnode = 
        '%s[0]' % self.nodes_list[0]
        self.load_config(self.cfg5 % ('true', '', 'false', 'false',
                                      'false', self.mem, self.swapctl))
        self.server.expect(NODE, {'state': 'free'}, id=vnode, interval=1)
        result = self.server.status(NODE, 'resources_available.ncpus',
                                    id=vnode)
        orig_ncpus = int(result[0]['resources_available.ncpus'])
        self.assertGreater(orig_ncpus, 0)
        self.logger.info('Original value of vnode ncpus: %d' % orig_ncpus)
        # Exclude CPU zero again
        self.load_config(self.cfg5 % ('true', '0', 'false', 'false',
                                      'false', self.mem, self.swapctl))
        self.server.expect(NODE, {'state': 'free'}, id=vnode, interval=1)
        result = self.server.status(NODE, 'resources_available.ncpus',
                                    id=vnode)
        new_ncpus = int(result[0]['resources_available.ncpus'])
        # vnode [0] must also lose exactly one CPU
        self.assertEqual((new_ncpus + 1), orig_ncpus)

    def test_cgroup_cpuset_mem_fences(self):
        """
        Confirm that mem_fences affects setting of cpuset.mems
        """
        if not self.paths[self.hosts_list[0]]['memory']:
            self.skipTest('Test requires memory subystem mounted')
        # Get the grandparent directory
        cpuset_base = self.paths[self.hosts_list[0]]['cpuset']
        cpuset_mems = os.path.join(cpuset_base, 'cpuset.mems')
        result = self.du.cat(hostname=self.hosts_list[0],
                             filename=cpuset_mems, sudo=True)
        # a root cpuset.mems of just '0' means single NUMA node
        if result['rc'] != 0 or result['out'][0] == '0':
            self.skipTest('Test requires two NUMA nodes')
        # First try with mem_fences set to true (the default)
        self.load_config(self.cfg5 % ('false', '', 'true', 'false',
                                      'false', self.mem, self.swapctl))
        # Do not use node_list -- vnode_per_numa_node is NOW off
        # so use the natural node. Otherwise might 'expect' stale vnode
        self.server.expect(NODE, {'state': 'free'}, id=self.hosts_list[0],
                           interval=3, offset=10)
        a = {'Resource_List.select': '1:ncpus=1:mem=100mb:host=%s' %
             self.hosts_list[0]}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, ATTR_o, jid)
        o = j.attributes[ATTR_o]
        self.tempfile.append(o)
        # Read the job's cpuset.mems under the fenced configuration
        fn = self.get_cgroup_job_dir('cpuset', jid, self.hosts_list[0])
        fn = os.path.join(fn, 'cpuset.mems')
        result = self.du.cat(hostname=self.hosts_list[0], filename=fn,
                             sudo=True)
        self.assertEqual(result['rc'], 0)
        value_mem_fences = result['out'][0]
        self.logger.info("value with mem_fences: %s" % value_mem_fences)
        self.server.delete(jid, wait=True)
        # Now try with mem_fences set to false
        self.load_config(self.cfg5 % ('false', '', 'false', 'false',
                                      'false', self.mem, self.swapctl))
        self.server.expect(NODE, {'state': 'free'}, id=self.nodes_list[0],
                           interval=3, offset=10)
        a = {'Resource_List.select': '1:ncpus=1:mem=100mb:host=%s' %
             self.hosts_list[0]}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, ATTR_o, jid)
        o = j.attributes[ATTR_o]
        self.tempfile.append(o)
        fn = self.get_cgroup_job_dir('cpuset', jid, self.hosts_list[0])
        fn = os.path.join(fn, 'cpuset.mems')
        result = self.du.cat(hostname=self.hosts_list[0], filename=fn,
                             sudo=True)
        self.assertEqual(result['rc'], 0)
        # compare mem value under mem_fences and under no mem_fences
        value_no_mem_fences = result['out'][0]
        self.logger.info("value with no mem_fences:%s"
                         % value_no_mem_fences)
        self.assertNotEqual(value_no_mem_fences, value_mem_fences)

    def test_cgroup_cpuset_mem_hardwall(self):
        """
        Confirm that mem_hardwall affects setting of
        cpuset.mem_hardwall
        """
        if not self.paths[self.hosts_list[0]]['memory']:
            self.skipTest('Test requires memory subystem mounted')
        # First run with mem_hardwall disabled in the config
        self.load_config(self.cfg5 % ('false', '', 'true', 'false',
                                      'false', self.mem, self.swapctl))
        self.server.expect(NODE, {'state': 'free'}, id=self.nodes_list[0],
                           interval=3, offset=10)
        a = {'Resource_List.select': '1:ncpus=1:mem=100mb:host=%s' %
             self.hosts_list[0]}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, ATTR_o, jid)
        o = j.attributes[ATTR_o]
        self.tempfile.append(o)
        memh_path = 'cpuset.mem_hardwall'
        fn = self.get_cgroup_job_dir('cpuset', jid, self.hosts_list[0])
        # cgroup mounted with 'noprefix' drops the 'cpuset.' prefix
        if self.noprefix:
            memh_path = 'mem_hardwall'
        fn = os.path.join(fn, memh_path)
        self.logger.info('fn is %s' % fn)
        if not (self.is_file(fn, self.hosts_list[0])):
            self.skipTest('cgroup mem_hardwall of job does not exist')
        result = self.du.cat(hostname=self.hosts_list[0], filename=fn,
                             sudo=True)
        self.assertEqual(result['rc'], 0)
        # hardwall disabled -> kernel file must read '0'
        self.assertEqual(result['out'][0], '0')
        self.server.delete(jid, wait=True)
        # Now run again with mem_hardwall enabled
        self.load_config(self.cfg5 % ('false', '', 'true', 'true',
                                      'false', self.mem, self.swapctl))
        self.server.expect(NODE, {'state': 'free'}, id=self.nodes_list[0],
                           interval=3, offset=10)
        a = {'Resource_List.select': '1:ncpus=1:mem=100mb:host=%s' %
             self.hosts_list[0]}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, ATTR_o, jid)
        o = j.attributes[ATTR_o]
        self.tempfile.append(o)
        fn = self.get_cgroup_job_dir('cpuset', jid, self.hosts_list[0])
        fn = os.path.join(fn, memh_path)
        if not (self.is_file(fn, self.hosts_list[0])):
            self.skipTest('cgroup mem_hardwall of job does not exist')
        result = self.du.cat(hostname=self.hosts_list[0], filename=fn,
                             sudo=True)
        self.assertEqual(result['rc'], 0)
        # hardwall enabled -> kernel file must read '1'
        self.assertEqual(result['out'][0], '1')

    def test_cgroup_find_gpus(self):
        """
        Confirm that the hook finds the correct number of GPUs.
Note: This assumes all GPUs have the same MIG configuration, either on or off. """ if not self.paths[self.hosts_list[0]]['devices']: self.skipTest('Skipping test since no devices subsystem defined') name = 'CGROUP3' self.load_config(self.cfg2) cmd = ['nvidia-smi', '-L'] try: rv = self.du.run_cmd(hosts=self.moms_list[0].hostname, cmd=cmd) except OSError: rv = {'err': True} if rv['err'] or 'GPU' not in rv['out'][0]: self.skipTest('Skipping test since nvidia-smi not found') last_gpu_was_physical = False gpus = 0 # store uuids of the MIG devices uuid_list = [] for l in rv['out']: if l.startswith('GPU'): last_gpu_was_physical = True gpus += 1 elif l.lstrip().startswith('MIG'): uuid_list.append(l.split()[-1].rstrip(")")) if last_gpu_was_physical: gpus -= 1 last_gpu_was_physical = False gpus += 1 if gpus < 1: self.skipTest('Skipping test since no gpus found on %s' % (self.nodes_list[0])) ngpus_stat = self.server.status(NODE, id=self.nodes_list[0])[0] self.logger.info("pbsnodes for %s reported: %s" % (self.nodes_list[0], ngpus_stat)) self.assertTrue('resources_available.ngpus' in ngpus_stat, "No resources_available.ngpus found on node %s" % (self.nodes_list[0])) ngpus = int(ngpus_stat['resources_available.ngpus']) self.assertEqual(gpus, ngpus, 'ngpus is incorrect') a = {'Resource_List.select': '1:ngpus=1', ATTR_N: name} j = Job(TEST_USER, attrs=a) j.create_script(self.check_gpu_script) jid = self.server.submit(j) self.server.expect(JOB, {'job_state': 'R'}, jid) self.server.status(JOB, [ATTR_o, 'exec_host'], jid) filename = j.attributes[ATTR_o] self.tempfile.append(filename) ehost = j.attributes['exec_host'] tmp_file = filename.split(':')[1] tmp_host = ehost.split('/')[0] tmp_out = self.wait_and_read_file(filename=tmp_file, host=tmp_host) mig_devices_in_use = tmp_out[-1] for mig_device in mig_devices_in_use.split(","): self.assertIn(mig_device, uuid_list, "MIG identifiers do not match") self.logger.info(tmp_out) self.assertIn('There are 1 GPUs', tmp_out, 'No gpus were 
assigned') self.assertIn('c 195:255 rwm', tmp_out, 'Nvidia controller not found') m = re.search(r'195:(?!255)', '\n'.join(tmp_out)) self.assertIsNotNone(m.group(0), 'No gpu assigned in cgroups') def test_cgroup_cpuset_memory_spread_page(self): """ Confirm that mem_spread_page affects setting of cpuset.memory_spread_page """ if not self.paths[self.hosts_list[0]]['memory']: self.skipTest('Test requires memory subystem mounted') self.load_config(self.cfg5 % ('false', '', 'true', 'false', 'false', self.mem, self.swapctl)) nid = self.nodes_list[0] self.server.expect(NODE, {'state': 'free'}, id=nid, interval=3, offset=10) hostn = self.hosts_list[0] a = {'Resource_List.select': '1:ncpus=1:mem=100mb:host=%s' % hostn} j = Job(TEST_USER, attrs=a) j.create_script(self.sleep600_job) jid = self.server.submit(j) a = {'job_state': 'R'} self.server.expect(JOB, a, jid) self.server.status(JOB, ATTR_o, jid) o = j.attributes[ATTR_o] self.tempfile.append(o) spread_path = 'cpuset.memory_spread_page' fn = self.get_cgroup_job_dir('cpuset', jid, hostn) if self.noprefix: spread_path = 'memory_spread_page' fn = os.path.join(fn, spread_path) self.assertTrue(self.is_file(fn, hostn)) result = self.du.cat(hostname=hostn, filename=fn, sudo=True) self.assertEqual(result['rc'], 0) self.assertEqual(result['out'][0], '0') self.server.delete(jid, wait=True) self.load_config(self.cfg5 % ('false', '', 'true', 'false', 'true', self.mem, self.swapctl)) self.server.expect(NODE, {'state': 'free'}, id=nid, interval=3, offset=10) a = {'Resource_List.select': '1:ncpus=1:mem=100mb:host=%s' % hostn} j = Job(TEST_USER, attrs=a) j.create_script(self.sleep600_job) jid = self.server.submit(j) a = {'job_state': 'R'} self.server.expect(JOB, a, jid) self.server.status(JOB, ATTR_o, jid) o = j.attributes[ATTR_o] self.tempfile.append(o) fn = self.get_cgroup_job_dir('cpuset', jid, hostn) fn = os.path.join(fn, spread_path) result = self.du.cat(hostname=hostn, filename=fn, sudo=True) self.assertEqual(result['rc'], 0) 
self.assertEqual(result['out'][0], '1') def test_cgroup_use_hierarchy(self): """ Test that memory.use_hierarchy is enabled by default when PBS cgroups hook is instantiated """ # Remove PBS directories from memory subsystem cpath = None if ('memory' in self.paths[self.hosts_list[0]] and self.paths[self.hosts_list[0]]['memory']): cdir = self.paths[self.hosts_list[0]]['memory'] cpath = self.find_main_cpath(cdir) else: self.skipTest( "memory subsystem is not enabled for cgroups") if cpath is not None: cmd = ["rmdir", cpath] self.du.run_cmd(cmd=cmd, sudo=True, hosts=self.hosts_list[0]) self.logger.info("Removing %s" % cpath) self.load_config(self.cfg6 % (self.mem, self.swapctl)) # check where cpath is once more # since we loaded a new cgroup config file cpath = None if ('memory' in self.paths[self.hosts_list[0]] and self.paths[self.hosts_list[0]]['memory']): cdir = self.paths[self.hosts_list[0]]['memory'] cpath = self.find_main_cpath(cdir) # Verify that memory.use_hierarchy is enabled fpath = os.path.join(cpath, "memory.use_hierarchy") self.logger.info("looking for file %s" % fpath) rc = self.du.isfile(hostname=self.hosts_list[0], path=fpath) if rc: ret = self.du.cat(hostname=self.hosts_list[0], filename=fpath, logerr=False) val = (' '.join(ret['out'])).strip() self.assertEqual( val, "1", "%s is not equal to 1" % val) self.logger.info("memory.use_hierarchy is enabled") else: self.assertFalse(1, "File %s not present" % fpath) def test_cgroup_periodic_update_known_jobs(self): """ Verify that jobs known to mom are updated, not orphans """ conf = {'freq': 5, 'order': 100} self.server.manager(MGR_CMD_SET, HOOK, conf, self.hook_name) self.load_config(self.cfg3 % ('', 'false', '', self.mem, '', self.swapctl, '')) # Submit a short job and let it run to completion a = {'Resource_List.select': '1:ncpus=1:mem=100mb:host=%s' % self.hosts_list[0]} j = Job(TEST_USER, attrs=a) j.create_script(self.sleep5_job) time.sleep(2) stime = int(time.time()) time.sleep(2) jid1 = 
self.server.submit(j) a = {'job_state': 'R'} self.server.expect(JOB, a, jid1) self.server.status(JOB, ATTR_o, jid1) o = j.attributes[ATTR_o] self.tempfile.append(o) err_msg = "Unexpected error in pbs_cgroups " + \ "handling exechost_periodic event: TypeError" self.moms_list[0].log_match(err_msg, max_attempts=3, interval=1, n='ALL', starttime=stime, existence=False) self.server.log_match(jid1 + ';Exit_status=0', n='ALL', starttime=stime) # Create a periodic hook that runs more frequently than the # cgroup hook to prepend jid1 to mom_priv/hooks/hook_data/cgroup_jobs hookname = 'prependjob' hookbody = """ import pbs import os import re import time import traceback event = pbs.event() jid_to_prepend = '%s' pbs_home = '' pbs_mom_home = '' if 'PBS_HOME' in os.environ: pbs_home = os.environ['PBS_HOME'] if 'PBS_MOM_HOME' in os.environ: pbs_mom_home = os.environ['PBS_MOM_HOME'] pbs_conf = pbs.get_pbs_conf() if pbs_conf: if not pbs_home and 'PBS_HOME' in pbs_conf: pbs_home = pbs_conf['PBS_HOME'] if not pbs_mom_home and 'PBS_MOM_HOME' in pbs_conf: pbs_mom_home = pbs_conf['PBS_MOM_HOME'] if not pbs_home or not pbs_mom_home: if 'PBS_CONF_FILE' in os.environ: pbs_conf_file = os.environ['PBS_CONF_FILE'] else: pbs_conf_file = os.path.join(os.sep, 'etc', 'pbs.conf') regex = re.compile(r'\\s*([^\\s]+)\\s*=\\s*([^\\s]+)\\s*') try: with open(pbs_conf_file, 'r') as desc: for line in desc: match = regex.match(line) if match: if not pbs_home and match.group(1) == 'PBS_HOME': pbs_home = match.group(2) if not pbs_mom_home and (match.group(1) == 'PBS_MOM_HOME'): pbs_mom_home = match.group(2) except Exception: pass if not pbs_home: pbs.logmsg(pbs.EVENT_DEBUG, 'Failed to locate PBS_HOME') event.reject() if not pbs_mom_home: pbs_mom_home = pbs_home jobsfile = os.path.join(pbs_mom_home, 'mom_priv', 'hooks', 'hook_data', 'cgroup_jobs') try: with open(jobsfile, 'r+') as desc: jobdict = eval(desc.read()) if jid_to_prepend not in jobdict: jobdict[jid_to_prepend] = time.time() desc.seek(0) 
desc.write(str(jobdict)) desc.truncate() except Exception as exc: pbs.logmsg(pbs.EVENT_DEBUG, 'Failed to modify ' + jobsfile) pbs.logmsg(pbs.EVENT_DEBUG, str(traceback.format_exc().strip().splitlines())) event.reject() event.accept() """ % jid1 events = ['execjob_begin', 'exechost_periodic'] hookconf = {'enabled': 'True', 'freq': 2, 'alarm': 30, 'event': events} self.server.create_import_hook(hookname, hookconf, hookbody, overwrite=True) # Submit a second job and verify that the following message # does NOT appear in the mom log: # _exechost_periodic_handler: Failed to update jid1 a = {'Resource_List.select': '1:ncpus=1:mem=100mb:host=%s' % self.hosts_list[0]} j = Job(TEST_USER, attrs=a) # Here a short job is OK, since we are waiting for it to end j.create_script(self.sleep30_job) time.sleep(2) presubmit = int(time.time()) time.sleep(2) jid2 = self.server.submit(j) a = {'job_state': 'R'} self.server.expect(JOB, a, jid2) self.server.status(JOB, ATTR_o, jid2) o = j.attributes[ATTR_o] self.tempfile.append(o) err_msg = "Unexpected error in pbs_cgroups " + \ "handling exechost_periodic event: TypeError" self.moms_list[0].log_match(err_msg, max_attempts=3, interval=1, n='ALL', starttime=presubmit, existence=False) self.server.log_match(jid2 + ';Exit_status=0', n='ALL', starttime=presubmit) self.server.manager(MGR_CMD_DELETE, HOOK, None, hookname) command = ['rm', '-rf', os.path.join(self.moms_list[0].pbs_conf['PBS_HOME'], 'mom_priv', 'hooks', 'hook_data', 'cgroup_jobs')] self.du.run_cmd(cmd=command, hosts=self.hosts_list[0], sudo=True) logmsg = '_exechost_periodic_handler: Failed to update %s' % jid1 self.moms_list[0].log_match(msg=logmsg, starttime=presubmit, n='ALL', max_attempts=1, existence=False) @requirements(num_moms=3) def test_cgroup_release_nodes(self): """ Verify that exec_vnode values are trimmed when execjob_launch hook prunes job via release_nodes(), tolerate_node_failures=job_start """ self.load_config(self.cfg7 % (self.mem, self.mem)) # instantiate 
queuejob hook hook_event = 'queuejob' hook_name = 'qjob' a = {'event': hook_event, 'enabled': 'true'} self.server.create_import_hook(hook_name, a, self.qjob_hook_body) # instantiate execjob_launch hook hook_event = 'execjob_launch' hook_name = 'launch' a = {'event': hook_event, 'enabled': 'true'} self.keep_select = 'e.job.Resource_List["site"]' self.server.create_import_hook( hook_name, a, self.launch_hook_body % (self.keep_select)) # Submit a job that requires 2 nodes j = Job(TEST_USER) j.create_script(self.job_scr2 % (self.hosts_list[1])) jid = self.server.submit(j) # Check the exec_vnode while in substate 41 self.server.expect(JOB, {ATTR_substate: '41'}, id=jid) self.server.expect(JOB, 'exec_vnode', id=jid, op=SET) job_stat = self.server.status(JOB, id=jid) execvnode1 = job_stat[0]['exec_vnode'] self.logger.info("initial exec_vnode: %s" % execvnode1) initial_vnodes = execvnode1.split('+') # Check the exec_vnode after job is in substate 42 self.server.expect(JOB, {ATTR_substate: '42'}, id=jid) self.server.expect(JOB, 'exec_vnode', id=jid, op=SET) job_stat = self.server.status(JOB, id=jid) execvnode2 = job_stat[0]['exec_vnode'] self.logger.info("pruned exec_vnode: %s" % execvnode2) pruned_vnodes = execvnode2.split('+') # Check that the pruned exec_vnode has one less than initial value self.assertEqual(len(pruned_vnodes) + 1, len(initial_vnodes)) # Find the released vnode for vn in initial_vnodes: if vn not in pruned_vnodes: rel_vn = vn vnodeB = rel_vn.split(':')[0].split('(')[1] self.logger.info("released vnode: %s" % vnodeB) # Submit a second job requesting the released vnode, job runs j2 = Job(TEST_USER, {ATTR_l + '.select': '1:ncpus=1:mem=100mb:vnode=%s' % vnodeB}) jid2 = self.server.submit(j2) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2) @requirements(num_moms=3) def test_cgroup_sismom_resize_fail(self): """ Verify that exec_vnode values are trimmed when execjob_launch hook prunes job via release_nodes(), exec_job_resize failure in sister mom, 
tolerate_node_failures=job_start """ self.load_config(self.cfg7 % (self.mem, self.mem)) # instantiate queuejob hook hook_event = 'queuejob' hook_name = 'qjob' a = {'event': hook_event, 'enabled': 'true'} self.server.create_import_hook(hook_name, a, self.qjob_hook_body) # instantiate execjob_launch hook hook_event = 'execjob_launch' hook_name = 'launch' a = {'event': hook_event, 'enabled': 'true'} self.keep_select = 'e.job.Resource_List["site"]' self.server.create_import_hook( hook_name, a, self.launch_hook_body % (self.keep_select)) # instantiate execjob_resize hook hook_event = 'execjob_resize' hook_name = 'resize' a = {'event': hook_event, 'enabled': 'true'} self.server.create_import_hook( hook_name, a, self.resize_hook_body % ('not')) # Submit a job that requires 2 nodes j = Job(TEST_USER) # Note mother superior is mom[1] not mom[0] j.create_script(self.job_scr2 % (self.hosts_list[1])) time.sleep(2) stime = int(time.time()) time.sleep(2) jid = self.server.submit(j) # Check the exec_vnode while in substate 41 self.server.expect(JOB, {ATTR_substate: '41'}, id=jid) self.server.expect(JOB, 'exec_vnode', id=jid, op=SET) job_stat = self.server.status(JOB, id=jid) execvnode1 = job_stat[0]['exec_vnode'] self.logger.info("initial exec_vnode: %s" % execvnode1) # Check the exec_resize hook reject message in sister mom logs self.moms_list[0].log_match( "Job;%s;Cannot resize the job" % (jid), starttime=stime, interval=2, n='ALL') # Check that MS saw that the sister mom failed to update the job # This message is on MS mom[1] but mentions sismom mom[0] self.moms_list[1].log_match( "Job;%s;sister node %s.* failed to update job" % (jid, self.hosts_list[0]), starttime=stime, interval=2, regexp=True, n='ALL') # Because of resize hook reject Mom failed to update the job. # Check that job got requeued. 
self.server.log_match("Job;%s;Job requeued" % (jid), starttime=stime, n='ALL') @requirements(num_moms=3) def test_cgroup_msmom_resize_fail(self): """ Verify that exec_vnode values are trimmed when execjob_launch hook prunes job via release_nodes(), exec_job_resize failure in mom superior, tolerate_node_failures=job_start """ self.load_config(self.cfg7 % (self.mem, self.mem)) # instantiate queuejob hook hook_event = 'queuejob' hook_name = 'qjob' a = {'event': hook_event, 'enabled': 'true'} self.server.create_import_hook(hook_name, a, self.qjob_hook_body) # instantiate execjob_launch hook hook_event = 'execjob_launch' hook_name = 'launch' a = {'event': hook_event, 'enabled': 'true'} self.keep_select = 'e.job.Resource_List["site"]' self.server.create_import_hook( hook_name, a, self.launch_hook_body % (self.keep_select)) # instantiate execjob_resize hook hook_event = 'execjob_resize' hook_name = 'resize' a = {'event': hook_event, 'enabled': 'true'} self.server.create_import_hook( hook_name, a, self.resize_hook_body % ('')) # Submit a job that requires 2 nodes j = Job(TEST_USER) j.create_script(self.job_scr2 % (self.hosts_list[1])) time.sleep(2) stime = int(time.time()) time.sleep(2) jid = self.server.submit(j) # Check the exec_vnode while in substate 41 self.server.expect(JOB, {ATTR_substate: '41'}, id=jid) self.server.expect(JOB, 'exec_vnode', id=jid, op=SET) job_stat = self.server.status(JOB, id=jid) execvnode1 = job_stat[0]['exec_vnode'] self.logger.info("initial exec_vnode: %s" % execvnode1) # Check the exec_resize hook reject message in MS log self.moms_list[1].log_match( "Job;%s;Cannot resize the job" % (jid), starttime=stime, interval=2, n='ALL') # Because of resize hook reject Mom failed to update the job. 
# Check that job got requeued self.server.log_match("Job;%s;Job requeued" % (jid), starttime=stime) @requirements(num_moms=3) def test_cgroup_msmom_nodes_only(self): """ Verify that exec_vnode values are trimmed when execjob_launch hook prunes job via release_nodes(), job is using only vnodes from mother superior host, tolerate_node_failures=job_start """ self.load_config(self.cfg7 % (self.mem, self.mem)) # disable queuejob hook hook_event = 'queuejob' hook_name = 'qjob' a = {'event': hook_event, 'enabled': 'false'} self.server.create_import_hook(hook_name, a, self.qjob_hook_body) # instantiate execjob_launch hook hook_event = 'execjob_launch' hook_name = 'launch' a = {'event': hook_event, 'enabled': 'true'} self.keep_select = '"ncpus=1:mem=100mb"' self.server.create_import_hook( hook_name, a, self.launch_hook_body % (self.keep_select)) # disable execjob_resize hook hook_event = 'execjob_resize' hook_name = 'resize' a = {'event': hook_event, 'enabled': 'false'} self.server.create_import_hook( hook_name, a, self.resize_hook_body % ('')) # Submit a job that requires two vnodes j = Job(TEST_USER) j.create_script(self.job_scr3) time.sleep(2) stime = int(time.time()) time.sleep(2) jid = self.server.submit(j) # Check the exec_vnode while in substate 41 self.server.expect(JOB, {ATTR_substate: '41'}, id=jid) self.server.expect(JOB, 'exec_vnode', id=jid, op=SET) job_stat = self.server.status(JOB, id=jid) execvnode1 = job_stat[0]['exec_vnode'] self.logger.info("initial exec_vnode: %s" % execvnode1) initial_vnodes = execvnode1.split('+') # Check the exec_vnode after job is in substate 42 self.server.expect(JOB, {ATTR_substate: '42'}, id=jid) self.server.expect(JOB, 'exec_vnode', id=jid, op=SET) job_stat = self.server.status(JOB, id=jid) execvnode2 = job_stat[0]['exec_vnode'] self.logger.info("pruned exec_vnode: %s" % execvnode2) pruned_vnodes = execvnode2.split('+') # Check that the pruned exec_vnode has one less than initial value self.assertEqual(len(pruned_vnodes) + 1, 
len(initial_vnodes)) # Check that the exec_vnode got pruned self.moms_list[0].log_match("Job;%s;pruned from exec_vnode=%s" % ( jid, execvnode1), starttime=stime, n='ALL') self.moms_list[0].log_match("Job;%s;pruned to exec_vnode=%s" % ( jid, execvnode2), starttime=stime, n='ALL') # Find out the released vnode if initial_vnodes[0] == execvnode2: execvnodeB = initial_vnodes[1] else: execvnodeB = initial_vnodes[0] vnodeB = execvnodeB.split(':')[0].split('(')[1] self.logger.info("released vnode: %s" % vnodeB) # Submit job2 requesting the released vnode, job runs j2 = Job(TEST_USER, { ATTR_l + '.select': '1:ncpus=1:mem=100mb:vnode=%s' % vnodeB}) jid2 = self.server.submit(j2) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2) @requirements(num_moms=3) def test_cgroups_abort(self): """ Verify that if one of the sister mom is down then cgroups hook will call the abort event which will cleanup the cgroups files on sister moms and primary mom """ self.logger.info("Stopping mom on host %s" % self.hosts_list[1]) self.moms_list[1].signal('-19') a = {'Resource_List.select': '1:ncpus=1:host=%s+1:ncpus=1:host=%s+1:ncpus=1:host=%s' % (self.hosts_list[0], self.hosts_list[1], self.hosts_list[2])} j = Job(TEST_USER, attrs=a) j.create_script(self.sleep600_job) jid = self.server.submit(j) a = {'job_state': 'R', 'substate': '41'} self.server.expect(JOB, a, jid) self.logger.info("Killing mom on host %s" % self.hosts_list[1]) time.sleep(2) now = int(time.time()) time.sleep(2) self.moms_list[1].signal('-9') self.server.expect(NODE, {'state': "down"}, id=self.hosts_list[1]) self.server.expect(JOB, {'job_state': 'Q'}, id=jid) # Verify that cgroups directories are cleaned on primary mom cpath = self.get_cgroup_job_dir('memory', jid, self.hosts_list[0]) self.assertFalse(self.is_dir(cpath, self.hosts_list[0])) # Verify that cgroups directories are cleaned by execjob_abort # hook on sister mom cpath = self.get_cgroup_job_dir('memory', jid, self.hosts_list[2]) self.assertFalse(self.is_dir(cpath, 
self.hosts_list[2])) self.moms_list[0].log_match("job_start_error", starttime=now, n='ALL') self.moms_list[0].log_match("Event type is execjob_abort", starttime=now, n='ALL') self.moms_list[0].log_match("Event type is execjob_epilogue", starttime=now, n='ALL') self.moms_list[0].log_match("Event type is execjob_end", starttime=now, n='ALL') self.moms_list[2].log_match("Event type is execjob_abort", starttime=now, n='ALL') self.moms_list[1].pi.restart() self.server.expect(JOB, {'job_state': 'R'}, id=jid) @timeout(1800) def test_big_cgroup_cpuset(self): """ With vnodes_per_numa and use_hyperthreads set to "true", test to verify that a job requesting at least 10 vnodes (i.e. 10 memory sockets) get a cgroup cpuset with the correct number of cpus and memory sockets. """ name = 'CGROUP_BIG' self.load_config(self.cfg9 % (self.mem, self.mem)) vnodes_count = 10 try: self.server.expect(VNODE, {'state=free': vnodes_count}, op=GE, count=True, interval=2) except Exception as exc: self.skipTest("Test require >= %d free vnodes" % (vnodes_count,)) rncpus = 'resources_available.ncpus' a = {rncpus: (GT, 0), 'state': 'free'} free_nodes = self.server.filter(VNODE, a, attrop=PTL_AND, idonly=False) vnodes = list(free_nodes.values())[0] self.assertGreaterEqual(len(vnodes), vnodes_count, 'Test does not have enough free vnodes') # find the minimum number of cpus found among the vnodes cpus_per_vnode = None for v in vnodes: v_rncpus = int(v[rncpus]) if not cpus_per_vnode: cpus_per_vnode = v_rncpus if v_rncpus < cpus_per_vnode: cpus_per_vnode = v_rncpus # Submit a job select_spec = "%d:ncpus=%d" % (vnodes_count, cpus_per_vnode) a = {'Resource_List.select': select_spec, ATTR_N: name + 'a'} j1 = Job(TEST_USER, attrs=a) j1.create_script(self.sleep600_job) jid1 = self.server.submit(j1) a = {'job_state': 'R'} # Make sure job is running self.server.expect(JOB, a, jid1) # cpuset path for job fn1 = self.get_cgroup_job_dir('cpuset', jid1, self.hosts_list[0]) # Capture the output of cpuset_mem_script 
for job scr1 = self.du.run_cmd(cmd=[self.cpuset_mem_script % (fn1, None)], as_script=True, hosts=self.hosts_list[0]) tmp_out1 = scr1['out'] self.logger.info("test output for job1: %s" % (tmp_out1)) # Ensure the number of cpus assigned matches request cpuids = None for kv in tmp_out1: if 'CpuIDs=' in kv: cpuids = kv.split("=")[1] break cpus_assn = count_items(cpuids) cpus_req = vnodes_count * cpus_per_vnode self.logger.info("CpuIDs assn=%d req=%d" % (cpus_assn, cpus_req)) self.assertEqual(cpus_assn, cpus_req, 'CpuIDs assigned did not match requested') self.logger.info('CpuIDs check passed') # Ensure the number of sockets assigned matches request memsocket = None for kv in tmp_out1: if 'MemorySocket=' in kv: memsocket = kv.split("=")[1] break mem_assn = count_items(memsocket) self.logger.info("MemSocket assn=%d req=%d" % (mem_assn, vnodes_count)) self.assertEqual(mem_assn, vnodes_count, 'MemSocket assigned not match requested') self.logger.info('MemSocket check passed') @requirements(num_moms=2) def test_checkpoint_abort_preemption(self): """ Test to make sure that when scheduler preempts a multi-node job with checkpoint_abort, execjob_abort cgroups hook on secondary node gets called. The abort hook cleans up assigned cgroups, allowing the higher priority job to run on the same node. 
""" # create express queue a = {'queue_type': 'execution', 'started': 'True', 'enabled': 'True', 'Priority': 200} self.server.manager(MGR_CMD_CREATE, QUEUE, a, "express") # have scheduler preempt lower priority jobs using 'checkpoint' self.server.manager(MGR_CMD_SET, SCHED, {'preempt_order': 'C'}) # have moms do checkpoint_abort chk_script = """#!/bin/bash kill $1 exit 0 """ a = {'resources_available.ncpus': 1} for m in self.moms.values(): chk_file = m.add_checkpoint_abort_script(body=chk_script) # ensure resulting checkpoint file has correct permission self.du.chown(hostname=m.shortname, path=chk_file, uid=0, gid=0, sudo=True) self.server.manager(MGR_CMD_SET, NODE, a, id=m.shortname) # submit multi-node job a = {'Resource_List.select': '1:ncpus=1:host=%s+1:ncpus=1:host=%s' % ( self.hosts_list[0], self.hosts_list[1]), 'Resource_List.place': 'scatter:exclhost'} j1 = Job(TEST_USER, attrs=a) jid1 = self.server.submit(j1) # to work around a scheduling race, check for substate 42 # if you test for R then a slow job startup might update # resources_assigned late and make scheduler overcommit nodes # and run both jobs self.server.expect(JOB, {'substate': '42'}, id=jid1) # Submit an express queue job requesting needing also 2 nodes a[ATTR_q] = 'express' j2 = Job(TEST_USER, attrs=a) time.sleep(2) stime = int(time.time()) time.sleep(2) jid2 = self.server.submit(j2) self.server.expect(JOB, {'job_state': 'Q'}, id=jid1) err_msg = "%s;.*Failed to assign resources.*" % (jid2,) for m in self.moms.values(): m.log_match(err_msg, max_attempts=3, interval=1, starttime=stime, regexp=True, existence=False, n='ALL') self.server.expect(JOB, {'job_state': 'R', 'substate': 42}, id=jid2) @requirements(num_moms=2) def test_checkpoint_restart(self): """ Test to make sure that when a preempted and checkpointed multi-node job restarts, execjob_begin cgroups hook gets called on both mother superior and sister moms. 
""" # create express queue a = {'queue_type': 'execution', 'started': 'True', 'enabled': 'True', 'Priority': 200} self.server.manager(MGR_CMD_CREATE, QUEUE, a, "express") # have scheduler preempt lower priority jobs using 'checkpoint' self.server.manager(MGR_CMD_SET, SCHED, {'preempt_order': 'C'}) # have moms do checkpoint_abort chk_script = """#!/bin/bash kill $1 exit 0 """ restart_script = """#!/bin/bash sleep 300 """ a = {'resources_available.ncpus': 1} for m in self.moms.values(): # add checkpoint script m.add_checkpoint_abort_script(body=chk_script) m.add_restart_script(body=restart_script, abort_time=300) self.server.manager(MGR_CMD_SET, NODE, a, id=m.shortname) # submit multi-node job a = {'Resource_List.select': '1:ncpus=1:host=%s+1:ncpus=1:host=%s' % ( self.hosts_list[0], self.hosts_list[1]), 'Resource_List.place': 'scatter:exclhost'} j1 = Job(TEST_USER, attrs=a) j1.set_sleep_time(300) jid1 = self.server.submit(j1) # to work around a scheduling race, check for substate 42 # if you test for R then a slow job startup might update # resources_assigned late and make scheduler overcommit nodes # and run both jobs self.server.expect(JOB, {'substate': '42'}, id=jid1) time.sleep(5) cpath = self.get_cgroup_job_dir('cpuset', jid1, self.hosts_list[0]) self.assertTrue(self.is_dir(cpath, self.hosts_list[0])) cpath = self.get_cgroup_job_dir('cpuset', jid1, self.hosts_list[1]) self.assertTrue(self.is_dir(cpath, self.hosts_list[1])) # Submit an express queue job requesting needing also 2 nodes a[ATTR_q] = 'express' j2 = Job(TEST_USER, attrs=a) j2.set_sleep_time(300) jid2 = self.server.submit(j2) self.server.expect(JOB, {'job_state': 'Q'}, id=jid1) self.server.expect(JOB, {'substate': '42'}, id=jid2) time.sleep(5) cpath = self.get_cgroup_job_dir('cpuset', jid2, self.hosts_list[0]) self.assertTrue(self.is_dir(cpath, self.hosts_list[0])) cpath = self.get_cgroup_job_dir('cpuset', jid2, self.hosts_list[1]) self.assertTrue(self.is_dir(cpath, self.hosts_list[1])) # delete 
express queue job self.server.delete(jid2) # wait until the preempted job is sent to MoM again # the checkpointing script hangs, so it stays in substate 41 self.server.expect(JOB, {'job_state': 'R', 'substate': 41}, id=jid1) # we need to give the hooks some time here... time.sleep(10) # check the cpusets for the deleted preemptor are gone cpath = self.get_cgroup_job_dir('cpuset', jid2, self.hosts_list[0]) self.assertFalse(self.is_dir(cpath, self.hosts_list[0])) cpath = self.get_cgroup_job_dir('cpuset', jid2, self.hosts_list[1]) self.assertFalse(self.is_dir(cpath, self.hosts_list[1])) # check the cpusets for the restarted formerly-preempted are there cpath = self.get_cgroup_job_dir('cpuset', jid1, self.hosts_list[0]) self.assertTrue(self.is_dir(cpath, self.hosts_list[0])) cpath = self.get_cgroup_job_dir('cpuset', jid1, self.hosts_list[1]) self.assertTrue(self.is_dir(cpath, self.hosts_list[1])) def test_cpu_controller_enforce_default(self): """ Test an enabled cgroup 'cpu' controller with quotas enforced using default (non-specified) values of cfs_period_us, and cfs_quota_fudge_factor. """ root_quota_host1 = None try: root_quota_host1_str = \ self.du.run_cmd(hosts=self.hosts_list[0], cmd=['cat', '/sys/fs/cgroup/cpu/cpu.cfs_quota_us']) root_quota_host1 = int(root_quota_host1_str['out'][0]) except Exception: pass # If that link is missing and it's only # mounted under the cpu/cpuacct unified directory... 
if root_quota_host1 is None: try: root_quota_host1_str = \ self.du.run_cmd(hosts=self.hosts_list[0], cmd=['cat', '/sys/fs/cgroup/' 'cpu,cpuacct/cpu.cfs_quota_us']) root_quota_host1 = int(root_quota_host1_str['out'][0]) except Exception: pass # If still not found, try to see if it is in a unified cgroup mount # as in cgroup v2 if root_quota_host1 is None: try: root_quota_host1_str = \ self.du.run_cmd(hosts=self.hosts_list[0], cmd=['cat', '/sys/fs/cgroup/cpu.cfs_quota_us']) root_quota_host1 = int(root_quota_host1_str['out'][0]) except Exception: pass if root_quota_host1 is None: self.skipTest('cpu group controller test: ' 'could not determine root cfs_quota_us') elif root_quota_host1 != -1: self.skipTest('cpu group controller test: ' 'root cfs_quota_us is not unlimited, cannot test ' 'cgroup hook CPU quotas in this environment') name = 'CGROUP1' self.load_config(self.cfg10 % (self.mem, self.mem)) default_cfs_period_us = 100000 default_cfs_quota_fudge_factor = 1.03 # Restart mom for changes made by cgroups hook to take effect self.mom.restart() self.server.expect(NODE, {'state': 'free'}, id=self.nodes_list[0], interval=1) result = self.server.status(NODE, 'resources_available.ncpus', id=self.nodes_list[0]) orig_ncpus = int(result[0]['resources_available.ncpus']) self.assertGreater(orig_ncpus, 0) self.logger.info('Original value of ncpus: %d' % orig_ncpus) if orig_ncpus >= 2: ncpus_req = 2 else: ncpus_req = 1 a = {'Resource_List.select': "ncpus=%d" % ncpus_req, ATTR_N: name, ATTR_k: 'oe'} j = Job(TEST_USER, attrs=a) j.create_script(self.sleep600_job) jid = self.server.submit(j) a = {'job_state': 'R'} self.server.expect(JOB, a, jid) self.server.status(JOB, [ATTR_o, 'exec_host'], jid) fna = self.get_cgroup_job_dir('cpu', jid, self.hosts_list[0]) self.assertFalse(fna is None, 'No job directory for cpu subsystem') cpu_scr = self.du.run_cmd(cmd=[self.cpu_controller_script % fna], as_script=True, hosts=self.hosts_list[0]) cpu_scr_out = cpu_scr['out'] 
self.logger.info('cpu_scr_out:\n%s' % cpu_scr_out) shares_match = (ncpus_req * 1000) self.assertTrue("cpu_shares=%d" % shares_match in cpu_scr_out) self.logger.info("cpu_shares check passed (match %d)" % shares_match) self.assertTrue("cpu_cfs_period_us=%d" % (default_cfs_period_us) in cpu_scr_out) self.logger.info("cpu_cfs_period_us check passed (match %d)" % (default_cfs_period_us)) cfs_quota_us_match = default_cfs_period_us * \ ncpus_req * default_cfs_quota_fudge_factor self.assertTrue("cpu_cfs_quota_us=%d" % (cfs_quota_us_match) in cpu_scr_out) self.logger.info("cpu_cfs_quota_us check passed (match %d)" % (cfs_quota_us_match)) def test_cpu_controller_enforce(self): """ Test an enabled cgroup 'cpu' controller with quotas enforced, using specific values to: cfs_period_us cfs_quota_fudge_factor in config file 'cfg11'. """ root_quota_host1 = None try: root_quota_host1_str = \ self.du.run_cmd(hosts=self.hosts_list[0], cmd=['cat', '/sys/fs/cgroup/cpu/cpu.cfs_quota_us']) root_quota_host1 = int(root_quota_host1_str['out'][0]) except Exception: pass # If that link is missing and it's only # mounted under the cpu/cpuacct unified directory... 
if root_quota_host1 is None: try: root_quota_host1_str = \ self.du.run_cmd(hosts=self.hosts_list[0], cmd=['cat', '/sys/fs/cgroup/' 'cpu,cpuacct/cpu.cfs_quota_us']) root_quota_host1 = int(root_quota_host1_str['out'][0]) except Exception: pass # If still not found, try to see if it is in a unified cgroup mount # as in cgroup v2 if root_quota_host1 is None: try: root_quota_host1_str = \ self.du.run_cmd(hosts=self.hosts_list[0], cmd=['cat', '/sys/fs/cgroup/cpu.cfs_quota_us']) root_quota_host1 = int(root_quota_host1_str['out'][0]) except Exception: pass if root_quota_host1 is None: self.skipTest('cpu group controller test: ' 'could not determine root cfs_quota_us') elif root_quota_host1 != -1: self.skipTest('cpu group controller test: ' 'root cfs_quota_us is not unlimited, cannot test ' 'cgroup hook CPU quotas in this environment') name = 'CGROUP1' cfs_period_us = 200000 cfs_quota_fudge_factor = 1.05 self.load_config(self.cfg11 % (self.mem, self.mem, cfs_period_us, cfs_quota_fudge_factor)) self.server.expect(NODE, {'state': 'free'}, id=self.nodes_list[0], interval=1) result = self.server.status(NODE, 'resources_available.ncpus', id=self.nodes_list[0]) orig_ncpus = int(result[0]['resources_available.ncpus']) self.assertGreater(orig_ncpus, 0) self.logger.info('Original value of ncpus: %d' % orig_ncpus) if orig_ncpus >= 2: ncpus_req = 2 else: ncpus_req = 1 a = {'Resource_List.select': "ncpus=%d" % ncpus_req, ATTR_N: name, ATTR_k: 'oe'} j = Job(TEST_USER, attrs=a) j.create_script(self.sleep600_job) jid = self.server.submit(j) a = {'job_state': 'R'} self.server.expect(JOB, a, jid) self.server.status(JOB, [ATTR_o, 'exec_host'], jid) fna = self.get_cgroup_job_dir('cpu', jid, self.hosts_list[0]) self.assertFalse(fna is None, 'No job directory for cpu subsystem') cpu_scr = self.du.run_cmd(cmd=[self.cpu_controller_script % fna], as_script=True, hosts=self.hosts_list[0]) cpu_scr_out = cpu_scr['out'] self.logger.info('cpu_scr_out:\n%s' % cpu_scr_out) shares_match = (ncpus_req * 
1000) self.assertTrue("cpu_shares=%d" % shares_match in cpu_scr_out) self.logger.info("cpu_shares check passed (match %d)" % shares_match) self.assertTrue("cpu_cfs_period_us=%d" % (cfs_period_us) in cpu_scr_out) self.logger.info( "cpu_cfs_period_us check passed (match %d)" % (cfs_period_us)) cfs_quota_us_match = cfs_period_us * ncpus_req * cfs_quota_fudge_factor self.assertTrue("cpu_cfs_quota_us=%d" % (cfs_quota_us_match) in cpu_scr_out) self.logger.info("cpu_cfs_quota_us check passed (match %d)" % (cfs_quota_us_match)) def test_cpu_controller_enforce_default_zero_job(self): """ Test an enabled cgroup 'cpu' controller with quotas enforced on zero-cpu job, using default (non-specified) values of: cfs_period_us cfs_quota_fudge_factor zero_cpus_shares_fraction zero_cpus_quota_fraction """ root_quota_host1 = None try: root_quota_host1_str = \ self.du.run_cmd(hosts=self.hosts_list[0], cmd=['cat', '/sys/fs/cgroup/cpu/cpu.cfs_quota_us']) root_quota_host1 = int(root_quota_host1_str['out'][0]) except Exception: pass # If that link is missing and it's only # mounted under the cpu/cpuacct unified directory... 
if root_quota_host1 is None: try: root_quota_host1_str = \ self.du.run_cmd(hosts=self.hosts_list[0], cmd=['cat', '/sys/fs/cgroup/' 'cpu,cpuacct/cpu.cfs_quota_us']) root_quota_host1 = int(root_quota_host1_str['out'][0]) except Exception: pass # If still not found, try to see if it is in a unified cgroup mount # as in cgroup v2 if root_quota_host1 is None: try: root_quota_host1_str = \ self.du.run_cmd(hosts=self.hosts_list[0], cmd=['cat', '/sys/fs/cgroup/cpu.cfs_quota_us']) root_quota_host1 = int(root_quota_host1_str['out'][0]) except Exception: pass if root_quota_host1 is None: self.skipTest('cpu group controller test: ' 'could not determine root cfs_quota_us') elif root_quota_host1 != -1: self.skipTest('cpu group controller test: ' 'root cfs_quota_us is not unlimited, cannot test ' 'cgroup hook CPU quotas in this environment') name = 'CGROUP1' # config file 'cfg12' has 'allow_zero_cpus=true' under cpuset, to allow # zero-cpu jobs. self.load_config(self.cfg12 % (self.mem, self.mem)) default_cfs_period_us = 100000 default_cfs_quota_fudge_factor = 1.03 default_zero_shares_fraction = 0.002 default_zero_quota_fraction = 0.2 # Restart mom for changes made by cgroups hook to take effect self.mom.restart() a = {'Resource_List.select': 'ncpus=0', ATTR_N: name, ATTR_k: 'oe'} j = Job(TEST_USER, attrs=a) j.create_script(self.sleep600_job) jid = self.server.submit(j) a = {'job_state': 'R'} self.server.expect(JOB, a, jid) self.server.status(JOB, [ATTR_o, 'exec_host'], jid) fna = self.get_cgroup_job_dir('cpu', jid, self.hosts_list[0]) self.assertFalse(fna is None, 'No job directory for cpu subsystem') cpu_scr = self.du.run_cmd(cmd=[self.cpu_controller_script % fna], as_script=True, hosts=self.hosts_list[0]) cpu_scr_out = cpu_scr['out'] self.logger.info('cpu_scr_out:\n%s' % cpu_scr_out) shares_match = (default_zero_shares_fraction * 1000) self.assertTrue("cpu_shares=%d" % shares_match in cpu_scr_out) self.logger.info("cpu_shares check passed (match %d)" % shares_match) 
    def test_cpu_controller_enforce_zero_job(self):
        """
        Test an enabled cgroup 'cpu' controller with quotas enforced on a
        zero-cpu job. Quotas are enforced using specific values to:
        cfs_period_us
        cfs_quota_fudge_factor
        zero_cpus_shares_fraction
        zero_cpus_quota_fraction
        in config file 'cfg13'.
        """
        # Probe the root cgroup's cfs_quota_us; the file may live in one
        # of three places depending on how cgroups are mounted on the host.
        root_quota_host1 = None
        try:
            root_quota_host1_str = \
                self.du.run_cmd(hosts=self.hosts_list[0],
                                cmd=['cat',
                                     '/sys/fs/cgroup/cpu/cpu.cfs_quota_us'])
            root_quota_host1 = int(root_quota_host1_str['out'][0])
        except Exception:
            pass
        # If that link is missing and it's only
        # mounted under the cpu/cpuacct unified directory...
        if root_quota_host1 is None:
            try:
                root_quota_host1_str = \
                    self.du.run_cmd(hosts=self.hosts_list[0],
                                    cmd=['cat', '/sys/fs/cgroup/'
                                         'cpu,cpuacct/cpu.cfs_quota_us'])
                root_quota_host1 = int(root_quota_host1_str['out'][0])
            except Exception:
                pass
        # If still not found, try to see if it is in a unified cgroup mount
        # as in cgroup v2
        if root_quota_host1 is None:
            try:
                root_quota_host1_str = \
                    self.du.run_cmd(hosts=self.hosts_list[0],
                                    cmd=['cat',
                                         '/sys/fs/cgroup/cpu.cfs_quota_us'])
                root_quota_host1 = int(root_quota_host1_str['out'][0])
            except Exception:
                pass
        # -1 means "unlimited"; anything else means the environment already
        # caps the root cgroup and the hook's quotas cannot be verified.
        if root_quota_host1 is None:
            self.skipTest('cpu group controller test: '
                          'could not determine root cfs_quota_us')
        elif root_quota_host1 != -1:
            self.skipTest('cpu group controller test: '
                          'root cfs_quota_us is not unlimited, cannot test '
                          'cgroup hook CPU quotas in this environment')
        name = 'CGROUP1'
        # Non-default quota knobs pushed into the hook via cfg13.
        cfs_period_us = 200000
        cfs_quota_fudge_factor = 1.05
        zero_cpus_shares_fraction = 0.3
        zero_cpus_quota_fraction = 0.5
        # config file 'cfg13' has 'allow_zero_cpus=true' under cpuset, to allow
        # zero-cpu jobs.
        self.load_config(self.cfg13 % (self.mem, self.mem, cfs_period_us,
                                       cfs_quota_fudge_factor,
                                       zero_cpus_shares_fraction,
                                       zero_cpus_quota_fraction))
        a = {'Resource_List.select': 'ncpus=0', ATTR_N: name, ATTR_k: 'oe'}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        self.server.status(JOB, [ATTR_o, 'exec_host'], jid)
        fna = self.get_cgroup_job_dir('cpu', jid, self.hosts_list[0])
        self.assertFalse(fna is None, 'No job directory for cpu subsystem')
        # Run the helper script that dumps the job's cpu cgroup settings.
        cpu_scr = self.du.run_cmd(cmd=[self.cpu_controller_script % fna],
                                  as_script=True, hosts=self.hosts_list[0])
        cpu_scr_out = cpu_scr['out']
        self.logger.info('cpu_scr_out:\n%s' % cpu_scr_out)
        # Zero-cpu jobs get the configured shares fraction of the
        # per-cpu 1000.
        shares_match = (zero_cpus_shares_fraction * 1000)
        self.assertTrue("cpu_shares=%d" % shares_match in cpu_scr_out)
        self.logger.info("cpu_shares check passed (match %d)" % shares_match)
        self.assertTrue("cpu_cfs_period_us=%d" % (cfs_period_us)
                        in cpu_scr_out)
        self.logger.info(
            "cpu_cfs_period_us check passed (match %d)" % (cfs_period_us))
        # Quota = period * zero-cpu fraction * fudge factor.
        cfs_quota_us_match = cfs_period_us * \
            zero_cpus_quota_fraction * cfs_quota_fudge_factor
        self.assertTrue("cpu_cfs_quota_us=%d" % (cfs_quota_us_match)
                        in cpu_scr_out)
        self.logger.info("cpu_cfs_quota_us check passed (match %d)"
                         % (cfs_quota_us_match))
    def test_vnodepernuma_use_hyperthreads(self):
        """
        Test to verify that correct number of jobs run with
        vnodes_per_numa=true and use_hyperthreads=true
        """
        # Parse /proc/cpuinfo to learn the host topology:
        # pcpus = logical processors, sibs = siblings per package,
        # cores = cores per package, phys = set of physical package ids.
        pcpus = 0
        sibs = 0
        cores = 0
        pval = 0
        phys = {}
        with open('/proc/cpuinfo', 'r') as desc:
            for line in desc:
                if re.match('^processor', line):
                    pcpus += 1
                sibs_match = re.search(r'siblings : ([0-9]+)', line)
                cores_match = re.search(r'cpu cores : ([0-9]+)', line)
                phys_match = re.search(r'physical id : ([0-9]+)', line)
                if sibs_match:
                    sibs = int(sibs_match.groups()[0])
                if cores_match:
                    cores = int(cores_match.groups()[0])
                if phys_match:
                    pval = int(phys_match.groups()[0])
                    # dict used as a set of distinct physical packages
                    phys[pval] = 1
        if (sibs == 0 or cores == 0):
            self.skipTest('Insufficient information about the processors.')
        if pcpus < 2:
            self.skipTest('This test requires at least two processors.')
        # siblings / cores gives hyperthreads per core (1 if HT is off)
        hyperthreads_per_core = int(sibs / cores)
        name = 'CGROUP20'
        # set vnode_per_numa=true with use_hyperthreads=true
        self.load_config(self.cfg3 % ('', 'true', '', self.mem, '',
                                      self.swapctl, ''))
        # Submit M*N*P jobs, where M is the number of physical processors,
        # N is the number of 'cpu cores' per M. and P being the
        # number of hyperthreads per core.
        njobs = len(phys) * cores * hyperthreads_per_core
        if njobs > 100:
            self.skipTest("too many jobs (%d) to submit" % njobs)
        a = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' %
             self.hosts_list[0], ATTR_N: name + 'a'}
        # Every job up to the hyperthread count should run.
        for _ in range(njobs):
            j = Job(TEST_USER, attrs=a)
            # make sure this stays around for an hour
            # (or until deleted in teardown)
            j.set_sleep_time(3600)
            jid = self.server.submit(j)
            a1 = {'job_state': 'R'}
            self.server.expect(JOB, a1, jid)
        # Submit another job, expect in Q state
        b = {'Resource_List.select': '1:ncpus=1:mem=300mb:host=%s' %
             self.hosts_list[0], ATTR_N: name + 'b'}
        j2 = Job(TEST_USER, attrs=b)
        jid2 = self.server.submit(j2)
        b1 = {'job_state': 'Q'}
        self.server.expect(JOB, b1, jid2)
    def test_cgroup_default_config(self):
        """
        Test to make sure using the default hook config file still run
        a basic job, and cleans up cpuset upon qdel.
        """
        # The default hook config has 'memory' subsystem enabled
        if not self.paths[self.hosts_list[0]]['memory']:
            self.skipTest('Test requires memory subystem mounted')
        self.load_default_config()
        # Reduce the noise in mom_logs for existence=False matching
        c = {'$logevent': '511'}
        self.mom.add_config(c)
        a = {'Resource_List.select': 'ncpus=1:mem=100mb'}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        # sleep around the timestamp so clock skew/rounding cannot make
        # pre-existing log lines match starttime=stime below
        time.sleep(2)
        stime = int(time.time())
        time.sleep(2)
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R'}, jid)
        # The hook must not have tripped over memsw files on this job.
        err_msg = "write_value: Permission denied.*%s.*memsw" % (jid)
        self.mom.log_match(err_msg, max_attempts=3, interval=1, n='ALL',
                           starttime=stime, regexp=True, existence=False)
        self.server.status(JOB, ['exec_host'], jid)
        ehost = j.attributes['exec_host']
        # exec_host looks like "host/idx..."; keep only the hostname
        ehost1 = ehost.split('/')[0]
        ehjd1 = self.get_cgroup_job_dir('cpuset', jid, ehost1)
        self.assertTrue(self.is_dir(ehjd1, ehost1),
                        "job cpuset dir not found")
        # Deleting the job must remove its cpuset directory.
        self.server.delete(id=jid, wait=True)
        self.assertFalse(self.is_dir(ehjd1, ehost1), "job cpuset dir found")
    def test_cgroup_cgswap(self, vnode_per_numa_node=False):
        """
        Test to verify (with vnode_per_numa_node disabled by default):
        - whether queuejob/modifyjob set cgswap to vmem-mem in jobs
        - whether nodes get resources_available.cgswap filled in
        - whether a collection of jobs submitted that do not exceed
          available vmem but would deplete cgswap are indeed not all
          run simultaneously
        """
        if not self.mem:
            self.skipTest('Test requires memory subystem mounted')
        if self.swapctl != 'true':
            self.skipTest('Test requires memsw accounting enabled')
        # (Re)create the cgswap custom resource and let the scheduler
        # consider it.
        self.server.remove_resource('cgswap')
        self.server.add_resource('cgswap', 'size', 'nh')
        self.scheduler.add_resource('cgswap')
        events = ['execjob_begin', 'execjob_launch', 'execjob_attach',
                  'execjob_epilogue', 'execjob_end', 'exechost_startup',
                  'exechost_periodic', 'execjob_resize', 'execjob_abort',
                  'queuejob', 'modifyjob']
        # Enable the cgroups hook new events
        conf = {'enabled': 'True', 'freq': 10, 'event': events}
        self.server.manager(MGR_CMD_SET, HOOK, conf, self.hook_name)
        self.load_config(self.cfg15
                         % ('true' if vnode_per_numa_node else 'false'))
        vnode_name = self.mom.shortname
        if vnode_per_numa_node:
            # first NUMA vnode of the natural host
            vnode_name += "[0]"
        # The hook's exechost_startup event should have published cgswap.
        cgswapstat = self.server.status(NODE, 'resources_available.cgswap',
                                        id=vnode_name)
        self.assertTrue(cgswapstat
                        and 'resources_available.cgswap' in cgswapstat[0],
                        'cgswap resource not found on node')
        cgswap = PbsTypeSize(cgswapstat[0]['resources_available.cgswap'])
        self.logger.info('Test node appears to have %s cgswap'
                         % cgswap.encode())
        if cgswap == PbsTypeSize("0kb"):
            # No swap on the node: only verify that queuejob adds
            # cgswap = vmem - mem and that the job then cannot run.
            self.logger.info('First Mom has no swap, test will just '
                             'check if job cgswap is added')
            a = {'Resource_List.select':
                 '1:ncpus=0:mem=100mb:vmem=1100mb:vnode=%s' % vnode_name}
            j = Job(TEST_USER, attrs=a)
            j.create_script(self.sleep30_job)
            jid = self.server.submit(j)
            # scheduler sets comment when the job cannot run,
            # server sets comment when the job runs
            # in both cases the comment gets set
            self.server.expect(JOB, 'comment', op=SET)
            job_status = self.server.status(JOB, id=jid)
            # Dig the cgswap chunk value out of the translated select spec.
            cgswap = None
            select_resource = job_status[0]['Resource_List.select']
            chunkspecs = select_resource.split(':')
            for c in chunkspecs:
                if '=' in c:
                    name, value = c.split('=')
                    if name == 'cgswap':
                        cgswap = PbsTypeSize(value)
            self.assertTrue(cgswap is not None, 'job cgswap was not added')
            # vmem(1100mb) - mem(100mb) = 1000mb
            self.assertTrue(cgswap == PbsTypeSize('1000mb'),
                            'job cgswap is %s instead of expected 1000mb'
                            % str(cgswap))
            self.logger.info('job cgswap detected to be correct, roughly %s'
                             % str(cgswap))
            # check that indeed you cannot run the job since it requests
            # swap usage and there is none
            job_comment = job_status[0]['comment']
            self.assertTrue('Insufficient amount of resource: cgswap'
                            in job_comment,
                            'Job comment should indicate insufficient cgswap '
                            'but is: %s' % job_comment)
            self.logger.info('job comment as expected: %s' % job_comment)
        else:
            self.logger.info('First MoM has swap, confirming cgswap '
                             'correctly throttles jobs accepted')
            # Request 2/3 of the node's swap per job so that one job fits
            # but two do not.
            # PbsTypeSize value is stored in kb units
            cgreqval = int(float(cgswap.value) / 1024.0 / 3.0 * 2.0)
            cgreqsuffix = 'mb'
            cgreq = PbsTypeSize(str(cgreqval) + cgreqsuffix)
            vmemreqsize = PbsTypeSize("100mb") + cgreq
            vmemreq = str(int(vmemreqsize.value / 1024))+'mb'
            self.logger.info('will submit jobs with 100mb mem and %s vmem'
                             % vmemreq)
            a = {'Resource_List.select':
                 '1:ncpus=0:mem=100mb:vmem=%s:vnode=%s'
                 % (vmemreq, vnode_name)}
            j = Job(TEST_USER, attrs=a)
            j.create_script(self.sleep600_job)
            jid = self.server.submit(j)
            bs = {'job_state': 'R'}
            self.server.expect(JOB, bs, jid, offset=1)
            # Dig the cgswap chunk value out of the translated select spec.
            cgswap = None
            job_status = self.server.status(JOB, id=jid)
            select_resource = job_status[0]['Resource_List.select']
            chunkspecs = select_resource.split(':')
            for c in chunkspecs:
                if '=' in c:
                    name, value = c.split('=')
                    if name == 'cgswap':
                        cgswap = PbsTypeSize(value)
            self.assertTrue(cgswap is not None, 'job cgswap was not added')
            self.assertTrue(cgswap == cgreq,
                            'job cgswap is %s instead of expected %s'
                            % (str(cgswap), str(cgreq)))
            self.logger.info('job cgswap detected to be correct, roughly %s'
                             % str(cgswap))
            j = Job(TEST_USER, attrs=a)
            j.create_script(self.sleep600_job)
            jid = self.server.submit(j)
            # Second job should not run - not enough cgswap
            # scheduler sets comment when the job cannot run,
            # server sets comment when the job runs
            # in both cases the comment gets set
            self.server.expect(JOB, 'comment', op=SET)
            job_status = self.server.status(JOB, id=jid)
            # check that indeed you cannot run the job since it requests
            # too much swap usage while the first job runs
            job_comment = job_status[0]['comment']
            self.assertTrue('Insufficient amount of resource: cgswap'
                            in job_comment,
                            'Job comment should indicate insufficient cgswap '
                            'but is: %s' % job_comment)
            self.logger.info('job comment as expected: %s' % job_comment)
    def test_cgroup_cgswap_numa(self):
        """
        Test to verify (with vnode_per_numa_node enabled):
        - whether queuejob/modifyjob set cgswap to vmem-mem in jobs
        - whether nodes get resources_available.cgswap filled in
        - whether a collection of jobs submitted that do not exceed
          available vmem but would deplete cgswap are indeed not all
          run simultaneously
        """
        # Same scenario as test_cgroup_cgswap, but against the first
        # per-NUMA vnode instead of the natural host vnode.
        self.test_cgroup_cgswap(vnode_per_numa_node=True)
None, "Unable to read total memsw available") self.logger.info("total available memsw: %d" % vmem_avail_in_bytes) # Get job physical mem limit mem_limit = os.path.join(mem_base, str(jid), 'memory.limit_in_bytes') result = self.du.cat(hostname=self.mom.hostname, filename=mem_limit, sudo=True) mem_limit_in_bytes = None try: mem_limit_in_bytes = int(result['out'][0]) except Exception: # None will be seen as a failure, nothing to do pass self.assertTrue(mem_limit_in_bytes is not None, "Unable to read job mem limit") self.logger.info("job mem limit: %d" % mem_limit_in_bytes) # Get job phys+swap mem limit vmem_limit = os.path.join(mem_base, str(jid), 'memory.memsw.limit_in_bytes') result = self.du.cat(hostname=self.mom.hostname, filename=vmem_limit, sudo=True) vmem_limit_in_bytes = None try: vmem_limit_in_bytes = int(result['out'][0]) except Exception: # None will be seen as a failure, nothing to do pass self.assertTrue(vmem_limit_in_bytes is not None, "Unable to read job memsw limit") self.logger.info("job memsw limit: %d" % vmem_limit_in_bytes) # Check results correspond to enforcement flags and job placement swap_avail = vmem_avail_in_bytes - mem_avail_in_bytes if enforce_flags[0] == 'true' and not exclhost: self.assertTrue(mem_limit_in_bytes == 100 * 1024 * 1024, "Job mem limit is %d expected %d" % (mem_limit_in_bytes, 100 * 1024 * 1024)) else: self.assertTrue(mem_avail_in_bytes == mem_limit_in_bytes, "job mem limit (%d) should be identical to " "total mem available (%d)" % (mem_limit_in_bytes, mem_avail_in_bytes)) self.logger.info("job mem limit is total mem available (%d)" % mem_avail_in_bytes) if enforce_flags[1] == 'true' and not exclhost: expected_vmem = (mem_limit_in_bytes + min(100 * 1024 * 1024, swap_avail)) self.assertTrue(vmem_limit_in_bytes == expected_vmem, "memsw limit: expected %d, got %d" % (expected_vmem, vmem_limit_in_bytes)) self.logger.info("job memsw limit is expected %d" % vmem_limit_in_bytes) else: if swap_avail: 
    def test_cgroup_enforce_default_tf(self):
        """
        Test to verify if the flags to enforce default mem are working
        and to ensure mem and memsw limits are set as expected;
        enforce mem but not memsw: job should get small mem limit
        memsw should be unlimited (i.e. able to consume memsw set
        as limit for all jobs)
        """
        self.test_cgroup_enforce_default(enforce_flags=('true', 'false'))

    def test_cgroup_enforce_default_ft(self):
        """
        Test to verify if the flags to enforce default mem are working
        and to ensure mem and memsw limits are set as expected;
        enforce memsw but not mem: job should be able to consume all
        physical memory set as limit for all jobs but only a small
        amount of additional swap
        """
        self.test_cgroup_enforce_default(enforce_flags=('false', 'true'))

    def test_cgroup_enforce_default_exclhost(self):
        """
        Test to verify if the flags to enforce default mem are working
        and to ensure mem and memsw limits are set as expected;
        enforce neither mem nor memsw by enabling flags to ignore
        enforcement for exclhost jobs and submitting an exclhost job:
        job should be able to consume all physical memory and memsw
        set as limit for all jobs
        """
        # enforce flags should both be overrided by exclhost
        self.test_cgroup_enforce_default(enforce_flags=('true', 'true'),
                                         exclhost=True)

    def test_manage_rlimit_as(self):
        """
        Test that the hook manages RLIMIT_AS correctly: a job that
        requests vmem but no pvmem gets an unlimited address-space
        limit, while a job that also requests pvmem gets RLIMIT_AS
        set to the pvmem value (checked via 'ulimit -v' job output).
        """
        if not self.mem:
            self.skipTest('Test requires memory subystem mounted')
        if self.swapctl != 'true':
            self.skipTest('Test requires memsw accounting enabled')
        # Make sure job history is enabled to see when job has ended
        a = {'job_history_enable': 'True'}
        rc = self.server.manager(MGR_CMD_SET, SERVER, a)
        self.assertEqual(rc, 0)
        self.server.expect(SERVER, {'job_history_enable': 'True'})
        self.load_config(self.cfg16 % ('true', 'true'))
        # First job -- request vmem and no pvmem,
        # RLIMIT_AS should be unlimited
        a = {'Resource_List.select':
             '1:ncpus=0:mem=400mb:vmem=400mb:vnode=%s' % self.mom.shortname}
        j = Job(TEST_USER, attrs=a)
        j.create_script("#!/bin/bash\nulimit -v")
        jid = self.server.submit(j)
        bs = {'job_state': 'F'}
        self.server.expect(JOB, bs, jid, extend='x', offset=1)
        thisjob = self.server.status(JOB, id=jid, extend='x')
        try:
            # Output_Path is "host:path"; keep only the path part
            job_output_file = thisjob[0]['Output_Path'].split(':')[1]
        except Exception:
            self.assertTrue(False, "Could not determine job output path")
        result = self.du.cat(hostname=self.server.hostname,
                             filename=job_output_file, sudo=True)
        self.assertTrue('out' in result, "Nothing in job output file?")
        job_out = '\n'.join(result['out'])
        self.logger.info("job_out=%s" % job_out)
        self.assertTrue('unlimited' in job_out)
        self.logger.info("Job that requests vmem "
                         "but no pvmem correctly has unlimited RLIMIT_AS")
        # Second job -- see if pvmem still works
        # RLIMIT_AS should correspond to pvmem
        a['Resource_List.pvmem'] = '400mb'
        j = Job(TEST_USER, attrs=a)
        j.create_script("#!/bin/bash\nulimit -v")
        jid = self.server.submit(j)
        bs = {'job_state': 'F'}
        self.server.expect(JOB, bs, jid, extend='x', offset=1)
        thisjob = self.server.status(JOB, id=jid, extend='x')
        try:
            # Output_Path is "host:path"; keep only the path part
            job_output_file = thisjob[0]['Output_Path'].split(':')[1]
        except Exception:
            self.assertTrue(False, "Could not determine job output path")
        result = self.du.cat(hostname=self.server.hostname,
                             filename=job_output_file, sudo=True)
        self.assertTrue('out' in result, "Nothing in job output file?")
        job_out = '\n'.join(result['out'])
        self.logger.info("job_out=%s" % job_out)
        # ulimit reports kb, not bytes
        self.assertTrue(str(400 * 1024) in job_out)
        self.logger.info("Job that requests 400mb pvmem "
                         "correctly has 400mb RLIMIT_AS")
    def test_cgroup_mount_paths(self):
        """
        Test to see if the cgroup hook picks the shortest path, but
        also if it can be overrided in the config file
        """
        # The test creates its own extra cgroup mounts; require that the
        # mount points do not already exist.
        if self.du.isdir(self.hosts_list[0], '/dev/tstc'):
            self.skipTest('Test requires /dev/tstc not to exist')
        if self.du.isdir(self.hosts_list[0], '/dev/tstm'):
            self.skipTest('Test requires /dev/tstm not to exist')
        self.load_config(self.cfg17)
        # Mount a second 'memory' cgroup hierarchy at the shorter path
        # /dev/tstm...
        dir_created = self.du.mkdir(hostname=self.hosts_list[0],
                                    path='/dev/tstm', mode=0o0755,
                                    sudo=True)
        if not dir_created:
            self.skipTest('not able to create /dev/tstm')
        result = self.du.run_cmd(self.hosts_list[0],
                                 ['mount', '-t', 'cgroup', '-o',
                                  'rw,nosuid,nodev,noexec,relatime,seclabel,'
                                  'memory', 'cgroup', '/dev/tstm'],
                                 sudo=True)
        if result['rc'] != 0:
            self.du.run_cmd(self.hosts_list[0], ['rmdir', '/dev/tstm'],
                            sudo=True)
            self.skipTest('not able to mount /dev/tstm')
        # ...and a second 'cpuset' hierarchy at /dev/tstc; undo earlier
        # mounts on each skip path.
        dir_created = self.du.mkdir(hostname=self.hosts_list[0],
                                    path='/dev/tstc', mode=0o0755,
                                    sudo=True)
        if not dir_created:
            self.du.run_cmd(self.hosts_list[0], ['umount', '/dev/tstm'],
                            sudo=True)
            self.du.run_cmd(self.hosts_list[0], ['rmdir', '/dev/tstm'],
                            sudo=True)
            self.skipTest('not able to create /dev/tstc')
        result = self.du.run_cmd(self.hosts_list[0],
                                 ['mount', '-t', 'cgroup', '-o',
                                  'rw,nosuid,nodev,noexec,relatime,seclabel,'
                                  'cpuset', 'cgroup', '/dev/tstc'],
                                 sudo=True)
        if result['rc'] != 0:
            self.du.run_cmd(self.hosts_list[0], ['umount', '/dev/tstm'],
                            sudo=True)
            self.du.run_cmd(self.hosts_list[0], ['rmdir', '/dev/tstm'],
                            sudo=True)
            self.du.run_cmd(self.hosts_list[0], ['rmdir', '/dev/tstc'],
                            sudo=True)
            self.skipTest('not able to mount /dev/tstc')
        # sleep 2s: make sure no old log lines will match 'begin' time
        time.sleep(2)
        begin = int(time.time())
        # sleep 2s to allow for small time differences and rounding errors
        time.sleep(2)
        a = {'Resource_List.select': "1:ncpus=1:host=%s"
             % self.hosts_list[0]}
        j = Job(TEST_USER, attrs=a)
        j.create_script(self.sleep600_job)
        jid = self.server.submit(j)
        a = {'job_state': 'R'}
        self.server.expect(JOB, a, jid)
        # Record failures but keep going, so the extra mounts are always
        # cleaned up before the final assertion.
        failure = False
        try:
            self.moms_list[0].log_match(msg='create_job: Creating directory '
                                            '/sys/fs/cgroup/cpuset/'
                                            'pbs_jobs.service/jobid/%s' % jid,
                                        n='ALL', starttime=begin,
                                        max_attempts=1)
        except Exception:
            failure = True
        try:
            self.moms_list[0].log_match(msg='create_job: Creating directory '
                                            '/dev/tstm/'
                                            'pbs_jobs.service/jobid/%s' % jid,
                                        n='ALL', starttime=begin,
                                        max_attempts=1)
        except Exception:
            failure = True
        self.du.run_cmd(self.hosts_list[0], ['umount', '/dev/tstm'],
                        sudo=True)
        self.du.run_cmd(self.hosts_list[0], ['rmdir', '/dev/tstm'],
                        sudo=True)
        self.du.run_cmd(self.hosts_list[0], ['umount', '/dev/tstc'],
                        sudo=True)
        self.du.run_cmd(self.hosts_list[0], ['rmdir', '/dev/tstc'],
                        sudo=True)
        self.assertFalse(failure,
                         'Did not find correct paths for created cgroup dirs')
    def cleanup_frozen_jobs(self, host):
        # Cleanup frozen jobs
        # Thaw ALL freezers found
        # If directory starts with a number (i.e. a job)
        # kill processes in the freezers and remove them
        if 'freezer' in self.paths[host]:
            # Find freezers to thaw
            self.logger.info('Cleaning up frozen jobs ****')
            fdir = self.paths[host]['freezer']
            # freezer.state files may sit 1, 2 or 3 levels below the mount
            freezer_states = \
                glob.glob(os.path.join(fdir, '*', '*', '*', 'freezer.state'))
            freezer_states += \
                glob.glob(os.path.join(fdir, '*', '*', 'freezer.state'))
            freezer_states += \
                glob.glob(os.path.join(fdir, '*', 'freezer.state'))
            self.logger.info('*** found freezer states %s'
                             % str(freezer_states))
            for freezer_state in freezer_states:
                # thaw the freezer
                self.logger.info('Thawing ' + freezer_state)
                state = 'THAWED'
                fn = self.du.create_temp_file(body=state)
                self.du.run_copy(hosts=host, src=fn, dest=freezer_state,
                                 sudo=True, uid='root', gid='root',
                                 mode=0o644)
                # Confirm it's thawed
                for count in range(30):
                    ret = self.du.cat(hostname=host, filename=freezer_state,
                                      sudo=True)
                    if ret['rc'] != 0:
                        self.logger.info("Cannot confirm freezer state"
                                         "sleeping 30 seconds instead")
                        time.sleep(30)
                        break
                    if ret['out'][0] == 'THAWED':
                        self.logger.info("freezer processes reported as"
                                         " THAWED")
                        break
                    else:
                        self.logger.info("freezer state reported as "
                                         + ret['out'][0])
                        time.sleep(1)
                # Job freezers are named after the job id (leading number)
                freezer_basename = os.path.basename(
                    os.path.dirname(freezer_state))
                jobid = None
                try:
                    jobid = int(freezer_basename.split('.')[0])
                except Exception:
                    # not a job directory
                    pass
                if jobid is not None:
                    self.logger.info(
                        "Apparently found job freezer for job %s"
                        % freezer_basename)
                    freezer_tasks = os.path.join(
                        os.path.dirname(freezer_state), "tasks")
                    # Kill tasks before trying to rmdir freezer
                    ret = self.du.cat(hostname=host, filename=freezer_tasks,
                                      sudo=True)
                    if ret['rc'] == 0:
                        for taskstr in ret['out']:
                            self.logger.info("trying to kill %s on %s"
                                             % (taskstr,
                                                self.hosts_list[0]))
                            self.du.run_cmd(host,
                                            ['kill', '-9'] + [taskstr],
                                            sudo=True)
                    # Poll until the tasks file is empty (rmdir fails on a
                    # non-empty cgroup)
                    for count in range(30):
                        ret = self.du.cat(hostname=host,
                                          filename=freezer_tasks,
                                          sudo=True)
                        if ret['rc'] != 0:
                            self.logger.info("Cannot confirm freezer tasks; "
                                             "sleeping 30 seconds instead")
                            time.sleep(30)
                            break
                        if ret['out'] == [] or ret['out'][0] == '':
                            self.logger.info("Processes in thawed freezer"
                                             " are gone")
                            break
                        else:
                            self.logger.info("tasks still in thawed freezer: "
                                             + str(ret['out']))
                            time.sleep(1)
                    cmd = ["rmdir", os.path.dirname(freezer_state)]
                    self.logger.info("Executing %s" % ' '.join(cmd))
                    self.du.run_cmd(hosts=host, cmd=cmd, sudo=True)

    def tearDown(self):
        # Standard PTL teardown first (deletes jobs, restores defaults)
        TestFunctional.tearDown(self)
        # cpuset MoMs manage their own cgroups; skip the mom-side checks
        mom_checks = True
        if self.moms_list[0].is_cpuset_mom():
            mom_checks = False
        self.load_default_config(mom_checks=mom_checks)
        if not self.iscray:
            self.remove_vntype()
        events = ['execjob_begin', 'execjob_launch', 'execjob_attach',
                  'execjob_epilogue', 'execjob_end', 'exechost_startup',
                  'exechost_periodic', 'execjob_resize', 'execjob_abort']
        # Disable the cgroups hook
        conf = {'enabled': 'False', 'freq': 10, 'event': events}
        self.server.manager(MGR_CMD_SET, HOOK, conf, self.hook_name)
        # Cleanup any temp file created
        self.logger.info('Deleting temporary files %s' % self.tempfile)
        self.du.rm(hostname=self.serverA, path=self.tempfile, force=True,
                   recursive=True, sudo=True)
        # Remove leftover cgroup state on every execution host
        for host in self.hosts_list:
            self.cleanup_frozen_jobs(host)
            self.cleanup_cgroup_subsys(host)
test/tests/functional/pbs_check_job_attrib.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. 
from tests.functional import * class TestCheckJobAttrib(TestFunctional): """ This testsuite is to validate job attributes and values """ def test_exec_vnode_after_job_rerun(self): """ Test unsetting of exec_vnode of a job which got requeued after stage-in and make sure stage-in files are cleaned up. """ hook_name = "momhook" hook_body = "import pbs\npbs.event().reject('my custom message')\n" a = {'event': 'execjob_begin', 'enabled': 'True'} self.server.create_import_hook(hook_name, a, hook_body) self.server.log_match(".*successfully sent hook file.*" + hook_name + ".PY" + ".*", regexp=True, max_attempts=100, interval=5) storage_info = {} starttime = int(time.time()) stagein_path = self.mom.create_and_format_stagein_path( storage_info, asuser=str(TEST_USER)) a = {ATTR_stagein: stagein_path} j = Job(TEST_USER, a) jid = self.server.submit(j) self.server.expect(JOB, 'exec_vnode', id=jid, op=UNSET) # make scheduling off to avoid any race conditions # otherwise scheduler tries to run job till it reached H state self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'}) self.server.expect(JOB, {'run_count': (GT, 0)}, id=jid) self.server.log_match('my custom message', starttime=starttime) path = stagein_path.split("@") msg = "Staged in file not cleaned" self.assertFalse(self.mom.isfile(path[0]), msg) ================================================ FILE: test/tests/functional/pbs_checkpoint.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. 
# # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. from ptl.utils.pbs_crayutils import CrayUtils from tests.functional import * class TestCheckpoint(TestFunctional): """ This test suite targets Checkpoint functionality. 
""" abort_file = '' cu = CrayUtils() def setUp(self): TestFunctional.setUp(self) a = {'job_history_enable': 'True'} self.server.manager(MGR_CMD_SET, SERVER, a) abort_script = """#!/bin/bash kill $1 exit 0 """ self.abort_file = self.mom.add_checkpoint_abort_script( body=abort_script) self.platform = self.du.get_platform() if self.platform != 'cray' and self.platform != 'craysim': self.attrs = {ATTR_l + '.select': '1:ncpus=1', ATTR_l + '.place': 'excl'} else: nv = self.cu.num_compute_vnodes(self.server) self.assertNotEqual(nv, 0, "No cray_compute vnodes are present.") self.attrs = {ATTR_l + '.select': '%d:ncpus=1' % nv, ATTR_l + '.place': 'scatter'} def verify_checkpoint_abort(self, jid, stime): """ Verify that checkpoint and abort happened. """ self.ck_dir = os.path.join(self.mom.pbs_conf['PBS_HOME'], 'checkpoint', jid + '.CK') self.assertTrue(self.du.isdir(hostname=self.mom.hostname, path=self.ck_dir, runas=ROOT_USER), msg="Checkpoint directory %s not found" % self.ck_dir) _msg1 = "%s;req_holdjob: Checkpoint initiated." % jid self.mom.log_match(_msg1, starttime=stime) _msg2 = "%s;checkpoint_abort script %s: exit code 0" % ( jid, self.abort_file) self.mom.log_match(_msg2, starttime=stime) _msg3 = "%s;checkpointed to %s" % (jid, self.ck_dir) self.mom.log_match(_msg3, starttime=stime) _msg4 = "%s;task 00000001 terminated" % jid self.mom.log_match(_msg4, starttime=stime) def start_server_hot(self): """ Start the server with the hot option. """ pbs_exec = self.server.pbs_conf['PBS_EXEC'] svrname = self.server.pbs_server_name pbs_server_hot = [os.path.join( pbs_exec, 'sbin', 'pbs_server'), '-t', 'hot'] self.du.run_cmd(svrname, cmd=pbs_server_hot, sudo=True) self.assertTrue(self.server.isUp()) def checkpoint_abort_with_qterm_restart_hot(self, qterm_type): """ Checkpointing with qterm -t , hot server restart. 
""" j1 = Job(TEST_USER, self.attrs) j1.set_sleep_time(20) jid1 = self.server.submit(j1) self.server.expect(JOB, {'job_state': 'R'}, id=jid1) start_time = int(time.time()) self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'}) self.server.qterm(manner=qterm_type) self.verify_checkpoint_abort(jid1, start_time) self.start_server_hot() self.assertTrue(self.server.isUp()) msg = "%s;Requeueing job, substate: 10 Requeued in queue: workq" % jid1 self.server.log_match(msg, starttime=start_time) # wait for the server to hot start the job self.server.expect(JOB, {'job_state': 'R'}, id=jid1, interval=2) self.server.expect(JOB, 'exec_vnode', id=jid1, op=SET) self.assertFalse(os.path.exists(self.ck_dir), msg=self.ck_dir + " still exists") self.server.expect(JOB, {'job_state': 'F'}, jid1, extend='x', interval=5) def test_checkpoint_abort_with_preempt(self): """ This test verifies that checkpoint_abort works as expected when a job is preempted via checkpoint. It does so by submitting a job in express queue which preempts a running job in the default queue. """ self.server.manager(MGR_CMD_SET, SCHED, {'preempt_order': 'C'}, runas=ROOT_USER) a = {'queue_type': 'execution', 'started': 'True', 'enabled': 'True', 'Priority': 200} self.server.manager(MGR_CMD_CREATE, QUEUE, a, "expressq") j1 = Job(TEST_USER, self.attrs) jid1 = self.server.submit(j1) self.server.expect(JOB, {'job_state': 'R'}, id=jid1) self.attrs['queue'] = 'expressq' j2 = Job(TEST_USER, self.attrs) j2.set_sleep_time(20) start_time = int(time.time()) jid2 = self.server.submit(j2) self.server.expect(JOB, {'job_state': 'R'}, id=jid2) self.server.expect(JOB, {'job_state': 'Q'}, id=jid1) self.verify_checkpoint_abort(jid1, start_time) self.server.expect(JOB, {'job_state': 'F'}, jid2, extend='x', interval=5) self.server.expect(JOB, {'job_state': 'F'}, jid1, extend='x', interval=5) def test_checkpoint_abort_with_qhold(self): """ This test uses qhold for checkpointing. 
""" j1 = Job(TEST_USER, self.attrs) jid1 = self.server.submit(j1) self.server.expect(JOB, {'job_state': 'R'}, id=jid1) start_time = int(time.time()) self.server.holdjob(jid1) self.server.expect(JOB, {'job_state': 'H'}, id=jid1) self.verify_checkpoint_abort(jid1, start_time) def test_checkpoint_abort_with_qterm_immediate_restart_hot(self): """ This tests checkpointing with qterm -t immediate, hot server restart. """ self.checkpoint_abort_with_qterm_restart_hot("immediate") def test_checkpoint_abort_with_qterm_delay_restart_hot(self): """ This tests checkpointing with qterm -t delay, hot server restart. """ self.checkpoint_abort_with_qterm_restart_hot("delay") def tearDown(self): TestFunctional.tearDown(self) self.du.rm(hostname=self.mom.hostname, path=self.abort_file, sudo=True, force=True) ================================================ FILE: test/tests/functional/pbs_client_response.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. 
For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. from tests.functional import * import time class TestClientResponse(TestFunctional): """ Test cases to check number of response getting from client command in 1 second """ def test_qstat_reponse(self): """ Test to check how many qstat can be done in 1 second. """ count = 0 t = time.time() + 1 qstat_cmd = os.path.join(self.server.pbs_conf["PBS_EXEC"], "bin", "qstat") while time.time() < t: ret = self.du.run_cmd(self.server.hostname, qstat_cmd) self.assertTrue('rc', 0) count += 1 self.logger.info("Number qstat response:%d", count) ================================================ FILE: test/tests/functional/pbs_complete_running_parent_job.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. 
# # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. from tests.functional import * class Test_complete_running_parent_job(TestFunctional): """ This test suite is for testing the complete_running() procedure is processed for parent array job. """ def setUp(self): """ Set eligible_time_enable = True. This is due to test the issue in PP-1211 """ TestFunctional.setUp(self) self.server.manager(MGR_CMD_SET, SERVER, { 'eligible_time_enable': True}) def test_parent_job_S_accounting_record(self): """ Submit an array job and test whether the 'S' accounting record is created for parent job. """ J = Job(TEST_USER, attrs={ATTR_J: '1-2'}) J.set_sleep_time(1) parent_jid = self.server.submit(J) self.server.accounting_match(msg='.*;S;' + re.escape(parent_jid) + ".*", id=parent_jid, regexp=True) def test_parent_job_comment_and_stime(self): """ Submit an array job and test whether the comment and stime is set for parent job. 
""" J = Job(TEST_USER, attrs={ATTR_J: '1-2'}) J.set_sleep_time(10) parent_jid = self.server.submit(J) attr = { ATTR_comment: (MATCH_RE, 'Job Array Began at .*'), ATTR_stime: (MATCH_RE, '.+') } self.server.expect(JOB, attr, id=parent_jid, attrop=PTL_AND) ================================================ FILE: test/tests/functional/pbs_conf_resv_stale_vnode.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. 
# # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. from tests.functional import * class TestResvStaleVnode(TestFunctional): """ Test that the scheduler won't confirm a reservation on stale vnode and make sure reservations that have nodes that have gone stale get degreaded """ def setUp(self): TestFunctional.setUp(self) # Create 3 vnodes named different things in different vnodedef files # This allows us to delete a vnodedef file and make that node stale self.mom.add_config(conf={'$vnodedef_additive': 'False'}) a = {'resources_available.ncpus': 1, 'priority': 100} self.mom.create_vnodes(a, 1, fname='nat', restart=False, usenatvnode=True, expect=False, vname='foo') a['priority'] = 10 self.mom.create_vnodes(a, 1, fname='fname1', delall=False, restart=False, additive=True, expect=False, vname='vn') a['priority'] = 1 self.mom.create_vnodes(a, 1, fname='fname2', delall=False, additive=True, expect=False, vname='vnode') self.scheduler.set_sched_config({'node_sort_key': '\"sort_priority HIGH\"'}) def test_conf_resv_stale_vnode(self): """ Test that the scheduler won't confirm a reservation on a stale node. 
""" # Ensure the hostsets aren't used by associating a node to a queue a = {'queue_type': 'Execution', 'enabled': 'True', 'started': 'True'} self.server.manager(MGR_CMD_CREATE, QUEUE, a, id='workq2') self.server.manager(MGR_CMD_SET, NODE, {'queue': 'workq2'}, id=self.mom.shortname) # Submit a job that will run on our stale vnode a = {'Resource_List.select': '1:vnode=vn[0]', 'Resource_List.walltime': 3600} J = Job(TEST_USER, attrs=a) jid = self.server.submit(J) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid) self.mom.delete_vnode_defs(vdefname='fname1') self.mom.signal('-HUP') self.server.expect(NODE, {'state': (MATCH_RE, 'Stale')}, id='vn[0]') now = int(time.time()) a = {'reserve_start': now + 5400, 'reserve_end': now + 7200} R = Reservation(TEST_USER, a) rid = self.server.submit(R) # Reservation should be confirmed on vnode[0] since vn[0] is Stale a = {'resv_nodes': '(vnode[0]:ncpus=1)'} a2 = {'reserve_state': (MATCH_RE, 'RESV_CONFIRMED|2')} self.server.expect(RESV, a, id=rid) self.server.expect(RESV, a2, id=rid) def test_stale_degraded(self): """ Test that a reservation goes into the degraded state when one of its vnodes go stale """ self.server.expect(NODE, {'state=free': 3}) now = int(time.time()) a = {'Resource_List.select': '3:ncpus=1', 'Resource_List.place': 'vscatter', 'reserve_start': now + 3600, 'reserve_end': now + 7200} R = Reservation(TEST_USER, attrs=a) rid = self.server.submit(R) a = {'reserve_state': (MATCH_RE, 'RESV_CONFIRMED|2')} self.server.expect(RESV, a, id=rid) self.mom.delete_vnode_defs(vdefname='fname1') self.mom.signal('-HUP') self.server.expect(NODE, {'state': (MATCH_RE, 'Stale')}, id='vn[0]') a = {'reserve_state': (MATCH_RE, 'RESV_DEGRADED|10')} self.server.expect(RESV, a, id=rid) ================================================ FILE: test/tests/functional/pbs_config.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. 
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# PBS Pro is commercially licensed software that shares a common core with
# the OpenPBS software. For a copy of the commercial license terms and
# conditions, go to: (http://www.pbspro.com/agreement.html) or contact the
# Altair Legal Department.
#
# Altair's dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of OpenPBS and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair's trademarks, including but not limited to "PBS™",
# "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is
# subject to Altair's trademark licensing policies.
import tarfile from tests.functional import * class TestPBSConfig(TestFunctional): """ Test cases for pbs_config tool """ snapdirs = [] snaptars = [] def test_config_for_snapshot(self): """ Test pbs_config's --snap option """ pbs_snapshot_path = os.path.join( self.server.pbs_conf["PBS_EXEC"], "sbin", "pbs_snapshot") if not os.path.isfile(pbs_snapshot_path): self.skipTest("pbs_snapshot not found") pbs_config_path = os.path.join( self.server.pbs_conf["PBS_EXEC"], "unsupported", "pbs_config") if not os.path.isfile(pbs_config_path): self.skipTest("pbs_config not found") # Create 4 vnodes a = {ATTR_rescavail + ".ncpus": 2} self.mom.create_vnodes(attrib=a, num=4, usenatvnode=True) self.server.expect(VNODE, {'state=free': 4}, count=True) # Create a queue a = {'queue_type': 'execution', 'started': 'True', 'enabled': 'True', 'Priority': 200} self.server.manager(MGR_CMD_CREATE, QUEUE, a, id="expressq") # Set preempt_order to 'R' a = {"preempt_order": "R"} self.server.manager(MGR_CMD_SET, SCHED, a, id="default") # Set sched_config 'smp_cluster_dist' to 'round_robin' self.scheds["default"].set_sched_config( {"smp_cluster_dist": "round_robin"}) # Now that we have a custom configuration, take a snapshot outdir = pwd.getpwnam(self.du.get_current_user()).pw_dir snap_cmd = [pbs_snapshot_path, "-o " + outdir, "--with-sudo"] ret = self.du.run_cmd(cmd=snap_cmd, logerr=False, as_script=True) self.assertEqual(ret["rc"], 0, "pbs_snapshot command failed") snap_out = ret['out'][0] output_tar = snap_out.split(":")[1] output_tar = output_tar.strip() # Check that the output tarball was created self.assertTrue(os.path.isfile(output_tar), "Error capturing snapshot:\n" + str(ret)) self.snaptars.append(output_tar) # Unwrap the tarball tar = tarfile.open(output_tar) tar.extractall(path=outdir) tar.close() # snapshot directory name = .tgz[:-4] snap_dir = output_tar[:-4] self.assertTrue(os.path.isdir(snap_dir)) self.snapdirs.append(snap_dir) # Let's revert the system back to default now 
TestFunctional.setUp(self) # Now, use pbs_config --snap to build the system captured # previously in the snapshot config_cmd = [pbs_config_path, "--snap=" + snap_dir] self.du.run_cmd(cmd=config_cmd, sudo=True, logerr=False) # Verify that there are 4 vnodes, expressq, preempt_order=R and # smp_cluster_dist=round_robin self.server.expect(VNODE, {'state=free': 4}, count=True) self.server.expect(QUEUE, {"Priority": 200}, id="expressq") self.server.expect(SCHED, {"preempt_order": "R"}, id="default") self.scheds["default"].parse_sched_config() self.assertEqual( self.scheds["default"].sched_config["smp_cluster_dist"], "round_robin", "pbs_config didn't load sched_config correctly") def tearDown(self): # Cleanup snapshot dirs and tars for snap_dir in self.snapdirs: self.du.rm(path=snap_dir, recursive=True, force=True) for snap_tar in self.snaptars: self.du.rm(path=snap_tar, force=True) ================================================ FILE: test/tests/functional/pbs_cpuset.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
# # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. from tests.functional import * @requirements(num_moms=2) class TestPbsCpuset(TestFunctional): """ This testsuite covers various features using cgroup cpuset systems - Reliable Job Startup Feature - Node Rampdown Feature """ def check_stageout_file_size(self): """ This function will check that at least 1gb of test.img file which is to be stagedout is created within 10 seconds. 
""" fpath = os.path.join(TEST_USER.home, "test.img") cmd = ['stat', '-c', '%s', fpath] fsize = 0 for i in range(11): rc = self.du.run_cmd(hosts=self.h0, cmd=cmd, runas=TEST_USER) if rc['rc'] == 0 and len(rc['out']) == 1: try: fsize = int(rc['out'][0]) except Exception: pass # 1073741824 == 1Gb if fsize > 1073741824: break else: time.sleep(1) if fsize <= 1073741824: self.fail("Failed to create 1gb file at %s" % fpath) def setUp(self): TestFunctional.setUp(self) # skip if there are no cpuset systems in the test cluster no_csetmom = True for mom in self.moms.values(): if mom.is_cpuset_mom(): no_csetmom = False if no_csetmom: self.skipTest("Skip on cluster without cgroup cpuset system.") # Various host names self.h0 = self.moms.values()[0].shortname self.h1 = self.moms.values()[1].shortname self.hostA = socket.getfqdn(self.h0) self.hostB = socket.getfqdn(self.h1) # Various node names. First mom may or may not be a cpuset system. try: self.n0 = self.server.status( NODE, id='%s[0]' % (self.h0))[0]['id'] except PbsStatusError: self.n0 = self.h0 self.n1 = self.h1 self.n2 = '%s[0]' % (self.n1) self.n3 = '%s[1]' % (self.n1) # Skip if there are less than four vnodes. 
There should be # three from cpuset system (natural + two NUMA vnodes) nodeinfo = self.server.status(NODE) if len(nodeinfo) < 4: self.skipTest("Not enough vnodes to run the test.") # skip if second mom has less than two NUMA vnodes try: self.server.status(NODE, id=self.n3) except PbsStatusError: self.skipTest("vnode %s doesn't exist on pbs server" % (self.n3)) # skip if vnodes are not in free state for node in nodeinfo: if node['state'] != 'free': self.skipTest("Not all the vnodes are in free state") self.pbs_release_nodes_cmd = os.path.join( self.server.pbs_conf['PBS_EXEC'], 'bin', 'pbs_release_nodes') # number of resource ncpus to request initially ncpus = self.server.status(NODE, 'resources_available.ncpus', id=self.n3)[0] # request a partial amount of ncpus in self.n3 self.ncpus2 = int(ncpus['resources_available.ncpus']) / 2 # cgroup cpuset path on second node cmd = ['grep cgroup', '/proc/mounts', '|', 'grep cpuset', '|', 'grep -v', '/dev/cpuset'] ret = self.server.du.run_cmd(self.n1, cmd, runas=TEST_USER) self.cset_path = ret['out'][0].split()[1] # launch hook self.launch_hook_body = """ import pbs import time e=pbs.event() pbs.logmsg(pbs.LOG_DEBUG, "Executing launch") # print out the vnode_list[] values for vn in e.vnode_list: v = e.vnode_list[vn] pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]") # print out the vnode_list_fail[] values: for vn in e.vnode_list_fail: v = e.vnode_list_fail[vn] pbs.logjobmsg(e.job.id, "launch: found vnode_list_fail[" + v.name + "]") if e.job.in_ms_mom(): pj = e.job.release_nodes(keep_select="ncpus=1:mem=1gb") if pj is None: e.job.Hold_Types = pbs.hold_types("s") e.job.rerun() e.reject("unsuccessful at LAUNCH") pbs.logmsg(pbs.LOG_DEBUG, "Sleeping for 20sec") time.sleep(20) """ self.script = {} self.job1_select = "ncpus=1:mem=1gb+" + \ "ncpus=%d:mem=1gb:vnode=%s+" % (self.ncpus2, self.n2) + \ "ncpus=%d:mem=1gb:vnode=%s" % (self.ncpus2, self.n3) self.job1_place = "vscatter" # expected values upon successful job 
submission self.job1_schedselect = "1:ncpus=1:mem=1gb+" + \ "1:ncpus=%d:mem=1gb:vnode=%s+" % (self.ncpus2, self.n2) + \ "1:ncpus=%d:mem=1gb:vnode=%s" % (self.ncpus2, self.n3) self.job1_exec_host = "%s/0+%s/0*%d+%s/1*%d" % ( self.h0, self.h1, self.ncpus2, self.n1, self.ncpus2) self.job1_exec_vnode = "(%s:ncpus=1:mem=1048576kb)+" % (self.n0,) + \ "(%s:ncpus=%d:mem=1048576kb)+" % (self.n2, self.ncpus2) + \ "(%s:ncpus=%d:mem=1048576kb)" % (self.n3, self.ncpus2) # expected values after release of vnode of self.n3 self.job1_schedsel1 = "1:ncpus=1:mem=1048576kb+" + \ "1:ncpus=%d:mem=1048576kb:vnode=%s" % (self.ncpus2, self.n2) self.job1_exec_host1 = "%s/0+%s/0*%d" % (self.h0, self.h1, self.ncpus2) self.job1_exec_vnode1 = "(%s:ncpus=1:mem=1048576kb)+" % (self.n0,) + \ "(%s:ncpus=%d:mem=1048576kb)" % (self.n2, self.ncpus2) # expected values during lengthy stageout self.job1_newsel = "1:ncpus=1:mem=1048576kb" self.job1_new_exec_host = "%s/0" % self.h0 self.job1_new_exec_vnode = "(%s:ncpus=1:mem=1048576kb)" % self.n0 # values to use when matching accounting logs self.job1_exec_host_esc = self.job1_exec_host.replace( "*", r"\*").replace("[", r"\[").replace("]", r"\]").replace( "+", r"\+") self.job1_exec_vnode_esc = self.job1_exec_vnode.replace( "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace( ")", r"\)").replace("+", r"\+") self.job1_sel_esc = self.job1_select.replace( "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace( ")", r"\)").replace("+", r"\+") self.job1_new_exec_vnode_esc = self.job1_new_exec_vnode.replace( "[", r"\[").replace("]", r"\]").replace("(", r"\(").replace( ")", r"\)").replace("+", r"\+") def tearDown(self): for host in [self.h0, self.h1]: test_img = os.path.join("/home", "pbsuser", "test.img") self.du.rm(hostname=host, path=test_img, force=True, runas=TEST_USER) TestFunctional.tearDown(self) def test_reliable_job_startup_on_cpuset(self): """ A job is started with two numa nodes and goes in R state. 
        An execjob_launch hook will force job to have only one numa node.
        The released numa node can be used in another job.
        """
        # instantiate execjob_launch hook
        hook_event = "execjob_launch"
        hook_name = "launch"
        a = {'event': hook_event, 'enabled': 'true'}
        stime = time.time()
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)
        # Check mom logs that the launch hook got propagated
        msg = "Hook;launch.PY;copy hook-related file request received"
        self.moms.values()[1].log_match(msg, starttime=stime)
        # Submit job1 that uses second mom's two NUMA nodes, in R state
        a = {ATTR_l + '.select': self.job1_select,
             ATTR_l + '.place': self.job1_place,
             ATTR_W: 'tolerate_node_failures=job_start'}
        j = Job(TEST_USER, attrs=a)
        stime = time.time()
        jid = self.server.submit(j)
        # Check the exec_vnode while in substate 41
        self.server.expect(JOB, {ATTR_substate: '41'}, id=jid)
        self.server.expect(JOB, 'exec_vnode', id=jid, op=SET)
        job_stat = self.server.status(JOB, id=jid)
        execvnode1 = job_stat[0]['exec_vnode']
        self.logger.info("initial exec_vnode: %s" % execvnode1)
        initial_vnodes = execvnode1.split('+')
        # Check the exec_vnode after job is in substate 42
        self.server.expect(JOB, {ATTR_substate: '42'}, offset=20, id=jid)
        self.server.expect(JOB, 'exec_vnode', id=jid, op=SET)
        job_stat = self.server.status(JOB, id=jid)
        execvnode2 = job_stat[0]['exec_vnode']
        self.logger.info("pruned exec_vnode: %s" % execvnode2)
        # Check mom logs for pruned from and pruned to messages
        self.moms.values()[0].log_match("Job;%s;pruned from exec_vnode=%s" % (
            jid, execvnode1), starttime=stime)
        self.moms.values()[0].log_match("Job;%s;pruned to exec_vnode=%s" % (
            jid, execvnode2), starttime=stime)
        # Find out the released vnode: the launch hook pruned the job to a
        # single vnode (execvnode2); whichever initial chunk is not that
        # vnode is the one that was released.
        if initial_vnodes[0] == execvnode2:
            execvnodeB = initial_vnodes[1]
        else:
            execvnodeB = initial_vnodes[0]
        vnodeB = execvnodeB.split(':')[0].split('(')[1]
        self.logger.info("released vnode: %s" % vnodeB)
        # Submit job2 requesting all of the released vnode's cpus, job runs
        a = {ATTR_l + '.select':
             '1:ncpus=%d:mem=1gb:vnode=%s' % (self.ncpus2 * 2, vnodeB)}
        j2 = Job(TEST_USER, attrs=a)
        stime = time.time()
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {ATTR_state: 'R'}, offset=20, id=jid2)
        # Check if vnode for job2 matches released vnode from job1
        self.server.expect(JOB, 'exec_vnode', id=jid2, op=SET)
        job_stat = self.server.status(JOB, id=jid2)
        execvnode3 = job_stat[0]['exec_vnode']
        vnode3 = execvnode3.split(':')[0].split('(')[1]
        self.assertEqual(vnode3, vnodeB)
        self.logger.info("job2 vnode %s is the released vnode %s" % (
            vnode3, vnodeB))

    def test_release_nodes_on_cpuset_sis(self):
        """
        On a cluster where the second mom is a cgroup cpuset system with
        two NUMA nodes, submit a job that will use cpus on both NUMA
        vnodes. The job goes in R state. Use pbs_release_nodes to
        successfully release one of the NUMA vnodes and its resources
        used in the job. Compare the job's cgroup cpuset info before and
        after calling pbs_release_nodes to verify that NUMA node's cpu
        resources were released.
        """
        # Submit a job that uses second mom's two NUMA nodes, in R state
        a = {ATTR_l + '.select': self.job1_select,
             ATTR_l + '.place': self.job1_place}
        j1 = Job(TEST_USER, attrs=a)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.mem': '3gb',
                                 'Resource_List.ncpus': 1 + self.ncpus2 * 2,
                                 'Resource_List.nodect': 3,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode},
                           id=jid1)
        # Check the cpuset before releasing self.n3 from jid1
        cset_file = os.path.join(self.cset_path, 'pbs_jobs.service/jobid',
                                 jid1, 'cpuset.cpus')
        cset_before = self.du.cat(self.n1, cset_file)
        cset_j1_before = cset_before['out']
        self.logger.info("cset_j1_before : %s" % cset_j1_before)
        before_release = time.time()
        # Release a NUMA vnode on second mom using command pbs_release_nodes
        cmd = [self.pbs_release_nodes_cmd, '-j', jid1, self.n3]
        ret = self.server.du.run_cmd(self.server.hostname, cmd,
                                     runas=TEST_USER)
        self.assertEqual(ret['rc'], 0)
        self.server.expect(JOB, {'job_state': 'R',
                                 'Resource_List.ncpus': 1 + self.ncpus2,
                                 'Resource_List.nodect': 2,
                                 'schedselect': self.job1_schedsel1,
                                 'exec_host': self.job1_exec_host1,
                                 'exec_vnode': self.job1_exec_vnode1},
                           id=jid1)
        # Check if sister mom updated its internal nodes table after release
        self.moms.values()[1].log_match('Job;%s;updated nodes info' % jid1,
                                        starttime=before_release - 1)
        # Check the cpuset for the job after releasing self.n3
        cset_after = self.du.cat(self.n1, cset_file)
        cset_j1_after = cset_after['out']
        self.logger.info("cset_j1_after : %s" % cset_j1_after)
        # Compare the before and after cpusets info; they must differ since
        # the released NUMA node's cpus should be gone from the job's cpuset.
        msg = "%s: cpuset cpus remain after release of %s" % (jid1, self.n3)
        self.assertNotEqual(cset_j1_before, cset_j1_after, msg)

    def test_release_nodes_on_stageout_cset(self):
        """
        Submit a job, with -W release_nodes_on_stageout=true as a PBS
        directive in the job script, that will use cpus and mem on two
        NUMA vnodes on the second mom. The job goes in R state.
        The job creates a huge stageout file. When the job is deleted
        the sister NUMA vnodes are released during lengthy stageout and
        only the primary execution host's vnode is left assigned to the
        job.
        """
        # Shell command strings that run fib() via pbs_python; FIB400 runs
        # long enough to keep the job alive during stageout.
        FIB40 = os.path.join(self.server.pbs_conf['PBS_EXEC'], 'bin', '') + \
            'pbs_python -c "exec(\\\"def fib(i):\\n if i < 2:\\n \
return i\\n return fib(i-1) + fib(i-2)\\n\\nprint(fib(40))\\\")"'
        FIB400 = os.path.join(self.server.pbs_conf['PBS_EXEC'], 'bin', '') + \
            'pbs_python -c "exec(\\\"def fib(i):\\n if i < 2:\\n \
return i\\n return fib(i-1) + fib(i-2)\\n\\nprint(fib(400))\\\")"'
        self.script['job1'] = \
            "#PBS -S /bin/bash\n" \
            "#PBS -l select=" + self.job1_select + "\n" + \
            "#PBS -l place=" + self.job1_place + "\n" + \
            "#PBS -W stageout=test.img@%s:test.img\n" % (self.n1,) + \
            "#PBS -W release_nodes_on_stageout=true\n" + \
            "dd if=/dev/zero of=test.img count=1024 bs=2097152\n" + \
            "pbsdsh -n 1 -- %s\n" % (FIB40,) + \
            "pbsdsh -n 2 -- %s\n" % (FIB40,) + \
            "%s\n" % (FIB400,)
        stime = time.time()
        j = Job(TEST_USER)
        j.create_script(self.script['job1'])
        jid = self.server.submit(j)
        self.server.expect(JOB, {'job_state': 'R',
                                 'release_nodes_on_stageout': 'True',
                                 'Resource_List.mem': '3gb',
                                 'Resource_List.ncpus': 1 + self.ncpus2 * 2,
                                 'Resource_List.nodect': 3,
                                 'Resource_List.select': self.job1_select,
                                 'Resource_List.place': self.job1_place,
                                 'schedselect': self.job1_schedselect,
                                 'exec_host': self.job1_exec_host,
                                 'exec_vnode': self.job1_exec_vnode},
                           id=jid)
        # Check various vnode status.
        attr0 = {'state': 'job-busy', 'jobs': jid + '/0',
                 'resources_assigned.ncpus': 1,
                 'resources_assigned.mem': '1048576kb'}
        self.server.expect(VNODE, attr0, id=self.n0)
        # NOTE(review): ncpus is compared as int 0 here but as str '0' in the
        # post-delete check below — confirm both comparisons behave the same.
        attr1 = {'state': 'free', 'resources_assigned.ncpus': 0,
                 'resources_assigned.mem': '0kb'}
        self.server.expect(VNODE, attr1, id=self.n1)
        jobs = ''
        for i in range(0, int(self.ncpus2)):
            jobs += ' %s/%d,' % (jid, i)
        jobs = jobs.strip().strip(',')
        attr2 = {'state': 'free', 'jobs': jobs,
                 'resources_assigned.ncpus': int(self.ncpus2),
                 'resources_assigned.mem': '1048576kb'}
        for vn in [self.n2, self.n3]:
            self.server.expect(VNODE, attr2, id=vn)
        # job's PBS_NODEFILE contents should match exec_host
        pbs_nodefile = os.path.join(self.server.
                                    pbs_conf['PBS_HOME'], 'aux', jid)
        cmd = ['cat', pbs_nodefile]
        ret = self.server.du.run_cmd(self.h0, cmd, sudo=False)
        # NOTE(review): due to operator precedence this asserts
        # bool(self.hostA) and (self.hostB in ret['out']); it does NOT check
        # that hostA is in the nodefile. Likely intended:
        # self.hostA in ret['out'] and self.hostB in ret['out'].
        self.assertTrue(self.hostA and self.hostB in ret['out'])
        # The job will write out enough file size to have a lengthy stageout
        self.check_stageout_file_size()
        # Deleting the job will trigger the stageout process
        # at which time the sister node is automatically released
        # due to release_nodes_stageout=true set
        self.server.delete(jid)
        # Verify remaining job resources.
        self.server.expect(JOB, {'job_state': 'E',
                                 'Resource_List.mem': '1gb',
                                 'Resource_List.ncpus': 1,
                                 'Resource_List.select': self.job1_newsel,
                                 'Resource_List.place': self.job1_place,
                                 'Resource_List.nodect': 1,
                                 'schedselect': self.job1_newsel,
                                 'exec_host': self.job1_new_exec_host,
                                 'exec_vnode': self.job1_new_exec_vnode},
                           id=jid)
        # Check various vnode status
        attr0 = {'state': 'job-busy', 'jobs': jid + '/0',
                 'resources_assigned.ncpus': 1,
                 'resources_assigned.mem': '1048576kb'}
        self.server.expect(VNODE, attr0, id=self.n0)
        attr1 = {'state': 'free', 'resources_assigned.ncpus': '0',
                 'resources_assigned.mem': '0kb'}
        for vn in [self.n1, self.n2, self.n3]:
            self.server.expect(VNODE, attr1, id=vn)
        # job's PBS_NODEFILE contents should match exec_host
        ret = self.server.du.run_cmd(self.h0, cmd, sudo=False)
        self.assertTrue(self.hostA in ret['out'])
        self.assertFalse(self.hostB in ret['out'])
        # Verify mom_logs
        self.moms.values()[0].log_match(
            "Job;%s;%s.+cput=.+ mem=.+" % (jid, self.n1), n=10,
            regexp=True)
        self.moms.values()[1].log_match(
            "Job;%s;DELETE_JOB2 received" % (jid,), n=20)
        # Check account update ('u') record
        msg0 = ".*%s;%s.*exec_host=%s" % ('u', jid, self.job1_exec_host_esc)
        msg1 = ".*exec_vnode=%s" % self.job1_exec_vnode_esc
        msg2 = r".*Resource_List\.mem=%s" % '3gb'
        msg3 = r".*Resource_List\.ncpus=%d" % 9
        msg4 = r".*Resource_List\.place=%s" % self.job1_place
        msg5 = r".*Resource_List\.select=%s.*" % self.job1_sel_esc
        msg = msg0 + msg1 + msg2 + msg3 + msg4 + msg5
        self.server.accounting_match(msg=msg, regexp=True, n="ALL",
                                     starttime=stime)
        # Check to make sure 'c' (next) record got generated
        msg0 = ".*%s;%s.*exec_host=%s" % ('c', jid, self.job1_new_exec_host)
        msg1 = ".*exec_vnode=%s" % self.job1_new_exec_vnode_esc
        msg2 = r".*Resource_List\.mem=%s" % '1048576kb'
        msg3 = r".*Resource_List\.ncpus=%d" % 1
        msg4 = r".*Resource_List\.place=%s" % self.job1_place
        msg5 = r".*Resource_List\.select=%s.*" % self.job1_newsel
        msg = msg0 + msg1 + msg2 + msg3 + msg4 + msg5
self.server.accounting_match(msg=msg, regexp=True, n="ALL", starttime=stime) ================================================ FILE: test/tests/functional/pbs_cray_check_node_exclusivity.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. 
from tests.functional import *


@tags('cray', 'reservation')
class TestCheckNodeExclusivity(TestFunctional):
    """
    Test suite for reservation.
    This test Suite checks the exclusivity of node when a reservation
    asks for it.
    Adapted for Cray Configuration
    """
    # Cached attributes of the first cray_compute vnode found; populated by
    # get_vnode_ncpus_value().  Note: values come straight from status() and
    # are therefore strings, not ints.
    ncpus = None
    vnode = None

    def setUp(self):
        # These tests only make sense on a Cray platform.
        if not self.du.get_platform().startswith('cray'):
            self.skipTest("Test suite only meant to run on a Cray")
        self.script = []
        self.script += ['echo Hello World\n']
        self.script += ['aprun -b -B /bin/sleep 10']
        TestFunctional.setUp(self)

    def submit_and_confirm_resv(self, a=None, index=None):
        """
        This is common function to submit reservation and
        verify reservation confirmed
        """
        r = Reservation(TEST_USER, attrs=a)
        rid = self.server.submit(r)
        a = {'reserve_state': (MATCH_RE, 'RESV_CONFIRMED|2')}
        if index is not None:
            a['reserve_index'] = index
        self.server.expect(RESV, a, id=rid)
        return rid

    def get_vnode_ncpus_value(self):
        # Record ncpus and vnode name of the first cray_compute vnode found.
        all_nodes = self.server.status(NODE)
        for n in all_nodes:
            if n['resources_available.vntype'] == 'cray_compute':
                self.ncpus = n['resources_available.ncpus']
                self.vnode = n['resources_available.vnode']
                break

    def test_node_state_with_adavance_resv(self):
        """
        Test node state will change when reservation asks for
        exclusivity.
        """
        # Submit a reservation with place=excl
        start_time = time.time()
        now = int(start_time)
        a = {'Resource_List.select': '1:ncpus=1:vntype=cray_compute',
             'Resource_List.place': 'excl',
             'reserve_start': now + 30,
             'reserve_end': now + 60}
        rid = self.submit_and_confirm_resv(a)
        rid_q = rid.split('.')[0]
        self.server.status(RESV, 'resv_nodes', id=rid)
        resv_node = self.server.reservations[rid].get_vnodes()[0]
        self.server.expect(NODE, {'state': 'free'}, id=resv_node)
        # Node must still be free after a server restart, while the
        # reservation has not yet started.
        self.server.restart()
        self.server.expect(NODE, {'state': 'free'}, id=resv_node)
        self.logger.info('Waiting 20s for reservation to start')
        a = {'reserve_state': (MATCH_RE, "RESV_RUNNING|5")}
        self.server.expect(RESV, a, id=rid, offset=20)
        self.server.expect(NODE, {'state': 'resv-exclusive'}, id=resv_node)
        # Wait for reservation to delete from server
        msg = "Que;" + rid_q + ";deleted at request of pbs_server@"
        self.server.log_match(msg, starttime=start_time, interval=10)
        self.server.expect(NODE, {'state': 'free'}, id=resv_node)

    def test_node_state_with_standing_resv(self):
        """
        Test node state will change when reservation asks for
        exclusivity.
        """
        if 'PBS_TZID' in self.conf:
            tzone = self.conf['PBS_TZID']
        elif 'PBS_TZID' in os.environ:
            tzone = os.environ['PBS_TZID']
        else:
            self.logger.info('Missing timezone, using America/Los_Angeles')
            tzone = 'America/Los_Angeles'
        # Submit a standing reservation to occur every other minute for a
        # total count of 2
        start = time.time() + 20
        now = start + 20
        start = int(start)
        end = int(now)
        a = {'Resource_List.select': '1:ncpus=1:vntype=cray_compute',
             'Resource_List.place': 'excl',
             ATTR_resv_rrule: 'FREQ=MINUTELY;COUNT=2',
             ATTR_resv_timezone: tzone,
             'reserve_start': start,
             'reserve_end': end,
             }
        rid = self.submit_and_confirm_resv(a, 1)
        rid_q = rid.split(".")[0]
        self.server.status(RESV, 'resv_nodes', id=rid)
        resv_node = self.server.reservations[rid].get_vnodes()[0]
        self.server.expect(NODE, {'state': 'free'}, id=resv_node)
        self.logger.info('Waiting 10s for reservation to start')
        a = {'reserve_state': (MATCH_RE, "RESV_RUNNING|5"),
             'reserve_index': 1}
        self.server.expect(RESV, a, id=rid, offset=10)
        self.server.expect(NODE, {'state': 'resv-exclusive'}, id=resv_node)
        # Wait for standing reservation first instance to finish
        self.logger.info(
            'Waiting 20 sec for second instance of reservation to start')
        exp_attr = {'reserve_state': (MATCH_RE, "RESV_CONFIRMED|2"),
                    'reserve_index': 2}
        self.server.expect(RESV, exp_attr, id=rid, offset=20)
        # Node state of the nodes in resv_nodes should be free
        self.server.expect(NODE, {'state': 'free'}, id=resv_node)
        # Wait for standing reservation second instance to start
        self.logger.info(
            'Waiting 40 sec for second instance of reservation to start')
        exp_attr = {'reserve_state': (MATCH_RE, "RESV_RUNNING|5"),
                    'reserve_index': 2}
        self.server.expect(RESV, exp_attr, id=rid, offset=40, interval=1)
        # check the node state of the nodes in resv_nodes
        self.server.expect(NODE, {'state': 'resv-exclusive'}, id=resv_node)
        # Wait for reservations to be finished
        msg = "Que;" + rid_q + ";deleted at request of pbs_server@"
        self.server.log_match(msg, starttime=now,
                              interval=2)
        self.server.expect(NODE, {'state': 'free'}, id=resv_node)

    def test_job_outside_resv_not_allowed(self):
        """
        Test Job outside the reservation will not be allowed to run
        if reservation has place=excl.
        """
        # Submit a reservation with place=excl
        start_time = time.time()
        now = int(start_time)
        a = {'Resource_List.select': '1:ncpus=1:vntype=cray_compute',
             'Resource_List.place': 'excl',
             'reserve_start': now + 20,
             'reserve_end': now + 30}
        rid = self.submit_and_confirm_resv(a)
        rid_q = rid.split('.')[0]
        self.server.status(RESV, 'resv_nodes', id=rid)
        resv_node = self.server.reservations[rid].get_vnodes()[0]
        self.server.expect(NODE, {'state': 'free'}, id=resv_node)
        self.logger.info('Waiting 20s for reservation to start')
        a = {'reserve_state': (MATCH_RE, "RESV_RUNNING|5")}
        self.server.expect(RESV, a, id=rid, offset=20)
        self.server.expect(NODE, {'state': 'resv-exclusive'}, id=resv_node)
        # Submit a job outside the reservation requesting resv_nodes
        submit_dir = self.du.create_temp_dir(asuser=TEST_USER)
        a = {ATTR_q: 'workq', ATTR_l + '.select': '1:vnode=%s' % resv_node}
        j1 = Job(TEST_USER, attrs=a)
        j1.create_script(self.script)
        jid1 = self.server.submit(j1, submit_dir=submit_dir)
        comment = 'Not Running: Insufficient amount of resource: vnode'
        self.server.expect(
            JOB, {'job_state': 'Q', 'comment': comment}, id=jid1)
        # Wait for reservation to end and verify node state
        # changed as job-exclusive
        msg = "Que;" + rid_q + ";deleted at request of pbs_server@"
        self.server.log_match(msg, starttime=start_time, interval=2)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=resv_node)

    def test_conflict_reservation_on_resv_exclusive_node(self):
        """
        Test no other reservation will get confirmed (in the duration)
        when a node has a exclusive reservation confirmed on it.
        Reservation2 is inside the duration of confirmed reservation
        requesting the same vnode in Reservation1.
        """
        # Submit a reservation with place=excl
        start_time = time.time()
        now = int(start_time)
        a = {'Resource_List.select': '1:ncpus=1:vntype=cray_compute',
             'Resource_List.place': 'excl',
             'reserve_start': now + 20,
             'reserve_end': now + 60}
        rid = self.submit_and_confirm_resv(a)
        self.server.status(RESV, 'resv_nodes', id=rid)
        resv_node = self.server.reservations[rid].get_vnodes()[0]
        self.logger.info('Waiting 20s for reservation to start')
        a = {'reserve_state': (MATCH_RE, "RESV_RUNNING|5")}
        self.server.expect(RESV, a, id=rid, offset=20)
        self.server.expect(NODE, {'state': 'resv-exclusive'}, id=resv_node)
        # Submit another reservation requesting on vnode in resv_node
        a = {ATTR_l + '.select': '1:ncpus=1:vnode=%s' % resv_node,
             'reserve_start': now + 25,
             'reserve_end': now + 30}
        r = Reservation(TEST_USER, attrs=a)
        rid2 = self.server.submit(r)
        # The overlapping reservation must be denied and deleted.
        msg = "Resv;" + rid2 + ";Reservation denied"
        self.server.log_match(msg, starttime=start_time, interval=2)
        msg2 = "Resv;" + rid2 + ";reservation deleted"
        self.server.log_match(msg2, starttime=now, interval=2)
        msg3 = "Resv;" + rid2 + ";PBS Failed to confirm resv: Insufficient "
        msg3 += "amount of resource: vnode"
        self.scheduler.log_match(msg3, starttime=now, interval=2)

    def test_node_exclusivity_with_multinode_reservation(self):
        """
        Test Jobs run correctly in multinode reservation
        and accordingly update node exclusivity.
        """
        self.get_vnode_ncpus_value()
        # Submit a reservation with place=excl
        now = int(time.time())
        a = {ATTR_l + '.select': '2:ncpus=%d' % (int(self.ncpus)),
             'Resource_List.place': 'excl',
             'reserve_start': now + 10,
             'reserve_end': now + 1600}
        rid = self.submit_and_confirm_resv(a)
        rid_q = rid.split(".")[0]
        self.server.status(RESV, 'resv_nodes', id=rid)
        resv_node = self.server.reservations[rid].get_vnodes()
        self.logger.info('Waiting 10s for reservation to start')
        a = {'reserve_state': (MATCH_RE, "RESV_RUNNING|5")}
        self.server.expect(RESV, a, id=rid, offset=10)
        self.server.expect(NODE, {'state': 'resv-exclusive'},
                           id=resv_node[0])
        self.server.expect(NODE, {'state': 'resv-exclusive'},
                           id=resv_node[1])
        # Submit a job inside the reservation
        submit_dir = self.du.create_temp_dir(asuser=TEST_USER)
        a = {ATTR_q: rid_q, ATTR_l + '.select': '1:ncpus=1',
             'Resource_List.place': 'shared'}
        j1 = Job(TEST_USER, attrs=a)
        j1.create_script(self.script)
        jid1 = self.server.submit(j1, submit_dir=submit_dir)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'job-exclusive,resv-exclusive'},
                           id=resv_node[0])
        # Submit another job inside the reservation
        submit_dir = self.du.create_temp_dir(asuser=TEST_USER)
        a = {ATTR_q: rid_q, ATTR_l + '.select': '2:ncpus=1',
             'Resource_List.place': 'shared'}
        j2 = Job(TEST_USER, attrs=a)
        j2.create_script(self.script)
        jid2 = self.server.submit(j2, submit_dir=submit_dir)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid2)
        self.server.expect(NODE, {'state': 'job-exclusive,resv-exclusive'},
                           id=resv_node[0])

    def test_multiple_reservation_request_exclusive_placement(self):
        """
        Test Multiple reservations requesting exclusive placement
        are confirmed when not overlapping in time.
        """
        self.get_vnode_ncpus_value()
        # Submit a reservation with place=excl
        now = int(time.time())
        a = {ATTR_l + '.select': '1:ncpus=1:vnode=%s' % self.vnode,
             'Resource_List.place': 'excl',
             'reserve_start': now + 10,
             'reserve_duration': 3600}
        rid = self.submit_and_confirm_resv(a)
        self.server.status(RESV, 'resv_nodes', id=rid)
        resv_node = self.server.reservations[rid].get_vnodes()[0]
        # Submit a non-overlapping reservation requesting place=excl
        a = {ATTR_l + '.select': '1:ncpus=1:vnode=%s' % resv_node,
             'Resource_List.place': 'excl',
             'reserve_start': now + 7200,
             'reserve_duration': 3600}
        self.submit_and_confirm_resv(a)

    def test_delete_future_resv_not_effect_node_state(self):
        """
        Test (Advance Reservation)Multiple reservations requesting
        exclusive placement are confirmed when not overlapping.
        Deleting the latter reservation after earlier one starts
        running leaves node in state resv-exclusive.
        """
        self.get_vnode_ncpus_value()
        # Submit a reservation with place=excl
        now = int(time.time())
        a = {ATTR_l + '.select': '1:ncpus=1:vnode=%s' % self.vnode,
             'Resource_List.place': 'excl',
             'reserve_start': now + 10,
             'reserve_duration': 3600}
        rid = self.submit_and_confirm_resv(a)
        self.server.status(RESV, 'resv_nodes', id=rid)
        resv_node = self.server.reservations[rid].get_vnodes()[0]
        # Submit a non-overlapping reservation requesting place=excl
        # on vnode in resv_node
        a = {ATTR_l + '.select': '1:ncpus=1:vnode=%s' % resv_node,
             'Resource_List.place': 'excl',
             'reserve_start': now + 7200,
             'reserve_duration': 3600}
        rid2 = self.submit_and_confirm_resv(a)
        self.logger.info('Waiting 10s for reservation to start')
        a = {'reserve_state': (MATCH_RE, "RESV_RUNNING|5")}
        self.server.expect(RESV, a, id=rid, offset=10)
        self.server.expect(NODE, {'state': 'resv-exclusive'}, id=resv_node)
        # Delete future reservation rid2 and verify that resv node
        # is still in state resv-exclusive
        self.server.delete(rid2)
        self.server.expect(NODE, {'state': 'resv-exclusive'}, id=resv_node)

    def test_delete_future_standing_resv_not_effect_node_state(self):
        """
        Test (Standing Reservation)Multiple reservations requesting
        exclusive placement are confirmed when not overlapping.
        Deleting the latter reservation after earlier one starts
        running leaves node in state resv-exclusive.
        """
        self.get_vnode_ncpus_value()
        if 'PBS_TZID' in self.conf:
            tzone = self.conf['PBS_TZID']
        elif 'PBS_TZID' in os.environ:
            tzone = os.environ['PBS_TZID']
        else:
            self.logger.info('Missing timezone, using America/Los_Angeles')
            tzone = 'America/Los_Angeles'
        # Submit a standing reservation with place=excl
        now = int(time.time())
        a = {ATTR_l + '.select': '1:ncpus=1:vnode=%s' % self.vnode,
             'Resource_List.place': 'excl',
             ATTR_resv_rrule: 'FREQ=HOURLY;COUNT=2',
             ATTR_resv_timezone: tzone,
             'reserve_start': now + 10,
             'reserve_end': now + 3100}
        rid = self.submit_and_confirm_resv(a)
        self.server.status(RESV, 'resv_nodes', id=rid)
        resv_node = self.server.reservations[rid].get_vnodes()[0]
        # Submit a non-overlapping reservation requesting place=excl
        # on vnode in resv_node
        a = {ATTR_l + '.select': '1:ncpus=1:vnode=%s' % resv_node,
             'Resource_List.place': 'excl',
             ATTR_resv_rrule: 'FREQ=HOURLY;COUNT=2',
             ATTR_resv_timezone: tzone,
             'reserve_start': now + 7200,
             'reserve_end': now + 10800}
        rid2 = self.submit_and_confirm_resv(a)
        self.logger.info('Waiting 10s for reservation to start')
        a = {'reserve_state': (MATCH_RE, "RESV_RUNNING|5")}
        self.server.expect(RESV, a, id=rid, offset=10)
        self.server.expect(NODE, {'state': 'resv-exclusive'}, id=resv_node)
        # Delete future reservation rid2 and verify that resv node
        # is still in state resv-exclusive
        self.server.delete(rid2)
        self.server.expect(NODE, {'state': 'resv-exclusive'}, id=resv_node)

    def test_job_inside_exclusive_reservation(self):
        """
        Test Job will run correctly inside the exclusive reservation
        """
        # Script without aprun, suitable for a cray_login node.
        self.script2 = []
        self.script2 += ['echo Hello World\n']
        self.script2 += ['/bin/sleep 10']
        # Submit a reservation with place=excl
        start_time = time.time()
        now = int(start_time)
        a = {'Resource_List.select': '1:ncpus=1:vntype=cray_login',
             'Resource_List.place': 'excl',
             'reserve_start': now + 20,
             'reserve_end': now + 40}
        rid = self.submit_and_confirm_resv(a)
        rid_q = rid.split('.')[0]
        self.server.status(RESV, 'resv_nodes', id=rid)
        resv_node = self.server.reservations[rid].get_vnodes()[0]
        self.server.expect(NODE, {'state': 'free'}, id=resv_node)
        self.logger.info('Waiting 20s for reservation to start')
        a = {'reserve_state': (MATCH_RE, "RESV_RUNNING|5")}
        self.server.expect(RESV, a, id=rid, offset=20)
        self.server.expect(NODE, {'state': 'resv-exclusive'}, id=resv_node)
        # Submit a job inside the reservation
        submit_dir = self.du.create_temp_dir(asuser=TEST_USER)
        a = {ATTR_q: rid_q,
             ATTR_l + '.select': '1:ncpus=1:vntype=cray_login',
             'Resource_List.place': 'excl'}
        j1 = Job(TEST_USER, attrs=a)
        j1.create_script(self.script2)
        jid1 = self.server.submit(j1, submit_dir=submit_dir)
        self.server.expect(
            JOB, {'job_state': 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'job-exclusive,resv-exclusive'},
                           id=resv_node)
        # wait 5 sec for job to end
        self.server.expect(NODE, {'state': 'resv-exclusive'}, id=resv_node,
                           offset=5, interval=10)
        # Wait for reservation to end and verify node state
        # changed as free
        msg = "Que;" + rid_q + ";deleted at request of pbs_server@"
        self.server.log_match(msg, starttime=start_time, interval=2)
        self.server.expect(NODE, {'state': 'free'}, id=resv_node)
        # Test Job will run correctly inside the exclusive
        # standing reservation requesting compute_node
        if 'PBS_TZID' in self.conf:
            tzone = self.conf['PBS_TZID']
        elif 'PBS_TZID' in os.environ:
            tzone = os.environ['PBS_TZID']
        else:
            self.logger.info('Missing timezone, using America/Los_Angeles')
            tzone = 'America/Los_Angeles'
        # Submit a standing reservation with place=excl
        now = int(time.time())
        a = {ATTR_l + '.select': '1:ncpus=1:vntype=cray_compute',
             'Resource_List.place': 'excl',
             ATTR_resv_rrule: 'FREQ=HOURLY;COUNT=1',
             ATTR_resv_timezone: tzone,
             'reserve_start': now + 10,
             'reserve_end': now + 300}
        rid = self.submit_and_confirm_resv(a)
        rid_q = rid.split('.')[0]
        self.server.status(RESV, 'resv_nodes', id=rid)
        resv_node = self.server.reservations[rid].get_vnodes()[0]
        self.server.expect(NODE, {'state': 'free'}, id=resv_node)
        self.logger.info('Waiting 10s for reservation to start')
        a = {'reserve_state': (MATCH_RE, "RESV_RUNNING|5")}
        self.server.expect(RESV, a, id=rid, offset=10)
        self.server.expect(NODE, {'state': 'resv-exclusive'}, id=resv_node)
        # Submit a job inside the reservation
        submit_dir = self.du.create_temp_dir(asuser=TEST_USER)
        a = {ATTR_q: rid_q}
        j1 = Job(TEST_USER, attrs=a)
        j1.create_script(self.script)
        jid1 = self.server.submit(j1, submit_dir=submit_dir)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'job-exclusive,resv-exclusive'},
                           id=resv_node)
        # wait 5 sec for job to end
        self.server.expect(NODE, {'state': 'resv-exclusive'}, id=resv_node,
                           offset=5, interval=10)

    def test_reservation_request_node_ignore_excl(self):
        """
        Test Reservation asking for place=excl will not get confirmed
        if node has ignore_excl set on it.
        """
        a = {'sharing': 'ignore_excl'}
        self.mom.create_vnodes(a, 1, createnode=False, delall=False,
                               usenatvnode=True)
        self.server.expect(NODE, {'state': 'free',
                                  'sharing': 'ignore_excl'},
                           id=self.mom.shortname)
        # Submit a reservation
        now = int(time.time())
        a = {'Resource_List.select': '1:ncpus=1:vntype=cray_login',
             'Resource_List.place': 'excl',
             'reserve_start': now + 20,
             'reserve_end': now + 40}
        rid = self.submit_and_confirm_resv(a)
        self.server.status(RESV, 'resv_nodes', id=rid)
        resv_node = self.server.reservations[rid].get_vnodes()[0]
        # Wait for reservation to start and verify
        # node state should not be resv-exclusive
        self.logger.info('Waiting 10s for reservation to start')
        a = {'reserve_state': (MATCH_RE, "RESV_RUNNING|5")}
        self.server.expect(RESV, a, id=rid, offset=10)
        self.server.expect(NODE, {'state': 'resv-exclusive'},
                           id=resv_node, op=NE)

    def test_multijob_on_resv_exclusive_node(self):
        """
        Test multiple jobs request inside a reservation
        if none(node,reservation or job) asks for exclusivity
        """
        now = int(time.time())
        a = {'Resource_List.select': '1:ncpus=2:vntype=cray_compute',
             'Resource_List.place': 'shared',
             'reserve_start': now + 20,
             'reserve_end': now + 40}
        rid = self.submit_and_confirm_resv(a)
        rid_q = rid.split('.')[0]
        self.server.status(RESV, 'resv_nodes', id=rid)
        resv_node = self.server.reservations[rid].get_vnodes()[0]
        self.logger.info('Waiting for reservation to start')
        a = {'reserve_state': (MATCH_RE, "RESV_RUNNING|5")}
        self.server.expect(RESV, a, id=rid, offset=10)
        # NOTE(review): the reservation was placed 'shared' yet the node is
        # expected resv-exclusive here — confirm this matches Cray behavior.
        self.server.expect(NODE, {'state': 'resv-exclusive'}, id=resv_node)
        a = {ATTR_q: rid_q}
        j1 = Job(TEST_USER, attrs=a)
        j1.create_script(self.script)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'job-exclusive,resv-exclusive'},
                           id=resv_node)
        j2 = Job(TEST_USER, attrs=a)
        j2.create_script(self.script)
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid2)

    def test_job_with_exclusive_placement(self):
        """
        Job will honour exclusivity inside the reservation
        """
        now = int(time.time())
        a = {'Resource_List.select': '1:ncpus=2:vntype=cray_compute',
             'Resource_List.place': 'excl',
             'reserve_start': now + 20,
             'reserve_end': now + 40}
        rid = self.submit_and_confirm_resv(a)
        rid_q = rid.split('.')[0]
        self.logger.info('Waiting for reservation to start')
        a = {'reserve_state': (MATCH_RE, "RESV_RUNNING|5")}
        self.server.expect(RESV, a, id=rid, offset=10)
        a = {ATTR_q: rid_q, ATTR_l + '.select': '1:ncpus=1',
             'Resource_List.place': 'excl'}
        j1 = Job(TEST_USER, attrs=a)
        j1.create_script(self.script)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        # A shared job must stay queued while the exclusive job holds the
        # node, then run once jid1 is gone.
        a = {ATTR_q: rid_q, ATTR_l + '.select': '1:ncpus=1',
             'Resource_List.place': 'shared'}
        j2 = Job(TEST_USER, attrs=a)
        j2.create_script(self.script)
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid2)
        self.server.expect(JOB, 'queue', op=UNSET, id=jid1, offset=5)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid2)

    def test_job_running_on_multinode_reservation(self):
        """
        Test to submit job on multinode reservation with different
        placement
        """
        ncpus = []
        vnodes = self.server.status(NODE)
        # NOTE(review): num_vnodes and total_ncpus are computed but never
        # used below.
        num_vnodes = 2
        i = 0
        for vnode in vnodes:
            if i < 2:
                if vnode['resources_available.vntype'] == 'cray_compute':
                    ncpus.append(int(vnode['resources_available.ncpus']))
                    i += 1
            if i == 2:
                break
        total_ncpus = ncpus[0] + ncpus[1]
        # NOTE(review): '/' is float division; the '%d' formats below
        # truncate the value — '//' would make the intent explicit.
        req_ncpus = min(ncpus[0] / 2, ncpus[1] / 2)
        now = int(time.time())
        a = {
            'Resource_List.select': '2:ncpus=%d:vntype=cray_compute' % min(
                ncpus[0], ncpus[1]),
            'Resource_List.place': 'excl',
            'reserve_start': now + 20,
            'reserve_end': now + 60}
        rid = self.submit_and_confirm_resv(a)
        rid_q = rid.split('.')[0]
        self.logger.info('Waiting for reservation to start')
        a = {'reserve_state': (MATCH_RE, "RESV_RUNNING|5")}
        self.server.expect(RESV, a, id=rid, offset=20)
        a = {ATTR_q: rid_q, ATTR_l + '.select': '2:ncpus=%d' % req_ncpus,
             'Resource_List.place': 'scatter'}
        j1 = Job(TEST_USER, attrs=a)
        j1.create_script(self.script)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        a = {ATTR_q: rid_q, ATTR_l + '.select': '1:ncpus=%d' % ncpus[0],
             'Resource_List.place': 'excl'}
        j2 = Job(TEST_USER, attrs=a)
        j2.create_script(self.script)
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid2)
        a = {ATTR_q: rid_q, ATTR_l + '.select': '1:ncpus=%d' % ncpus[1],
             'Resource_List.place': 'shared'}
        j3 = Job(TEST_USER, attrs=a)
        j3.create_script(self.script)
        jid3 = self.server.submit(j3)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid3)
        # Once jid1 finishes, both queued jobs should be able to run.
        self.server.expect(JOB, 'queue', op=UNSET, id=jid1, offset=5)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid2)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid3)

    def test_job_with_exclhost_placement_inside_resv(self):
        """
        Job inside a reservation asking for place=exclhost on host
        will have all resources of the vnodes present on host assigned
        to it
        """
        now = int(time.time())
        a = {'Resource_List.select': '1:ncpus=2:vntype=cray_compute',
             'Resource_List.place': 'exclhost',
             'reserve_start': now + 20,
             'reserve_end': now + 40}
        rid = self.submit_and_confirm_resv(a)
        rid_q = rid.split('.')[0]
        self.server.status(RESV, 'resv_nodes', id=rid)
        resv_node = self.server.reservations[rid].get_vnodes()[0]
        self.logger.info('Waiting for reservation to start')
        a = {'reserve_state': (MATCH_RE, "RESV_RUNNING|5")}
        self.server.expect(RESV, a, id=rid, offset=10)
        self.server.expect(NODE, {'state': 'resv-exclusive'}, id=resv_node)
        a = {ATTR_q: rid_q}
        j1 = Job(TEST_USER, attrs=a)
        j1.create_script(self.script)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {'job_state': 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'job-exclusive,resv-exclusive'},
                           id=resv_node)
        a = {ATTR_q: rid_q}
        j2 = Job(TEST_USER, attrs=a)
        j2.create_script(self.script)
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {'job_state': 'Q'}, id=jid2)
        self.server.expect(JOB, 'queue',
op=UNSET, id=jid1, offset=10) self.server.expect(RESV, 'queue', op=UNSET, id=rid, offset=10) self.server.expect(NODE, {'state': 'free'}, id=resv_node) ================================================ FILE: test/tests/functional/pbs_cray_hyperthread.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. 
from tests.functional import *
from ptl.utils.pbs_crayutils import CrayUtils
import os


@tags('cray')
class TestCrayHyperthread(TestFunctional):

    """
    The test will submit a job script that calls aprun with the option
    that will allow callers to use the hyperthreads on a hyperthreaded
    compute node.
    """

    def setUp(self):
        # Cray-only suite: skip before the base-class setup does any work.
        if not self.du.get_platform().startswith('cray'):
            self.skipTest("Test suite only meant to run on a Cray")
        TestFunctional.setUp(self)

    def test_hyperthread(self):
        """
        Check for a compute node that has hyperthreads, if there is one
        submit a job to that node requesting the hyperthreads.
        Check there are no errors in the job error output.
        If there is no node with hyperthreads, skip the test.
        """
        # Get the compute nodes from PBS and see if they are threaded
        cu = CrayUtils()
        all_nodes = self.server.status(NODE)
        threaded = 0
        for n in all_nodes:
            if n['resources_available.vntype'] == 'cray_compute':
                numthreads = cu.get_numthreads(
                    n['resources_available.PBScraynid'])
                if numthreads > 1:
                    self.logger.info("Node %s has %s hyperthreads" %
                                     (n['resources_available.vnode'],
                                      numthreads))
                    # Remember the first hyperthreaded node; the job below
                    # is pinned to this vnode.
                    ncpus = n['resources_available.ncpus']
                    vnode = n['resources_available.vnode']
                    threaded = 1
                    break
        if not threaded:
            self.skipTest("Test suite needs nodes with hyperthreads")
        # There is a node with hyperthreads, get the number of cpus
        aprun_args = '-j %d -n %d' % (int(numthreads), int(ncpus))
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'job_history_enable': 'True'})
        j1 = Job(TEST_USER, {ATTR_l + '.select': '1:ncpus=%d:vnode=%s' %
                             (int(ncpus), vnode),
                             ATTR_N: 'hyperthread'})
        scr = []
        scr += ['hostname\n']
        scr += ['/bin/sleep 5\n']
        scr += ['aprun -b %s /bin/hostname\n' % aprun_args]
        sub_dir = self.du.create_temp_dir(asuser=TEST_USER)
        j1.create_script(scr)
        jid1 = self.server.submit(j1, submit_dir=sub_dir)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        # Verify the contents of the output/error files once the job has
        # finished (needs job_history_enable set above).
        self.server.expect(JOB, {'job_state': 'F'}, id=jid1, extend='x')
        error_file = os.path.join(
            sub_dir, 'hyperthread.e' + jid1.split('.')[0])
        self.assertEqual(os.stat(error_file).st_size, 0,
                         msg="Job error file should be empty")


================================================
FILE: test/tests/functional/pbs_cray_pagg_id.py
================================================
# coding: utf-8
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
#
# Commercial License Information:
#
# PBS Pro is commercially licensed software that shares a common core with
# the OpenPBS software. For a copy of the commercial license terms and
# conditions, go to: (http://www.pbspro.com/agreement.html) or contact the
# Altair Legal Department.
#
# Altair's dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of OpenPBS and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair's trademarks, including but not limited to "PBS™",
# "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is
# subject to Altair's trademark licensing policies.
from tests.functional import * @tags('cray') class TestCrayPaggIdUniqueness(TestFunctional): """ This test suite is written to verify that the PAGG ID provided to ALPS while confirming and releasing an ALPS reservation is not equal to the session ID of the job. This test is specific to Cray and will also not work on the Cray simulator, hence, will be skipped on non-Cray systems and Cray simulator. """ def setUp(self): platform = self.du.get_platform() if platform != 'cray': self.skipTest("not a cray") TestFunctional.setUp(self) def test_pagg_id(self): """ This test case submits a job, waits for it to run and then checks the MoM logs to confirm that the PAGG ID provided in the ALPS query is not equal to the session ID of the job. """ j1 = Job(TEST_USER) jid = self.server.submit(j1) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid) self.mom.log_match("Job;%s;Started, pid" % (jid,), n=100, max_attempts=5, interval=5, regexp=True) self.server.status(JOB, [ATTR_session], jid) sess_id = j1.attributes[ATTR_session] msg = "pagg_id =\"" + sess_id + "\"" try: self.mom.log_match(msg, n='ALL') except PtlLogMatchError: self.logger.info("pagg_id is not equal to session id, test passes") else: self.assertFalse("pagg_id is equal to session id, test fails.") ================================================ FILE: test/tests/functional/pbs_cray_reliable_job_startup.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. 
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
#
# Commercial License Information:
#
# PBS Pro is commercially licensed software that shares a common core with
# the OpenPBS software. For a copy of the commercial license terms and
# conditions, go to: (http://www.pbspro.com/agreement.html) or contact the
# Altair Legal Department.
#
# Altair's dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of OpenPBS and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair's trademarks, including but not limited to "PBS™",
# "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is
# subject to Altair's trademark licensing policies.

import time
import fnmatch
from tests.functional import *
from ptl.utils.pbs_logutils import PBSLogUtils


@tags('cray')
class TestPbsReliableJobStartupOnCray(TestFunctional):

    """
    This tests the Reliable Job Startup Feature on Cray.
    A job can be started with extra nodes with node failures tolerated
    during job start but setting is not supported and ignored on Cray.
    """

    def setUp(self):
        if not self.du.get_platform().startswith('cray'):
            self.skipTest("Test suite only meant to run on a Cray")
        TestFunctional.setUp(self)

        # queuejob hook: pads the select spec by one chunk per request and
        # asks for tolerate_node_failures="job_start" (ignored on Cray).
        self.qjob_hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "queuejob hook executed")
# Save current select spec in resource 'site'
e.job.Resource_List["site"] = str(e.job.Resource_List["select"])
new_select = e.job.Resource_List["select"].increment_chunks(1)
e.job.Resource_List["select"] = new_select
e.job.tolerate_node_failures = "job_start"
"""
        # prologue hook: logs the healthy/failed vnode lists and trims the
        # job back to its original ('site') select spec on the MS mom.
        self.prolo_hook_body = """
import pbs
e=pbs.event()
pbs.logmsg(pbs.LOG_DEBUG, "Executing prologue")
# print out the vnode_list[] values
for vn in e.vnode_list:
    v = e.vnode_list[vn]
    pbs.logjobmsg(e.job.id, "prologue: found vnode_list[" + v.name + "]")
# print out the vnode_list_fail[] values
for vn in e.vnode_list_fail:
    v = e.vnode_list_fail[vn]
    pbs.logjobmsg(e.job.id, "prologue: found vnode_list_fail[" + v.name + "]")
if e.job.in_ms_mom():
    pj = e.job.release_nodes(keep_select=e.job.Resource_List["site"])
    if pj is None:
        e.job.Hold_Types = pbs.hold_types("s")
        e.job.rerun()
        e.reject("unsuccessful at PROLOGUE")
"""
        # launch hook: same trimming at launch time; failed vnodes are
        # additionally marked offline.
        self.launch_hook_body = """
import pbs
e=pbs.event()
if 'PBS_NODEFILE' not in e.env:
    e.accept()
pbs.logmsg(pbs.LOG_DEBUG, "Executing launch")
# print out the vnode_list[] values
for vn in e.vnode_list:
    v = e.vnode_list[vn]
    pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]")
# print out the vnode_list_fail[] values:
for vn in e.vnode_list_fail:
    v = e.vnode_list_fail[vn]
    pbs.logjobmsg(e.job.id, "launch: found vnode_list_fail[" + v.name + "]")
    v.state = pbs.ND_OFFLINE
if e.job.in_ms_mom():
    pj = e.job.release_nodes(keep_select=e.job.Resource_List["site"])
    if pj is None:
        e.job.Hold_Types = pbs.hold_types("s")
        e.job.rerun()
        e.reject("unsuccessful at LAUNCH")
"""

    def match_str_in_input_file(self, file_path, file_pattern, search_str):
        """
        Assert that search string appears in the input file that
        matches file_pattern
        """
        input_file = None
        for item in self.du.listdir(path=file_path, sudo=True):
            if fnmatch.fnmatch(item, file_pattern):
                input_file = item
                break
        self.assertTrue(input_file is not None)
        # Hook input files are root-owned; read via sudo.
        with PBSLogUtils().open_log(input_file, sudo=True) as f:
            self.assertTrue(search_str in f.read().decode())
        self.logger.info("Found \"%s\" in %s" % (search_str, input_file))

    @tags('cray')
    def test_reliable_job_startup_not_supported_on_cray(self):
        """
        A job is started with extra nodes. Mom superior will show no sign
        of tolerating node failure. Accounting logs won't have 's' record.
        Input files to prologue and launch hooks will show the
        tolerate_node_failures=none value.
        """
        # instantiate queuejob hook
        hook_event = 'queuejob'
        hook_name = 'qjob'
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.qjob_hook_body)

        # instantiate execjob_prologue hook
        hook_event = 'execjob_prologue'
        hook_name = 'prolo'
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.prolo_hook_body)

        # instantiate execjob_launch hook
        hook_event = 'execjob_launch'
        hook_name = 'launch'
        a = {'event': hook_event, 'enabled': 'true'}
        self.server.create_import_hook(hook_name, a, self.launch_hook_body)

        # Submit a job
        j = Job(TEST_USER, {ATTR_l + '.select': '1:ncpus=3:mem=2gb:vntype=' +
                            'cray_compute+1:ncpus=3:mem=2gb:vntype=' +
                            'cray_compute',
                            ATTR_l + '.place': 'scatter'})
        start_time = time.time()
        jid = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid)

        # Check for msg in mom superior logs
        msg = "no nodes released as job does not tolerate node failures"
        self.server.expect(JOB, 'exec_host', id=jid, op=SET)
        job_stat = self.server.status(JOB, id=jid)
        exechost = job_stat[0]['exec_host'].partition('/')[0]
        mom_superior = self.moms[exechost]
        mom_superior.log_match(msg, starttime=start_time)

        # Check that 's' record is absent since release_nodes() was not called
        self.server.accounting_match(
            msg=".*%s;%s;.*" % ('s', jid), regexp=True, n=50,
            max_attempts=10, existence=False)
        self.logger.info(
            "There was no 's' record found for job %s, test passes" % jid)

        # On mom superior check the input files to prologue and launch hooks
        # showed the tolerate_node_failures=none value
        search_str = 'pbs.event().job.tolerate_node_failures=none'
        self.mom_hooks_tmp_dir = os.path.join(
            self.server.pbs_conf['PBS_HOME'], 'mom_priv', 'hooks', 'tmp')
        hook_name = 'prolo'
        input_file_pattern = os.path.join(
            self.mom_hooks_tmp_dir,
            'hook_execjob_prologue_%s*.in' % hook_name)
        self.match_str_in_input_file(
            self.mom_hooks_tmp_dir, input_file_pattern, search_str)
        hook_name = 'launch'
        input_file_pattern = os.path.join(
            self.mom_hooks_tmp_dir,
            'hook_execjob_launch_%s*.in' % hook_name)
        self.match_str_in_input_file(
            self.mom_hooks_tmp_dir, input_file_pattern, search_str)


================================================
FILE: test/tests/functional/pbs_cray_smoketest.py
================================================
# coding: utf-8
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
#
# Commercial License Information:
#
# PBS Pro is commercially licensed software that shares a common core with
# the OpenPBS software. For a copy of the commercial license terms and
# conditions, go to: (http://www.pbspro.com/agreement.html) or contact the
# Altair Legal Department.
#
# Altair's dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of OpenPBS and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair's trademarks, including but not limited to "PBS™",
# "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is
# subject to Altair's trademark licensing policies.

from tests.functional import *
from ptl.utils.pbs_crayutils import CrayUtils
import os


@tags('cray', 'smoke')
class TestCraySmokeTest(TestFunctional):

    """
    Set of tests that qualifies as smoketest for Cray platform
    """

    def setUp(self):
        if not self.du.get_platform().startswith('cray'):
            self.skipTest("Test suite only meant to run on a Cray")
        TestFunctional.setUp(self)
        # no node in 'resv' and 'use' in apstat
        cu = CrayUtils()
        self.assertEqual(cu.count_node_summ('resv'), 0,
                         "No compute node should be having ALPS reservation")
        self.assertEqual(cu.count_node_summ('use'), 0,
                         "No compute node should be in use")
        # The number of compute nodes in State up and batch mode
        # (State = 'UP B') should equal the number of cray_compute nodes.
        nodes_up_b = cu.count_node_state('UP B')
        self.logger.info("Nodes with State 'UP B' : %s" % nodes_up_b)
        nodes_up_i = cu.count_node_state('UP I')
        self.logger.info("Nodes with State 'UP I' : %s" % nodes_up_i)
        nodes = self.server.filter(NODE,
                                   {ATTR_rescavail + '.vntype':
                                    'cray_compute'})
        num_cray_compute = len(nodes[ATTR_rescavail + '.vntype=cray_compute'])
        self.assertEqual(nodes_up_b, num_cray_compute)
        self.logger.info("nodes in State 'UP B': %s == cray_compute: %s" %
                         (nodes_up_b, num_cray_compute))
        # nodes are free and resources are available.
        nodes = self.server.status(NODE)
        for node in nodes:
            self.assertEqual(node['state'], 'free')
            self.assertEqual(node['resources_assigned.ncpus'], '0')
            self.assertEqual(node['resources_assigned.mem'], '0kb')

    @staticmethod
    def find_hw(output_file):
        """
        Find the string "Hello World" in the specified file.
        Return 1 if found.
        """
        found = 0
        with open(output_file, 'r') as outf:
            for line in outf:
                if "Hello World" in line:
                    found = 1
                    break
                else:
                    continue
        return found

    @tags('cray', 'smoke')
    def test_cray_login_job(self):
        """
        Submit a simple sleep job that requests to run on a login node
        and expect that job to go in running state on a login node.
        Verify that the job runs to completion and check job output/error.
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'job_history_enable': 'True'})
        j1 = Job(TEST_USER, {ATTR_l + '.vntype': 'cray_login',
                             ATTR_N: 'cray_login'})
        scr = []
        scr += ['echo Hello World\n']
        scr += ['/bin/sleep 5\n']
        sub_dir = self.du.create_temp_dir(asuser=TEST_USER)
        j1.create_script(scr)
        jid1 = self.server.submit(j1, submit_dir=sub_dir)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        # fetch node name where the job is running and check that the
        # node is a login node
        self.server.status(JOB, 'exec_vnode', id=jid1)
        vname = j1.get_vnodes()[0]
        self.server.expect(NODE, {ATTR_rescavail + '.vntype': 'cray_login'},
                           id=vname, max_attempts=1)
        cu = CrayUtils()
        # Check if number of compute nodes in use are 0
        self.assertEqual(cu.count_node_summ('use'), 0)
        # verify the contents of output/error files
        self.server.expect(JOB, {'job_state': 'F'}, id=jid1, extend='x')
        error_file = os.path.join(sub_dir,
                                  'cray_login.e' + jid1.split('.')[0])
        self.assertEqual(os.stat(error_file).st_size, 0,
                         msg="Job error file should be empty")
        output_file = os.path.join(
            sub_dir, 'cray_login.o' + jid1.split('.')[0])
        foundhw = self.find_hw(output_file)
        self.assertEqual(foundhw, 1, msg="Job output file incorrect")

    @tags('cray', 'smoke')
    def test_cray_compute_job(self):
        """
        Submit a simple sleep job that runs on a compute node and
        expect the job to go in running state on a compute node.
        Verify that the job runs to completion and check job output/error.
        """
        self.server.manager(MGR_CMD_SET, SERVER,
                            {'job_history_enable': 'True'})
        j1 = Job(TEST_USER, {ATTR_l + '.vntype': 'cray_compute',
                             ATTR_N: 'cray_compute'})
        scr = []
        scr += ['echo Hello World\n']
        scr += ['/bin/sleep 5\n']
        scr += ['aprun -b -B /bin/sleep 10\n']
        sub_dir = self.du.create_temp_dir(asuser=TEST_USER)
        j1.create_script(scr)
        jid1 = self.server.submit(j1, submit_dir=sub_dir)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        # fetch node name where the job is running and check that the
        # node is a compute node
        self.server.status(JOB, 'exec_vnode', id=jid1)
        vname = j1.get_vnodes()[0]
        self.server.expect(NODE, {ATTR_rescavail + '.vntype':
                                  'cray_compute'}, id=vname)
        # Sleep for some time before aprun actually starts
        # using the reservation
        self.logger.info(
            "Sleeping 6 seconds before aprun starts using the reservation")
        time.sleep(6)
        cu = CrayUtils()
        # Check if number of compute nodes in use is 1
        self.assertEqual(cu.count_node_summ('resv'), 1)
        if self.du.get_platform() == 'cray':
            # Cray simulator will not show anything in 'use' because
            # aprun command is just a pass through on simulator
            self.assertEqual(cu.count_node_summ('use'), 1)
        # verify the contents of output/error files
        self.server.expect(JOB, {'job_state': 'F'}, id=jid1, extend='x')
        error_file = os.path.join(
            sub_dir, 'cray_compute.e' + jid1.split('.')[0])
        self.assertEqual(os.stat(error_file).st_size, 0,
                         msg="Job error file should be empty")
        output_file = os.path.join(
            sub_dir, 'cray_compute.o' + jid1.split('.')[0])
        foundhw = self.find_hw(output_file)
        self.assertEqual(foundhw, 1, msg="Job output file incorrect")
        # Re-read apstat and confirm the ALPS reservation was released.
        (cu.node_status, cu.node_summary) = cu.parse_apstat_rn()
        self.assertEqual(cu.count_node_summ('resv'), 0)
        if self.du.get_platform() == 'cray':
            self.assertEqual(cu.count_node_summ('use'), 0)


================================================
FILE: test/tests/functional/pbs_cray_suspend_resume.py
================================================
# coding: utf-8
# Copyright (C) 1994-2021 Altair
# Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
#
# Commercial License Information:
#
# PBS Pro is commercially licensed software that shares a common core with
# the OpenPBS software. For a copy of the commercial license terms and
# conditions, go to: (http://www.pbspro.com/agreement.html) or contact the
# Altair Legal Department.
#
# Altair's dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of OpenPBS and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair's trademarks, including but not limited to "PBS™",
# "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is
# subject to Altair's trademark licensing policies.

import time
from tests.functional import *
from ptl.utils.pbs_crayutils import CrayUtils


@tags('cray')
class TestSuspendResumeOnCray(TestFunctional):

    """
    Test special cases where suspend/resume functionality differs on cray
    as compared to other platforms.
    This test suite expects the platform to be 'cray' and assumes that
    suspend/resume feature is enabled on it.
    """
    # Shared Cray apstat/vnode helper for all tests in this suite.
    cu = CrayUtils()

    def setUp(self):
        if not self.du.get_platform().startswith('cray'):
            self.skipTest("Test suite only meant to run on a Cray")
        TestFunctional.setUp(self)

    @tags('cray', 'smoke')
    def test_default_restrict_res_to_release_on_suspend_setting(self):
        """
        Check that on Cray restrict_res_to_release_on_suspend is always set
        to 'ncpus' by default
        """
        # Set restrict_res_to_release_on_suspend server attribute
        a = {ATTR_restrict_res_to_release_on_suspend: 'ncpus'}
        self.server.expect(SERVER, a)

    def test_exclusive_job_not_suspended(self):
        """
        If a running job is a job with exclusive placement then this job can
        not be suspended.
        This test is checking for a log message which is an unstable
        interface and may need change in future when interface changes.
        """
        msg_expected = "BASIL;ERROR: ALPS error: apsched: \
at least resid .* is exclusive"
        # Submit a job
        j = Job(TEST_USER, {ATTR_l + '.select': '1:ncpus=1',
                            ATTR_l + '.place': 'excl'})
        check_after = time.time()
        jid = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid)
        # suspend job; on Cray the ALPS reservation switch is expected to
        # fail for an exclusive-placement job.
        try:
            self.server.sigjob(jobid=jid, signal="suspend")
        except PbsSignalError as e:
            self.assertTrue("Switching ALPS reservation failed" in e.msg[0])
        self.server.expect(JOB, 'exec_host', id=jid, op=SET)
        job_stat = self.server.status(JOB, id=jid)
        s = self.mom.log_match(msg_expected, starttime=check_after,
                               regexp=True, max_attempts=10)
        self.assertTrue(s)

    @tags('cray')
    def test_basic_admin_suspend_restart(self):
        """
        Test basic admin-suspend functionality for jobs and array jobs with
        restart on Cray. The restart will test if the node recovers properly
        in maintenance. After turning off scheduling and a mom restart, a
        subjob is always requeued and node shows up as free.
        """
        j1 = Job(TEST_USER)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        qstat = self.server.status(JOB, 'exec_vnode', id=jid1)
        vname = qstat[0]['exec_vnode'].partition(':')[0].strip('(')

        # admin-suspend regular job
        self.server.sigjob(jid1, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname)
        self.server.expect(NODE, {'maintenance_jobs': jid1})

        # Server restart must preserve the maintenance state.
        self.server.restart()
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname)
        self.server.expect(NODE, {'maintenance_jobs': jid1})

        # Adding sleep to avoid failure at resume since PBS licenses
        # might not be available and as a result resume fails
        time.sleep(2)

        # admin-resume regular job. Make sure the node returns to state
        # job-exclusive.
        self.server.sigjob(jid1, 'admin-resume', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=vname)
        self.server.cleanup_jobs()

        # admin-suspend job array
        jA = Job(TEST_USER, {ATTR_l + '.select': '1:ncpus=1',
                             ATTR_J: '1-2'})
        jidA = self.server.submit(jA)
        self.server.expect(JOB, {ATTR_state: 'B'}, id=jidA)
        subjobs = self.server.status(JOB, id=jidA, extend='t')
        # subjobs[0] is the array itself. Need the subjobs
        jid1 = subjobs[1]['id']
        jid2 = subjobs[2]['id']
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2)
        qstat = self.server.status(JOB, 'exec_vnode', id=jid1)
        vname1 = qstat[0]['exec_vnode'].partition(':')[0].strip('(')
        qstat = self.server.status(JOB, 'exec_vnode', id=jid2)
        vname2 = qstat[0]['exec_vnode'].partition(':')[0].strip('(')

        # admin-suspend subjob 1
        self.server.sigjob(jid1, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname1)
        self.server.expect(NODE, {'maintenance_jobs': jid1})

        # admin-resume subjob 1 . Make sure the node returns to state
        # job-exclusive.
        self.server.sigjob(jid1, 'admin-resume', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.expect(NODE, {'state': 'job-exclusive'}, id=vname1)

        # admin-suspend subjob 2
        self.server.sigjob(jid2, 'admin-suspend', runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid2)
        self.server.expect(NODE, {'state': 'maintenance'}, id=vname2)
        self.server.expect(NODE, {'maintenance_jobs': jid2})

        # Turn off scheduling and restart mom
        self.server.manager(MGR_CMD_SET, SERVER, {'scheduling': 'False'})
        self.mom.restart()

        # Check that nodes are now free
        self.server.expect(NODE, {'state': 'free'}, id=vname1)
        self.server.expect(NODE, {'state': 'free'}, id=vname2)

    def test_admin_suspend_wrong_state(self):
        """
        Check that wrong 'resume' signal is correctly rejected.
        """
        j1 = Job(TEST_USER)
        jid1 = self.server.submit(j1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)
        self.server.sigjob(jid1, "suspend", runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)
        # A job suspended with "suspend" may not be resumed with
        # "admin-resume".
        try:
            self.server.sigjob(jid1, "admin-resume", runas=ROOT_USER)
        except PbsSignalError as e:
            self.assertTrue(
                'Job can not be resumed with the requested resume signal'
                in e.msg[0])
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid1)

        j2 = Job(TEST_USER)
        jid2 = self.server.submit(j2)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2)
        self.server.sigjob(jid2, "admin-suspend", runas=ROOT_USER)
        self.server.expect(JOB, {ATTR_state: 'S', ATTR_substate: 43},
                           id=jid2)
        # Conversely, an admin-suspended job may not be resumed with
        # "resume".
        try:
            self.server.sigjob(jid2, "resume", runas=ROOT_USER)
        except PbsSignalError as e:
            self.assertTrue(
                'Job can not be resumed with the requested resume signal'
                in e.msg[0])
        # The job should be in the same state as it was prior to the signal
        self.server.expect(JOB, {ATTR_state: 'S', ATTR_substate: 43},
                           id=jid2)

    def submit_resv(self, resv_start, chunks, resv_dur):
        """
        Function to request a PBS reservation with start time, chunks and
        duration as arguments.
        """
        a = {'Resource_List.select': '%d:ncpus=1:vntype=cray_compute'
             % chunks,
             'Resource_List.place': 'scatter',
             'reserve_start': int(resv_start),
             'reserve_duration': int(resv_dur)
             }
        r = Reservation(TEST_USER, attrs=a)
        rid = self.server.submit(r)
        try:
            a = {'reserve_state': (MATCH_RE, 'RESV_CONFIRMED|2')}
            d = self.server.expect(RESV, a, id=rid)
        except PtlExpectError as e:
            # Return the expect() result either way; the caller asserts it.
            d = e.rv
        return d

    @timeout(300)
    def test_preempt_STF(self):
        """
        Test shrink to fit by creating a reservation for all compute nodes
        starting in 100 sec. with a duration of two hours.
        A preempted STF job with min_walltime of 1 min. and max_walltime
        of 2 hours will stay suspended after higher priority job goes away
        if its min_walltime can't be satisfied.
        """
        qname = 'highp'
        a = {'queue_type': 'execution'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, qname)
        a = {'enabled': 'True', 'started': 'True', 'priority': '150'}
        self.server.manager(MGR_CMD_SET, QUEUE, a, qname)
        # Reserve all the compute nodes
        nv = self.cu.num_compute_vnodes(self.server)
        self.assertNotEqual(nv, 0, "There are no cray_compute vnodes "
                            "present.")
        now = time.time()
        resv_start = now + 100
        resv_dur = 7200
        d = self.submit_resv(resv_start, nv, resv_dur)
        self.assertTrue(d)
        j = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv,
                            ATTR_l + '.place': 'scatter',
                            ATTR_l + '.min_walltime': '00:01:00',
                            ATTR_l + '.max_walltime': '02:00:00'})
        jid = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid)
        # The STF job must have been shrunk to fit before the reservation.
        self.server.expect(
            JOB, {ATTR_l + '.walltime': (LE, '00:01:40')}, id=jid)
        self.server.expect(
            JOB, {ATTR_l + '.walltime': (GE, '00:01:00')}, id=jid)
        # The sleep below will leave less than 1 minute window for jid
        # after j2id is deleted. The min_walltime of jid can't be
        # satisfied and jid will stay in S state.
        time.sleep(35)
        j2 = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv,
                             ATTR_l + '.walltime': '00:01:00',
                             ATTR_l + '.place': 'scatter',
                             ATTR_q: 'highp'})
        j2id = self.server.submit(j2)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=j2id)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid)
        # The sleep below will leave less than 1 minute window for jid
        time.sleep(50)
        self.server.delete(j2id)
        a = {'scheduling': 'True'}
        self.server.manager(MGR_CMD_SET, SERVER, a)
        self.server.expect(SERVER, {'server_state': 'Active'})
        self.server.expect(JOB, {ATTR_state: 'S'}, id=jid)

    def test_multi_express(self):
        """
        Test of multiple express queues of different priorities.
        See that jobs from the higher express queues preempt jobs from
        lower express queues.  Also see when express jobs finish (or are
        deleted), suspended jobs restart.
        Make sure loadLimit is set to 4 on the server node:
        # apmgr config loadLimit 4
        """
        _t = ('\"express_queue, normal_jobs, server_softlimits,' +
              ' queue_softlimits\"')
        a = {'preempt_prio': _t}
        self.scheduler.set_sched_config(a)
        a = {'queue_type': 'e', 'started': 'True',
             'enabled': 'True', 'Priority': 150}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, "expressq")
        a['Priority'] = 160
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, "expressq2")
        a['Priority'] = 170
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, "expressq3")
        # Count the compute nodes
        nv = self.cu.num_compute_vnodes(self.server)
        self.assertNotEqual(nv, 0, "There are no cray_compute vnodes "
                            "present.")
        # Each successive job comes from a higher-priority express queue
        # and should preempt (suspend) the previous one.
        j1 = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv,
                             ATTR_l + '.place': 'scatter',
                             ATTR_l + '.walltime': 3600})
        j1id = self.server.submit(j1)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=j1id)
        j2 = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv,
                             ATTR_l + '.place': 'scatter',
                             ATTR_l + '.walltime': 3600,
                             ATTR_q: 'expressq'})
        j2id = self.server.submit(j2)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=j1id)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=j2id)
        j3 = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv,
                             ATTR_l + '.place': 'scatter',
                             ATTR_l + '.walltime': 3600,
                             ATTR_q: 'expressq2'})
        j3id = self.server.submit(j3)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=j2id)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=j3id)
        j4 = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv,
                             ATTR_l + '.place': 'scatter',
                             ATTR_l + '.walltime': 3600,
                             ATTR_q: 'expressq3'})
        j4id = self.server.submit(j4)
        self.server.expect(JOB, {ATTR_state: 'S'}, id=j3id)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=j4id)
        # Deleting the top job lets the next-highest resume.
        self.server.delete(j4id)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=j3id)

    def test_preempted_topjob_calendared(self):
        """
        That even if topjob_ineligible is set for a preempted job and
        sched_preempt_enforce_resumption is set true, the preempted job
        will be calendared
        """
        self.server.manager(MGR_CMD_SET, SCHED,
                            {'sched_preempt_enforce_resumption': 'true'})
        self.server.manager(MGR_CMD_SET, SERVER, {'backfill_depth': '2'})

        # Count the compute nodes
        nv = self.cu.num_compute_vnodes(self.server)
        self.assertNotEqual(nv, 0, "There are no cray_compute vnodes "
                            "present.")

        # Submit a job
        j = Job(TEST_USER, {ATTR_l + '.select': '%d:ncpus=1' % nv,
                            ATTR_l + '.place': 'scatter',
                            ATTR_l + '.walltime': '120'})
        jid1 = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1)

        # Alter topjob_ineligible for running job
        self.server.alterjob(jid1, {ATTR_W: "topjob_ineligible = true"},
                             runas=ROOT_USER, logerr=True)

        # Create a high priority queue
        a = {'queue_type': 'e', 'started': 't',
             'enabled': 'True', 'priority': '150'}
        self.server.manager(MGR_CMD_CREATE, QUEUE, a, id="highp")

        # Submit a job to high priority queue
        j = Job(TEST_USER, {ATTR_queue: 'highp',
                            ATTR_l + '.walltime': '60'})
        jid2 = self.server.submit(j)
        self.server.expect(JOB, {ATTR_state: 'R'}, id=jid2)

        # Verify that job1 is calendared
        self.server.expect(JOB, 'estimated.start_time',
                           op=SET, id=jid1)
        qstat = self.server.status(JOB, 'estimated.start_time',
                                   id=jid1)
        est_time = qstat[0]['estimated.start_time']
        self.assertNotEqual(est_time, None)
        self.scheduler.log_match(jid1 + ";Job is a top job",
                                 starttime=self.server.ctime,
                                 max_attempts=10)


================================================
FILE: test/tests/functional/pbs_cray_vnode_per_numa.py
================================================
# coding: utf-8
# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
#
# Commercial License Information:
#
# PBS Pro is commercially licensed software that shares a common core with
# the OpenPBS software. For a copy of the commercial license terms and
# conditions, go to: (http://www.pbspro.com/agreement.html) or contact the
# Altair Legal Department.
#
# Altair's dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of OpenPBS and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair's trademarks, including but not limited to "PBS™",
# "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is
# subject to Altair's trademark licensing policies.
from tests.functional import * @tags('cray', 'mom', 'configuration') class TestVnodePerNumaNode(TestFunctional): """ This test suite is for testing the new mom_priv configuration parameter, vnode_per_numa_node. Test that the information is correctly being compressed into one vnode using the default setting (equivalent to FALSE). """ def setUp(self): if not self.du.get_platform().startswith('cray'): self.skipTest("Test suite only meant to run on a Cray") TestFunctional.setUp(self) @tags('cray', 'smoke') def test_settings(self): """ vnode_per_numa_node is unset (defaults to FALSE). Set $vnode_per_numa_node to TRUE Sum up the ncpus, memory, and naccelerators for all vnodes that have the same host (i.e. NUMA nodes that belong to the same compute node). Unset $vnode_per_numa_node in mom_priv/config. Now for each host, compare the ncpus, mem, and naccelerators against the values we got when $vnode_per_numa_node was set to TRUE. They should be equal. Verify that PBS created only one vnode, and: - PBScrayseg attribute is not set - ncpus is a total from all NUMA nodes of that node - mem is a total from all NUMA nodes of that node - the naccelerators value is correct - the accelerator_memory value is correct Set $vnode_per_numa_node to FALSE. Compare the pbsnodes output when vnode_per_numa_node was unset versus when vnode_per_numa_node was set to False. 
""" dncpus = {} dmem = {} dacc = {} daccmem = {} # First we mimic old behavior by setting vnode_per_numa_node to TRUE # Do not HUP now, we will do so when we reset the nodes rv = self.mom.add_config({'$vnode_per_numa_node': True}, False) self.assertTrue(rv) # Start from a clean slate, delete any existing nodes and re-create # them momname = self.mom.shortname self.reset_nodes(momname) # Get the pbsnodes -av output for comparison later vnodes_pernuma = self.server.status(NODE) for n in vnodes_pernuma: if n['resources_available.host'] not in dncpus.keys(): dncpus[n['resources_available.host']] = int( n['resources_available.ncpus']) else: dncpus[n['resources_available.host'] ] += int(n['resources_available.ncpus']) if n['resources_available.host'] not in dmem.keys(): dmem[n['resources_available.host']] = int( n['resources_available.mem'][0:-2]) else: dmem[n['resources_available.host'] ] += int(n['resources_available.mem'][0:-2]) if 'resources_available.naccelerators' in n.keys(): if n['resources_available.naccelerators'][0] != '@': if n['resources_available.host'] not in dacc.keys(): dacc[n['resources_available.host']] = int( n['resources_available.naccelerators']) else: dacc[n['resources_available.host'] ] += int(n['resources_available.naccelerators']) if 'resources_available.accelerator_memory' in n.keys(): if n['resources_available.accelerator_memory'][0] != '@': if n['resources_available.host'] not in daccmem.keys(): daccmem[n['resources_available.host']] = int( n['resources_available.accelerator_memory'][0:-2]) else: daccmem[n['resources_available.host']] += int(n[ 'resources_available.accelerator_memory'][0:-2]) # Remove the configuration setting and re-read the vnodes rv = self.mom.unset_mom_config('$vnode_per_numa_node', False) self.assertTrue(rv) self.reset_nodes(momname) vnodes_combined = self.server.status(NODE) # Compare the multiple vnodes values to the combined vnode output for n in vnodes_combined: if 'resources_available.PBScrayseg' in n: 
self.logger.error( "ERROR resources_available.PBScrayseg was found.") self.assertTrue(False) self.assertEqual(int(n['resources_available.ncpus']), dncpus[ n['resources_available.host']]) self.assertEqual(int(n['resources_available.mem'][0:-2]), dmem[ n['resources_available.host']]) if 'resources_available.naccelerators' in n: self.assertEqual(int(n['resources_available.naccelerators']), dacc[n['resources_available.host']]) if 'resources_available.accelerator_memory' in n: self.assertEqual(int(n['resources_available.accelerator_memory' ][0:-2]), daccmem[n['resources_available.host']]) # Set vnode_per_numa_node to FALSE and re-read the vnodes rv = self.mom.add_config({'$vnode_per_numa_node': False}, False) self.assertTrue(rv) self.reset_nodes(momname) vnodes_combined1 = self.server.status(NODE) # Compare the pbsnodes output when vnode_per_numa_node was unset # versus when vnode_per_numa_node was set to False. # List of resources to be ignored while comparing. ignr_rsc = ['license', 'last_state_change_time'] len_vnodes_combined1 = len(vnodes_combined1) len_vnodes_combined = len(vnodes_combined) n = 0 if len_vnodes_combined == len_vnodes_combined1: self.logger.info( "pbsnodes outputs are equal in length") for vdict in vnodes_combined: for key in vdict: if key in ignr_rsc: continue if key in vnodes_combined1[n]: if vdict[key] != vnodes_combined1[n][key]: self.fail("ERROR vnode %s has " "differing element." % key) else: self.fail("ERROR vnode %s has " "differing element." % key) n += 1 else: self.fail("ERROR pbsnodes outputs differ in length.") def restartPBS(self): try: svcs = PBSInitServices() svcs.restart() except PbsInitServicesError as e: self.logger.error("PBS restart failed: \n" + e.msg) self.assertTrue(e.rv) def reset_nodes(self, hostA): """ Reset nodes. 
""" # Remove all nodes rv = self.server.manager(MGR_CMD_DELETE, NODE, None, "") self.assertEqual(rv, 0) # Restart PBS self.restartPBS() # Create node rv = self.server.manager(MGR_CMD_CREATE, NODE, None, hostA) self.assertEqual(rv, 0) # Wait for 3 seconds for changes to take effect time.sleep(3) ================================================ FILE: test/tests/functional/pbs_cray_vnode_pool.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. 
#
# Use of Altair's trademarks, including but not limited to "PBS™",
# "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is
# subject to Altair's trademark licensing policies.


from tests.functional import *


@tags('cray', 'configuration')
class TestVnodePool(TestFunctional):
    """
    This test suite tests how PBS makes use of node attribute "vnode_pool"
    It expects at least 2 moms to be specified to it while executing.
    """

    def setUp(self):
        # vnode_pool is a Cray-only feature and needs two reporting moms.
        if not self.du.get_platform().startswith('cray'):
            self.skipTest("This test can only run on a cray")
        TestFunctional.setUp(self)
        if len(self.moms.values()) < 2:
            self.skipTest("Provide at least 2 moms while invoking test")
        # The moms provided to the test may have unwanted vnodedef files.
        if self.moms.values()[0].has_vnode_defs():
            self.moms.values()[0].delete_vnode_defs()
        if self.moms.values()[1].has_vnode_defs():
            self.moms.values()[1].delete_vnode_defs()
        # Check if vnodes exist before deleting nodes.
        # Clean all default nodes because each test case will set up nodes.
        try:
            self.server.status(NODE)
            self.server.manager(MGR_CMD_DELETE, NODE, None, "")
        except PbsStatusError as e:
            # An empty node list is fine; anything else is a real error.
            self.assertTrue("Server has no node list" in e.msg[0])

    def test_invalid_values(self):
        """
        Invalid vnode_pool values shall result in errors.
        """
        self.momA = self.moms.values()[0]
        self.momB = self.moms.values()[1]
        self.hostA = self.momA.shortname
        self.hostB = self.momB.shortname
        # Negative pool numbers must be rejected by qmgr.
        attr_A = {'vnode_pool': '-1'}
        try:
            self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostA,
                                attrib=attr_A)
        except PbsManagerError as e:
            self.assertTrue("Illegal attribute or resource value" in e.msg[0])
        # Zero is likewise not a valid pool number.
        attr_A = {'vnode_pool': '0'}
        try:
            self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostA,
                                attrib=attr_A)
        except PbsManagerError as e:
            self.assertTrue("Illegal attribute or resource value" in e.msg[0])
        # Non-numeric values must be rejected as well.
        attr_A = {'vnode_pool': 'a'}
        try:
            self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostA,
                                attrib=attr_A)
        except PbsManagerError as e:
            self.assertTrue("Illegal attribute or resource value" in e.msg[0])

    def test_two_moms_single_vnode_pool(self):
        """
        Same vnode_pool for two moms shall result in one mom being the
        inventory mom and the other the non-inventory mom.
        The inventory mom goes down (e.g. killed).
        Compute nodes remain up even when the inventory mom is killed,
        since another mom is reporting them.
        Check that a new inventory mom is listed in the log.
        Bring up killed mom.
        """
        self.server.manager(MGR_CMD_SET, SERVER, {"log_events": -1})
        self.momA = self.moms.values()[0]
        self.momB = self.moms.values()[1]
        self.hostA = self.momA.shortname
        self.hostB = self.momB.shortname
        attr = {'vnode_pool': '1'}
        start_time = time.time()
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostA, attrib=attr)
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostB, attrib=attr)
        self.server.log_match("Mom %s added to vnode_pool %s" %
                              (self.momB.hostname, '1'),
                              max_attempts=5, starttime=start_time)
        # The server greets the NON-inventory mom with this message, so
        # exactly one of the two mom logs should contain it.
        _msg = "Hello (no inventory required) from server"
        try:
            self.momA.log_match(_msg, max_attempts=9, starttime=start_time)
            found_in_momA = 1
        except PtlLogMatchError:
            found_in_momA = 0
        try:
            self.momB.log_match(_msg, max_attempts=9, starttime=start_time)
            found_in_momB = 1
        except PtlLogMatchError:
            found_in_momB = 0
        self.assertEqual(found_in_momA + found_in_momB, 1,
                         msg="an inventory mom not chosen correctly")
        # Only one mom is inventory mom
        if (found_in_momA == 0):
            inv_mom = self.momA
            noninv_mom = self.momB
        else:
            inv_mom = self.momB
            noninv_mom = self.momA
        self.logger.info("Inventory mom is %s." % inv_mom.shortname)
        self.logger.info("Non-inventory mom is %s." % noninv_mom.shortname)
        start_time = time.time()
        # Kill inventory mom
        inv_mom.signal('-KILL')
        # Check that former inventory mom is down
        rv = self.server.expect(
            VNODE, {'state': 'down'}, id=inv_mom.shortname,
            max_attempts=10, interval=2)
        self.assertTrue(rv)
        # Check if inventory mom changed and is listed in the server log.
        self.server.log_match(
            "Setting inventory_mom for vnode_pool %s to %s" %
            ('1', noninv_mom.shortname),
            max_attempts=5, starttime=start_time)
        self.logger.info(
            "Inventory mom is now %s in server logs."
            % (noninv_mom.shortname))
        # Check compute nodes are up
        vlist = []
        try:
            vnl = self.server.filter(
                VNODE, {'resources_available.vntype': 'cray_compute'})
            vlist = vnl["resources_available.vntype=cray_compute"]
        except Exception:
            # Best-effort: no compute vnodes means nothing to verify below.
            pass
        # Loop through each compute vnode in the list and check if state = free
        for v1 in vlist:
            # Check that the node is in free state
            rv = self.server.expect(
                VNODE, {'state': 'free'}, id=v1, max_attempts=3, interval=2)
            self.assertTrue(rv)
        # Start the previous inv mom.
        inv_mom.start()
        # Check previous inventory mom is up
        rv = self.server.expect(
            VNODE, {'state': 'free'}, id=inv_mom.shortname,
            max_attempts=3, interval=2)
        self.assertTrue(rv)

    def test_two_moms_different_vnode_pool(self):
        """
        Differing vnode_pool for two moms shall result in both moms
        reporting inventory.
        """
        self.momA = self.moms.values()[0]
        self.momB = self.moms.values()[1]
        self.hostA = self.momA.shortname
        self.hostB = self.momB.shortname
        attr_A = {'vnode_pool': '1'}
        attr_B = {'vnode_pool': '2'}
        start_time = time.time()
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostA,
                            attrib=attr_A)
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostB,
                            attrib=attr_B)
        # Neither mom should get the "no inventory required" greeting,
        # since each pool needs its own inventory mom.
        _msg = "Hello (no inventory required) from server"
        try:
            self.momA.log_match(_msg, max_attempts=5, starttime=start_time)
            found_in_momA = 1
        except PtlLogMatchError:
            found_in_momA = 0
        try:
            self.momB.log_match(_msg, max_attempts=5, starttime=start_time)
            found_in_momB = 1
        except PtlLogMatchError:
            found_in_momB = 0
        self.assertTrue((found_in_momA + found_in_momB == 0),
                        msg="Both moms must report inventory")

    def test_invalid_usage(self):
        """
        Setting vnode_pool for an existing mom that does not have a
        vnode_pool attribute shall not be allowable.
        Setting vnode_pool for an existing mom having a vnode_pool
        attribute shall not be allowable.
        Unsetting vnode_pool for an existing mom having a vnode_pool
        attribute shall not be allowable.
        """
        self.momA = self.moms.values()[0]
        self.hostA = self.momA.shortname
        self.logger.info("hostA is %s."
                         % self.hostA)
        start_time = time.time()
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostA)
        # Setting vnode_pool on a node created without one must fail.
        attr_2 = {'vnode_pool': '2'}
        try:
            self.server.manager(
                MGR_CMD_SET, NODE, id=self.hostA, attrib=attr_2)
        except PbsManagerError as e:
            self.assertTrue("Invalid request" in e.msg[0])
        self.server.log_match("Unsupported actions for vnode_pool",
                              max_attempts=5, starttime=start_time)
        self.logger.info("Found correct server log message")
        self.momB = self.moms.values()[1]
        self.hostB = self.momB.shortname
        attr_1 = {'vnode_pool': '1'}
        start_time = time.time()
        self.server.manager(MGR_CMD_CREATE, NODE, id=self.hostB,
                            attrib=attr_1)
        # Changing an existing vnode_pool must fail too.
        attr_2 = {'vnode_pool': '2'}
        try:
            self.server.manager(MGR_CMD_SET, NODE, id=self.hostB,
                                attrib=attr_2)
        except PbsManagerError as e:
            self.assertTrue("Invalid request" in e.msg[0])
        self.server.log_match("Unsupported actions for vnode_pool",
                              max_attempts=5, starttime=start_time)
        # Unsetting vnode_pool must also be rejected.
        try:
            self.server.manager(MGR_CMD_UNSET, NODE, id=self.hostB,
                                attrib='vnode_pool')
        except PbsManagerError as e:
            self.assertTrue("Illegal value for node vnode_pool" in e.msg[0])


================================================
FILE: test/tests/functional/pbs_daemon_service_user.py
================================================
# coding: utf-8

# Copyright (C) 1994-2021 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of both the OpenPBS software ("OpenPBS")
# and the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# OpenPBS is free software. You can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# OpenPBS is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. import resource from tests.functional import * class TestDaemonServiceUser(TestFunctional): """ Test suite to test running schedulers as a non-root user """ def setUp(self): TestFunctional.setUp(self) def common_test(self, binary, runas, scheduser, msg, setup_sched=False): """ Test if running `binary` as `runas` with PBS_DAEMON_SERVICE_USER set as `scheduser` Check to see `msg` is in stderr If `msg` is None, make sure command passed """ if scheduser: self.du.set_pbs_config( self.server.hostname, confs={'PBS_DAEMON_SERVICE_USER': str(scheduser)} ) else: self.du.unset_pbs_config( self.server.hostname, confs='PBS_DAEMON_SERVICE_USER' ) self.server.restart() pbs_conf = self.du.parse_pbs_config(self.server.shortname) if setup_sched: sched_logs = os.path.join(pbs_conf['PBS_HOME'], 'sched_logs') sched_priv = os.path.join(pbs_conf['PBS_HOME'], 'sched_priv') self.du.chown(path=sched_logs, uid=scheduser, recursive=True, sudo=True, level=logging.INFO) self.du.chown(path=sched_priv, uid=scheduser, recursive=True, sudo=True, level=logging.INFO) binpath = os.path.join(pbs_conf['PBS_EXEC'], 'sbin', binary) ret = 
self.du.run_cmd(self.server.shortname, cmd=[binpath], runas=runas) if msg: self.assertEquals(ret['rc'], 1) self.assertIn(msg, '\n'.join(ret['err'])) else: self.assertEquals(ret['rc'], 0) self.assertFalse(ret['err']) def test_sched_runas_nonroot(self): """ Test if running sched as nonroot with PBS_DAEMON_SERVICE_USER set as another user """ self.common_test('pbs_sched', TEST_USER, TEST_USER1, 'Must be run by PBS_DAEMON_SERVICE_USER') def test_pbsfs_runas_nonroot(self): """ Test if running pbsfs as root with PBS_DAEMON_SERVICE_USER set as another user """ self.common_test('pbsfs', TEST_USER, TEST_USER1, 'Must be run by PBS_DAEMON_SERVICE_USER') def test_sched_runas_nonroot_notset(self): """ Test if running sched as nonroot with PBS_DAEMON_SERVICE_USER not set """ self.common_test('pbs_sched', TEST_USER, None, 'Must be run by PBS_DAEMON_SERVICE_USER if ' 'set or root if not set') def test_pbsfs_runas_nonroot_notset(self): """ Test if running pbsfs as nonroot with PBS_DAEMON_SERVICE_USER not set """ self.common_test('pbsfs', TEST_USER, None, 'Must be run by PBS_DAEMON_SERVICE_USER if ' 'set or root if not set') def test_sched_runas_nonroot_pass(self): """ Test if sched runs as non-root user """ self.scheduler.stop() self.common_test('pbs_sched', TEST_USER, TEST_USER, None, setup_sched=True) j = Job(TEST_USER1) jid = self.server.submit(j) self.server.expect(JOB, {'job_state': 'R'}, id=jid) ================================================ FILE: test/tests/functional/pbs_dup_acc_log_for_resv.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. 
You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. # # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. from tests.functional import * class TestDupAccLogForResv(TestFunctional): """ This test suite is for testing duplicate records in accounting log for start of reservations. 
""" def setUp(self): TestFunctional.setUp(self) def test_accounting_logs(self): r1 = Reservation(TEST_USER) a = {'Resource_List.select': '1:ncpus=1', 'reserve_start': int( time.time() + 5), 'reserve_end': int(time.time() + 60)} r1.set_attributes(a) r1id = self.server.submit(r1) time.sleep(8) self.server.restart() m = self.server.accounting_match( msg='.*B;' + r1id, id=r1id, n='ALL', allmatch=True, regexp=True) self.assertEqual(len(m), 1) ================================================ FILE: test/tests/functional/pbs_eligible_time.py ================================================ # coding: utf-8 # Copyright (C) 1994-2021 Altair Engineering, Inc. # For more information, contact Altair at www.altair.com. # # This file is part of both the OpenPBS software ("OpenPBS") # and the PBS Professional ("PBS Pro") software. # # Open Source License Information: # # OpenPBS is free software. You can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # OpenPBS is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # # Commercial License Information: # # PBS Pro is commercially licensed software that shares a common core with # the OpenPBS software. For a copy of the commercial license terms and # conditions, go to: (http://www.pbspro.com/agreement.html) or contact the # Altair Legal Department. 
# # Altair's dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of OpenPBS and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair's trademarks, including but not limited to "PBS™", # "OpenPBS®", "PBS Professional®", and "PBS Pro™" and Altair's logos is # subject to Altair's trademark licensing policies. from tests.functional import * from ptl.utils.pbs_logutils import PBSLogUtils class TestEligibleTime(TestFunctional): """ Test suite for eligible time tests """ def setUp(self): TestFunctional.setUp(self) a = {'eligible_time_enable': 'True'} self.server.manager(MGR_CMD_SET, SERVER, a) self.accrue = {'ineligible': 1, 'eligible': 2, 'run': 3, 'exit': 4} def test_eligible_time_updated(self): """ Test that eligible time gets updated when a job is eligible """ a = {'resources_available.ncpus': 1} self.server.manager(MGR_CMD_SET, NODE, a, id=self.mom.shortname) self.server.manager(MGR_CMD_SET, SERVER, {"eligible_time_enable": "True"}) jid1 = self.server.submit(Job()) self.server.expect(JOB, {ATTR_state: 'R'}, id=jid1) jid2 = self.server.submit(Job()) a = {ATTR_state: 'Q', "accrue_type": "2"} self.server.expect(JOB, a, id=jid2) self.server.expect(JOB, {"eligible_time": "00:00:00"}, op=NE, id=jid2) def test_qsub_a(self): """ Test that jobs requsting qsub -a