Skip to content

Commit 46899ea

Browse files
author
Michael Jennings
committed
Some lightly modified patches for better integration/interaction with
Grid Engine from Dave Love <d.love@liverpool.ac.uk>, current author and maintainer of the open source Son of Grid Engine project at the University of Liverpool (see https://arc.liv.ac.uk/trac/SGE for project info). This significantly improves compatibility with SoGE, UGE, OGS, and other derived works based on the original Sun Grid Engine.
1 parent 030a750 commit 46899ea

File tree

4 files changed

+81
-48
lines changed

4 files changed

+81
-48
lines changed

Diff for: nhc

+42-46
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,10 @@ function die() {
3838
if [[ -n "$NHC_DETACHED" ]]; then
3939
echo "$RET $*" > $RESULTFILE
4040
elif [[ "$NHC_RM" == "sge" ]]; then
41-
echo "begin"
42-
echo "$HOSTNAME:healthy:false"
43-
echo "$HOSTNAME:diagnosis:NHC: $*"
44-
echo "end"
41+
echo "begin" >$NHC_FD_OUT
42+
echo "$HOSTNAME:healthy:false" >$NHC_FD_OUT
43+
echo "$HOSTNAME:diagnosis:NHC: $*" >$NHC_FD_OUT
44+
echo "end" >$NHC_FD_OUT
4545
return 77
4646
elif [[ -n "$LOGFILE" ]]; then
4747
oecho "ERROR: $NAME: Health check failed: $*"
@@ -51,7 +51,7 @@ function die() {
5151
return 0
5252
fi
5353
kill_watchdog
54-
[[ -n "$LOGFILE" ]] && exec 1>&3- 2>&4-
54+
[[ $NHC_FD_OUT -eq 3 ]] && exec 1>&3- 2>&4-
5555
exit $RET
5656
}
5757

@@ -91,11 +91,7 @@ function oecho() {
9191

9292
if [[ "$SILENT" == "0" ]]; then
9393
[[ $TS -ne 0 ]] && PREFIX="[$SECONDS] - "
94-
if [[ -n "$LOGFILE" ]]; then
95-
echo "$PREFIX$@" >&3
96-
else
97-
echo "$PREFIX$@"
98-
fi
94+
echo "$PREFIX$@" >&$NHC_FD_OUT
9995
fi
10096
}
10197

@@ -105,11 +101,7 @@ function eecho() {
105101

106102
if [[ "$SILENT" == "0" ]]; then
107103
[[ $TS -ne 0 ]] && PREFIX="[$SECONDS] - "
108-
if [[ -n "$LOGFILE" ]]; then
109-
echo "$PREFIX$@" >&4
110-
else
111-
echo "$PREFIX$@"
112-
fi
104+
echo "$PREFIX$@" >&$NHC_FD_ERR
113105
fi
114106
}
115107

@@ -119,11 +111,7 @@ function vecho() {
119111

120112
if [[ "$VERBOSE" == "1" ]]; then
121113
[[ $TS -ne 0 ]] && PREFIX="[$SECONDS] - "
122-
if [[ -n "$LOGFILE" ]]; then
123-
echo "$PREFIX$@" >&3
124-
else
125-
echo "$PREFIX$@"
126-
fi
114+
echo "$PREFIX$@" >&$NHC_FD_OUT
127115
fi
128116
}
129117

@@ -171,8 +159,10 @@ function nhcmain_init_env() {
171159
WATCHDOG_PID=0
172160
FAIL_CNT=0
173161
FORCE_SETSID=0
162+
NHC_FD_OUT=1
163+
NHC_FD_ERR=2
174164
export PATH SYSCONFIGDIR LIBEXECDIR HOSTNAME HOSTNAME_S RET LOGGER_TEXT
175-
export NHC_PID NHC_START_TS WATCHDOG_PID FAIL_CNT FORCE_SETSID
165+
export NHC_PID NHC_START_TS WATCHDOG_PID FAIL_CNT FORCE_SETSID NHC_FD_OUT NHC_FD_ERR
176166

177167
# Users may override this in /etc/sysconfig/nhc.
178168
NAME=${0/#*\/}
@@ -294,12 +284,15 @@ function nhcmain_finalize_env() {
294284
DETACHED_MODE=${DETACHED_MODE:-0}
295285
DETACHED_MODE_FAIL_NODATA=${DETACHED_MODE_FAIL_NODATA:-0}
296286
TIMEOUT=${TIMEOUT:-10}
297-
MAX_SYS_UID=${MAX_SYS_UID:-99}
298287
NHC_CHECK_ALL=${NHC_CHECK_ALL:-0}
299288
NHC_CHECK_FORKED=${NHC_CHECK_FORKED:-0}
300289
FORCE_SETSID=${FORCE_SETSID:-0}
301290
export NHC_SID=0
302291

292+
# Set from system defaults if present.
293+
[[ -z "$MAX_SYS_UID" ]] && nhc_common_get_max_sys_uid
294+
MAX_SYS_UID=${MAX_SYS_UID:-99}
295+
303296
# Check for session leader.
304297
kill -s 0 -- -$NHC_PID >/dev/null 2>&1
305298
if [[ $? -eq 0 ]]; then
@@ -369,26 +362,27 @@ function nhcmain_find_rm() {
369362
if [[ -d /var/spool/torque ]]; then
370363
NHC_RM="pbs"
371364
return 0
365+
elif [[ -n "$SGE_ROOT" && -x "$SGE_ROOT/util/arch" ]]; then
366+
# SGE binaries typically won't be on the path defined above in the
367+
# load sensor environment, but SGE_ROOT will be there.
368+
NHC_RM="sge"
369+
fi
370+
371+
# Search PATH for commands
372+
if type -a -p -f -P pbsnodes >&/dev/null ; then
373+
NHC_RM="pbs"
374+
return 0
375+
elif type -a -p -f -P scontrol >&/dev/null ; then
376+
NHC_RM="slurm"
377+
return 0
378+
elif type -a -p -f -P badmin >&/dev/null ; then
379+
NHC_RM="lsf"
380+
return 0
381+
elif type -a -p -f -P qselect >&/dev/null ; then
382+
NHC_RM="sge"
383+
return 0
372384
fi
373385

374-
IFS=':'
375-
DIRLIST=( $PATH )
376-
IFS=$' \t\n'
377-
for DIR in "${DIRLIST[@]}" ; do
378-
if [[ -x "$DIR/pbsnodes" ]]; then
379-
NHC_RM="pbs"
380-
return 0
381-
elif [[ -x "$DIR/scontrol" ]]; then
382-
NHC_RM="slurm"
383-
return 0
384-
elif [[ -x "$DIR/badmin" ]]; then
385-
NHC_RM="lsf"
386-
return 0
387-
elif [[ -x "$DIR/qselect" ]]; then
388-
NHC_RM="sge"
389-
return 0
390-
fi
391-
done
392386
if [[ -z "$NHC_RM" ]]; then
393387
log "Unable to detect resource manager."
394388
return 1
@@ -407,6 +401,8 @@ function nhcmain_redirect_output() {
407401
exit 1
408402
else
409403
dbg "Output redirected per LOGFILE variable $LOGFILE"
404+
NHC_FD_OUT=3
405+
NHC_FD_ERR=4
410406
fi
411407
fi
412408
}
@@ -506,7 +502,7 @@ function nhcmain_detach() {
506502
nhcmain_redirect_output
507503
ELAPSED=$((SECONDS-NHC_START_TS))
508504
vlog "Node Health Check detached parent completed successfully (${ELAPSED}s)."
509-
[[ -n "$LOGFILE" ]] && exec 1>&3- 2>&4-
505+
[[ $NHC_FD_OUT -eq 3 ]] && exec 1>&3- 2>&4-
510506
exit 0
511507
}
512508

@@ -565,14 +561,14 @@ function nhcmain_finish() {
565561
ELAPSED=$((SECONDS-NHC_START_TS))
566562
vlog "Node Health Check completed successfully (${ELAPSED}s)."
567563
if [[ "$NHC_RM" == "sge" ]]; then
568-
echo "begin"
569-
echo "$HOSTNAME:healthy:true"
570-
echo "$HOSTNAME:diagnosis:HEALTHY"
571-
echo "end"
564+
echo "begin" >$NHC_FD_OUT
565+
echo "$HOSTNAME:healthy:true" >$NHC_FD_OUT
566+
echo "$HOSTNAME:diagnosis:HEALTHY" >$NHC_FD_OUT
567+
echo "end" >$NHC_FD_OUT
572568
return 0
573569
fi
574570
kill_watchdog
575-
[[ -n "$LOGFILE" ]] && exec 1>&3- 2>&4-
571+
[[ $NHC_FD_OUT -eq 3 ]] && exec 1>&3- 2>&4-
576572
exit 0
577573
}
578574

Diff for: scripts/common.nhc

+15
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#
88

99
PASSWD_DATA_SRC="${PASSWD_DATA_SRC:-/etc/passwd}"
10+
LOGIN_DEFS_SRC="${LOGIN_DEFS_SRC:-/etc/login.defs}"
1011

1112
RANGE_MATCH_REGEXP1='^[-a-zA-Z0-9_]+[0-9]+[-\.a-zA-Z0-9]*$'
1213
RANGE_MATCH_REGEXP2='^([-a-zA-Z0-9_]+)\[([0-9]+)\-([0-9]+)\]([-\.a-zA-Z0-9]*)$'
@@ -538,3 +539,17 @@ function nhc_cmd_with_timeout() {
538539
#exec 2>&3 3>&-
539540
return $RET
540541
}
542+
543+
# Find system definition for UID range
544+
function nhc_common_get_max_sys_uid() {
    # Look up the UID_MIN setting in the login.defs-style file named by
    # $LOGIN_DEFS_SRC and store its value in the global $MAX_SYS_UID.
    #
    # Globals:  LOGIN_DEFS_SRC (read), MAX_SYS_UID (written on a match only)
    # Returns:  0 whether or not a value was found.  $MAX_SYS_UID is left
    #           untouched when the file is missing or holds no usable
    #           UID_MIN line, so the caller's own fallback still applies.
    #
    # NOTE(review): the UID_MIN value is stored verbatim (not UID_MIN-1);
    # confirm that this matches how $MAX_SYS_UID is compared by its users.
    local KEY VALUE REST

    [[ -e "$LOGIN_DEFS_SRC" ]] || return 0

    # Split each line into fields so that only the token right after the
    # key is used; trailing comments or extra digits on the line must not
    # be glued onto the number.  "-r" keeps backslashes literal, and the
    # "|| [[ -n ... ]]" clause handles a last line with no trailing newline.
    while read -r KEY VALUE REST || [[ -n "$KEY" ]]; do
        # Require the exact key and an all-digit value; anything else
        # (comments, other *_MIN settings, malformed values) is skipped.
        if [[ "$KEY" == "UID_MIN" && "$VALUE" =~ ^[0-9]+$ ]]; then
            MAX_SYS_UID="$VALUE"
            break
        fi
    done < "$LOGIN_DEFS_SRC"
}

Diff for: scripts/ww_ps.nhc

+6-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,12 @@ function nhc_ps_gather_data() {
3838
elif [[ "$NHC_RM" == "slurm" ]]; then
3939
RM_DAEMON_MATCH="${RM_DAEMON_MATCH:-/\bslurmstepd\b/}"
4040
elif [[ "$NHC_RM" == "sge" ]]; then
41-
RM_DAEMON_MATCH="${RM_DAEMON_MATCH:-/\bsge_execd\b/}"
41+
# If you limit this to execd, you lose when it's been restarted,
42+
# and the shepherd is detached. Even if execd is safe because of
43+
# system uids, it can spawn mail commands as the job owner, at
44+
# least. (The shepherd process name is normally
45+
# sge_shepherd-<jobnum>, but maybe not if you change shepherd_cmd.)
46+
RM_DAEMON_MATCH="${RM_DAEMON_MATCH:-/\bsge_(execd|shepherd)\b/}"
4247
else
4348
dbg "Unsupported RM detected in ${FUNCNAME}(): \"$NHC_RM\""
4449
fi

Diff for: test/test_common.nhc

+18-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# $Id$
44
#
55

6-
plan $((9+5+8+5+4+6+8+7+9)) "common.nhc" && {
6+
plan $((11+5+8+5+4+6+8+7+9+7)) "common.nhc" && {
77
is "`type -t mcheck_regexp 2>&1`" 'function' 'mcheck_regexp() loaded properly'
88
is "`type -t mcheck_range 2>&1`" 'function' 'mcheck_regexp() loaded properly'
99
is "`type -t mcheck_glob 2>&1`" 'function' 'mcheck_glob() loaded properly'
@@ -13,6 +13,8 @@ plan $((9+5+8+5+4+6+8+7+9)) "common.nhc" && {
1313
is "`type -t nhc_common_get_uid 2>&1`" 'function' 'nhc_common_get_uid() loaded properly'
1414
is "`type -t nhc_common_parse_size 2>&1`" 'function' 'nhc_common_parse_size() loaded properly'
1515
is "`type -t nhc_common_unparse_size 2>&1`" 'function' 'nhc_common_unparse_size() loaded properly'
16+
is "`type -t nhc_common_get_unix_time 2>&1`" 'function' 'nhc_common_get_unix_time() loaded properly'
17+
is "`type -t nhc_common_get_max_sys_uid 2>&1`" 'function' 'nhc_common_get_max_sys_uid() loaded properly'
1618

1719
mcheck "This is a test." '/test/'
1820
is $? 0 "Basic regexp match via mcheck()"
@@ -156,4 +158,19 @@ plan $((9+5+8+5+4+6+8+7+9)) "common.nhc" && {
156158
nhc_common_unparse_count $OSIZE NSIZE
157159
is "$NSIZE" 999 "nhc_common_unparse_count(): $OSIZE -> 999"
158160

161+
LOGIN_DEFS_SRC=<(echo -e "UID_MIN\t\t\t 500") nhc_common_get_max_sys_uid
162+
is "$MAX_SYS_UID" 500 "nhc_common_get_max_sys_uid(): \$MAX_SYS_UID <- 500"
163+
LOGIN_DEFS_SRC=<(echo -e "UID_MIN 999") nhc_common_get_max_sys_uid
164+
is "$MAX_SYS_UID" 999 "nhc_common_get_max_sys_uid(): \$MAX_SYS_UID <- 999"
165+
LOGIN_DEFS_SRC=<(echo -e "UID_MIN\t0\t") nhc_common_get_max_sys_uid
166+
is "$MAX_SYS_UID" 0 "nhc_common_get_max_sys_uid(): \$MAX_SYS_UID <- 0"
167+
LOGIN_DEFS_SRC=<(echo -e "GID_MIN\t\t\t 1234") nhc_common_get_max_sys_uid
168+
is "$MAX_SYS_UID" 0 "nhc_common_get_max_sys_uid(): Bad syntax"
169+
LOGIN_DEFS_SRC=<(echo -e "2345") nhc_common_get_max_sys_uid
170+
is "$MAX_SYS_UID" 0 "nhc_common_get_max_sys_uid(): Ignore plain number"
171+
LOGIN_DEFS_SRC=<(echo -e "stuff\nGID_MIN 1\nGID_MAX 4\nUID_MIN 3\nUID_MAX 7\nblah blah blah\n") nhc_common_get_max_sys_uid
172+
is "$MAX_SYS_UID" 3 "nhc_common_get_max_sys_uid(): Multiline input"
173+
LOGIN_DEFS_SRC=<(echo -e "UID_MIN\t\t\t 500") nhc_common_get_max_sys_uid
174+
is "$MAX_SYS_UID" 500 "nhc_common_get_max_sys_uid(): Reset default"
175+
159176
} ; unplan

0 commit comments

Comments
 (0)