@@ -38,10 +38,10 @@ function die() {
38
38
if [[ -n " $NHC_DETACHED " ]]; then
39
39
echo " $RET $* " > $RESULTFILE
40
40
elif [[ " $NHC_RM " == " sge" ]]; then
41
- echo " begin"
42
- echo " $HOSTNAME :healthy:false"
43
- echo " $HOSTNAME :diagnosis:NHC: $* "
44
- echo " end"
41
+ echo " begin" > $NHC_FD_OUT
42
+ echo " $HOSTNAME :healthy:false" > $NHC_FD_OUT
43
+ echo " $HOSTNAME :diagnosis:NHC: $* " > $NHC_FD_OUT
44
+ echo " end" > $NHC_FD_OUT
45
45
return 77
46
46
elif [[ -n " $LOGFILE " ]]; then
47
47
oecho " ERROR: $NAME : Health check failed: $* "
@@ -51,7 +51,7 @@ function die() {
51
51
return 0
52
52
fi
53
53
kill_watchdog
54
- [[ -n " $LOGFILE " ]] && exec 1>&3 - 2>&4 -
54
+ [[ $NHC_FD_OUT -eq 3 ]] && exec 1>&3 - 2>&4 -
55
55
exit $RET
56
56
}
57
57
@@ -91,11 +91,7 @@ function oecho() {
91
91
92
92
if [[ " $SILENT " == " 0" ]]; then
93
93
[[ $TS -ne 0 ]] && PREFIX=" [$SECONDS ] - "
94
- if [[ -n " $LOGFILE " ]]; then
95
- echo " $PREFIX $@ " >&3
96
- else
97
- echo " $PREFIX $@ "
98
- fi
94
+ echo " $PREFIX $@ " >& $NHC_FD_OUT
99
95
fi
100
96
}
101
97
@@ -105,11 +101,7 @@ function eecho() {
105
101
106
102
if [[ " $SILENT " == " 0" ]]; then
107
103
[[ $TS -ne 0 ]] && PREFIX=" [$SECONDS ] - "
108
- if [[ -n " $LOGFILE " ]]; then
109
- echo " $PREFIX $@ " >&4
110
- else
111
- echo " $PREFIX $@ "
112
- fi
104
+ echo " $PREFIX $@ " >& $NHC_FD_ERR
113
105
fi
114
106
}
115
107
@@ -119,11 +111,7 @@ function vecho() {
119
111
120
112
if [[ " $VERBOSE " == " 1" ]]; then
121
113
[[ $TS -ne 0 ]] && PREFIX=" [$SECONDS ] - "
122
- if [[ -n " $LOGFILE " ]]; then
123
- echo " $PREFIX $@ " >&3
124
- else
125
- echo " $PREFIX $@ "
126
- fi
114
+ echo " $PREFIX $@ " >& $NHC_FD_OUT
127
115
fi
128
116
}
129
117
@@ -171,8 +159,10 @@ function nhcmain_init_env() {
171
159
WATCHDOG_PID=0
172
160
FAIL_CNT=0
173
161
FORCE_SETSID=0
162
+ NHC_FD_OUT=1
163
+ NHC_FD_ERR=2
174
164
export PATH SYSCONFIGDIR LIBEXECDIR HOSTNAME HOSTNAME_S RET LOGGER_TEXT
175
- export NHC_PID NHC_START_TS WATCHDOG_PID FAIL_CNT FORCE_SETSID
165
+ export NHC_PID NHC_START_TS WATCHDOG_PID FAIL_CNT FORCE_SETSID NHC_FD_OUT NHC_FD_ERR
176
166
177
167
# Users may override this in /etc/sysconfig/nhc.
178
168
NAME=${0/#* \/ }
@@ -294,12 +284,15 @@ function nhcmain_finalize_env() {
294
284
DETACHED_MODE=${DETACHED_MODE:- 0}
295
285
DETACHED_MODE_FAIL_NODATA=${DETACHED_MODE_FAIL_NODATA:- 0}
296
286
TIMEOUT=${TIMEOUT:- 10}
297
- MAX_SYS_UID=${MAX_SYS_UID:- 99}
298
287
NHC_CHECK_ALL=${NHC_CHECK_ALL:- 0}
299
288
NHC_CHECK_FORKED=${NHC_CHECK_FORKED:- 0}
300
289
FORCE_SETSID=${FORCE_SETSID:- 0}
301
290
export NHC_SID=0
302
291
292
+ # Set from system defaults if present.
293
+ [[ -z " $MAX_SYS_UID " ]] && nhc_common_get_max_sys_uid
294
+ MAX_SYS_UID=${MAX_SYS_UID:- 99}
295
+
303
296
# Check for session leader.
304
297
kill -s 0 -- -$NHC_PID > /dev/null 2>&1
305
298
if [[ $? -eq 0 ]]; then
@@ -369,26 +362,27 @@ function nhcmain_find_rm() {
369
362
if [[ -d /var/spool/torque ]]; then
370
363
NHC_RM=" pbs"
371
364
return 0
365
+ elif [[ -n " $SGE_ROOT " && -x " $SGE_ROOT /util/arch" ]]; then
366
+ # SGE binaries typically won't be on the path defined above in the
367
+ # load sensor environment, but SGE_ROOT will be there.
368
+ NHC_RM=" sge"
369
+ fi
370
+
371
+ # Search PATH for commands
372
+ if type -a -p -f -P pbsnodes >& /dev/null ; then
373
+ NHC_RM=" pbs"
374
+ return 0
375
+ elif type -a -p -f -P scontrol >& /dev/null ; then
376
+ NHC_RM=" slurm"
377
+ return 0
378
+ elif type -a -p -f -P badmin >& /dev/null ; then
379
+ NHC_RM=" lsf"
380
+ return 0
381
+ elif type -a -p -f -P qselect >& /dev/null ; then
382
+ NHC_RM=" sge"
383
+ return 0
372
384
fi
373
385
374
- IFS=' :'
375
- DIRLIST=( $PATH )
376
- IFS=$' \t\n '
377
- for DIR in " ${DIRLIST[@]} " ; do
378
- if [[ -x " $DIR /pbsnodes" ]]; then
379
- NHC_RM=" pbs"
380
- return 0
381
- elif [[ -x " $DIR /scontrol" ]]; then
382
- NHC_RM=" slurm"
383
- return 0
384
- elif [[ -x " $DIR /badmin" ]]; then
385
- NHC_RM=" lsf"
386
- return 0
387
- elif [[ -x " $DIR /qselect" ]]; then
388
- NHC_RM=" sge"
389
- return 0
390
- fi
391
- done
392
386
if [[ -z " $NHC_RM " ]]; then
393
387
log " Unable to detect resource manager."
394
388
return 1
@@ -407,6 +401,8 @@ function nhcmain_redirect_output() {
407
401
exit 1
408
402
else
409
403
dbg " Output redirected per LOGFILE variable $LOGFILE "
404
+ NHC_FD_OUT=3
405
+ NHC_FD_ERR=4
410
406
fi
411
407
fi
412
408
}
@@ -506,7 +502,7 @@ function nhcmain_detach() {
506
502
nhcmain_redirect_output
507
503
ELAPSED=$(( SECONDS- NHC_START_TS))
508
504
vlog " Node Health Check detached parent completed successfully (${ELAPSED} s)."
509
- [[ -n " $LOGFILE " ]] && exec 1>&3 - 2>&4 -
505
+ [[ $NHC_FD_OUT -eq 3 ]] && exec 1>&3 - 2>&4 -
510
506
exit 0
511
507
}
512
508
@@ -565,14 +561,14 @@ function nhcmain_finish() {
565
561
ELAPSED=$(( SECONDS- NHC_START_TS))
566
562
vlog " Node Health Check completed successfully (${ELAPSED} s)."
567
563
if [[ " $NHC_RM " == " sge" ]]; then
568
- echo " begin"
569
- echo " $HOSTNAME :healthy:true"
570
- echo " $HOSTNAME :diagnosis:HEALTHY"
571
- echo " end"
564
+ echo " begin" > $NHC_FD_OUT
565
+ echo " $HOSTNAME :healthy:true" > $NHC_FD_OUT
566
+ echo " $HOSTNAME :diagnosis:HEALTHY" > $NHC_FD_OUT
567
+ echo " end" > $NHC_FD_OUT
572
568
return 0
573
569
fi
574
570
kill_watchdog
575
- [[ -n " $LOGFILE " ]] && exec 1>&3 - 2>&4 -
571
+ [[ $NHC_FD_OUT -eq 3 ]] && exec 1>&3 - 2>&4 -
576
572
exit 0
577
573
}
578
574
0 commit comments