armbian-next: still fighting tee leaking: now I think I won, again

- avoid using the "last resort, use lazy umount" by killing tee_pid _and descendants_ before hand. Works!
- introduce `get_descendants_of_pid_array()` using recursive trick to get all descendants
- do not `unset CURRENT_LOGFILE` -- during `logging_error_show_log()` (from SHOW_LOG=no days)
  - do `unset CURRENT_LOGFILE` immediately before processing logs -- although it magically survives. mysteries of traps in bash...
- `declare -g` instead of `export` for `start_logging_section()`
- don't `check_and_close_fd_13()` in `run_cleanup_handlers()`
- do run `check_and_close_fd_13()`, but only after archiving the old logs (in logging trap!)
- add a default error for unhandled TRAP's -- of which should be done
- fix reset of `global_tee_pid=0` in section-logging
This commit is contained in:
Ricardo Pardini
2023-01-18 20:22:39 +01:00
parent c1100fa461
commit bf3f346bee
6 changed files with 78 additions and 29 deletions

View File

@@ -272,3 +272,24 @@ function trap_handler_reset_output_owner() {
display_alert "Resetting tmp directory owner" "${SRC}/.tmp" "debug"
reset_uid_owner_non_recursive "${SRC}/.tmp"
}
# Recursive function to find all descendant processes of a given PID. Writes to stdout.
function list_descendants_of_pid() {
local children
children=$(ps -o "pid=" --ppid "$1" | xargs echo -n)
for pid in $children; do
list_descendants_of_pid "$pid"
done
echo -n "${children} "
}
function get_descendants_of_pid_array() {
local descendants
descendants="$(list_descendants_of_pid "$1")"
display_alert "Descendants of PID $1: ${descendants}" "string - get_descendants_of_pid_array" "debug"
# shellcheck disable=SC2206 # lets expand!
descendants_of_pid_array_result=(${descendants})
display_alert "Descendants of PID $1: ${descendants_of_pid_array_result[*]}" "array = get_descendants_of_pid_array" "debug"
}

View File

@@ -38,9 +38,10 @@ function logging_init() {
}
function logging_error_show_log() {
local logfile_to_show="${CURRENT_LOGFILE}" # store current logfile in separate var
unset CURRENT_LOGFILE # stop logging, otherwise crazy
[[ "${SHOW_LOG}" == "yes" ]] && return 0 # Do nothing if we're already showing the log on stderr.
# Do NOT unset CURRENT_LOGFILE here... it's used by traps.
local logfile_to_show="${CURRENT_LOGFILE}" # store current logfile in separate var
if [[ "${CI}" == "true" ]]; then # Close opened CI group, even if there is none; errors would be buried otherwise.
echo "::endgroup::"
fi
@@ -48,14 +49,14 @@ function logging_error_show_log() {
if [[ -f "${logfile_to_show}" ]]; then
local prefix_sed_contents="${normal_color}${left_marker}${padding}👉${padding}${right_marker} "
local prefix_sed_cmd="s/^/${prefix_sed_contents}/;"
display_alert " 👇👇👇 Showing logfile below 👇👇👇" "${logfile_to_show}" "err"
CURRENT_LOGFILE="" display_alert " 👇👇👇 Showing logfile below 👇👇👇" "${logfile_to_show}" "err"
# shellcheck disable=SC2002 # my cat is great. thank you, shellcheck.
cat "${logfile_to_show}" | grep -v -e "^$" | sed -e "${prefix_sed_cmd}" 1>&2 # write it to stderr!!
display_alert " 👆👆👆 Showing logfile above 👆👆👆" "${logfile_to_show}" "err"
CURRENT_LOGFILE="" display_alert " 👆👆👆 Showing logfile above 👆👆👆" "${logfile_to_show}" "err"
else
display_alert "✋ Error log not available at this stage of build" "check messages above" "debug"
CURRENT_LOGFILE="" display_alert "✋ Error log not available at this stage of build" "check messages above" "debug"
fi
return 0
}
@@ -108,25 +109,44 @@ function print_current_asset_log_base_file() {
}
function check_and_close_fd_13() {
if [[ -e /proc/self/fd/13 ]]; then
sync # let the disk catch up
if [[ -e /proc/self/fd/13 ]]; then
display_alert "Closing fd 13" "log still open" "cleanup" # no reason to be alarmed
exec 13>&- || true # close the file descriptor, lest sed keeps running forever.
sync # make sure the file is written to disk
sleep 1 # give it a second to die.
else
display_alert "Not closing fd 13" "log already closed" "cleanup"
fi
# "tee_pid" is a misnomer: it in reality is a shell pid with tee and sed children.
display_alert "Checking if global_tee_pid is set and running" "global_tee_pid: ${global_tee_pid}" "cleanup"
if [[ -n "${global_tee_pid}" && ${global_tee_pid} -gt 1 ]] && ps -p "${global_tee_pid}" > /dev/null; then
display_alert "Killing global_tee_pid's children" "global_tee_pid: ${global_tee_pid}" "cleanup"
declare -a descendants_of_pid_array_result=()
get_descendants_of_pid_array "${global_tee_pid}" || true
# loop over descendants_of_pid_array_result and kill'em'all
for descendant_pid in "${descendants_of_pid_array_result[@]}"; do
# check if PID is still alive before killing; it might have died already due to death of parent.
if ps -p "${descendant_pid}" > /dev/null; then
display_alert "Killing descendant pid" "${descendant_pid}" "cleanup"
{ kill "${descendant_pid}" && wait "${global_tee_pid}"; } || true
else
display_alert "Descendant PID already dead" "${descendant_pid}" "cleanup"
fi
done
# If the global_tee_pid is still alive, kill it.
if ps -p "${global_tee_pid}" > /dev/null; then
display_alert "Killing global_tee_pid" "${global_tee_pid}" "cleanup"
kill "${global_tee_pid}" && wait "${global_tee_pid}"
else
display_alert "global_tee_pid already dead after descendants killed" "${global_tee_pid}" "cleanup"
fi
sync # wait for the disk to catch up
else
display_alert "Not killing global_tee_pid" "${global_tee_pid} not running" "cleanup"
fi
}
function discard_logs_tmp_dir() {

View File

@@ -211,9 +211,8 @@ function run_host_command_logged_raw() {
if [[ -f "${CURRENT_LOGFILE}" ]]; then # echo -e "\033[91mBright Red\033[0m"
echo -e "\033[91m-->--> command failed with error code ${exit_code} after $((SECONDS - seconds_start)) seconds${normal_color:-}" >> "${CURRENT_LOGFILE}"
fi
# This is very specific; remove CURRENT_LOGFILE's value when calling display_alert here otherwise logged twice.
CURRENT_LOGFILE="" display_alert "cmd exited with code ${exit_code}" "$*" "wrn"
CURRENT_LOGFILE="" display_alert "stacktrace for failed command" "$(show_caller_full)" "wrn" # @TODO logfile might be useless without this
display_alert "cmd exited with code ${exit_code}" "$*" "wrn"
display_alert "stacktrace for failed command" "$(show_caller_full)" "wrn"
# Obtain extra info about error, eg, log files produced, extra messages set by caller, etc.
logging_enrich_run_command_error_info

View File

@@ -1,11 +1,11 @@
function start_logging_section() {
export logging_section_counter=$((logging_section_counter + 1)) # increment counter, used in filename
export CURRENT_LOGGING_COUNTER
declare -g logging_section_counter=$((logging_section_counter + 1)) # increment counter, used in filename
declare -g CURRENT_LOGGING_COUNTER
CURRENT_LOGGING_COUNTER="$(printf "%03d" "$logging_section_counter")"
export CURRENT_LOGGING_SECTION=${LOG_SECTION:-early} # default to "early", should be overwritten soon enough
export CURRENT_LOGGING_SECTION_START=${SECONDS}
export CURRENT_LOGGING_DIR="${LOGDIR}" # set in cli-entrypoint.sh
export CURRENT_LOGFILE="${CURRENT_LOGGING_DIR}/${CURRENT_LOGGING_COUNTER}.000.${CURRENT_LOGGING_SECTION}.log"
declare -g CURRENT_LOGGING_SECTION=${LOG_SECTION:-early} # default to "early", should be overwritten soon enough
declare -g CURRENT_LOGGING_SECTION_START=${SECONDS}
declare -g CURRENT_LOGGING_DIR="${LOGDIR}" # set in cli-entrypoint.sh
declare -g CURRENT_LOGFILE="${CURRENT_LOGGING_DIR}/${CURRENT_LOGGING_COUNTER}.000.${CURRENT_LOGGING_SECTION}.log"
mkdir -p "${CURRENT_LOGGING_DIR}"
touch "${CURRENT_LOGFILE}" # Touch it, make sure it's writable.
@@ -44,14 +44,14 @@ function do_with_logging() {
# So whatever is being called, should prevent rogue stuff writing to stderr.
# this is mostly handled by redirecting stderr to stdout: 2>&1
# global var to store the pid of spawned logging process.
declare -g -i global_tee_pid=0
if [[ "${SHOW_LOG}" == "yes" ]]; then
local prefix_sed_contents
prefix_sed_contents="$(logging_echo_prefix_for_pv "tool") $(echo -n -e "${tool_color}")" # spaces are significant
local prefix_sed_cmd="s/^/${prefix_sed_contents}/;"
# global var to store the pid of spawned logging process.
declare -g -i global_tee_pid=0
# Create a 13rd file descriptor sending it to sed. https://unix.stackexchange.com/questions/174849/redirecting-stdout-to-terminal-and-file-without-using-a-pipe
# Also terrible: don't hold a reference to cwd by changing to SRC always
# There's handling of this fd 13 in check_and_close_fd_13()

View File

@@ -19,9 +19,6 @@ function trap_handler_cleanup_logging() {
local target_path="${DEST}/logs"
mkdir_recursive_and_set_uid_owner "${target_path}" # @TODO: this might be full of logs and is slow
# Check if fd 13 is still open; close it and wait for tee to die. This is done again in discard_logs_tmp_dir()
check_and_close_fd_13
# Before writing new logfile, compress and move existing ones to archive folder.
# - Unless running under CI.
# - Also not if signalled via SKIP_LOG_ARCHIVE=yes
@@ -53,6 +50,16 @@ function trap_handler_cleanup_logging() {
display_alert "Not archiving old logs." "CI=${CI:-false}, SKIP_LOG_ARCHIVE=${SKIP_LOG_ARCHIVE:-no}" "debug"
fi
## Here -- we need to definitely stop logging, cos we're gonna consolidate and delete the logs.
display_alert "End of logging" "STOP LOGGING: CURRENT_LOGFILE: ${CURRENT_LOGFILE}" "debug"
# Stop logging to file...
CURRENT_LOGFILE=""
unset CURRENT_LOGFILE
# Check if fd 13 is still open; close it and wait for tee to die. This is done again in discard_logs_tmp_dir()
check_and_close_fd_13
# Export Markdown assets.
local target_file="${target_path}/armbian-${ARMBIAN_LOG_CLI_ID}-${ARMBIAN_BUILD_UUID}.md"
export_markdown_logs

View File

@@ -78,6 +78,9 @@ function main_trap_handler() {
# Run the cleanup handlers, always.
run_cleanup_handlers || true
;;
*)
display_alert "main_trap_handler" "Unknown trap type '${trap_type}'" "err"
;;
esac
}
@@ -163,8 +166,7 @@ function exit_with_error() {
#exec {FD}> /var/lock/armbian-debootstrap-losetup
#flock -u "${FD}"
# let's try early to close tee/sed and fd 13 which might be opened if this happened in a logging section
check_and_close_fd_13
# do NOT close the fd 13 here, otherwise the error will not be logged to logfile...
exit 43
}