Skip to content

Commit

Permalink
Merge pull request #740 from thomasjacquin/removeBadImages-changes-fixes
Browse files Browse the repository at this point in the history
removeBadImage.sh: misc changes/fixes
  • Loading branch information
linuxkidd authored Nov 5, 2021
2 parents ad701d7 + 8ecff7e commit 98212d4
Showing 1 changed file with 163 additions and 52 deletions.
215 changes: 163 additions & 52 deletions scripts/removeBadImages.sh
Original file line number Diff line number Diff line change
@@ -1,80 +1,191 @@
#!/bin/bash

REMOVE_BAD_IMAGES_THRESHOLD_LOW=${REMOVE_BAD_IMAGES_THRESHOLD_LOW:-0} # in case not in config.sh file
REMOVE_BAD_IMAGES_THRESHOLD_HIGH=${REMOVE_BAD_IMAGES_THRESHOLD_HIGH:-0} # in case not in config.sh file
ME="$(basename "${BASH_ARGV0}")"

ME="$(basename "$BASH_ARGV0")" # Include script name in output so it's easier to find in the log file
source "${ALLSKY_HOME}/variables.sh"
source "${ALLSKY_CONFIG}/config.sh"
source "${ALLSKY_SCRIPTS}/filename.sh"

if [ $# -ne 1 -o 'x$1' == 'x-h' ] ; then
echo "Remove images with corrupt data which might mess up startrails and keograms"
echo "usage: $ME <directory>"
exit 1
# Print the help text and terminate the script.
#
# Arguments: $1 - exit status to terminate with. Defaults to 0 when omitted or empty,
#                 so a bare "usage" call no longer trips the numeric test below.
#                 A non-zero status marks the call as an error and the "Usage:" line
#                 is then wrapped in ${RED} ... ${NC}.
# Globals:   ME, RED, NC are read (none of them is set here).
# Outputs:   the help text, on stdout.
# Returns:   never returns to the caller; exits with the requested status.
usage()
{
	local retcode="${1:-0}"

	echo
	echo "Remove images with corrupt data which might mess up startrails and keograms."
	if [ "${retcode}" -ne 0 ]; then
		# Error case: colour the usage line, then reset the colour (this also ends the line).
		echo -en "${RED}"
		echo -n "Usage: ${ME} [--help] [--debug] directory [file]"
		echo -e "${NC}"
	else
		echo -n "Usage: ${ME} [--help] [--debug] directory [file]"
	fi
	# Ends the usage line in the help case; prints a blank line in the error case.
	echo
	echo "You must enter the arguments in the above order."
	# TODO: use getopts to allow any order
	echo "Turning on debug will indicate bad images but will not remove them."
	echo "If 'file' is specified, only that file in 'directory' will be checked,"
	echo "otherwise all files in 'directory' will be checked."
	exit "${retcode}"
}
# Option handling. Only the first argument is examined, so --help or --debug
# must appear before the directory.
# A help request prints the usage text; usage() exits with the status passed to it.
[ "${1}" = "-h" -o "${1}" = "--help" ] && usage 0
# DEBUG is "true" or "false".
# ${r} is the phrase used in the report messages further down to say what
# happened (or would have happened) to a bad file.
if [ "${1}" = "-d" -o "${1}" = "--debug" ]; then
DEBUG="true"
r="would be removed"
# Drop the option so that ${1} is now the directory.
shift
else
DEBUG="false"
r="removed"
fi

# After option handling there must be one or two arguments left:
# the directory and, optionally, a single file name.  Anything else is a usage error.
[ $# -eq 0 -o $# -gt 2 ] && usage 1

# DATE is the directory to check; FILE is empty unless a second argument was given.
DATE="${1}"
FILE="${2}"

# If we're running in debug mode don't display ${ME} since it makes the output harder to read.
# The prefix is also dropped when ON_TTY is "1"; otherwise ${ME} gets a trailing ":".
# NOTE(review): ON_TTY is not set in this script - presumably it comes from a sourced file; confirm.
if [ ${DEBUG} = "true" -o "${ON_TTY}" = "1" ]; then
ME=""
else
ME="${ME}:"
fi
# Bail out with status 2 when the first argument is not an existing directory.
if [ ! -d "${DATE}" ]; then
echo -e "${RED}${ME} '${DATE}' is not a directory${NC}"
exit 2
fi

if [ \! -d "$1" ] ; then
echo "$ME: $1 is not a directory"
exit 1
if [ "${FILE}" != "" -a ! -f "${DATE}/${FILE}" ]; then
echo -e "${RED}${ME} '${FILE}' not found in '${DATE}'${NC}"
exit 2
fi
DIR=$1

# Super simple: find the full size image-*jpg and image-*png files (not the
# thumbnails) and ask imagemagick to compute a histogram (which is discarded)
# in order to capture the diagnostics from libjpeg. Will have to benchmark to
# be sure, but I suspect it's faster to produce histogram output than another
# image format which would be discarded anyway. If an input image does produce
# a warning message grep will match it and it will be deleted.
#
# This leaves us just images that decompress properly and won't introduce junk
# into the processing pipeline.
#
# Why on G-d's green earth would I do something like this? Because for whatever
# reason, my raspberry pi produces corrupt captures occasionally and this tool
# means I get good startrails and keograms in the morning.
#

DARK_MODE=$(jq -r '.darkframe' "${CAMERA_SETTINGS}")
if [ "${DARK_MODE}" = "1" ]; then
# Disable low brightness check since darks will have extremely low brightness.
# But continue with the other checks in case the dark file is corrupted.
REMOVE_BAD_IMAGES_THRESHOLD_LOW=0
fi

# Find the full size image-*jpg and image-*png files (not the thumbnails) and
# have "convert" compute a histogram (which is discarded),
# in order to capture any error messages.
# If an image DOES produce an error message grep will match it and the file will be deleted.

# Doing this allows good startrails and keograms to be produced on machines that
# sometimes produce corrupt or zero-length files.

# If GNU Parallel is installed (it's not by default), then blast through and
# clean all the images as fast as possible without regard for CPU utilization.

# Use IMAGE_FILES and ERROR_WORDS to avoid duplicating them.
# Remove 0-length files ("insufficient image data") and files too dim or bright.
# $DIR may end in a "/" so there will be "//" in the filenames, but there's no harm in that.
cd $DIR
IMAGE_FILES="$( find . -type f \( -iname image-\*.jpg -o -iname image-\*.png \) \! -ipath \*thumbnail\* )"
ERROR_WORDS="Huffman|Bogus|Corrupt|Invalid|Trunc|Missing|insufficient image data|no decode delegate"
# ${DATE} may end in a "/" so there will be "//" in the filenames, but there's no harm in that.

cd "${DATE}"
if [ "${FILE}" != "" ]; then
IMAGE_FILES="${FILE}"
else
IMAGE_FILES="$( find . -type f -iname "${FILENAME}"-\*.${EXTENSION} \! -ipath \*thumbnail\* )"
fi
ERROR_WORDS="Huffman|Bogus|Corrupt|Invalid|Trunc|Missing|insufficient image data|no decode delegate|no images defined"

TMP=badError.txt
TMP="${ALLSKY_TMP}/badError.txt"

# Save all output to a temp file so we don't potentially swamp the system log file.
OUTPUT="${ALLSKY_TMP}/removeBadImages.log"
> ${OUTPUT}

typeset -i num_bad=0
if which parallel > /dev/null ; then
echo $IMAGE_FILES | \
parallel -- "convert {} histogram:/dev/null 2>&1 | egrep -q "$ERROR_WORDS" && rm -vf {}"
if [ ${DEBUG} = "true" ]; then
rm=""
else
rm="&& rm -vf {}"
fi
echo ${IMAGE_FILES} | \
parallel -- "convert {} histogram:/dev/null 2>&1 | egrep -q ${ERROR_WORDS} ${rm}"
# xxxxxxxxxx need to add THRESHOLD checking here and remove bad thumbnails...
# xxxxxxxxxx Can we replace "rm -vf" above with "echo" and redirect output to the tmp file,
# xxxxxxxxxx then do a "for f in $(cat $TMP); do" and remove the files that way?
# xxxxxxxxxx then do a "for f in $(< ${TMP}); do" and remove the files that way?

else
typeset -i num_bad=0
# If the low threshold is 0 it's disabled.
# If the high one is 0 or greater than 100 it's disabled (HIGH is set to 0).
# A high threshold of exactly 100 is kept as-is.
if [ ${REMOVE_BAD_IMAGES_THRESHOLD_HIGH} -gt 100 -o ${REMOVE_BAD_IMAGES_THRESHOLD_HIGH} -eq 0 ]; then
HIGH=0
else
HIGH=${REMOVE_BAD_IMAGES_THRESHOLD_HIGH}
fi
# LOW is taken unchanged from the configured low threshold.
LOW=${REMOVE_BAD_IMAGES_THRESHOLD_LOW}

# If we're processing a whole directory assume it's done in the background so "nice" it.
# If we're only processing one file we want it done quickly.
# NICE is either the command prefix "nice" or empty.
if [ "${FILE}" = "" ]; then
NICE="nice"
else
NICE=""
fi

for f in ${IMAGE_FILES} ; do
MEAN=$(nice convert "$f" -colorspace Gray -format "%[fx:image.mean]" info: 2> $TMP)
BAD=""
egrep -q "$ERROR_WORDS" $TMP
RET=$?
if [ $RET -eq 0 ] ; then
rm -f "$f" "thumbnails/$f"
BAD="'$f' (corrupt file: $(cat $TMP))"
let num_bad=num_bad+1

if [ ! -s "${f}" ]; then
BAD="'${f}' (zero length)"
else
# Multiply MEAN by 100 to convert to integer (0-100 %) since bash doesn't work with floats.
MEAN=$(echo "$MEAN" | awk '{ printf("%d", $1 * 100); }')
if [ $MEAN -lt $REMOVE_BAD_IMAGES_THRESHOLD_LOW -o $MEAN -gt $REMOVE_BAD_IMAGES_THRESHOLD_HIGH ]; then
rm -f "$f" "thumbnails/$f"
BAD="'$f' (bad threshold: MEAN=$MEAN)"
let num_bad=num_bad+1
# MEAN is a number between 0.0 and 1.0.
MEAN=$(${NICE} convert "${f}" -colorspace Gray -format "%[fx:image.mean]" info: 2> "${TMP}")
egrep -q "${ERROR_WORDS}" "${TMP}"

if [ $? -eq 0 ]; then # at least one error word was found in the output
BAD="'${f}' (corrupt file: $(< "${TMP}"))"

else
# Multiply MEAN by 100 to convert to integer (0-100 %) since
# bash doesn't work with floats.
MEAN=$(echo "${MEAN} * 100" | bc)
MSG=""

if [ ${HIGH} -ne 0 ]; then
x=$(echo "${MEAN} > ${HIGH}" | bc)
if [ ${x} -eq 1 ]; then
BAD="'${f}' (above threshold: MEAN=${MEAN}, threshold = ${HIGH})"
elif [ ${DEBUG} = "true" ]; then
MSG="===== OK: ${f}, MEAN=${MEAN}, HIGH=${HIGH}, LOW=${LOW}"
fi
fi

# An image can't be both HIGH and LOW so if it was HIGH don't check for LOW.
if [ "${BAD}" = "" -a ${LOW} -ne 0 ]; then
x=$(echo "${MEAN} < ${LOW}" | bc)
if [ ${x} -eq 1 ]; then
BAD="'${f}' (below threshold: MEAN=${MEAN}, threshold = ${LOW})"
elif [ ${DEBUG} = "true" -a "${MSG}" = "" ]; then
MSG="===== OK: ${f}, MEAN=${MEAN}, HIGH=${HIGH}, LOW=${LOW}"
fi
fi

if [ ${DEBUG} = "true" -a "${BAD}" = "" -a "${MSG}" != "" ]; then
echo "${MSG}"
fi
fi

fi

if [ "${BAD}" != "" ]; then
echo "${r} ${BAD}" >> "${OUTPUT}"
[ ${DEBUG} = "false" ] && rm -f "${f}" "thumbnails/${f}"
let num_bad=num_bad+1
fi
[ "$BAD" != "" ] && echo "$ME: Removed $BAD"
done

if [ $num_bad -eq 0 ]; then
echo "$ME: No bad files found."
# If only one file, "no news is good news".
[ "${FILE}" = "" ] && echo -e "\n${ME} ${GREEN}No bad files found.${NC}"
rm -f "${OUTPUT}"
else
echo "$ME: $num_bad bad file(s) found and removed."
if [ "${FILE}" = "" ]; then
echo "${ME} ${num_bad} bad file(s) found and ${r}. See ${OUTPUT}."
# Do NOT remove ${OUTPUT} in case the user wants to look at it.
else # only 1 file so show it
echo "${ME} File is bad: $(< "${OUTPUT}")"
rm -f "${OUTPUT}"
fi
fi
fi
rm -f $TMP
rm -f "${TMP}"

exit $num_bad

0 comments on commit 98212d4

Please sign in to comment.