sane-scan-pdf/scan_perpage
2023-04-19 12:45:06 -04:00

145 lines
4.7 KiB
Bash
Executable File

#!/bin/bash
# Usage: scan_perpage <imagefile>
# where imagefile is the data just scanned
# (specify this script to scanadf via -S)
# Print the command synopsis and the environment variables this script reads.
# Output goes to stdout; callers decide the exit status themselves.
usage()
{
  # Unquoted delimiter on purpose: $0 must expand to the invoked script name.
  cat <<EOF
Usage: $0 <imagefile>
Set the following environment variables:
 UNPAPER
 SEARCHABLE
 LANGUAGE
 RESOLUTION
 PGWIDTHIN
 PGHEIGHTIN
 SKIP_EMPTY_PAGES
 WHITE_THRESHOLD (used when SKIP_EMPTY_PAGES=1)
 BRIGHTNESS_CONTRAST
 PS2PDF_OPTS (optional)
 VERBOSE (optional)
 LOCKFILE (required if VERBOSE=1)
EOF
}
# Write one diagnostic line, prefixed with the script name, to stdout.
# Does nothing unless VERBOSE is 1.
#   $1 - message text
log()
{
  [[ $VERBOSE == 1 ]] || return 0
  printf '%s\n' "scan_perpage: $1"
}
# Pipeline sink for tool output: stdin is copied to stdout when VERBOSE is 1
# and discarded otherwise.
logstdout()
{
  case "$VERBOSE" in
    1) cat ;;
    *) cat > /dev/null ;;
  esac
}
# Run the given command line, throttled through `sem` when an executable
# `sem` is found on PATH, and directly otherwise.
#   $@ - command and its arguments
# Returns the exit status of whichever invocation was used.
runconstrained()
{
  local sem_bin
  sem_bin=$(command -v sem)
  if [[ ! -x "$sem_bin" ]]; then
    "$@"
    return
  fi
  # use up to 75% of the cores available
  sem --jobs 75% --id scan_perpage --fg "$@"
}
# The image path is mandatory. Compare numerically: inside [[ ]] the `<`
# operator compares strings, not numbers.
if (( $# < 1 )); then
  usage
  exit 1
fi
# Settings every run needs.
if [[ -z "$UNPAPER" || -z "$SEARCHABLE" || -z "$RESOLUTION" || -z "$SKIP_EMPTY_PAGES" ]]; then
  usage
  exit 1
fi
# LANGUAGE is handed to tesseract's -l option when SEARCHABLE=1; if it were
# empty the following word ("pdf") would be taken as the language instead.
if [[ "$SEARCHABLE" == 1 && -z "$LANGUAGE" ]]; then
  usage
  exit 1
fi
# Path of the page image given on the command line, plus its directory and
# file name. "$1" is quoted (and preceded by --) so that paths containing
# whitespace, glob characters or a leading dash are split correctly.
IMAGE_PATH=$1
IMAGE_DIR=$(dirname -- "$1")
IMAGE_FILE=$(basename -- "$1")
# Command prefix placed in front of each tool invocation: `time` when
# VERBOSE is 1, empty otherwise.
TIMEVERBOSE=
if [[ $VERBOSE == 1 ]]; then
  TIMEVERBOSE=time
fi
# Remember the first non-zero exit code seen while processing a page.
#   $1 - exit code of the command that just ran
_pp_record() {
  local rc=$1
  if (( rc != 0 && status == 0 )); then
    status=$rc
  fi
}

# Post-process the scanned page at IMAGE_PATH and write a PDF next to it.
#
# Steps, in order:
#   1. brightness/contrast adjustment in place (BRIGHTNESS_CONTRAST non-empty)
#   2. blank-page detection (SKIP_EMPTY_PAGES=1): the page is skipped when its
#      white percentage is greater than WHITE_THRESHOLD
#   3. unpaper pass into "unpaper-<file>" (UNPAPER=1)
#   4. PDF creation: convert + tesseract (SEARCHABLE=1), else pnmtops + ps2pdf
#   5. removal of the source image and of the unpaper intermediate
#
# Globals read:    IMAGE_PATH IMAGE_DIR IMAGE_FILE UNPAPER SEARCHABLE LANGUAGE
#                  RESOLUTION PGWIDTHIN PGHEIGHTIN SKIP_EMPTY_PAGES
#                  WHITE_THRESHOLD BRIGHTNESS_CONTRAST PS2PDF_OPTS VERBOSE
#                  TIMEVERBOSE
# Globals written: status - exit code of the first tool that failed, else 0
# Returns:         the value of status
process_page() {
  local percentage_white=0
  local pp_prefix=
  local work_image tiff_file ps_file
  local -a unpaper_verbose=() pnm_verbose=() page_opts=()

  # Deliberately not local: the top-level code reads it after the call.
  status=0

  log ""
  log "-------------------------------------------------------------------------------"
  log "Post-processing scanned page ${IMAGE_PATH}, deskew=$UNPAPER, searchable=$SEARCHABLE, skip-empty-pages=$SKIP_EMPTY_PAGES, white-threshold=$WHITE_THRESHOLD, brightness-contrast-sw=$BRIGHTNESS_CONTRAST..."
  log "-------------------------------------------------------------------------------"

  if [[ -n "$BRIGHTNESS_CONTRAST" ]]; then
    log "Adjust brightness and contrast in ImageMagick by $BRIGHTNESS_CONTRAST"
    convert "$IMAGE_PATH" -brightness-contrast "$BRIGHTNESS_CONTRAST" "$IMAGE_PATH"
    _pp_record $?
  fi

  if [[ $SKIP_EMPTY_PAGES == 1 ]]; then
    # If the measurement itself fails, treat the page as not blank (0 % white).
    percentage_white=$(convert "$IMAGE_PATH" -fuzz 0% -negate -threshold 0 -negate -format "%[fx:100*mean]" info:) || percentage_white=0
    log "$IMAGE_PATH has $percentage_white % white"
  fi

  # bc does the comparison because the percentage is not an integer.
  if [[ $SKIP_EMPTY_PAGES == 1 && $(echo "$percentage_white > $WHITE_THRESHOLD" | bc -l) == 1 ]]; then
    log "Skipping empty page $IMAGE_FILE with white percentage $percentage_white"
  else
    if [[ $UNPAPER == 1 ]]; then
      log "Applying unpaper post-processing to image data..."
      pp_prefix="unpaper-"
      if [[ $VERBOSE == 1 ]]; then
        unpaper_verbose=(-v)
      fi
      # Alternative invocation kept for reference:
      #runconstrained $TIMEVERBOSE unpaper $UNPAPERVERBOSE --no-mask-scan --overwrite --deskew-scan-range=35 --dpi $RESOLUTION --no-blackfilter $IMAGE_FILE $PP_PREFIX$IMAGE_FILE | logstdout
      # $TIMEVERBOSE is intentionally unquoted: it is empty unless VERBOSE=1.
      runconstrained $TIMEVERBOSE unpaper "${unpaper_verbose[@]}" --overwrite --deskew-scan-range=35 --dpi "$RESOLUTION" "$IMAGE_PATH" "$IMAGE_DIR/$pp_prefix$IMAGE_FILE" | logstdout
      # The pipeline's own status is logstdout's; take the tool's instead.
      _pp_record "${PIPESTATUS[0]}"
    fi

    # Input for the PDF step: the unpaper output if produced, else the scan.
    work_image="$IMAGE_DIR/$pp_prefix$IMAGE_FILE"

    if [[ $SEARCHABLE == 1 ]]; then
      log "Converting image data to searchable pdf..."
      tiff_file="${work_image}.tiff"
      # tesseract uses the input's DPI header, we need to convert to a format that supports this (like tiff)
      log "...Running convert"
      runconstrained $TIMEVERBOSE convert -density "${RESOLUTION}x${RESOLUTION}" -units PixelsPerInch "$work_image" "$tiff_file" | logstdout
      _pp_record "${PIPESTATUS[0]}"
      log "...Running tesseract"
      runconstrained $TIMEVERBOSE tesseract "$tiff_file" "$IMAGE_DIR/${IMAGE_FILE%.*}" -l "$LANGUAGE" pdf | logstdout
      _pp_record "${PIPESTATUS[0]}"
      rm -f -- "$tiff_file"
    else
      log "Converting image data to pdf..."
      if [[ -z "$PGWIDTHIN" || -z "$PGHEIGHTIN" ]]; then
        page_opts=(-equalpixels "-dpi=$RESOLUTION" -noturn)
      else
        page_opts=(-imagewidth "$PGWIDTHIN" -imageheight "$PGHEIGHTIN")
      fi
      # older versions of pnmtops (Ubuntu, Debian) don't have the -verbose option, test for it
      if [[ $VERBOSE == 1 && ! "$(pnmtops -verbose 2>&1 < /dev/null)" =~ "unrecognized option" ]]; then
        pnm_verbose=(-verbose)
      fi
      ps_file="$IMAGE_DIR/${IMAGE_FILE}.ps"
      log "...Running pnmtops on $work_image using page options: ${page_opts[*]}"
      # stdout is the PostScript data itself, so it goes to the file, not to logstdout.
      runconstrained $TIMEVERBOSE pnmtops "${pnm_verbose[@]}" "${page_opts[@]}" "$work_image" > "$ps_file"
      _pp_record $?
      log "...Running ps2pdf on $ps_file"
      # PS2PDF_OPTS is intentionally unquoted so it can carry several options.
      runconstrained $TIMEVERBOSE ps2pdf $PS2PDF_OPTS "$ps_file" "$IMAGE_DIR/${IMAGE_FILE}.pdf" | logstdout
      _pp_record "${PIPESTATUS[0]}"
      rm -f -- "$ps_file"
    fi
  fi

  # The source image is removed whether or not the conversion succeeded.
  rm -- "$IMAGE_PATH"
  rm -f -- "$IMAGE_DIR/$pp_prefix$IMAGE_FILE"
  log ""
  log "Scan page processing done, status = $status"
  return "$status"
}
# Process the page and exit with the status it recorded.
if [[ $VERBOSE == 1 ]]; then
  # Verbose runs hold a lock on LOCKFILE (opened as fd 200) while the page is
  # processed - presumably so that output from concurrently running instances
  # does not interleave.
  (
    flock 200
    process_page
    # `status` is set inside this subshell only; hand it out as the exit code.
    exit "${status:-0}"
  ) 200>"$LOCKFILE"
  status=$?
else
  process_page
fi
exit "${status:-0}"