#!/bin/bash
# Mirror of https://github.com/rocketraman/sane-scan-pdf.git
# (synced 2025-05-15 23:20:18 -07:00)
#
# Usage: scan_perpage <imagefile>
#   where imagefile is the data just scanned
#   (specify this script to scanadf via -S)

# Print the command-line synopsis followed by the environment variables this
# script reads. Output goes to stdout; the function does not exit, so callers
# decide what to do next.
usage() {
  echo "Usage: $0 <imagefile>"
  echo "Set the following environment variables:"
  echo " UNPAPER"
  echo " SEARCHABLE"
  echo " LANGUAGE"
  echo " RESOLUTION"
  echo " PGWIDTHIN"
  echo " PGHEIGHTIN"
  echo " SKIP_EMPTY_PAGES"
  # The empty-page check compares the white percentage against this value.
  echo " WHITE_THRESHOLD (used if SKIP_EMPTY_PAGES=1)"
  echo " BRIGHTNESS_CONTRAST"
  echo " PS2PDF_OPTS (optional)"
  echo " VERBOSE (optional)"
  echo " LOCKFILE (required if VERBOSE=1)"
}

# Write a diagnostic line, prefixed with "scan_perpage: ", to stdout.
# Produces no output unless VERBOSE is 1.
# Arguments: $1 - message text
log() {
  if [[ "$VERBOSE" == 1 ]]; then
    printf '%s\n' "scan_perpage: $1"
  fi
}

# Pipeline sink for the stdout of external tools: copies stdin to stdout when
# VERBOSE is 1 and discards it otherwise. Stdin is always read to the end, so
# the upstream command never blocks on a full pipe.
logstdout() {
  if [[ "$VERBOSE" == 1 ]]; then
    cat
  else
    cat > /dev/null
  fi
}

# Run a command, throttled through `sem` when an executable `sem` is on PATH,
# or directly when it is not.
# NOTE(review): `sem` is presumably the semaphore front-end shipped with
# GNU parallel - confirm against the install docs.
# Arguments: $@ - command and its arguments
# Returns:   the status of `sem` (throttled) or of the command itself (direct)
runconstrained() {
  if [[ -x "$(command -v sem)" ]]; then
    # use up to 75% of the cores available
    sem --jobs 75% --id scan_perpage --fg "$@"
  else
    "$@"
  fi
}

# The image path is the only positional argument and it is mandatory.
# (Arithmetic comparison: inside [[ ]], `<` compares strings, not numbers.)
if (( $# < 1 )); then
  usage
  exit 1
fi

# These settings are checked for emptiness up front; bail out with the usage
# text when any of them is missing.
# NOTE(review): the original tested RESOLUTION twice; the duplicate may have
# been meant for another variable (e.g. LANGUAGE) - confirm with the caller.
if [[ -z "$UNPAPER" || -z "$SEARCHABLE" || -z "$RESOLUTION" || -z "$SKIP_EMPTY_PAGES" ]]; then
  usage
  exit 1
fi

# Quote "$1" so that paths containing whitespace or glob characters survive;
# `--` protects against a path that begins with a dash.
IMAGE_PATH=$1
IMAGE_DIR=$(dirname -- "$1")
IMAGE_FILE=$(basename -- "$1")

# In verbose mode every processing stage is prefixed with `time`.
TIMEVERBOSE=
if [[ "$VERBOSE" == 1 ]]; then
  TIMEVERBOSE=time
fi

# Remember a stage's exit status in the global `status`, keeping the first
# non-zero value so a later successful stage cannot hide an earlier failure.
# Arguments: $1 - exit status of the stage that just ran
_process_page_note_status() {
  if (( status == 0 )); then
    status=$1
  fi
}

# Run one processing stage through runconstrained (prefixed with the command
# named by TIMEVERBOSE when that is non-empty), route its stdout through
# logstdout, and record the stage's own exit status rather than the
# pipeline's (which would always be logstdout's).
# Arguments: $@ - command and arguments of the stage
_process_page_stage() {
  runconstrained ${TIMEVERBOSE:+"$TIMEVERBOSE"} "$@" | logstdout
  _process_page_note_status "${PIPESTATUS[0]}"
}

# Post-process the scanned page at IMAGE_PATH and turn it into a PDF next to
# it, then delete the scanned image and any intermediate files.
#
# Globals read:    IMAGE_PATH, IMAGE_DIR, IMAGE_FILE, UNPAPER, SEARCHABLE,
#                  LANGUAGE, RESOLUTION, PGWIDTHIN, PGHEIGHTIN,
#                  SKIP_EMPTY_PAGES, WHITE_THRESHOLD, BRIGHTNESS_CONTRAST,
#                  PS2PDF_OPTS, VERBOSE, TIMEVERBOSE
# Globals written: status - 0 when every stage succeeded, otherwise the exit
#                  status of the first stage that failed
# Returns:         the same value as `status`
process_page() {
  local percentage_white=0
  local is_empty=0
  local pp_prefix=
  local work_image="$IMAGE_DIR/$IMAGE_FILE"
  local tiff_file ps_file
  local -a unpaper_verbose=() lang_opts=() page_opts=() pnm_verbose=()

  # Deliberately global: the end of the script exits with this value.
  status=0

  log ""
  log "-------------------------------------------------------------------------------"
  log "Post-processing scanned page ${IMAGE_PATH}, deskew=$UNPAPER, searchable=$SEARCHABLE, skip-empty-pages=$SKIP_EMPTY_PAGES, white-threshold=$WHITE_THRESHOLD, brightness-contrast-sw=$BRIGHTNESS_CONTRAST..."
  log "-------------------------------------------------------------------------------"

  if [[ -n "$BRIGHTNESS_CONTRAST" ]]; then
    log "Adjust brightness and contrast in ImageMagick by $BRIGHTNESS_CONTRAST"
    # The image is adjusted in place.
    convert "$IMAGE_PATH" -brightness-contrast "$BRIGHTNESS_CONTRAST" "$IMAGE_PATH"
    _process_page_note_status $?
  fi

  if [[ "$SKIP_EMPTY_PAGES" == 1 ]]; then
    # Best effort: if the measurement fails, treat the page as 0% white so it
    # is never skipped by mistake.
    percentage_white=$(convert "$IMAGE_PATH" -fuzz 0% -negate -threshold 0 -negate -format "%[fx:100*mean]" info:) || percentage_white=0
    log "$IMAGE_PATH has $percentage_white % white"

    # bc prints 1 when the comparison holds. Without a threshold there is
    # nothing to compare against, so the page is kept.
    if [[ -n "$WHITE_THRESHOLD" ]]; then
      is_empty=$(printf '%s > %s\n' "$percentage_white" "$WHITE_THRESHOLD" | bc -l)
    fi
  fi

  if [[ "$is_empty" == 1 ]]; then
    log "Skipping empty page $IMAGE_FILE with white percentage $percentage_white"
  else
    if [[ "$UNPAPER" == 1 ]]; then
      log "Applying unpaper post-processing to image data..."
      pp_prefix="unpaper-"
      # Later stages read unpaper's output instead of the raw scan.
      work_image="$IMAGE_DIR/$pp_prefix$IMAGE_FILE"
      if [[ "$VERBOSE" == 1 ]]; then
        unpaper_verbose=(-v)
      fi
      # Alternative invocation kept from the original for reference:
      #runconstrained $TIMEVERBOSE unpaper $UNPAPERVERBOSE --no-mask-scan --overwrite --deskew-scan-range=35 --dpi $RESOLUTION --no-blackfilter $IMAGE_FILE $PP_PREFIX$IMAGE_FILE | logstdout
      _process_page_stage unpaper "${unpaper_verbose[@]}" --overwrite --deskew-scan-range=35 --dpi "$RESOLUTION" "$IMAGE_PATH" "$work_image"
    fi

    if [[ "$SEARCHABLE" == 1 ]]; then
      log "Converting image data to searchable pdf..."
      # tesseract uses the input's DPI header, we need to convert to a format that supports this (like tiff)
      log "...Running convert"
      tiff_file="$work_image.tiff"
      _process_page_stage convert -density "${RESOLUTION}x${RESOLUTION}" -units PixelsPerInch "$work_image" "$tiff_file"

      log "...Running tesseract"
      # Only pass -l when a language is configured; with an empty LANGUAGE the
      # unquoted original made tesseract read "pdf" as the language name.
      if [[ -n "$LANGUAGE" ]]; then
        lang_opts=(-l "$LANGUAGE")
      fi
      # tesseract is given the output name without extension plus the "pdf"
      # config, so the result lands next to the scan with its last extension
      # replaced.
      _process_page_stage tesseract "$tiff_file" "$IMAGE_DIR/${IMAGE_FILE%.*}" "${lang_opts[@]}" pdf
      rm -f -- "$tiff_file"
    else
      log "Converting image data to pdf..."
      if [[ -z "$PGWIDTHIN" || -z "$PGHEIGHTIN" ]]; then
        page_opts=(-equalpixels "-dpi=$RESOLUTION" -noturn)
      else
        page_opts=(-imagewidth "$PGWIDTHIN" -imageheight "$PGHEIGHTIN")
      fi

      # older versions of pnmtops (Ubuntu, Debian) don't have the -verbose option, test for it
      if [[ "$VERBOSE" == 1 && ! "$(pnmtops -verbose 2>&1 < /dev/null)" =~ "unrecognized option" ]]; then
        pnm_verbose=(-verbose)
      fi

      ps_file="$IMAGE_DIR/$IMAGE_FILE.ps"
      log "...Running pnmtops on $work_image using page options: ${page_opts[*]}"
      # pnmtops writes the PostScript to stdout, which goes straight to the
      # file, so there is nothing left to send through logstdout.
      runconstrained ${TIMEVERBOSE:+"$TIMEVERBOSE"} pnmtops "${pnm_verbose[@]}" "${page_opts[@]}" "$work_image" > "$ps_file"
      _process_page_note_status $?

      log "...Running ps2pdf on $ps_file"
      # PS2PDF_OPTS is a whitespace-separated option list, so it must be
      # word-split here.
      # shellcheck disable=SC2086
      _process_page_stage ps2pdf $PS2PDF_OPTS "$ps_file" "$IMAGE_DIR/$IMAGE_FILE.pdf"
      rm -f -- "$ps_file"
    fi
  fi

  # The scanned image and unpaper's intermediate output are always removed,
  # whether or not the page was converted.
  rm -- "$IMAGE_PATH"
  rm -f -- "$work_image"

  log ""
  log "Scan page processing done, status = $status"
  return "$status"
}

# Process the page and exit with the status it recorded.
if [[ "$VERBOSE" == 1 ]]; then
  # Verbose mode needs the lock file; without it the redirection below fails
  # before any processing happens, so say so up front.
  if [[ -z "$LOCKFILE" ]]; then
    usage
    exit 1
  fi
  # Hold an exclusive lock on LOCKFILE (via fd 200) for the whole page -
  # presumably so verbose output from pages processed concurrently does not
  # interleave.
  (
    flock 200
    process_page
    # `status` is assigned inside this subshell; hand it to the parent
    # through the subshell's exit code.
    exit "${status:-0}"
  ) 200>"$LOCKFILE"
  status=$?
else
  process_page
fi

exit "${status:-0}"