#!/bin/bash
#
# scan_perpage - post-process one scanned page into a PDF.
#
# Usage: scan_perpage <imagefile>
#   where imagefile is the data just scanned
#   (specify this script to scanadf via -S)
#
# All configuration is read from environment variables; see usage().
# The input image (and any intermediate unpaper output) is deleted once the
# page has been handled. The resulting PDF is written next to the input image
# with the same base name and a .pdf extension.
#
# Exit status: 0 on success or when an empty page was skipped, otherwise the
# status of the first processing tool that failed.

# Print usage information, including the environment variables consulted.
usage() {
  echo "Usage: $0 <imagefile>"
  echo "Set the following environment variables:"
  echo "  UNPAPER"
  echo "  SEARCHABLE"
  echo "  LANGUAGE"
  echo "  RESOLUTION"
  echo "  PGWIDTHIN"
  echo "  PGHEIGHTIN"
  echo "  SKIP_EMPTY_PAGES"
  echo "  PS2PDF_OPTS (optional)"
  echo "  VERBOSE (optional)"
  echo "  LOCKFILE (required if VERBOSE=1)"
}

# Succeeds only when VERBOSE is exactly "1"; an unset VERBOSE counts as off.
is_verbose() {
  [[ "${VERBOSE:-0}" == 1 ]]
}

# Print a prefixed message ($1) to stdout when verbose mode is on.
log() {
  if is_verbose; then
    echo "scan_perpage: $1"
  fi
}

# Filter for tool output: pass stdin through when verbose, else discard it.
logstdout() {
  if is_verbose; then
    cat
  else
    cat > /dev/null
  fi
}

# Run the given command, throttled through GNU parallel's `sem` when it is
# installed, otherwise directly.
runconstrained() {
  if [ -x "$(command -v sem)" ]; then
    # use up to 75% of the cores available
    sem --jobs 75% --id scan_perpage --fg "$@"
  else
    "$@"
  fi
}

if (( $# < 1 )); then
  usage
  exit 1
fi

if [[ -z "${UNPAPER:-}" || -z "${SEARCHABLE:-}" || -z "${RESOLUTION:-}" \
  || -z "${SKIP_EMPTY_PAGES:-}" ]]; then
  usage
  exit 1
fi

# The lock file is opened unconditionally in verbose mode, so insist on it
# here instead of failing later with an "ambiguous redirect" error.
if is_verbose && [[ -z "${LOCKFILE:-}" ]]; then
  usage
  exit 1
fi

IMAGE_PATH=$1
IMAGE_DIR=$(dirname -- "$1")
IMAGE_FILE=$(basename -- "$1")

# Convert the scanned page at IMAGE_PATH to a PDF.
# Globals (read): IMAGE_PATH, IMAGE_DIR, IMAGE_FILE, UNPAPER, SEARCHABLE,
#                 LANGUAGE, RESOLUTION, PGWIDTHIN, PGHEIGHTIN,
#                 SKIP_EMPTY_PAGES, PS2PDF_OPTS, VERBOSE
# Returns: 0 on success or skipped page, else the failing tool's status.
process_page() {
  local status=0
  local percentage_white=0
  local skip_page=0
  local pp_prefix=
  local page
  local pdf_base="$IMAGE_DIR/${IMAGE_FILE%.*}"
  local -a unpaper_opts lang_opts page_opts pnm_opts stage_status
  local rc

  log ""
  log "-------------------------------------------------------------------------------"
  log "Post-processing scanned page ${IMAGE_PATH}, deskew=$UNPAPER, searchable=$SEARCHABLE..."

  if [[ "$SKIP_EMPTY_PAGES" -eq 1 ]]; then
    percentage_white=$(convert "$IMAGE_PATH" -fuzz 0% -negate -threshold 0 \
      -negate -format "%[fx:100*mean]" info:) || percentage_white=0
    log "$IMAGE_PATH has $percentage_white % white"
    # Only skip when the comparison positively succeeds: a missing awk or a
    # non-numeric value must never cause a real page to be thrown away.
    if LC_ALL=C awk -v p="$percentage_white" \
      'BEGIN { exit !((p + 0) >= 99.8) }'; then
      skip_page=1
    fi
  fi

  if (( skip_page == 0 )); then
    if [[ "$UNPAPER" -eq 1 ]]; then
      log "Applying unpaper post-processing to image data..."
      pp_prefix="unpaper-"
      unpaper_opts=(--overwrite --dpi "$RESOLUTION")
      if is_verbose; then
        unpaper_opts=(-v "${unpaper_opts[@]}")
      fi
      runconstrained unpaper "${unpaper_opts[@]}" \
        "$IMAGE_PATH" "$IMAGE_DIR/$pp_prefix$IMAGE_FILE" | logstdout
      # $? would be logstdout's status; the tool's status is in PIPESTATUS.
      status=${PIPESTATUS[0]}
    fi
    page="$IMAGE_DIR/$pp_prefix$IMAGE_FILE"

    log ""
    log "-------------------------------------------------------------------------------"

    if (( status != 0 )); then
      # Without unpaper's output there is nothing to convert.
      log "unpaper failed with status $status, skipping pdf conversion"
    elif [[ "$SEARCHABLE" -eq 1 ]]; then
      log "Converting image data to searchable pdf..."
      # Pass -l only when a language is configured; an empty value would
      # otherwise make tesseract take "pdf" as the language name.
      lang_opts=()
      if [[ -n "${LANGUAGE:-}" ]]; then
        lang_opts=(-l "$LANGUAGE")
      fi
      runconstrained tesseract "$page" "$pdf_base" "${lang_opts[@]}" pdf \
        | logstdout
      status=${PIPESTATUS[0]}
    else
      log "Converting image data to pdf..."
      if [[ -z "${PGWIDTHIN:-}" || -z "${PGHEIGHTIN:-}" ]]; then
        page_opts=(-equalpixels "-dpi=$RESOLUTION" -noturn)
      else
        page_opts=(-imagewidth "$PGWIDTHIN" -imageheight "$PGHEIGHTIN")
      fi
      pnm_opts=()
      # older versions of pnmtops (Ubuntu, Debian) don't have the -verbose
      # option, test for it
      if is_verbose \
        && [[ ! "$(pnmtops -verbose 2>&1 < /dev/null)" =~ "unrecognized option" ]]; then
        pnm_opts=(-verbose)
      fi
      log "Using page options: ${page_opts[*]}"
      # PS2PDF_OPTS is deliberately unquoted: it may carry several options.
      # shellcheck disable=SC2086
      runconstrained pnmtops "${pnm_opts[@]}" "${page_opts[@]}" "$page" \
        | ps2pdf ${PS2PDF_OPTS:-} - > "$pdf_base.pdf"
      stage_status=("${PIPESTATUS[@]}")
      # Report the first failing stage of the pnmtops | ps2pdf pipeline.
      for rc in "${stage_status[@]}"; do
        if (( rc != 0 )); then
          status=$rc
          break
        fi
      done
    fi
  else
    log "Skipping empty page $IMAGE_FILE with white percentage $percentage_white"
  fi

  # The source image and any intermediate file are removed in every case.
  rm -- "$IMAGE_PATH"
  if [[ -f "$IMAGE_DIR/$pp_prefix$IMAGE_FILE" ]]; then
    rm -- "$IMAGE_DIR/$pp_prefix$IMAGE_FILE"
  fi

  log ""
  log "Scan processing done, status = $status"
  return "$status"
}

if is_verbose; then
  # Hold an exclusive lock on LOCKFILE while processing; presumably this keeps
  # the verbose output of concurrently running instances from interleaving.
  # The subshell's exit status is process_page's return value.
  (
    flock 200
    process_page
  ) 200>"$LOCKFILE"
  status=$?
else
  process_page
  status=$?
fi

exit "$status"