From 573b1df2daa5ab99e1ea6cae4c57f608bfd06fcc Mon Sep 17 00:00:00 2001 From: Stefan Armbruster Date: Mon, 29 Oct 2018 13:55:03 +0100 Subject: [PATCH] Remove empty pages based on cmd line option --- README.md | 8 ++++++ scan | 37 +++++++++++++++++++-------- scan_perpage | 72 ++++++++++++++++++++++++++++++---------------------- 3 files changed, 77 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 1a4287f..2bbe87f 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ Tested and run regularly on Fedora, but should work on other distributions with * pnmtops (netpbm-progs) * ps2pdf (ghostscript) * pdfunite +* ImageMagick (if --skip-empty-pages is used) ### Optional @@ -68,9 +69,16 @@ OPTIONS Run post-processing deskew and black edge detection (requires unpaper) --ocr Run OCR to make the PDF searchable (requires tesseract) + --skip-empty-pages + remove empty pages from resulting PDF document (e.g. one sided doc in duplex mode) OUTPUT -o, --output Output to named file default=scan.pdf -l, --outputlist Output to named files for each scanned page, can be used with append ``` + +## Contributors + +* [Raman Gupta](https://github.com/rocketraman/) +* [Stefan Armbruster](https://github.com/sarmbruster/) diff --git a/scan b/scan index 3d49ffb..356c951 100755 --- a/scan +++ b/scan @@ -12,6 +12,7 @@ SCRIPT="$DIR/scan_perpage" DUPLEX=0 UNPAPER=0 SEARCHABLE=0 +LANGUAGE=eng MAXPAGE= TRUNCPAGE=0 HELP=0 @@ -23,6 +24,9 @@ PGWIDTHIN= CROP=0 DESKEW=0 VERBOSE=0 +SKIP_EMPTY_PAGES=0 +TMP_DIR=`mktemp -d` +trap "rm -rf $TMP_DIR" 0 # Parse command-line options while [ $# -gt 0 ]; do @@ -58,6 +62,10 @@ while [ $# -gt 0 ]; do --searchable|--ocr) SEARCHABLE=1 ;; + --language) LANGUAGE=$1 ;; + + --skip-empty-pages) SKIP_EMPTY_PAGES=1 ;; + -o|--output) shift; OUTPUT="$1" ;; -l|--outputlist) shift; USEARRAY=1; OUTPUT=(); OUTPUT+=("$1") ;; @@ -94,6 +102,8 @@ if [ $HELP -eq 1 ]; then echo " Custom Page Height in mm" echo " -pw, --page-width" echo " Custom Page Width in mm" + 
echo " -x, --device" + echo " Override scanner device name, defaulting to `fujitsu`" echo " --crop" echo " Crop to contents (driver must support this)" echo " --deskew" @@ -102,6 +112,10 @@ if [ $HELP -eq 1 ]; then echo " Run post-processing deskew and black edge detection (requires unpaper)" echo " --ocr" echo " Run OCR to make the PDF searchable (requires tesseract)" + echo " --language " + echo " which language to use for OCR" + echo " --skip-empty-pages" + echo " remove empty pages from resulting PDF document (e.g. one sided doc in duplex mode)" echo "" echo "OUTPUT" echo " -o, --output " @@ -175,10 +189,12 @@ fi export VERBOSE export UNPAPER export SEARCHABLE +export LANGUAGE export RESOLUTION export PGWIDTHIN export PGHEIGHTIN export PS2PDF_OPTS +export SKIP_EMPTY_PAGES if [ $VERBOSE = 1 ]; then LOCKFILE=$(mktemp) @@ -186,12 +202,13 @@ if [ $VERBOSE = 1 ]; then export LOCKFILE fi; + echo >&2 "Scanning..." #eval strace -f -o /tmp/scan-trace.txt scanadf -d $DEVICE $MAXPAGE $PGHEIGHT $PGWIDTH -S $SCRIPT --script-wait --resolution $RESOLUTION --mode $MODE $DESKEW $CROP $SOURCE -o scan-%04d -eval scanadf -d "$DEVICE" $MAXPAGE $PGHEIGHT $PGWIDTH -S $SCRIPT --script-wait --resolution $RESOLUTION --mode $MODE $DESKEW $CROP $SOURCE -o scan-%04d +eval scanadf -d "$DEVICE" $MAXPAGE $PGHEIGHT $PGWIDTH -S $SCRIPT --script-wait --resolution $RESOLUTION --mode $MODE $DESKEW $CROP $SOURCE -o $TMP_DIR/scan-%04d shopt -s extglob nullglob -pdffiles=(scan-[0-9]*.pdf) +pdffiles=($TMP_DIR/scan-[0-9]*.pdf) numscans=${#pdffiles[@]} if [ $numscans -gt 0 ]; then echo "Processing $numscans pages" @@ -213,13 +230,13 @@ if [ $numscans -gt 0 ]; then if [ -f "${OUTPUT[$index]}.orig" ]; then pdffiles+=("${OUTPUT[$index]}.orig") fi - pdffiles+=(scan-*(0)$scanno.pdf) - pdfunite "${pdffiles[@]}" "${OUTPUT[$index]}" && rm scan-*(0)$scanno.pdf + pdffiles+=($TMP_DIR/scan-*(0)$scanno.pdf) + pdfunite "${pdffiles[@]}" "${OUTPUT[$index]}" && rm $TMP_DIR/scan-*(0)$scanno.pdf else - mv 
scan-*(0)$scanno.pdf "${OUTPUT[$index]}" + mv $TMP_DIR/scan-*(0)$scanno.pdf "${OUTPUT[$index]}" fi else - mv scan-*(0)$scanno.pdf "${OUTPUT[$index]}" + mv $TMP_DIR/scan-*(0)$scanno.pdf "${OUTPUT[$index]}" fi let "index = $index + 1" done @@ -232,13 +249,13 @@ if [ $numscans -gt 0 ]; then if [ -f "${OUTPUT}.orig" ]; then pdffiles+=("${OUTPUT}.orig") fi - pdffiles+=(scan-[0-9]*.pdf) - pdfunite "${pdffiles[@]}" "$OUTPUT" && rm scan-[0-9]*.pdf + pdffiles+=($TMP_DIR/scan-[0-9]*.pdf) + pdfunite "${pdffiles[@]}" "$OUTPUT" && rm $TMP_DIR/scan-[0-9]*.pdf else if [ $USEARRAY = 1 ]; then - mv scan-0*.pdf "${OUTPUT[0]}" + mv $TMP_DIR/scan-0*.pdf "${OUTPUT[0]}" else - mv scan-0*.pdf "$OUTPUT" + mv $TMP_DIR/scan-0*.pdf "$OUTPUT" fi fi echo "" diff --git a/scan_perpage b/scan_perpage index d546ee2..366de62 100755 --- a/scan_perpage +++ b/scan_perpage @@ -9,9 +9,11 @@ usage() echo "Set the following environment variables:" echo " UNPAPER" echo " SEARCHABLE" + echo " LANGUAGE" echo " RESOLUTION" echo " PGWIDTHIN" echo " PGHEIGHTIN" + echo " SKIP_EMPTY_PAGES" echo " PS2PDF_OPTS (optional)" echo " VERBOSE (optional)" echo " LOCKFILE (required if VERBOSE=1)" @@ -38,53 +40,63 @@ if [ $# -lt 1 ]; then exit 1 fi -if [ "$UNPAPER" == "" -o "$SEARCHABLE" == "" -o "$RESOLUTION" == "" -o "$RESOLUTION" == "" ]; then +if [ "$UNPAPER" == "" -o "$SEARCHABLE" == "" -o "$RESOLUTION" == "" -o "$RESOLUTION" == "" -o "$SKIP_EMPTY_PAGES" == "" ]; then usage exit 1 fi -IMAGE_FILE=$1 +IMAGE_PATH=$1 +IMAGE_DIR=`dirname $1` +IMAGE_FILE=`basename $1` process_page() { log "" log "-------------------------------------------------------------------------------" log "Post-processing scanned page ${IMAGE_FILE}, deskew=$UNPAPER, searchable=$SEARCHABLE..." 
+ + [[ $SKIP_EMPTY_PAGES -eq 1 ]] && PERCENTAGE_WHITE=`convert "$IMAGE_PATH" -fuzz 0% -negate -threshold 0 -negate -format "%[fx:100*mean]" info:` || PERCENTAGE_WHITE=0 + log "$IMAGE_PATH has $PERCENTAGE_WHITE % white" + PP_PREFIX= - if [ $UNPAPER -eq 1 ]; then - log "Applying unpaper post-processing to image data..." - PP_PREFIX="unpaper-" - if [ $VERBOSE = 1 ]; then - UNPAPERVERBOSE="-v" + if (( $(echo "$PERCENTAGE_WHITE < 99.99" |bc -l) )); then + if [ $UNPAPER -eq 1 ]; then + log "Applying unpaper post-processing to image data..." + PP_PREFIX="unpaper-" + if [ $VERBOSE = 1 ]; then + UNPAPERVERBOSE="-v" + fi + #unpaper $UNPAPERVERBOSE --no-mask-scan --overwrite --dpi $RESOLUTION --no-blackfilter $IMAGE_FILE $PP_PREFIX$IMAGE_FILE | logstdout + unpaper $UNPAPERVERBOSE --overwrite --dpi $RESOLUTION $IMAGE_PATH $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE | logstdout fi - #unpaper $UNPAPERVERBOSE --no-mask-scan --overwrite --dpi $RESOLUTION --no-blackfilter $IMAGE_FILE $PP_PREFIX$IMAGE_FILE | logstdout - unpaper $UNPAPERVERBOSE --overwrite --dpi $RESOLUTION $IMAGE_FILE $PP_PREFIX$IMAGE_FILE | logstdout - fi - log "" - log "-------------------------------------------------------------------------------" - log "Converting image data to pdf..." - if [ $SEARCHABLE -eq 1 ]; then - log "Converting image data to searchable pdf..." - tesseract $PP_PREFIX$IMAGE_FILE ${IMAGE_FILE%.*} -l eng pdf | logstdout - else + log "" + log "-------------------------------------------------------------------------------" log "Converting image data to pdf..." - PNMVERBOSE= - if [ $VERBOSE = 1 ]; then - PNMVERBOSE="-verbose" - fi - if [ "$PGWIDTHIN" == "" -o "$PGHEIGHTIN" == "" ]; then - PAGEOPTS="-equalpixels -dpi=$RESOLUTION -noturn" + if [ $SEARCHABLE -eq 1 ]; then + log "Converting image data to searchable pdf..." 
+ tesseract $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE $IMAGE_DIR/${IMAGE_FILE%.*} -l $LANGUAGE pdf | logstdout else - PAGEOPTS="-imagewidth $PGWIDTHIN -imageheight $PGHEIGHTIN" - fi - log "Using page options: $PAGEOPTS" - pnmtops $PNMVERBOSE $PAGEOPTS $PP_PREFIX$IMAGE_FILE | ps2pdf $PS2PDF_OPTS - > ${IMAGE_FILE%.*}.pdf | logstdout + log "Converting image data to pdf..." + PNMVERBOSE= + if [ $VERBOSE = 1 ]; then + PNMVERBOSE="-verbose" + fi + if [ "$PGWIDTHIN" == "" -o "$PGHEIGHTIN" == "" ]; then + PAGEOPTS="-equalpixels -dpi=$RESOLUTION -noturn" + else + PAGEOPTS="-imagewidth $PGWIDTHIN -imageheight $PGHEIGHTIN" + fi + log "Using page options: $PAGEOPTS" + pnmtops $PNMVERBOSE $PAGEOPTS $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE | ps2pdf $PS2PDF_OPTS - > $IMAGE_DIR/${IMAGE_FILE%.*}.pdf | logstdout + fi + else + log "skipping empty page $IMAGE_FILE" fi status=$? - rm $IMAGE_FILE - if [ -f $PP_PREFIX$IMAGE_FILE ]; then - rm $PP_PREFIX$IMAGE_FILE + rm $IMAGE_PATH + if [ -f $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE ]; then + rm $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE fi log ""