mirror of
https://github.com/rocketraman/sane-scan-pdf.git
synced 2025-05-16 23:50:39 -07:00
Remove empty pages based on cmd line option
This commit is contained in:
parent
349e0a5dea
commit
573b1df2da
@ -24,6 +24,7 @@ Tested and run regularly on Fedora, but should work on other distributions with
|
|||||||
* pnmtops (netpbm-progs)
|
* pnmtops (netpbm-progs)
|
||||||
* ps2pdf (ghostscript)
|
* ps2pdf (ghostscript)
|
||||||
* pdfunite
|
* pdfunite
|
||||||
|
* ImageMagick (if --skip-empty-pages is used)
|
||||||
|
|
||||||
### Optional
|
### Optional
|
||||||
|
|
||||||
@ -68,9 +69,16 @@ OPTIONS
|
|||||||
Run post-processing deskew and black edge detection (requires unpaper)
|
Run post-processing deskew and black edge detection (requires unpaper)
|
||||||
--ocr
|
--ocr
|
||||||
Run OCR to make the PDF searchable (requires tesseract)
|
Run OCR to make the PDF searchable (requires tesseract)
|
||||||
|
--skip-empty-pages
|
||||||
|
remove empty pages from resulting PDF document (e.g. one sided doc in duplex mode)
|
||||||
|
|
||||||
OUTPUT
|
OUTPUT
|
||||||
-o, --output <outputfile>
|
-o, --output <outputfile>
|
||||||
Output to named file default=scan.pdf
|
Output to named file default=scan.pdf
|
||||||
-l, --outputlist <outputfile-1...outputfile-n> Output to named files for each scanned page, can be used with append
|
-l, --outputlist <outputfile-1...outputfile-n> Output to named files for each scanned page, can be used with append
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Contributors
|
||||||
|
|
||||||
|
* [Raman Gupta](https://github.com/rocketraman/)
|
||||||
|
* [Stefan Armbruster](https://github.com/sarmbruster/)
|
||||||
|
37
scan
37
scan
@ -12,6 +12,7 @@ SCRIPT="$DIR/scan_perpage"
|
|||||||
DUPLEX=0
|
DUPLEX=0
|
||||||
UNPAPER=0
|
UNPAPER=0
|
||||||
SEARCHABLE=0
|
SEARCHABLE=0
|
||||||
|
LANGUAGE=eng
|
||||||
MAXPAGE=
|
MAXPAGE=
|
||||||
TRUNCPAGE=0
|
TRUNCPAGE=0
|
||||||
HELP=0
|
HELP=0
|
||||||
@ -23,6 +24,9 @@ PGWIDTHIN=
|
|||||||
CROP=0
|
CROP=0
|
||||||
DESKEW=0
|
DESKEW=0
|
||||||
VERBOSE=0
|
VERBOSE=0
|
||||||
|
SKIP_EMPTY_PAGES=0
|
||||||
|
TMP_DIR=`mktemp -d`
|
||||||
|
# Remove the scratch directory on exit. Single quotes defer expansion until the
# trap fires, and the inner double quotes keep a path with whitespace intact.
trap 'rm -rf -- "$TMP_DIR"' EXIT
|
||||||
|
|
||||||
# Parse command-line options
|
# Parse command-line options
|
||||||
while [ $# -gt 0 ]; do
|
while [ $# -gt 0 ]; do
|
||||||
@ -58,6 +62,10 @@ while [ $# -gt 0 ]; do
|
|||||||
|
|
||||||
--searchable|--ocr) SEARCHABLE=1 ;;
|
--searchable|--ocr) SEARCHABLE=1 ;;
|
||||||
|
|
||||||
|
# --language takes a value: shift past the flag so $1 is the language code
# (same pattern as -o|--output); otherwise LANGUAGE would be "--language".
--language) shift; LANGUAGE="$1" ;;
|
||||||
|
|
||||||
|
--skip-empty-pages) SKIP_EMPTY_PAGES=1 ;;
|
||||||
|
|
||||||
-o|--output) shift; OUTPUT="$1" ;;
|
-o|--output) shift; OUTPUT="$1" ;;
|
||||||
|
|
||||||
-l|--outputlist) shift; USEARRAY=1; OUTPUT=(); OUTPUT+=("$1") ;;
|
-l|--outputlist) shift; USEARRAY=1; OUTPUT=(); OUTPUT+=("$1") ;;
|
||||||
@ -94,6 +102,8 @@ if [ $HELP -eq 1 ]; then
|
|||||||
echo " Custom Page Height in mm"
|
echo " Custom Page Height in mm"
|
||||||
echo " -pw, --page-width"
|
echo " -pw, --page-width"
|
||||||
echo " Custom Page Width in mm"
|
echo " Custom Page Width in mm"
|
||||||
|
echo " -x, --device"
|
||||||
|
# Backticks are escaped so they print literally; unescaped inside double quotes
# they would run `fujitsu` as a command substitution.
echo " Override scanner device name, defaulting to \`fujitsu\`"
|
||||||
echo " --crop"
|
echo " --crop"
|
||||||
echo " Crop to contents (driver must support this)"
|
echo " Crop to contents (driver must support this)"
|
||||||
echo " --deskew"
|
echo " --deskew"
|
||||||
@ -102,6 +112,10 @@ if [ $HELP -eq 1 ]; then
|
|||||||
echo " Run post-processing deskew and black edge detection (requires unpaper)"
|
echo " Run post-processing deskew and black edge detection (requires unpaper)"
|
||||||
echo " --ocr"
|
echo " --ocr"
|
||||||
echo " Run OCR to make the PDF searchable (requires tesseract)"
|
echo " Run OCR to make the PDF searchable (requires tesseract)"
|
||||||
|
echo " --language <lang>"
|
||||||
|
echo " which language to use for OCR"
|
||||||
|
echo " --skip-empty-pages"
|
||||||
|
echo " remove empty pages from resulting PDF document (e.g. one sided doc in duplex mode)"
|
||||||
echo ""
|
echo ""
|
||||||
echo "OUTPUT"
|
echo "OUTPUT"
|
||||||
echo " -o, --output <outputfile>"
|
echo " -o, --output <outputfile>"
|
||||||
@ -175,10 +189,12 @@ fi
|
|||||||
export VERBOSE
|
export VERBOSE
|
||||||
export UNPAPER
|
export UNPAPER
|
||||||
export SEARCHABLE
|
export SEARCHABLE
|
||||||
|
export LANGUAGE
|
||||||
export RESOLUTION
|
export RESOLUTION
|
||||||
export PGWIDTHIN
|
export PGWIDTHIN
|
||||||
export PGHEIGHTIN
|
export PGHEIGHTIN
|
||||||
export PS2PDF_OPTS
|
export PS2PDF_OPTS
|
||||||
|
export SKIP_EMPTY_PAGES
|
||||||
|
|
||||||
if [ $VERBOSE = 1 ]; then
|
if [ $VERBOSE = 1 ]; then
|
||||||
LOCKFILE=$(mktemp)
|
LOCKFILE=$(mktemp)
|
||||||
@ -186,12 +202,13 @@ if [ $VERBOSE = 1 ]; then
|
|||||||
export LOCKFILE
|
export LOCKFILE
|
||||||
fi;
|
fi;
|
||||||
|
|
||||||
|
|
||||||
echo >&2 "Scanning..."
|
echo >&2 "Scanning..."
|
||||||
#eval strace -f -o /tmp/scan-trace.txt scanadf -d $DEVICE $MAXPAGE $PGHEIGHT $PGWIDTH -S $SCRIPT --script-wait --resolution $RESOLUTION --mode $MODE $DESKEW $CROP $SOURCE -o scan-%04d
|
#eval strace -f -o /tmp/scan-trace.txt scanadf -d $DEVICE $MAXPAGE $PGHEIGHT $PGWIDTH -S $SCRIPT --script-wait --resolution $RESOLUTION --mode $MODE $DESKEW $CROP $SOURCE -o scan-%04d
|
||||||
eval scanadf -d "$DEVICE" $MAXPAGE $PGHEIGHT $PGWIDTH -S $SCRIPT --script-wait --resolution $RESOLUTION --mode $MODE $DESKEW $CROP $SOURCE -o scan-%04d
|
eval scanadf -d "$DEVICE" $MAXPAGE $PGHEIGHT $PGWIDTH -S $SCRIPT --script-wait --resolution $RESOLUTION --mode $MODE $DESKEW $CROP $SOURCE -o $TMP_DIR/scan-%04d
|
||||||
|
|
||||||
shopt -s extglob nullglob
|
shopt -s extglob nullglob
|
||||||
pdffiles=(scan-[0-9]*.pdf)
|
pdffiles=($TMP_DIR/scan-[0-9]*.pdf)
|
||||||
numscans=${#pdffiles[@]}
|
numscans=${#pdffiles[@]}
|
||||||
if [ $numscans -gt 0 ]; then
|
if [ $numscans -gt 0 ]; then
|
||||||
echo "Processing $numscans pages"
|
echo "Processing $numscans pages"
|
||||||
@ -213,13 +230,13 @@ if [ $numscans -gt 0 ]; then
|
|||||||
if [ -f "${OUTPUT[$index]}.orig" ]; then
|
if [ -f "${OUTPUT[$index]}.orig" ]; then
|
||||||
pdffiles+=("${OUTPUT[$index]}.orig")
|
pdffiles+=("${OUTPUT[$index]}.orig")
|
||||||
fi
|
fi
|
||||||
pdffiles+=(scan-*(0)$scanno.pdf)
|
pdffiles+=($TMP_DIR/scan-*(0)$scanno.pdf)
|
||||||
pdfunite "${pdffiles[@]}" "${OUTPUT[$index]}" && rm scan-*(0)$scanno.pdf
|
pdfunite "${pdffiles[@]}" "${OUTPUT[$index]}" && rm $TMP_DIR/scan-*(0)$scanno.pdf
|
||||||
else
|
else
|
||||||
mv scan-*(0)$scanno.pdf "${OUTPUT[$index]}"
|
mv $TMP_DIR/scan-*(0)$scanno.pdf "${OUTPUT[$index]}"
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
mv scan-*(0)$scanno.pdf "${OUTPUT[$index]}"
|
mv $TMP_DIR/scan-*(0)$scanno.pdf "${OUTPUT[$index]}"
|
||||||
fi
|
fi
|
||||||
let "index = $index + 1"
|
let "index = $index + 1"
|
||||||
done
|
done
|
||||||
@ -232,13 +249,13 @@ if [ $numscans -gt 0 ]; then
|
|||||||
if [ -f "${OUTPUT}.orig" ]; then
|
if [ -f "${OUTPUT}.orig" ]; then
|
||||||
pdffiles+=("${OUTPUT}.orig")
|
pdffiles+=("${OUTPUT}.orig")
|
||||||
fi
|
fi
|
||||||
pdffiles+=(scan-[0-9]*.pdf)
|
pdffiles+=($TMP_DIR/scan-[0-9]*.pdf)
|
||||||
pdfunite "${pdffiles[@]}" "$OUTPUT" && rm scan-[0-9]*.pdf
|
pdfunite "${pdffiles[@]}" "$OUTPUT" && rm $TMP_DIR/scan-[0-9]*.pdf
|
||||||
else
|
else
|
||||||
if [ $USEARRAY = 1 ]; then
|
if [ $USEARRAY = 1 ]; then
|
||||||
mv scan-0*.pdf "${OUTPUT[0]}"
|
mv $TMP_DIR/scan-0*.pdf "${OUTPUT[0]}"
|
||||||
else
|
else
|
||||||
mv scan-0*.pdf "$OUTPUT"
|
mv $TMP_DIR/scan-0*.pdf "$OUTPUT"
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
echo ""
|
echo ""
|
||||||
|
70
scan_perpage
70
scan_perpage
@ -9,9 +9,11 @@ usage()
|
|||||||
echo "Set the following environment variables:"
|
echo "Set the following environment variables:"
|
||||||
echo " UNPAPER"
|
echo " UNPAPER"
|
||||||
echo " SEARCHABLE"
|
echo " SEARCHABLE"
|
||||||
|
echo " LANGUAGE"
|
||||||
echo " RESOLUTION"
|
echo " RESOLUTION"
|
||||||
echo " PGWIDTHIN"
|
echo " PGWIDTHIN"
|
||||||
echo " PGHEIGHTIN"
|
echo " PGHEIGHTIN"
|
||||||
|
echo " SKIP_EMPTY_PAGES"
|
||||||
echo " PS2PDF_OPTS (optional)"
|
echo " PS2PDF_OPTS (optional)"
|
||||||
echo " VERBOSE (optional)"
|
echo " VERBOSE (optional)"
|
||||||
echo " LOCKFILE (required if VERBOSE=1)"
|
echo " LOCKFILE (required if VERBOSE=1)"
|
||||||
@ -38,53 +40,63 @@ if [ $# -lt 1 ]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$UNPAPER" == "" -o "$SEARCHABLE" == "" -o "$RESOLUTION" == "" -o "$RESOLUTION" == "" ]; then
|
if [ "$UNPAPER" == "" -o "$SEARCHABLE" == "" -o "$RESOLUTION" == "" -o "$RESOLUTION" == "" -o "$SKIP_EMPTY_PAGES" == "" ]; then
|
||||||
usage
|
usage
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
IMAGE_FILE=$1
|
IMAGE_PATH=$1
|
||||||
|
IMAGE_DIR=`dirname $1`
|
||||||
|
IMAGE_FILE=`basename $1`
|
||||||
|
|
||||||
process_page() {
|
process_page() {
|
||||||
log ""
|
log ""
|
||||||
log "-------------------------------------------------------------------------------"
|
log "-------------------------------------------------------------------------------"
|
||||||
log "Post-processing scanned page ${IMAGE_FILE}, deskew=$UNPAPER, searchable=$SEARCHABLE..."
|
log "Post-processing scanned page ${IMAGE_FILE}, deskew=$UNPAPER, searchable=$SEARCHABLE..."
|
||||||
|
|
||||||
|
|
||||||
|
[[ $SKIP_EMPTY_PAGES -eq 1 ]] && PERCENTAGE_WHITE=`convert "$IMAGE_PATH" -fuzz 0% -negate -threshold 0 -negate -format "%[fx:100*mean]" info:` || PERCENTAGE_WHITE=0
|
||||||
|
log "$IMAGE_PATH has $PERCENTAGE_WHITE % white"
|
||||||
|
|
||||||
PP_PREFIX=
|
PP_PREFIX=
|
||||||
if [ $UNPAPER -eq 1 ]; then
|
if (( $(echo "$PERCENTAGE_WHITE < 99.99" |bc -l) )); then
|
||||||
log "Applying unpaper post-processing to image data..."
|
if [ $UNPAPER -eq 1 ]; then
|
||||||
PP_PREFIX="unpaper-"
|
log "Applying unpaper post-processing to image data..."
|
||||||
if [ $VERBOSE = 1 ]; then
|
PP_PREFIX="unpaper-"
|
||||||
UNPAPERVERBOSE="-v"
|
if [ $VERBOSE = 1 ]; then
|
||||||
|
UNPAPERVERBOSE="-v"
|
||||||
|
fi
|
||||||
|
#unpaper $UNPAPERVERBOSE --no-mask-scan --overwrite --dpi $RESOLUTION --no-blackfilter $IMAGE_FILE $PP_PREFIX$IMAGE_FILE | logstdout
|
||||||
|
unpaper $UNPAPERVERBOSE --overwrite --dpi $RESOLUTION $IMAGE_PATH $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE | logstdout
|
||||||
fi
|
fi
|
||||||
#unpaper $UNPAPERVERBOSE --no-mask-scan --overwrite --dpi $RESOLUTION --no-blackfilter $IMAGE_FILE $PP_PREFIX$IMAGE_FILE | logstdout
|
log ""
|
||||||
unpaper $UNPAPERVERBOSE --overwrite --dpi $RESOLUTION $IMAGE_FILE $PP_PREFIX$IMAGE_FILE | logstdout
|
log "-------------------------------------------------------------------------------"
|
||||||
fi
|
|
||||||
log ""
|
|
||||||
log "-------------------------------------------------------------------------------"
|
|
||||||
log "Converting image data to pdf..."
|
|
||||||
if [ $SEARCHABLE -eq 1 ]; then
|
|
||||||
log "Converting image data to searchable pdf..."
|
|
||||||
tesseract $PP_PREFIX$IMAGE_FILE ${IMAGE_FILE%.*} -l eng pdf | logstdout
|
|
||||||
else
|
|
||||||
log "Converting image data to pdf..."
|
log "Converting image data to pdf..."
|
||||||
PNMVERBOSE=
|
if [ $SEARCHABLE -eq 1 ]; then
|
||||||
if [ $VERBOSE = 1 ]; then
|
log "Converting image data to searchable pdf..."
|
||||||
PNMVERBOSE="-verbose"
|
tesseract $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE $IMAGE_DIR/${IMAGE_FILE%.*} -l $LANGUAGE pdf | logstdout
|
||||||
fi
|
|
||||||
if [ "$PGWIDTHIN" == "" -o "$PGHEIGHTIN" == "" ]; then
|
|
||||||
PAGEOPTS="-equalpixels -dpi=$RESOLUTION -noturn"
|
|
||||||
else
|
else
|
||||||
PAGEOPTS="-imagewidth $PGWIDTHIN -imageheight $PGHEIGHTIN"
|
log "Converting image data to pdf..."
|
||||||
|
PNMVERBOSE=
|
||||||
|
if [ $VERBOSE = 1 ]; then
|
||||||
|
PNMVERBOSE="-verbose"
|
||||||
|
fi
|
||||||
|
if [ "$PGWIDTHIN" == "" -o "$PGHEIGHTIN" == "" ]; then
|
||||||
|
PAGEOPTS="-equalpixels -dpi=$RESOLUTION -noturn"
|
||||||
|
else
|
||||||
|
PAGEOPTS="-imagewidth $PGWIDTHIN -imageheight $PGHEIGHTIN"
|
||||||
|
fi
|
||||||
|
log "Using page options: $PAGEOPTS"
|
||||||
|
pnmtops $PNMVERBOSE $PAGEOPTS $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE | ps2pdf $PS2PDF_OPTS - > $IMAGE_DIR/${IMAGE_FILE%.*}.pdf | logstdout
|
||||||
fi
|
fi
|
||||||
log "Using page options: $PAGEOPTS"
|
else
|
||||||
pnmtops $PNMVERBOSE $PAGEOPTS $PP_PREFIX$IMAGE_FILE | ps2pdf $PS2PDF_OPTS - > ${IMAGE_FILE%.*}.pdf | logstdout
|
log "skipping empty page $IMAGE_FILE"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
status=$?
|
status=$?
|
||||||
rm $IMAGE_FILE
|
rm $IMAGE_PATH
|
||||||
if [ -f $PP_PREFIX$IMAGE_FILE ]; then
|
if [ -f $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE ]; then
|
||||||
rm $PP_PREFIX$IMAGE_FILE
|
rm $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE
|
||||||
fi
|
fi
|
||||||
|
|
||||||
log ""
|
log ""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user