From b6ff50a4b0ce2fe546e727f6ea69dfb70122b77d Mon Sep 17 00:00:00 2001 From: Raman Gupta Date: Wed, 29 Jul 2020 18:01:20 -0400 Subject: [PATCH] Specify dpi in image header for tesseract See https://github.com/tesseract-ocr/tesseract/issues/150 --- README.md | 4 ++-- scan_perpage | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 474ec19..e805437 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Tested and run regularly on Fedora, but should work on other distributions with * pnmtops (netpbm-progs) * ps2pdf (ghostscript) * pdfunite -* ImageMagick (if --skip-empty-pages) is used +* ImageMagick (if --skip-empty-pages or --ocr is used) ### Optional @@ -71,7 +71,7 @@ OPTIONS --unpaper Run post-processing deskew and black edge detection (requires unpaper) --ocr - Run OCR to make the PDF searchable (requires tesseract) + Run OCR to make the PDF searchable (requires tesseract and ImageMagick) --skip-empty-pages remove empty pages from resulting PDF document (e.g. one sided doc in duplex mode) diff --git a/scan_perpage b/scan_perpage index c0ac2bc..0356414 100755 --- a/scan_perpage +++ b/scan_perpage @@ -87,7 +87,9 @@ process_page() { log "-------------------------------------------------------------------------------" if [ $SEARCHABLE -eq 1 ]; then log "Converting image data to searchable pdf..." - runconstrained tesseract $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE $IMAGE_DIR/${IMAGE_FILE%.*} -l $LANGUAGE pdf | logstdout + # tesseract uses the input's DPI header, so we need to convert to a format that supports this (like tiff) + convert -density ${RESOLUTION}x${RESOLUTION} -units PixelsPerInch $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE $IMAGE_DIR/$PP_PREFIX${IMAGE_FILE}.tiff + runconstrained tesseract $IMAGE_DIR/$PP_PREFIX${IMAGE_FILE}.tiff $IMAGE_DIR/${IMAGE_FILE%.*} -l $LANGUAGE pdf | logstdout else log "Converting image data to pdf..." if [ "$PGWIDTHIN" == "" -o "$PGHEIGHTIN" == "" ]; then