Specify DPI in image header for tesseract

See https://github.com/tesseract-ocr/tesseract/issues/150
2025-05-16 23:50:39 -07:00 · 2020-07-29 18:01:20 -04:00 · 2020-07-29 18:01:20 -04:00 · b6ff50a4b0
commit b6ff50a4b0
parent 30a7be1fd8
2 changed files with 5 additions and 3 deletions
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ Tested and run regularly on Fedora, but should work on other distributions with
 * pnmtops (netpbm-progs)
 * ps2pdf (ghostscript)
 * pdfunite
-* ImageMagick (if --skip-empty-pages) is used
+* ImageMagick (if --skip-empty-pages or --ocr is used)

 ### Optional

@@ -71,7 +71,7 @@ OPTIONS
 --unpaper
   Run post-processing deskew and black edge detection (requires unpaper)
 --ocr
-   Run OCR to make the PDF searchable (requires tesseract)
+   Run OCR to make the PDF searchable (requires tesseract and ImageMagick)
 --skip-empty-pages
   remove empty pages from resulting PDF document (e.g. one sided doc in duplex mode)

--- a/4
+++ b/4
@@ -87,7 +87,9 @@ process_page() {
    log "-------------------------------------------------------------------------------"
    if [ $SEARCHABLE -eq 1 ]; then
      log "Converting image data to searchable pdf..."
-      runconstrained tesseract $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE $IMAGE_DIR/${IMAGE_FILE%.*} -l $LANGUAGE pdf | logstdout
+      # tesseract reads the DPI from the input image's header, so convert to a format that can store it (such as TIFF)
+      convert -density ${RESOLUTION}x${RESOLUTION} -units PixelsPerInch $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE $IMAGE_DIR/$PP_PREFIX${IMAGE_FILE}.tiff
+      runconstrained tesseract $IMAGE_DIR/$PP_PREFIX${IMAGE_FILE}.tiff $IMAGE_DIR/${IMAGE_FILE%.*} -l $LANGUAGE pdf | logstdout
    else
      log "Converting image data to pdf..."
      if [ "$PGWIDTHIN" == "" -o "$PGHEIGHTIN" == "" ]; then