Specify DPI in image header for tesseract

See https://github.com/tesseract-ocr/tesseract/issues/150
This commit is contained in:
Raman Gupta 2020-07-29 18:01:20 -04:00
parent 30a7be1fd8
commit b6ff50a4b0
2 changed files with 5 additions and 3 deletions

View File

@@ -26,7 +26,7 @@ Tested and run regularly on Fedora, but should work on other distributions with
* pnmtops (netpbm-progs) * pnmtops (netpbm-progs)
* ps2pdf (ghostscript) * ps2pdf (ghostscript)
* pdfunite * pdfunite
* ImageMagick (if --skip-empty-pages) is used * ImageMagick (if --skip-empty-pages or --ocr is used)
### Optional ### Optional
@@ -71,7 +71,7 @@ OPTIONS
--unpaper --unpaper
Run post-processing deskew and black edge detection (requires unpaper) Run post-processing deskew and black edge detection (requires unpaper)
--ocr --ocr
Run OCR to make the PDF searchable (requires tesseract) Run OCR to make the PDF searchable (requires tesseract and ImageMagick)
--skip-empty-pages --skip-empty-pages
remove empty pages from resulting PDF document (e.g. one sided doc in duplex mode) remove empty pages from resulting PDF document (e.g. one sided doc in duplex mode)

View File

@@ -87,7 +87,9 @@ process_page() {
log "-------------------------------------------------------------------------------" log "-------------------------------------------------------------------------------"
if [ $SEARCHABLE -eq 1 ]; then if [ $SEARCHABLE -eq 1 ]; then
log "Converting image data to searchable pdf..." log "Converting image data to searchable pdf..."
runconstrained tesseract $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE $IMAGE_DIR/${IMAGE_FILE%.*} -l $LANGUAGE pdf | logstdout # tesseract uses the input's DPI header, we need to convert to a format that supports this (like tiff)
convert -density ${RESOLUTION}x${RESOLUTION} -units PixelsPerInch $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE $IMAGE_DIR/$PP_PREFIX${IMAGE_FILE}.tiff
runconstrained tesseract $IMAGE_DIR/$PP_PREFIX${IMAGE_FILE}.tiff $IMAGE_DIR/${IMAGE_FILE%.*} -l $LANGUAGE pdf | logstdout
else else
log "Converting image data to pdf..." log "Converting image data to pdf..."
if [ "$PGWIDTHIN" == "" -o "$PGHEIGHTIN" == "" ]; then if [ "$PGWIDTHIN" == "" -o "$PGHEIGHTIN" == "" ]; then