mirror of
https://github.com/rocketraman/sane-scan-pdf.git
synced 2025-05-17 16:10:43 -07:00
Specify dpi in image header for tesseract
See https://github.com/tesseract-ocr/tesseract/issues/150
This commit is contained in:
parent
30a7be1fd8
commit
b6ff50a4b0
@ -26,7 +26,7 @@ Tested and run regularly on Fedora, but should work on other distributions with
|
|||||||
* pnmtops (netpbm-progs)
|
* pnmtops (netpbm-progs)
|
||||||
* ps2pdf (ghostscript)
|
* ps2pdf (ghostscript)
|
||||||
* pdfunite
|
* pdfunite
|
||||||
* ImageMagick (if --skip-empty-pages) is used
|
* ImageMagick (if --skip-empty-pages or --ocr is used)
|
||||||
|
|
||||||
### Optional
|
### Optional
|
||||||
|
|
||||||
@ -71,7 +71,7 @@ OPTIONS
|
|||||||
--unpaper
|
--unpaper
|
||||||
Run post-processing deskew and black edge detection (requires unpaper)
|
Run post-processing deskew and black edge detection (requires unpaper)
|
||||||
--ocr
|
--ocr
|
||||||
Run OCR to make the PDF searchable (requires tesseract)
|
Run OCR to make the PDF searchable (requires tesseract and ImageMagick)
|
||||||
--skip-empty-pages
|
--skip-empty-pages
|
||||||
remove empty pages from resulting PDF document (e.g. one sided doc in duplex mode)
|
remove empty pages from resulting PDF document (e.g. one sided doc in duplex mode)
|
||||||
|
|
||||||
|
@ -87,7 +87,9 @@ process_page() {
|
|||||||
log "-------------------------------------------------------------------------------"
|
log "-------------------------------------------------------------------------------"
|
||||||
if [ $SEARCHABLE -eq 1 ]; then
|
if [ $SEARCHABLE -eq 1 ]; then
|
||||||
log "Converting image data to searchable pdf..."
|
log "Converting image data to searchable pdf..."
|
||||||
runconstrained tesseract $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE $IMAGE_DIR/${IMAGE_FILE%.*} -l $LANGUAGE pdf | logstdout
|
# tesseract reads the DPI from the input image's header, so first convert to a format that can store it (such as TIFF)
|
||||||
|
convert -density ${RESOLUTION}x${RESOLUTION} -units PixelsPerInch $IMAGE_DIR/$PP_PREFIX$IMAGE_FILE $IMAGE_DIR/$PP_PREFIX${IMAGE_FILE}.tiff
|
||||||
|
runconstrained tesseract $IMAGE_DIR/$PP_PREFIX${IMAGE_FILE}.tiff $IMAGE_DIR/${IMAGE_FILE%.*} -l $LANGUAGE pdf | logstdout
|
||||||
else
|
else
|
||||||
log "Converting image data to pdf..."
|
log "Converting image data to pdf..."
|
||||||
if [ "$PGWIDTHIN" == "" -o "$PGHEIGHTIN" == "" ]; then
|
if [ "$PGWIDTHIN" == "" -o "$PGHEIGHTIN" == "" ]; then
|
||||||
|
Loading…
x
Reference in New Issue
Block a user