[HOME] [BASH] [Window Managers]

Extract scanned text from PDF

#!/bin/bash
#
## ocrpdf - Extract scanned text from a PDF file using Tesseract,
## 	    Poppler and ImageMagick.
#

lang=spa	# put here the document's language (passed to tesseract -l)

## Check that every required executable is installed.  The builtin
## `command -v` is asked about one tool at a time, so the result does
## not depend on how the external `which` program reports a partial
## match.  The message goes to stderr because it is a diagnostic.
for tool in tesseract pdfimages convert; do
	command -v "$tool" >/dev/null 2>&1 || {
		echo 'This script needs Tesseract, Poppler and ImageMagick.' >&2
		exit 1
	}
done

## Check that the argument is an existing regular file whose name ends
## in ".pdf".  Testing the name itself (instead of grepping the output
## of ls) rejects directories and paths that merely contain "pdf"
## somewhere, e.g. /home/pdfs/notes.txt.
[[ -f $1 && $1 == *.pdf ]] || {
	echo "Usage: ${0##*/} <document.pdf>" >&2
	exit 1
}

## People who use graphical file managers happily pick names like
## "This is my file.pdf" and include brackets, commas and non-ASCII
## symbols.  And in some cases slashes too!  Then they send the file
## as an email attachment and it travels across different machines,
## programs, protocols, encodings, character sets, etc.  Although most
## of today's operating systems and programs can cope with this, it is
## not good practice, so the name of the output file is cleaned up.
#
## clean_name PATH
##   Print the last component of PATH with spaces, commas, parentheses,
##   square brackets and backslashes removed.
clean_name() {
	printf '%s\n' "${1##*/}" | tr -d '][, ()\\'
}

## ocr_pdf FILE
##   Extract the images embedded in FILE, OCR each of them with the
##   language in $lang and write the joined text to <clean name>.txt
##   in the current directory.  FILE itself is only read: it is never
##   copied, renamed or removed.  Returns non-zero when the images
##   cannot be extracted or no text file is produced.
##
##   The body is a subshell, so the EXIT trap that removes the scratch
##   directory fires when the function ends and does not replace any
##   trap of the calling script.
ocr_pdf() (
	src=$1
	## A leading "-" would be taken for an option by the tools below
	case $src in
	-*) src=./$src ;;
	esac

	name=$(clean_name "$src")
	name=${name%.pdf}
	## A name made only of removed symbols would give a bare ".txt"
	[ -n "$name" ] || name=ocrpdf

	## Private, unpredictable scratch directory; removed however this
	## subshell ends
	temporal=$(mktemp -d "${TMPDIR:-/tmp}/ocrpdf.XXXXXX") || exit 1
	trap 'rm -rf -- "$temporal"' EXIT

	## Extract the embedded images (pdfimages writes PPM/PBM files
	## unless told otherwise)
	pdfimages "$src" "$temporal/page" || {
		echo "pdfimages could not extract images from $src" >&2
		exit 1
	}

	for img in "$temporal"/*; do
		[ -e "$img" ] || continue	# the PDF held no images
		file=${img##*/}
		stem=$temporal/${file%.*}	# drop .ppm, .pbm, ...

		## Convert the image to TIF.  If you use TIFF (two efs)
		## tesseract returns an error
		convert "$img" \
			-density 100x100 \
			-resize 200% \
			-fill white \
			-tint 50 \
			-level 20%,80%,1.0 \
			-sigmoidal-contrast 30,50% \
			-sharpen 0x2 \
			-compress none \
			-monochrome \
			"$stem.tif" || {
			echo "convert failed on $file, skipping it" >&2
			continue
		}

		## Convert the tif image to plain text
		tesseract "$stem.tif" "$stem" -l "$lang" ||
			echo "tesseract failed on $file, skipping it" >&2
	done

	## Join txt files (the glob stays literal when nothing matched)
	texts=("$temporal"/*.txt)
	[ -e "${texts[0]}" ] || {
		echo "No text could be extracted from $src" >&2
		exit 1
	}
	cat -- "${texts[@]}" > "$name.txt"
)

## Run the pipeline; without an argument there is nothing to process
if [ -n "$1" ]; then
	ocr_pdf "$1" || exit 1
fi

## Reaching this point ends the script with exit status 0
exit 0

## End ocrpdf
<= Prev Next =>


[HOME] [BASH] [Window Managers]