[HOME]
[BASH]
[Window Managers]
Extract scanned text from PDF
#!/bin/bash
#
## ocrpdf - Extract scanned text from a PDF file using Tesseract,
## Poppler and ImageMagick.
#
## Language of the document, handed to tesseract with -l. Defaults to
## "spa"; set OCRPDF_LANG to override it without editing the script,
## e.g.  OCRPDF_LANG=eng ocrpdf document.pdf
lang=${OCRPDF_LANG:-spa}
## Check that all necessary executables are installed. `command -v` is
## built into the shell, so the check does not depend on `which` being
## available, and the first missing tool is named in the error.
for tool in tesseract pdfimages convert; do
  command -v "$tool" >/dev/null 2>&1 || {
    echo 'This script needs Tesseract, Poppler and ImageMagick.' >&2
    echo "Missing executable: $tool" >&2
    exit 1
  }
done
## Check that the argument is an existing regular file named *.pdf.
## A pattern match on the argument replaces parsing the output of `ls`,
## which also let through directories and any path that merely contained
## the letters "pdf" (e.g. "pdfs/notes.txt").
if [[ ! -f "$1" || "$1" != *.pdf ]]; then
  echo "Usage: ${0##*/} <document.pdf>" >&2
  exit 1
fi
## People who use graphical file managers happily pick names like
## "This is my file.pdf" and include brackets, commas and non-ASCII
## symbols -- in some cases even slashes! Then they send the file
## as an email attachment and it travels across different machines,
## programs, protocols, encodings, character sets, etc. Although
## most of today's operating systems and programs can cope with
## this, it is not good practice.
#
## Clean some weird symbols from the file's name: commas, spaces,
## parentheses, square brackets and backslashes are dropped. Only the
## last path component is cleaned, so a document given with a directory
## prefix still works. Inside the bracket expression "]" has to come
## first to be literal; a set written as [, ()\[\]] closes at the first
## "]" and leaves closing brackets in the name.
clean_base=${1##*/}
clean_base=${clean_base//[][, ()\\]/}
## Work on a private copy inside a fresh directory made by mktemp rather
## than a predictable /tmp/<name>. A copy made next to the document would
## be the document itself whenever the name needs no cleaning, and the
## cleanup at the end of the script removes $filename.
## The directory is created directly under /tmp so that its path holds
## no whitespace, which keeps unquoted uses of $filename and $temporal
## safe.
scratch=$(mktemp -d /tmp/ocrpdf.XXXXXX) || {
  echo 'Could not create a temporary directory.' >&2
  exit 1
}
## Remove whatever is left of the scratch directory on any exit path
trap 'rm -rf -- "$scratch"' EXIT
filename=$scratch/$clean_base
name=$(basename "$filename" .pdf)
temporal=$scratch/pages
if ! mkdir -p "$temporal" || ! cp -- "$1" "$filename"; then
  echo "Could not copy $1 to $scratch" >&2
  exit 1
fi
## Extract the images embedded in the working copy and OCR each of them.
## Reads the globals $filename, $temporal, $name and $lang; every
## intermediate file is written inside $temporal.
## Returns 1 without running any tool when $temporal is not a directory.
ocr_pages() {
  local page stem
  ## An empty or missing $temporal would turn the glob below into /*
  [[ -d "$temporal" ]] || return 1
  ## Convert pdf to ppm images
  pdfimages "$filename" "$temporal/$name"
  for page in "$temporal"/*; do
    ## Skip the literal pattern left behind when nothing was extracted
    [[ -f "$page" ]] || continue
    ## Common path prefix of the .tif and .txt derived from this image
    stem=$temporal/$(basename "$page" .ppm)
    ## Convert PPM images to TIF. If you use TIFF (two efs) tesseract
    ## returns an error
    convert "$page" \
      -density 100x100 \
      -resize 200% \
      -fill white \
      -tint 50 \
      -level 20%,80%,1.0 \
      -sigmoidal-contrast 30,50% \
      -sharpen 0x2 \
      -compress none \
      -monochrome \
      "$stem.tif"
    ## Convert tif images to plain text (read back later as
    ## $temporal/*.txt)
    tesseract "$stem.tif" "$stem" -l "$lang"
  done
}
ocr_pages
## Join the per-page txt files into "<name>.txt" in the current
## directory, then remove the temporary directory and the working copy.
## Reads the globals $temporal, $name and $filename.
## Arguments: $1 - the document path the script was started with.
finish_ocr() {
  local original=$1
  ## Join txt files (-d is false for an empty $temporal, unlike a bare
  ## `[ -e ]`, which is true when the variable expands to nothing)
  if [[ -d "$temporal" ]]; then
    cat "$temporal"/*.txt > "$name.txt"
    ## Clean tmp
    rm -rf -- "$temporal"
  fi
  ## When the name needed no cleaning, $filename is the user's own
  ## document rather than a copy of it; -ef compares the files
  ## themselves, so the original is never deleted.
  if [[ -e "$filename" && ! "$filename" -ef "$original" ]]; then
    rm -f -- "$filename"
  fi
}
finish_ocr "$1"
exit 0
## End ocrpdf
[HOME]
[BASH]
[Window Managers]