[HOME]
[BASH]
[Window Managers]
Extract scanned text from PDF
#!/bin/bash
# - ocrpdf - Extract scanned text from a PDF file using Tesseract,
# Poppler and ImageMagick.
#
# Usage:  ocrpdf <document.pdf>
# Output: <document>.txt in the current directory, where <document> is
#         the PDF's base name with commas, spaces, parentheses and
#         square brackets removed.
# Env:    OCRPDF_LANG - Tesseract language code (default: spa).
# Exit:   0 on success; 1 on usage error, missing tool, or when no
#         text could be extracted.
#
# The input PDF is only ever read: it is never copied, renamed or
# removed.

set -u

# Document language. Override per run: OCRPDF_LANG=eng ocrpdf file.pdf
lang=${OCRPDF_LANG:-spa}

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Check if all necessary executables are installed
for tool in tesseract pdfimages convert; do
  command -v "$tool" >/dev/null 2>&1 ||
    die 'This script needs Tesseract, Poppler and ImageMagick.'
done

# Check that the argument is an existing regular file named *.pdf
# (extension matched case-insensitively).
pdf=${1-}
if [[ -z "$pdf" || ! -f "$pdf" || "$pdf" != *.[pP][dD][fF] ]]; then
  die "Usage: ${0##*/} <document.pdf>"
fi

# A path starting with "-" would be parsed as an option by pdfimages;
# anchor it to the current directory instead.
if [[ "$pdf" == -* ]]; then
  pdf=./$pdf
fi

# People who use graphical file managers happily pick names like
# "This is my file.pdf" with brackets, commas and non-ASCII symbols.
# Such files then travel as email attachments across machines,
# protocols, encodings and character sets. Most systems cope with
# that today, but it is still not good practice, so the name of the
# generated text file is cleaned of the worst offenders. Every path
# used below is quoted, so the input itself needs no renaming.
base=${pdf##*/}
base=${base%.*}
# Glob bracket expression: "]" must come first to be taken literally.
unwanted='[][, ()]'
name=${base//$unwanted/}
# A name made only of stripped characters would leave nothing behind.
[[ -n "$name" ]] || name=document

# Private, unpredictable work directory, removed on every exit path.
workdir=$(mktemp -d "${TMPDIR:-/tmp}/ocrpdf.XXXXXX") ||
  die 'Cannot create a temporary directory.'
trap 'rm -rf -- "$workdir"' EXIT

# Convert pdf to ppm images (written as $workdir/page-NNN.<ext>)
pdfimages "$pdf" "$workdir/page" ||
  die "pdfimages failed on '$pdf'."

# Without nullglob an unmatched pattern would stay as a literal string.
shopt -s nullglob
images=("$workdir"/page-*)
(( ${#images[@]} > 0 )) || die "No images found in '$pdf'."

# Convert PPM images to TIF. If you use TIFF (two efs) tesseract
# returns an error
for image in "${images[@]}"; do
  file=${image##*/}
  # Strip the extension from the file name only; the directory part
  # contains a dot of its own.
  stem=$workdir/${file%.*}

  if ! convert "$image" \
    -density 100x100 \
    -resize 200% \
    -fill white \
    -tint 50 \
    -level 20%,80%,1.0 \
    -sigmoidal-contrast 30,50% \
    -sharpen 0x2 \
    -compress none \
    -monochrome \
    "$stem.tif"; then
    # Best effort: one bad page should not lose the whole document.
    printf 'Warning: could not convert %s, skipping.\n' "$file" >&2
    continue
  fi

  # Convert tif images to plain text (tesseract appends .txt itself)
  tesseract "$stem.tif" "$stem" -l "$lang" ||
    printf 'Warning: OCR failed for %s.\n' "$file" >&2
done

# Join txt files in page order (glob expansion is sorted)
texts=("$workdir"/*.txt)
(( ${#texts[@]} > 0 )) || die "No text could be extracted from '$pdf'."
cat -- "${texts[@]}" > "$name.txt" || die "Cannot write '$name.txt'."

# The temporary directory is cleaned by the EXIT trap.
exit 0
# End ocrpdf
[HOME]
[BASH]
[Window Managers]