#! /bin/bash
#
## lx2xl -- Walter Alejandro Iglesias (last correction: Jun 2010)
#
## This is a very basic LaTeX to XHTML converter useful for novelists.
## I've only included support for the bare minimum. For scientific
## articles, docbook or latex2html are better choices.
#
## WHAT IT DOESN'T DO: LaTeX is so complex that there are thousands of
## commands this script cannot translate. In general, it can't deal
## with non-enclosed declarations (i.e. \em or \large). Try using
## \emph{} or \begin{Large} \end{Large} in your tex file instead.
##
## Important things NOT supported (I'd like to add support for them in
## the future, so that this script becomes useful for scientific
## articles too):
## - numbering sections
## - tables
## - math formulae (it just translates symbols)
## - table of contents
## - columns
#
## Interesting things IT DOES:
## - Uses ImageMagick to convert Encapsulated PostScript images
## to png and adds them to the document, with the alt text and a
## caption line taken from the LaTeX \caption.
## - Includes files from a multifile tex environment.
## - Supports UTF-8
#
## Validate the command line: $1 must name an existing file that
## contains the text begin{document} (the master .tex file).
## Otherwise print the usage message on stderr and stop with status 1.
if [ -z "$1" ] || [ ! -e "$1" ] ||
   ! grep -q 'begin{document}' -- "$1" ; then
  echo "Usage: $(basename -- "$0") </path/to/master-file.tex>" >&2
  exit 1
fi
## Change to work dir: the directory that holds the master file.  The
## statements further down refer to the master file by its base name.
cd -- "$(dirname -- "$1")" || exit 1
## Creating environment: html/ receives the page, html/img the images
mkdir -p html/img || exit 1
output="html/$(basename -- "$1" .tex).html"
img_output=html/img
## Copying images that don't need to be converted, converting the rest.
##
## lx2xl_collect_images -- look up every \includegraphics[...]{FILE}
## in the .tex files of the current directory and bring FILE into
## $img_output:
##   - names containing png or jpg are copied unchanged;
##   - anything else is converted to <name>.png with ImageMagick's
##     convert (a trailing .eps is dropped from the name first).
## Files that already exist in $img_output are left alone.
## Names are read line by line, so paths containing spaces work; lines
## that do not match the ]{FILE} form are ignored (sed -n ... p).
lx2xl_collect_images() {
  local img base
  grep -h 'includegraphics' -- *.tex |
    sed -n 's/.*\]{\([^}]*\)}.*/\1/p' |
    while IFS= read -r img ; do
      case $img in
        *png*|*jpg*)
          base=$(basename -- "$img")
          [ -e "$img_output/$base" ] || cp -- "$img" "$img_output"
          ;;
        *)
          base=$(basename -- "$img" .eps)
          ## Converting eps images to png (needs Imagemagick installed)
          [ -e "$img_output/$base.png" ] ||
            convert -density 100 "$img" \
              -flatten "$img_output/$base.png"
          ;;
      esac
    done
}
lx2xl_collect_images
## Saving document title, author and date to variables.
## Each value is the text between the braces of \title{}, \author{}
## and \date{}; lines that contain a % anywhere are skipped (/%/!).
document_title=$(sed -n '/%/!s/\\title{\([^{}]*\)}/\1/p' "$(basename -- "$1")")
author=$(sed -n '/%/!s/\\author{\([^{}]*\)}/\1/p' "$(basename -- "$1")")
date=$(sed -n '/%/!s/\\date{\(.*\)}/\1/p' "$(basename -- "$1")")
## \date{\today} stands for the current date
if [ "$date" = '\today' ] ; then
  date=$(date '+%B %d, %Y')
fi
## Delete tex comments (lines holding an escaped \% are left untouched)
## and store the result in the working copy "tempfile"
sed '/\\%/!s/%.*//g' "$(basename -- "$1")" > tempfile
## Delete spaces at the beginning and the end of lines, except between
## \begin{verbatim} and \end{verbatim}
sed -i '/\\begin{verbatim}/,/\\end{verbatim}/!s/^[ ]*//' tempfile
sed -i '/\\begin{verbatim}/,/\\end{verbatim}/!s/ *$//' tempfile
## Include files called with include latex command: NAME.tex is read
## in right after every line holding \include{NAME}.  The list of names
## is collected first because tempfile is rewritten inside the loop.
included_names=$(sed -n 's/[ ]*\\include{\(.*\)}/\1/p' tempfile)
while IFS= read -r included ; do
  [ -n "$included" ] || continue
  ## \\\\ reaches sed as \\, i.e. a literal backslash before "include"
  sed -i "/\\\\include{$included}/r $included.tex" tempfile
done <<< "$included_names"
## Replace include entries with spaces
sed -i 's/\\include{[^{}]*}/<br \/><br \/><br \/><br \/>/' tempfile
## Once again cleaning comments and spaces (the included files have
## not been cleaned yet)
sed -i '/\\%/!s/%.*//g' tempfile
sed -i '/\\begin{verbatim}/,/\\end{verbatim}/!s/^[ ]*//' tempfile
sed -i '/\\begin{verbatim}/,/\\end{verbatim}/!s/ *$//' tempfile
## Delete latex headings: everything up to and including the
## \begin{document} line
sed -i '1,/\\begin{document}/d' tempfile
## Math symbols (This script doesn't support equation environment,
## just replace some symbols)
#
## Greek Letters
##
## lx2xl_greek_letters FILE -- on every line of FILE that contains a
## $ sign, replace Greek-letter commands with the corresponding UTF-8
## character.  FILE is edited in place.
## A command whose name begins with another, shorter command is listed
## first (\thetasym before \theta, \piv before \pi, \sigmaf before
## \sigma); otherwise the shorter rule would consume its prefix.
lx2xl_greek_letters() {
  sed -i '/\$/s/\\Gamma/Γ/g
/\$/s/\\Delta/Δ/g
/\$/s/\\Theta/Θ/g
/\$/s/\\Lambda/Λ/g
/\$/s/\\Xi/Ξ/g
/\$/s/\\Pi/Π/g
/\$/s/\\Sigma/Σ/g
/\$/s/\\Phi/Φ/g
/\$/s/\\Psi/Ψ/g
/\$/s/\\Omega/Ω/g
/\$/s/\\alpha/α/g
/\$/s/\\beta/β/g
/\$/s/\\gamma/γ/g
/\$/s/\\delta/δ/g
/\$/s/\\epsilon/ε/g
/\$/s/\\zeta/ζ/g
/\$/s/\\eta/η/g
/\$/s/\\thetasym/ϑ/g
/\$/s/\\theta/θ/g
/\$/s/\\iota/ι/g
/\$/s/\\kappa/κ/g
/\$/s/\\lambda/λ/g
/\$/s/\\mu/μ/g
/\$/s/\\nu/ν/g
/\$/s/\\xi/ξ/g
/\$/s/\\piv/ϖ/g
/\$/s/\\pi/π/g
/\$/s/\\rho/ρ/g
/\$/s/\\sigmaf/ς/g
/\$/s/\\sigma/σ/g
/\$/s/\\tau/τ/g
/\$/s/\\upsilon/υ/g
/\$/s/\\phi/φ/g
/\$/s/\\chi/χ/g
/\$/s/\\psi/ψ/g
/\$/s/\\omega/ω/g
/\$/s/\\upsih/ϒ/g' "$1"
}
lx2xl_greek_letters tempfile
## Some punctuation symbols and the arrows: on lines that contain a
## $ sign each command is swapped for its character, all in one pass
## over tempfile.
sed -i \
  -e '/\$/s/\\bullet/\•/g' \
  -e '/\$/s/\\wp/\℘/g' \
  -e '/\$/s/\\Re/\ℜ/g' \
  -e '/\$/s/\\aleph/\ℵ/g' \
  -e '/\$/s/\\leftarrow/\←/g' \
  -e '/\$/s/\\rightarrow/\→/g' \
  -e '/\$/s/\\downarrow/\↓/g' \
  -e '/\$/s/\\uparrow/\↑/g' \
  -e '/\$/s/\\leftrightarrow/\↔/g' \
  -e '/\$/s/\\hookleftarrow/\↵/g' \
  -e '/\$/s/\\Leftarrow/\⇐/g' \
  -e '/\$/s/\\Uparrow/\⇑/g' \
  -e '/\$/s/\\Rightarrow/\⇒/g' \
  -e '/\$/s/\\Downarrow/\⇓/g' \
  -e '/\$/s/\\Leftrightarrow/\⇔/g' \
  tempfile
## Mathematical Operators and Relations
##
## lx2xl_math_operators FILE -- on every line of FILE that contains a
## $ sign, replace operator and relation commands with the
## corresponding UTF-8 character.  FILE is edited in place.
## Commands that begin with a shorter command come first (\infty and
## \int before \in, \subseteq before \subset, \supseteq before
## \supset); otherwise the shorter rule would consume their prefix.
lx2xl_math_operators() {
  sed -i '/\$/s/\\forall/∀/g
/\$/s/\\partial/∂/g
/\$/s/\\exists/∃/g
/\$/s/\\emptyset/∅/g
/\$/s/\\nabla/∇/g
/\$/s/\\infty/∞/g
/\$/s/\\int/∫/g
/\$/s/\\in/∈/g
/\$/s/\\ni/∋/g
/\$/s/\\sqcap/∏/g
/\$/s/\\prod/∏/g
/\$/s/\\sum/∑/g
/\$/s/\\ast/∗/g
/\$/s/\\surd/√/g
/\$/s/\\propto/∝/g
/\$/s/\\angle/∠/g
/\$/s/\\wedge/∧/g
/\$/s/\\vee/∨/g
/\$/s/\\cap/∩/g
/\$/s/\\cup/∪/g
/\$/s/\\sim/∼/g
/\$/s/\\cong/≅/g
/\$/s/\\approx/≈/g
/\$/s/\\neq/≠/g
/\$/s/\\equiv/≡/g
/\$/s/\\leq/≤/g
/\$/s/\\geq/≥/g
/\$/s/\\subseteq/⊆/g
/\$/s/\\supseteq/⊇/g
/\$/s/\\subset/⊂/g
/\$/s/\\supset/⊃/g
/\$/s/\\oplus/⊕/g
/\$/s/\\otimes/⊗/g
/\$/s/\\perp/⊥/g
/\$/s/\\cdot/⋅/g' "$1"
}
lx2xl_math_operators tempfile
## Corners and angle brackets are translated on lines that contain a
## $ sign; the last expression then strips the $ math marks themselves
## from every line.
sed -i \
  -e '/\$/s/\\lceil/\⌈/g' \
  -e '/\$/s/\\rceil/\⌉/g' \
  -e '/\$/s/\\lfloor/\⌊/g' \
  -e '/\$/s/\\rfloor/\⌋/g' \
  -e '/\$/s/\\langle/\⟨/g' \
  -e '/\$/s/\\rangle/\⟩/g' \
  -e 's/\$//g' \
  tempfile
## Mark verbatim newlines with the token nEwLiNe so they can be
## restored after the lines have been joined
sed -i '/\\begin{verbatim}/,/\\end{verbatim}/s/$/nEwLiNe/' tempfile
## Workaround to delete newlines only in paragraphs: empty lines are
## tagged bLaNkLiNe, then every newline is turned into a space while
## the text is written to the output page
sed -i 's/^$/bLaNkLiNe/' tempfile
tr '\n' ' ' < tempfile > "$output"
rm -f tempfile
## Recovering blanklines between paragraphs: a run of bLaNkLiNe tokens
## (with the spaces around them) collapses to a single token, which
## then becomes a newline
sed -i 's/\([ ]*\(bLaNkLiNe\)[ ]*\)*/\2/g' "$output"
sed -i 's/bLaNkLiNe/\
/g' "$output"
## From here on paragraphs are whole lines. Keep that in mind when
## addressing sed commands.
## ------------------------------------------------------------------
## Once again with spaces
sed -i '/\\begin{verbatim}/!s/^[ ]*//' "$output"
sed -i '/\\begin{verbatim}/!s/ *$//' "$output"
## TO DO (and not translatable): these commands are simply deleted
## from the page.  \small is only removed when a space follows it
## (presumably so that \smallskip stays available for the
## vertical-space rules below).
sed -i 's/\\end{document}//
s/\\begin{table}//g
s/\\end{table}//g
s/\\newpage//g
s/\\maketitle//
s/\\thispagestyle{[^}]*}//g
s/\\noindent[ ]*//g
s/\\nopagebreak[ ]*//g
s/\\doublespacing//g
s/\\singlespacing//g
s/\\setcounter{[^}]*}{[^}]*}[ ]*//g
s/\\centering//g
s/\\tiny//g
s/\\scriptsize//g
s/\\footnotesize//g
s/\\small //g
s/\\normalsize//g
s/\\.arge//g
s/\\LARGE//g
s/\\Huge//g
s/\\huge//g' "$output"
## Vertical spaces: \bigskip, \medskip and \smallskip become one line
## break, \vspace{...} (with or without *) becomes two
sed -i 's/\\\(big\|med\|small\)skip/<br \/>/g
s/[ ]*\\vspace[*]*{[^{}]*}[ ]*/<br \/><br \/>/g' "$output"
## Font size (simple tag not supported)
##
## lx2xl_font_sizes FILE -- turn the size environments
## \begin{tiny} ... \begin{Huge} into <div> elements carrying a
## relative font size, and the matching \end{...} into </div>.
## FILE is edited in place.  \begin{huge} is handled as well, because
## \end{huge} is turned into </div> by the last rule and would
## otherwise close a div that was never opened.
lx2xl_font_sizes() {
  sed -i 's/\\begin{tiny}/<div style="font-size: 0.8em">/g
s/\\begin{scriptsize}/<div style="font-size: 0.85em">/g
s/\\begin{footnotesize}/<div style="font-size: 0.9em">/g
s/\\begin{small}/<div style="font-size: 0.95em">/g
s/\\begin{normalsize}/<div style="font-size: 1em">/g
s/\\begin{large}/<div style="font-size: 1.05em">/g
s/\\begin{Large}/<div style="font-size: 1.1em">/g
s/\\begin{LARGE}/<div style="font-size: 1.15em">/g
s/\\begin{huge}/<div style="font-size: 1.175em">/g
s/\\begin{Huge}/<div style="font-size: 1.2em">/g
s/\\end{\(\|tiny\|scriptsize\|footnotesize\|small\|normalsize\|.arge\|LARGE\|huge\|Huge\)}/<\/div>/g' "$1"
}
lx2xl_font_sizes "$output"
## Punctuation: guillemets, TeX quotes (`` and ''), \ldots, the \\
## line break, --- and -- dashes, and the escaped \% \_ and ".\ "
## sequences.  The fourth rule leaves the single-quoted shell string
## twice ('\'\'') to hand sed two literal apostrophes.
sed -i 's/<</«/g
s/>>/»/g
s/``/"/g
s/'\'\''/\"/g
s/[\]*ldots[\]*/\.\.\./g
s/\\\\/<br \/>/g
s/---/\—/g
s/--/\–/g
s/\\%/%/g
s/\\_/_/g
s/\.\\ /\.\ /g' "$output"
## Replacing environments (some repeated for get rid of nested)
##
## lx2xl_inline_markup FILE -- convert \emph{}, \textit{}, \textbf{},
## \texttt{} and \footnote{} whose argument contains no braces.  FILE
## is edited in place.  The rule set is applied twice: the first pass
## converts the innermost commands, which removes the braces that kept
## an enclosing command from matching, so the second pass can convert
## that one (one level of nesting).
lx2xl_inline_markup() {
  local pass
  for pass in 1 2 ; do
    sed -i 's/\\emph{\([^{}]*\)}/<em>\1<\/em>/g
s/\\textit{\([^{}]*\)}/<i>\1<\/i>/g
s/\\textbf{\([^{}]*\)}/<b>\1<\/b>/g
s/\\texttt{\([^{}]*\)}/<tt>\1<\/tt>/g
s/\\footnote{\([^{}]*\)}/\[*<tt style="font-size: 12px; color: gray">\1\<\/tt>]/g' "$1"
  done
}
lx2xl_inline_markup "$output"
## Alignment environments and sectioning commands.
## The first two rules drop flushright/center wrappers on lines that
## hold a sectioning command followed by * or \ (the bracket [\*]
## matches either character).
## NOTE(review): the first address has no leading backslash and ends in
## "{", the second has the backslash but no "{" -- confirm whether both
## were meant to be the same test, and whether unstarred headings
## should be covered too.
## After that flushright/center become <div class="right"> and
## <div class="center">, and \subsubsection, \subsection, \section and
## \chapter become <h4>, <h3>, <h2> and <h1>.
sed -i '/\([sub]*section\|chapter\)[\*]{/s/\\begin{\(flushright\|center[ing]*\)}//g
/\\\([sub]*section\|chapter\)[\*]/s/\\end{\(flushright\|center[ing]*\)}//g
s/\\begin{flushright}[ ]*/<div class="right">/g
s/[ ]*\\end{flushright}/<\/div>/g
s/\\begin{center[ing]*}[ ]*/<div class="center">/g
s/[ ]*\\end{center[ing]*}/<\/div>/g
s/\\subsubsection[^{}]*{[ ]*\([^{}]*\)[ ]*}/<h4>\1<\/h4>/g
s/\\subsection[^{}]*{[ ]*\([^{}]*\)[ ]*}/<h3>\1<\/h3>/g
s/\\section[^{}]*{[ ]*\([^{}]*\)[ ]*}/<h2>\1<\/h2>/g
s/\\chapter[^{}]*{[ ]*\([^{}]*\)[ ]*}/<br \/><br \/><br \/><h1>\1\ <\/h1><br \/><br \/>/g' "$output"
## lx2xl_figures FILE -- replace figure environments that contain
## \includegraphics[...]{dir/name.ext} (ext = eps, png or jpg) with a
## centered <img> pointing at img/name.ext.  FILE is edited in place.
## The image keeps its own extension, so jpg files stay .jpg; only
## .eps is rewritten to .png by the last rule, to match the file the
## eps-to-png conversion produces.
lx2xl_figures() {
  ## Image with caption (the caption text must be wrapped in one tag,
  ## e.g. <em>...</em>; it is used as alt text and as caption line)
  sed -i 's/\\begin{figure}[^{]*\\includegraphics\[[^{]*\]{[^}]*\/\([^}]*\)\.\(eps\|png\|jpg\)}[^{}]*\\caption{<[^>]*>\([^}]*\)<[^>]*>}.*\\end{figure}/<br \/><div class="center"><img style="max-width: 90%" src="img\/\1.\2" alt="\3" \/><br \/><br \/><em>\3<\/em><\/div>/g' "$1"
  ## Image without caption
  sed -i 's/\\begin{figure}[^{]*\\includegraphics\[[^{]*\]{[^}]*\/\([^}]*\)\.\(eps\|png\|jpg\)}[^{}]*\\end{figure}/<br \/><div class="center"><img style="max-width: 90%" src="img\/\1.\2" alt="image" \/><\/div>/g' "$1"
  ## eps images were converted to png
  sed -i 's/\(<img style="max-width: 90%" src="img\/[^"]*\)\.eps"/\1.png"/g' "$1"
}
lx2xl_figures "$output"
## Quotations (quote, quotation and similar spellings matched by
## quot[ae]) become <blockquote>, the titlepage environment becomes
## <div id="titlepage">
sed -i 's/\\begin{quot[ae]\(\|tion\)}[ ]*/<blockquote>/g
s/[ ]*\\end{quot[ae]\(\|tion\)}/<\/blockquote>/g
s/\\begin{titlepage}/<div id="titlepage">/g
s/\\end{titlepage}/<\/div>/g' "$output"
## Verse: the environment markers are just removed
sed -i 's/\\begin{verse}//g
s/\\end{verse}//g' "$output"
## Verbatim (first occurrence on each line only)
sed -i 's/\\begin{verbatim}/<pre>/
s/\\end{verbatim}/<\/pre>/' "$output"
## Lists: on a line holding \begin{itemize} or \begin{enumerate} the
## first \item is dropped (the opening tag below already brings its
## own <li>) and every further \item closes the previous entry and
## opens the next one
sed -i '/\\begin{\(itemize\|enumerate\)}/s/[ ]*\\item[ ]*//
/\\begin{\(itemize\|enumerate\)}/s/[ ]*\\item[ ]*/<\/li><li>/g
s/\\begin{itemize}[ ]*/<ul><li>/
s/[ ]*\\end{itemize}[ ]*/<\/li><\/ul>/
s/\\begin{enumerate}[ ]*/<ol><li>/
s/[ ]*\\end{enumerate}[ ]*/<\/li><\/ol>/' "$output"
## lx2xl_paragraphs FILE -- trim the lines of FILE and wrap plain text
## lines in <p>...</p>.  FILE is edited in place.
## A line is left unwrapped when it is empty or contains one of the
## tags <pre, <br , <h1-5>, <blockquote>, <div>, <ul> or <ol> (opening
## or closing form); <ol> is listed next to <ul> so that numbered
## lists do not end up inside a paragraph.
lx2xl_paragraphs() {
  ## Once again with spaces (leading and trailing, as two separate
  ## substitutions so that both ends are trimmed)
  sed -i '/<pre>/!{
s/^[ ]*//
s/ *$//
}' "$1"
  ## Set paragraphs
  sed -i '/\(^$\)\|<\(pre\|br \|[\/]*h[12345]\|[\/]*blockquote\|[\/]*div\|[\/]*[ou]l\)/!s/[ ]*\(.*\)[ ]*/<p>\1<\/p>/' "$1"
}
lx2xl_paragraphs "$output"
## Recovering newlines in verbatim environment
sed -i 's/nEwLiNe/\
/g' "$output"
## Set html header (change charset in meta to your system's default)
##
## lx2xl_header_text -- print the XHTML prologue, <head> with the
## embedded style sheet, and the title block.  Reads the variables
## document_title, author and date.  The text comes from a
## here-document, so backslashes or several lines in those values are
## written out literally.
lx2xl_header_text() {
  cat <<EOF
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>$document_title</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="author" content="$author" />
<meta name="description" content="" />
<meta name="keywords" content="" />
<style type="text/css">
body {
max-width: 650px;
padding: 10px
}
h1 {
padding-top: 20mm;
padding-bottom: 5mm;
text-align: center
}
h2 {
padding-top: 8mm;
}
h3 {
padding-top: 6mm;
}
h4 {
padding-top: 4mm;
}
pre {
background-color: #ccc;
padding: 10px
}
#titlepage {
font-size: 2em;
}
.right {
text-align: right
}
.center {
text-align: center
}
</style>
</head>
<body>
<h1 class="center">$document_title </h1>
<p class="center">$author </p>
<p class="center">$date </p>
EOF
}
## lx2xl_prepend_header FILE -- put the header in front of FILE's
## current contents.  The new page is assembled in a temporary file
## next to FILE and then copied over it, so the header is written even
## when FILE is empty.  Returns non-zero if FILE does not exist or a
## step fails.
lx2xl_prepend_header() {
  local page=$1 staging status
  if [ ! -f "$page" ] ; then
    echo "lx2xl: cannot add header, no such file: $page" >&2
    return 1
  fi
  staging=$(mktemp "$page.XXXXXX") || return 1
  { lx2xl_header_text && cat -- "$page" ; } > "$staging" &&
    cat -- "$staging" > "$page"
  status=$?
  rm -f -- "$staging"
  return "$status"
}
lx2xl_prepend_header "$output"
## Html foot: appended after the last line of the page
sed -i '$a\
<br \/><br \/><br \/><br \/> \
<hr \/> \
<p class="right">Converted with <tt>lx2xl<\/tt> (eloi at roquesor.com)<\/p> \
<br \/><br \/> \
<\/body> \
<\/html>' "$output"
## Finally, if you have tidy installed uncomment this to show html
## errors (remove or change the -utf8 tag in case)
#tidy -e -utf8 "$output"
## Or this to format
#tidy -m -i -utf8 "$output"
## And to debug this script this will show you tex tags not converted
## in the html: every line of the page that still contains a backslash
## is printed, or "None" when there is no such line
echo '
** Lines with not converted latex tags:'
grep '\\.*' "$output" || printf 'None\n\n'
## End lx2xl
## | <= Prev | Next => |
## You can mail me at eloi at roquesor.com.