[HOME] [BASH] [Window Managers]

Convert LaTeX to XHTML with sed

#! /bin/bash
#
## lx2xl -- Walter Alejandro Iglesias (last correction: Jun 2010)
#
## This is a very basic LaTeX to XHTML converter, useful for novelists.
## I've included support for the bare minimum only.  For scientific
## articles and the like, docbook or latex2html are better choices.
#
## WHAT IT DOESN'T DO: LaTeX being so complex, there are thousands of
## commands this script cannot translate.  In general, it can't deal
## with non-enclosed declarations (i.e. \em or \large tags).  You may
## try using \emph{} or \begin{Large} \end{Large} in your tex file
## instead.
##
## Important things NOT supported (I'd like to add support for them in
## the future, so this script would be useful for scientific articles
## too):
## - numbering sections
## - tables
## - math formulae (it just translates symbols)
## - table of contents
## - columns
#
## Interesting things IT DOES:
## - Uses ImageMagick to convert Encapsulated PostScript images to
## png and adds them to the document, with the alt, width and foot
## properties taken from the latex caption and width.
## - Includes files from a multifile tex environment.
## - Supports UTF-8
#

## Validate the argument: it must be non-empty, name an existing file
## and that file must hold a begin{document} line.  Otherwise print
## the usage and abort with status 1.
if [ -z "$1" ] || [ ! -e "$1" ] ||
	! grep -q 'begin{document}' -- "$1" ; then
	echo "Usage: $(basename -- "$0") </path/to/master-file.tex>"
	exit 1
fi

## Change to the directory that holds the master file.  Abort if that
## fails: every later step uses paths relative to it and would
## otherwise read and write in the wrong place.
cd -- "$(dirname -- "$1")" || exit 1

## Creating environment: html/ receives the page, html/img the images
img_output=html/img
mkdir -p "$img_output" || exit 1
output=html/$(basename -- "$1" .tex).html

## Gather every image referenced as \includegraphics[...]{path} in the
## tex files of this directory.  Paths holding "png" or "jpg" are
## copied as they are; anything else is taken to be eps and converted
## to png (needs Imagemagick installed).  Images already present in
## $img_output are left alone.  Only lines where a path could be
## extracted are fed to the loop.
grep -h '\\includegraphics' -- *.tex |
	sed -n 's/.*\]{\([^}]*\)}.*/\1/p' |
	while IFS= read -r img ; do
		case $img in
		*png*|*jpg*)
			[ -e "$img_output/$(basename -- "$img")" ] ||
				cp -- "$img" "$img_output"
			;;
		*)
			png=$img_output/$(basename -- "$img" .eps).png
			[ -e "$png" ] ||
				convert -density 100 "$img" -flatten "$png"
			;;
		esac
	done

## Saving document title, author and date to variables.  Lines holding
## a % are skipped.  This no longer depends on the exit status of the
## image loop above.
master=$(basename -- "$1")
document_title=$(sed -n '/%/!s/\\title{\([^{}]*\)}/\1/p' "$master")
author=$(sed -n '/%/!s/\\author{\([^{}]*\)}/\1/p' "$master")
if [ "$(sed -n '/%/!s/\\date{\([^{}]*\)}/\1/p' "$master")" = '\today' ] ; then
	# \date{\today}: use the current date
	date=$(date '+%B %d, %Y')
else
	date=$(sed -n '/%/!s/\\date{\(.*\)}/\1/p' "$master")
fi

## Build the working copy (tempfile) from the master file in one pass:
##  - delete tex comments (lines holding an escaped \% are left alone)
##  - outside verbatim environments, delete spaces at the beginning
##    and the end of lines
sed -e '/\\%/!s/%.*//g' \
	-e '/\\begin{verbatim}/,/\\end{verbatim}/!s/^[ ]*//' \
	-e '/\\begin{verbatim}/,/\\end{verbatim}/!s/ *$//' \
	"$(basename -- "$1")" > tempfile

## Include files called with include latex command
## $1: file edited in place.  For each \include{NAME} found, the
## content of NAME.tex is inserted after the line holding it; the
## \include entries themselves are then replaced with vertical space.
expand_includes() {
	local doc=$1 names name
	names=$(sed -n 's/.*\\include{\([^{}]*\)}.*/\1/p' "$doc")
	while IFS= read -r name ; do
		[ -n "$name" ] || continue
		# "#" delimits the address so that a NAME holding "/" (a file in
		# a subdirectory) does not end the regex early.  The doubled
		# backslashes reach sed as \\include, a literal backslash.
		sed -i "\\#\\\\include{${name}}#r ${name}.tex" "$doc"
	done <<< "$names"
	## Replace include entries with spaces
	sed -i 's/\\include{[^{}]*}/<br \/><br \/><br \/><br \/>/' "$doc"
}
expand_includes tempfile

## Once again cleaning comments and spaces (the text read in by the
## include step has not been cleaned yet), then delete the latex
## headings: everything from the first line up to and including the
## \begin{document} line.
sed -i \
	-e '/\\%/!s/%.*//g' \
	-e '/\\begin{verbatim}/,/\\end{verbatim}/!s/^[ ]*//' \
	-e '/\\begin{verbatim}/,/\\end{verbatim}/!s/ *$//' \
	-e '1,/\\begin{document}/d' \
	tempfile

## Math symbols (This script doesn't support equation environment,
## just replace some symbols)
#
## Greek Letters
## $1: file edited in place.  On lines holding a $, every \NAME from
## the list below becomes the html entity &NAME;.  A name that merely
## extends another one (thetasym/theta, piv/pi, sigmaf/sigma) has to
## come first, otherwise the shorter rule eats its prefix.
greek_to_entities() {
	local file=$1 name rule script=''
	for name in \
		Gamma Delta Theta Lambda Xi Pi Sigma Phi Psi Omega \
		alpha beta gamma delta epsilon zeta eta thetasym theta iota \
		kappa lambda mu nu xi piv pi rho sigmaf sigma tau upsilon \
		upsih phi chi psi omega
	do
		# Builds the sed rule  /\$/s/\\NAME/\&NAME;/g
		printf -v rule '/\\$/s/\\\\%s/\\&%s;/g\n' "$name" "$name"
		script+=$rule
	done
	sed -i "$script" "$file"
}
greek_to_entities tempfile

## Some punctuation symbols, turned into html entities on the lines
## that hold a $ only
sed -i \
	-e '/\$/s/\\bullet/\&bull;/g' \
	-e '/\$/s/\\wp/\&weierp;/g' \
	-e '/\$/s/\\Re/\&real;/g' \
	-e '/\$/s/\\aleph/\&alefsym;/g' \
	tempfile

## Arrows, turned into html entities on the lines that hold a $ only
sed -i \
	-e '/\$/s/\\leftarrow/\&larr;/g' \
	-e '/\$/s/\\rightarrow/\&rarr;/g' \
	-e '/\$/s/\\downarrow/\&darr;/g' \
	-e '/\$/s/\\uparrow/\&uarr;/g' \
	-e '/\$/s/\\leftrightarrow/\&harr;/g' \
	-e '/\$/s/\\hookleftarrow/\&crarr;/g' \
	-e '/\$/s/\\Leftarrow/\&lArr;/g' \
	-e '/\$/s/\\Uparrow/\&uArr;/g' \
	-e '/\$/s/\\Rightarrow/\&rArr;/g' \
	-e '/\$/s/\\Downarrow/\&dArr;/g' \
	-e '/\$/s/\\Leftrightarrow/\&hArr;/g' \
	tempfile

## Mathematical Operators and Relations
## $1: file edited in place.  Each item below is MACRO:ENTITY; on
## lines holding a $, \MACRO becomes &ENTITY;.  Order matters: a macro
## that starts with another one has to be listed before it (infty and
## int before in, subseteq before subset, supseteq before supset),
## otherwise the shorter rule would rewrite its prefix.
math_operators_to_entities() {
	local file=$1 pair rule script=''
	for pair in \
		forall:forall partial:part exists:exist emptyset:empty \
		nabla:nabla infty:infin int:int in:isin ni:ni sqcap:prod \
		sum:sum ast:lowast surd:radic propto:prop angle:ang \
		wedge:and vee:or cap:cap cup:cup sim:sim cong:cong \
		approx:asymp neq:ne equiv:equiv leq:le geq:ge \
		subseteq:sube supseteq:supe subset:sub supset:sup \
		oplus:oplus otimes:otimes perp:perp cdot:sdot
	do
		# Builds the sed rule  /\$/s/\\MACRO/\&ENTITY;/g
		printf -v rule '/\\$/s/\\\\%s/\\&%s;/g\n' "${pair%%:*}" "${pair#*:}"
		script+=$rule
	done
	sed -i "$script" "$file"
}
math_operators_to_entities tempfile

## Corners and angle brackets, turned into html entities on the lines
## that hold a $ only
sed -i \
	-e '/\$/s/\\lceil/\&lceil;/g' \
	-e '/\$/s/\\rceil/\&rceil;/g' \
	-e '/\$/s/\\lfloor/\&lfloor;/g' \
	-e '/\$/s/\\rfloor/\&rfloor;/g' \
	-e '/\$/s/\\langle/\&lang;/g' \
	-e '/\$/s/\\rangle/\&rang;/g' \
	tempfile

## Remove the $ math marks, then append the nEwLiNe marker to every
## line inside a verbatim environment so that its line ends can be
## told apart once the document is joined into long lines.
sed -i \
	-e 's/\$//g' \
	-e '/\\begin{verbatim}/,/\\end{verbatim}/s/$/nEwLiNe/' \
	tempfile

## Workaround to delete newlines only in paragraphs:
## tag every empty line with a marker word...
sed -i 's/^$/bLaNkLiNe/' tempfile
## ...then join the whole document into a single line, each newline
## becoming a space.  From here on the work continues in $output.
tr '\n' ' ' < tempfile > $output
rm -f tempfile
## Recovering blanklines between paragraphs:
## squeeze each run of markers, together with the spaces around them,
## into a single marker...
sed -i 's/\([ ]*\(bLaNkLiNe\)[ ]*\)*/\2/g' $output
## ...and turn every remaining marker into a real newline
sed -i 's/bLaNkLiNe/\
/g' $output

## From here on, each paragraph is a single line.  Keep this in mind
## when writing sed addresses.
## ------------------------------------------------------------------

## Once again with spaces: trim both ends of every line, except the
## lines that hold a \begin{verbatim}
sed -i \
	-e '/\\begin{verbatim}/!s/^[ ]*//' \
	-e '/\\begin{verbatim}/!s/ *$//' \
	"$output"

## TO DO (and not translatable)
## $1: file edited in place.  The commands below have no html
## counterpart here and are simply deleted.  The "\small " pattern
## carries a trailing space, presumably so that \smallskip is left for
## the vertical-space rules.  \huge is deleted like \Huge.
drop_untranslatable() {
	sed -i 's/\\end{document}//
s/\\begin{table}//g
s/\\end{table}//g
s/\\newpage//g
s/\\maketitle//
s/\\thispagestyle{[^}]*}//g
s/\\noindent[ ]*//g
s/\\nopagebreak[ ]*//g
s/\\doublespacing//g
s/\\singlespacing//g
s/\\setcounter{[^}]*}{[^}]*}[ ]*//g
s/\\centering//g
s/\\tiny//g
s/\\scriptsize//g
s/\\footnotesize//g
s/\\small //g
s/\\normalsize//g
s/\\.arge//g
s/\\LARGE//g
s/\\huge//g
s/\\Huge//g' "$1"
}
drop_untranslatable "$output"


## Vertical spaces: \bigskip, \medskip and \smallskip become one line
## break; \vspace (starred or not, whatever its argument) becomes two
sed -i \
	-e 's/\\\(big\|med\|small\)skip/<br \/>/g' \
	-e 's/[ ]*\\vspace[*]*{[^{}]*}[ ]*/<br \/><br \/>/g' \
	"$output"

## Font size (simple tag not supported)
## $1: file edited in place.  \begin{SIZE} opens a <div> with a
## relative font size and \end{SIZE} closes it.  \begin{huge} is
## handled too (same size as Huge): its \end{huge} was already being
## turned into </div>, which left an unbalanced tag behind.
font_sizes_to_div() {
	sed -i \
		-e 's/\\begin{tiny}/<div style="font-size: 0.8em">/g' \
		-e 's/\\begin{scriptsize}/<div style="font-size: 0.85em">/g' \
		-e 's/\\begin{footnotesize}/<div style="font-size: 0.9em">/g' \
		-e 's/\\begin{small}/<div style="font-size: 0.95em">/g' \
		-e 's/\\begin{normalsize}/<div style="font-size: 1em">/g' \
		-e 's/\\begin{large}/<div style="font-size: 1.05em">/g' \
		-e 's/\\begin{Large}/<div style="font-size: 1.1em">/g' \
		-e 's/\\begin{LARGE}/<div style="font-size: 1.15em">/g' \
		-e 's/\\begin{huge}/<div style="font-size: 1.2em">/g' \
		-e 's/\\begin{Huge}/<div style="font-size: 1.2em">/g' \
		-e 's/\\end{\(\|tiny\|scriptsize\|footnotesize\|small\|normalsize\|.arge\|LARGE\|huge\|Huge\)}/<\/div>/g' \
		"$1"
}
font_sizes_to_div "$output"

## Punctuation: guillemets, double quotes, ellipsis, forced line
## breaks (\\), dashes and a few escaped characters
sed -i \
	-e 's/<</«/g' \
	-e 's/>>/»/g' \
	-e 's/``/"/g' \
	-e 's/'\'\''/\"/g' \
	-e 's/[\]*ldots[\]*/\.\.\./g' \
	-e 's/\\\\/<br \/>/g' \
	-e 's/---/\&mdash;/g' \
	-e 's/--/\&ndash;/g' \
	-e 's/\\%/%/g' \
	-e 's/\\_/_/g' \
	-e 's/\.\\ /\.\ /g' \
	"$output"

## Replacing environments.  Each rule only matches an argument free of
## braces, so the whole list is applied twice to get rid of one level
## of nesting: the first round converts the inner command, the second
## one the command around it.
inline_markup='s/\\emph{\([^{}]*\)}/<em>\1<\/em>/g
s/\\textit{\([^{}]*\)}/<i>\1<\/i>/g
s/\\textbf{\([^{}]*\)}/<b>\1<\/b>/g
s/\\texttt{\([^{}]*\)}/<tt>\1<\/tt>/g
s/\\footnote{\([^{}]*\)}/\[*<tt style="font-size: 12px; color: gray">\1\<\/tt>]/g'
sed -i -e "$inline_markup" -e "$inline_markup" "$output"

## Alignment and headings.  The first two rules drop flushright and
## center markers on lines matching their sectioning address, so the
## markers do not wrap a heading; the remaining markers become <div>s.
## Sectioning commands become h4..h1, the most specific name first.
sed -i \
	-e '/\([sub]*section\|chapter\)[\*]{/s/\\begin{\(flushright\|center[ing]*\)}//g' \
	-e '/\\\([sub]*section\|chapter\)[\*]/s/\\end{\(flushright\|center[ing]*\)}//g' \
	-e 's/\\begin{flushright}[ ]*/<div class="right">/g' \
	-e 's/[ ]*\\end{flushright}/<\/div>/g' \
	-e 's/\\begin{center[ing]*}[ ]*/<div class="center">/g' \
	-e 's/[ ]*\\end{center[ing]*}/<\/div>/g' \
	-e 's/\\subsubsection[^{}]*{[ ]*\([^{}]*\)[ ]*}/<h4>\1<\/h4>/g' \
	-e 's/\\subsection[^{}]*{[ ]*\([^{}]*\)[ ]*}/<h3>\1<\/h3>/g' \
	-e 's/\\section[^{}]*{[ ]*\([^{}]*\)[ ]*}/<h2>\1<\/h2>/g' \
	-e 's/\\chapter[^{}]*{[ ]*\([^{}]*\)[ ]*}/<br \/><br \/><br \/><h1>\1\&nbsp;<\/h1><br \/><br \/>/g' \
	"$output"

## Images
## $1: file edited in place.  A figure environment holding an
## \includegraphics[...]{dir/NAME.EXT} becomes a centered <img> that
## points into img/.  The first rule handles figures with a caption
## (by now wrapped in an html tag, e.g. <em>), reused as alt text and
## as the line under the image; the second one figures without.
## The extension is kept, since png and jpg files are copied as they
## are; only eps, which gets converted, is pointed at the png.
figures_to_html() {
	sed -i \
		-e 's/\\begin{figure}[^{]*\\includegraphics\[[^{]*\]{[^}]*\/\([^}]*\)\.\(eps\|png\|jpg\)}[^{}]*\\caption{<[^>]*>\([^}]*\)<[^>]*>}.*\\end{figure}/<br \/><div class="center"><img style="max-width: 90%" src="img\/\1.\2" alt="\3" \/><br \/><br \/><em>\3<\/em><\/div>/g' \
		-e 's/\\begin{figure}[^{]*\\includegraphics\[[^{]*\]{[^}]*\/\([^}]*\)\.\(eps\|png\|jpg\)}[^{}]*\\end{figure}/<br \/><div class="center"><img style="max-width: 90%" src="img\/\1.\2" alt="image" \/><\/div>/g' \
		-e 's/\(<img [^>]*src="img\/[^"]*\)\.eps"/\1.png"/g' \
		"$1"
}
figures_to_html "$output"

## Quote and quotation environments become <blockquote>; titlepage
## becomes the <div id="titlepage"> styled in the html header
sed -i \
	-e 's/\\begin{quot[ae]\(\|tion\)}[ ]*/<blockquote>/g' \
	-e 's/[ ]*\\end{quot[ae]\(\|tion\)}/<\/blockquote>/g' \
	-e 's/\\begin{titlepage}/<div id="titlepage">/g' \
	-e 's/\\end{titlepage}/<\/div>/g' \
	"$output"

## Verse: the environment markers are dropped, the text is kept
sed -i \
	-e 's/\\begin{verse}//g' \
	-e 's/\\end{verse}//g' \
	"$output"

## Verbatim: the first \begin{verbatim} and \end{verbatim} of a line
## become <pre> and </pre>
sed -i \
	-e 's/\\begin{verbatim}/<pre>/' \
	-e 's/\\end{verbatim}/<\/pre>/' \
	"$output"

## Lists.  On a line holding \begin{itemize} or \begin{enumerate} the
## first \item is dropped (the opening <li> comes with the \begin
## rule) and every further \item closes one entry and opens the next.
sed -i \
	-e '/\\begin{\(itemize\|enumerate\)}/s/[ ]*\\item[ ]*//' \
	-e '/\\begin{\(itemize\|enumerate\)}/s/[ ]*\\item[ ]*/<\/li><li>/g' \
	-e 's/\\begin{itemize}[ ]*/<ul><li>/' \
	-e 's/[ ]*\\end{itemize}[ ]*/<\/li><\/ul>/' \
	-e 's/\\begin{enumerate}[ ]*/<ol><li>/' \
	-e 's/[ ]*\\end{enumerate}[ ]*/<\/li><\/ol>/' \
	"$output"

## Once again with spaces
## $1: file edited in place.  Leading and trailing spaces are removed
## from every line that does not hold a <pre>.  Two rules are needed:
## a single s/\(^[ ]*\| *$\)// without the g flag stops after its
## first match, which is always the (possibly empty) one at the start
## of the line, so the end of the line was never trimmed.
trim_html_lines() {
	sed -i \
		-e '/<pre>/!s/^[ ]*//' \
		-e '/<pre>/!s/ *$//' \
		"$1"
}
trim_html_lines "$output"

## Set paragraphs
## $1: file edited in place.  Every line is wrapped in <p>...</p>
## unless it is empty or holds one of the block-level tags listed in
## the address (pre, br, h1-h5, blockquote, div, ul, ol).  Ordered
## lists are excluded just like unordered ones.
wrap_paragraphs() {
	sed -i '/\(^$\)\|<\(pre\|br \|[\/]*h[12345]\|[\/]*blockquote\|[\/]*div\|[\/]*ul\|[\/]*ol\)/!s/[ ]*\(.*\)[ ]*/<p>\1<\/p>/' "$1"
}
wrap_paragraphs "$output"

## Recovering newlines in verbatim environment: every nEwLiNe marker
## becomes a real line break again
sed -i 's/nEwLiNe/\
/g' "$output"

## Set html header (change charset in meta to your system's default)
## The text below is handed to the sed "1i" command, which inserts it
## before the first line of $output.  It sits inside shell double
## quotes, therefore:
##  - $document_title, $author and $date are expanded by the shell
##    before sed sees the text
##  - every literal double quote is written \"
##  - every line ends in \\, which reaches sed as a single backslash
##    followed by the newline, so the inserted text continues on the
##    next line
##  - \& reaches sed unchanged and gives a plain & in the page
sed -i 1i\ "\\
<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"> \\
<html xmlns=\"http://www.w3.org/1999/xhtml\"> \\
<head> \\
<title>$document_title</title> \\
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /> \\
<meta name=\"author\" content=\"$author\" /> \\
<meta name=\"description\" content=\"\" /> \\
<meta name=\"keywords\" content=\"\" /> \\
<style type=\"text/css\"> \\
body { \\
max-width: 650px; \\
padding: 10px \\
} \\
h1 { \\
padding-top: 20mm; \\
padding-bottom: 5mm; \\
text-align: center \\
} \\
h2 { \\
padding-top: 8mm; \\
} \\
h3 { \\
padding-top: 6mm; \\
} \\
h4 { \\
padding-top: 4mm; \\
} \\
pre { \\
background-color: #ccc; \\
padding: 10px \\
} \\
#titlepage { \\
font-size: 2em; \\
} \\
.right { \\
text-align: right \\
} \\
.center { \\
text-align: center \\
} \\
</style> \\
</head> \\
<body> \\
<h1 class=\"center\">$document_title\&nbsp;</h1> \\
<p class=\"center\">$author\&nbsp;</p> \\
<p class=\"center\">$date\&nbsp;</p>" $output

## Html foot: append the credit line and the closing tags after the
## last line of the page
sed -i '$a\
<br \/><br \/><br \/><br \/> \
<hr \/> \
<p class="right">Converted with <tt>lx2xl<\/tt> (eloi at roquesor.com)<\/p> \
<br \/><br \/> \
<\/body> \
<\/html>' "$output"

## Finally, if you have tidy installed, uncomment this to show html
## errors (remove or change the -utf8 option if your files use another
## encoding)
#tidy -e -utf8 $output
## Or this to format
#tidy -m -i -utf8 $output

## And to debug this script this will show you tex tags not converted
## in the html: every line of the page still holding a backslash is
## printed, or "None" when grep finds no such line
printf '\n\n** Lines with not converted latex tags:\n'
grep '\\.*' -- "$output" || printf 'None\n\n'

## End lx2xl
    
<= Prev Next =>


[HOME] [BASH] [Window Managers]

You can mail me at eloi at roquesor.com.