Source code for core.pylatexenc.latex2text

import re;
import unicodedata;
import latexwalker;
import logging;


log = logging.getLogger(__name__);



class EnvDef:
[docs]    def __init__(self, envname, simplify_repl=None, discard=False):
        self.envname = envname
        self.simplify_repl = simplify_repl
        self.discard = discard

class MacroDef:
[docs]    def __init__(self, macname, simplify_repl=None, discard=None):
        """
        Arguments:
            - `macname`: the name of the macro (no backslash)
            - `simplify_repl`: either a string or a callable. The string
              may contain '%s' replacements, in which the macro arguments
              will be substituted. The callable should accept the
              :py:class:`~latexwalker.LatexMacroNode` as an argument.
        """
        if (isinstance(macname, MacroDef)):
            o = macname
            self.macname = o.macname
            self.discard = o.discard
            self.simplify_repl = o.simplify_repl
        elif (isinstance(macname, tuple)):
            (self.macname, self.simplify_repl) = macname
            self.discard = True if (discard is None) else discard ;
            if (simplify_repl is not None or discard is not None):
                raise ValueError("macname=%r is tuple but other parameters specified" %(macname,))
        else:
            self.macname = macname
            self.discard = True if (discard is None) else discard ;
            self.simplify_repl = simplify_repl


env_list = [
    EnvDef('', discard=False), # default for unknown environments

    EnvDef('equation', discard=False),
    EnvDef('eqnarray', discard=False),
    EnvDef('align', discard=False),
    EnvDef('multline', discard=False),

    # spaces added so that database indexing doesn't index the word "array" or "pmatrix"
    EnvDef('array', simplify_repl='< a r r a y >'),
    EnvDef('pmatrix', simplify_repl='< p m a t r i x >'),
    EnvDef('bmatrix', simplify_repl='< b m a t r i x >'),
    EnvDef('smallmatrix', simplify_repl='< s m a l l m a t r i x >'),

    EnvDef('center', simplify_repl='\n%s\n'),
    EnvDef('flushleft', simplify_repl='\n%s\n'),
    EnvDef('flushright', simplify_repl='\n%s\n'),
    
    EnvDef('exenumerate', discard=False),
    EnvDef('enumerate', discard=False),
    EnvDef('list', discard=False),
    EnvDef('itemize', discard=False),
    EnvDef('subequations', discard=False),
    EnvDef('figure', discard=False),
    EnvDef('table', discard=False),
    ];


# NOTE: macro will only be assigned arguments if they are explicitely defined as accepting arguments
#       in latexwalker.py.

macro_list = [
    MacroDef('', discard=True), # default for unknown macros

    MacroDef('textbf', discard=False),
    MacroDef('textit', discard=False),
    MacroDef('textsl', discard=False),
    MacroDef('textsc', discard=False),
    MacroDef('text', discard=False),
    MacroDef('mathrm', discard=False),

    # spaces added so that database indexing doesn't index the word "graphics"
    ('includegraphics', '< g r a p h i c s >'),

    ('ref', '<ref>'),
    ('eqref', '(<ref>)'),
    ('url', '<%s>'),
    ('item', lambda r: '\n  '+(latexnodes2text([r.nodeoptarg]) if r.nodeoptarg else '*')),
    ('footnote', '[%s]'),

    ('texorpdfstring', lambda node: latexnodes2text(node.nodeargs[1:2])), # use second argument

    ('oe', u'\u0153'),
    ('OE', u'\u0152'),
    ('ae', u'\u00e6'),
    ('AE', u'\u00c6'),
    ('aa', u'\u00e5'), # a norvegien/nordique
    ('AA', u'\u00c5'), # A norvegien/nordique
    ('o', u'\u00f8'), # o norvegien/nordique
    ('O', u'\u00d8'), # O norvegien/nordique
    ('ss', u'\u00df'), # s-z allemand
    ('L', u"\N{LATIN CAPITAL LETTER L WITH STROKE}"),
    ('l', u"\N{LATIN SMALL LETTER L WITH STROKE}"),
    ('i', u"\N{LATIN SMALL LETTER DOTLESS I}"),
    ('j', u"\N{LATIN SMALL LETTER DOTLESS J}"),

    ("~", "~" ),
    ("&", "\\&" ), # HACK, see below for text replacement of '&'
    ("$", "$" ),
    ("{", "{" ),
    ("}", "}" ),
    ("%", lambda arg: u"%" ), # careful: % is formatting substituion symbol...
    ("#", "#" ),
    ("_", "_" ),

    ("\\", '\n'),

    ("textquoteleft", "`"),
    ("textquoteright", "'"),
    ("textquotedblright", u"\N{RIGHT DOUBLE QUOTATION MARK}"),
    ("textquotedblleft", u"\N{LEFT DOUBLE QUOTATION MARK}"),
    ("textendash", u"\N{EN DASH}"),
    ("textemdash", u"\N{EM DASH}"),

    ('textpm', u"\N{PLUS-MINUS SIGN}"),
    ('textmp', u"\N{MINUS-OR-PLUS SIGN}"),

    ("texteuro", u"\N{EURO SIGN}"),

    # math stuff

    ("hbar", u"\N{LATIN SMALL LETTER H WITH STROKE}"),
    ("ell", u"\N{SCRIPT SMALL L}"),

    ('forall', u"\N{FOR ALL}"),
    ('complement', u"\N{COMPLEMENT}"),
    ('partial', u"\N{PARTIAL DIFFERENTIAL}"),
    ('exists', u"\N{THERE EXISTS}"),
    ('nexists', u"\N{THERE DOES NOT EXIST}"),
    ('varnothing', u"\N{EMPTY SET}"),
    ('emptyset', u"\N{EMPTY SET}"),
    # increment?
    ('nabla', u"\N{NABLA}"),
    #
    ('in', u"\N{ELEMENT OF}"),
    ('notin', u"\N{NOT AN ELEMENT OF}"),
    ('ni', u"\N{CONTAINS AS MEMBER}"),
    ('prod', u'\N{N-ARY PRODUCT}'),
    ('coprod', u'\N{N-ARY COPRODUCT}'),
    ('sum', u'\N{N-ARY SUMMATION}'),
    ('setminus', u'\N{SET MINUS}'),
    ('smallsetminus', u'\N{SET MINUS}'),
    ('ast', u'\N{ASTERISK OPERATOR}'),
    ('circ', u'\N{RING OPERATOR}'),
    ('bullet', u'\N{BULLET OPERATOR}'),
    ('sqrt', u'\N{SQUARE ROOT}(%s)'),
    ('propto', u'\N{PROPORTIONAL TO}'),
    ('infty', u'\N{INFINITY}'),
    ('parallel', u'\N{PARALLEL TO}'),
    ('nparallel', u'\N{NOT PARALLEL TO}'),
    ('wedge', u"\N{LOGICAL AND}"),
    ('vee', u"\N{LOGICAL OR}"),
    ('cap', u'\N{INTERSECTION}'),
    ('cup', u'\N{UNION}'),
    ('int', u'\N{INTEGRAL}'),
    ('iint', u'\N{DOUBLE INTEGRAL}'),
    ('iiint', u'\N{TRIPLE INTEGRAL}'),
    ('oint', u'\N{CONTOUR INTEGRAL}'),

    ('sim', u'\N{TILDE OPERATOR}'),
    ('backsim', u'\N{REVERSED TILDE}'),
    ('simeq', u'\N{ASYMPTOTICALLY EQUAL TO}'),
    ('approx', u'\N{ALMOST EQUAL TO}'),
    ('neq', u'\N{NOT EQUAL TO}'),
    ('equiv', u'\N{IDENTICAL TO}'),
    ('ge', u'>'),#
    ('le', u'<'),#
    ('leq', u'\N{LESS-THAN OR EQUAL TO}'),
    ('geq', u'\N{GREATER-THAN OR EQUAL TO}'),
    ('leqslant', u'\N{LESS-THAN OR EQUAL TO}'),
    ('geqslant', u'\N{GREATER-THAN OR EQUAL TO}'),
    ('leqq', u'\N{LESS-THAN OVER EQUAL TO}'),
    ('geqq', u'\N{GREATER-THAN OVER EQUAL TO}'),
    ('lneqq', u'\N{LESS-THAN BUT NOT EQUAL TO}'),
    ('gneqq', u'\N{GREATER-THAN BUT NOT EQUAL TO}'),
    ('ll', u'\N{MUCH LESS-THAN}'),
    ('gg', u'\N{MUCH GREATER-THAN}'),
    ('nless', u'\N{NOT LESS-THAN}'),
    ('ngtr', u'\N{NOT GREATER-THAN}'),
    ('nleq', u'\N{NEITHER LESS-THAN NOR EQUAL TO}'),
    ('ngeq', u'\N{NEITHER GREATER-THAN NOR EQUAL TO}'),
    ('lesssim', u'\N{LESS-THAN OR EQUIVALENT TO}'),
    ('gtrsim', u'\N{GREATER-THAN OR EQUIVALENT TO}'),
    ('lessgtr', u'\N{LESS-THAN OR GREATER-THAN}'),
    ('gtrless', u'\N{GREATER-THAN OR LESS-THAN}'),
    ('prec', u'\N{PRECEDES}'),
    ('succ', u'\N{SUCCEEDS}'),
    ('preceq', u'\N{PRECEDES OR EQUAL TO}'),
    ('succeq', u'\N{SUCCEEDS OR EQUAL TO}'),
    ('precsim', u'\N{PRECEDES OR EQUIVALENT TO}'),
    ('succsim', u'\N{SUCCEEDS OR EQUIVALENT TO}'),
    ('nprec', u'\N{DOES NOT PRECEDE}'),
    ('nsucc', u'\N{DOES NOT SUCCEED}'),
    ('subset', u'\N{SUBSET OF}'),
    ('supset', u'\N{SUPERSET OF}'),
    ('subseteq', u'\N{SUBSET OF OR EQUAL TO}'),
    ('supseteq', u'\N{SUPERSET OF OR EQUAL TO}'),
    ('nsubseteq', u'\N{NEITHER A SUBSET OF NOR EQUAL TO}'),
    ('nsupseteq', u'\N{NEITHER A SUPERSET OF NOR EQUAL TO}'),
    ('subsetneq', u'\N{SUBSET OF WITH NOT EQUAL TO}'),
    ('supsetneq', u'\N{SUPERSET OF WITH NOT EQUAL TO}'),


    ('cdot', u'\N{MIDDLE DOT}'),
    ('times', u'\N{MULTIPLICATION SIGN}'),
    ('otimes', u'\N{CIRCLED TIMES}'),
    ('oplus', u'\N{CIRCLED PLUS}'),
    ('bigotimes', u'\N{CIRCLED TIMES}'),
    ('bigoplus', u'\N{CIRCLED PLUS}'),

    ('frac', '%s/%s'),
    ('nicefrac', '%s/%s'),

    ('cos', 'cos'),
    ('sin', 'sin'),
    ('tan', 'tan'),
    ('arccos', 'arccos'),
    ('arcsin', 'arcsin'),
    ('arctan', 'arctan'),

    ('prime', "'"),
    ('dag', u"\N{DAGGER}"),
    ('dagger', u"\N{DAGGER}"),
    ('pm', u"\N{PLUS-MINUS SIGN}"),
    ('mp', u"\N{MINUS-OR-PLUS SIGN}"),

    (',', u" "),
    (';', u" "),
    (':', u" "),
    (' ', u" "),
    ('!', u""), # sorry, no negative space in ascii
    ('quad', u"  "),
    ('qquad', u"    "),

    ('ldots', u"..."),
    ('cdots', u"..."),
    ('ddots', u"..."),
    ('dots', u"..."),
    
    ('langle', u'\N{LEFT ANGLE BRACKET}'),
    ('rangle', u'\N{RIGHT ANGLE BRACKET}'),
    ('mid', u'|'),
    ('nmid', u'\N{DOES NOT DIVIDE}'),
    
    ('ket', u'|%s\N{RIGHT ANGLE BRACKET}'),
    ('bra', u'\N{LEFT ANGLE BRACKET}%s|'),
    ('braket', u'\N{LEFT ANGLE BRACKET}%s|%s\N{RIGHT ANGLE BRACKET}'),
    ('ketbra', u'|%s\N{RIGHT ANGLE BRACKET}\N{LEFT ANGLE BRACKET}%s|'),
    ('uparrow', u'\N{UPWARDS ARROW}'),
    ('downarrow', u'\N{DOWNWARDS ARROW}'),
    ('rightarrow', u'\N{RIGHTWARDS ARROW}'),
    ('to', u'\N{RIGHTWARDS ARROW}'),
    ('leftarrow', u'\N{LEFTWARDS ARROW}'),
    ('longrightarrow', u'\N{LONG RIGHTWARDS ARROW}'),
    ('longleftarrow', u'\N{LONG LEFTWARDS ARROW}'),

    # we use these conventions as Identity operator (\mathbbm{1})
    ('id', u'\N{MATHEMATICAL DOUBLE-STRUCK CAPITAL I}'),
    ('Ident', u'\N{MATHEMATICAL DOUBLE-STRUCK CAPITAL I}'),
    ];



def _format_uebung(n):
    s = '\n%s\n' %(latexnodes2text([n.nodeargs[0]]));
    optarg = n.nodeargs[1];
    if (optarg is not None):
        s += '[%s]\n' %(latexnodes2text([optarg]));
    return s


macro_list += [
    # some ethuebung.sty macros
    ('exercise', _format_uebung),
    ('uebung', _format_uebung),
    ('hint', 'Hint: %s'),
    ('hints', 'Hints: %s'),
    ('hinweis', 'Hinweis: %s'),
    ('hinweise', 'Hinweise: %s'),
    ];





def greekletters(letterlist):
[docs]    for l in letterlist:
        ucharname = l.upper()
        if (ucharname == 'LAMBDA'):
            ucharname = 'LAMDA'
        smallname = "GREEK SMALL LETTER "+ucharname;
        if (ucharname == 'EPSILON'):
            smallname = "GREEK LUNATE EPSILON SYMBOL"
        if (ucharname == 'PHI'):
            smallname = "GREEK PHI SYMBOL"
        macro_list.append(
            (l, unicodedata.lookup(smallname))
            );
        macro_list.append(
            (l[0].upper()+l[1:], unicodedata.lookup("GREEK CAPITAL LETTER "+ucharname))
            );
greekletters( ('alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta', 'iota', 'kappa',
               'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho', 'sigma', 'tau', 'upsilon', 'phi',
               'chi', 'psi', 'omega') )
macro_list += [
    ('varepsilon', u'\N{GREEK SMALL LETTER EPSILON}'),
    ('vartheta', u'\N{GREEK THETA SYMBOL}'),
    ('varpi', u'\N{GREEK PI SYMBOL}'),
    ('varrho', u'\N{GREEK RHO SYMBOL}'),
    ('varsigma', u'\N{GREEK SMALL LETTER FINAL SIGMA}'),
    ('varphi', u'\N{GREEK SMALL LETTER PHI}'),
    ]

unicode_accents_list = (
    # see http://en.wikibooks.org/wiki/LaTeX/Special_Characters for a list
    ("'", u"\N{COMBINING ACUTE ACCENT}"),
    ("`", u"\N{COMBINING GRAVE ACCENT}"),
    ('"', u"\N{COMBINING DIAERESIS}"),
    ("c", u"\N{COMBINING CEDILLA}"),
    ("^", u"\N{COMBINING CIRCUMFLEX ACCENT}"),
    ("~", u"\N{COMBINING TILDE}"),
    ("H", u"\N{COMBINING DOUBLE ACUTE ACCENT}"),
    ("k", u"\N{COMBINING OGONEK}"),
    ("=", u"\N{COMBINING MACRON}"),
    ("b", u"\N{COMBINING MACRON BELOW}"),
    (".", u"\N{COMBINING DOT ABOVE}"),
    ("d", u"\N{COMBINING DOT BELOW}"),
    ("r", u"\N{COMBINING RING ABOVE}"),
    ("u", u"\N{COMBINING BREVE}"),
    ("v", u"\N{COMBINING CARON}"),

    ("vec", u"\N{COMBINING RIGHT ARROW ABOVE}"),
    ("dot", u"\N{COMBINING DOT ABOVE}"),
    ("hat", u"\N{COMBINING CIRCUMFLEX ACCENT}"),
    ("check", u"\N{COMBINING CARON}"),
    ("breve", u"\N{COMBINING BREVE}"),
    ("acute", u"\N{COMBINING ACUTE ACCENT}"),
    ("grave", u"\N{COMBINING GRAVE ACCENT}"),
    ("tilde", u"\N{COMBINING TILDE}"),
    ("bar", u"\N{COMBINING OVERLINE}"),
    ("ddot", u"\N{COMBINING DIAERESIS}"),

    ("not", u"\N{COMBINING LONG SOLIDUS OVERLAY}"),

    );

def make_accented_char(node, combining):
[docs]    nodearg = node.nodeargs[0] if len(node.nodeargs) else latexwalker.LatexCharsNode(chars=' ')

    c = latexnodes2text([nodearg]).strip();

    def getaccented(ch, combining):
        ch = unicode(ch)
        combining = unicode(combining)
        if (ch == u"\N{LATIN SMALL LETTER DOTLESS I}"):
            ch = u"i"
        if (ch == u"\N{LATIN SMALL LETTER DOTLESS I}"):
            ch = u"j"
        #print u"Accenting %s with %s"%(ch, combining) # this causes UnicdeDecodeError!!!
        return unicodedata.normalize('NFC', unicode(ch)+combining)

    return u"".join([getaccented(ch, combining) for ch in c]);


for u in unicode_accents_list:
    (mname, mcombining) = u;
    macro_list.append(
        (mname, lambda x, c=mcombining: make_accented_char(x, c))
        );






text_replacements = (
    # remove indentation provided by LaTeX
    #(re.compile(r'\n[ \t]*'), '\n'),
    
    ("~", " "),
    ("``", '"'),
    ("''", '"'),

    (r'(?<!\\)&', '   '), # ignore tabular alignments, just add a little space
    ('\\&', '&'), # but preserve the \& escapes, that we before *hackingly* kept as '\&' for this purpose ...

    );









env_dict = dict([(e.envname, e) for e in env_list])
macro_dict = dict([(m.macname, m) for m in (MacroDef(m) for m in macro_list)])





def latex2text(content, tolerant_parsing=False, keep_inline_math=False, keep_comments=False):
[docs]    """
    Extracts text from `content` meant for database indexing. `content` is
    some LaTeX code.
    """

    (nodelist, tpos, tlen) = latexwalker.get_latex_nodes(content, keep_inline_math=keep_inline_math,
                                                         tolerant_parsing=tolerant_parsing);

    return latexnodes2text(nodelist, keep_inline_math=keep_inline_math, keep_comments=keep_comments);


def latexnodes2text(nodelist, keep_inline_math=False, keep_comments=False):
[docs]    """
    Extracts text from a node list. `nodelist` is a list of nodes as returned by
    `latexwalker.get_latex_nodes()`.
    """
    

    def text_from_node(node):
        if (node is None):
            return ""
        
        if (node.isNodeType(latexwalker.LatexCharsNode)):
            return node.chars
        if (node.isNodeType(latexwalker.LatexCommentNode)):
            if (keep_comments):
                return '%'+node.comment
            return ""
        if (node.isNodeType(latexwalker.LatexGroupNode)):
            return "".join([text_from_node(n) for n in node.nodelist]);
        if (node.isNodeType(latexwalker.LatexMacroNode)):
            # get macro behavior definition.
            macroname = node.macroname.rstrip('*');
            if (macroname in macro_dict):
                mac = macro_dict[macroname]
            else:
                # no predefined behavior, use default:
                mac = macro_dict['']
            if mac.simplify_repl:
                if (callable(mac.simplify_repl)):
                    return mac.simplify_repl(node)
                if ('%' in mac.simplify_repl):
                    try:
                        return mac.simplify_repl % tuple([text_from_node(nn) for nn in node.nodeargs])
                    except (TypeError, ValueError):
                        log.warning("WARNING: Error in configuration: macro '%s' failed its substitution!"
                                    %(macroname));
                        return mac.simplify_repl; # too bad, keep the percent signs as they are...
                return mac.simplify_repl
            if mac.discard:
                return ""
            a = node.nodeargs;
            if (node.nodeoptarg):
                a.prepend(node.nodeoptarg)
            return "".join([text_from_node(n) for n in a])

        if (node.isNodeType(latexwalker.LatexEnvironmentNode)):
            # get environment behavior definition.
            envname = node.envname.rstrip('*');
            if (envname in env_dict):
                envdef = env_dict[envname]
            else:
                # no predefined behavior, use default:
                envdef = env_dict['']
            if envdef.simplify_repl:
                if (callable(envdef.simplify_repl)):
                    return envdef.simplify_repl(node)
                if ('%' in envdef.simplify_repl):
                    return envdef.simplify_repl % ("".join([text_from_node(nn) for nn in node.nodelist]))
                return envdef.simplify_repl
            if envdef.discard:
                return ""
            return "".join([text_from_node(n) for n in node.nodelist])

        if (node.isNodeType(latexwalker.LatexMathNode)):
            # if we have a math node, this means we care about math modes and we should keep this verbatim.
            return latexwalker.math_node_to_latex(node);

        print "extract_from_latex(): IGNORING NODE: "+repr(node)

        # discard anything else.
        return ""
    

    s = "".join([text_from_node(n) for n in nodelist]);

    # now, perform suitable replacements
    for pattern, replacement in text_replacements:
        if (hasattr(pattern, 'sub')):
            s = pattern.sub(replacement, s)
        else:
            s = s.replace(pattern, replacement)


    #  ###TODO: more clever handling of math modes??

    if (not keep_inline_math):
        s = s.replace('$', ''); # removing math mode inline signs, just keep their Unicode counterparts..

    return s







if __name__ == '__main__':

    try:

        #latex = '\\textit{hi there!} This is {\em an equation}: \\begin{equation}\n a + bi = 0\n\\end{equation}\n\nwhere $i$ is the imaginary unit.\n';

        import fileinput

        print "Please type some latex text (Ctrl+D twice to stop) ..."

        latex = ''
        for line in fileinput.input():
            latex += line;


        print '\n--- WORDS ---\n'
        print latex2text(latex.decode('utf-8')#, keep_inline_math=True
                         ).encode('utf-8')
        print '\n-------------\n'

    except:
        import pdb;
        import traceback;
        import sys;
        (exc_type, exc_value, exc_traceback) = sys.exc_info()
        
        print "\nEXCEPTION: " + unicode(sys.exc_value) + "\n"
        
        pdb.post_mortem()