# Source code for core.pylatexenc.latex2text

import re;
import unicodedata;
import latexwalker;
import logging;


log = logging.getLogger(__name__);



class EnvDef(object):
    """
    Describes how a LaTeX environment is converted to text.

    Arguments:
        - `envname`: the name of the environment (e.g. ``'equation'``).  The
          entry with the empty name ``''`` is the one used for environments
          that have no entry of their own.
        - `simplify_repl`: either a string or a callable.  The string may
          contain one '%s' replacement, in which the text of the environment
          body will be substituted.  The callable is called with the
          environment node as its only argument.
        - `discard`: if true (and no `simplify_repl` is given), the whole
          environment is dropped from the output.
    """

    def __init__(self, envname, simplify_repl=None, discard=False):
        self.envname = envname
        self.simplify_repl = simplify_repl
        self.discard = discard


class MacroDef(object):
    """
    Describes how a LaTeX macro is converted to text.

    Besides the plain form ``MacroDef(macname, simplify_repl, discard)``, the
    first argument may also be another `MacroDef` (which is then copied) or a
    tuple ``(macname, simplify_repl)``.
    """

    def __init__(self, macname, simplify_repl=None, discard=None):
        """
        Arguments:
            - `macname`: the name of the macro (no backslash); alternatively
              a `MacroDef` instance to copy, or a ``(macname, simplify_repl)``
              tuple.
            - `simplify_repl`: either a string or a callable. The string may
              contain '%s' replacements, in which the macro arguments will be
              substituted. The callable should accept the
              :py:class:`~latexwalker.LatexMacroNode` as an argument.
            - `discard`: whether the macro is dropped when it has no
              `simplify_repl`.  `None` (the default) means `True`.

        Raises `ValueError` if `macname` is a tuple and `simplify_repl` or
        `discard` is given as well.
        """
        if isinstance(macname, MacroDef):
            # Copy form: the other two parameters are ignored.
            other = macname
            self.macname = other.macname
            self.discard = other.discard
            self.simplify_repl = other.simplify_repl
        elif isinstance(macname, tuple):
            # Reject the ambiguous call before touching any attribute.
            if simplify_repl is not None or discard is not None:
                raise ValueError("macname=%r is tuple but other parameters specified"
                                 % (macname,))
            (self.macname, self.simplify_repl) = macname
            self.discard = True
        else:
            self.macname = macname
            self.discard = True if (discard is None) else discard
            self.simplify_repl = simplify_repl


env_list = [
    EnvDef('', discard=False),  # default for unknown environments

    EnvDef('equation', discard=False),
    EnvDef('eqnarray', discard=False),
    EnvDef('align', discard=False),
    EnvDef('multline', discard=False),

    # spaces added so that database indexing doesn't index the word "array" or "pmatrix"
    EnvDef('array', simplify_repl='< a r r a y >'),
    EnvDef('pmatrix', simplify_repl='< p m a t r i x >'),
    EnvDef('bmatrix', simplify_repl='< b m a t r i x >'),
    EnvDef('smallmatrix', simplify_repl='< s m a l l m a t r i x >'),

    EnvDef('center', simplify_repl='\n%s\n'),
    EnvDef('flushleft', simplify_repl='\n%s\n'),
    EnvDef('flushright', simplify_repl='\n%s\n'),

    EnvDef('exenumerate', discard=False),
    EnvDef('enumerate', discard=False),
    EnvDef('list', discard=False),
    EnvDef('itemize', discard=False),
    EnvDef('subequations', discard=False),
    EnvDef('figure', discard=False),
    EnvDef('table', discard=False),
]


# NOTE: macro will only be assigned arguments if they are explicitly defined as accepting arguments
#       in latexwalker.py.
macro_list = [
    MacroDef('', discard=True),  # default for unknown macros

    MacroDef('textbf', discard=False),
    MacroDef('textit', discard=False),
    MacroDef('textsl', discard=False),
    MacroDef('textsc', discard=False),
    MacroDef('text', discard=False),
    MacroDef('mathrm', discard=False),

    # spaces added so that database indexing doesn't index the word "graphics"
    ('includegraphics', '< g r a p h i c s >'),

    ('ref', '<ref>'),
    ('eqref', '(<ref>)'),
    ('url', '<%s>'),
    ('item', lambda r: '\n '+(latexnodes2text([r.nodeoptarg]) if r.nodeoptarg else '*')),
    ('footnote', '[%s]'),

    ('texorpdfstring', lambda node: latexnodes2text(node.nodeargs[1:2])),  # use second argument

    ('oe', u'\u0153'),
    ('OE', u'\u0152'),
    ('ae', u'\u00e6'),
    ('AE', u'\u00c6'),
    ('aa', u'\u00e5'),  # Scandinavian a with ring
    ('AA', u'\u00c5'),  # Scandinavian A with ring
    ('o', u'\u00f8'),  # Scandinavian o with stroke
    ('O', u'\u00d8'),  # Scandinavian O with stroke
    ('ss', u'\u00df'),  # German sharp s
    ('L', u"\N{LATIN CAPITAL LETTER L WITH STROKE}"),
    ('l', u"\N{LATIN SMALL LETTER L WITH STROKE}"),
    ('i', u"\N{LATIN SMALL LETTER DOTLESS I}"),
    ('j', u"\N{LATIN SMALL LETTER DOTLESS J}"),

    ("~", "~"),
    ("&", "\\&"),  # HACK, see below for text replacement of '&'
    ("$", "$"),
    ("{", "{"),
    ("}", "}"),
    ("%", lambda arg: u"%"),  # careful: % is the formatting substitution symbol...
    ("#", "#"),
    ("_", "_"),

    ("\\", '\n'),

    ("textquoteleft", "`"),
    ("textquoteright", "'"),
    ("textquotedblright", u"\N{RIGHT DOUBLE QUOTATION MARK}"),
    ("textquotedblleft", u"\N{LEFT DOUBLE QUOTATION MARK}"),
    ("textendash", u"\N{EN DASH}"),
    ("textemdash", u"\N{EM DASH}"),

    ('textpm', u"\N{PLUS-MINUS SIGN}"),
    ('textmp', u"\N{MINUS-OR-PLUS SIGN}"),

    ("texteuro", u"\N{EURO SIGN}"),

    # math stuff

    ("hbar", u"\N{LATIN SMALL LETTER H WITH STROKE}"),
    ("ell", u"\N{SCRIPT SMALL L}"),

    ('forall', u"\N{FOR ALL}"),
    ('complement', u"\N{COMPLEMENT}"),
    ('partial', u"\N{PARTIAL DIFFERENTIAL}"),
    ('exists', u"\N{THERE EXISTS}"),
    ('nexists', u"\N{THERE DOES NOT EXIST}"),
    ('varnothing', u"\N{EMPTY SET}"),
    ('emptyset', u"\N{EMPTY SET}"),
    # increment?
    ('nabla', u"\N{NABLA}"),
    #
    ('in', u"\N{ELEMENT OF}"),
    ('notin', u"\N{NOT AN ELEMENT OF}"),
    ('ni', u"\N{CONTAINS AS MEMBER}"),
    ('prod', u'\N{N-ARY PRODUCT}'),
    ('coprod', u'\N{N-ARY COPRODUCT}'),
    ('sum', u'\N{N-ARY SUMMATION}'),
    ('setminus', u'\N{SET MINUS}'),
    ('smallsetminus', u'\N{SET MINUS}'),
    ('ast', u'\N{ASTERISK OPERATOR}'),
    ('circ', u'\N{RING OPERATOR}'),
    ('bullet', u'\N{BULLET OPERATOR}'),
    ('sqrt', u'\N{SQUARE ROOT}(%s)'),
    ('propto', u'\N{PROPORTIONAL TO}'),
    ('infty', u'\N{INFINITY}'),
    ('parallel', u'\N{PARALLEL TO}'),
    ('nparallel', u'\N{NOT PARALLEL TO}'),
    ('wedge', u"\N{LOGICAL AND}"),
    ('vee', u"\N{LOGICAL OR}"),
    ('cap', u'\N{INTERSECTION}'),
    ('cup', u'\N{UNION}'),
    ('int', u'\N{INTEGRAL}'),
    ('iint', u'\N{DOUBLE INTEGRAL}'),
    ('iiint', u'\N{TRIPLE INTEGRAL}'),
    ('oint', u'\N{CONTOUR INTEGRAL}'),

    ('sim', u'\N{TILDE OPERATOR}'),
    ('backsim', u'\N{REVERSED TILDE}'),
    ('simeq', u'\N{ASYMPTOTICALLY EQUAL TO}'),
    ('approx', u'\N{ALMOST EQUAL TO}'),
    ('neq', u'\N{NOT EQUAL TO}'),
    ('equiv', u'\N{IDENTICAL TO}'),
    # \ge and \le are LaTeX synonyms of \geq and \leq, so they get the same
    # "or equal" symbols (they used to be rendered as strict '>' and '<').
    ('ge', u'\N{GREATER-THAN OR EQUAL TO}'),
    ('le', u'\N{LESS-THAN OR EQUAL TO}'),
    ('leq', u'\N{LESS-THAN OR EQUAL TO}'),
    ('geq', u'\N{GREATER-THAN OR EQUAL TO}'),
    ('leqslant', u'\N{LESS-THAN OR EQUAL TO}'),
    ('geqslant', u'\N{GREATER-THAN OR EQUAL TO}'),
    ('leqq', u'\N{LESS-THAN OVER EQUAL TO}'),
    ('geqq', u'\N{GREATER-THAN OVER EQUAL TO}'),
    ('lneqq', u'\N{LESS-THAN BUT NOT EQUAL TO}'),
    ('gneqq', u'\N{GREATER-THAN BUT NOT EQUAL TO}'),
    ('ll', u'\N{MUCH LESS-THAN}'),
    ('gg', u'\N{MUCH GREATER-THAN}'),
    ('nless', u'\N{NOT LESS-THAN}'),
    ('ngtr', u'\N{NOT GREATER-THAN}'),
    ('nleq', u'\N{NEITHER LESS-THAN NOR EQUAL TO}'),
    ('ngeq', u'\N{NEITHER GREATER-THAN NOR EQUAL TO}'),
    ('lesssim', u'\N{LESS-THAN OR EQUIVALENT TO}'),
    ('gtrsim', u'\N{GREATER-THAN OR EQUIVALENT TO}'),
    ('lessgtr', u'\N{LESS-THAN OR GREATER-THAN}'),
    ('gtrless', u'\N{GREATER-THAN OR LESS-THAN}'),
    ('prec', u'\N{PRECEDES}'),
    ('succ', u'\N{SUCCEEDS}'),
    ('preceq', u'\N{PRECEDES OR EQUAL TO}'),
    ('succeq', u'\N{SUCCEEDS OR EQUAL TO}'),
    ('precsim', u'\N{PRECEDES OR EQUIVALENT TO}'),
    ('succsim', u'\N{SUCCEEDS OR EQUIVALENT TO}'),
    ('nprec', u'\N{DOES NOT PRECEDE}'),
    ('nsucc', u'\N{DOES NOT SUCCEED}'),
    ('subset', u'\N{SUBSET OF}'),
    ('supset', u'\N{SUPERSET OF}'),
    ('subseteq', u'\N{SUBSET OF OR EQUAL TO}'),
    ('supseteq', u'\N{SUPERSET OF OR EQUAL TO}'),
    ('nsubseteq', u'\N{NEITHER A SUBSET OF NOR EQUAL TO}'),
    ('nsupseteq', u'\N{NEITHER A SUPERSET OF NOR EQUAL TO}'),
    ('subsetneq', u'\N{SUBSET OF WITH NOT EQUAL TO}'),
    ('supsetneq', u'\N{SUPERSET OF WITH NOT EQUAL TO}'),

    ('cdot', u'\N{MIDDLE DOT}'),
    ('times', u'\N{MULTIPLICATION SIGN}'),
    ('otimes', u'\N{CIRCLED TIMES}'),
    ('oplus', u'\N{CIRCLED PLUS}'),
    ('bigotimes', u'\N{CIRCLED TIMES}'),
    ('bigoplus', u'\N{CIRCLED PLUS}'),

    ('frac', '%s/%s'),
    ('nicefrac', '%s/%s'),

    ('cos', 'cos'),
    ('sin', 'sin'),
    ('tan', 'tan'),
    ('arccos', 'arccos'),
    ('arcsin', 'arcsin'),
    ('arctan', 'arctan'),

    ('prime', "'"),
    ('dag', u"\N{DAGGER}"),
    ('dagger', u"\N{DAGGER}"),
    ('pm', u"\N{PLUS-MINUS SIGN}"),
    ('mp', u"\N{MINUS-OR-PLUS SIGN}"),

    (',', u" "),
    (';', u" "),
    (':', u" "),
    (' ', u" "),
    ('!', u""),  # sorry, no negative space in ascii
    ('quad', u" "),
    ('qquad', u" "),

    ('ldots', u"..."),
    ('cdots', u"..."),
    ('ddots', u"..."),
    ('dots', u"..."),

    ('langle', u'\N{LEFT ANGLE BRACKET}'),
    ('rangle', u'\N{RIGHT ANGLE BRACKET}'),
    ('mid', u'|'),
    ('nmid', u'\N{DOES NOT DIVIDE}'),

    ('ket', u'|%s\N{RIGHT ANGLE BRACKET}'),
    ('bra', u'\N{LEFT ANGLE BRACKET}%s|'),
    ('braket', u'\N{LEFT ANGLE BRACKET}%s|%s\N{RIGHT ANGLE BRACKET}'),
    ('ketbra', u'|%s\N{RIGHT ANGLE BRACKET}\N{LEFT ANGLE BRACKET}%s|'),
    ('uparrow', u'\N{UPWARDS ARROW}'),
    ('downarrow', u'\N{DOWNWARDS ARROW}'),
    ('rightarrow', u'\N{RIGHTWARDS ARROW}'),
    ('to', u'\N{RIGHTWARDS ARROW}'),
    ('leftarrow', u'\N{LEFTWARDS ARROW}'),
    ('longrightarrow', u'\N{LONG RIGHTWARDS ARROW}'),
    ('longleftarrow', u'\N{LONG LEFTWARDS ARROW}'),

    # we use these conventions as Identity operator (\mathbbm{1})
    ('id', u'\N{MATHEMATICAL DOUBLE-STRUCK CAPITAL I}'),
    ('Ident', u'\N{MATHEMATICAL DOUBLE-STRUCK CAPITAL I}'),
]


def _format_uebung(n):
    """
    Render an exercise-heading macro node `n`: its first argument on a line
    of its own, followed by its second argument in square brackets when that
    argument is not `None`.
    """
    s = '\n%s\n' % (latexnodes2text([n.nodeargs[0]]))
    optarg = n.nodeargs[1]
    if optarg is not None:
        s += '[%s]\n' % (latexnodes2text([optarg]))
    return s


macro_list += [
    # some ethuebung.sty macros
    ('exercise', _format_uebung),
    ('uebung', _format_uebung),
    ('hint', 'Hint: %s'),
    ('hints', 'Hints: %s'),
    ('hinweis', 'Hinweis: %s'),
    ('hinweise', 'Hinweise: %s'),
]


def greekletters(letterlist):
    """
    Append to `macro_list`, for every lower-case greek letter name in
    `letterlist` (e.g. ``'alpha'``), one entry for the small letter and one
    entry for the capitalized macro name (e.g. ``'Alpha'``).
    """
    for name in letterlist:
        ucharname = name.upper()
        if ucharname == 'LAMBDA':
            # The Unicode character name is spelled without the 'B'.
            ucharname = 'LAMDA'

        # The plain epsilon/phi macros map to the "symbol" variants; the
        # var* macros registered further below map to the regular letters.
        if ucharname == 'EPSILON':
            smallname = "GREEK LUNATE EPSILON SYMBOL"
        elif ucharname == 'PHI':
            smallname = "GREEK PHI SYMBOL"
        else:
            smallname = "GREEK SMALL LETTER " + ucharname

        macro_list.append((name, unicodedata.lookup(smallname)))
        macro_list.append((name[0].upper() + name[1:],
                           unicodedata.lookup("GREEK CAPITAL LETTER " + ucharname)))


greekletters(
    ('alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta', 'iota', 'kappa',
     'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho', 'sigma', 'tau', 'upsilon', 'phi',
     'chi', 'psi', 'omega')
)

macro_list += [
    ('varepsilon', u'\N{GREEK SMALL LETTER EPSILON}'),
    ('vartheta', u'\N{GREEK THETA SYMBOL}'),
    ('varpi', u'\N{GREEK PI SYMBOL}'),
    ('varrho', u'\N{GREEK RHO SYMBOL}'),
    ('varsigma', u'\N{GREEK SMALL LETTER FINAL SIGMA}'),
    ('varphi', u'\N{GREEK SMALL LETTER PHI}'),
]


unicode_accents_list = (
    # see http://en.wikibooks.org/wiki/LaTeX/Special_Characters for a list
    ("'", u"\N{COMBINING ACUTE ACCENT}"),
    ("`", u"\N{COMBINING GRAVE ACCENT}"),
    ('"', u"\N{COMBINING DIAERESIS}"),
    ("c", u"\N{COMBINING CEDILLA}"),
    ("^", u"\N{COMBINING CIRCUMFLEX ACCENT}"),
    ("~", u"\N{COMBINING TILDE}"),
    ("H", u"\N{COMBINING DOUBLE ACUTE ACCENT}"),
    ("k", u"\N{COMBINING OGONEK}"),
    ("=", u"\N{COMBINING MACRON}"),
    ("b", u"\N{COMBINING MACRON BELOW}"),
    (".", u"\N{COMBINING DOT ABOVE}"),
    ("d", u"\N{COMBINING DOT BELOW}"),
    ("r", u"\N{COMBINING RING ABOVE}"),
    ("u", u"\N{COMBINING BREVE}"),
    ("v", u"\N{COMBINING CARON}"),

    ("vec", u"\N{COMBINING RIGHT ARROW ABOVE}"),
    ("dot", u"\N{COMBINING DOT ABOVE}"),
    ("hat", u"\N{COMBINING CIRCUMFLEX ACCENT}"),
    ("check", u"\N{COMBINING CARON}"),
    ("breve", u"\N{COMBINING BREVE}"),
    ("acute", u"\N{COMBINING ACUTE ACCENT}"),
    ("grave", u"\N{COMBINING GRAVE ACCENT}"),
    ("tilde", u"\N{COMBINING TILDE}"),
    ("bar", u"\N{COMBINING OVERLINE}"),
    ("ddot", u"\N{COMBINING DIAERESIS}"),

    ("not", u"\N{COMBINING LONG SOLIDUS OVERLAY}"),
)

# The text (unicode) string type of the running interpreter; used instead of
# the `unicode` builtin so that the conversion below does not depend on it.
_text_type = type(u"")


def make_accented_char(node, combining):
    """
    Return the text of the first argument of the macro node `node` with the
    combining character `combining` applied to each of its characters.

    When the node has no arguments a single space is used as the argument;
    the argument text is stripped of surrounding whitespace before the
    accent is applied.  Each accented character is NFC-normalized.
    """
    if len(node.nodeargs):
        nodearg = node.nodeargs[0]
    else:
        nodearg = latexwalker.LatexCharsNode(chars=' ')

    c = latexnodes2text([nodearg]).strip()

    def getaccented(ch, combining):
        ch = _text_type(ch)
        combining = _text_type(combining)
        # Accents go on the ordinary dotted letters, so map the dotless
        # variants back before combining.
        if ch == u"\N{LATIN SMALL LETTER DOTLESS I}":
            ch = u"i"
        if ch == u"\N{LATIN SMALL LETTER DOTLESS J}":
            ch = u"j"
        return unicodedata.normalize('NFC', ch + combining)

    return u"".join([getaccented(ch, combining) for ch in c])


# Register one macro per accent.  The combining character is bound through a
# default argument so that each lambda keeps its own accent.
macro_list += [
    (_accent_name, lambda x, c=_accent_combining: make_accented_char(x, c))
    for (_accent_name, _accent_combining) in unicode_accents_list
]


text_replacements = (
    # remove indentation provided by LaTeX
    #(re.compile(r'\n[ \t]*'), '\n'),

    ("~", " "),
    ("``", '"'),
    ("''", '"'),

    # Must be a compiled pattern: latexnodes2text() only applies entries as a
    # regular expression when they have a .sub() method; plain strings are
    # replaced literally.
    (re.compile(r'(?<!\\)&'), ' '),  # ignore tabular alignments, just add a little space
    ('\\&', '&'),  # but preserve the \& escapes, that we before *hackingly* kept as '\&' for this purpose ...
)


env_dict = dict((e.envname, e) for e in env_list)

# Later entries of macro_list override earlier ones with the same name.
macro_dict = dict((m.macname, m) for m in (MacroDef(m) for m in macro_list))


def latex2text(content, tolerant_parsing=False, keep_inline_math=False, keep_comments=False):
    """
    Extracts text from `content` meant for database indexing. `content` is
    some LaTeX code.

    `keep_inline_math` and `tolerant_parsing` are passed on to
    `latexwalker.get_latex_nodes()`; `keep_inline_math` and `keep_comments`
    are passed on to `latexnodes2text()`, which produces the return value.
    """
    (nodelist, tpos, tlen) = latexwalker.get_latex_nodes(content,
                                                         keep_inline_math=keep_inline_math,
                                                         tolerant_parsing=tolerant_parsing)

    return latexnodes2text(nodelist,
                           keep_inline_math=keep_inline_math,
                           keep_comments=keep_comments)


def latexnodes2text(nodelist, keep_inline_math=False, keep_comments=False):
    """
    Extracts text from a node list. `nodelist` is a list of nodes as returned
    by `latexwalker.get_latex_nodes()`.

    Macros and environments are rendered according to `macro_dict` and
    `env_dict`; afterwards the `text_replacements` are applied.  Unless
    `keep_inline_math` is set, all '$' signs are removed from the result.  If
    `keep_comments` is set, comments are kept (prefixed with '%'), otherwise
    they are dropped.
    """

    def text_from_node(node):
        if node is None:
            return ""

        if node.isNodeType(latexwalker.LatexCharsNode):
            return node.chars

        if node.isNodeType(latexwalker.LatexCommentNode):
            if keep_comments:
                return '%' + node.comment
            return ""

        if node.isNodeType(latexwalker.LatexGroupNode):
            return "".join([text_from_node(n) for n in node.nodelist])

        if node.isNodeType(latexwalker.LatexMacroNode):
            # get macro behavior definition.
            macroname = node.macroname.rstrip('*')
            if macroname in macro_dict:
                mac = macro_dict[macroname]
            else:
                # no predefined behavior, use default:
                mac = macro_dict['']

            if mac.simplify_repl:
                if callable(mac.simplify_repl):
                    return mac.simplify_repl(node)
                if '%' in mac.simplify_repl:
                    try:
                        return mac.simplify_repl % tuple([text_from_node(nn)
                                                          for nn in node.nodeargs])
                    except (TypeError, ValueError):
                        log.warning("WARNING: Error in configuration: macro '%s' failed "
                                    "its substitution!", macroname)
                        # too bad, keep the percent signs as they are...
                        return mac.simplify_repl
                return mac.simplify_repl

            if mac.discard:
                return ""

            # Work on a copy so that the node's own argument list is left
            # untouched when the optional argument is put in front.
            args = list(node.nodeargs)
            if node.nodeoptarg:
                args.insert(0, node.nodeoptarg)
            return "".join([text_from_node(n) for n in args])

        if node.isNodeType(latexwalker.LatexEnvironmentNode):
            # get environment behavior definition.
            envname = node.envname.rstrip('*')
            if envname in env_dict:
                envdef = env_dict[envname]
            else:
                # no predefined behavior, use default:
                envdef = env_dict['']

            if envdef.simplify_repl:
                if callable(envdef.simplify_repl):
                    return envdef.simplify_repl(node)
                if '%' in envdef.simplify_repl:
                    return envdef.simplify_repl % ("".join([text_from_node(nn)
                                                            for nn in node.nodelist]))
                return envdef.simplify_repl

            if envdef.discard:
                return ""

            return "".join([text_from_node(n) for n in node.nodelist])

        if node.isNodeType(latexwalker.LatexMathNode):
            # if we have a math node, this means we care about math modes and
            # we should keep this verbatim.
            return latexwalker.math_node_to_latex(node)

        # discard anything else, but leave a trace in the log.
        log.warning("latexnodes2text(): IGNORING NODE: %r", node)
        return ""

    s = "".join([text_from_node(n) for n in nodelist])

    # now, perform suitable replacements
    for pattern, replacement in text_replacements:
        if hasattr(pattern, 'sub'):
            s = pattern.sub(replacement, s)
        else:
            s = s.replace(pattern, replacement)

    # ###TODO: more clever handling of math modes??
    if not keep_inline_math:
        # removing math mode inline signs, just keep their Unicode counterparts..
        s = s.replace('$', '')

    return s


if __name__ == '__main__':
    import fileinput
    import pdb
    import sys

    try:
        #latex = '\\textit{hi there!} This is {\em an equation}: \\begin{equation}\n a + bi = 0\n\\end{equation}\n\nwhere $i$ is the imaginary unit.\n';

        print("Please type some latex text (Ctrl+D twice to stop) ...")

        latex = ''.join(fileinput.input())
        if isinstance(latex, bytes):
            # Input read as a byte string: decode it before parsing.
            latex = latex.decode('utf-8')

        text = latex2text(latex)  # , keep_inline_math=True
        if not isinstance(text, str):
            # Text type differs from the native str: encode it for printing.
            text = text.encode('utf-8')

        print('\n--- WORDS ---\n')
        print(text)
        print('\n-------------\n')

    except Exception:
        # Only real errors end up in the debugger; Ctrl+C and SystemExit
        # terminate the script as usual.
        (exc_type, exc_value, exc_traceback) = sys.exc_info()
        print("\nEXCEPTION: %s\n" % (exc_value,))
        pdb.post_mortem(exc_traceback)