################################################################################
#                                                                              #
#   This file is part of the Bibolamazi Project.                               #
#   Copyright (C) 2013 by Philippe Faist                                       #
#   philippe.faist@bluewin.ch                                                  #
#                                                                              #
#   Bibolamazi is free software: you can redistribute it and/or modify         #
#   it under the terms of the GNU General Public License as published by      #
#   the Free Software Foundation, either version 3 of the License, or         #
#   (at your option) any later version.                                        #
#                                                                              #
#   Bibolamazi is distributed in the hope that it will be useful,              #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of             #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the               #
#   GNU General Public License for more details.                               #
#                                                                              #
#   You should have received a copy of the GNU General Public License          #
#   along with Bibolamazi. If not, see <http://www.gnu.org/licenses/>.         #
#                                                                              #
################################################################################
import arxiv2bib
import re
import textwrap
from urllib2 import URLError, HTTPError
import logging
logger = logging.getLogger(__name__)
from core.bibusercache import BibUserCacheAccessor, BibUserCacheError
from core.bibusercache.tokencheckers import EntryFieldsTokenChecker
from core import butils
class BibArxivApiFetchError(BibUserCacheError):
    def __init__(self, msg):
        super(BibArxivApiFetchError, self).__init__('arxiv_fetched_api_info', msg)
# --- code to detect arXiv info ---
_RX_BEFORE = r'(?:\s*([;,]?\s*)|\b|\s+|^)'
_RX_AFTER = r'(?:\s*[;,]?\s*|$)'
_RX_PRIMARY_CLASS_PAT = r'[-a-zA-Z0-9\._]+'
_RX_ARXIVID_NUM_PAT = r'(?<!\d)(?:\d{4}\.\d{4,}|\d{7})(?:v\d+)?' # only the numerical arxiv ID (+possible version)
_RX_ARXIVID_NUM = r'(?P<arxivid>'+_RX_ARXIVID_NUM_PAT+r')'
_RX_ARXIVID_TOL = r'(?P<arxivid>(?:'+_RX_PRIMARY_CLASS_PAT+r'/)?'+_RX_ARXIVID_NUM_PAT+r')' # allow primary-class/ etc.
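#
# For example (illustrative): _RX_ARXIVID_NUM_PAT matches new-style IDs such as
# '1234.5678' or '1234.5678v2', as well as the 7-digit number of an old-style ID
# (e.g. '0101001'); _RX_ARXIVID_TOL additionally tolerates an old-style primary
# class prefix, as in 'quant-ph/0101001'.
#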
def _mk_braced_pair_rx(mid):
return [ re.compile(_RX_BEFORE + r'\{\s*' + mid + r'\s*\}' + _RX_AFTER, re.IGNORECASE) ,
re.compile(_RX_BEFORE + mid + _RX_AFTER, re.IGNORECASE) ]
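# For instance (illustrative), _mk_braced_pair_rx(r'foo') returns two compiled
# regexes: one matching the braced form '{foo}' and one matching the bare 'foo',
# each allowing the optional surrounding whitespace/punctuation of _RX_BEFORE
# and _RX_AFTER.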
# A list of regexes that we will need often.
#
# The following regexes are the ones we check for in url fields. Don't include all
# the regexes here, because a DOI or some other part of a URL may contain character
# sequences which happen to match the more tolerant arXiv regexes.
_rxarxiv_in_url = (# not a tuple, just a multiline expression
[]
+ _mk_braced_pair_rx(
r'\\href\s*\{\s*(?:http://)?arxiv\.org/(?:abs|pdf)/' + _RX_ARXIVID_TOL + r'\s*\}\s*\{[^\{\}]*\}'
)
+ _mk_braced_pair_rx(
        r'\\(?:url|href)\s*\{\s*(?:http://)?arxiv\.org/(?:abs|pdf)/' + _RX_ARXIVID_TOL + r'\s*\}'
)
+ _mk_braced_pair_rx(
        r'(?:http://)?arxiv\.org/(?:abs|pdf)/' + _RX_ARXIVID_TOL + r'\s*'
)
)
# These regexes are the most tolerant ones; we check for these more or less
# everywhere except in the URL fields.
_rxarxiv = _rxarxiv_in_url + (# not a tuple, just a multiline expression
_mk_braced_pair_rx(
r'(?:http://)?arxiv\.org/(?:abs|pdf)/' + _RX_ARXIVID_TOL
)
+ _mk_braced_pair_rx(
r'(?:arXiv[-.:/\s]+)?((?P<primaryclass>' + _RX_PRIMARY_CLASS_PAT + r'/)?' + _RX_ARXIVID_NUM + r')'
)
)
# getting "pure" arxiv ID means the arxiv ID (with primary class for old IDs only), without version information.
_rx_purearxivid = re.compile(r'(?P<purearxivid>((\d{4}\.\d{4,})|'+
r'('+_RX_PRIMARY_CLASS_PAT+r'/\d{7}))(v\d+)?)', re.IGNORECASE)
_rx_aid_year = re.compile(r'(?P<year>\d{2})(?P<mon>\d{2})(?:\.\d{4,}|\d{3})')
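# For example (illustrative): in the IDs '0101001' and '1904.12345', _rx_aid_year
# captures year='01', mon='01' and year='19', mon='04', respectively.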
#
# A list of fields which are inspected for arXiv information. This is useful for cache
# invalidation in various instances.
#
arxivinfo_from_bibtex_fields = [
'journal', 'doi', 'eprint', 'arxivid', 'url',
'note', 'annote', 'primaryclass',
'archiveprefix', ]
# extract arXiv info from an entry
[docs]def detectEntryArXivInfo(entry):
"""
    Extract arXiv information from a `pybtex.database.Entry` bibliographic entry.

    Returns, upon success, a dictionary of the form::

        { 'primaryclass': <primary class, if available>,
          'arxivid': <the (minimal) arXiv ID, in the form XXXX.XXXX or archive/XXXXXXX>,
          'archiveprefix': <value of the 'archiveprefix' field>,
          'published': True/False, <whether this entry was published in a journal other than the arXiv>
          'doi': <DOI of the entry if any, otherwise None>,
          'year': <year in the preprint arXiv ID number; 4-digit, string type>,
        }

    Note that 'published' is set to `True` for PhD and Master's theses. The arxiv.py
    filter handles that case separately and explicitly; its option
    `-dThesesCountAsPublished=0` has no effect here.

    If no arXiv information was detected, then this function returns `None`.
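
    Example (an illustrative sketch; the entry contents and resulting values are
    hypothetical)::

        arinfo = detectEntryArXivInfo(entry)
        if arinfo is not None and not arinfo['published']:
            # e.g. arinfo['arxivid'] == '1234.5678', arinfo['year'] == '2012'
            print("unpublished preprint arXiv:%s" % (arinfo['arxivid'],))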
"""
fields = entry.fields;
d = { 'primaryclass': None ,
'arxivid': None ,
'published': True ,
'archiveprefix': None,
'doi': None,
'year': None,
};
#
# NOTE: If you add/change the fields that are used here, make sure you update the
# EntryFieldsTokenChecker below!
#
if (entry.type == u'unpublished' or entry.type == u'misc'):
d['published'] = False
elif entry.type in (u'phdthesis', u'mastersthesis',):
# by default, PhD theses and Master's thesis count as published (although this
# case is handled specially in the arxiv filter)
d['published'] = True
elif entry.type in (u'book', u'booksection', u'inproceedings', u'incollection', u'conference',
u'inbook', u'proceedings',):
# proceedings, books, etc. are published
d['published'] = True
elif ('journal' in fields and re.search(r'arxiv', fields['journal'], re.IGNORECASE)):
# if journal is the arXiv, then it's not published.
d['published'] = False
elif ('journal' in fields and fields['journal'].strip()):
# otherwise, if there is a journal, it's published
d['published'] = True
elif ('journal' not in fields or fields['journal'].strip() == ""):
# if there's no journal for an article or an unknown publication type, it's the arxiv.
d['published'] = False
else:
logger.longdebug('No decisive information about whether this entry is published: %s (type %s), '
'defaulting to True.', entry.key, entry.type);
def extract_pure_id(x, primaryclass=None):
m = _rx_purearxivid.search( (primaryclass+'/' if primaryclass else "") + x)
if m is None:
raise IndexError
return m.group('purearxivid')
if ('doi' in fields and fields['doi']):
d['doi'] = fields['doi']
if ('eprint' in fields):
# this gives the arxiv ID
try:
d['arxivid'] = extract_pure_id(fields['eprint'], primaryclass=fields.get('primaryclass', None));
            m = re.match(r'^([-\w.]+)/', d['arxivid']);
if (m):
d['primaryclass'] = m.group(1);
except IndexError as e:
logger.longdebug("Indexerror: invalid arXiv ID [%r/]%r: %s",
fields.get('primaryclass',None), fields['eprint'], e)
logger.warning("Entry `%s' has invalid arXiv ID %r", entry.key, fields['eprint'])
if ('primaryclass' in fields):
d['primaryclass'] = fields['primaryclass'];
if ('archiveprefix' in fields):
d['archiveprefix'] = fields['archiveprefix'];
def processNoteField(notefield, d, isurl=False):
if isurl:
rxlist = _rxarxiv_in_url
else:
rxlist = _rxarxiv
for rx in rxlist:
m = rx.search(notefield);
if m:
if (not d['arxivid']):
try:
primaryclass = None
try: primaryclass = m.group('primaryclass')
except IndexError: pass
d['arxivid'] = extract_pure_id(m.group('arxivid'), primaryclass=primaryclass)
except IndexError as e:
logger.longdebug("indexerror while getting arxivid in note=%r, m=%r: %s", notefield, m, e)
pass
if (not d['primaryclass']):
try:
d['primaryclass'] = m.group('primaryclass');
except IndexError:
pass
if d['arxivid'] and d['primaryclass']:
return
if ('note' in fields):
processNoteField(fields['note'], d);
if ('annote' in fields):
processNoteField(fields['annote'], d);
if ('url' in fields):
processNoteField(fields['url'], d, isurl=True);
if (d['arxivid'] is None):
# no arXiv info.
return None
# FIX: if archive-ID is old style, and does not contain the primary class, add it as "quant-ph/XXXXXXX"
if (re.match(r'^\d{7}$', d['arxivid']) and d['primaryclass'] and len(d['primaryclass']) > 0):
d['arxivid'] = d['primaryclass']+'/'+d['arxivid']
# get the year
m = _rx_aid_year.search(d['arxivid'])
if not m:
logger.warning("Couldn't find the year in arXiv ID %r", d['arxivid'])
else:
# 91->1991, 89->2089 (arXiv started in 1991)
d['year'] = str(1990 + (int(m.group('year')) - 90) % 100)
return d
def stripArXivInfoInNote(notestr):
    """Assumes that `notestr` is the contents of a note={} field of a bibtex entry,
    and strips any arXiv identifier information found, e.g. of the form
    'arxiv:XXXX.YYYY' (or similar).
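
    For instance (an illustrative example, assuming the default regexes above)::

        stripArXivInfoInNote("Preprint available at arXiv:1234.5678")
        # -> "Preprint available at"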
"""
newnotestr = notestr
for rx in _rxarxiv:
        # replace all occurrences of the regexes in _rxarxiv with nothing.
newnotestr = rx.sub('', newnotestr)
if (notestr != newnotestr):
logger.longdebug("stripArXivInfoInNote: stripped %r to %r", notestr, newnotestr)
return newnotestr
# ---- API info ------
class ArxivFetchedAPIInfoCacheAccessor(BibUserCacheAccessor):
"""
A `BibUserCacheAccessor` for fetching and accessing information retrieved from the
arXiv API.
"""
def __init__(self, **kwargs):
super(ArxivFetchedAPIInfoCacheAccessor, self).__init__(
cache_name='arxiv_fetched_api_info',
**kwargs
)
    def initialize(self, cache_obj, **kwargs):
dic = self.cacheDic()
dic.setdefault('fetched', {})
#logger.longdebug("dic is %r\n"
# "id(dic['fetched'])=%r", dic, id(dic['fetched']))
logger.debug("arxiv_fetched_api_info: adding validation checker; time valid is %r",
cache_obj.cacheExpirationTokenChecker().time_valid)
# validate each entry with an expiration checker. Do this per entry, rather than
# globally on the full cache. (So don't use installCacheExpirationChecker())
dic['fetched'].set_validation(cache_obj.cacheExpirationTokenChecker())
    def fetchArxivApiInfo(self, idlist):
"""
        Populates the given cache with information about the arXiv entries given in
        `idlist`. This must be, yes you guessed right, a list of arXiv identifiers
        that we should fetch.

        This function performs a query on the arXiv.org API, using the arxiv2bib
        library. Please note that you should avoid making rapid-fire requests in a
        row (this should normally not happen anyway, thanks to our cache mechanism).
        Beware, however, that if we get a ``403 Forbidden`` HTTP answer, we should
        not continue, or else arXiv.org might interpret our requests as a DOS
        attack. If a ``403 Forbidden`` HTTP answer is received, this function raises
        :py:exc:`BibArxivApiFetchError` with a meaningful error text.

        Only those entries in `idlist` which are not already in the cache are
        fetched. `idlist` can be any iterable.
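
        Typical usage (an illustrative sketch; the arXiv identifiers shown are
        hypothetical)::

            accessor.fetchArxivApiInfo(['1234.5678', 'quant-ph/0101001'])
            info = accessor.getArxivApiInfo('1234.5678')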
"""
cache_entrydic = self.cacheDic()['fetched']
logger.longdebug("fetchArxivApiInfo(): "
"id(dic['fetched'])=%r, \nid(self.cacheObject().cachedic['arxiv_fetched_api_info']=%r\n"
"len(dic['fetched'])=%d",
id(cache_entrydic), id(self.cacheObject().cachedic['arxiv_fetched_api_info']),
len(cache_entrydic))
logger.longdebug("fetchArxivApiInfo(): in the cache, we have keys %r",
cache_entrydic.keys())
missing_ids = []
#debug_allids = []
for aid in idlist:
#debug_allids.append(aid)
if (aid not in cache_entrydic or
cache_entrydic.get(aid) is None or
isinstance(cache_entrydic.get(aid), arxiv2bib.ReferenceErrorInfo)):
missing_ids.append(aid)
#logger.longdebug("fetchArxivApiInfo(): debug_allids=%r, missing_ids=%r", debug_allids, missing_ids)
if not missing_ids:
logger.longdebug('nothing to fetch: no missing ids')
# nothing to fetch
return True
logger.info("Fetching missing information from the arXiv API...")
logger.debug('fetching missing id list %r' %(missing_ids))
try:
arxivdict = arxiv2bib.arxiv2bib_dict(missing_ids)
logger.longdebug('got entries %r: %r' %(arxivdict.keys(), arxivdict))
except URLError as error:
if isinstance(error, HTTPError) and error.getcode() == 403:
raise BibArxivApiFetchError(
textwrap.dedent("""\
Error fetching ArXiv API Info: ** 403 Forbidden **
This usually happens when you make many rapid fire requests in a
row. If you continue to do this, arXiv.org may interpret your requests
as a denial of service attack.
For more information, see http://arxiv.org/help/robots.
"""))
else:
msg = (("%d: %s" %(error.code, error.reason)) if isinstance(error, HTTPError)
else error.reason)
logger.warning("HTTP Connection Error: %s.", msg)
logger.warning("ArXiv API information will not be retreived, and your bibliography "
"might be incomplete.")
return False
#
        # Don't raise an error, in case the user is running bibolamazi on their laptop
        # on the train. In that case, they might prefer a few missing entries to a
        # critical failure.
#
# raise BibFilterError(
# filtname,
# "HTTP Connection Error: {0}".format(error.getcode())
# )
for (k,ref) in arxivdict.iteritems():
logger.longdebug("Got reference object for id %s: %r" %(k, ref.__dict__))
cache_entrydic[k]['reference'] = ref
bibtex = ref.bibtex()
cache_entrydic[k]['bibtex'] = bibtex
logger.longdebug("arxiv api info: Got all references. cacheDic() is now: %r", self.cacheDic())
logger.longdebug("... and cacheObject().cachedic is now: %r", self.cacheObject().cachedic)
return True
    def getArxivApiInfo(self, arxivid):
"""
        Returns a dictionary::

            {
              'reference':  <arxiv2bib.Reference>,
              'bibtex':     <bibtex string>
            }

        for the given arXiv ID in the cache. If the information is not in the cache,
        returns `None`.

        Don't forget to first call :py:meth:`fetchArxivApiInfo()` to retrieve the
        information in the first place.

        Note that the 'reference' part may be a :py:class:`arxiv2bib.ReferenceErrorInfo`
        if there was an error retrieving the reference.
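
        Example (an illustrative sketch, with a hypothetical arXiv ID)::

            info = accessor.getArxivApiInfo('1234.5678')
            if info is not None and not isinstance(info['reference'],
                                                   arxiv2bib.ReferenceErrorInfo):
                print(info['bibtex'])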
"""
return self.cacheDic()['fetched'].get(arxivid, None)
class ArxivInfoCacheAccessor(BibUserCacheAccessor):
    """
    A `BibUserCacheAccessor` for detecting, completing and caching arXiv
    information about the entries of a bibliography.
"""
def __init__(self, **kwargs):
super(ArxivInfoCacheAccessor, self).__init__(
cache_name='arxiv_info',
**kwargs
)
    def initialize(self, cache_obj, **kwargs):
cache_dic = self.cacheDic()
cache_dic['entries'].set_validation(
EntryFieldsTokenChecker(self.bibolamaziFile().bibliographyData(),
store_type=True,
fields=arxivinfo_from_bibtex_fields)
)
cache_dic.setdefault('cache_built', False)
    def rebuild_cache(self, bibdata, arxiv_api_accessor):
"""
Clear and rebuild the entry cache completely.
"""
entrydic = self.cacheDic()['entries']
entrydic.clear()
self.complete_cache(bibdata, arxiv_api_accessor)
    def revalidate(self, bibolamazifile):
        """
        Re-validates the cache (with `validate()`), then calls `complete_cache()`
        again to fetch any missing or out-of-date entries.
        """
self.cacheDic()['entries'].validate()
self.complete_cache(
bibolamazifile.bibliographyData(),
bibolamazifile.cacheAccessor(ArxivFetchedAPIInfoCacheAccessor)
)
    def complete_cache(self, bibdata, arxiv_api_accessor):
"""
Makes sure the cache is complete for all items in `bibdata`.
"""
entrydic = self.cacheDic()['entries']
        # A list of pairs (citekey, arxiv-id) of entries that still need to be
        # completed with info from the arXiv API.
needs_to_be_completed = []
#
# Do a first scan through all the bibdata entries, and detect the API information
# using only what we have. We'll do a query to the arXiv API in a second step
# below.
#
for k,v in bibdata.entries.iteritems():
if (k in entrydic):
continue
arinfo = detectEntryArXivInfo(v);
entrydic[k] = arinfo;
logger.longdebug("got arXiv information for `%s': %r.", k, arinfo)
if (entrydic[k] is not None):
needs_to_be_completed.append( (k, arinfo['arxivid'],) )
logger.longdebug("complete_cache(): needs_to_be_completed=%r\nentrydic=%r\n",
needs_to_be_completed,
entrydic)
#
# Complete the entry arXiv info using fetched info from the arXiv API.
#
arxiv_api_accessor.fetchArxivApiInfo( (x[1] for x in needs_to_be_completed), )
for (k,aid) in needs_to_be_completed:
api_info = arxiv_api_accessor.getArxivApiInfo(aid)
if (api_info is None):
logger.warning("Failed to fetch arXiv information for %s", aid);
continue
entrydic[k]['primaryclass'] = self._reference_category(api_info['reference'])
entrydic[k]['doi'] = self._reference_doi(api_info['reference']);
    def getArXivInfo(self, entrykey):
"""
Get the arXiv information corresponding to entry citekey `entrykey`. If the entry
is not in the cache, returns `None`. Call `complete_cache()` first!
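
        Example (an illustrative sketch, with a hypothetical citation key)::

            arxivinfo = accessor.getArXivInfo('Bennett1993Teleportation')
            if arxivinfo is not None:
                logger.info("Entry is on the arXiv as %s", arxivinfo['arxivid'])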
"""
logger.longdebug("Getting arxiv info for key %r from cache.", entrykey)
entrydic = self.cacheDic()['entries']
if (entrykey not in entrydic):
logger.longdebug(" --> not found :(")
return None
return entrydic.get(entrykey, None)
def _reference_doi(self, ref):
try:
doi = ref._field_text('doi', namespace=arxiv2bib.ARXIV)
        except Exception:
return None
if (doi):
return doi
return None
def _reference_category(self, ref):
try:
return ref.category;
except AttributeError:
# happens for ReferenceErrorInfo, for example
return None
def setup_and_get_arxiv_accessor(bibolamazifile):
arxivinfoaccessor = bibolamazifile.cacheAccessor(ArxivInfoCacheAccessor)
arxivinfoaccessor.complete_cache(
bibolamazifile.bibliographyData(),
bibolamazifile.cacheAccessor(ArxivFetchedAPIInfoCacheAccessor)
)
return arxivinfoaccessor
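#
# Typical usage from within a filter (an illustrative sketch, assuming
# `bibolamazifile` is the BibolamaziFile instance the filter acts upon):
#
#   accessor = setup_and_get_arxiv_accessor(bibolamazifile)
#   arxivinfo = accessor.getArXivInfo(entry.key)
#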
# deprecated:
def get_arxiv_cache_access(bibolamazifile):
butils.warn_deprecated(None, "get_arxiv_cache_access()", "setup_and_get_arxiv_accessor()",
modulename="arxivutil.py",
explanation="We now use the new cache mechanism; your filter should "
"also explicitly request the cache accessors ArxivInfoCacheAccessor "
"and ArxivFetchedAPIInfoCacheAccessor so that the cache is correctly "
"set up.")
return setup_and_get_arxiv_accessor(bibolamazifile)