# Source code for filters.util.arxivutil

#                                                                              #
#   This file is part of the Bibolamazi Project.                               #
#   Copyright (C) 2013 by Philippe Faist                                       #
#                                                  #
#                                                                              #
#   Bibolamazi is free software: you can redistribute it and/or modify         #
#   it under the terms of the GNU General Public License as published by       #
#   the Free Software Foundation, either version 3 of the License, or          #
#   (at your option) any later version.                                        #
#                                                                              #
#   Bibolamazi is distributed in the hope that it will be useful,              #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of             #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              #
#   GNU General Public License for more details.                               #
#                                                                              #
#   You should have received a copy of the GNU General Public License          #
#   along with Bibolamazi.  If not, see <http://www.gnu.org/licenses/>.        #
#                                                                              #

import arxiv2bib
import re
from urllib2 import URLError, HTTPError
import logging
logger = logging.getLogger(__name__)

from core.bibusercache import BibUserCacheAccessor, BibUserCacheError
from core.bibusercache.tokencheckers import EntryFieldsTokenChecker 
from core import butils

class BibArxivApiFetchError(BibUserCacheError):
    """
    Error raised when fetching information from the arXiv API fails in a way that
    must abort the fetch (`fetchArxivApiInfo()` raises it on a ``403 Forbidden``
    answer).

    :param msg: human-readable description of the failure.
    """
    def __init__(self, msg):
        # The two-argument form of super() is required here. The one-argument form
        # returns an *unbound* super object, and calling __init__ on it
        # re-initializes that super object itself (raising TypeError) instead of
        # running the parent class initializer.
        #
        # 'arxiv_fetched_api_info' is the same string that
        # ArxivFetchedAPIInfoCacheAccessor passes as its cache_name.
        super(BibArxivApiFetchError, self).__init__('arxiv_fetched_api_info', msg)


# --- code to detect arXiv info ---
# Context tolerated before/after an arXiv reference: optional whitespace and an
# optional ';' or ',' separator.
_RX_BEFORE = r'(?:\s*([;,]?\s*)|\b|\s+|^)'
_RX_AFTER = r'(?:\s*[;,]?\s*|$)'
_RX_PRIMARY_CLASS_PAT = r'[-a-zA-Z0-9\._]+'
# only the numerical arxiv ID (+possible version)
_RX_ARXIVID_NUM_PAT = r'(?<!\d)(?:\d{4}\.\d{4,}|\d{7})(?:v\d+)?'
_RX_ARXIVID_NUM = r'(?P<arxivid>' + _RX_ARXIVID_NUM_PAT + r')'
# allow primary-class/ etc.
_RX_ARXIVID_TOL = r'(?P<arxivid>(?:' + _RX_PRIMARY_CLASS_PAT + r'/)?' + _RX_ARXIVID_NUM_PAT + r')'
# Start of an arXiv abstract/PDF URL; the scheme is optional and may be http or https.
_RX_ARXIV_URL = r'(?:https?://)?arxiv\.org/(?:abs|pdf)/'


def _mk_braced_pair_rx(mid):
    """
    Compile the pattern fragment `mid` into two case-insensitive regexes and return
    them as a list: first a variant where `mid` is wrapped in ``{...}``, then the
    bare variant. Both are surrounded by `_RX_BEFORE` and `_RX_AFTER`.
    """
    return [
        re.compile(_RX_BEFORE + r'\{\s*' + mid + r'\s*\}' + _RX_AFTER, re.IGNORECASE),
        re.compile(_RX_BEFORE + mid + _RX_AFTER, re.IGNORECASE),
    ]


# a list of regexes that we will need often.
#
# The following are regexes we check for in url fields. Don't include all regexes, because
# some DOI or parts of URLs may contain sequences of chars which match the easier arXiv
# regexes.
_rxarxiv_in_url = (  # not tuple, just a multiline expression
    _mk_braced_pair_rx(
        r'\\href\s*\{\s*' + _RX_ARXIV_URL + _RX_ARXIVID_TOL + r'\s*\}\s*\{[^\{\}]*\}'
    )
    + _mk_braced_pair_rx(
        # NOTE: the whitespace before the closing brace must be r'\s*'; a bare
        # r's*' would match literal "s" characters instead.
        r'\\(?:url|href)\s*\{\s*' + _RX_ARXIV_URL + _RX_ARXIVID_TOL + r'\s*\}'
    )
    + _mk_braced_pair_rx(
        _RX_ARXIV_URL + _RX_ARXIVID_TOL + r'\s*'
    )
)

# And these regexes are the most tolerant ones, we'll check for these more or less
# everywhere except in the URL fields.
_rxarxiv = _rxarxiv_in_url + (  # not tuple, just a multiline expression
    _mk_braced_pair_rx(
        _RX_ARXIV_URL + _RX_ARXIVID_TOL
    )
    + _mk_braced_pair_rx(
        # The 'primaryclass' group holds the class name only; the separating '/'
        # is matched outside of the named group.
        r'(?:arXiv[-.:/\s]+)?((?:(?P<primaryclass>' + _RX_PRIMARY_CLASS_PAT + r')/)?'
        + _RX_ARXIVID_NUM + r')'
    )
)

# getting "pure" arxiv ID means the arxiv ID (with primary class for old IDs only).
# NOTE(review): as written, the 'purearxivid' group also captures a trailing version
# suffix ("v2") when one is present -- confirm whether that is intended.
_rx_purearxivid = re.compile(
    r'(?P<purearxivid>((\d{4}\.\d{4,})|'
    r'(' + _RX_PRIMARY_CLASS_PAT + r'/\d{7}))(v\d+)?)',
    re.IGNORECASE
)

# The first four digits of the numerical part are read as two-digit year and month.
_rx_aid_year = re.compile(r'(?P<year>\d{2})(?P<mon>\d{2})(?:\.\d{4,}|\d{3})')


#
# A list of fields which are inspected for arXiv information. This is useful for cache
# invalidation in various instances.
#
arxivinfo_from_bibtex_fields = [
    'journal',
    'doi',
    'eprint',
    'arxivid',
    'url',
    'note',
    'annote',
    'primaryclass',
    'archiveprefix',
]


# extract arXiv info from an entry
def detectEntryArXivInfo(entry):
    """
    Extract arXiv information from a `pybtex.database.Entry` bibliographic entry.

    Returns upon success a dictionary of the form::

        { 'primaryclass': <primary class, if available>,
          'arxivid': <the (minimal) arXiv ID (in format XXXX.XXXX or archive/XXXXXXX)>,
          'archiveprefix': value of the 'archiveprefix' field
          'published': True/False <whether this entry was published in a journal other than arxiv>,
          'doi': <DOI of entry if any, otherwise None>
          'year': <Year in preprint arXiv ID number. 4-digit, string type.>
        }

    Note that 'published' is set to True for PhD and Master's thesis. Also, the filter
    handles this case separately and explicitly, the option there
    `-dThesesCountAsPublished=0` has no effect here.

    The fields 'eprint', 'note', 'annote' and 'url' are inspected (in that order) for
    an arXiv ID; the first ID found wins.

    If no arXiv information was detected, then this function returns None.
    """

    fields = entry.fields

    d = {
        'primaryclass': None,
        'arxivid': None,
        'published': True,
        'archiveprefix': None,
        'doi': None,
        'year': None,
    }

    #
    # NOTE: If you add/change the fields that are used here, make sure you update the
    # EntryFieldsTokenChecker below!
    #

    if entry.type in (u'unpublished', u'misc'):
        d['published'] = False
    elif entry.type in (u'phdthesis', u'mastersthesis',):
        # by default, PhD theses and Master's thesis count as published (although this
        # case is handled specially in the arxiv filter)
        d['published'] = True
    elif entry.type in (u'book', u'booksection', u'inproceedings', u'incollection',
                        u'conference', u'inbook', u'proceedings',):
        # proceedings, books, etc. are published
        d['published'] = True
    elif 'journal' in fields and re.search(r'arxiv', fields['journal'], re.IGNORECASE):
        # if journal is the arXiv, then it's not published.
        d['published'] = False
    elif 'journal' in fields and fields['journal'].strip():
        # otherwise, if there is a journal, it's published
        d['published'] = True
    else:
        # if there's no journal for an article or an unknown publication type, it's
        # the arxiv. (The conditions above are exhaustive, so this is the final case.)
        d['published'] = False

    def extract_pure_id(x, primaryclass=None):
        # Prepend the primary class (if any) so that old-style IDs can match the
        # "class/NNNNNNN" form. A trailing '/' on the class is dropped first so that
        # we never build "class//NNNNNNN", which would not match.
        prefix = (primaryclass.rstrip('/') + '/') if primaryclass else ""
        m = _rx_purearxivid.search(prefix + x)
        if m is None:
            # callers catch IndexError to detect an invalid ID
            raise IndexError("no arXiv ID found in %r" % (prefix + x))
        return m.group('purearxivid')

    def get_primaryclass(m):
        # Not every regex defines a 'primaryclass' group: m.group() raises
        # IndexError for an unknown group name, and returns None for a group that
        # did not take part in the match.
        try:
            primaryclass = m.group('primaryclass')
        except IndexError:
            return None
        if primaryclass:
            return primaryclass.rstrip('/')
        return primaryclass

    if 'doi' in fields and fields['doi']:
        d['doi'] = fields['doi']

    if 'eprint' in fields:
        # this gives the arxiv ID
        try:
            d['arxivid'] = extract_pure_id(fields['eprint'],
                                           primaryclass=fields.get('primaryclass', None))
            m = re.match(r'^([-\w.]+)/', d['arxivid'])
            if m:
                d['primaryclass'] = m.group(1)
        except IndexError as e:
            logger.longdebug("Indexerror: invalid arXiv ID [%r/]%r: %s",
                             fields.get('primaryclass', None), fields['eprint'], e)
            logger.warning("Entry `%s' has invalid arXiv ID %r", entry.key, fields['eprint'])

    if 'primaryclass' in fields:
        d['primaryclass'] = fields['primaryclass']

    if 'archiveprefix' in fields:
        d['archiveprefix'] = fields['archiveprefix']

    def processNoteField(notefield, d, isurl=False):
        # URL fields are only checked against the stricter URL regexes.
        rxlist = _rxarxiv_in_url if isurl else _rxarxiv

        for rx in rxlist:
            m = rx.search(notefield)
            if not m:
                continue
            primaryclass = get_primaryclass(m)
            if not d['arxivid']:
                try:
                    d['arxivid'] = extract_pure_id(m.group('arxivid'),
                                                   primaryclass=primaryclass)
                except IndexError as e:
                    logger.longdebug("indexerror while getting arxivid in note=%r, m=%r: %s",
                                     notefield, m, e)
            if not d['primaryclass']:
                d['primaryclass'] = primaryclass
            if d['arxivid'] and d['primaryclass']:
                return

    if 'note' in fields:
        processNoteField(fields['note'], d)

    if 'annote' in fields:
        processNoteField(fields['annote'], d)

    if 'url' in fields:
        processNoteField(fields['url'], d, isurl=True)

    if d['arxivid'] is None:
        # no arXiv info.
        return None

    # FIX: if archive-ID is old style, and does not contain the primary class, add it
    # as "quant-ph/XXXXXXX"
    if re.match(r'^\d{7}$', d['arxivid']) and d['primaryclass']:
        d['arxivid'] = d['primaryclass'] + '/' + d['arxivid']

    # get the year
    m = _rx_aid_year.search(d['arxivid'])
    if not m:
        logger.warning("Couldn't find the year in arXiv ID %r", d['arxivid'])
    else:
        # 91->1991, 89->2089 (arXiv started in 1991)
        d['year'] = str(1990 + (int(m.group('year')) - 90) % 100)

    return d
def stripArXivInfoInNote(notestr):
    """
    Return `notestr` with every arXiv identifier removed.

    `notestr` is assumed to be the content of a ``note={}`` field of a bibtex
    entry. Anything matched by one of the regexes in `_rxarxiv` (e.g. text of the
    form 'arxiv:XXXX.YYYY' or similar) is replaced by the empty string. The
    regexes are applied one after the other, each on the result of the previous.
    """
    stripped = notestr

    for pattern in _rxarxiv:
        # drop all occurrences of this pattern
        stripped = pattern.sub('', stripped)

    if stripped != notestr:
        logger.longdebug("stripArXivInfoInNote: stripped %r to %r", notestr, stripped)

    return stripped


# ---- API info ------
class ArxivFetchedAPIInfoCacheAccessor(BibUserCacheAccessor):
    """
    A `BibUserCacheAccessor` for fetching and accessing information retrieved from the
    arXiv API.

    The information is stored in the ``'fetched'`` sub-dictionary of the cache named
    ``'arxiv_fetched_api_info'``, keyed by arXiv ID.
    """
    def __init__(self, **kwargs):
        super(ArxivFetchedAPIInfoCacheAccessor, self).__init__(
            cache_name='arxiv_fetched_api_info',
            **kwargs
        )

    def initialize(self, cache_obj, **kwargs):
        """
        Make sure the ``'fetched'`` dictionary exists and install an expiration
        checker on it.
        """
        dic = self.cacheDic()
        dic.setdefault('fetched', {})

        logger.debug("arxiv_fetched_api_info: adding validation checker; time valid is %r",
                     cache_obj.cacheExpirationTokenChecker().time_valid)

        # validate each entry with an expiration checker. Do this per entry, rather than
        # globally on the full cache. (So don't use installCacheExpirationChecker())
        dic['fetched'].set_validation(cache_obj.cacheExpirationTokenChecker())

    def fetchArxivApiInfo(self, idlist):
        """
        Populates the given cache with information about the arXiv entries given in
        `idlist`. This must be, yes you guessed right, a list of arXiv identifiers that
        we should fetch.

        This function performs a query on the arXiv.org API, using the arxiv2bib
        library. Please note that you should avoid making rapid fire requests in a row
        (this should normally not happen anyway thanks to our cache mechanism). However,
        beware that if we get a ``403 Forbidden`` HTTP answer, we should not continue or
        else arXiv.org might interpret our requests as a DOS attack. If a ``403
        Forbidden`` HTTP answer is received this function raises
        :py:exc:`BibArxivApiFetchError` with a meaningful error text.

        Only those entries in `idlist` which are not already in the cache are fetched.

        `idlist` can be any iterable.

        Returns `True` if there was nothing to fetch or if the missing entries were
        fetched and stored; returns `False` (after logging a warning) if the API could
        not be reached for a reason other than ``403 Forbidden``.
        """

        cache_entrydic = self.cacheDic()['fetched']

        logger.longdebug("fetchArxivApiInfo(): "
                         "id(dic['fetched'])=%r, \nid(self.cacheObject().cachedic['arxiv_fetched_api_info']=%r\n"
                         "len(dic['fetched'])=%d",
                         id(cache_entrydic),
                         id(self.cacheObject().cachedic['arxiv_fetched_api_info']),
                         len(cache_entrydic))
        logger.longdebug("fetchArxivApiInfo(): in the cache, we have keys %r",
                         cache_entrydic.keys())

        # Collect the IDs for which we have no usable cached information.
        missing_ids = []
        for aid in idlist:
            cached = cache_entrydic.get(aid) if aid in cache_entrydic else None
            if cached is None or isinstance(cached, arxiv2bib.ReferenceErrorInfo):
                missing_ids.append(aid)

        if not missing_ids:
            logger.longdebug('nothing to fetch: no missing ids')
            # nothing to fetch
            return True

        logger.info("Fetching missing information from the arXiv API...")
        logger.debug('fetching missing id list %r', missing_ids)

        try:
            arxivdict = arxiv2bib.arxiv2bib_dict(missing_ids)
            logger.longdebug('got entries %r: %r', arxivdict.keys(), arxivdict)
        except URLError as error:
            if isinstance(error, HTTPError) and error.getcode() == 403:
                # Imported locally: the module's import block does not provide textwrap.
                import textwrap
                raise BibArxivApiFetchError(
                    textwrap.dedent("""\
                    Error fetching ArXiv API Info: ** 403 Forbidden **

                    This usually happens when you make many rapid fire requests in a
                    row. If you continue to do this, arXiv.org may interpret your
                    requests as a denial of service attack.

                    For more information, see http://arxiv.org/help/robots.
                    """))

            if isinstance(error, HTTPError):
                msg = "%d: %s" % (error.code, error.reason)
            else:
                msg = error.reason
            logger.warning("HTTP Connection Error: %s.", msg)
            logger.warning("ArXiv API information will not be retreived, and your bibliography "
                           "might be incomplete.")
            #
            # Don't raise an error, in case the guy is running bibolamazi on his laptop
            # on the train. In that case he might prefer some missing entries rather
            # than a critical failure.
            #
            return False

        for (k, ref) in arxivdict.iteritems():
            logger.longdebug("Got reference object for id %s: %r", k, ref.__dict__)
            cache_entrydic[k]['reference'] = ref
            cache_entrydic[k]['bibtex'] = ref.bibtex()

        logger.longdebug("arxiv api info: Got all references. cacheDic() is now:  %r", self.cacheDic())
        logger.longdebug("... and cacheObject().cachedic is now:  %r", self.cacheObject().cachedic)

        return True

    def getArxivApiInfo(self, arxivid):
        """
        Returns a dictionary::

            {
              'reference':  <arxiv2bib.Reference>,
              'bibtex': <bibtex string>
            }

        for the given arXiv id in the cache. If the information is not in the cache,
        returns `None`.

        Don't forget to first call :py:meth:`fetchArxivApiInfo()` to retrieve the
        information in the first place.

        Note the reference part may be a :py:class:`arxiv2bib.ReferenceErrorInfo`, if
        there was an error retreiving the reference.
        """
        return self.cacheDic()['fetched'].get(arxivid, None)
class ArxivInfoCacheAccessor(BibUserCacheAccessor):
    """
    A `BibUserCacheAccessor` for the arXiv information of the bibliography entries.

    For each citation key, the ``'entries'`` dictionary of the cache named
    ``'arxiv_info'`` stores the result of `detectEntryArXivInfo()`, with the
    'primaryclass' and 'doi' items then replaced by what the arXiv API reports.
    """
    def __init__(self, **kwargs):
        super(ArxivInfoCacheAccessor, self).__init__(
            cache_name='arxiv_info',
            **kwargs
        )

    def initialize(self, cache_obj, **kwargs):
        """
        Install a validation checker on the ``'entries'`` dictionary, built from the
        bibtex fields listed in `arxivinfo_from_bibtex_fields` and the entry type.
        """
        cache_dic = self.cacheDic()
        cache_dic['entries'].set_validation(
            EntryFieldsTokenChecker(self.bibolamaziFile().bibliographyData(),
                                    store_type=True,
                                    fields=arxivinfo_from_bibtex_fields)
        )
        cache_dic.setdefault('cache_built', False)

    def rebuild_cache(self, bibdata, arxiv_api_accessor):
        """
        Clear and rebuild the entry cache completely.
        """
        entrydic = self.cacheDic()['entries']
        entrydic.clear()
        self.complete_cache(bibdata, arxiv_api_accessor)

    def revalidate(self, bibolamazifile):
        """
        Re-validates the cache (with validate()), and calls again complete_cache() to
        fetch all missing or out-of-date entries.
        """
        self.cacheDic()['entries'].validate()
        self.complete_cache(
            bibolamazifile.bibliographyData(),
            bibolamazifile.cacheAccessor(ArxivFetchedAPIInfoCacheAccessor)
        )

    def complete_cache(self, bibdata, arxiv_api_accessor):
        """
        Makes sure the cache is complete for all items in `bibdata`.

        Entries already present in the cache are left untouched. `arxiv_api_accessor`
        is used to fetch and look up the arXiv API information of the new entries.
        """
        entrydic = self.cacheDic()['entries']

        # A list of pairs (citekey, arxiv-id) of entries that still need to be
        # completed with info from the arXiv API.
        needs_to_be_completed = []

        #
        # Do a first scan through all the bibdata entries, and detect the API
        # information using only what we have. We'll do a query to the arXiv API in a
        # second step below.
        #
        for k, v in bibdata.entries.iteritems():
            if k in entrydic:
                continue
            arinfo = detectEntryArXivInfo(v)
            entrydic[k] = arinfo
            logger.longdebug("got arXiv information for `%s': %r.", k, arinfo)

            if entrydic[k] is not None:
                needs_to_be_completed.append((k, arinfo['arxivid'],))

        logger.longdebug("complete_cache(): needs_to_be_completed=%r\nentrydic=%r\n",
                         needs_to_be_completed, entrydic)

        #
        # Complete the entry arXiv info using fetched info from the arXiv API.
        #
        arxiv_api_accessor.fetchArxivApiInfo(
            (aid for (_, aid) in needs_to_be_completed),
        )

        for (k, aid) in needs_to_be_completed:
            api_info = arxiv_api_accessor.getArxivApiInfo(aid)
            if api_info is None:
                logger.warning("Failed to fetch arXiv information for %s", aid)
                continue

            # NOTE(review): these assignments replace the values detected from the
            # bibtex entry, even when the API yields None -- confirm this is intended.
            entrydic[k]['primaryclass'] = self._reference_category(api_info['reference'])
            entrydic[k]['doi'] = self._reference_doi(api_info['reference'])

    def getArXivInfo(self, entrykey):
        """
        Get the arXiv information corresponding to entry citekey `entrykey`. If the
        entry is not in the cache, returns `None`. Call `complete_cache()` first!
        """
        logger.longdebug("Getting arxiv info for key %r from cache.", entrykey)

        entrydic = self.cacheDic()['entries']

        if entrykey not in entrydic:
            logger.longdebug(" --> not found :(")
            return None

        return entrydic.get(entrykey, None)

    def _reference_doi(self, ref):
        """Return the DOI reported by the reference object `ref`, or None."""
        try:
            doi = ref._field_text('doi', namespace=arxiv2bib.ARXIV)
        except Exception:
            # Best effort: any failure to read the DOI means "no DOI". Catching
            # Exception rather than using a bare except lets KeyboardInterrupt and
            # SystemExit propagate.
            return None
        return doi or None

    def _reference_category(self, ref):
        """Return `ref.category`, or None if `ref` has no such attribute."""
        # the attribute is missing for ReferenceErrorInfo, for example
        return getattr(ref, 'category', None)
def setup_and_get_arxiv_accessor(bibolamazifile):
    """
    Obtain the `ArxivInfoCacheAccessor` of `bibolamazifile`, complete its cache for
    the file's bibliography data (using the file's
    `ArxivFetchedAPIInfoCacheAccessor`), and return that accessor.
    """
    info_accessor = bibolamazifile.cacheAccessor(ArxivInfoCacheAccessor)

    bibdata = bibolamazifile.bibliographyData()
    api_accessor = bibolamazifile.cacheAccessor(ArxivFetchedAPIInfoCacheAccessor)
    info_accessor.complete_cache(bibdata, api_accessor)

    return info_accessor


# deprecated:
def get_arxiv_cache_access(bibolamazifile):
    """
    Deprecated alias of `setup_and_get_arxiv_accessor()`: emits a deprecation
    warning through `butils.warn_deprecated()` and then delegates to it.
    """
    why = ("We now use the new cache mechanism; your filter should "
           "also explicitly request the cache accessors ArxivInfoCacheAccessor "
           "and ArxivFetchedAPIInfoCacheAccessor so that the cache is correctly "
           "set up.")
    # NOTE(review): modulename is an empty string; presumably it should name this
    # module -- confirm against butils.warn_deprecated().
    butils.warn_deprecated(
        None,
        "get_arxiv_cache_access()",
        "setup_and_get_arxiv_accessor()",
        modulename="",
        explanation=why,
    )
    return setup_and_get_arxiv_accessor(bibolamazifile)