Source code for bibolamazi.filters.util.arxivutil

# -*- coding: utf-8 -*-
################################################################################
#                                                                              #
#   This file is part of the Bibolamazi Project.                               #
#   Copyright (C) 2013 by Philippe Faist                                       #
#   philippe.faist@bluewin.ch                                                  #
#                                                                              #
#   Bibolamazi is free software: you can redistribute it and/or modify         #
#   it under the terms of the GNU General Public License as published by       #
#   the Free Software Foundation, either version 3 of the License, or          #
#   (at your option) any later version.                                        #
#                                                                              #
#   Bibolamazi is distributed in the hope that it will be useful,              #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of             #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              #
#   GNU General Public License for more details.                               #
#                                                                              #
#   You should have received a copy of the GNU General Public License          #
#   along with Bibolamazi.  If not, see <http://www.gnu.org/licenses/>.        #
#                                                                              #
################################################################################


import re
from urllib.error import URLError, HTTPError
import textwrap
import time
import logging
logger = logging.getLogger(__name__)

import arxiv2bib

from bibolamazi.core.bibusercache import BibUserCacheAccessor, BibUserCacheError
from bibolamazi.core.bibusercache.tokencheckers import EntryFieldsTokenChecker 
from bibolamazi.core import butils


class BibArxivApiFetchError(BibUserCacheError):
    def __init__(self, msg):
        super().__init__('arxiv_fetched_api_info', msg)

#
# --- code to detect arXiv info ---
#

_RX_BEFORE = r'(?:\s*([;,]?\s*)|\b|\s+|^)'
_RX_AFTER = r'(?:\s*[;,]?\s*|$)'

_RX_PRIMARY_CLASS_PAT = r'[-a-zA-Z0-9\._]+'

# only the numerical arxiv ID (+possible version)
_RX_ARXIVID_NUM_PAT = r'(?<!\d)(?:\d{4}\.\d{4,}|\d{7})(?:v\d+)?'
_RX_ARXIVID_NUM = r'(?P<arxivid>' + _RX_ARXIVID_NUM_PAT + r')'
# allow primary-class/ etc.
_RX_ARXIVID_TOL = r'(?P<arxivid>(?:' + _RX_PRIMARY_CLASS_PAT + r'/)?' + _RX_ARXIVID_NUM_PAT + r')'

def _mk_braced_pair_rx(mid):
    return [ re.compile(_RX_BEFORE + r'\{\s*' + mid + r'\s*\}' + _RX_AFTER, re.IGNORECASE),
             re.compile(_RX_BEFORE + mid + _RX_AFTER, re.IGNORECASE) ]

# A list of regexes that we will need often.
#
# The following are regexes we check for in url fields.  Don't include all
# regexes, because some DOIs or parts of URLs may contain sequences of chars
# which match the easier arXiv regexes.
_rxarxiv_in_url = \
    [] + \
    _mk_braced_pair_rx( r'\\href\s*\{\s*(?:https?://)?arxiv\.org/(?:abs|pdf)/' + _RX_ARXIVID_TOL
                        + r'\s*\}\s*\{[^\{\}]*\}' ) + \
    _mk_braced_pair_rx( r'\\(?:url|href)\s*\{\s*(?:https?://)?arxiv\.org/(?:abs|pdf)/' + _RX_ARXIVID_TOL
                        + r'\s*\}' ) + \
    _mk_braced_pair_rx( r'(?:https?://)?arxiv\.org/(?:abs|pdf)/' + _RX_ARXIVID_TOL + r'\s*' )

# And these regexes are the most tolerant ones; we'll check for these more or
# less everywhere except in the URL fields.
_rxarxiv = _rxarxiv_in_url + \
    _mk_braced_pair_rx( r'(?:https?://)?arxiv\.org/(?:abs|pdf)/' + _RX_ARXIVID_TOL ) + \
    _mk_braced_pair_rx( r'(?:arXiv[-.:/\s]+)?((?:(?P<primaryclass>' + _RX_PRIMARY_CLASS_PAT + r')/)?'
                        + _RX_ARXIVID_NUM + r')'
                        + r'(?:\s*\[(?P<primaryclass2>' + _RX_PRIMARY_CLASS_PAT + r')\])?' )

# getting "pure" arxiv ID means the arxiv ID (with primary class for old IDs
# only), without version information.
_rx_purearxivid = re.compile(r'(?P<purearxivid>((\d{4}\.\d{4,})|'
                             + r'(' + _RX_PRIMARY_CLASS_PAT + r'/\d{7}))(v\d+)?)',
                             re.IGNORECASE)

_rx_aid_year = re.compile(r'(?P<year>\d{2})(?P<mon>\d{2})(?:\.\d{4,}|\d{3})')

rx_arxiv_own_doi = re.compile(
    r'^((https?://)?(dx\.)?(doi\.org/))?10\.48550/arXiv\.(?P<arxivid>.*)$',
    re.IGNORECASE)

#
# A list of fields which are inspected for arXiv information.  This is useful
# for cache invalidation in various instances.
#
arxivinfo_from_bibtex_fields = [
    'journal', 'doi', 'eprint', 'arxivid', 'arxiv', 'url', 'note', 'annote',
    'primaryclass', 'archiveprefix',
]
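
As a quick illustration of what the patterns above are meant to capture, here is a minimal sketch (not part of the module source) of how `_rx_purearxivid` behaves on two made-up strings, one with an old-style and one with a new-style identifier:

# Illustrative sketch only -- the input strings below are hypothetical,
# not part of the module.
m1 = _rx_purearxivid.search('arXiv:quant-ph/0301059')
# m1.group('purearxivid') should be 'quant-ph/0301059'   (old-style ID)
m2 = _rx_purearxivid.search('see https://arxiv.org/abs/1509.01234')
# m2.group('purearxivid') should be '1509.01234'         (new-style ID)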

# extract arXiv info from an entry
def detectEntryArXivInfo(entry):
    """
    Extract arXiv information from a `pybtex.database.Entry` bibliographic
    entry.

    Returns upon success a dictionary of the form::

        { 'primaryclass':  <primary class, if available>,
          'arxivid':       <the (minimal) arXiv ID (in format XXXX.XXXX or archive/XXXXXXX)>,
          'archiveprefix': <value of the 'archiveprefix' field>,
          'published':     True/False <whether this entry was published in a journal other than arxiv>,
          'doi':           <DOI of entry if any, otherwise None>,
          'year':          <year in the preprint arXiv ID number; 4-digit, string type>,
          'isoldarxivid':  <whether the arXiv ID is of old style, i.e. 'primary-class/XXXXXXX'>,
          'isnewarxivid':  <whether the arXiv ID is of new style, i.e. 'XXXX.XXXX+' (4 or more digits after the dot)>,
        }

    Note that 'published' is set to True for PhD and Master's theses.  Also,
    the `arxiv` filter handles this case separately and explicitly; the option
    `-dThesesCountAsPublished=0` there has no effect here.

    If no arXiv information was detected, then this function returns None.
    """
    fields = entry.fields

    d = { 'primaryclass': None,
          'arxivid': None,
          'published': True,
          'archiveprefix': None,
          'doi': None,
          'year': None,
          'isoldarxivid': None,
          'isnewarxivid': None,
          }

    #
    # NOTE: If you add/change the fields that are used here, make sure you
    # update the EntryFieldsTokenChecker below!
    #

    if (entry.type == u'unpublished' or entry.type == u'misc'):
        d['published'] = False
    elif entry.type in (u'phdthesis', u'mastersthesis',):
        # by default, PhD theses and Master's theses count as published
        # (although this case is handled specially in the arxiv filter)
        d['published'] = True
    elif entry.type in (u'book', u'booksection', u'inproceedings', u'incollection',
                        u'conference', u'inbook', u'proceedings',):
        # proceedings, books, etc. are published
        d['published'] = True
    elif ('journal' in fields and re.search(r'arxiv', fields['journal'], re.IGNORECASE)):
        # if the journal is the arXiv, then it's not published.
        d['published'] = False
    elif ('journal' in fields and fields['journal'].strip()):
        # otherwise, if there is a journal, it's published
        d['published'] = True
    elif ('journal' not in fields or fields['journal'].strip() == ""):
        # if there's no journal for an article or an unknown publication type,
        # it's the arxiv.
        d['published'] = False
    else:
        logger.longdebug('No decisive information about whether this entry is published: '
                         '%s (type %s), defaulting to True.', entry.key, entry.type)

    def extract_pure_id(x, primaryclass=None):
        m = _rx_purearxivid.search((primaryclass+'/' if primaryclass else "") + x)
        if m is None:
            raise IndexError
        return m.group('purearxivid')

    if 'doi' in fields:
        dois = re.split(r'[ \t\n,]+', fields['doi'])
        for doi in dois:
            if doi.strip() == "":
                continue
            m = re.match(rx_arxiv_own_doi, doi)
            if m is not None:
                # it's the arXiv's own DOI -- get the arXiv ID
                d['arxivid'] = m.group('arxivid')
            else:
                # this is a journal DOI -- keep only the first one
                if d.get('doi', None) is None:
                    d['doi'] = doi

    if d['arxivid'] is None:
        for eprintfield in ('arxivid', 'arxiv', 'eprint'):
            if not eprintfield in fields:
                continue
            # this field might reveal the arxiv ID
            arxivid = None
            try:
                arxivid = extract_pure_id(fields[eprintfield],
                                          primaryclass=fields.get('primaryclass', None))
            except IndexError as e:
                logger.longdebug("IndexError: invalid arXiv ID in field ‘%s’ [%r/]%r: %s",
                                 eprintfield, fields.get('primaryclass', None),
                                 fields[eprintfield], e)
                # could be because, e.g., the Zotero exporter used the PubMed ID here.
                logger.debug("Entry `%s' has invalid arXiv ID %r in eprint field ‘%s’",
                             entry.key, fields[eprintfield], eprintfield)
                continue
            if arxivid is not None:
                d['arxivid'] = arxivid
                m = re.match(r'^([-\w.]+)/', arxivid)
                if (m):
                    d['primaryclass'] = m.group(1)
                break

    if ('primaryclass' in fields):
        d['primaryclass'] = fields['primaryclass']
    if ('archiveprefix' in fields):
        d['archiveprefix'] = fields['archiveprefix']

    logger.longdebug("processed doi,eprint,arxiv,arxivid,primaryclass,archiveprefix fields -> d = %r", d)

    def processNoteField(notefield, d, isurl=False):
        if isurl:
            rxlist = _rxarxiv_in_url
        else:
            rxlist = _rxarxiv

        for rx in rxlist:
            m = rx.search(notefield)
            if m:
                #logger.longdebug("Note field %r: arxiv rx %r matched", notefield, rx.pattern)
                if (not d['arxivid']):
                    try:
                        primaryclass = None
                        try:
                            primaryclass = m.group('primaryclass2')
                        except IndexError:
                            pass
                        try:
                            primaryclass = m.group('primaryclass')
                        except IndexError:
                            pass
                        #logger.longdebug("arxivid (maybe with prim-class) = %r", m.group('arxivid'))
                        d['arxivid'] = extract_pure_id(m.group('arxivid'), primaryclass=primaryclass)
                    except IndexError as e:
                        logger.longdebug("IndexError while getting arxivid in note=%r, m=%r: %s",
                                         notefield, m, e)
                if (not d['primaryclass']):
                    primaryclass = None
                    try:
                        primaryclass = m.group('primaryclass')
                    except IndexError:
                        pass
                    try:
                        primaryclass = m.group('primaryclass2')
                    except IndexError:
                        pass
                    if primaryclass and primaryclass not in ['abs', 'pdf', 'abs/', 'pdf/']:
                        d['primaryclass'] = primaryclass

            if d['arxivid'] and d['primaryclass']:
                return

        #logger.longdebug("d = %r", d)

    if ('note' in fields):
        processNoteField(fields['note'], d)
    if ('annote' in fields):
        processNoteField(fields['annote'], d)
    if ('url' in fields):
        processNoteField(fields['url'], d, isurl=True)

    logger.longdebug("processed note,annote,url fields -> d = %r", d)

    if (d['arxivid'] is None):
        # no arXiv info.
        return None

    # FIX: if the arXiv ID is old style and does not contain the primary class,
    # add it, giving e.g. "quant-ph/XXXXXXX"
    if (re.match(r'^\d{7}$', d['arxivid']) and d['primaryclass'] and len(d['primaryclass']) > 0):
        d['arxivid'] = d['primaryclass'] + '/' + d['arxivid']

    # set whether old style or new style arXiv ID
    if re.match(r'^\d{4}\.\d{4,}(v\d+)?$', d['arxivid']):
        d['isoldarxivid'] = False
        d['isnewarxivid'] = True
    elif re.match(r'^' + _RX_PRIMARY_CLASS_PAT + r'/\d{7}(v\d+)?$', d['arxivid']):
        d['isoldarxivid'] = True
        d['isnewarxivid'] = False
    else:
        d['isoldarxivid'] = False  # can't determine arXiv ID style ...
        d['isnewarxivid'] = False  # can't determine arXiv ID style ...

    # get the year
    m = _rx_aid_year.search(d['arxivid'])
    if not m:
        logger.warning("Couldn't find the year in arXiv ID %r", d['arxivid'])
    else:
        # 91->1991, 89->2089 (arXiv started in 1991)
        d['year'] = str(1990 + (int(m.group('year')) - 90) % 100)

    logger.longdebug("finished detection -> d = %r", d)
    return d
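
The following is a minimal usage sketch (not part of the module source); the entry fields and values are hypothetical, and the commented result is roughly what the detection logic above would produce for them:

# Illustrative sketch only -- hypothetical entry, not part of the module.
from pybtex.database import Entry

example_entry = Entry('article', fields={
    'journal': 'Phys. Rev. A',
    'eprint': '1509.01234',
    'archiveprefix': 'arXiv',
    'primaryclass': 'quant-ph',
})
info = detectEntryArXivInfo(example_entry)
# info should look roughly like:
#   {'arxivid': '1509.01234', 'primaryclass': 'quant-ph', 'archiveprefix': 'arXiv',
#    'published': True, 'doi': None, 'year': '2015',
#    'isoldarxivid': False, 'isnewarxivid': True}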

def stripArXivInfoInNote(notestr):
    """
    Assumes that `notestr` is a string from a note={} field of a bibtex entry,
    and strips any arXiv identifier information found, e.g. of the form
    'arxiv:XXXX.YYYY' (or similar).
    """
    newnotestr = notestr
    for rx in _rxarxiv:
        # replace all occurrences of the regexes in _rxarxiv with nothing.
        newnotestr = rx.sub('', newnotestr)

    if (notestr != newnotestr):
        logger.longdebug("stripArXivInfoInNote: stripped %r to %r", notestr, newnotestr)

    return newnotestr
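
A quick, hypothetical illustration (not part of the module source) of the stripping behaviour:

# Illustrative sketch only -- hypothetical note strings.
stripArXivInfoInNote('arXiv:1509.01234')        # -> roughly '' (the whole note was arXiv info)
stripArXivInfoInNote('See also the appendix.')  # -> unchanged, no arXiv token present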
# ---- API info ------

class ArxivFetchedAPIInfoCacheAccessor(BibUserCacheAccessor):
    """
    A `BibUserCacheAccessor` for fetching and accessing information retrieved
    from the arXiv API.
    """
    def __init__(self, **kwargs):
        super().__init__(
            cache_name='arxiv_fetched_api_info',
            **kwargs
            )

        # save arXiv IDs for which we couldn't retrieve information because the
        # server responded with an error.
        #
        # For these IDs, we do not re-attempt to fetch them right away (to avoid
        # multiple doomed requests), so that a new attempt is made only the next
        # time bibolamazi is run.
        self.error_arxivids = {}

    def initialize(self, cache_obj, **kwargs):
        dic = self.cacheDic()
        dic.setdefault('fetched', {})

        #logger.longdebug("dic is %r\n"
        #                 "id(dic['fetched'])=%r", dic, id(dic['fetched']))

        logger.debug("arxiv_fetched_api_info: adding validation checker; time valid is %r",
                     cache_obj.cacheExpirationTokenChecker().time_valid)

        # validate each entry with an expiration checker.  Do this per entry, rather than
        # globally on the full cache.  (So don't use installCacheExpirationChecker())
        dic['fetched'].set_validation(cache_obj.cacheExpirationTokenChecker())

    def fetchArxivApiInfo(self, idlist):
        """
        Populates the given cache with information about the arXiv entries given in
        `idlist`.  This must be, yes you guessed right, a list of arXiv identifiers
        that we should fetch.

        This function performs a query to the arXiv.org API using the arxiv2bib
        library.

        Please note that you should avoid making rapid-fire requests in a row (this
        should normally not happen anyway thanks to our cache mechanism).  However,
        beware that if we get a ``403 Forbidden`` HTTP answer, we should not
        continue, or else arXiv.org might interpret our requests as a DoS attack.
        If a ``403 Forbidden`` HTTP answer is received, this function raises
        :py:exc:`BibArxivApiFetchError` with a meaningful error text.

        Only those entries in `idlist` which are not already in the cache are
        fetched.  `idlist` can be any iterable.
        """
        logger.longdebug("fetchArxivApiInfo(): idlist=%r", idlist)

        # first, see which IDs of the idlist we actually need to fetch
        cache_entrydic = self.cacheDic()['fetched']

        logger.longdebug("fetchArxivApiInfo(): "
                         "id(dic['fetched'])=%r, \n"
                         "id(self.cacheObject().cachedic['arxiv_fetched_api_info']=%r\n"
                         "len(dic['fetched'])=%d",
                         id(cache_entrydic),
                         id(self.cacheObject().cachedic['arxiv_fetched_api_info']),
                         len(cache_entrydic))
        logger.longdebug("fetchArxivApiInfo(): in the cache, we have keys %r", cache_entrydic.keys())

        still_to_fetch = []
        for aid in idlist:
            if aid in self.error_arxivids:
                logger.debug("Not re-trying to fetch info for %s, query failed moments ago", aid)
                # we already tried to fetch this ID moments ago but failed---don't insist
                continue
            if (aid not in cache_entrydic
                or cache_entrydic.get(aid) is None
                or cache_entrydic.get(aid).get('error', False)):
                still_to_fetch.append(aid)

        logger.longdebug("fetchArxivApiInfo(): still_to_fetch=%r", still_to_fetch)

        # make sure we're not requesting more than batch_len arXiv IDs at a time
        # (or URLs can get too long and we'll get a HTTP 414 "URL too long" response)
        batch_len = 64
        sleep_interval = 1  # 1 req/second: see https://groups.google.com/d/msg/arxiv-api/wcPh0w38XN0/p7vKsxjb6ykJ

        num_batches, rest = divmod(len(still_to_fetch), batch_len)
        if rest:
            num_batches += 1

        k = 0
        logger.longdebug("fetchArxivApiInfo(): We need to fetch keys: %r", still_to_fetch)
        while still_to_fetch:
            thisbatch = still_to_fetch[:batch_len]

            logger.info("Fetching information from arXiv.org (%d/%d)", k+1, num_batches)

            # Starting from the second batch, wait a little while.  Don't make
            # rapid-fire requests to the arXiv because they don't like that:
            # https://arxiv.org/help/robots
            if k > 0:
                time.sleep(sleep_interval)

            ok = self._do_fetch_arxiv_api_info(thisbatch)
            if not ok:  # logs
                logger.info("Fetching information from arXiv.org failed :(")
                return False

            k += 1
            still_to_fetch = still_to_fetch[len(thisbatch):]

        if k > 0:
            # message only if we actually fetched anything
            logger.info("Fetching information from arXiv.org done.")

        return True

    def _do_fetch_arxiv_api_info(self, idlist):

        if ArxivFetchedAPIInfoCacheAccessor.arxiv_403_received:
            logger.warning("Not fetching any more arXiv data because we've been "
                           "previously sent a \"HTTP 403 Forbidden\" response. "
                           "See https://arxiv.org/help/robots")
            return None

        cache_entrydic = self.cacheDic()['fetched']

        logger.debug('fetching missing id list %r', idlist)
        try:
            arxivdict = arxiv2bib.arxiv2bib_dict(idlist)
            # USE FOR DEBUGGING:
            #arxivdict = {}
            #logger.critical("DEACTIVATED ARXIV QUERY FOR DEBUGGING")
        except HTTPError as error:
            if error.getcode() == 403:
                ArxivFetchedAPIInfoCacheAccessor.arxiv_403_received = True
                raise BibArxivApiFetchError(textwrap.dedent("""\
                    Error fetching ArXiv API Info: ** 403 Forbidden **

                    This usually happens when you make many rapid fire requests in a row.
                    If you continue to do this, arXiv.org may interpret your requests as
                    a denial of service attack.

                    For more information, see https://arxiv.org/help/robots.
                    """))
            logger.warning("HTTP connection error %d: %s.", error.code, error.reason)
            logger.warning("ArXiv API information will not be retrieved, and your bibliography "
                           "might be incomplete.")
            return False
        except URLError as error:
            logger.warning("Error fetching info from arXiv.org: %s.", error.reason)
            logger.warning("ArXiv API information will not be retrieved, and your bibliography "
                           "might be incomplete.")
            return False

        logger.longdebug('got entries %r: %r' % (arxivdict.keys(), arxivdict))

        for (k, ref) in arxivdict.items():
            logger.longdebug("Got reference object for id %s: %r" % (k, ref.__dict__))
            cache_entrydic[k]['reference'] = ref
            if ref is None or isinstance(ref, arxiv2bib.ReferenceErrorInfo):
                errorstr = '<UNKNOWN ERROR>' if ref is None else str(ref)
                self.error_arxivids[k] = errorstr
                cache_entrydic[k]['error'] = errorstr
                cache_entrydic[k]['bibtex'] = ''
            else:
                cache_entrydic[k]['error'] = None
                bibtex = ref.bibtex()
                cache_entrydic[k]['bibtex'] = bibtex

        logger.longdebug("arxiv api info: Got all references. cacheDic() is now: %r", self.cacheDic())
        logger.longdebug("... and cacheObject().cachedic is now: %r", self.cacheObject().cachedic)

        return True

    def getArxivApiInfo(self, arxivid):
        """
        Returns a dictionary::

            {
              'reference':  <arxiv2bib.Reference>,
              'bibtex':     <bibtex string>,
              'error':      <None or an error string>,
            }

        for the given arXiv id in the cache.  If the information is not in the
        cache, returns `None`.

        Don't forget to first call :py:meth:`fetchArxivApiInfo()` to retrieve the
        information in the first place.

        Note that the 'reference' part may be a
        :py:class:`arxiv2bib.ReferenceErrorInfo` if there was an error retrieving
        the reference.  In that case, the key 'error' contains an error string.
        """
        return self.cacheDic()['fetched'].get(arxivid, None)

ArxivFetchedAPIInfoCacheAccessor.arxiv_403_received = False
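
For orientation, here is a rough usage sketch (not part of the module source); the `accessor` variable and the arXiv IDs are assumptions, standing in for an ArxivFetchedAPIInfoCacheAccessor instance obtained from the bibolamazi file's cache:

# Illustrative sketch only -- `accessor` is assumed to be obtained via
# bibolamazifile.cacheAccessor(ArxivFetchedAPIInfoCacheAccessor).
ids = ['1509.01234', 'quant-ph/0301059']   # hypothetical arXiv IDs
accessor.fetchArxivApiInfo(ids)            # queries arXiv.org only for IDs not already cached
for aid in ids:
    info = accessor.getArxivApiInfo(aid)
    if info is None or info['error']:
        continue                           # not fetched, or the API reported an error
    # info['reference'] is an arxiv2bib.Reference; info['bibtex'] is its bibtex string
    print(aid, info['bibtex'][:60])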

class ArxivInfoCacheAccessor(BibUserCacheAccessor):
    """
    Cache accessor for detected arXiv information about bibliography entries.
    """
    def __init__(self, **kwargs):
        super().__init__(
            cache_name='arxiv_info',
            **kwargs
            )

        # save bibtex keys for which we couldn't retrieve information because the
        # API accessor returned an error.  We do this to avoid re-attempting to
        # query information during this run of bibolamazi.  Don't save this to the
        # cache; we should re-attempt the next time bibolamazi is run.
        self.failed_keys = []

    def initialize(self, cache_obj, **kwargs):
        cache_dic = self.cacheDic()
        cache_dic['entries'].set_validation(
            EntryFieldsTokenChecker(self.bibolamaziFile().bibliographyData(),
                                    store_type=True,
                                    fields=arxivinfo_from_bibtex_fields)
            )
        cache_dic.setdefault('cache_built', False)

    def rebuild_cache(self, bibdata, arxiv_api_accessor):
        """
        Clear and rebuild the entry cache completely.
        """
        entrydic = self.cacheDic()['entries']
        entrydic.clear()
        self.complete_cache(bibdata, arxiv_api_accessor)

    def revalidate(self, bibolamazifile):
        """
        Re-validates the cache (with `validate()`), and calls `complete_cache()`
        again to fetch all missing or out-of-date entries.
        """
        self.cacheDic()['entries'].validate()
        self.complete_cache(bibolamazifile.bibliographyData(),
                            bibolamazifile.cacheAccessor(ArxivFetchedAPIInfoCacheAccessor))

    def complete_cache(self, bibdata, arxiv_api_accessor):
        """
        Makes sure the cache is complete for all items in `bibdata`.
        """
        entrydic = self.cacheDic()['entries']

        # A list of pairs (citekey, arxiv-id) of entries that still need to be
        # completed with info from the arXiv API.
        needs_to_be_completed = []

        summary_info_mismatch = []

        #
        # Do a first scan through all the bibdata entries, and detect the arXiv
        # information using only what we have (to figure out the arxiv ID!).
        # We'll do a query to the arXiv API in a second step below.
        #
        for k, v in bibdata.entries.items():
            # arxiv info is in cache and updated with info fetched from the arXiv API
            if (k in entrydic and entrydic[k] is not None
                and (entrydic[k].get('updated_with_api_info', False) or k in self.failed_keys)):
                continue
            # else, there's something to refresh and update
            arinfo = detectEntryArXivInfo(v)
            entrydic[k] = arinfo
            logger.longdebug("detected arXiv information for `%s': %r", k, arinfo)

            if arinfo is not None:
                needs_to_be_completed.append((k, arinfo['arxivid'],))

        logger.longdebug("complete_cache(): needs_to_be_completed=%r\nentrydic=%r\n",
                         needs_to_be_completed, entrydic)

        #
        # Complete the entry arXiv info using fetched info from the arXiv API.
        #
        arxiv_api_accessor.fetchArxivApiInfo(
            (x[1] for x in needs_to_be_completed),
            )

        fail_aids = []

        for (k, aid) in needs_to_be_completed:
            api_info = arxiv_api_accessor.getArxivApiInfo(aid)
            if (api_info is None or api_info['error'] or 'reference' not in api_info):
                errstr = ""
                if api_info:
                    errstr = ": " + api_info['error']
                logger.debug("Failed to fetch arXiv information for %s%s", aid, errstr)
                fail_aids.append(aid)
                self.failed_keys.append(k)
                continue

            ref = api_info['reference']
            logger.longdebug("%s: %s: api_info is %r, ref is %r", k, aid, api_info, ref)

            primaryclass = ref.category

            doi = None
            try:
                doi = ref._field_text('doi', namespace=arxiv2bib.ARXIV)
            except Exception:
                logger.debug("Couldn't get DOI field for %s from %r", aid, ref)

            if (primaryclass and entrydic[k]['primaryclass'] and
                # compare overlap only, so that 'cond-mat' and
                # 'cond-mat.stat-mech' don't generate the warning
                entrydic[k]['primaryclass'][:len(primaryclass)]
                    != primaryclass[:len(entrydic[k]['primaryclass'])]):
                #
                # ### Ignore mismatches in primaryclass, e.g. Zotero's exporter
                # ### exports all archive classes as a comma-separated list
                # ### which would be terrible for us to parse and check...
                #
                # summary_info_mismatch.append(
                #     (k, aid, 'primaryclass', entrydic[k]['primaryclass'], primaryclass)
                # )
                # logger.warning("Conflicting primaryclass values for entry %s (%s): "
                #                "%s (given in bibtex) != %s (retrieved from the arxiv)",
                #                k, aid, entrydic[k]['primaryclass'], primaryclass)
                pass
            else:
                entrydic[k]['primaryclass'] = primaryclass

            if (doi and entrydic[k]['doi'] and entrydic[k]['doi'].lower() != doi.lower()):
                summary_info_mismatch.append(
                    (k, aid, 'doi', entrydic[k]['doi'], doi)
                )
                # logger.warning("Conflicting doi values for entry %s (%s): "
                #                "%s (given in bibtex) != %s (retrieved from the arxiv)",
                #                k, aid, entrydic[k]['doi'], doi)
            else:
                entrydic[k]['doi'] = doi

            entrydic[k]['updated_with_api_info'] = True

        if fail_aids:
            joined = ", ".join(fail_aids if len(fail_aids) <= 8 else fail_aids[:7] + ['...'])
            logger.warning("Failed to fetch information from the arXiv for %d entries: %s",
                           len(fail_aids), joined)

        # warning for info mismatch
        if summary_info_mismatch:
            logger.warning(
                "Mismatch: info in bibtex ≠ info from arxiv.org\n" +
                "\n".join(
                    "- ‘{key}’ ({arxivid}) [{field}]:\n"
                    "    {value_from_bibtex}  ≠  {value_from_arxivorg}"
                    .format(key=key, arxivid=arxivid, field=field,
                            value_from_bibtex='“'+value_from_bibtex+'”',
                            value_from_arxivorg='“'+value_from_arxivorg+'”',)
                    for key, arxivid, field, value_from_bibtex, value_from_arxivorg
                    in summary_info_mismatch
                )
            )

    def getArXivInfo(self, entrykey):
        """
        Get the arXiv information corresponding to entry citekey `entrykey`.

        If the entry is not in the cache, returns `None`.  Call `complete_cache()`
        first!
        """
        logger.longdebug("Getting arxiv info for key %r from cache.", entrykey)

        entrydic = self.cacheDic()['entries']

        if (entrykey not in entrydic):
            logger.longdebug(" --> not found :(")
            return None

        return entrydic.get(entrykey, None)

    #def _reference_doi(self, ref):
    #    try:
    #        doi = ref._field_text('doi', namespace=arxiv2bib.ARXIV)
    #    except:
    #        return None
    #    if (doi):
    #        return doi
    #    return None
    #
    #def _reference_category(self, ref):
    #    try:
    #        return ref.category
    #    except AttributeError:
    #        # happens for ReferenceErrorInfo, for example
    #        return None

def setup_and_get_arxiv_accessor(bibolamazifile):
    arxivinfoaccessor = bibolamazifile.cacheAccessor(ArxivInfoCacheAccessor)
    arxivinfoaccessor.complete_cache(
        bibolamazifile.bibliographyData(),
        bibolamazifile.cacheAccessor(ArxivFetchedAPIInfoCacheAccessor)
        )
    return arxivinfoaccessor

# deprecated:

def get_arxiv_cache_access(bibolamazifile):
    butils.warn_deprecated(None, "get_arxiv_cache_access()", "setup_and_get_arxiv_accessor()",
                           modulename="arxivutil.py",
                           explanation="We now use the new cache mechanism; your filter should "
                           "also explicitly request the cache accessors ArxivInfoCacheAccessor "
                           "and ArxivFetchedAPIInfoCacheAccessor so that the cache is correctly "
                           "set up.")
    return setup_and_get_arxiv_accessor(bibolamazifile)
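
Typical use from a filter's own code might look like the following sketch (not part of the module source); `bibolamazifile` and the citation key are assumptions standing in for values provided by the filter's context:

# Illustrative sketch only -- `bibolamazifile` and the entry key are assumed
# to be provided by the filter framework.
arxivaccessor = setup_and_get_arxiv_accessor(bibolamazifile)
arxivinfo = arxivaccessor.getArXivInfo('Einstein1905')   # hypothetical cite key
if arxivinfo is not None and not arxivinfo['published']:
    # e.g., format the entry as an arXiv preprint using arxivinfo['arxivid']
    pass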