Source code for bibolamazi.core.bibusercache

# -*- coding: utf-8 -*-
################################################################################
#                                                                              #
#   This file is part of the Bibolamazi Project.                               #
#   Copyright (C) 2014 by Philippe Faist                                       #
#   philippe.faist@bluewin.ch                                                  #
#                                                                              #
#   Bibolamazi is free software: you can redistribute it and/or modify         #
#   it under the terms of the GNU General Public License as published by       #
#   the Free Software Foundation, either version 3 of the License, or          #
#   (at your option) any later version.                                        #
#                                                                              #
#   Bibolamazi is distributed in the hope that it will be useful,              #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of             #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              #
#   GNU General Public License for more details.                               #
#                                                                              #
#   You should have received a copy of the GNU General Public License          #
#   along with Bibolamazi.  If not, see <http://www.gnu.org/licenses/>.        #
#                                                                              #
################################################################################


try:
    from collections.abc import MutableMapping, MutableSequence
except ImportError:
    from collections import MutableMapping, MutableSequence


import inspect
import pickle
import traceback
import logging

import bibolamazi.init
from pybtex.database import Entry, Person
from bibolamazi.core.butils import call_with_args, BibolamaziError
from bibolamazi.core.bibusercache import tokencheckers

logger = logging.getLogger(__name__)




def _to_bibusercacheobj(obj, parent):
    if (isinstance(obj, BibUserCacheDic) or isinstance(obj, BibUserCacheList)):
        # make sure we don't make copies of these objects, but keep references
        # to the original instance. Especially important for the on_set_bind_to
        # feature.
        obj.set_parent(parent)
        return obj
    if (isinstance(obj, dict)):
        return BibUserCacheDic(obj, parent=parent)
    if (isinstance(obj, list)):
        return BibUserCacheList(obj, parent=parent)
    return obj





class BibUserCacheDic(MutableMapping):
    """
    Implements a cache where information may be stored between different runs of
    bibolamazi, and between different filter runs.

    This is a dictionary of key=value pairs, and can be used like a regular python
    dictionary.

    This implements *cache validation*, i.e. making sure that the values stored in
    the cache are up-to-date.  Each entry of the dictionary has a corresponding
    *token*, i.e. a value (of any python picklable type) which identifies whether
    the cache entry is still valid.  For example, the token could be a `datetime`
    corresponding to the time when the entry was created, and the rule for
    validating the cache might be to check that the entry is not more than
    e.g. 3 days old.
    """
    def __init__(self, *args, **kwargs):
        self._init_empty(on_set_bind_to_key=kwargs.pop('on_set_bind_to_key', None),
                         parent=kwargs.pop('parent', None))
        # by default, no validation.
        self.tokenchecker = None
        self.update(dict(*args, **kwargs))

    def _init_empty(self, on_set_bind_to_key=None, parent=None):
        self.dic = {}
        self.tokens = {}
        self.tokenchecker = None
        self._on_set_bind_to_key = on_set_bind_to_key
        self.parent = parent

    def _guess_name_for_dbg(self):
        if not self.parent:
            return "<root>"
        return next(
            (key for key, val in self.parent.items() if val is self),
            "<unknown>")

    def set_validation(self, tokenchecker, validate=True):
        """
        Set a function that will calculate the token for a given entry, for cache
        validation.

        The `tokenchecker` should be a
        :py:class:`~core.bibusercache.tokencheckers.TokenChecker` instance.  See the
        documentation of the :py:mod:`tokencheckers` module for more information
        about cache validation.

        If `validate` is `True`, then we immediately validate the contents of the
        cache.
        """
        if self.tokenchecker is tokenchecker:
            # no change
            return

        # store this token checker
        self.tokenchecker = tokenchecker

        # this counts as a change, so save it
        self._do_pending_bind()

        if validate:
            self.validate()

    def validate(self):
        """
        Validate this whole dictionary, i.e. make sure that each entry is still
        valid.

        This calls `validate_item()` for each item in the dictionary.
        """
        keylist = list(self.dic.keys())
        for key in keylist:
            self.validate_item(key)

    def validate_item(self, key):
        """
        Validate an entry of the dictionary manually.  Usually not needed.

        If the value is valid, and happens to be a BibUserCacheDic, then that
        dictionary is also validated.

        Invalid entries are deleted.

        Returns `True` if the item is valid, otherwise `False`.
        """
        if not key in self.dic:
            # not valid anyway.
            logger.longdebug("validate_item(): %s: no such key %s",
                             self._guess_name_for_dbg(), key)
            return False

        if not self.tokenchecker:
            # no validation
            logger.longdebug("validate_item(): %s[%s]: no validation set",
                             self._guess_name_for_dbg(), key)
            return True

        logger.longdebug("Validating item `%s' in `%s', ...", key, self._guess_name_for_dbg())

        val = self.dic[key]
        ok = None
        try:
            ok = self.tokenchecker.cmp_tokens(key=key, value=val,
                                              oldtoken=self.tokens.get(key, None))
        except Exception as e:
            logger.debug("%s: Got exception in cmp_tokens(): ignoring and invalidating: %s",
                         key, e)
            ok = False

        if ok:
            if isinstance(val, BibUserCacheDic):
                #logger.longdebug("Validating sub-dictionary `%s' ...", key)
                val.validate()
                # still return True independently of what happens in val.validate(),
                # because this dictionary is still valid.
            logger.longdebug("Cache item `%s' is valid; keeping", key)
            return True

        # otherwise, invalidate the cache.  Don't just set to None or {} or [] because
        # we don't know what type the value is.  This way is safe, because if getitem
        # is called, automatically an empty dic will be created.
        logger.longdebug("Cache item `%s' is NO LONGER VALID; trashing.", key)
        del self.dic[key]
        if key in self.tokens:
            del self.tokens[key]
        return False

    def token_for(self, key):
        """
        Return the token that was stored for the given `key`.

        Raises an exception if no cache validation is set, or if the `key` doesn't
        exist.
        """
        return self.tokens[key]

    def new_value_set(self, key=None):
        """
        Informs the dic that the value for `key` has been updated, and that a new
        validation token should be stored.

        If `key` is `None`, then this call is meant for the current object, so the
        call is relayed to the parent dictionary.
        """
        self._do_pending_bind()

        if key is None:
            if not self.parent:
                logger.warning("BibUserCacheDic.new_value_set(): No parent set!")
                return
            try:
                self.parent.new_value_set(next(
                    (k for k, v in self.parent.items() if v is self)
                    ))
            except StopIteration:
                logger.warning("BibUserCacheDic.new_value_set(): Can't find ourselves in parent!")
            return

        if self.tokenchecker:
            self.tokens[key] = self.tokenchecker.new_token(key=key, value=self.dic.get(key))
            logger.longdebug("value changed in cache (key=%s), new value=%r, new token=%r",
                             key, self.dic.get(key), self.tokens[key])

        if self.parent:
            self.parent.child_notify_changed(self)

    def __getitem__(self, key):
        return self.dic.get(key, BibUserCacheDic({}, parent=self, on_set_bind_to_key=key))

    def __setitem__(self, key, val):
        self.dic[key] = _to_bibusercacheobj(val, parent=self)
        self._do_pending_bind()
        # assume that when __setitem__ is called, the value is up-to-date, i.e. update
        # the corresponding token.
        self.new_value_set(key)

    def __delitem__(self, key):
        del self.dic[key]
        if key in self.tokens:
            del self.tokens[key]
        if self.parent:
            self.parent.child_notify_changed(self)

    def items(self):
        return self.dic.items()

    def __iter__(self):
        return iter(self.dic)

    def __len__(self):
        return len(self.dic)

    def __contains__(self, key):
        return key in self.dic

    def child_notify_changed(self, obj):
        # update cache validation tokens for this object
        if self.tokenchecker:
            for key, val in self.dic.items():
                if val is obj:
                    self.tokens[key] = self.tokenchecker.new_token(key=key, value=val)
                    # don't break, as it could be that the same object is pointed to
                    # by different keys... so complete the for loop

        if self.parent:
            self.parent.child_notify_changed(self)

    def set_parent(self, parent):
        self.parent = parent

    def _do_pending_bind(self):
        if (self._on_set_bind_to_key is not None and
            self.parent is not None):
            #
            self.parent[self._on_set_bind_to_key] = self
            self._on_set_bind_to_key = None

    def __repr__(self):
        return 'BibUserCacheDic(%r)' % (self.dic if hasattr(self, 'dic') else {})

    def __setstate__(self, state):
        #logger.longdebug("Set state to empty; loading state=%r", state)
        self._init_empty()

        if not ('cache' in state and 'tokens' in state and 'parent' in state):
            # invalid cache
            logger.debug("Ignoring invalid cache")
            return

        self.parent = state['parent']
        self.dic = state['cache']
        self.tokens = state['tokens']

    def __getstate__(self):
        state = {
            'parent': self.parent,
            'cache': self.dic,
            'tokens': self.tokens,
            }
        return state
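

# Illustrative usage sketch (hypothetical, for documentation purposes): how a
# BibUserCacheDic behaves once an expiry-based token checker is installed.  The
# function name `_example_dic_validation` and the stored keys/values are invented
# for this example; TokenCheckerDate and its set_time_valid() method are the ones
# referenced elsewhere in this module.  The function is never called at import time.
def _example_dic_validation():
    import datetime

    dic = BibUserCacheDic({})

    # Entries older than three days will be dropped upon validation.
    checker = tokencheckers.TokenCheckerDate()
    checker.set_time_valid(datetime.timedelta(days=3))
    dic.set_validation(checker)

    # The dic behaves like a regular mapping; each assignment also records a fresh
    # validation token (here, a time stamp) for that key.
    dic['some_key'] = {'fetched': 'data'}
    token = dic.token_for('some_key')

    # On a later run, validate() checks every entry's token and silently drops the
    # entries that are no longer valid.
    dic.validate()
    return dic, token
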

class BibUserCacheList(MutableSequence):
    def __init__(self, *args, **kwargs):
        self.lst = []
        self.parent = kwargs.pop('parent', None)
        for x in list(*args, **kwargs):
            self.append(x)

    def __getitem__(self, index):
        return self.lst[index]

    def __delitem__(self, index):
        def deltheitem(value=None):
            del self.lst[index]
        self._do_changing_operation(None, deltheitem)

    def __contains__(self, value):
        return value in self.lst

    def __len__(self):
        return len(self.lst)

    def insert(self, index, value):
        self._do_changing_operation(value, lambda x: self.lst.insert(index, x))

    def append(self, value):
        self._do_changing_operation(value, lambda x: self.lst.append(x))

    def __setitem__(self, key, val):
        self._do_changing_operation(val, lambda x: self.lst.__setitem__(key, x))

    def _do_changing_operation(self, val, fn):
        ret = fn(None if val is None else _to_bibusercacheobj(val, parent=self))
        if self.parent:
            self.parent.child_notify_changed(self)
        return ret

    def __repr__(self):
        return 'BibUserCacheList(%r)' % (self.lst)
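

# Illustrative usage sketch (hypothetical, for documentation purposes): nested plain
# dicts and lists stored in a cache are wrapped by _to_bibusercacheobj() into
# BibUserCacheDic/BibUserCacheList, so that changes deep in the structure are
# reported back up to the parent, whose validation token can then be refreshed.
# `_example_nested_wrapping` and the stored keys are invented names; the function is
# never called at import time.
def _example_nested_wrapping():
    root = BibUserCacheDic({})
    root['fetched'] = {'ids': ['id1', 'id2']}     # dict -> BibUserCacheDic, list -> BibUserCacheList
    assert isinstance(root['fetched'], BibUserCacheDic)
    assert isinstance(root['fetched']['ids'], BibUserCacheList)

    # Appending to the nested list notifies its parent dictionary, which in turn
    # notifies `root` via child_notify_changed().
    root['fetched']['ids'].append('id3')
    return root
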

class BibUserCache:
    """
    The basic root cache object.  This object stores a cache dictionary for each
    named cache.  (See :py:meth:`cacheFor`.)

    (Internally, the caches are stored in one root :py:class:`BibUserCacheDic`.)
    """
    def __init__(self, cache_version=None):
        logger.longdebug("BibUserCache: Constructor!")
        self.cachedic = BibUserCacheDic({})
        self.entry_validation_checker = tokencheckers.TokenCheckerPerEntry()
        self.comb_validation_checker = tokencheckers.TokenCheckerCombine(
            tokencheckers.VersionTokenChecker(cache_version),
            self.entry_validation_checker,
            )
        self.cachedic.set_validation(self.comb_validation_checker)

        # an instance of an expiry_checker that several entries might share in
        # self.entry_validation_checker.
        self.expiry_checker = tokencheckers.TokenCheckerDate()

    def setDefaultInvalidationTime(self, time_delta):
        """
        Set the `timedelta` object giving the amount of time for which data in the
        cache is considered valid (by default).
        """
        self.expiry_checker.set_time_valid(time_delta)

    def cacheFor(self, cache_name):
        """
        Returns the cache dictionary object for the given cache name.  If the cache
        dictionary does not exist, it is created.
        """
        if not cache_name in self.cachedic:
            self.cachedic[cache_name] = {}  # will be turned into a BibUserCacheDic automatically

        return self.cachedic[cache_name]

    def cacheExpirationTokenChecker(self):
        """
        Returns the cache expiration token checker validator, which is configured
        with the default cache invalidation time.

        This object may be used by subclasses as a token checker for sub-caches that
        need regular invalidation (typically several days in the default
        configuration).

        Consider, though, using `installCacheExpirationChecker()`, which simply
        applies a general validator to your full cache; this is generally what you
        want.
        """
        return self.expiry_checker

    def installCacheExpirationChecker(self, cache_name):
        """
        Installs a cache expiration checker on the given cache.

        This is a utility that is at the disposal of the cache accessors to easily
        set up an expiration validator on their caches.

        A single instance of an expiry token checker (see `TokenCheckerDate`) is
        shared between the different sub-caches and handled by this main cache
        object.  The duration of the expiry is typically several days; because the
        token checker instance is shared, it cannot easily be changed, nor should it
        be relied upon.  If you have custom needs, or need more control over this,
        create your own token checker.

        Returns: the cache dictionary.  This may have changed to a new empty object
        if the cache didn't validate!

        WARNING: the cache dictionary may have been altered by the validation of the
        cache!  Use the return value of this function, or call
        :py:meth:`BibUserCacheAccessor.cacheDic` again!

        Note: this validation will not validate individual items in the cache
        dictionary, but the dictionary as a whole.  Depending on your use case, it
        might be worth introducing per-entry validation.  For that, check out the
        various token checkers in :py:mod:`.tokencheckers` and call
        :py:meth:`~core.bibusercache.BibUserCacheDic.set_validation` to install a
        specific validator instance.
        """
        if not cache_name in self.cachedic:
            raise ValueError("Invalid cache name: %s" % (cache_name))

        # normal thing, i.e. the cache expires after N days
        if not self.entry_validation_checker.has_entry_for(cache_name):
            logger.longdebug("Adding expiry checker for %s", cache_name)
            self.entry_validation_checker.add_entry_check(cache_name, self.expiry_checker)
            self.cachedic.validate_item(cache_name)

        return self.cacheFor(cache_name)

    def hasCache(self):
        """
        Returns `True` if we have any cache at all.  This only returns `False` if
        there are no cache dictionaries defined.
        """
        return bool(self.cachedic)

    def loadCache(self, cachefobj):
        """
        Load the cache from a file-like object `cachefobj`.

        This tries to unpickle the data and restore the cache.  If the loading
        fails, e.g. because of an I/O error, the exception is logged but ignored,
        and an empty cache is initialized.

        Note that at this stage only the basic validation is performed; the cache
        accessors should then each initialize their own subcaches with possibly
        their own specialized validators.
        """
        try:
            data = pickle.load(cachefobj)
            self.cachedic = data['cachedic']
        except Exception as e:
            logger.longdebug("EXCEPTION IN pickle.load():\n%s", traceback.format_exc())
            logger.debug("IGNORING EXCEPTION IN pickle.load(): %s.", e)
            self.cachedic = BibUserCacheDic({})

        self.cachedic.set_validation(self.comb_validation_checker)

    def saveCache(self, cachefobj):
        """
        Saves the cache to the file-like object `cachefobj`.  This dumps a pickled
        version of the cache information into the stream.
        """
        #
        # TODO: first, serialize self.cachedic using compression to reduce file size.
        #
        data = {
            # cache pickle versions for Bibolamazi versions:
            #   --1.4: <no information saved, incompatible>
            #   1.5:   1
            #   2.0+:  2
            'cachepickleversion': 2,
            'cachedic': self.cachedic,
            }
        logger.longdebug("Saving cache. Cache keys are: %r", self.cachedic.dic.keys())
        pickle.dump(data, cachefobj, protocol=2)
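

# Illustrative usage sketch (hypothetical, for documentation purposes): the typical
# lifecycle of the root cache object, roughly as a BibolamaziFile would drive it.
# The cache name 'arxiv_info' is the example name used in the docstrings below; the
# stored entry, the in-memory stream and the function name are invented for this
# example.  The function is never called at import time.
def _example_root_cache_lifecycle(cache_version='2.0'):
    import io
    import datetime

    cache = BibUserCache(cache_version=cache_version)
    cache.setDefaultInvalidationTime(datetime.timedelta(days=5))

    # Get (and lazily create) a named sub-cache, then protect it with the shared
    # expiry checker; use the returned dictionary, since validation may have
    # replaced the previous object.
    cache.cacheFor('arxiv_info')
    subcache = cache.installCacheExpirationChecker('arxiv_info')
    subcache['some_entry_key'] = {'arxivid': '1234.5678'}   # placeholder value

    # The whole cache pickles to/from any file-like object.
    buf = io.BytesIO()
    cache.saveCache(buf)

    buf.seek(0)
    cache2 = BibUserCache(cache_version=cache_version)
    cache2.loadCache(buf)
    return cache2
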
# ------------------------------------------------------------------------------

class BibUserCacheError(BibolamaziError):
    """
    An exception which occurred when handling user caches.

    Usually, problems in the cache are silently ignored, because the cache can
    usually be safely regenerated.  However, if there is a serious error which
    prevents the cache from being regenerated, for example, then this error should
    be raised.
    """
    def __init__(self, cache_name, message):
        if not isinstance(cache_name, str):
            cache_name = '<unknown>'
        super().__init__("Cache ‘{}’: {}".format(cache_name, str(message)))
        self.cache_name = cache_name
        self.message = message

class BibUserCacheAccessor:
    """
    Base class for a cache accessor.

    Filters should access the bibolamazi cache through a *cache accessor*.  A cache
    accessor organizes how the caches are used and maintained.  This is needed since
    several filters may want to access the same cache (e.g. fetched arXiv info from
    the arxiv.org API), so it is necessary to abstract the cache object, and how it
    is maintained, out of the filter.  This also avoids issues such as which filter
    is responsible for creating/refreshing the cache, etc.

    A unique accessor instance is attached to a particular cache name
    (e.g. 'arxiv_info').  It is instantiated by the BibolamaziFile.  It is
    instructed to initialize the cache, possibly install token checkers, etc. at the
    beginning, before running any filters.  The accessor is free to handle the cache
    as it prefers--build it right away, refresh it on demand only, etc.

    Filters access the cache by requesting an instance of the accessor.  This is
    done by calling
    :py:meth:`~core.bibolamazifile.BibolamaziFile.cacheAccessor()` (you can use
    :py:meth:`~core.bibfilter.BibFilter.bibolamaziFile()` to get a pointer to the
    `bibolamazifile` object).  Filters should declare in advance which caches they
    would like to have access to by reimplementing the
    :py:meth:`~core.bibfilter.BibFilter.requested_cache_accessors` method.

    Accessors are free to implement their public API however they deem best.  There
    is no obligation or particular structure to follow.  (Although `refreshCache()`,
    `fetchMissingItems(list)`, or similar function names may be typical.)

    Cache accessor objects are instantiated by the bibolamazi file.  Their
    constructors should accept a keyword argument `bibolamazifile` and pass it on to
    the superclass constructor.  Constructors should also accept `**kwargs`, for
    possible compatibility with future additions, and pass it on to the parent
    constructor.

    The `cache_name` argument of this constructor should be a fixed string passed by
    the subclass, identifying this cache (e.g. 'arxiv_info').
    """
    def __init__(self, cache_name, bibolamazifile, **kwargs):
        super().__init__(**kwargs)
        self._cache_name = cache_name
        self._bibolamazifile = bibolamazifile
        self._cache_obj = None

    def initialize(self, cache_obj):
        """
        Initialize the cache.

        Subclasses should perform any initialization tasks here, such as installing
        *token checkers*.  This function should not return anything.

        Note that it is *strongly* recommended to install some form of cache
        invalidation, even if it is just an expiry validator.  You may want to call
        :py:meth:`~core.bibusercache.BibUserCache.installCacheExpirationChecker` on
        `cache_obj`.

        Note that the order in which the `initialize()` methods of the various
        caches are called is undefined.

        Use the :py:meth:`cacheDic` method to access the cache dictionary.  Note
        that if you install token checkers on this cache, e.g. with
        `cache_obj.installCacheExpirationChecker()`, then the cache dictionary
        object may have changed!  (To be sure, call :py:meth:`cacheDic` again.)

        The default implementation raises a `NotImplementedError` exception.
        """
        raise NotImplementedError("Subclasses of BibUserCacheAccessor must reimplement initialize()")

    def cacheName(self):
        """
        Return the cache name, as set in the constructor.  Subclasses do not need to
        reimplement this function.
        """
        return self._cache_name

    def cacheDic(self):
        """
        Returns the cache dictionary.  This is meant as a 'protected' method for the
        accessor only.  Objects that query the accessor should use the
        accessor-specific API to access data.

        The cache dictionary is a :py:class:`BibUserCacheDic` object.  In
        particular, subcaches may want to set custom token checkers for proper cache
        invalidation (this should be done in the :py:meth:`initialize` method).

        This returns the data in the cache object that was set internally by the
        :py:class:`BibolamaziFile` via the method :py:meth:`setCacheObj`.  Don't
        call that manually, though, unless you're implementing an alternative
        :py:class:`BibolamaziFile` class!
        """
        return self._cache_obj.cacheFor(self.cacheName())

    def cacheObject(self):
        """
        Returns the parent :py:class:`BibUserCache` object in which
        :py:meth:`cacheDic` is a sub-cache.

        This is provided FOR CONVENIENCE!  Don't abuse this!  You should never need
        to access the object directly; at most, read some standard attributes such
        as the root cache version.  If you're writing directly to the root cache
        object, there is most likely a design flaw in your code!  Most of all, don't
        write into other sub-caches!!
        """
        return self._cache_obj

    def setCacheObj(self, cache_obj):
        """
        Sets the cache object that will be used by `cacheDic()` and `cacheObject()`.

        Accessors and filters should not call (nor reimplement) this function.  This
        function gets called by the `BibolamaziFile`.
        """
        self._cache_obj = cache_obj

    def bibolamaziFile(self):
        """
        Returns the parent bibolamazifile of this cache accessor.  This may be
        useful, e.g. to initialize a token cache validator in `initialize()`.

        Returns the object given in the constructor argument.  Do not reimplement
        this function.
        """
        return self._bibolamazifile
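

# Illustrative sketch (hypothetical, for documentation purposes): a minimal cache
# accessor subclass following the conventions described in BibUserCacheAccessor's
# docstrings.  The class name, the cache name 'example_info' and the public method
# fetchInfo() are invented for this example; a real accessor (such as the arxiv_info
# accessor shipped with bibolamazi's filter utilities) defines its own public API.
class _ExampleInfoCacheAccessor(BibUserCacheAccessor):
    def __init__(self, bibolamazifile, **kwargs):
        super().__init__(cache_name='example_info',
                         bibolamazifile=bibolamazifile,
                         **kwargs)

    def initialize(self, cache_obj):
        # Make sure our sub-cache exists, then install at least an expiry-based
        # validator on it.  Always re-fetch the dictionary through cacheDic()
        # afterwards, since validation may have replaced the object.
        cache_obj.cacheFor(self.cacheName())
        cache_obj.installCacheExpirationChecker(self.cacheName())

    def fetchInfo(self, entry_key, fetch_fn):
        # Hypothetical public API: populate the cache for `entry_key` using the
        # caller-provided `fetch_fn`, raising BibUserCacheError if the data cannot
        # be (re)generated at all.
        cachedic = self.cacheDic()
        if entry_key not in cachedic:
            try:
                cachedic[entry_key] = fetch_fn(entry_key)
            except Exception as e:
                raise BibUserCacheError(self.cacheName(),
                                        "Can't fetch data for %s: %s" % (entry_key, e))
        return cachedic[entry_key]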