Added periodic idmapper cache size check along with conditional flush mechanism. Ran a lot of tests and stress tests to get statistics on usage.

2014-05-18 18:28:10 +02:00 · 2014-05-18 18:28:10 +02:00 · f2c75bd0f6
commit f2c75bd0f6
parent a143733ccf
8 changed files with 145 additions and 33 deletions
--- a/src/commands/default/system.py
+++ b/src/commands/default/system.py
@ -13,7 +13,7 @@ import twisted
 from time import time as timemeasure
 from django.conf import settings
-from src.server.caches import get_cache_sizes
+#from src.server.caches import get_cache_sizes
 from src.server.sessionhandler import SESSIONS
 from src.scripts.models import ScriptDB
 from src.objects.models import ObjectDB
@ -678,8 +678,8 @@ class CmdServerLoad(MuxCommand):
        loadavg = os.getloadavg()
        psize = _resource.getpagesize()
        pid = os.getpid()
-        rmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "rss")).read()) / 1024.0  # resident memory
+        rmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "rss")).read()) / 1000.0  # resident memory
-        vmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "vsz")).read()) / 1024.0  # virtual memory
+        vmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "vsz")).read()) / 1000.0  # virtual memory
        pmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "%mem")).read())  # percent of resident memory to total
        rusage = resource.getrusage(resource.RUSAGE_SELF)
--- a/src/scripts/scripts.py
+++ b/src/scripts/scripts.py
@ -19,8 +19,6 @@ __all__ = ["Script", "DoNothing", "CheckSessions",
 _GA = object.__getattribute__
 _SESSIONS = None
 # attr-cache size in MB
 _ATTRIBUTE_CACHE_MAXSIZE = settings.ATTRIBUTE_CACHE_MAXSIZE
 class ExtendedLoopingCall(LoopingCall):
    """
@ -155,7 +153,7 @@ class ScriptBase(TypeClass):
    def _step_errback(self, e):
        "callback for runner errors"
        cname = self.__class__.__name__
-        estring = _("Script %(key)s(#%(dbid)i) of type '%(cname)s': at_repeat() error '%(err)s'.") % \
+        estring = _("Script %(key)s(#%(dbid)s) of type '%(cname)s': at_repeat() error '%(err)s'.") % \
                          {"key": self.key, "dbid": self.dbid, "cname": cname,
                           "err": e.getErrorMessage()}
        try:
@ -538,6 +536,24 @@ class CheckSessions(Script):
        #print "ValidateSessions run"
        _SESSIONS.validate_sessions()
 _FLUSH_CACHE = None
 _IDMAPPER_CACHE_MAX_MEMORY = settings.IDMAPPER_CACHE_MAXSIZE
 class ValidateIdmapperCache(Script):
    """
    Check memory use of idmapper cache
    """
    def at_script_creation(self):
        self.key = "sys_cache_validate"
        self.desc = _("Restrains size of idmapper cache.")
        self.interval = 61 * 5 # staggered compared to session check
        self.persistent = True
    def at_repeat(self):
        "Called every ~5 mins"
        global _FLUSH_CACHE
        if not _FLUSH_CACHE:
            from src.utils.idmapper.base import conditional_flush as _FLUSH_CACHE
        _FLUSH_CACHE(_IDMAPPER_CACHE_MAX_MEMORY)
 class ValidateScripts(Script):
    "Check script validation regularly"
--- a/src/server/initial_setup.py
+++ b/src/server/initial_setup.py
@ -161,7 +161,10 @@ def create_system_scripts():
    script2 = create.create_script(scripts.ValidateScripts)
    # update the channel handler to make sure it's in sync
    script3 = create.create_script(scripts.ValidateChannelHandler)
-    if not script1 or not script2 or not script3:
+    # flush the idmapper cache
    script4 = create.create_script(scripts.ValidateIdmapperCache)
    if not script1 or not script2 or not script3 or not script4:
        print " Error creating system scripts."
--- a/src/server/server.py
+++ b/src/server/server.py
@ -379,9 +379,6 @@ EVENNIA = Evennia(application)
 print '-' * 50
 print ' %(servername)s Server (%(version)s) started.' % {'servername': SERVERNAME, 'version': VERSION}
 if not settings.GAME_CACHE_TYPE:
    print "  caching disabled"
 if AMP_ENABLED:
    # The AMP protocol handles the communication between
--- a/src/settings_default.py
+++ b/src/settings_default.py
@ -125,18 +125,23 @@ ENCODINGS = ["utf-8", "latin-1", "ISO-8859-1"]
 AMP_HOST = 'localhost'
 AMP_PORT = 5000
 AMP_INTERFACE = '127.0.0.1'
-# Caching speeds up all forms of database access, often considerably. There
+# Database objects are cached in what is known as the idmapper. The idmapper
-# are (currently) only two settings, "local" or None, the latter of which turns
+# caching results in a massive speedup of the server (since it dramatically
-# off all caching completely. Local caching stores data in the process. It's
+# limits the number of database accesses needed) and also allows for
-# very fast but will go out of sync if more than one process writes to the
+# storing temporary data on objects. It is however also the main memory
-# database (such as when using procpool or an extensice web precense).
+# consumer of Evennia. With this setting the cache can be capped and
-GAME_CACHE_TYPE = "local"
+# flushed when it reaches a certain size.
-# Attributes on objects are cached aggressively for speed. If the number of
+# Empirically, N_objects_in_cache ~ (RMEM - 35) / 0.0157.
-# objects is large (and their attributes are often accessed) this can use up
+#  mem(MB)   |  objs in cache   ||   mem(MB)   |   objs in cache
-# a lot of memory. So every now and then Evennia checks the size of this
+#      50    |       ~1000      ||      800    |     ~49 000
-# cache and resets it if it's too big. This variable sets the maximum
+#     100    |       ~4000      ||     1200    |     ~75 000
-# size (in MB).
+#     200    |      ~10 000     ||     1600    |    ~100 000
-ATTRIBUTE_CACHE_MAXSIZE = 100
+#     500    |      ~30 000     ||     2000    |    ~125 000
 # Note that the exact memory usage is not exact, so err on the side of
 # caution if running on a server with limited memory.  How many objects
 # need to be in memory at any given time depends very much on each game
 # and how many players are active. Using None disables the cache cap.
 IDMAPPER_CACHE_MAXSIZE = 200
 ######################################################################
 # Evennia Database config
--- a/src/utils/dummyrunner/dummyrunner_actions.py
+++ b/src/utils/dummyrunner/dummyrunner_actions.py
@ -210,6 +210,4 @@ def c_moves(client):
 ## "heavy digger memory tester" definition
 ACTIONS = (c_login,
           c_logout,
-           (0.1, c_looks),
+           (1.0, c_digs))
           (0.1, c_creates_obj),
           (0.8, c_digs))
--- a/src/utils/dummyrunner/memplot.py
+++ b/src/utils/dummyrunner/memplot.py
@ -49,12 +49,12 @@ if __name__ == "__main__":
    vmem = data[:,2]
    nobj = data[:,3]
-    # count total amount of objects
+    # calculate derivative of obj creation
-    #ntot = data[:,3].copy()
+    oderiv = (0.5*(nobj[2:] - nobj[:-2]) / (secs[2:] - secs[:-2])).copy()
    fig = pp.figure()
    ax1 = fig.add_subplot(111)
-    ax1.set_title("Memory usage")
+    ax1.set_title("Memory usage (200 bots, auto-flush at RMEM ~ 200MB)")
    ax1.set_xlabel("Time (mins)")
    ax1.set_ylabel("Memory usage (MB)")
    ax1.plot(secs, rmem, "r", label="RMEM", lw=2)
@ -63,9 +63,36 @@ if __name__ == "__main__":
    ax2 = ax1.twinx()
    ax2.plot(secs, nobj, "g--", label="objs in cache", lw=2)
-    #ax2.plot(secs, ntot, "r--", label="objs total", lw=2)
+    #ax2.plot(secs[:-2], oderiv/60.0, "g--", label="Objs/second", lw=2)
    ax2.set_ylabel("Number of objects")
    ax2.legend(loc="lower right")
-    ax2.annotate("idmapper\nflush", xy=(70,480))
+    #ax2.annotate("All bots\nfinished\nconnecting", xy=(10, 16900))
-    ax2.annotate("@reload", xy=(185,600))
+    #ax2.annotate("idmapper\nflush", xy=(70,480))
    #ax2.annotate("@reload", xy=(185,600))
 #    # plot mem vs cachesize
 #    nobj, rmem, vmem = nobj[:262].copy(), rmem[:262].copy(), vmem[:262].copy()
 #
 #    fig = pp.figure()
 #    ax1 = fig.add_subplot(111)
 #    ax1.set_title("Memory usage per cache size")
 #    ax1.set_xlabel("Cache size (number of objects)")
 #    ax1.set_ylabel("Memory usage (MB)")
 #    ax1.plot(nobj, rmem, "r", label="RMEM", lw=2)
 #    ax1.plot(nobj, vmem, "b", label="VMEM", lw=2)
 #
 ##    # empirical estimate of memory usage: rmem = 35.0 + 0.0157 * Ncache
 ##    # Ncache = int((rmem - 35.0) / 0.0157)  (rmem in MB)
 #
 #    rderiv_aver = 0.0157
 #    fig = pp.figure()
 #    ax1 = fig.add_subplot(111)
 #    ax1.set_title("Relation between memory and cache size")
 #    ax1.set_xlabel("Memory usage (MB)")
 #    ax1.set_ylabel("Idmapper Cache Size (number of objects)")
 #    rmem = numpy.linspace(35, 2000, 2000)
 #    nobjs = numpy.array([int((mem - 35.0) / 0.0157) for mem in rmem])
 #    ax1.plot(rmem, nobjs, "r", lw=2)
    pp.show()
--- a/src/utils/idmapper/base.py
+++ b/src/utils/idmapper/base.py
@ -7,7 +7,7 @@ leave caching unexpectedly (no use of WeakRefs).
 Also adds cache_size() for monitoring the size of the cache.
 """
-import os, threading, gc
+import os, threading, gc, time
 #from twisted.internet import reactor
 #from twisted.internet.threads import blockingCallFromThread
 from weakref import WeakValueDictionary
@ -15,10 +15,13 @@ from twisted.internet.reactor import callFromThread
 from django.core.exceptions import ObjectDoesNotExist, FieldError
 from django.db.models.base import Model, ModelBase
 from django.db.models.signals import post_save, pre_delete, post_syncdb
 from src.utils import logger
 from src.utils.utils import dbref, get_evennia_pids, to_str
 from manager import SharedMemoryManager
 AUTO_FLUSH_MIN_INTERVAL = 60.0 * 5 # at least 5 mins between cache flushes
 _GA = object.__getattribute__
 _SA = object.__setattr__
 _DA = object.__delattr__
@ -36,7 +39,6 @@ _SERVER_PID, _PORTAL_PID = get_evennia_pids()
 _IS_SUBPROCESS = (_SERVER_PID and _PORTAL_PID) and not _SELF_PID in (_SERVER_PID, _PORTAL_PID)
 _IS_MAIN_THREAD = threading.currentThread().getName() == "MainThread"
 class SharedMemoryModelBase(ModelBase):
    # CL: upstream had a __new__ method that skipped ModelBase's __new__ if
    # SharedMemoryModelBase was not in the model class's ancestors. It's not
@ -225,6 +227,7 @@ class SharedMemoryModel(Model):
        Method to store an instance in the cache.
        """
        if instance._get_pk_val() is not None:
            cls.__instance_cache__[instance._get_pk_val()] = instance
    cache_instance = classmethod(cache_instance)
@ -301,6 +304,7 @@ class WeakSharedMemoryModelBase(SharedMemoryModelBase):
        cls.__instance_cache__ = WeakValueDictionary()
        cls._idmapper_recache_protection = False
 class WeakSharedMemoryModel(SharedMemoryModel):
    """
    Uses a WeakValue dictionary for caching instead of a regular one
@ -309,6 +313,7 @@ class WeakSharedMemoryModel(SharedMemoryModel):
    class Meta:
        abstract = True
 def flush_cache(**kwargs):
    """
    Flush idmapper cache. When doing so the cache will
@ -346,6 +351,7 @@ def flush_cached_instance(sender, instance, **kwargs):
    sender.flush_cached_instance(instance)
 pre_delete.connect(flush_cached_instance)
 def update_cached_instance(sender, instance, **kwargs):
    """
    Re-cache the given instance in the idmapper cache
@ -355,6 +361,66 @@ def update_cached_instance(sender, instance, **kwargs):
    sender.cache_instance(instance)
 post_save.connect(update_cached_instance)
 LAST_FLUSH = None
 def conditional_flush(max_rmem, force=False):
    """
    Flush the cache if the estimated memory usage exceeds max_rmem.
    The flusher has a timeout to avoid flushing over and over
    in particular situations (this means that for some setups
    the memory usage will exceed the requirement and a server with
    more memory is probably required for the given game)
    force - forces a flush, regardless of timeout.
    """
    global LAST_FLUSH
    def mem2cachesize(desired_rmem):
        """
        Estimate the size of the idmapper cache based on the memory
        desired. This is used to optionally cap the cache size.
        desired_rmem - memory in MB (minimum 50MB)
        The formula is empirically estimated from usage tests (Linux)
        and is
            Ncache = RMEM - 35.0 / 0.0157
        where RMEM is given in MB and Ncache is the size of the cache
        for this memory usage. VMEM tends to be about 100MB higher
        than RMEM for large memory usage.
        """
        vmem = max(desired_rmem, 50.0)
        Ncache = int(abs(float(vmem) - 35.0) / 0.0157)
        return Ncache
    if not max_rmem:
        # auto-flush is disabled
        return
    now = time.time()
    if not LAST_FLUSH:
        # server is just starting
        LAST_FLUSH = now
        return
    if ((now - LAST_FLUSH) < AUTO_FLUSH_MIN_INTERVAL) and not force:
        # too soon after last flush.
        logger.log_warnmsg("Warning: Idmapper flush called more than "\
                            "once in %s min interval. Check memory usage." % (AUTO_FLUSH_MIN_INTERVAL/60.0))
        return
    # check actual memory usage
    Ncache_max = mem2cachesize(max_rmem)
    Ncache, _ = cache_size()
    actual_rmem = float(os.popen('ps -p %d -o %s | tail -1' % (os.getpid(), "rss")).read()) / 1000.0  # resident memory
    if Ncache >= Ncache_max and actual_rmem > max_rmem * 0.9:
        # flush cache when number of objects in cache is big enough and our
        # actual memory use is within 10% of our set max
        flush_cache()
        LAST_FLUSH = now
 def cache_size(mb=True):
    """
    Calculate statistics about the cache.