diff --git a/python/dune/perftool/generation/__init__.py b/python/dune/perftool/generation/__init__.py
index 17f9113b09abf154f8896f5a8c081804772629fa..c141e95df9792701e521b67f6ba149d1fc3f57c2 100644
--- a/python/dune/perftool/generation/__init__.py
+++ b/python/dune/perftool/generation/__init__.py
@@ -37,6 +37,7 @@ from dune.perftool.generation.loopy import (barrier,
                                             globalarg,
                                             iname,
                                             instruction,
+                                            kernel_cached,
                                             noop_instruction,
                                             silenced_warning,
                                             temporary_variable,
diff --git a/python/dune/perftool/generation/cache.py b/python/dune/perftool/generation/cache.py
index 53c8eada2d1722bc554bd802527264facbad8ed9..bcbc8b1e82a27e3612452d2b68217f03c8d8b341 100644
--- a/python/dune/perftool/generation/cache.py
+++ b/python/dune/perftool/generation/cache.py
@@ -1,6 +1,7 @@
 """ This module provides the memoization infrastructure for code
 generating functions.
 """
+from dune.perftool.generation.context import get_global_context_value
 from dune.perftool.generation.counter import get_counter
 
 # Store a global list of generator functions
@@ -51,13 +52,15 @@ class _RegisteredFunction(object):
                  cache_key_generator=lambda *a, **kw: a,
                  counted=False,
                  on_store=lambda x: x,
-                 item_tags=()
+                 item_tags=(),
+                 context_tags=(),
                  ):
         self.func = func
         self.cache_key_generator = cache_key_generator
         self.counted = counted
         self.on_store = on_store
         self.item_tags = item_tags
+        self.context_tags = context_tags
 
         # Initialize the memoization cache
         self._memoize_cache = {}
@@ -81,9 +84,14 @@ class _RegisteredFunction(object):
     def __call__(self, *args, **kwargs):
         # Get the cache key from the given arguments
         cache_key = self.cache_key_generator(*args, **kwargs)
+
         # Make sure that all keyword arguments have vanished from the cache_args
         assert (lambda *a, **k: len(k) == 0)(cache_key)
 
+        # Add any context tags to the cache key
+        context_key = tuple(get_global_context_value(t, None) for t in self.context_tags)
+        cache_key = (cache_key, context_key)
+
         # check whether we have a cache hit
         if cache_key not in self._memoize_cache:
             # evaluate the original function
@@ -128,10 +136,17 @@ def generator_factory(**factory_kwargs):
         items automatically turns to a tuple with the counttag being the first entry.
     no_deco : bool
         Instead of a decorator, return a function that uses identity as a body.
+    context_tags: tuple, str
+        A single tag or tuple thereof, that will be added to the cache key. This
+        feature can be used to maintain multiple sets of memoized function evaluations,
+        for example if you generate multiple loopy kernels at the same time. The
+        given strings are used to look up in the global context manager for a tag.
     """
     # Tuplize the item_tags parameter
     if "item_tags" in factory_kwargs and isinstance(factory_kwargs["item_tags"], str):
         factory_kwargs["item_tags"] = (factory_kwargs["item_tags"],)
+    if "context_tags" in factory_kwargs and isinstance(factory_kwargs["context_tags"], str):
+        factory_kwargs["context_tags"] = (factory_kwargs["context_tags"],)
 
     no_deco = factory_kwargs.pop("no_deco", False)
 
diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py
index 8e7e835137e0c9ab987db1e7e1c3b7bab1abdf6a..e6b214b9927578c449c760476d2d9fb00c0760d7 100644
--- a/python/dune/perftool/generation/loopy.py
+++ b/python/dune/perftool/generation/loopy.py
@@ -6,15 +6,15 @@ from dune.perftool.generation import (get_counter,
                                       no_caching,
                                       preamble,
                                       )
-from dune.perftool.generation.context import get_global_context_value
 from dune.perftool.error import PerftoolLoopyError
 
-import loopy
-import numpy
+import loopy as lp
+import numpy as np
 
-iname = generator_factory(item_tags=("iname",))
-function_mangler = generator_factory(item_tags=("mangler",))
-silenced_warning = generator_factory(item_tags=("silenced_warning",), no_deco=True)
+iname = generator_factory(item_tags=("iname",), context_tags="kernel")
+function_mangler = generator_factory(item_tags=("mangler",), context_tags="kernel")
+silenced_warning = generator_factory(item_tags=("silenced_warning",), no_deco=True, context_tags="kernel")
+kernel_cached = generator_factory(item_tags=("default_cached",), context_tags="kernel")
 
 
 class DuneGlobalArg(loopy.GlobalArg):
@@ -22,6 +22,7 @@ class DuneGlobalArg(loopy.GlobalArg):
 
 
 @generator_factory(item_tags=("argument", "globalarg"),
+                   context_tags="kernel",
                    cache_key_generator=lambda n, **kw: n)
 def globalarg(name, shape=loopy.auto, managed=True, **kw):
     if isinstance(shape, str):
@@ -31,21 +32,23 @@ def globalarg(name, shape=loopy.auto, managed=True, **kw):
 
 
 @generator_factory(item_tags=("argument", "constantarg"),
+                   context_tags="kernel",
                    cache_key_generator=lambda n, **kw: n)
-def constantarg(name, shape=loopy.auto, **kw):
+def constantarg(name, shape=None, **kw):
     if isinstance(shape, str):
         shape = (shape,)
-    dtype = kw.pop("dtype", numpy.float64)
-    return loopy.GlobalArg(name, dtype=dtype, shape=shape, **kw)
+    dtype = kw.pop("dtype", np.float64)
+    return lp.GlobalArg(name, dtype=dtype, shape=shape, **kw)
 
 
 @generator_factory(item_tags=("argument", "valuearg"),
+                   context_tags="kernel",
                    cache_key_generator=lambda n, **kw: n)
 def valuearg(name, **kw):
-    return loopy.ValueArg(name, **kw)
+    return lp.ValueArg(name, **kw)
 
 
-@generator_factory(item_tags=("domain",))
+@generator_factory(item_tags=("domain",), context_tags="kernel")
 def domain(iname, shape):
     if isinstance(shape, str):
         valuearg(shape)
@@ -56,7 +59,9 @@ def get_temporary_name():
     return 'expr_{}'.format(str(get_counter('__temporary').zfill(4)))
 
 
-@generator_factory(item_tags=("temporary",), cache_key_generator=lambda n, **kw: n)
+@generator_factory(item_tags=("temporary",),
+                   context_tags="kernel",
+                   cache_key_generator=lambda n, **kw: n)
 def temporary_variable(name, **kwargs):
     from dune.perftool.loopy.temporary import DuneTemporaryVariable
     return DuneTemporaryVariable(name, scope=loopy.temp_var_scope.PRIVATE, **kwargs)
@@ -69,6 +74,7 @@ def temporary_variable(name, **kwargs):
 
 
 @generator_factory(item_tags=("instruction", "cinstruction"),
+                   context_tags="kernel",
                    cache_key_generator=lambda *a, **kw: kw['code'],
                    )
 def c_instruction_impl(**kw):
@@ -77,24 +83,26 @@ def c_instruction_impl(**kw):
     kw['assignees'] = frozenset(Variable(i) for i in kw['assignees'])
     inames = kw.pop('inames', kw.get('forced_iname_deps', []))
 
-    return loopy.CInstruction(inames, **kw)
+    return lp.CInstruction(inames, **kw)
 
 
 @generator_factory(item_tags=("instruction", "exprinstruction"),
+                   context_tags="kernel",
                    cache_key_generator=lambda *a, **kw: kw['expression'],
                    )
 def expr_instruction_impl(**kw):
     if 'assignees' in kw:
         from pymbolic.primitives import Variable
         kw['assignees'] = frozenset(Variable(i) for i in kw['assignees'])
-    return loopy.ExpressionInstruction(**kw)
+    return lp.ExpressionInstruction(**kw)
 
 
 @generator_factory(item_tags=("instruction", "callinstruction"),
+                   context_tags="kernel",
                    cache_key_generator=lambda *a, **kw: kw['expression'],
                    )
 def call_instruction_impl(**kw):
-    return loopy.CallInstruction(**kw)
+    return lp.CallInstruction(**kw)
 
 
 def _insn_cache_key(code=None, expression=None, **kwargs):
@@ -105,7 +113,9 @@ def _insn_cache_key(code=None, expression=None, **kwargs):
     raise ValueError("Please specify either code or expression for instruction!")
 
 
-@generator_factory(item_tags=("insn_id",), cache_key_generator=_insn_cache_key)
+@generator_factory(item_tags=("insn_id",),
+                   context_tags="kernel",
+                   cache_key_generator=_insn_cache_key)
 def instruction(code=None, expression=None, **kwargs):
     assert (code is not None) or (expression is not None)
     assert not ((code is not None) and (expression is not None))
@@ -127,12 +137,15 @@ def instruction(code=None, expression=None, **kwargs):
     return id
 
 
-@generator_factory(item_tags=("instruction",), cache_key_generator=lambda **kw: kw['id'])
+@generator_factory(item_tags=("instruction",),
+                   context_tags="kernel",
+                   cache_key_generator=lambda **kw: kw['id'])
 def noop_instruction(**kwargs):
-    return loopy.NoOpInstruction(**kwargs)
+    return lp.NoOpInstruction(**kwargs)
 
 
 @generator_factory(item_tags=("transformation",),
+                   context_tags="kernel",
                    cache_key_generator=no_caching,
                    )
 def transform(trafo, *args):
diff --git a/python/dune/perftool/pdelab/argument.py b/python/dune/perftool/pdelab/argument.py
index fcea7ca6612b01c0f6f236a96c9b0a9b6aaf8562..f948414ab1728b8266a35f0d2ea48853f5e487c0 100644
--- a/python/dune/perftool/pdelab/argument.py
+++ b/python/dune/perftool/pdelab/argument.py
@@ -6,13 +6,13 @@ Namely:
 """
 
 from dune.perftool.options import get_option
-from dune.perftool.generation import (cached,
-                                      domain,
+from dune.perftool.generation import (domain,
                                       function_mangler,
                                       iname,
                                       globalarg,
                                       valuearg,
-                                      get_global_context_value
+                                      get_global_context_value,
+                                      kernel_cached,
                                       )
 from dune.perftool.pdelab.index import name_index
 from dune.perftool.pdelab.basis import (evaluate_coefficient,
@@ -131,7 +131,7 @@ def name_applycontainer(restriction):
     return name
 
 
-@cached
+@kernel_cached
 def pymbolic_coefficient(container, lfs, index):
     # TODO introduce a proper type for local function spaces!
     if isinstance(lfs, str):
diff --git a/python/dune/perftool/pdelab/basis.py b/python/dune/perftool/pdelab/basis.py
index cdbdbd3557b32949d61045383e6c5820d9d60af8..b0d04c92d24f61ccdf7e542afb2bc1077fe93c29 100644
--- a/python/dune/perftool/pdelab/basis.py
+++ b/python/dune/perftool/pdelab/basis.py
@@ -1,12 +1,12 @@
 """ Generators for basis evaluations """
 
 from dune.perftool.generation import (backend,
-                                      cached,
                                       class_member,
                                       generator_factory,
                                       get_backend,
                                       include_file,
                                       instruction,
+                                      kernel_cached,
                                       preamble,
                                       temporary_variable,
                                       )
@@ -65,7 +65,7 @@ def declare_cache_temporary(element, restriction, which):
 
 
 @backend(interface="evaluate_basis")
-@cached
+@kernel_cached
 def evaluate_basis(leaf_element, name, restriction):
     lfs = name_leaf_lfs(leaf_element, restriction)
     temporary_variable(name, shape=(name_lfs_bound(lfs),), decl_method=declare_cache_temporary(leaf_element, restriction, 'Function'))
@@ -93,7 +93,7 @@ def pymbolic_basis(leaf_element, restriction, number, context=''):
 
 
 @backend(interface="evaluate_grad")
-@cached
+@kernel_cached
 def evaluate_reference_gradient(leaf_element, name, restriction):
     lfs = name_leaf_lfs(leaf_element, restriction)
     temporary_variable(name, shape=(name_lfs_bound(lfs), 1, name_dimension()), decl_method=declare_cache_temporary(leaf_element, restriction, 'Jacobian'))
@@ -129,7 +129,7 @@ def shape_as_pymbolic(shape):
     return tuple(_shape_as_pymbolic(s) for s in shape)
 
 
-@cached
+@kernel_cached
 def evaluate_coefficient(element, name, container, restriction, component):
     from ufl.functionview import select_subelement
     sub_element = select_subelement(element, component)
@@ -165,7 +165,7 @@ def evaluate_coefficient(element, name, container, restriction, component):
                 )
 
 
-@cached
+@kernel_cached
 def evaluate_coefficient_gradient(element, name, container, restriction, component):
     # First we determine the rank of the tensor we are talking about
     from ufl.functionview import select_subelement
diff --git a/python/dune/perftool/pdelab/geometry.py b/python/dune/perftool/pdelab/geometry.py
index 9d8a1c28d1bfbd50af29e9c3f6c850b60a949de1..7afe2854f597d1a4207c1b8f20088bd11c0b9be0 100644
--- a/python/dune/perftool/pdelab/geometry.py
+++ b/python/dune/perftool/pdelab/geometry.py
@@ -1,13 +1,13 @@
 from dune.perftool.ufl.modified_terminals import Restriction
 from dune.perftool.pdelab.restriction import restricted_name
 from dune.perftool.generation import (backend,
-                                      cached,
                                       domain,
                                       get_backend,
                                       get_global_context_value,
                                       globalarg,
                                       iname,
                                       include_file,
+                                      kernel_cached,
                                       preamble,
                                       temporary_variable,
                                       valuearg,
@@ -276,7 +276,7 @@ def type_jacobian_inverse_transposed(restriction):
     return "typename {}::JacobianInverseTransposed".format(geo)
 
 
-@cached
+@kernel_cached
 def define_jacobian_inverse_transposed_temporary(restriction):
     @preamble
     def _define_jacobian_inverse_transposed_temporary(name, shape, shape_impl):
diff --git a/python/dune/perftool/pdelab/index.py b/python/dune/perftool/pdelab/index.py
index cc87edfdcb3ff2be39ec3fe4fa136c49e389b5cf..73bf17eb655f95dbe59db620cca367b736009ea8 100644
--- a/python/dune/perftool/pdelab/index.py
+++ b/python/dune/perftool/pdelab/index.py
@@ -1,10 +1,10 @@
-from dune.perftool.generation import cached
+from dune.perftool.generation import kernel_cached
 
 from ufl.classes import MultiIndex, Index
 
 
 # Now define some commonly used generators that do not fall into a specific category
-@cached
+@kernel_cached
 def name_index(index):
     if isinstance(index, Index):
         # This failed for index > 9 because ufl placed curly brackets around
diff --git a/python/dune/perftool/pdelab/parameter.py b/python/dune/perftool/pdelab/parameter.py
index 602182163e0ad531e7992a04525a9cc36acd4802..46946434ba75ca652c59fc424dbf522ca2fd1abc 100644
--- a/python/dune/perftool/pdelab/parameter.py
+++ b/python/dune/perftool/pdelab/parameter.py
@@ -1,12 +1,12 @@
 """ Generators for parameter functions """
 
-from dune.perftool.generation import (cached,
-                                      class_basename,
+from dune.perftool.generation import (class_basename,
                                       class_member,
                                       constructor_parameter,
                                       generator_factory,
                                       get_backend,
                                       initializer_list,
+                                      kernel_cached,
                                       preamble,
                                       temporary_variable
                                       )
@@ -206,7 +206,7 @@ def construct_nested_fieldvector(t, shape):
     return 'Dune::FieldVector<{}, {}>'.format(construct_nested_fieldvector(t, shape[1:]), shape[0])
 
 
-@cached
+@kernel_cached
 def cell_parameter_function(name, expr, restriction, cellwise_constant, t='double'):
     shape = expr.ufl_element().value_shape()
     shape_impl = ('fv',) * len(shape)
@@ -218,7 +218,7 @@ def cell_parameter_function(name, expr, restriction, cellwise_constant, t='doubl
         evaluate_cell_parameter_function(name, restriction)
 
 
-@cached
+@kernel_cached
 def intersection_parameter_function(name, expr, cellwise_constant, t='double'):
     shape = expr.ufl_element().value_shape()
     shape_impl = ('fv',) * len(shape)
diff --git a/python/dune/perftool/pdelab/quadrature.py b/python/dune/perftool/pdelab/quadrature.py
index 72a42aa73ca20e0543f390ccbd193b5fc2895943..8ee1402c1ef5775982e0085a72b568b70fad9f38 100644
--- a/python/dune/perftool/pdelab/quadrature.py
+++ b/python/dune/perftool/pdelab/quadrature.py
@@ -1,7 +1,6 @@
 import numpy
 
 from dune.perftool.generation import (backend,
-                                      cached,
                                       class_member,
                                       domain,
                                       get_backend,
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index 77edc9e0151799baa1f9c60c6f7355d1dbcc2006..0af2d38c0c20ddaaf11cb5569a3a24355df1e8d3 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -4,12 +4,12 @@ NB: Basis evaluation is only needed for the trial function argument in jacobians
 multiplication with the test function is part of the sum factorization kernel.
 """
 from dune.perftool.generation import (backend,
-                                      cached,
                                       domain,
                                       get_counter,
                                       get_global_context_value,
                                       iname,
                                       instruction,
+                                      kernel_cached,
                                       temporary_variable,
                                       )
 from dune.perftool.sumfact.amatrix import (AMatrix,
@@ -39,7 +39,7 @@ def name_sumfact_base_buffer():
     return name
 
 
-@cached
+@kernel_cached
 def sumfact_evaluate_coefficient_gradient(element, name, restriction, component):
     # Get a temporary for the gradient
     from ufl.functionview import select_subelement
@@ -97,7 +97,7 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component)
                     )
 
 
-@cached
+@kernel_cached
 def pymbolic_trialfunction_gradient(element, restriction, component):
     rawname = "gradu" + "_".join(str(c) for c in component)
     name = restricted_name(rawname, restriction)
@@ -108,7 +108,7 @@ def pymbolic_trialfunction_gradient(element, restriction, component):
     return Variable(name)
 
 
-@cached
+@kernel_cached
 def pymbolic_trialfunction(element, restriction, component):
     theta = name_theta()
     rows = quadrature_points_per_direction()
@@ -151,7 +151,7 @@ def lfs_inames(element, restriction, number=1, context=''):
 
 
 @backend(interface="evaluate_basis")
-@cached
+@kernel_cached
 def evaluate_basis(element, name, restriction):
     temporary_variable(name, shape=())
     theta = name_theta()
@@ -183,7 +183,7 @@ def pymbolic_basis(element, restriction, number):
 
 
 @backend(interface="evaluate_grad")
-@cached
+@kernel_cached
 def evaluate_reference_gradient(element, name, restriction):
     from dune.perftool.pdelab.geometry import name_dimension
     temporary_variable(