diff --git a/README.md b/README.md
index 8896b12177ec1532e69d64f1748b81f140123643..5856f3514a91a64db9734af6cf7825952b3935c0 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,25 @@ ctest
 
 Note that this takes quite a while.
 
+## Building and Running dune-codegen in an offline environment
+
+dune-codegen relies on installing Python packages into self-contained environments
+during its configuration and build process. In order to do this in an offline
+environment, we recommend using the tool `devpi`. One of its use cases is to provide
+a local mirror for the Python package index. A quickstart tutorial for this use case
+is available [5]. It boils down to the following:
+
+* Installing the `devpi-server` package through your favorite method
+* Setting up a local server with `devpi-server --init`
+* Making sure it is running in the background (explicitly with `devpi-server --start/stop` or by configuring a systemd service).
+* Setting the environment variable `PIP_INDEX_URL` to its index, e.g. by adding this line to your `~/.bashrc` (where `http://localhost:3141` might differ depending on your devpi configuration):
+```
+export PIP_INDEX_URL=http://localhost:3141/root/pypi/+simple/
+```
+
+At first installation, the locally mirrored package index will access PyPI.
+Later on, it will install packages from its local cache.
+
 ## Links
 
 [0]: https://git-lfs.github.com/
@@ -112,3 +131,4 @@ Note that this takes quite a while.
 [2]: https://gitlab.dune-project.org/quality/dune-testtools
 [3]: http://isl.gforge.inria.fr/
 [4]: https://www.dune-project.org/doc/installation/
+[5]: https://github.com/devpi/devpi/blob/master/doc/quickstart-pypimirror.rst
diff --git a/cmake/modules/DuneCodegenMacros.cmake b/cmake/modules/DuneCodegenMacros.cmake
index da3225866785c75a8cf73e6aa78b6e3e0eea42f9..91e48d73f4c79ccc6a7258071dbbfeaf8a27a1c6 100644
--- a/cmake/modules/DuneCodegenMacros.cmake
+++ b/cmake/modules/DuneCodegenMacros.cmake
@@ -116,6 +116,11 @@ function(dune_add_generated_executable)
     message(FATAL_ERROR "Unrecognized arguments in dune_add_generated_executable. This usually indicates a typo.")
   endif()
 
+  set(MPI_OPTION "0")
+  if(MPI_FOUND)
+    set(MPI_OPTION "1")
+  endif()
+
   # Apply defaults and enforce requirements
   if(NOT GEN_TARGET)
     message(FATAL_ERROR "Need to specify the TARGET parameter for dune_add_generated_executable")
@@ -139,6 +144,7 @@ function(dune_add_generated_executable)
                                --target-name ${GEN_TARGET}
                                --driver-file ${GEN_SOURCE}
                                --project-basedir ${CMAKE_BINARY_DIR}
+                               --with-mpi ${MPI_OPTION}
                                ${GEN_FORM_COMPILER_ARGS}
                        DEPENDS ${GEN_UFLFILE} ${UFL2PDELAB_SOURCES} ${GEN_DEPENDS} ${DUNE_CODEGEN_ADDITIONAL_PYTHON_SOURCES}
                        COMMENT "Generating driver for the target ${GEN_TARGET}"
@@ -199,6 +205,7 @@ function(dune_add_generated_executable)
                                --ini-file ${GEN_INIFILE}
                                --target-name ${GEN_TARGET}
                                --operator-to-build ${op}
+                               --with-mpi ${MPI_OPTION}
                                ${ANALYZE_GRID_OPTION}
                        DEPENDS ${GEN_UFLFILE} ${UFL2PDELAB_SOURCES} ${GEN_DEPENDS} ${DUNE_CODEGEN_ADDITIONAL_PYTHON_SOURCES} ${ANALYZE_GRID_FILE}
                        COMMENT "Generating operator file ${depdata___${op}} for the target ${GEN_TARGET}"
diff --git a/python/dune/codegen/generation/__init__.py b/python/dune/codegen/generation/__init__.py
index bed0256407b7259bab61b6e932c4a17761097e75..97090e18852359b10d1a2d3f74a268a3abac60f1 100644
--- a/python/dune/codegen/generation/__init__.py
+++ b/python/dune/codegen/generation/__init__.py
@@ -24,6 +24,7 @@ from dune.codegen.generation.cpp import (base_class,
                                          preamble,
                                          post_include,
                                          template_parameter,
+                                         dump_ssc_marks
                                          )
 
 from dune.codegen.generation.hooks import (hook,
diff --git a/python/dune/codegen/generation/cpp.py b/python/dune/codegen/generation/cpp.py
index b918291067f45c5f988bc8fdcea55651d538a9db..2ea4c346590ee80ef329fdc9394b9fbc3c59db9c 100644
--- a/python/dune/codegen/generation/cpp.py
+++ b/python/dune/codegen/generation/cpp.py
@@ -55,3 +55,10 @@ def dump_accumulate_timer(name):
 @generator_factory(item_tags=("register_likwid_timers",))
 def register_liwkid_timer(name):
     return "LIKWID_MARKER_REGISTER(\"{}\");".format(name)
+
+
+@generator_factory(item_tags=("register_ssc_marks",))
+def dump_ssc_marks(name):
+    from dune.codegen.pdelab.driver.timings import get_region_marks
+    return 'std::cout << "{}: " << {} << " <--> " << {} << std::endl;'.format(name,
+                                                                              *get_region_marks(name, driver=False))
diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index 97f3ce47af0d543141672b08f1e837cbc2ff0cc6..72697492907cbf4afdfe62d1a3789606a8e2c290 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -57,7 +57,9 @@ class CodegenGlobalOptionsArray(ImmutableRecord):
     operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. CMake sets this one!!!")
     debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).")
     use_likwid = CodegenOption(default=False, helpstr="Use likwid instead of own performance measurements.")
+    use_sde = CodegenOption(default=False, helpstr="Use sde instead of own performance measurements.")
     autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).")
+    with_mpi = CodegenOption(default=True, helpstr="The module was configured with mpi")
 
     # Arguments that are mainly to be set by logic depending on other options
     max_vector_width = CodegenOption(default=256, helpstr=None)
diff --git a/python/dune/codegen/pdelab/argument.py b/python/dune/codegen/pdelab/argument.py
index 5124c77608fa78fe1ea5f72a75cc7c097ef178d9..dc1acd660c137c42be6fb65bb687bafe03fbc730 100644
--- a/python/dune/codegen/pdelab/argument.py
+++ b/python/dune/codegen/pdelab/argument.py
@@ -117,6 +117,10 @@ def type_coefficientcontainer():
     return "X"
 
 
+def type_linearizationpointcontainer():
+    return "Z"
+
+
 def name_jacobian(restriction1, restriction2):
     # Restrictions may only differ if NONE
     if (restriction1 == Restriction.NONE) or (restriction2 == Restriction.NONE):
diff --git a/python/dune/codegen/pdelab/driver/__init__.py b/python/dune/codegen/pdelab/driver/__init__.py
index 50effdf1f6730c8a6241292b3dd8375e7d472f4c..124354b50dd354263800c84d99425cc6c9a57c01 100644
--- a/python/dune/codegen/pdelab/driver/__init__.py
+++ b/python/dune/codegen/pdelab/driver/__init__.py
@@ -215,7 +215,10 @@ def name_initree():
 @preamble(section="init")
 def define_mpihelper(name):
     include_file("dune/common/parallel/mpihelper.hh", filetag="driver")
-    return "Dune::MPIHelper& {} = Dune::MPIHelper::instance(argc, argv);".format(name)
+    if get_option("with_mpi"):
+        return "Dune::MPIHelper& {} = Dune::MPIHelper::instance(argc, argv);".format(name)
+    else:
+        return "Dune::FakeMPIHelper& {} = Dune::FakeMPIHelper::instance(argc, argv);".format(name)
 
 
 def name_mpihelper():
@@ -285,6 +288,13 @@ def generate_driver():
 
     contents = []
 
+    # Assert that this program was called with ini file
+    contents += ['if (argc != 2){',
+                 '  std::cerr << "This program needs to be called with an ini file" << std::endl;',
+                 '  return 1;',
+                 '}',
+                 '']
+
     def add_section(tag, comment):
         tagcontents = [i for i in retrieve_cache_items("preamble and {}".format(tag), make_generable=True)]
         if tagcontents:
diff --git a/python/dune/codegen/pdelab/driver/timings.py b/python/dune/codegen/pdelab/driver/timings.py
index aeca64d46c73f2327b48c22f07dca7a85a044104..6bbbd07e4b7701fe516eff9509525165ac23a5eb 100644
--- a/python/dune/codegen/pdelab/driver/timings.py
+++ b/python/dune/codegen/pdelab/driver/timings.py
@@ -4,7 +4,7 @@ from dune.codegen.generation import (cached,
                                      include_file,
                                      pre_include,
                                      preamble,
-                                     )
+                                     post_include)
 from dune.codegen.options import get_option
 from dune.codegen.pdelab.driver import (get_form_ident,
                                         is_linear,
@@ -24,6 +24,9 @@ from dune.codegen.pdelab.driver.solve import (name_vector,
                                               )
 
 
+_sde_marks = {}
+
+
 @preamble(section="timings")
 def define_timing_identifier(name):
     ini = name_initree()
@@ -125,6 +128,17 @@ def local_operator_likwid():
     return "{}.register_likwid_timers();".format(lop_name)
 
 
+@preamble(section="timings")
+def local_operator_ssc_marks():
+    # Statement for the "timings" section that calls dump_ssc_marks() on the local operator object.
+    return "{}.dump_ssc_marks();".format(name_localoperator(get_form_ident()))
+
+
+def ssc_macro():
+    # Preprocessor definition of __SSC_MARK(x): inline asm moving x into ebx, followed by the bytes 100, 103, 144.
+    return '#define __SSC_MARK(x) do{ __asm__ __volatile__("movl %0, %%ebx; .byte 100, 103, 144" : :"i"(x) : "%ebx"); } while(0)'
+
+
 @cached
 def setup_timer():
     # TODO check that we are using YASP?
@@ -138,6 +152,10 @@ def setup_timer():
             logger.warning("timings: using instrumentation level >= 3 with likwid will slow down your code considerably")
             local_operator_likwid()
         finalize_likwid()
+    elif get_option("use_sde"):
+        post_include(ssc_macro(), filetag='driver')
+        if get_option('instrumentation_level') >= 3:
+            local_operator_ssc_marks()
     else:
         from dune.codegen.loopy.target import type_floatingpoint
         pre_include("#define HP_TIMER_OPCOUNTER {}".format(type_floatingpoint()), filetag="driver")
@@ -156,14 +174,26 @@ def init_region_timer(region):
     setup_timer()
     if get_option("use_likwid"):
         init_likwid_timer(region)
+    elif get_option("use_sde"):
+        pass
     else:
         from dune.codegen.generation import post_include
         post_include("HP_DECLARE_TIMER({});".format(region), filetag="driver")
 
 
+def get_region_marks(region, driver):
+    # Look up (or allocate on first use) the start/stop mark pair of a region; driver marks are scaled by 11.
+    scale = 11 if driver else 1
+    start = 2 * (len(_sde_marks) + 1)
+    return _sde_marks.setdefault(region, (start * scale, (start + 1) * scale))
+
+
 def start_region_timer(region):
     if get_option("use_likwid"):
         return ["LIKWID_MARKER_START(\"{}\");".format(region)]
+    elif get_option("use_sde"):
+        marks = get_region_marks(region, driver=True)
+        return ["__SSC_MARK(0x{});".format(marks[0])]
     else:
         return ["HP_TIMER_START({});".format(region)]
 
@@ -171,6 +201,10 @@ def start_region_timer(region):
 def stop_region_timer(region):
     if get_option("use_likwid"):
         return ["LIKWID_MARKER_STOP(\"{}\");".format(region)]
+    elif get_option("use_sde"):
+        marks = get_region_marks(region, driver=True)
+        return ["__SSC_MARK(0x{});".format(marks[1]),
+                "std::cout << \"Timed region {}: {} <--> {}\" << std::endl;".format(region, *marks)]
     else:
         timestream = name_timing_stream()
         return ["HP_TIMER_STOP({});".format(region),
@@ -207,7 +241,7 @@ def timed_region(region, actions):
 
         init_region_timer(region)
 
-        if get_option('instrumentation_level') >= 3 and not get_option('use_likwid'):
+        if get_option('instrumentation_level') >= 3 and not (get_option('use_likwid') or get_option("use_sde")):
             timestream = name_timing_stream()
             lop_name = name_localoperator(get_form_ident())
             print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier()))
diff --git a/python/dune/codegen/pdelab/localoperator.py b/python/dune/codegen/pdelab/localoperator.py
index 6db956e498c1de5d7e25cb47fb40771314119299..0ca6bfb45ed80271af92800980df7405200dca8b 100644
--- a/python/dune/codegen/pdelab/localoperator.py
+++ b/python/dune/codegen/pdelab/localoperator.py
@@ -32,6 +32,7 @@ from dune.codegen.generation import (accumulation_mixin,
                                      ReturnArg,
                                      run_hook,
                                      template_parameter,
+                                     dump_ssc_marks
                                      )
 from dune.codegen.cgen.clazz import (AccessModifier,
                                      BaseClass,
@@ -696,6 +697,19 @@ class RegisterLikwidMethod(ClassMember):
         ClassMember.__init__(self, content)
 
 
+class RegisterSSCMarksMethod(ClassMember):
+    """Class member 'void dump_ssc_marks()' whose body consists of all cached 'register_ssc_marks' items."""
+    def __init__(self):
+        # An example kernel must exist by the time this member is generated.
+        assert name_example_kernel() is not None
+
+        # One list entry per generated line; the signature and the opening brace are separate entries.
+        content = ["void dump_ssc_marks()", "{"]
+        content.extend('  ' + item for item in retrieve_cache_items(condition='register_ssc_marks'))
+        content.append("}")
+        ClassMember.__init__(self, content)
+
+
 class LoopyKernelMethod(ClassMember):
     def __init__(self, signature, kernel, add_timings=True, initializer_list=[]):
         from loopy import generate_body
@@ -723,6 +737,12 @@ class LoopyKernelMethod(ClassMember):
                     init_likwid_timer(timer_name)
                     content.append('  ' + 'LIKWID_MARKER_START(\"{}\");'.format(timer_name))
                     register_liwkid_timer(timer_name)
+                elif get_option('use_sde'):
+                    from dune.codegen.pdelab.driver.timings import get_region_marks, ssc_macro
+                    post_include(ssc_macro(), filetag='operatorfile')
+                    marks = get_region_marks(timer_name, driver=False)
+                    content.append('  ' + '__SSC_MARK(0x{});'.format(marks[0]))
+                    dump_ssc_marks(timer_name)
                 else:
                     post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
                     content.append('  ' + 'HP_TIMER_START({});'.format(timer_name))
@@ -735,6 +755,11 @@ class LoopyKernelMethod(ClassMember):
                         init_likwid_timer(setuptimer)
                         content.append('  ' + 'LIKWID_MARKER_START(\"{}\");'.format(setuptimer))
                         register_liwkid_timer(setuptimer)
+                    elif get_option('use_sde'):
+                        from dune.codegen.pdelab.driver.timings import get_region_marks
+                        setup_marks = get_region_marks(setuptimer, driver=False)
+                        content.append('  ' + '__SSC_MARK(0x{});'.format(setup_marks[0]))
+                        dump_ssc_marks(setuptimer)
                     else:
                         post_include('HP_DECLARE_TIMER({});'.format(setuptimer), filetag='operatorfile')
                         content.append('  HP_TIMER_START({});'.format(setuptimer))
@@ -747,6 +772,8 @@ class LoopyKernelMethod(ClassMember):
             if add_timings and get_option('instrumentation_level') >= 4:
                 if get_option('use_likwid'):
                     content.append('  ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(setuptimer))
+                elif get_option('use_sde'):
+                    content.append('  ' + '__SSC_MARK(0x{});'.format(setup_marks[1]))
                 else:
                     content.append('  ' + 'HP_TIMER_STOP({});'.format(setuptimer))
 
@@ -757,6 +784,8 @@ class LoopyKernelMethod(ClassMember):
             if add_timings and get_option('instrumentation_level') >= 3:
                 if get_option('use_likwid'):
                     content.append('  ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(timer_name))
+                elif get_option('use_sde'):
+                    content.append('  ' + '__SSC_MARK(0x{});'.format(marks[1]))
                 else:
                     content.append('  ' + 'HP_TIMER_STOP({});'.format(timer_name))
 
@@ -1219,6 +1248,8 @@ def generate_localoperator_file(kernels, filename):
         include_file('dune/codegen/common/timer.hh', filetag='operatorfile')
         if get_option('use_likwid'):
             operator_methods.append(RegisterLikwidMethod())
+        elif get_option('use_sde'):
+            operator_methods.append(RegisterSSCMarksMethod())
         else:
             operator_methods.append(TimerMethod())
     elif get_option('opcounter'):
diff --git a/python/dune/codegen/pdelab/signatures.py b/python/dune/codegen/pdelab/signatures.py
index 09b832ac252138181d8be19c9fd4c098b5bb9b68..34acea22ea1fda1b8369bf499d7ce8ed0c37859d 100644
--- a/python/dune/codegen/pdelab/signatures.py
+++ b/python/dune/codegen/pdelab/signatures.py
@@ -9,6 +9,7 @@ from dune.codegen.pdelab.argument import (name_accumulation_variable,
                                           name_coefficientcontainer,
                                           type_coefficientcontainer,
                                           name_applycontainer,
+                                          type_linearizationpointcontainer,
                                           )
 from dune.codegen.pdelab.spaces import (name_testfunctionspace,
                                         type_testfunctionspace,
@@ -293,8 +294,9 @@ def nonlinear_jacobian_apply_volume_templates():
     lfsut = type_trialfunctionspace()
     lfsvt = type_testfunctionspace()
     cct = type_coefficientcontainer()
+    lpt = type_linearizationpointcontainer()
     avt = type_accumulation_variable()
-    return (geot, lfsut, cct, cct, lfsvt, avt)
+    return (geot, lfsut, cct, lpt, lfsvt, avt)
 
 
 def nonlinear_jacobian_apply_volume_args():
@@ -312,8 +314,9 @@ def nonlinear_jacobian_apply_boundary_templates():
     lfsut = type_trialfunctionspace()
     lfsvt = type_testfunctionspace()
     cct = type_coefficientcontainer()
+    lpt = type_linearizationpointcontainer()
     avt = type_accumulation_variable()
-    return (geot, lfsut, cct, cct, lfsvt, avt)
+    return (geot, lfsut, cct, lpt, lfsvt, avt)
 
 
 def nonlinear_jacobian_apply_boundary_args():
@@ -331,8 +334,9 @@ def nonlinear_jacobian_apply_skeleton_templates():
     lfsut = type_trialfunctionspace()
     lfsvt = type_testfunctionspace()
     cct = type_coefficientcontainer()
+    lpt = type_linearizationpointcontainer()
     avt = type_accumulation_variable()
-    return (geot, lfsut, cct, cct, lfsvt, lfsut, cct, cct, lfsvt, avt, avt)
+    return (geot, lfsut, cct, lpt, lfsvt, lfsut, cct, lpt, lfsvt, avt, avt)
 
 
 def nonlinear_jacobian_apply_skeleton_args():
diff --git a/python/dune/codegen/pdelab/tensors.py b/python/dune/codegen/pdelab/tensors.py
index 4fdf59e448e8bb49172ae7a798796fb2afce9f5e..7a86ba52eff5f255c3d040d31951043a65266890 100644
--- a/python/dune/codegen/pdelab/tensors.py
+++ b/python/dune/codegen/pdelab/tensors.py
@@ -9,6 +9,7 @@ from dune.codegen.loopy.symbolic import FusedMultiplyAdd as FMA
 from loopy.match import Writes
 
 import pymbolic.primitives as prim
+import numpy as np
 import loopy as lp
 import itertools as it
 
@@ -145,11 +146,35 @@ def name_assembled_tensor(o, visitor):
 
 
 @kernel_cached
-def pymbolic_matrix_inverse(o, visitor):
-    expr = o.ufl_operands[0]
+def code_generation_time_inversion(expr, visitor):
+    """Invert expr's operand numerically at code generation time; None if an entry is not an int/float."""
+    mat = np.empty(expr.ufl_shape)
+    try:
+        for indices in it.product(*(range(extent) for extent in expr.ufl_shape)):
+            visitor.indices = indices
+            val = visitor.call(expr.ufl_operands[0])
+            if not isinstance(val, (float, int)):
+                return None
+            mat[indices] = val
+    finally:  # always clear the probing indices, even if visitor.call raises
+        visitor.indices = None
+    return np.linalg.inv(mat)
 
+
+def pymbolic_matrix_inverse(o, visitor):
+    # Try to evaluate the matrix at code generation time.
+    # If this works (it does e.g. for Maxwell on structured grids)
+    # we can invert the matrix at code generation time!!!
     indices = visitor.indices
     visitor.indices = None
+
+    mat = code_generation_time_inversion(o, visitor)
+    if mat is not None:
+        return mat[indices]
+
+    # If code generation time inversion failed, we assemble it in C++
+    # and invert it there.
+    expr = o.ufl_operands[0]
     name = name_assembled_tensor(expr, visitor)
 
     if expr.shape[0] <= 3:
@@ -160,9 +185,8 @@ def pymbolic_matrix_inverse(o, visitor):
                     depends_on=frozenset({lp.match.Writes(name),
                                           lp.match.Tagged("sumfact_stage1"),
                                           }),
-                    tags=frozenset({"quad"}),
+                    tags=frozenset({name}),
                     )
 
     visitor.indices = indices
-
     return prim.Variable(name)
diff --git a/python/dune/codegen/ufl/visitor.py b/python/dune/codegen/ufl/visitor.py
index ab7c334f323bbc51221d926d357466d6ebc8cf88..e774e03d90e8bc61a1108767867ab046bfa56c81 100644
--- a/python/dune/codegen/ufl/visitor.py
+++ b/python/dune/codegen/ufl/visitor.py
@@ -37,6 +37,7 @@ from ufl.classes import (Coefficient,
                          JacobianDeterminant,
                          )
 
+from pytools import product as ptproduct
 import pymbolic.primitives as prim
 import numpy as np
 
@@ -278,7 +279,10 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
     #
 
     def product(self, o):
-        return prim.flattened_product(tuple(self.call(op) for op in o.ufl_operands))
+        ops = tuple(self.call(op) for op in o.ufl_operands)
+        if all(isinstance(op, (int, float)) for op in ops):
+            return ptproduct(ops)
+        return prim.flattened_product(ops)
 
     def float_value(self, o):
         return o.value()
@@ -290,7 +294,10 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker):
         return prim.quotient(self.call(o.ufl_operands[0]), self.call(o.ufl_operands[1]))
 
     def sum(self, o):
-        return prim.flattened_sum(tuple(self.call(op) for op in o.ufl_operands))
+        ops = tuple(self.call(op) for op in o.ufl_operands)
+        if all(isinstance(op, (int, float)) for op in ops):
+            return sum(ops)
+        return prim.flattened_sum(ops)
 
     def zero(self, o):
         # UFL has Zeroes with shape. We ignore those indices.