diff --git a/dune/perftool/matrixfree.hh b/dune/perftool/matrixfree.hh
new file mode 100644
index 0000000000000000000000000000000000000000..3ffa62f1a9fbba76168e55ce0c8dbfdb43301dd7
--- /dev/null
+++ b/dune/perftool/matrixfree.hh
@@ -0,0 +1,353 @@
+#ifndef DUNE_PERFTOOL_MATRIXFREE_HH
+#define DUNE_PERFTOOL_MATRIXFREE_HH
+
+#include <iostream>
+
+#include <dune/common/timer.hh>
+#include <dune/common/parametertree.hh>
+
+#include <dune/pdelab/backend/interface.hh>
+#include <dune/pdelab/constraints/common/constraints.hh>
+#include <dune/pdelab/backend/solver.hh>
+
+namespace Dune{
+  namespace PDELab{
+
+    template <typename GO, typename V>
+    void solveMatrixFree(GO& go, V& x ){
+      typedef Dune::PDELab::OnTheFlyOperator<V,V,GO> ISTLOnTheFlyOperator;
+      ISTLOnTheFlyOperator opb(go);
+      Dune::Richardson<V,V> richardson(1.0);
+      Dune::CGSolver<V> solverb(opb,richardson,1E-10,5000,2);
+      Dune::InverseOperatorResult stat;
+      // evaluate residual w.r.t initial guess
+      using TrialGridFunctionSpace = typename GO::Traits::TrialGridFunctionSpace;
+      using W = Dune::PDELab::Backend::Vector<TrialGridFunctionSpace,typename V::ElementType>;
+      W r(go.testGridFunctionSpace(),0.0);
+      go.residual(x,r);
+      // solve the jacobian system
+      V z(go.trialGridFunctionSpace(),0.0);
+      solverb.apply(z,r,stat);
+      x -= z;
+    }
+
+    // template <template<class> class Solver, typename GO>
+    // class ISTLMatrixFreeSolverBackend_Base
+    //   : public SequentialNorm, public LinearResultStorage
+    // {
+    // public:
+    //   ISTLMatrixFreeSolverBackend_Base(GO go, unsigned maxiter_=5000, int verbose_=1)
+    //     : _go(go), maxiter(maxiter_), verbose(verbose_)
+    //   {}
+
+    //   template <typename V, typename W>
+    //   void apply(V z, W r, typename Dune::template FieldTraits<typename W::ElementType >::real_type reduction) {
+    //     using Backend::Native;
+    //     using Backend::native;
+    //     Dune::Richardson<V,V> prec(1.0);
+    //     OnTheFlyOperator<V,V,GO> opa(_go);
+    //     Solver<Native<V>> solver(opa, prec, reduction, maxiter, verbose);
+    //     Dune::InverseOperatorResult stat;
+    //     solver.apply(native(z), native(r), stat);
+    //   }
+
+    // private:
+    //   GO _go;
+    //   unsigned maxiter;
+    //   int verbose;
+    // };
+
+    // template <typename GO>
+    // class ISTLMatrixFreeBackend_SEQ_CG
+    //   : public ISTLMatrixFreeSolverBackend_Base<Dune::CGSolver,GO>
+    // {
+    // public:
+    //   ISTLMatrixFreeBackend_SEQ_CG (GO& go, unsigned maxiter_=5000, int verbose_=1)
+    //     : ISTLMatrixFreeSolverBackend_Base<Dune::CGSolver,GO>(go, maxiter_, verbose_)
+    //   {}
+    // };
+
+
+    // // Status information of linear problem solver
+    // template<class RFType>
+    // struct StationaryLinearMatrixFreeProblemSolverResult : LinearSolverResult<RFType>
+    // {
+    //   RFType first_defect;       // the first defect
+    //   RFType defect;             // the final defect
+    //   double assembler_time;     // Cumulative time for matrix assembly
+    //   double linear_solver_time; // Cumulative time for linear solver
+    //   int linear_solver_iterations; // Total number of linear iterations
+
+    //   StationaryLinearMatrixFreeProblemSolverResult()
+    //     : first_defect(0.0)
+    //     , defect(0.0)
+    //     , assembler_time(0.0)
+    //     , linear_solver_time(0.0)
+    //     , linear_solver_iterations(0)
+    //   {}
+
+    // };
+
+
+    // template<typename GO, typename LS, typename V>
+    // class StationaryLinearMatrixFreeProblemSolver
+    // {
+    //   typedef typename Dune::template FieldTraits<typename V::ElementType >::real_type Real;
+    //   typedef typename GO::Traits::Jacobian M;
+    //   typedef typename GO::Traits::TrialGridFunctionSpace TrialGridFunctionSpace;
+    //   using W = Dune::PDELab::Backend::Vector<TrialGridFunctionSpace,typename V::ElementType>;
+    //   typedef GO GridOperator;
+
+    // public:
+    //   typedef StationaryLinearMatrixFreeProblemSolverResult<double> Result;
+
+    //   StationaryLinearMatrixFreeProblemSolver(GO& go, LS& ls, V& x, Real reduction, Real min_defect = 1e-99, int verbose=1)
+    //     : _go(go)
+    //     , _ls(ls)
+    //     , _x(&x)
+    //     , _reduction(reduction)
+    //     , _min_defect(min_defect)
+    //     , _hanging_node_modifications(false)
+    //     , _keep_matrix(true)
+    //     , _verbose(verbose)
+    //   {}
+
+    //   StationaryLinearMatrixFreeProblemSolver (const GO& go, LS& ls, Real reduction, Real min_defect = 1e-99, int verbose=1)
+    //     : _go(go)
+    //     , _ls(ls)
+    //     , _x()
+    //     , _reduction(reduction)
+    //     , _min_defect(min_defect)
+    //     , _hanging_node_modifications(false)
+    //     , _keep_matrix(true)
+    //     , _verbose(verbose)
+    //   {}
+
+    //   //! Construct a StationaryLinearProblemSolver for the given objects and read parameters from a ParameterTree.
+    //   /**
+    //    * This constructor reads the parameter controlling its operation from a passed-in ParameterTree
+    //    * instead of requiring the user to specify all of them as individual constructor parameters.
+    //    * Currently the following parameters are read:
+    //    *
+    //    * Name                       | Default Value | Explanation
+    //    * -------------------------- | ------------- | -----------
+    //    * reduction                  |               | Required relative defect reduction
+    //    * min_defect                 | 1e-99         | minimum absolute defect at which to stop
+    //    * hanging_node_modifications | false         | perform required transformations for hanging nodes
+    //    * keep_matrix                | true          | keep matrix between calls to apply() (but reassemble values every time)
+    //    * verbosity                  | 1             | control amount of debug output
+    //    *
+    //    * Apart from reduction, all parameters have a default value and are optional.
+    //    * The actual reduction for a call to apply() is calculated as r = max(reduction,min_defect/start_defect),
+    //    * where start defect is the norm of the residual of x.
+    //    */
+    //   StationaryLinearMatrixFreeProblemSolver(const GO& go, LS& ls, V& x, const ParameterTree& params)
+    //     : _go(go)
+    //     , _ls(ls)
+    //     , _x(&x)
+    //     , _reduction(params.get<Real>("reduction"))
+    //     , _min_defect(params.get<Real>("min_defect",1e-99))
+    //     , _hanging_node_modifications(params.get<bool>("hanging_node_modifications",false))
+    //     , _keep_matrix(params.get<bool>("keep_matrix",true))
+    //     , _verbose(params.get<int>("verbosity",1))
+    //   {}
+
+    //   //! Construct a StationaryLinearProblemSolver for the given objects and read parameters from a ParameterTree.
+    //   /**
+    //    * This constructor reads the parameter controlling its operation from a passed-in ParameterTree
+    //    * instead of requiring the user to specify all of them as individual constructor parameters.
+    //    * Currently the following parameters are read:
+    //    *
+    //    * Name                       | Default Value | Explanation
+    //    * -------------------------- | ------------- | -----------
+    //    * reduction                  |               | Required relative defect reduction
+    //    * min_defect                 | 1e-99         | minimum absolute defect at which to stop
+    //    * hanging_node_modifications | false         | perform required transformations for hanging nodes
+    //    * keep_matrix                | true          | keep matrix between calls to apply() (but reassemble values every time)
+    //    * verbosity                  | 1             | control amount of debug output
+    //    *
+    //    * Apart from reduction, all parameters have a default value and are optional.
+    //    * The actual reduction for a call to apply() is calculated as r = max(reduction,min_defect/start_defect),
+    //    * where start defect is the norm of the residual of x.
+    //    */
+    //   StationaryLinearMatrixFreeProblemSolver(const GO& go, LS& ls, const ParameterTree& params)
+    //     : _go(go)
+    //     , _ls(ls)
+    //     , _x()
+    //     , _reduction(params.get<Real>("reduction"))
+    //     , _min_defect(params.get<Real>("min_defect",1e-99))
+    //     , _hanging_node_modifications(params.get<bool>("hanging_node_modifications",false))
+    //     , _keep_matrix(params.get<bool>("keep_matrix",true))
+    //     , _verbose(params.get<int>("verbosity",1))
+    //   {}
+
+    //   // //! Set whether the solver should apply the necessary transformations for calculations on hanging nodes.
+    //   // void setHangingNodeModifications(bool b)
+    //   // {
+    //   //   _hanging_node_modifications=b;
+    //   // }
+
+    //   // //! Return whether the solver performs the necessary transformations for calculations on hanging nodes.
+    //   // bool hangingNodeModifications() const
+    //   // {
+    //   //   return _hanging_node_modifications;
+    //   // }
+
+    //   // //! Set whether the jacobian matrix should be kept across calls to apply().
+    //   // void setKeepMatrix(bool b)
+    //   // {
+    //   //   _keep_matrix = b;
+    //   // }
+
+    //   // //! Return whether the jacobian matrix is kept across calls to apply().
+    //   // bool keepMatrix() const
+    //   // {
+    //   //   return _keep_matrix;
+    //   // }
+
+    //   const Result& result() const
+    //   {
+    //     return _res;
+    //   }
+
+    //   void apply(V& x, bool reuse_matrix = false) {
+    //     _x = &x;
+    //     apply(reuse_matrix);
+    //   }
+
+    //   void apply (bool reuse_matrix = false)
+    //   {
+    //     Dune::Timer watch;
+    //     double timing,assembler_time=0;
+
+    //     // // assemble matrix; optional: assemble only on demand!
+    //     // watch.reset();
+
+    //     // if (!_jacobian)
+    //     //   {
+    //     //     _jacobian = std::make_shared<M>(_go);
+    //     //     timing = watch.elapsed();
+    //     //     if (_go.trialGridFunctionSpace().gridView().comm().rank()==0 && _verbose>=1)
+    //     //       std::cout << "=== matrix setup (max) " << timing << " s" << std::endl;
+    //     //     watch.reset();
+    //     //     assembler_time += timing;
+    //     //   }
+    //     // else if (_go.trialGridFunctionSpace().gridView().comm().rank()==0 && _verbose>=1)
+    //     //   std::cout << "=== matrix setup skipped (matrix already allocated)" << std::endl;
+
+    //     // if (_hanging_node_modifications)
+    //     //   {
+    //     //     Dune::PDELab::set_shifted_dofs(_go.localAssembler().trialConstraints(),0.0,*_x); // set hanging node DOFs to zero
+    //     //     _go.localAssembler().backtransform(*_x); // interpolate hanging nodes adjacent to Dirichlet nodes
+    //     //   }
+
+    //     // if (!reuse_matrix)
+    //     //   {
+    //     //     (*_jacobian) = Real(0.0);
+    //     //     _go.jacobian(*_x,*_jacobian);
+    //     //   }
+
+    //     // timing = watch.elapsed();
+    //     // // timing = gos.trialGridFunctionSpace().gridView().comm().max(timing);
+    //     // if (_go.trialGridFunctionSpace().gridView().comm().rank()==0 && _verbose>=1)
+    //     //   {
+    //     //     if (reuse_matrix)
+    //     //       std::cout << "=== matrix assembly SKIPPED" << std::endl;
+    //     //     else
+    //     //       std::cout << "=== matrix assembly (max) " << timing << " s" << std::endl;
+    //     //   }
+
+    //     // assembler_time += timing;
+
+    //     // assemble residual
+    //     watch.reset();
+
+    //     W r(_go.testGridFunctionSpace(),0.0);
+    //     _go.residual(*_x,r);  // residual is additive
+
+    //     timing = watch.elapsed();
+    //     // timing = gos.trialGridFunctionSpace().gridView().comm().max(timing);
+    //     if (_go.trialGridFunctionSpace().gridView().comm().rank()==0 && _verbose>=1)
+    //       std::cout << "=== residual assembly (max) " << timing << " s" << std::endl;
+    //     assembler_time += timing;
+    //     _res.assembler_time = assembler_time;
+
+    //     auto defect = _ls.norm(r);
+
+    //     // compute correction
+    //     watch.reset();
+    //     V z(_go.trialGridFunctionSpace(),0.0);
+    //     auto red = std::max(_reduction,_min_defect/defect);
+    //     if (_go.trialGridFunctionSpace().gridView().comm().rank()==0)
+    //       std::cout << "=== solving (reduction: " << red << ") " << std::endl;
+
+    //     // NOTE: Different than in StationaryLinearProblemSolver!
+    //     _ls.apply(z, r, red); // solver makes right hand side consistent
+
+
+    //     _linear_solver_result = _ls.result();
+    //     timing = watch.elapsed();
+    //     // timing = gos.trialGridFunctionSpace().gridView().comm().max(timing);
+    //     if (_go.trialGridFunctionSpace().gridView().comm().rank()==0 && _verbose>=1)
+    //       std::cout << timing << " s" << std::endl;
+    //     _res.linear_solver_time = timing;
+
+    //     _res.converged = _linear_solver_result.converged;
+    //     _res.iterations = _linear_solver_result.iterations;
+    //     _res.elapsed = _linear_solver_result.elapsed;
+    //     _res.reduction = _linear_solver_result.reduction;
+    //     _res.conv_rate = _linear_solver_result.conv_rate;
+    //     _res.first_defect = static_cast<double>(defect);
+    //     _res.defect = static_cast<double>(defect)*_linear_solver_result.reduction;
+    //     _res.linear_solver_iterations = _linear_solver_result.iterations;
+
+    //     // // and update
+    //     // if (_hanging_node_modifications)
+    //     //   Dune::PDELab::set_shifted_dofs(_go.localAssembler().trialConstraints(),0.0,*_x); // set hanging node DOFs to zero
+    //     // *_x -= z;
+    //     // if (_hanging_node_modifications)
+    //     //   _go.localAssembler().backtransform(*_x); // interpolate hanging nodes adjacent to Dirichlet nodes
+
+    //     // if (!_keep_matrix)
+    //     //   _jacobian.reset();
+    //   }
+
+    //   // //! Discard the stored Jacobian matrix.
+    //   // void discardMatrix()
+    //   // {
+    //   //   if(_jacobian)
+    //   //     _jacobian.reset();
+    //   // }
+
+    //   const Dune::PDELab::LinearSolverResult<double>& ls_result() const{
+    //     return _linear_solver_result;
+    //   }
+
+    //   Real reduction() const
+    //   {
+    //     return _reduction;
+    //   }
+
+    //   void setReduction(Real reduction)
+    //   {
+    //     _reduction = reduction;
+    //   }
+
+
+    // private:
+    //   const GO& _go;
+    //   LS& _ls;
+    //   V* _x;
+    //   // shared_ptr<M> _jacobian;
+    //   Real _reduction;
+    //   Real _min_defect;
+    //   Dune::PDELab::LinearSolverResult<double> _linear_solver_result;
+    //   Result _res;
+    //   bool _hanging_node_modifications;
+    //   bool _keep_matrix;
+    //   int _verbose;
+    // };
+
+  }
+}
+#endif
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index ff563cfb2bd1a50af135571fc43a248009b26018..bbea79cfe4e129bc19fc3e96563dd2c34cc2ff43 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -13,6 +13,7 @@ def get_form_compiler_arguments():
     parser.add_argument("--operator-file", type=str, help="The filename for the generated local operator header")
     parser.add_argument('--version', action='version', version='%(prog)s 0.1')
     parser.add_argument("--numerical-jacobian", action="store_true", help="use numerical jacobians (only makes sense, if uflpdelab for some reason fails to generate analytic jacobians)")
+    parser.add_argument("--matrix-free", action="store_true", help="use iterative solver with matrix free jacobian application")
     parser.add_argument(
         "--exact-solution-expression",
         type=str,
diff --git a/python/dune/perftool/pdelab/driver.py b/python/dune/perftool/pdelab/driver.py
index 9940aba70a83b17e606b3912bda72aa55ef4dd9a..ff37f6d9b578dc722662c7ccdc925a5b148bba7e 100644
--- a/python/dune/perftool/pdelab/driver.py
+++ b/python/dune/perftool/pdelab/driver.py
@@ -903,8 +903,14 @@ def name_stationarynonlinearproblemsolver():
 def dune_solve():
     # Test if form is linear in ansatzfunction
     if is_linear(_form):
-        slp = name_stationarylinearproblemsolver()
-        return "{}.apply();".format(slp)
+        if not get_option("matrix_free"):
+            slp = name_stationarylinearproblemsolver()
+            return "{}.apply();".format(slp)
+        else:
+            go = name_gridoperator()
+            x = name_vector()
+            include_file("dune/perftool/matrixfree.hh", filetag="drive")
+            return "solveMatrixFree({},{});".format(go,x)
     else:
         snp = name_stationarynonlinearproblemsolver()
         return "{}.apply();".format(snp)
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index bf4f93e0ede645c726d630841c1b76f6f3b20696..3b89f76e513eb8b1a26c2e82eafcd7fccb1ebef9 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -319,16 +319,30 @@ def generate_localoperator_kernels(formdata, data):
 
                     _, loptype = class_type_from_cache("operator")
                     which = ufl_measure_to_pdelab_measure(measure)
+
+                    # Numerical jacobian methods
                     base_class("Dune::PDELab::NumericalJacobian{}<{}>".format(which, loptype), classtag="operator")
 
                     # Add the initializer list for that base class
                     ini = name_initree_member()
                     ini_constructor = name_initree_constructor_param()
+
+                    # Numerical jacobian methods
                     initializer_list("Dune::PDELab::NumericalJacobian{}<{}>".format(which, loptype),
                                      ["{}.get<double>(\"numerical_epsilon.{}\", 1e-9)".format(ini_constructor, ini, which.lower())],
                                      classtag="operator",
                                      )
 
+                    if get_option("matrix_free"):
+                        # Numeical jacobian apply base class
+                        base_class("Dune::PDELab::NumericalJacobianApply{}<{}>".format(which, loptype), classtag="operator")
+
+                        # Numerical jacobian apply initializer list
+                        initializer_list("Dune::PDELab::NumericalJacobianApply{}<{}>".format(which, loptype),
+                                         ["{}.get<double>(\"numerical_epsilon.{}\", 1e-9)".format(ini_constructor, ini, which.lower())],
+                                         classtag="operator",
+                                        )
+
                 operator_kernels[(measure, 'residual')] = kernel
 
     # Generate the necessary jacobian methods
diff --git a/test/poisson/CMakeLists.txt b/test/poisson/CMakeLists.txt
index 88f33d3a7d5a8727eabec2abdf3d286bff882680..3f95382ff6ec91b8f3fd8f3c1f61d231a7ea02b4 100644
--- a/test/poisson/CMakeLists.txt
+++ b/test/poisson/CMakeLists.txt
@@ -88,6 +88,18 @@ dune_add_system_test(TARGET poisson_dg_neumann_symdiff
                      INIFILE poisson_dg_neumann_symdiff.mini
                      )
 
+# 9. Poisson Test Case: source f, bc g + numerical differentiation
+add_generated_executable(UFLFILE poisson.ufl
+                         TARGET poisson_numdiff_matrix_free
+                         FORM_COMPILER_ARGS --numerical-jacobian --matrix-free
+                         )
+
+dune_add_system_test(TARGET poisson_numdiff_matrix_free
+                     INIFILE poisson_numdiff_matrix_free.mini
+                     SCRIPT dune_vtkcompare.py
+                     )
+
+
 # Add the reference code with the PDELab localoperator that produced
 # the reference vtk file
 add_executable(poisson_dg_ref reference_main.cc)
diff --git a/test/poisson/poisson_numdiff_matrix_free.mini b/test/poisson/poisson_numdiff_matrix_free.mini
new file mode 100644
index 0000000000000000000000000000000000000000..4662d83881c20c13ffe49055338e1fc775020ab3
--- /dev/null
+++ b/test/poisson/poisson_numdiff_matrix_free.mini
@@ -0,0 +1,11 @@
+__name = poisson_numdiff_matrix_free
+
+lowerleft = 0.0 0.0
+upperright = 1.0 1.0
+elements = 32 32
+elementType = simplical
+
+[wrapper.vtkcompare]
+name = {__name}
+reference = poisson_ref
+extension = vtu