Hello,
Perhaps this post is a repetition of a problem that I am facing. I am trying to replace the use of laplace and mass matrices with cell wise assembly as seen in step-4 to know if I have understood things properly, which I feel, since I am posting this problem I have not.
What does the following code involve:
I have left the code for the u equation be the same and made changes only in the v equation. All the changes I have made is only restricted to the run function of step-23. And here one can find the run function that I have. I am not adding the force terms since they are considered 0 in the original problem as well. 
The problem I face with this is that the energy decreases as time passes. I am sure I have made a mistake in the assembly but not able to pinpoint where. Can someone help me please?
Regards,
Dulcimer
template <int dim>
  void WaveEquation<dim>::run ()
  {
    setup_system();
    VectorTools::project (dof_handler, constraints, QGauss<dim>(3),
                          InitialValuesU<dim>(),
                          old_solution_u);
    VectorTools::project (dof_handler, constraints, QGauss<dim>(3),
                          InitialValuesV<dim>(),
                          old_solution_v);
    Vector<double> tmp (solution_u.size());
    Vector<double> forcing_terms (solution_u.size());
    for (; time<=5; time+=time_step, ++timestep_number)
      {
        std::cout << "Time step " << timestep_number
                  << " at t=" << time
                  << std::endl;
        mass_matrix.vmult (system_rhs, old_solution_u);
        mass_matrix.vmult (tmp, old_solution_v);
        system_rhs.add (time_step, tmp);
        laplace_matrix.vmult (tmp, old_solution_u);
        system_rhs.add (-theta * (1-theta) * time_step * time_step, tmp);
        RightHandSide<dim> rhs_function;
        rhs_function.set_time (time);
        VectorTools::create_right_hand_side (dof_handler, QGauss<dim>(2),
                                             rhs_function, tmp);
        forcing_terms = tmp;
        forcing_terms *= theta * time_step;
        rhs_function.set_time (time-time_step);
        VectorTools::create_right_hand_side (dof_handler, QGauss<dim>(2),
                                             rhs_function, tmp);
        forcing_terms.add ((1-theta) * time_step, tmp);
        system_rhs.add (theta * time_step, forcing_terms);
        {
          BoundaryValuesU<dim> boundary_values_u_function;
          boundary_values_u_function.set_time (time);
          std::map<types::global_dof_index,double> boundary_values;
          VectorTools::interpolate_boundary_values (dof_handler,
                                                    0,
                                                    boundary_values_u_function,
                                                    boundary_values);
          matrix_u.copy_from (mass_matrix);
          matrix_u.add (theta * theta * time_step * time_step, laplace_matrix);
          MatrixTools::apply_boundary_values (boundary_values,
                                              matrix_u,
                                              solution_u,
                                              system_rhs);
        }
        solve_u ();
//----------------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------------
        QGauss<dim> quadrature_formula(3);
        FEValues<dim> fe_values (fe, quadrature_formula,
                                 update_values | update_gradients |
                                 update_quadrature_points | update_JxW_values);
        const unsigned int dofs_per_cell = fe.dofs_per_cell;
        const unsigned int n_q_points    = quadrature_formula.size();
        FullMatrix<double> cell_matrix(dofs_per_cell, dofs_per_cell);
        Vector<double> cell_rhs(dofs_per_cell);
        Vector<double> cell_rhs_1(dofs_per_cell);
        std::vector<types::global_dof_index> local_dof_indices(dofs_per_cell);
        system_rhs.reinit (dof_handler.n_dofs());
        std::vector<double> old_u_q(n_q_points);
        std::vector<double> old_v_q(n_q_points);
        std::vector<double> u_q(n_q_points);
        for(const auto &cell: dof_handler.active_cell_iterators())
        {
            fe_values.reinit(cell);
            cell_matrix   = 0;
            cell_rhs = 0;
            cell_rhs_1 = 0;
            fe_values.get_function_values(old_solution_u, old_u_q);
            fe_values.get_function_values(old_solution_v, old_v_q);
            fe_values.get_function_values(solution_u, u_q);
            for(unsigned int q_index = 0; q_index < n_q_points; ++q_index)
            {
                double old_u =  old_u_q[q_index];
                double old_v =  old_v_q[q_index];
                double u = u_q[q_index];
                for(unsigned int i = 0; i < dofs_per_cell; ++i)
                {
                    for(unsigned int j = 0; j < dofs_per_cell; ++j)
                    {
                        /*M*/
                        cell_matrix(i, j) += (fe_values.shape_value(i, q_index) *
                                              fe_values.shape_value(j, q_index)
                                              ) *
                                              fe_values.JxW(q_index);
                        cell_rhs_1(i) +=
                           /*(MV^(n-1) - k(theta*A*U^(n) + (1-theta)*A*U^(n-1))*/
                          (
                               fe_values.shape_value(i, q_index) *
                               fe_values.shape_value(j, q_index)
                               *
                               old_v
                               -
                               time_step*
                               fe_values.shape_grad(i, q_index) *
                               fe_values.shape_grad(j, q_index) *
                               (
                                    theta *
                                    u
                                    +
                                    (1-theta)*
                                    old_u
                               )
                           );
                    }
                    cell_rhs(i) += cell_rhs_1(i) * fe_values.JxW (q_index);
                }
            }
            cell->get_dof_indices (local_dof_indices);
            for(unsigned int i = 0; i < dofs_per_cell; i++)
            {
                for (unsigned int j = 0; j < dofs_per_cell; j++)
                {
                    matrix_v.add (local_dof_indices[i],
                                   local_dof_indices[j],
                                   cell_matrix(i,j));
                }
                system_rhs(local_dof_indices[i]) += cell_rhs(i);
            }
        }
        //------------------------------------------------------------------
/*        laplace_matrix.vmult (system_rhs, solution_u);
        system_rhs *= -theta * time_step;
        mass_matrix.vmult (tmp, old_solution_v);
        system_rhs += tmp;
        laplace_matrix.vmult (tmp, old_solution_u);
        system_rhs.add (-time_step * (1-theta), tmp);
        system_rhs += forcing_terms;*/
        //------------------------------------------------------------------
        {
          BoundaryValuesV<dim> boundary_values_v_function;
          boundary_values_v_function.set_time (time);
          std::map<types::global_dof_index,double> boundary_values;
          VectorTools::interpolate_boundary_values (dof_handler,
                                                    0,
                                                    boundary_values_v_function,
                                                    boundary_values);
          //matrix_v.copy_from (mass_matrix);
          MatrixTools::apply_boundary_values (boundary_values,
                                              matrix_v,
                                              solution_v,
                                              system_rhs);
        }
        solve_v ();
        output_results ();
        std::cout << "   Total energy: "
                  << (mass_matrix.matrix_norm_square (solution_v) +
                      laplace_matrix.matrix_norm_square (solution_u)) / 2
                  << std::endl;
        old_solution_u = solution_u;
        old_solution_v = solution_v;
      }
  }
}