diff --git a/pytential/linalg/skeletonization.py b/pytential/linalg/skeletonization.py
index ba15ec7ee..ac6fcad4e 100644
--- a/pytential/linalg/skeletonization.py
+++ b/pytential/linalg/skeletonization.py
@@ -26,7 +26,7 @@
 
 import numpy as np
 
-from arraycontext import PyOpenCLArrayContext
+from arraycontext import PyOpenCLArrayContext, Array, ArrayT
 
 from pytential import GeometryCollection, sym
 from pytential.linalg.utils import IndexList, TargetAndSourceClusterList
@@ -52,7 +52,72 @@
 
 # {{{ wrangler
 
-def _approximate_geometry_waa_magnitude(actx, places, nbrindex, domain):
+# Adding weights to proxy evaluation: Why? How? Huh?
+# --------------------------------------------------
+#
+# We have a couple of interaction evaluations that need to happen for the
+# proxy-based skeletonization, namely
+# 1. with the proxies + neighbors when the current cluster is the source
+# 2. with the proxies + neighbors when the current cluster is the target
+#
+# The operator that we want to skeletonize at the end of the day is the full
+# layer potential, that contains quadrature weights and area elements. That
+# means that it is in our interest to include some of that information in
+# the proxy + neighbor interaction matrices that get passed to the ID
+# (interpolative decomposition). This helps on two fronts:
+# * first, it gives the ID a better chance at guessing a good range. This is more
+#   important for the nearfield than the farfield.
+# * second, as the ID constructs an approximation until a relative error is
+#   reached, columns with a larger norm will be preferred. However, since
+#   we ultimately reconstruct both nearfield and farfield, we have no reason to
+#   weigh the proxy or neighbor interactions more heavily. Ideally both sets of
+#   interactions are weighted similarly, which is what we try to achieve below.
+#
+# In the first case, the geometry is the source and we can directly
+# right multiply with the same weights, e.g.
+#
+#   P_w <- P @ W
+#   N_w <- N @ W
+#
+# where `P` and `N` just evaluate the direct proxy interactions (P2P) and
+# the neighbor interactions (QBX) without any other changes. Then, the matrix
+# `hstack([P_w, N_w])` is the one that gets ID-ed. As the proxies and neighbors
+# are at a comparable distance from the cluster, multiplying both by the same
+# weights is expected to ensure similar norms. Adding these weights is
+# controlled by `SkeletonizationWrangler.weighted_sources`.
+#
+# In the second case, the source points are either the proxy points or the
+# neighboring points. For neighboring points, we retrieve the weights on the
+# geometry (same as above). However, for the proxy points we generally do not
+# construct (or wish to) a full discretization with a quadrature scheme. From a
+# linear algebra point of view, we want to find a matrix `W_p` in
+#
+#   P_w <- P @ W_p
+#   N_w <- N @ W
+#
+# such that `P_w` and `N_w` have comparable norms. For now, we approximate `W_p`
+# by taking the average of `W`, i.e. `W_p <- avg(W) * I`. As `W` is diagonal,
+# we could easily approximate the `\ell^2` or Frobenius norm, but the average
+# is thought of as reasonable compromise. Adding these weights is controlled by
+# `SkeletonizationWrangler.weighted_targets` and computed below in
+# `_approximate_geometry_waa_magnitude`.
+
+
+def _approximate_geometry_waa_magnitude(
+        actx: PyOpenCLArrayContext,
+        places: GeometryCollection,
+        cluster_index: "IndexList",
+        domain: sym.DOFDescriptor) -> Array:
+    """
+    :arg cluster_index: a :class:`~pytential.linalg.IndexList` representing the
+        clusters on which to operate. In practice, this can be the cluster
+        indices themselves or the indices of neighboring points to each
+        cluster (i.e. points inside the proxy ball).
+
+    :returns: an array of size ``(nclusters,)`` with a characteristic magnitude
+        of the quadrature weights and area elements in the cluster. Currently,
+        this is a simple average.
+    """
     from pytools import memoize_in
     from pytential import bind, sym
 
@@ -73,50 +138,69 @@ def prg():
 
     waa = bind(places, sym.weights_and_area_elements(
         places.ambient_dim, dofdesc=domain))(actx)
-    result = actx.zeros((nbrindex.nclusters,), dtype=waa.entry_dtype)
+    result = actx.zeros((cluster_index.nclusters,), dtype=waa.entry_dtype)
 
     from arraycontext import flatten
     _, (waa_per_cluster,) = prg()(actx.queue,         # pylint: disable=not-callable
             waa=flatten(waa, actx),
             result=result,
-            indices=nbrindex.indices,
-            starts=nbrindex.starts)
+            indices=cluster_index.indices,
+            starts=cluster_index.starts)
 
     return waa_per_cluster
 
 
-def _apply_weights(actx, mat, places, tgt_pxy_index, nbrindex, domain):
-    assert tgt_pxy_index.nclusters == nbrindex.nclusters
+def _apply_weights(
+        actx: PyOpenCLArrayContext,
+        mat: ArrayT,
+        places: GeometryCollection,
+        tgt_pxy_index: TargetAndSourceClusterList,
+        cluster_index: IndexList,
+        domain: sym.DOFDescriptor) -> ArrayT:
+    """Computes the weights using :func:`_approximate_geometry_waa_magnitude`
+    and multiplies each cluster in *mat* by its corresponding weight.
+
+    :returns: *mat* multiplied by the weights.
+    """
+    assert tgt_pxy_index.nclusters == cluster_index.nclusters
     waa = actx.to_numpy(
-            _approximate_geometry_waa_magnitude(actx, places, nbrindex, domain)
+            _approximate_geometry_waa_magnitude(actx, places, cluster_index, domain)
             )
 
+    result = np.zeros_like(mat)
     for i in range(tgt_pxy_index.nclusters):
         istart, iend = tgt_pxy_index._flat_cluster_starts[i:i + 2]
-        mat[istart:iend] *= waa[i]
+        result[istart:iend] = mat[istart:iend] * waa[i]
 
-    return mat
+    return result
 
 
 @dataclass(frozen=True)
 class SkeletonizationWrangler:
     """
+    .. attribute:: nrows
+
+        Number of output :attr:`exprs` in the operator.
+
+    .. attribute:: ncols
+
+        Number of :attr:`input_exprs` in the operator.
+
     .. attribute:: exprs
 
-        An :class:`~numpy.ndarray` of expressions (layer potentials)
-        that correspond to the output blocks of the matrix. These expressions
-        are tagged for nearfield neighbor evalution.
+        An :class:`~numpy.ndarray` of shape ``(nrows,)`` of expressions
+        (layer potentials) that correspond to the output blocks of the matrix.
+        These expressions are tagged for nearfield neighbor evalution.
 
     .. attribute:: source_proxy_exprs
     .. attribute:: target_proxy_exprs
 
-        Like :attr:`exprs`, but stripped down for farfield proxy
-        evaluation.
+        Like :attr:`exprs`, but stripped down for farfield proxy evaluation.
 
     .. attribute:: input_exprs
 
-        A :class:`tuple` of densities that correspond to the input blocks
-        of the matrix.
+        A :class:`tuple` of size ``(ncols,)`` of densities that correspond to
+        the input blocks of the matrix.
 
     .. attribute:: domains
 
@@ -133,10 +217,14 @@ class SkeletonizationWrangler:
     .. attribute:: weighted_sources
 
         A flag which if *True* adds a weight to the source to proxy evaluation.
+        This can only be meaningfully set to *False* when skeletonizing direct
+        P2P interactions.
 
     .. attribute:: weighted_targets
 
         A flag which if *True* adds a weight to the proxy to target evaluation.
+        This can only be meaningfully set to *False* when skeletonizing direct
+        P2P interactions.
 
     .. attribute:: proxy_source_cluster_builder
     .. attribute:: proxy_target_cluster_builder
@@ -160,7 +248,7 @@ class SkeletonizationWrangler:
     # operator
     exprs: np.ndarray
     input_exprs: Tuple[sym.var, ...]
-    domains: Tuple[Hashable, ...]
+    domains: Tuple[sym.DOFDescriptor, ...]
     context: Dict[str, Any]
 
     neighbor_cluster_builder: Callable[..., np.ndarray]
@@ -176,11 +264,11 @@ class SkeletonizationWrangler:
     proxy_source_cluster_builder: Callable[..., np.ndarray]
 
     @property
-    def nrows(self):
+    def nrows(self) -> int:
         return len(self.exprs)
 
     @property
-    def ncols(self):
+    def ncols(self) -> int:
         return len(self.input_exprs)
 
     def _evaluate_expr(self,
@@ -300,7 +388,8 @@ def make_skeletonization_wrangler(
         _weighted_proxy: Optional[Union[bool, Tuple[bool, bool]]] = None,
         _proxy_source_cluster_builder: Optional[Callable[..., np.ndarray]] = None,
         _proxy_target_cluster_builder: Optional[Callable[..., np.ndarray]] = None,
-        _neighbor_cluster_builder: Optional[Callable[..., np.ndarray]] = None):
+        _neighbor_cluster_builder: Optional[Callable[..., np.ndarray]] = None,
+        ) -> SkeletonizationWrangler:
     if context is None:
         context = {}
 
@@ -382,6 +471,42 @@ def make_skeletonization_wrangler(
 
 @dataclass(frozen=True)
 class _ProxyNeighborEvaluationResult:
+    """
+    .. attribute:: pxy
+
+        A :class:`~pytential.linalg.ProxyClusterGeometryData` containing the
+        proxy points from which :attr:`pxymat` is obtained. This data is also
+        used to construct :attr:`nbrindex` and evaluate :attr:`nbrmat`.
+
+    .. attribute:: pxymat
+
+        Interaction matrix between the proxy points and the source or
+        target points. This matrix is flattened to a shape of ``(nsize,)``,
+        which is consistent with the sum of the cluster sizes in :attr:`pxyindex`,
+        as obtained from
+        :meth:`~pytential.linalg.TargetAndSourceClusterList.cluster_size`.
+
+    .. attribute:: pxyindex
+
+        A :class:`~pytential.linalg.TargetAndSourceClusterList` used to describe
+        the cluster interactions in :attr:`pxymat`.
+
+    .. attribute:: nbrmat
+
+        Interaction matrix between the neighboring points and the source or
+        target points. This matrix is flattened to a shape of ``(nsize,)``,
+        which is consistent with the sum of the cluster sizes in :attr:`nbrindex`,
+        as obtained from
+        :meth:`~pytential.linalg.TargetAndSourceClusterList.cluster_size`.
+
+    .. attribute:: nbrindex
+
+        A :class:`~pytential.linalg.TargetAndSourceClusterList` used to describe
+        the cluster interactions in :attr:`nbrmat`.
+
+    .. automethod:: __getitem__
+    """
+
     pxy: ProxyClusterGeometryData
 
     pxymat: np.ndarray
@@ -391,53 +516,54 @@ class _ProxyNeighborEvaluationResult:
     nbrindex: TargetAndSourceClusterList
 
     def __getitem__(self, i: int) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        :returns: a :class:`tuple` of ``(pxymat, nbrmat)`` containing the
+            :math:`i`-th cluster interactions. The matrices are reshaped into
+            their full sizes.
+        """
+
         shape = self.nbrindex.cluster_shape(i, i)
         nbrmat_i = self.nbrindex.flat_cluster_take(self.nbrmat, i).reshape(*shape)
 
         shape = self.pxyindex.cluster_shape(i, i)
         pxymat_i = self.pxyindex.flat_cluster_take(self.pxymat, i).reshape(*shape)
 
-        return [pxymat_i, nbrmat_i]
+        return pxymat_i, nbrmat_i
 
 
-def _make_block_proxy_skeleton(
-        actx, ibrow, ibcol,
-        places, proxy_generator, wrangler, cluster_index,
-        evaluate_proxy, evaluate_neighbor,
-        max_particles_in_box=None):
-    """Builds a block matrix that can be used to skeletonize the
-    rows (targets) or columns (sources) of the symbolic matrix block
-    described by ``(ibrow, ibcol)``.
+def _evaluate_proxy_skeletonization_interaction(
+        actx: PyOpenCLArrayContext,
+        places: GeometryCollection,
+        proxy_generator: ProxyGeneratorBase,
+        wrangler: SkeletonizationWrangler,
+        cluster_index: IndexList, *,
+        evaluate_proxy: Callable[...,
+            Tuple[np.ndarray, TargetAndSourceClusterList]],
+        evaluate_neighbor: Callable[...,
+            Tuple[np.ndarray, TargetAndSourceClusterList]],
+        dofdesc: Optional[sym.DOFDescriptor] = None,
+        max_particles_in_box: Optional[int] = None,
+        ) -> _ProxyNeighborEvaluationResult:
+    """Evaluate the proxy to cluster and neighbor to cluster interactions for
+    each cluster in *cluster_index*.
     """
 
     if cluster_index.nclusters == 1:
         raise ValueError("cannot make a proxy skeleton for a single cluster")
 
-    # {{{ generate proxies
-
-    domain = wrangler.domains[ibcol]
-    pxy = proxy_generator(actx, domain, cluster_index)
-
-    # }}}
-
-    # {{{ evaluate proxy and neighbor interactions
-
     from pytential.linalg import gather_cluster_neighbor_points
+    pxy = proxy_generator(actx, dofdesc, cluster_index)
     nbrindex = gather_cluster_neighbor_points(
             actx, pxy,
             max_particles_in_box=max_particles_in_box)
 
-    pxymat, pxy_geo_index = evaluate_proxy(
-            actx, places, pxy, nbrindex, ibrow=ibrow, ibcol=ibcol)
-    nbrmat, nbr_geo_index = evaluate_neighbor(
-            actx, places, pxy, nbrindex, ibrow=ibrow, ibcol=ibcol)
-
-    # }}}
+    pxymat, pxy_cluster_index = evaluate_proxy(actx, places, pxy, nbrindex)
+    nbrmat, nbr_cluster_index = evaluate_neighbor(actx, places, pxy, nbrindex)
 
     return _ProxyNeighborEvaluationResult(
             pxy=pxy,
-            pxymat=pxymat, pxyindex=pxy_geo_index,
-            nbrmat=nbrmat, nbrindex=nbr_geo_index)
+            pxymat=pxymat, pxyindex=pxy_cluster_index,
+            nbrmat=nbrmat, nbrindex=nbr_cluster_index)
 
 
 def _skeletonize_block_by_proxy_with_mats(
@@ -448,34 +574,44 @@ def _skeletonize_block_by_proxy_with_mats(
         tgt_src_index: TargetAndSourceClusterList, *,
         id_eps: Optional[float] = None, id_rank: Optional[int] = None,
         max_particles_in_box: Optional[int] = None
-        ):
+        ) -> "SkeletonizationResult":
     nclusters = tgt_src_index.nclusters
     if nclusters == 1:
         raise ValueError("cannot make a proxy skeleton for a single cluster")
 
     # construct proxy matrices to skeletonize
     from functools import partial
-    make_proxy_skeleton = partial(_make_block_proxy_skeleton,
-            actx, ibrow, ibcol, places, proxy_generator, wrangler,
+    evaluate_skeletonization_interaction = partial(
+            _evaluate_proxy_skeletonization_interaction,
+            actx, places, proxy_generator, wrangler,
+            dofdesc=wrangler.domains[ibcol],
             max_particles_in_box=max_particles_in_box)
 
-    src_result = make_proxy_skeleton(
+    src_result = evaluate_skeletonization_interaction(
             tgt_src_index.sources,
-            evaluate_proxy=wrangler.evaluate_source_proxy_interaction,
-            evaluate_neighbor=wrangler.evaluate_source_neighbor_interaction,
+            evaluate_proxy=partial(
+                wrangler.evaluate_source_proxy_interaction,
+                ibrow=ibrow, ibcol=ibcol),
+            evaluate_neighbor=partial(
+                wrangler.evaluate_source_neighbor_interaction,
+                ibrow=ibrow, ibcol=ibcol),
             )
-    tgt_result = make_proxy_skeleton(
+    tgt_result = evaluate_skeletonization_interaction(
             tgt_src_index.targets,
-            evaluate_proxy=wrangler.evaluate_target_proxy_interaction,
-            evaluate_neighbor=wrangler.evaluate_target_neighbor_interaction,
+            evaluate_proxy=partial(
+                wrangler.evaluate_target_proxy_interaction,
+                ibrow=ibrow, ibcol=ibcol),
+            evaluate_neighbor=partial(
+                wrangler.evaluate_target_neighbor_interaction,
+                ibrow=ibrow, ibcol=ibcol)
             )
 
     src_skl_indices = np.empty(nclusters, dtype=object)
     tgt_skl_indices = np.empty(nclusters, dtype=object)
     skel_starts = np.zeros(nclusters + 1, dtype=np.int32)
 
-    L = np.full((nclusters, nclusters), 0, dtype=object)
-    R = np.full((nclusters, nclusters), 0, dtype=object)
+    L = np.empty(nclusters, dtype=object)
+    R = np.empty(nclusters, dtype=object)
 
     from pytential.linalg import interp_decomp
     for i in range(nclusters):
@@ -483,33 +619,39 @@ def _skeletonize_block_by_proxy_with_mats(
         src_mat = np.vstack(src_result[i])
         tgt_mat = np.hstack(tgt_result[i])
 
-        assert np.all(np.isfinite(tgt_mat)), np.where(np.isfinite(tgt_mat))
-        assert np.all(np.isfinite(src_mat)), np.where(np.isfinite(src_mat))
+        if __debug__:
+            isfinite = np.isfinite(tgt_mat)
+            assert np.all(isfinite), np.where(isfinite)
+            isfinite = np.isfinite(src_mat)
+            assert np.all(isfinite), np.where(isfinite)
 
         # skeletonize target points
-        k, idx, interp = interp_decomp(tgt_mat.T, k, id_eps)
+        k, idx, interp = interp_decomp(tgt_mat.T, rank=k, eps=id_eps)
         assert k > 0
 
-        L[i, i] = interp.T
+        L[i] = interp.T
         tgt_skl_indices[i] = tgt_src_index.targets.cluster_indices(i)[idx[:k]]
 
         # skeletonize source points
-        k, idx, interp = interp_decomp(src_mat, k, id_eps)
+        k, idx, interp = interp_decomp(src_mat, rank=k, eps=None)
         assert k > 0
 
-        R[i, i] = interp
+        R[i] = interp
         src_skl_indices[i] = tgt_src_index.sources.cluster_indices(i)[idx[:k]]
 
         skel_starts[i + 1] = skel_starts[i] + k
-        assert R[i, i].shape == (k, src_mat.shape[1])
-        assert L[i, i].shape == (tgt_mat.shape[0], k)
+        assert R[i].shape == (k, src_mat.shape[1])
+        assert L[i].shape == (tgt_mat.shape[0], k)
 
     from pytential.linalg import make_index_list
     src_skl_index = make_index_list(np.hstack(src_skl_indices), skel_starts)
     tgt_skl_index = make_index_list(np.hstack(tgt_skl_indices), skel_starts)
     skel_tgt_src_index = TargetAndSourceClusterList(tgt_skl_index, src_skl_index)
 
-    return L, R, skel_tgt_src_index, src_result, tgt_result
+    return SkeletonizationResult(
+            L=L, R=R,
+            tgt_src_index=tgt_src_index, skel_tgt_src_index=skel_tgt_src_index,
+            _src_eval_result=src_result, _tgt_eval_result=tgt_result)
 
 # }}}
 
@@ -528,7 +670,14 @@ class SkeletonizationResult:
 
     where :math:`S = A_{I, J}` for a subset :math:`I` and :math:`J` of the
     rows and columns of :math:`A`, respectively. This applies to each cluster
-    in :attr:`tgt_src_index`.
+    in :attr:`tgt_src_index`. In particular, for a cluster pair :math:`(i, j)`,
+    we can reconstruct the matrix entries as follows
+
+    .. code:: python
+
+        Aij = tgt_src_index.cluster_take(A, i, j)
+        Sij = skel_tgt_src_index.cluster_take(A, i, j)
+        Aij_approx = L[i] @ Sij @ R[j]
 
     .. attribute:: nclusters
 
@@ -536,11 +685,11 @@ class SkeletonizationResult:
 
     .. attribute:: L
 
-        A block diagonal :class:`~numpy.ndarray` object array.
+        An object :class:`~numpy.ndarray` of size ``(nclusters,)``.
 
     .. attribute:: R
 
-        A block diagonal :class:`~numpy.ndarray` object array.
+        An object :class:`~numpy.ndarray` of size ``(nclusters,)``.
 
     .. attribute:: tgt_src_index
 
@@ -550,8 +699,9 @@ class SkeletonizationResult:
     .. attribute:: skel_tgt_src_index
 
         A :class:`~pytential.linalg.TargetAndSourceClusterList` representing a
-        subset of :attr:`tgt_src_index` of the matrix :math:`S`, i.e. the
-        skeleton of each cluster of :math:`A`.
+        subset of :attr:`tgt_src_index`, i.e. the skeleton of each cluster of
+        :math:`A`. These indices can be used to reconstruct the :math:`S`
+        matrix.
     """
 
     L: np.ndarray
@@ -560,14 +710,19 @@ class SkeletonizationResult:
     tgt_src_index: TargetAndSourceClusterList
     skel_tgt_src_index: TargetAndSourceClusterList
 
+    # NOTE: these are meant only for testing! They contain the interactions
+    # between the source / target points and their proxies / neighbors.
+    _src_eval_result: Optional[_ProxyNeighborEvaluationResult] = None
+    _tgt_eval_result: Optional[_ProxyNeighborEvaluationResult] = None
+
     def __post_init__(self):
         if __debug__:
             nclusters = self.tgt_src_index.nclusters
-            shape = (nclusters, nclusters)
+            shape = (nclusters,)
 
             if self.tgt_src_index.nclusters != self.skel_tgt_src_index.nclusters:
                 raise ValueError("'tgt_src_index' and 'skel_tgt_src_index' have "
-                        f"different number of blocks: {self.tgt_src_index.nclusters}"
+                        f"different number of clusters: {nclusters}"
                         f" vs {self.skel_tgt_src_index.nclusters}")
 
             if self.L.shape != shape:
@@ -630,16 +785,11 @@ def skeletonize_by_proxy(
     skels = np.empty((wrangler.nrows, wrangler.ncols), dtype=object)
     for ibrow in range(wrangler.nrows):
         for ibcol in range(wrangler.ncols):
-            L, R, skel_tgt_src_index, _, _ = _skeletonize_block_by_proxy_with_mats(
+            skels[ibrow, ibcol] = _skeletonize_block_by_proxy_with_mats(
                     actx, ibrow, ibcol, places, proxy, wrangler, tgt_src_index,
                     id_eps=id_eps, id_rank=id_rank,
                     max_particles_in_box=max_particles_in_box)
 
-            skels[ibrow, ibcol] = SkeletonizationResult(
-                    L=L, R=R,
-                    tgt_src_index=tgt_src_index,
-                    skel_tgt_src_index=skel_tgt_src_index)
-
     return skels
 
 # }}}
diff --git a/pytential/linalg/utils.py b/pytential/linalg/utils.py
index 1bb8b2bec..151b394b4 100644
--- a/pytential/linalg/utils.py
+++ b/pytential/linalg/utils.py
@@ -325,14 +325,16 @@ def make_flat_cluster_diag(
 # {{{ interpolative decomposition
 
 def interp_decomp(
-        A: np.ndarray, rank: Optional[int], eps: Optional[float],
+        A: np.ndarray, *, rank: Optional[int], eps: Optional[float],
         ) -> Tuple[int, np.ndarray, np.ndarray]:
     """Wrapper for :func:`~scipy.linalg.interpolative.interp_decomp` that
     always has the same output signature.
 
-    :return: a tuple ``(k, idx, interp)`` containing the numerical rank,
-        the column indices and the resulting interpolation matrix.
+    :return: a tuple ``(k, idx, interp)`` containing the numerical rank *k*,
+        the column indices *idx* and the resulting interpolation matrix *interp*.
     """
+    if rank is not None and eps is not None:
+        raise ValueError("providing both 'rank' and 'eps' is not supported")
 
     import scipy.linalg.interpolative as sli    # pylint:disable=no-name-in-module
     if rank is None:
@@ -353,6 +355,31 @@ def cluster_skeletonization_error(
         mat: np.ndarray, skeleton: "SkeletonizationResult", *,
         ord: Optional[float] = None,
         relative: bool = False) -> np.ndarray:
+    r"""Evaluate the cluster-wise skeletonization errors for the given *skeleton*.
+
+    Errors are computed for all interactions between cluster :math:`i` and
+    cluster :math:`j` as
+
+    .. math::
+
+        E^T_{ij} = \|A_{ij} - L_i T_{ij}\|
+        \quad \text{and} \quad
+        E^S_{ij} = \|A_{ij} - S_{ij} R_j\|
+
+    where :math:`A_{ij}` is the exact interaction between the two clusters,
+    :math:`(L_i, R_j)` are the ID interpolation matrices and
+    :math:`(T_{ij}, S_{ij})` are the target and source skeleton matrices.
+    The exact matrix and the skeleton matrices are extracted from the full
+    matrix *mat*.
+
+    :arg ord: the type of the matrix norm used to compute errors, as described
+        in :func:`numpy.linalg.norm`. This norm is used in computing the
+        cluster-wise errors above.
+    :arg relative: if *True*, a relative norm of type *ord* is computed.
+    :returns: a :class:`tuple` of ``(src_errors, tgt_errors)``. Each error is
+        an object :class:`~numpy.ndarray` of shape ``(nclusters, nclusters)``
+        containing the errors between all non-self cluster interactions.
+    """
     from itertools import product
 
     L = skeleton.L
@@ -388,10 +415,10 @@ def mnorm(x: np.ndarray, y: np.ndarray) -> float:
 
         # compute cluster-wise errors
         S = mat[np.ix_(s_tgt, f_src)]
-        tgt_error[i, j] = mnorm(A, L[i, i] @ S)
+        tgt_error[i, j] = mnorm(A, L[i] @ S)
 
         S = mat[np.ix_(f_tgt, s_src)]
-        src_error[i, j] = mnorm(A, S @ R[j, j])
+        src_error[i, j] = mnorm(A, S @ R[j])
 
     # }}}
 
@@ -402,6 +429,28 @@ def skeletonization_error(
         mat: np.ndarray, skeleton: "SkeletonizationResult", *,
         ord: Optional[float] = None,
         relative: bool = False) -> np.ndarray:
+    r"""Computes the skeletonization error for the entire matrix *mat*.
+
+    The error computed here is given by
+
+    .. math::
+
+        E = \|A - L S R\|,
+
+    where :math:`A` is simply *mat*, :math:`L` and :math:`R` are block
+    diagonal matrices reconstructed from the block in *skeleton* and
+    :math:`S` is the skeleton matrix (which is a subset of the rows and
+    columns of :math:`A`).
+
+    Reconstructing the full matrix can be very costly. In these cases,
+    :func:`cluster_skeletonization_error` may be more appropriate.
+
+    :arg ord: the type of the matrix norm used to compute errors, as described
+        in :func:`numpy.linalg.norm`. This norm is used in computing the
+        reconstruction error above.
+    :arg relative: if *True*, a relative norm of type *ord* is computed.
+    """
+
     L = skeleton.L
     R = skeleton.R
     tgt_src_index = skeleton.tgt_src_index
@@ -425,7 +474,7 @@ def skeletonization_error(
         s_src = skel_tgt_src_index.sources.cluster_indices(j)
 
         S = mat[np.ix_(s_tgt, s_src)]
-        skl[np.ix_(f_tgt, f_src)] = L[i, i] @ S @ R[j, j]
+        skl[np.ix_(f_tgt, f_src)] = L[i] @ S @ R[j]
 
     # }}}
 
diff --git a/test/test_linalg_skeletonization.py b/test/test_linalg_skeletonization.py
index 292ebba63..33088723d 100644
--- a/test/test_linalg_skeletonization.py
+++ b/test/test_linalg_skeletonization.py
@@ -97,7 +97,11 @@ def _plot_skeleton_with_proxies(name, sources, pxy, srcindex, sklindex):
     replace(SKELETONIZE_TEST_CASES[0], op_type="double", knl_class_or_helmholtz_k=5),
     ])
 def test_skeletonize_symbolic(actx_factory, case, visualize=False):
-    """Tests that the symbolic manipulations work for different kernels / IntGs."""
+    """Tests that the symbolic manipulations work for different kernels / IntGs.
+    This tests that `prepare_expr` and `prepare_proxy_expr` can "clean" the
+    given integral equations and that the result can be evaluated and skeletonized.
+    """
+
     actx = actx_factory()
 
     if visualize:
@@ -152,7 +156,7 @@ def test_skeletonize_symbolic(actx_factory, case, visualize=False):
 
 def run_skeletonize_by_proxy(actx, case, resolution,
                              places=None, mat=None,
-                             force_assert=True,
+                             ctol=None, rtol=None,
                              suffix="", visualize=False):
     from pytools import ProcessTimer
 
@@ -223,52 +227,47 @@ def run_skeletonize_by_proxy(actx, case, resolution,
     from pytential.linalg.skeletonization import \
             _skeletonize_block_by_proxy_with_mats
     with ProcessTimer() as p:
-        L, R, skel_tgt_src_index, src, tgt = _skeletonize_block_by_proxy_with_mats(
+        skeleton = _skeletonize_block_by_proxy_with_mats(
                 actx, 0, 0, places, proxy_generator, wrangler, tgt_src_index,
                 id_eps=case.id_eps,
                 max_particles_in_box=case.max_particles_in_box)
 
     logger.info("[time] skeletonization by proxy: %s", p)
 
+    L, R = skeleton.L, skeleton.R
     for i in range(tgt_src_index.nclusters):
         # targets (rows)
         bi = np.searchsorted(
             tgt_src_index.targets.cluster_indices(i),
-            skel_tgt_src_index.targets.cluster_indices(i),
+            skeleton.skel_tgt_src_index.targets.cluster_indices(i),
             )
 
-        A = np.hstack(tgt[i])
+        A = np.hstack(skeleton._tgt_eval_result[i])
         S = A[bi, :]
-        tgt_error = la.norm(A - L[i, i] @ S, ord=2) / la.norm(A, ord=2)
+        tgt_error = la.norm(A - L[i] @ S, ord=2) / la.norm(A, ord=2)
 
         # sources (columns)
         bj = np.searchsorted(
             tgt_src_index.sources.cluster_indices(i),
-            skel_tgt_src_index.sources.cluster_indices(i),
+            skeleton.skel_tgt_src_index.sources.cluster_indices(i),
             )
 
-        A = np.vstack(src[i])
+        A = np.vstack(skeleton._src_eval_result[i])
         S = A[:, bj]
-        src_error = la.norm(A - S @ R[i, i], ord=2) / la.norm(A, ord=2)
+        src_error = la.norm(A - S @ R[i], ord=2) / la.norm(A, ord=2)
 
         logger.info("[%04d] id_eps %.5e src %.5e tgt %.5e rank %d/%d",
                 i, case.id_eps,
-                src_error, tgt_error, R[i, i].shape[0], R[i, i].shape[1])
+                src_error, tgt_error, R[i].shape[0], R[i].shape[1])
 
-        if force_assert:
-            assert src_error < 6 * case.id_eps
-            assert tgt_error < 6 * case.id_eps
+        if ctol is not None:
+            assert src_error < ctol * case.id_eps
+            assert tgt_error < ctol * case.id_eps
 
     # }}}
 
     # {{{ check skeletonize
 
-    from pytential.linalg import SkeletonizationResult
-    skeleton = SkeletonizationResult(
-            L=L, R=R,
-            tgt_src_index=tgt_src_index,
-            skel_tgt_src_index=skel_tgt_src_index)
-
     from pytential.linalg.utils import (
             cluster_skeletonization_error, skeletonization_error)
 
@@ -289,16 +288,14 @@ def run_skeletonize_by_proxy(actx, case, resolution,
 
         logger.info("[time] full error: %s", p)
 
-    # FIXME: why is the 3D error so large?
-    rtol = 10**places.ambient_dim * case.id_eps
-
     logger.info("error: id_eps %.5e R %.5e L %.5e F %.5e (rtol %.5e)",
-            case.id_eps, err_r, err_l, err_f, rtol)
+            case.id_eps, err_r, err_l, err_f,
+            rtol if rtol is not None else 0.0)
 
-    if force_assert:
-        assert err_l < rtol
-        assert err_r < rtol
-        assert err_f < rtol
+    if rtol:
+        assert err_l < rtol * case.id_eps
+        assert err_r < rtol * case.id_eps
+        assert err_f < rtol * case.id_eps
 
     # }}}
 
@@ -327,9 +324,9 @@ def run_skeletonize_by_proxy(actx, case, resolution,
                     ).reshape(places.ambient_dim, -1)
 
             _plot_skeleton_with_proxies(f"sources{suffix}", sources, pxy,
-                    tgt_src_index.sources, skel_tgt_src_index.sources)
+                    tgt_src_index.sources, skeleton.skel_tgt_src_index.sources)
             _plot_skeleton_with_proxies(f"targets{suffix}", sources, pxy,
-                    tgt_src_index.targets, skel_tgt_src_index.targets)
+                    tgt_src_index.targets, skeleton.skel_tgt_src_index.targets)
         else:
             # TODO: would be nice to figure out a way to visualize some of these
             # skeletonization results in 3D. Probably need to teach the
@@ -341,9 +338,18 @@ def run_skeletonize_by_proxy(actx, case, resolution,
     return err_f, (places, mat)
 
 
-@pytest.mark.parametrize("case", SKELETONIZE_TEST_CASES)
+@pytest.mark.parametrize("case", [
+    # NOTE: skip 2d tests, since they're better checked for convergence in
+    # `test_skeletonize_by_proxy_convergence`
+    # SKELETONIZE_TEST_CASES[0], SKELETONIZE_TEST_CASES[1],
+    SKELETONIZE_TEST_CASES[2],
+    ])
 def test_skeletonize_by_proxy(actx_factory, case, visualize=False):
-    """Test single-level level skeletonization accuracy."""
+    r"""Test single-level skeletonization accuracy. Checks that the error
+    satisfies :math:`e < c \epsilon_{id}` for a fixed ID tolerance and an
+    empirically determined (not too huge) :math:`c`.
+    """
+
     import scipy.linalg.interpolative as sli    # pylint:disable=no-name-in-module
     sli.seed(42)
 
@@ -355,7 +361,12 @@ def test_skeletonize_by_proxy(actx_factory, case, visualize=False):
     case = replace(case, approx_cluster_count=6, id_eps=1.0e-8)
     logger.info("\n%s", case)
 
-    run_skeletonize_by_proxy(actx, case, case.resolutions[0], visualize=visualize)
+    run_skeletonize_by_proxy(
+        actx, case, case.resolutions[0],
+        ctol=6,
+        # FIXME: why is the 3D error so large?
+        rtol=10**case.ambient_dim,
+        visualize=visualize)
 
 # }}}
 
@@ -382,7 +393,10 @@ def test_skeletonize_by_proxy(actx_factory, case, visualize=False):
 def test_skeletonize_by_proxy_convergence(
         actx_factory, case, weighted=True,
         visualize=False):
-    """Test single-level level skeletonization accuracy."""
+    r"""Test single-level skeletonization accuracy. Checks that the
+    accuracy of the skeletonization scales linearly with :math:`\epsilon_{id}`
+    (the ID tolerance).
+    """
     import scipy.linalg.interpolative as sli    # pylint:disable=no-name-in-module
     sli.seed(42)
 
@@ -416,15 +430,14 @@ def test_skeletonize_by_proxy_convergence(
     places = mat = None
     for i in range(id_eps.size):
         case = replace(case, id_eps=id_eps[i], weighted_proxy=weighted)
-
-        if not was_zero:
-            rec_error[i], (places, mat) = run_skeletonize_by_proxy(
-                actx, case, r, places=places, mat=mat,
-                force_assert=False,
-                suffix=f"{suffix}_{i:04d}", visualize=False)
+        rec_error[i], (places, mat) = run_skeletonize_by_proxy(
+            actx, case, r, places=places, mat=mat,
+            suffix=f"{suffix}_{i:04d}", visualize=False)
 
         was_zero = rec_error[i] == 0.0
         eoc.add_data_point(id_eps[i], rec_error[i])
+        if was_zero:
+            break
 
     logger.info("\n%s", eoc.pretty_print())