PrunaAI · johannaSommer · Aug 8, 2025 · Aug 6, 2025 · Aug 6, 2025 · Aug 6, 2025
diff --git a/README.md b/README.md
@@ -165,6 +165,7 @@ Since Pruna offers a broad range of optimization algorithms, the following table
 | `factorizer` | Factorization batches several small matrix multiplications into one large fused operation. | ✅ | ➖ | ➖ |
 | `enhancer`   | Enhances the model output by applying post-processing algorithms such as denoising or upscaling. | ❌ | ➖ | ✅ |
 | `distributer`   | Distributes the inference, the model or certain calculations across multiple devices. | ✅ | ❌ | ➖ |
+| `kernel`   | Kernels are specialized GPU routines that speed up parts of the computation. | ✅ | ➖ | ➖ |
 
 ✅ (improves), ➖ (approx. the same), ❌ (worsens)
 

diff --git a/docs/user_manual/configure.rst b/docs/user_manual/configure.rst
@@ -124,6 +124,11 @@ The table underneath provides a general overview of the impact of each algorithm
      - ✅
      - ❌
      - ➖
+   * - ``kernel``
+     - Specialized GPU routines that speed up parts of the computation.
+     - ✅
+     - ➖
+     - ➖
 
 ✅(improves), ➖(approx. the same), ❌(worsens)
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -118,6 +118,8 @@ dependencies = [
     "gliner; python_version >= '3.10'",
     "piq",
     "opencv-python",
+    "kernels",
+    "aenum",
 
 ]
 

diff --git a/src/pruna/algorithms/caching/fora.py b/src/pruna/algorithms/caching/fora.py
@@ -43,6 +43,7 @@ class FORACacher(PrunaCacher):
         compiler=["stable_fast", "torch_compile"],
         quantizer=["diffusers_int8", "hqq_diffusers", "torchao"],
         factorizer=["qkv_diffusers"],
+        kernel=["flash_attn3"],
     )
 
     def get_hyperparameters(self) -> list:

diff --git a/src/pruna/algorithms/compilation/torch_compile.py b/src/pruna/algorithms/compilation/torch_compile.py
@@ -55,6 +55,7 @@ class TorchCompileCompiler(PrunaCompiler):
         quantizer=["half", "hqq_diffusers", "diffusers_int8", "gptq", "llm_int8", "hqq", "torchao"],
         cacher=["deepcache", "fora"],
         pruner=["torch_structured"],
+        kernel=["flash_attn3"],
     )
 
     def get_hyperparameters(self) -> list:

diff --git a/src/pruna/algorithms/kernels/__init__.py b/src/pruna/algorithms/kernels/__init__.py
@@ -0,0 +1,24 @@
+# Copyright 2025 - Pruna AI GmbH. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pruna.algorithms.pruna_base import PrunaAlgorithmBase
+from pruna.config.smash_space import KERNEL
+from pruna.engine.save import SAVE_FUNCTIONS
+
+
+class PrunaKernel(PrunaAlgorithmBase):
+    """
+    Base class for kernel algorithms.
+
+    Kernels are specialized GPU routines that speed up parts of the computation.
+    This class only fixes the algorithm group and the save function; everything
+    else is inherited from ``PrunaAlgorithmBase``.
+    """
+
+    # Group identifier imported from ``pruna.config.smash_space``; inherited by
+    # subclasses unless they override it.
+    algorithm_group = KERNEL
+    # NOTE(review): ``reapply`` presumably means the kernel is applied again when a
+    # saved model is loaded rather than being serialized with it - confirm against
+    # ``pruna.engine.save.SAVE_FUNCTIONS``.
+    save_fn = SAVE_FUNCTIONS.reapply
-Original file line number
+Diff line change
@@ Expand Up @@
          - ✅
          - ❌
          - ➖
+       * - ``kernel``
+         - Specialized GPU routines that speed up parts of the computation.
+         - ✅
+         - ➖
+         - ➖
     ✅(improves), ➖(approx. the same), ❌(worsens)
@@ Expand Down @@