85 changes: 50 additions & 35 deletions docs/user_manual/customize_metric.rst
@@ -10,11 +10,11 @@ If this is your first time contributing to |pruna|, please refer to the :ref:`ho
1. Choosing the right type of metric
------------------------------------

|pruna|'s evaluation system supports two types of metrics, located under ``pruna/evaluation/metrics``: ``BaseMetric`` and ``StatefulMetric``.

These two types are designed to accommodate different use cases.

- **BaseMetric**: Inherit from ``BaseMetric`` and compute values directly without maintaining state.
- Used when isolated inference is required (e.g., ``latency``, ``disk_memory``, etc.)
- **StatefulMetric**: Inherit from ``StatefulMetric`` and accumulate state across multiple batches.
- Best suited for quality evaluations (e.g., ``accuracy``, ``clip_score``, etc.)
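The split between the two base classes can be sketched in plain Python. These toy classes are illustrative only — they are not pruna's actual ``BaseMetric``/``StatefulMetric``, just the two shapes they represent:

```python
# Illustrative sketch only: pruna's real base classes live under
# pruna/evaluation/metrics and have richer interfaces.

class StatelessMetricSketch:
    """Computes its value in one shot, like a latency or disk-memory metric."""

    def compute(self, values):
        # No state is kept between calls; everything happens here.
        return sum(values) / len(values)


class StatefulMetricSketch:
    """Accumulates state across batches, then reduces it at the end."""

    def __init__(self):
        self.total = 0.0
        self.count = 0

    def update(self, batch):
        # Called once per batch; only the running state is stored.
        self.total += sum(batch)
        self.count += len(batch)

    def compute(self):
        # Final reduction over everything accumulated so far.
        return self.total / self.count
```

The stateless variant is natural when each evaluation needs its own isolated inference run; the stateful variant lets quality metrics stream over a whole dataloader without holding all outputs in memory.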
@@ -27,7 +27,7 @@ These two types are designed to accommodate different use cases.

Create a new file in ``pruna/evaluation/metrics`` with a descriptive name for your metric (e.g., ``your_new_metric.py``).

We use snake_case for the file names (e.g., ``your_new_metric.py``), PascalCase for the class names (e.g., ``YourNewMetric``), and NumPy-style docstrings for documentation.

Both ``BaseMetric`` and ``StatefulMetric`` return a ``MetricResult`` object, which contains the metric name, result value and other metadata.

@@ -40,14 +40,14 @@ Your metric should have a ``metric_name`` attribute and a ``higher_is_better`` a

``compute()`` takes two parameters: ``model`` and ``dataloader``.

Inside ``compute()``, you are responsible for running inference manually.

Your method should return a ``MetricResult`` object with the metric name, result value and other metadata. The result value should be a float or int.

.. code-block:: python

from pruna.evaluation.metrics.metric_base import BaseMetric
from pruna.evaluation.metrics.result import MetricResult

class YourNewMetric(BaseMetric):
'''Your metric description'''
@@ -105,7 +105,7 @@ Here's a complete example implementing a ``StatefulMetric`` with a single ``call
self.param1 = param1
self.param2 = param2
self.call_type = get_call_type_for_single_metric(call_type, self.default_call_type)  # resolve the effective call type via the helper

# Initialize state variables
self.add_state("total", torch.zeros(1))
self.add_state("count", torch.zeros(1))
@@ -148,39 +148,45 @@ Understanding Call Types
| `pairwise_y_gt` | Base model's output first, then subsequent model's output |
+--------------------+-------------------------------------------------------------+
| `pairwise_gt_y` | Subsequent model's output first, then base model's output |
+--------------------+-------------------------------------------------------------+


You need to decide on the default ``call_type`` based on the metric you are implementing.

For example, if you are implementing a metric that compares two models, you should use the ``pairwise_y_gt`` call type. Examples from |pruna| include ``psnr``, ``ssim``, ``lpips``.

If you are implementing an alignment metric comparing the model's output with the input, you should use the ``x_gt`` or ``gt_x`` call type. Examples from |pruna| include ``clip_score``.

If you are implementing a metric that compares the model's output with the ground truth, you should use the ``y_gt`` or ``gt_y`` call type. Examples from |pruna| include ``fid``, ``cmmd``, ``accuracy``, ``recall``, ``precision``.

You may want to switch the metric's mode away from its default ``call_type``. For instance, you may want to use ``fid`` in pairwise mode to get a single comparison score for two models.

In this case, you can pass ``pairwise`` to the ``call_type`` parameter of the ``StatefulMetric`` constructor.
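The call types above can be read as argument orderings. The sketch below is a hypothetical illustration of that mapping — the pairwise rows follow the table above, while the non-pairwise orderings are assumptions inferred from the names; pruna's real logic lives in its ``metric_data_processor`` helper:

```python
# Hypothetical sketch: map each call_type string to the order in which a
# metric would receive its two inputs. Not pruna's actual implementation.

def metric_input_order(call_type: str) -> tuple[str, str]:
    """Return a symbolic (first, second) argument ordering for a call type."""
    table = {
        "x_gt": ("input", "ground_truth"),
        "gt_x": ("ground_truth", "input"),
        "y_gt": ("output", "ground_truth"),
        "gt_y": ("ground_truth", "output"),
        # Pairwise rows follow the table above: the two models' outputs
        # are compared against each other instead of a ground truth.
        "pairwise_y_gt": ("base_output", "subsequent_output"),
        "pairwise_gt_y": ("subsequent_output", "base_output"),
    }
    return table[call_type]
```

Picking the default ``call_type`` for your metric then amounts to deciding which pair of inputs the metric semantically compares.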


.. container:: hidden_code

.. code-block:: python

import sys, types

mod_name = "pruna.evaluation.metrics.your_metric_file"
dummy = types.ModuleType(mod_name)

class YourNewStatefulMetric:
def __init__(self, *args, **kwargs): pass
def reset(self): ...
def update(self, *a, **k): ...
def compute(self): return 0.0

dummy.YourNewStatefulMetric = YourNewStatefulMetric
sys.modules[mod_name] = dummy

.. code-block:: python

from pruna.evaluation.metrics.your_metric_file import YourNewStatefulMetric

# Initialize your metric from the instance
YourNewStatefulMetric(param1='value1', param2='value2', call_type="pairwise")

If you have implemented your metric using the ``get_call_type_for_single_metric`` helper and the ``metric_data_processor`` function, this will work as expected.

@@ -212,30 +218,39 @@ Thanks to this registry system, everyone using |pruna| can now refer to your met
.. code-block:: python

# mock certain imports to make the code block runnable
import sys, types
from pruna.evaluation.metrics.registry import MetricRegistry

mod_name = "pruna.evaluation.metrics.your_metric_file"
dummy = types.ModuleType(mod_name)

@MetricRegistry.register("your_new_metric_name")
class YourNewMetric:
def __init__(self, *args, **kwargs): pass
def compute(self): return 0.0

dummy.YourNewMetric = YourNewMetric
sys.modules[mod_name] = dummy

.. code-block:: python

from pruna.evaluation.metrics.your_metric_file import YourNewMetric

# Classic way: Initialize your metric from the instance
YourNewMetric(param1='value1', param2='value2')

.. code-block:: python

from pruna.evaluation.task import Task
from pruna.data.pruna_datamodule import PrunaDataModule

metrics = [
'your_new_metric_name'
]

# Now you can create a task with your metric from the metric name.
task = Task(request=metrics, datamodule=PrunaDataModule.from_string('LAION256'))


One important thing: the registration happens when your module is imported. To ensure your metric is always available, we suggest importing it in the ``pruna/evaluation/metrics/__init__.py`` file.
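The import-time behaviour is a general property of decorator-based registries, which a minimal toy version makes concrete. This sketch uses its own stand-in registry, not pruna's ``MetricRegistry``:

```python
# Toy registry demonstrating why registration happens at import time:
# the decorator body runs as a side effect of the class statement, which
# executes when the defining module is imported.

registry: dict[str, type] = {}


def register(name: str):
    """Record the decorated class under `name` the moment it is defined."""
    def deco(cls):
        registry[name] = cls
        return cls
    return deco


@register("my_metric")  # executes immediately, before any instance is created
class MyMetric:
    pass
```

Because no instance is needed for the side effect to fire, simply importing the module that defines ``MyMetric`` is enough — which is exactly why re-exporting your metric from the package ``__init__.py`` guarantees it is registered.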
@@ -261,23 +276,24 @@ Once you've implemented your metric, everyone can use it in Pruna's evaluation p
.. code-block:: python

# mock certain imports to make the code block runnable
import sys, types

modname = "pruna.evaluation.metrics.your_metric_file"
dummy = types.ModuleType(modname)

class YourNewMetric:
def __init__(self, *a, **k): ...
def compute(self): return 0.0

dummy.YourNewMetric = YourNewMetric
sys.modules[modname] = dummy

.. code-block:: python
:emphasize-lines: 2, 6

from pruna.evaluation.metrics.metric_torch import TorchMetricWrapper
from pruna.evaluation.metrics.your_metric_file import YourNewMetric

metrics = [
'clip_score',
'your_new_metric_name'
Expand All @@ -288,4 +304,3 @@ Once you've implemented your metric, everyone can use it in Pruna's evaluation p
eval_agent = EvaluationAgent(task=task)

results = eval_agent.evaluate(model)

23 changes: 13 additions & 10 deletions docs/user_manual/evaluate.rst
@@ -106,8 +106,8 @@ The ``EvaluationAgent`` is the main class for evaluating model performance. It c
from pruna.data.pruna_datamodule import PrunaDataModule

eval_agent = EvaluationAgent(
request=["cmmd", "ssim"],
datamodule=PrunaDataModule.from_string('LAION256'),
device="cpu"
)

@@ -122,8 +122,8 @@ The ``EvaluationAgent`` is the main class for evaluating model performance. It c
from pruna.data.pruna_datamodule import PrunaDataModule

task = Task(
request=["cmmd", "ssim"],
datamodule=PrunaDataModule.from_string('LAION256'),
device="cpu"
)
eval_agent = EvaluationAgent(task)
@@ -146,8 +146,8 @@ The ``Task`` class provides an alternative way to define evaluation configuratio
from pruna.data.pruna_datamodule import PrunaDataModule

task = Task(
request=["cmmd", "ssim"],
datamodule=PrunaDataModule.from_string('LAION256'),
device="cpu"
)

@@ -306,7 +306,7 @@ The ``MetricResult`` class stores the metric's name, any associated parameters,

.. code-block:: python

# Example output
MetricResult(
name="clip_score",
params={"param1": "value1", "param2": "value2"},
@@ -336,7 +336,8 @@ The ``EvaluationAgent`` accepts ``PrunaDataModule`` in two different ways:
from pruna.data.pruna_datamodule import PrunaDataModule

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token

# Create the data Module
datamodule = PrunaDataModule.from_string(
@@ -359,7 +360,8 @@ The ``EvaluationAgent`` accepts ``PrunaDataModule`` in two different ways:
from pruna.data.utils import split_train_into_train_val_test

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token

# Load custom datasets
train_ds = load_dataset("SamuelYang/bookcorpus")["train"]
@@ -387,7 +389,8 @@ Lastly, you can limit the number of samples in the dataset by using the ``PrunaD
from pruna.data.pruna_datamodule import PrunaDataModule

# Create the data module
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token
datamodule = PrunaDataModule.from_string("WikiText", tokenizer=tokenizer)

# Limit all splits to 100 samples
18 changes: 12 additions & 6 deletions docs/user_manual/smash.rst
@@ -48,12 +48,15 @@ Let's see what that looks like in code.

# Evaluate the model
metrics = ["clip_score", "psnr"]
datamodule = PrunaDataModule.from_string("LAION256")
datamodule.limit_datasets(10) # You can limit the number of samples.
task = Task(metrics, datamodule=datamodule)
eval_agent = EvaluationAgent(task)
eval_agent.evaluate(optimized_model)

# Run inference
optimized_model.set_progress_bar_config(disable=True)
optimized_model.to("cuda")
optimized_model("A serene landscape with mountains").images[0].save("output.png")

Step-by-Step Optimisation Workflow
@@ -125,9 +128,11 @@ To evaluate the optimized model, we can use the same interface as the original m

from pruna.data.pruna_datamodule import PrunaDataModule
from pruna.evaluation.evaluation_agent import EvaluationAgent
from pruna.engine.pruna_model import PrunaModel
from pruna.evaluation.task import Task

# Load the optimized model
optimized_model = PrunaModel.from_hub("PrunaAI/Segmind-Vega-smashed")

# Define metrics
metrics = ['clip_score', 'psnr']
Expand All @@ -138,7 +143,8 @@ To evaluate the optimized model, we can use the same interface as the original m
# Evaluate the model
eval_agent = EvaluationAgent(task)
results = eval_agent.evaluate(optimized_model)
for result in results:
print(result)

To understand how to run more complex evaluation workflows, see :doc:`Evaluate a model </docs_pruna/user_manual/evaluate>`.

@@ -149,7 +155,7 @@ To run inference with the optimized model, we can use the same interface as the

.. code-block:: python

from pruna.engine.pruna_model import PrunaModel

# Load the optimized model
optimized_model = PrunaModel.from_hub("PrunaAI/Segmind-Vega-smashed")
@@ -178,7 +184,6 @@ Example 1: Diffusion Model Optimization

# Create and configure SmashConfig
smash_config = SmashConfig()
smash_config["quantizer"] = "hqq_diffusers"

# Optimize the model
@@ -199,7 +204,7 @@ Example 2: Large Language Model Optimization
from pruna import SmashConfig, smash

# Load the model
model_id = "NousResearch/Llama-3.2-1B"
pipe = pipeline("text-generation", model=model_id)

# Create and configure SmashConfig
@@ -231,6 +236,7 @@ Example 3: Speech Recognition Optimization
# Create and configure SmashConfig
smash_config = SmashConfig()
smash_config.add_processor(model_id) # Required for Whisper
smash_config.add_tokenizer(model_id)
smash_config["compiler"] = "c_whisper"
smash_config["batcher"] = "whisper_s2t"

24 changes: 9 additions & 15 deletions tests/documentation/test_code_blocks.py
@@ -8,22 +8,16 @@

TUTORIAL_PATH = Path(os.path.dirname(__file__)).parent.parent / "docs"


@pytest.mark.parametrize(
"rst_path",
(TUTORIAL_PATH / "user_manual").glob("*.rst"),
ids=lambda p: p.stem,
)
def test_codeblocks_cuda(rst_path, tmp_path):
out_dir = tmp_path / "blocks" # unique per test instance
extract_python_code_blocks(rst_path, out_dir)

for script in sorted(out_dir.iterdir()):
run_script_successfully(script)

shutil.rmtree(out_dir)