diff --git a/docs/user_manual/customize_metric.rst b/docs/user_manual/customize_metric.rst
index bb66f157..d720f59b 100644
--- a/docs/user_manual/customize_metric.rst
+++ b/docs/user_manual/customize_metric.rst
@@ -10,11 +10,11 @@ If this is your first time contributing to |pruna|, please refer to the :ref:`ho
 1. Choosing the right type of metric
 ------------------------------------
 
-|pruna|'s evaluation system supports two types of metrics, located under ``pruna/evaluation/metrics``: ``BaseMetric`` and ``StatefulMetric``. 
+|pruna|'s evaluation system supports two types of metrics, located under ``pruna/evaluation/metrics``: ``BaseMetric`` and ``StatefulMetric``.
 These two types are designed to accommodate different use cases.
 
 
-- **BaseMetric**: Inherit from ``BaseMetric`` and compute values directly without maintaining state. 
+- **BaseMetric**: Inherit from ``BaseMetric`` and compute values directly without maintaining state.
     - Used when isolated inference is required (e.g., ``latency``, ``disk_memory``, etc.)
 - **StatefulMetric**: Inherit from ``StatefulMetric`` and accumulate state across multiple batches.
     - Best suited for quality evaluations (e.g, ``accuracy``, ``clip_score``, etc.)
@@ -27,7 +27,7 @@ These two types are designed to accommodate different use cases.
 
 Create a new file in ``pruna/evaluation/metrics`` with a descriptive name for your metric. (e.g, ``your_new_metric.py``)
 
-We use snake_case for the file names (e.g., ``your_new_metric.py``), PascalCase for the class names (e.g, ``YourNewMetric``) and NumPy style docstrings for documentation. 
+We use snake_case for the file names (e.g., ``your_new_metric.py``), PascalCase for the class names (e.g, ``YourNewMetric``) and NumPy style docstrings for documentation.
 
 Both ``BaseMetric`` and ``StatefulMetric`` return a ``MetricResult`` object, which contains the metric name, result value and other metadata.
@@ -40,14 +40,14 @@ Your metric should have a ``metric_name`` attribute and a ``higher_is_better`` a
 
 ``compute()`` takes two parameters: ``model`` and ``dataloader``.
 
-Inside ``compute()``, you are responsible for running inference manually. 
+Inside ``compute()``, you are responsible for running inference manually.
 Your method should return a ``MetricResult`` object with the metric name, result value and other metadata.
 The result value should be a float or int.
 
 .. code-block:: python
 
     from pruna.evaluation.metrics.metric_base import BaseMetric
-    from pruna.evaluation.metrics.metric_result import MetricResult
+    from pruna.evaluation.metrics.result import MetricResult
 
     class YourNewMetric(BaseMetric):
         '''Your metric description'''
@@ -105,7 +105,7 @@ Here's a complete example implementing a ``StatefulMetric`` with a single ``call
             self.param1 = param1
             self.param2 = param2
             self.call_type = get_call_type_for_single_metric(call_type, self.default_call_type)  # Call the correct helper function to get the correct call_type
-            
+
             # Initialize state variables
             self.add_state("total", torch.zeros(1))
             self.add_state("count", torch.zeros(1))
@@ -148,10 +148,10 @@ Understanding Call Types
 | `pairwise_y_gt`    | Base model's output first, then subsequent model's output   |
 +--------------------+--------------------------------------------------------------+
 | `pairwise_gt_y`    | Subsequent model's output first, then base model's output   |
-+--------------------+--------------------------------------------------------------+ 
++--------------------+--------------------------------------------------------------+
 
 
-You need to decide on the default ``call_type`` based on the metric you are implementing. 
+You need to decide on the default ``call_type`` based on the metric you are implementing.
 For example, if you are implementing a metric that compares two models, you should use the ``pairwise_y_gt`` call type.
 Examples from |pruna| include ``psnr``, ``ssim``, ``lpips``.
@@ -159,28 +159,34 @@ If you are implementing an alignment metric comparing model's output with the in
 If you are implementing a metric that compares the model's output with the ground truth, you should use the ``y_gt`` or ``gt_y`` call type.
 Examples from |pruna| include ``fid``, ``cmmd``, ``accuracy``, ``recall``, ``precision``.
 
-You may want to switch the mode of the metric despite your default ``call_type``. For instance you may want to use ``fid`` in pairwise mode to get a single comparison score for two models. 
+You may want to switch the mode of the metric despite your default ``call_type``. For instance you may want to use ``fid`` in pairwise mode to get a single comparison score for two models.
 In this case, you can pass ``pairwise`` to the ``call_type`` parameter of the ``StatefulMetric`` constructor.
 
- 
 
 .. container:: hidden_code
 
     .. code-block:: python
 
-        import sys
-        import types
+        import sys, types
 
-        dummy_your_metric = types.ModuleType("pruna.evaluation.metrics.your_metric_file")
-        dummy_your_metric.YourNewStatefulMetric = "dummy_your_metric"
-        sys.modules["pruna.evaluation.metrics.your_metric_file"] = dummy_your_metric
+        mod_name = "pruna.evaluation.metrics.your_metric_file"
+        dummy = types.ModuleType(mod_name)
+
+        class YourNewStatefulMetric:
+            def __init__(self, *args, **kwargs): pass
+            def reset(self): ...
+            def update(self, *a, **k): ...
+            def compute(self): return 0.0
+
+        dummy.YourNewStatefulMetric = YourNewStatefulMetric
+        sys.modules[mod_name] = dummy
 
 .. code-block:: python
 
     from pruna.evaluation.metrics.your_metric_file import YourNewStatefulMetric
-    
+
     # Initialize your metric from the instance
-    YourNewStatefulMetric(param1='value1', param2='value2', call_type="pairwise") 
+    YourNewStatefulMetric(param1='value1', param2='value2', call_type="pairwise")
 
 If you have implemented your metric using the correct ``get_call_type_for_metric`` function and ``metric_data_processor`` function, this will work as expected.
@@ -212,30 +218,39 @@ Thanks to this registry system, everyone using |pruna| can now refer to your met
     .. code-block:: python
 
         # mock certain imports to make the code block runnable
-        import sys
-        import types
-        dummy_your_metric = types.ModuleType("pruna.evaluation.metrics.your_metric_file")
-        dummy_your_metric.YourNewMetric = "dummy_your_metric"
-        sys.modules["pruna.evaluation.metrics.your_metric_file"] = dummy_your_metric
+        import sys, types
+        from pruna.evaluation.metrics.registry import MetricRegistry
+
+        mod_name = "pruna.evaluation.metrics.your_metric_file"
+        dummy = types.ModuleType(mod_name)
+
+        @MetricRegistry.register("your_new_metric_name")
+        class YourNewMetric:
+            def __init__(self, *args, **kwargs): pass
+            def compute(self): return 0.0
+
+        dummy.YourNewMetric = YourNewMetric
+        sys.modules[mod_name] = dummy
 
 
 .. code-block:: python
 
     from pruna.evaluation.metrics.your_metric_file import YourNewMetric
 
     # Classic way: Initialize your metric from the instance
-    YourNewMetric(param1='value1', param2='value2') 
+    YourNewMetric(param1='value1', param2='value2')
 
 .. code-block:: python
 
     from pruna.evaluation.task import Task
+    from pruna.data.pruna_datamodule import PrunaDataModule
 
     metrics = [
-        'your_metric_name'
+        'your_new_metric_name'
     ]
 
     # Now you can create a task with your metric from the metric name.
-    task = Task(request=metrics, data_module=pruna.data.pruna_datamodule.PrunaDataModule.from_string('LAION256'))
+    task = Task(request=metrics, datamodule=PrunaDataModule.from_string('LAION256'))
 
 One important thing: the registration happens when your module is imported. To ensure your metric is always available, we suggest importing it in ``pruna/evaluation/metrics/__init__.py`` file.
@@ -261,23 +276,24 @@ Once you've implemented your metric, everyone can use it in Pruna's evaluation p
     .. code-block:: python
 
         # mock certain imports to make the code block runnable
-        import sys
-        import types
-        from diffusers import StableDiffusionPipeline
+        import sys, types
 
-        dummy_your_metric = types.ModuleType("pruna.evaluation.metrics.your_metric_file")
-        dummy_your_metric.YourNewMetric = "dummy_your_metric"
-        sys.modules["pruna.evaluation.metrics.your_metric_file"] = dummy_your_metric
+        modname = "pruna.evaluation.metrics.your_metric_file"
+        dummy = types.ModuleType(modname)
 
-        model_path = "CompVis/stable-diffusion-v1-4"
-        model = StableDiffusionPipeline.from_pretrained(model_path)
+        class YourNewMetric:
+            def __init__(self, *a, **k): ...
+            def compute(self): return 0.0
+
+        dummy.YourNewMetric = YourNewMetric
+        sys.modules[modname] = dummy
 
 .. code-block:: python
     :emphasize-lines: 2, 6
 
     from pruna.evaluation.metrics.metric_torch import TorchMetricWrapper
     from pruna.evaluation.metrics.your_metric_file import YourNewMetric
-    
+
     metrics = [
         'clip_score',
         'your_new_metric_name'
@@ -288,4 +304,3 @@ eval_agent = EvaluationAgent(task=task)
 
     results = eval_agent.evaluate(model)
 
-
diff --git a/docs/user_manual/evaluate.rst b/docs/user_manual/evaluate.rst
index bc2cd527..5d981b44 100644
--- a/docs/user_manual/evaluate.rst
+++ b/docs/user_manual/evaluate.rst
@@ -106,8 +106,8 @@ The ``EvaluationAgent`` is the main class for evaluating model performance. It c
     from pruna.data.pruna_datamodule import PrunaDataModule
 
     eval_agent = EvaluationAgent(
-        request=["accuracy", "perplexity"],
-        datamodule=PrunaDataModule.from_string('WikiText'),
+        request=["cmmd", "ssim"],
+        datamodule=PrunaDataModule.from_string('LAION256'),
         device="cpu"
     )
 
@@ -122,8 +122,8 @@ The ``EvaluationAgent`` is the main class for evaluating model performance. It c
     from pruna.data.pruna_datamodule import PrunaDataModule
 
     task = Task(
-        request=["accuracy", "perplexity"],
-        datamodule=PrunaDataModule.from_string('WikiText'),
+        request=["cmmd", "ssim"],
+        datamodule=PrunaDataModule.from_string('LAION256'),
         device="cpu"
     )
     eval_agent = EvaluationAgent(task)
@@ -146,8 +146,8 @@ The ``Task`` class provides an alternative way to define evaluation configuratio
     from pruna.data.pruna_datamodule import PrunaDataModule
 
     task = Task(
-        request=["accuracy", "perplexity"],
-        datamodule=PrunaDataModule.from_string('WikiText'),
+        request=["cmmd", "ssim"],
+        datamodule=PrunaDataModule.from_string('LAION256'),
         device="cpu"
     )
 
@@ -306,7 +306,7 @@ The ``MetricResult`` class stores the metric's name, any associated parameters,
 
 .. code-block:: python
 
-    # Example output 
+    # Example output
     MetricResult(
         name="clip_score",
         params={"param1": "value1", "param2": "value2"},
@@ -336,7 +336,8 @@ The ``EvaluationAgent`` accepts ``PrunaDataModule`` in two different ways:
     from pruna.data.pruna_datamodule import PrunaDataModule
 
     # Load the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1b-Instruct")
+    tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
+    tokenizer.pad_token = tokenizer.eos_token
 
     # Create the data Module
     datamodule = PrunaDataModule.from_string(
@@ -359,7 +360,8 @@ The ``EvaluationAgent`` accepts ``PrunaDataModule`` in two different ways:
     from pruna.data.utils import split_train_into_train_val_test
 
     # Load the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1b-Instruct")
+    tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
+    tokenizer.pad_token = tokenizer.eos_token
 
     # Load custom datasets
     train_ds = load_dataset("SamuelYang/bookcorpus")["train"]
@@ -387,7 +389,8 @@ Lastly, you can limit the number of samples in the dataset by using the ``PrunaD
     from pruna.data.pruna_datamodule import PrunaDataModule
 
     # Create the data module
-    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1b-Instruct")
+    tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
+    tokenizer.pad_token = tokenizer.eos_token
     datamodule = PrunaDataModule.from_string("WikiText", tokenizer=tokenizer)
 
     # Limit all splits to 100 samples
diff --git a/docs/user_manual/smash.rst b/docs/user_manual/smash.rst
index 024e898c..b58e798a 100644
--- a/docs/user_manual/smash.rst
+++ b/docs/user_manual/smash.rst
@@ -48,12 +48,15 @@ Let's see what that looks like in code.
 
     # Evaluate the model
    metrics = ["clip_score", "psnr"]
-    task = Task(metrics, datamodule=PrunaDataModule.from_string("LAION256"))
+    datamodule = PrunaDataModule.from_string("LAION256")
+    datamodule.limit_datasets(10)  # You can limit the number of samples.
+    task = Task(metrics, datamodule=datamodule)
     eval_agent = EvaluationAgent(task)
     eval_agent.evaluate(optimized_model)
 
     # Run inference
     optimized_model.set_progress_bar_config(disable=True)
+    optimized_model.to("cuda")
     optimized_model("A serene landscape with mountains").images[0].save("output.png")
 
 Step-by-Step Optimisation Workflow
@@ -125,9 +128,11 @@ To evaluate the optimized model, we can use the same interface as the original m
 
     from pruna.data.pruna_datamodule import PrunaDataModule
     from pruna.evaluation.evaluation_agent import EvaluationAgent
+    from pruna.engine.pruna_model import PrunaModel
+    from pruna.evaluation.task import Task
 
     # Load the optimized model
-    optimized_model = PrunaModel.from_pretrained("PrunaAI/Segmind-Vega-smashed")
+    optimized_model = PrunaModel.from_hub("PrunaAI/Segmind-Vega-smashed")
 
     # Define metrics
     metrics = ['clip_score', 'psnr']
@@ -138,7 +143,8 @@ To evaluate the optimized model, we can use the same interface as the original m
     # Evaluate the model
     eval_agent = EvaluationAgent(task)
     results = eval_agent.evaluate(optimized_model)
-    print(results)
+    for result in results:
+        print(result)
 
 To understand how to run more complex evaluation workflows, see :doc:`Evaluate a model `.
 
@@ -149,7 +155,7 @@ To run inference with the optimized model, we can use the same interface as the
 
 .. code-block:: python
 
-    from pruna import PrunaModel
+    from pruna.engine.pruna_model import PrunaModel
 
     # Load the optimized model
     optimized_model = PrunaModel.from_hub("PrunaAI/Segmind-Vega-smashed")
@@ -178,7 +184,6 @@ Example 1: Diffusion Model Optimization
 
     # Create and configure SmashConfig
     smash_config = SmashConfig()
-    smash_config["compiler"] = "torch_compile"
     smash_config["quantizer"] = "hqq_diffusers"
 
     # Optimize the model
@@ -199,7 +204,7 @@ Example 2: Large Language Model Optimization
     from pruna import SmashConfig, smash
 
     # Load the model
-    model_id = "meta-llama/Llama-3.2-1b-Instruct"
+    model_id = "NousResearch/Llama-3.2-1B"
     pipe = pipeline("text-generation", model=model_id)
 
     # Create and configure SmashConfig
@@ -231,6 +236,7 @@ Example 3: Speech Recognition Optimization
     # Create and configure SmashConfig
     smash_config = SmashConfig()
     smash_config.add_processor(model_id)  # Required for Whisper
+    smash_config.add_tokenizer(model_id)
     smash_config["compiler"] = "c_whisper"
     smash_config["batcher"] = "whisper_s2t"
diff --git a/tests/documentation/test_code_blocks.py b/tests/documentation/test_code_blocks.py
index 9f96c1d6..610d87ff 100644
--- a/tests/documentation/test_code_blocks.py
+++ b/tests/documentation/test_code_blocks.py
@@ -8,22 +8,16 @@ TUTORIAL_PATH = Path(os.path.dirname(__file__)).parent.parent / "docs"
 
 
-
 @pytest.mark.parametrize(
-    "rst_name",
-    [
-        pytest.param(f"user_manual/{path.stem}", marks=pytest.mark.cuda)
-        for path in TUTORIAL_PATH.glob("user_manual/*.rst")
-    ]
+    "rst_path",
+    (TUTORIAL_PATH / "user_manual").glob("*.rst"),
+    ids=lambda p: p.stem,
 )
-def test_codeblocks_cuda(rst_name: str) -> None:
-    """Test to ensure the notebook runs without errors."""
-    rst_file_path = str(TUTORIAL_PATH / f"{rst_name}.rst")
-    output_dir = "tmp/code_blocks"
-    extract_python_code_blocks(rst_file_path, output_dir)
+def test_codeblocks_cuda(rst_path, tmp_path):
+    out_dir = tmp_path / "blocks"  # unique per test instance
+    extract_python_code_blocks(rst_path, out_dir)
 
-    for file in sorted(os.listdir(output_dir)):
-        file_path = os.path.join(output_dir, file)
-        run_script_successfully(file_path)
+    for script in sorted(out_dir.iterdir()):
+        run_script_successfully(script)
 
-    shutil.rmtree(output_dir)
+    shutil.rmtree(out_dir)