diff --git a/examples/compress/README.md b/examples/compress/README.md index 3bd218aa4..755b6090e 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -13,7 +13,7 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg ## Environment -- Install TensorRT-Model-Optimizer in editable mode with the corresponding dependencies: +- Install Model-Optimizer in editable mode with the corresponding dependencies: ```bash pip install -e .[hf,compress] @@ -94,7 +94,7 @@ pip install -e .[hf,compress] block_29: attention gqa_4 ffn intermediate_14336 block_30: attention gqa_4 ffn intermediate_14336 block_31: attention gqa_4 ffn intermediate_14336 - + [2025-11-02 04:53:11,332]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 75796.4140625, 'stats.ffn_num_params': 5637275648, 'stats.num_kv_heads': 160, 'stats.kv_cache_memory_mib': 61440.0, 'stats.ffn_memory_mib': 10752.25, 'stats.attention_memory_mib': 63040.15625, 'stats.attention_num_params': 838942720, 'stats.num_params': 7526895616, 'stats.has_attention': 20, 'stats.has_ffn': 32} ... ################################################################ diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml index 70b5304c5..133fe0b77 100644 --- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml +++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml @@ -9,7 +9,7 @@ defaults: puzzle_dir: ??? teacher_dir: ${puzzle_dir}/ckpts/teacher/ replacement_library_path: ${puzzle_dir}/replacement_library.json -dataset_path: ??? # path to v0.4_mini +dataset_path: ??? # path to v0.4_mini skip_realize_model: false @@ -21,10 +21,10 @@ calc_subblock_stats: batch_sizes: [64, 96, 128] prefill_seq_len: 4096 generation_seq_len: 4096 - num_active_tokens_override: # Optional override for sequence lengths + num_active_tokens_override: # Optional override for sequence lengths prefill_queue_size: 0 allocate_prefill_query: false - benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking merge_with_existing_stats: false subblock_stats_filename: "subblock_stats.json" moe_stats_filename: "moe_stats.json" @@ -56,8 +56,6 @@ mip: # puzzle_profile: objective: metrics.cosine_embedding_loss_hidden_states bigger_is_better: false - num_solutions: 1 - minimal_diversity: 2 subblock_stats_args: - batch_size: 96 @@ -81,10 +79,7 @@ mip: target_memory: 78_000 mip_constraints: - use_greedy_search: false - is_multi_layer_puzzle: true metric_overrides: - constrain_search_func: max_seconds_per_solution: 60 realize_model: @@ -92,10 +87,10 @@ realize_model: tokenizer_name: ${to_path:${teacher_dir}} replacement_library_path: ${replacement_library_path} save_models: true - solutions_path: # Filled dynamically + solutions_path: # Filled dynamically # Validate params - skip_validation: false # To enable validation of the model solution set `skip_validation` as False + skip_validation: false # To enable validation of the model solution set `skip_validation` as False eval_samples: 128 micro_batch_size: 1 seed: 42 diff --git a/examples/pruning/README.md b/examples/pruning/README.md index bbc0e7bde..9e5188e62 100644 --- a/examples/pruning/README.md +++ b/examples/pruning/README.md @@ -23,7 +23,7 @@ This section focuses on applying Model Optimizer's state-of-the-art complementar -For more advanced pruning strategies, such as the [Puzzle methodology](https://arxiv.org/pdf/2411.19146), please see [Puzzle pruning example](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/feature/compress/examples/compress). +For more advanced pruning strategies, such as the [Puzzle methodology](https://arxiv.org/pdf/2411.19146), please see [Puzzle pruning example](../compress/README.md). ## Pre-Requisites diff --git a/modelopt/torch/_compress/mip/constrain_search_space.py b/modelopt/torch/_compress/mip/constrain_search_space.py deleted file mode 100644 index e30ee2478..000000000 --- a/modelopt/torch/_compress/mip/constrain_search_space.py +++ /dev/null @@ -1,407 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Constrains the search space for the MIP optimization.""" - -import traceback - -from modelopt.torch._compress.decilm.deci_lm_hf_code.block_config import ( - AttentionConfig, - BlockConfig, - FFNConfig, -) -from modelopt.torch._compress.utils.utils import load_json - - -def drop_attentions_only(gathered_metrics, teacher_intermediate_size, teacher_n_heads_in_group): - """ - changes the search space such that puzzle is not allowed to change the ffns - but is only allowed to drop or reduce attention. - - Usage example: - add the following flags to your run_puzzle command: - - --constrain_search_func drop_attentions_only --constrain_search_args {\"teacher_intermediate_size\": 14336, \"teacher_n_heads_in_group\": 16, \"above_layer\": 60} - - """ - - for block_name, block_variants in gathered_metrics.items(): - to_delete = [] # Collect keys to delete after the loop - for variant_config, variant_metrics in block_variants.items(): - block_intermediate_size = variant_config.ffn.intermediate_size - block_attn_n_heads = variant_config.attention.n_heads_in_group - if ( - ( - block_intermediate_size is not None - and block_intermediate_size != teacher_intermediate_size - ) - or variant_config.ffn.replace_with_linear - or variant_config.ffn.no_op ## uncomment this line if you want to drop only attns - or variant_config.attention.replace_with_linear - or ( - block_attn_n_heads is not None - and block_attn_n_heads != teacher_n_heads_in_group - ) - ): - print(f"Marking for deletion: {block_name}-{variant_config}") - to_delete.append(variant_config) - for key in to_delete: - del block_variants[key] - - print("new search space in block 0", gathered_metrics["block_0"]) - return gathered_metrics - - -def reduce_only_ffns( - gathered_metrics, - teacher_intermediate_size: int, - teacher_n_heads_in_group: int, - above_layer: int, - allow_no_ops: bool, -): - """ - only allows to reduce FFNs but not to completely drop them from layer 60 onwards - attention is only allowed to be like uniform teacher - - Usage example: - add the following flags to your run_puzzle command: - constrain_search_args='{"teacher_intermediate_size": 14336, "teacher_n_heads_in_group": 16, "above_layer": 60, "allow_no_ops": false}' - - sbatch puzzle/cli/run_puzzle ... --constrain_search_func reduce_only_ffns --constrain_search_args="$(echo "$constrain_search_args" | jq -c .)" - """ - print(f"{teacher_n_heads_in_group=}") - for block_name, block_variants in gathered_metrics.items(): - to_delete = [] # Collect keys to delete after the loop - block_id = int(block_name.split("_")[1]) - - for variant_config, variant_metrics in block_variants.items(): - block_intermediate_size = variant_config.ffn.intermediate_size - block_attn_n_heads = variant_config.attention.n_heads_in_group - - attn_no_op = variant_config.attention.no_op - attn_linear = variant_config.attention.replace_with_linear - if ( - attn_no_op - or attn_linear - or (block_attn_n_heads != teacher_n_heads_in_group) # keep attention as the teacher - or ( - block_id <= above_layer - and (block_intermediate_size != teacher_intermediate_size) - ) - or ((not allow_no_ops) and variant_config.ffn.no_op) - ): - # print(f"Marking for deletion: {block_name}-{variant_config}") - to_delete.append(variant_config) # Add key to delete list - - for key in to_delete: - del block_variants[key] - - print("new search space in block 0", gathered_metrics["block_0"]) - return gathered_metrics - - -def drop_entire_blocks_only(gathered_metrics): - teacher_block_config = _infer_teacher_config(gathered_metrics) - for block_name, block_variants in gathered_metrics.items(): - to_delete = [] # Collect keys to delete after the loop - for variant_config, variant_metrics in block_variants.items(): - is_no_op_block = ( - variant_config.ffn.no_op - and variant_config.attention.no_op - and getattr(variant_config, "parallel_blocks", None) is None - ) - is_teacher = variant_config == teacher_block_config - if not is_no_op_block and not is_teacher: - to_delete.append(variant_config) - for key in to_delete: - del block_variants[key] - - print("new search space in block 0", gathered_metrics["block_0"]) - return gathered_metrics - - -def css_to_reference_attention(gathered_metrics, attention_pruned_arch): - """ - given a reference architecture we fix the search space to only include options that change the FFNs - but to never change the Attentions from the reference arch's Attentions. - """ - - attention_pruned_arch = load_json(attention_pruned_arch)[0] - attention_dropped_blocks = [ - block_name - for block_name, block_config in attention_pruned_arch["chosen_items"].items() - if block_config["attention"]["no_op"] - ] - - for block_name, block_variants in gathered_metrics.items(): - to_delete = [] # Collect keys to delete after the loop - for variant_config, _ in block_variants.items(): - # Uncomment and adjust this block if needed - # does drop only attention - block_attn_n_heads = variant_config.attention.n_heads_in_group - - reference_arch_attn = attention_pruned_arch["chosen_items"][block_name]["attention"][ - "n_heads_in_group" - ] - if ( # we reduce the search space by keeping the reference arch attention as is - (block_name in attention_dropped_blocks and not variant_config.attention.no_op) - or ( - block_name not in attention_dropped_blocks - and block_attn_n_heads != reference_arch_attn - ) - ): - print(f"Marking for deletion: {block_name}-{variant_config}") - to_delete.append(variant_config) - - # Delete marked keys outside the loop - for key in to_delete: - del block_variants[key] - - print("new search space in block 0", gathered_metrics["block_0"]) - return gathered_metrics - - -def css_to_reference_ffn(gathered_metrics, ffn_pruned_arch, allow_linear_attn=True): - """ - given a reference architecture we fix the search space to only include options that change the Attentions - but to never change the FFNs from the reference arch's FFNs. - """ - - ffn_pruned_arch = load_json(ffn_pruned_arch)[0] - - for block_name, block_variants in gathered_metrics.items(): - to_delete = [] # Collect keys to delete after the loop - for variant_config, _ in block_variants.items(): - block_ffn = variant_config.ffn - is_linear_attn = variant_config.attention.replace_with_linear - - reference_arch_ffn = ffn_pruned_arch["chosen_items"][block_name]["ffn"] - reference_arch_ffn = FFNConfig(**reference_arch_ffn) - - if ( # we reduce the search space by keeping the reference arch ffn as is - (block_ffn != reference_arch_ffn) or (not allow_linear_attn and is_linear_attn) - ): - # print(f"Marking for deletion: {block_name}-{variant_config}") - to_delete.append(variant_config) - - # Delete marked keys outside the loop - for key in to_delete: - del block_variants[key] - - print("new search space in block 0", gathered_metrics["block_0"]) - return gathered_metrics - - -def avoid_variable_gqa( - gathered_metrics, - allow_no_op_attn: bool = True, - allow_linear_attn: bool = False, - target_n_heads_in_group: int = None, -): - """ - Allow only the teacher n_heads_in_group, - and optionally also attention no-op (default allow) - and attention linear (default avoid). - - This reducer affects only the attention layers: FFNs are allowed their entire search space. - """ - is_multi_layer_puzzle = is_replacement_gathered_metrics(gathered_metrics) - if is_multi_layer_puzzle: - teacher_block_config = infer_teacher_replacement_config(gathered_metrics) - else: - teacher_block_config = _infer_teacher_config(gathered_metrics) - - if target_n_heads_in_group is None: - target_n_heads_in_group = teacher_block_config.attention.n_heads_in_group - - if not is_multi_layer_puzzle: - for block_name, block_variants in gathered_metrics.items(): - to_delete = [] # Collect keys to delete after the loop - - for variant_config, variant_metrics in block_variants.items(): - if not ( - (variant_config.attention.n_heads_in_group == target_n_heads_in_group) - or (variant_config.attention.no_op and allow_no_op_attn) - or (variant_config.attention.replace_with_linear and allow_linear_attn) - ): - to_delete.append(variant_config) - - for key in to_delete: - del block_variants[key] - else: - to_delete = [] # Collect keys to delete after the loop - for replacement_id, replacement in gathered_metrics.items(): - variant_config = replacement["block_config"] - if not ( - (variant_config.attention.n_heads_in_group == target_n_heads_in_group) - or (variant_config.attention.no_op and allow_no_op_attn) - or (variant_config.attention.replace_with_linear and allow_linear_attn) - ): - to_delete.append(replacement_id) - - for key in to_delete: - del gathered_metrics[key] - if not is_multi_layer_puzzle: - print("new search space in block 0", gathered_metrics["block_0"]) - else: - parent_layer_idx = 0 - print( - "new search space in block {parent_layer_idx}", - [ - replacement["block_config"] - for replacement_id, replacement in gathered_metrics.items() - if replacement["parent_layer_indices"][0] == parent_layer_idx - ], - ) - return gathered_metrics - - -def reduce_in_range( - gathered_metrics, - layer_start: int, - layer_end: int, -): - """ - Allow only reduction of layers between layer_start and layer_end. Leyers before layers start, and after layer_end are kept as is (the teacher). - - """ - assert layer_start < layer_end, ( - f"Wrong input arguments: {layer_start=} must be less than {layer_end=}" - ) - is_multi_layer_puzzle = is_replacement_gathered_metrics(gathered_metrics) - if is_multi_layer_puzzle: - teacher_block_config = infer_teacher_replacement_config(gathered_metrics) - else: - teacher_block_config = _infer_teacher_config(gathered_metrics) - - to_delete = [] # Collect keys to delete after the loop - for replacement_id, replacement in gathered_metrics.items(): - block_id = max(replacement["parent_layer_indices"]) - variant_config = replacement["block_config"] - is_teacher = variant_config == teacher_block_config - if (block_id < layer_start or block_id > layer_end) and not is_teacher: - to_delete.append(replacement_id) - - for key in to_delete: - del gathered_metrics[key] - - if not is_multi_layer_puzzle: - print("new search space in block 0", gathered_metrics["block_0"]) - else: - parent_layer_idx = 0 - print( - "new search space in block {parent_layer_idx}", - [ - replacement["block_config"] - for replacement_id, replacement in gathered_metrics.items() - if replacement["parent_layer_indices"][0] == parent_layer_idx - ], - ) - return gathered_metrics - - -############################################################################################# - - -# automatically builds a dictionary mapping method names in this module to their functions -# this dictionary is used to dynamically dispatch functions -dispatcher = { - method_name: method_callable - for method_name, method_callable in globals().items() - if callable(method_callable) -} - - -def is_replacement_gathered_metrics(gathered_metrics) -> bool: - # if the gathered metrics is a replacement, then it is a dictionary of the form {'replacement_{id}': replacement_metrics} - - return isinstance(gathered_metrics, dict) and all( - key.startswith("replacement_") for key in gathered_metrics - ) - - -def _infer_teacher_config(gathered_metrics) -> BlockConfig: - n_heads_in_group, intermediate_size = zip( - *[ - (variant_config.attention.n_heads_in_group, variant_config.ffn.intermediate_size) - for block_name, block_variants in gathered_metrics.items() - for variant_config, variant_metrics in block_variants.items() - ] - ) - teacher_n_heads_in_group = min(filter(None, n_heads_in_group)) - teacher_intermediate_size = max(filter(None, intermediate_size)) - - unique_teacher_candidates = set() - for block_name, block_variants in gathered_metrics.items(): - for variant_config, variant_metrics in block_variants.items(): - if ( - variant_config.ffn.intermediate_size == teacher_intermediate_size - and variant_config.attention.n_heads_in_group == teacher_n_heads_in_group - ): - unique_teacher_candidates.add(variant_config) - - assert len(unique_teacher_candidates) == 1, ( - f"Woops, expected example one candidate to be the teacher block config, instead found: {unique_teacher_candidates=}" - ) - - teacher_block_config = unique_teacher_candidates.pop() - return teacher_block_config - - -def infer_teacher_replacement_config(gathered_metrics) -> BlockConfig: - n_heads_in_group, intermediate_size = zip( - *[ - ( - replacement["block_config"].attention.n_heads_in_group, - replacement["block_config"].ffn.intermediate_size, - ) - for replacement_id, replacement in gathered_metrics.items() - ] - ) - teacher_intermediate_size = max(filter(None, intermediate_size)) - teacher_n_heads_in_group = min(filter(None, n_heads_in_group)) - unique_teacher_candidates = set() - for replacement_id, replacement in gathered_metrics.items(): - if ( - replacement["block_config"].ffn.intermediate_size == teacher_intermediate_size - and replacement["block_config"].attention.n_heads_in_group == teacher_n_heads_in_group - ): - unique_teacher_candidates.add(replacement["block_config"]) - - assert len(unique_teacher_candidates) == 1, ( - f"Woops, expected example one candidate to be the teacher block config, instead found: {unique_teacher_candidates=}" - ) - - teacher_replacement_config = unique_teacher_candidates.pop() - return teacher_replacement_config - - -def apply(css_func_name, gathered_metrics, method_kwargs): - search_space_reducer = dispatcher.get(css_func_name) - if search_space_reducer is None: - raise ValueError( - f"could not find a function called `{css_func_name}` in {__name__}.py to reduce search space " - ) - - try: - gathered_metrics = search_space_reducer(gathered_metrics, **method_kwargs) - except Exception as e: - traceback.print_exc() - raise ValueError( - f"something went wrong when trying to apply the following search space reducer `{css_func_name}` \ - with the folloing args: {method_kwargs}, here's the exception: {e}" - ) - - return gathered_metrics diff --git a/modelopt/torch/_compress/mip/greedy_search_with_multi_layer_replacements.py b/modelopt/torch/_compress/mip/greedy_search_with_multi_layer_replacements.py deleted file mode 100644 index 719643cc2..000000000 --- a/modelopt/torch/_compress/mip/greedy_search_with_multi_layer_replacements.py +++ /dev/null @@ -1,180 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Performs greedy search to find optimal multi-layer replacements under resource constraints.""" - -# mypy: ignore-errors -import math -from copy import deepcopy -from random import random -from typing import Any, Hashable, TypeAlias - -from .utils import InfeasibleError, consecutive_ngrams, get_nested_key, sort_replacements - -ReplacementID: TypeAlias = Hashable -Replacement: TypeAlias = dict[str, Any] -ChosenReplacements: TypeAlias = list[Replacement] - - -def run_greedy_search( - teacher_replacements: list[Replacement], - student_replacements: list[Replacement], - objective: str, - constraints: dict[str, float], - bigger_is_better: bool, -) -> tuple[ChosenReplacements, float, dict[str, float]]: - print("####### running greedy search #######") - teacher_replacements = deepcopy(teacher_replacements) - student_replacements = deepcopy(student_replacements) - chosen_replacements: ChosenReplacements = [] - - teacher_replacements = { - replacement["parent_layer_indices"][0]: replacement for replacement in teacher_replacements - } - - all_parent_layers = set(teacher_replacements.keys()) - uncovered_parent_layers = set(all_parent_layers) - - while True: - if len(student_replacements) == 0: - raise InfeasibleError() - - choice_func = max if bigger_is_better else min - best_replacement = choice_func( - student_replacements, key=lambda replacement: get_nested_key(replacement, objective) - ) - chosen_replacements.append(best_replacement) - uncovered_parent_layers -= set(best_replacement["parent_layer_indices"]) - student_replacements = _filter_overlapping_replacements( - student_replacements, uncovered_parent_layers - ) - - padded_chosen_replacements = list(chosen_replacements) - for uncovered_block_idx in uncovered_parent_layers: - padded_chosen_replacements.append(teacher_replacements[uncovered_block_idx]) - - all_constraints_satisfied = True - for constraint_key, max_cost in constraints.items(): - total_cost = sum( - get_nested_key(replacement, constraint_key) - for replacement in padded_chosen_replacements - ) - is_constraint_satisfied = total_cost < max_cost or math.isclose( - total_cost, max_cost, rel_tol=1e-9 - ) - if not is_constraint_satisfied: - all_constraints_satisfied = False - - if all_constraints_satisfied: - chosen_replacements = padded_chosen_replacements - break - - # Trust But Verify: calculate total value and costs, and check that all the constraints are filled - total_value = 0.0 - total_costs = {constraint_key: 0 for constraint_key in constraints.keys()} - chosen_layers = set() - for replacement in chosen_replacements: - total_value += get_nested_key(replacement, objective) - for constraint_key in constraints.keys(): - total_costs[constraint_key] += get_nested_key(replacement, constraint_key) - for parent_layer_idx in replacement["parent_layer_indices"]: - assert parent_layer_idx not in chosen_layers, ( - f"Found duplicate chosen layer {parent_layer_idx}" - ) - chosen_layers.add(parent_layer_idx) - - missing_layers = all_parent_layers - set(chosen_layers) - assert len(missing_layers) == 0, ( - f"The following layers were not chosen by any replacement:\n{missing_layers=}\n{chosen_replacements}" - ) - - for constraint_key, max_cost in constraints.items(): - assert total_costs[constraint_key] < max_cost or math.isclose( - total_costs[constraint_key], max_cost, rel_tol=1e-9 - ), ( - f"this constraint was violated {constraint_key} in the solution, sol val={total_costs[constraint_key]} <= {max_cost=}" - ) - - chosen_replacements = sort_replacements(chosen_replacements) - for cr in chosen_replacements: - if "block_config" in cr: - cr["child_block_configs"] = cr["block_config"] - - return [ - { - "chosen_replacements": chosen_replacements, - "total_value": total_value, - "total_costs": total_costs, - } - ] - - -def _filter_overlapping_replacements( - replacements: list[Replacement], - uncovered_parent_layers: set[int], -) -> list[Replacement]: - return [ - replacement - for replacement in replacements - if set(replacement["parent_layer_indices"]).issubset(uncovered_parent_layers) - ] - - -def usage_example(): - num_layers = 32 - num_options_per_parent_replacement = 5 - - teacher_replacements = [] - student_replacements = [] - for num_layers_in_replacement in (1, 2, 3): - for i_option in range(num_options_per_parent_replacement): - for parent_layer_indices in consecutive_ngrams(num_layers, num_layers_in_replacement): - is_teacher = num_layers_in_replacement == 1 and i_option == 0 - replacement_id = f"parent layers {parent_layer_indices} child config {i_option}" - replacement = { - "parent_layer_indices": parent_layer_indices, - "metrics": {"loss": random() if not is_teacher else 0.0}, - "stats": {"cost": 1}, - "replacement_id": replacement_id, - } - if is_teacher: - teacher_replacements.append(replacement) - else: - student_replacements.append(replacement) - - constraints = {"stats.cost": num_layers - 8} - (result,) = run_greedy_search( - teacher_replacements, - student_replacements, - objective="metrics.loss", - constraints=constraints, - bigger_is_better=False, - ) - chosen_replacements = result["chosen_replacements"] - total_value = result["total_value"] - total_costs = result["total_costs"] - - print() - print() - print(f"{total_value=}") - print(f"{total_costs=}") - print(f"{constraints=}") - print("chosen_replacements=") - print(chosen_replacements) - print("\n".join([rep["replacement_id"] for rep in chosen_replacements])) - - -if __name__ == "__main__": - usage_example() diff --git a/modelopt/torch/_compress/mip/grouped_knapsack.py b/modelopt/torch/_compress/mip/grouped_knapsack.py deleted file mode 100644 index 5769ded3c..000000000 --- a/modelopt/torch/_compress/mip/grouped_knapsack.py +++ /dev/null @@ -1,231 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Solves the grouped knapsack problem using Mixed Integer Programming to find optimal item selections.""" - -# mypy: ignore-errors -import math -import warnings -from copy import deepcopy -from random import random -from typing import Any, Hashable, Iterable, Optional, TypeAlias, Union - -from mip import BINARY, Model, maximize, minimize, xsum -from tqdm import tqdm - -from .utils import InfeasibleError, get_nested_key - -Item: TypeAlias = dict[str, float | dict[str, float]] -Group: TypeAlias = dict[Hashable, Item] -ChosenItems: TypeAlias = dict[Hashable, Hashable] - - -def multi_solution_grouped_knapsack( - groups: dict[Hashable, Group], - objective: str, - constraints: dict[str, float], - bigger_is_better: bool, - num_solutions: int, - minimal_diversity: int = 1, - max_seconds_per_solution: Optional[float] = None, -) -> list[dict[str, Union[ChosenItems, float]]]: - solutions = [] - previous_choices = [] - for i_run in tqdm(range(num_solutions), desc="multi_solution_grouped_knapsack"): - try: - chosen_items, total_value, total_costs = grouped_knapsack( - groups, - objective, - constraints, - bigger_is_better, - previous_choices, - minimal_diversity, - max_seconds_per_solution, - ) - except InfeasibleError: - warnings.warn(f"Found only {i_run} feasible solutions (requested {num_solutions})") - break - previous_choices.append(chosen_items) - solutions.append( - {"chosen_items": chosen_items, "total_value": total_value, "total_costs": total_costs} - ) - return solutions - - -def grouped_knapsack( - groups: dict[Hashable, Group], - objective: str, - constraints: dict[str, float | tuple[float, float]], - bigger_is_better: bool, - previous_choices: Optional[list[ChosenItems]] = None, - minimal_diversity: int = 1, - max_seconds_per_solution: Optional[float] = None, -) -> tuple[ChosenItems, float, dict[str, float]]: - groups = deepcopy(groups) - mip_model = Model() - - objective_vars = [] - constraint_vars = {constraint_key: [] for constraint_key in constraints.keys()} - for group_name, group_items in groups.items(): - group_vars = [] - for item_name, item in group_items.items(): - is_chosen = mip_model.add_var(var_type=BINARY) - item["is_chosen"] = is_chosen - group_vars.append(is_chosen) - objective_vars.append(is_chosen * get_nested_objective(item, objective)) - for constraint_key in constraints.keys(): - constraint_vars[constraint_key].append( - is_chosen * get_nested_key(item, constraint_key) - ) - - mip_model += xsum(group_vars) == 1 - - for constraint_key, max_cost in constraints.items(): - min_cost = None - if isinstance(max_cost, Iterable): - min_cost, max_cost = max_cost - - if max_cost is not None: - mip_model += xsum(constraint_vars[constraint_key]) <= max_cost - if min_cost is not None: - mip_model += xsum(constraint_vars[constraint_key]) >= min_cost - - if previous_choices is not None: - for previous_chosen_items in previous_choices: - corresponding_vars = [ - groups[group_name][item_name]["is_chosen"] - for group_name, item_name in previous_chosen_items.items() - ] - mip_model += xsum(corresponding_vars) <= len(groups) - minimal_diversity - - mip_model.objective = ( - maximize(xsum(objective_vars)) if bigger_is_better else minimize(xsum(objective_vars)) - ) - - if max_seconds_per_solution is not None: - mip_model.max_seconds = max_seconds_per_solution - - mip_model.optimize() - - if is_chosen.x is None: - raise InfeasibleError() - - total_value = 0.0 - total_costs = {constraint_key: 0 for constraint_key in constraints.keys()} - chosen_items: ChosenItems = dict() - for group_name, group_items in groups.items(): - for item_name, item in group_items.items(): - is_chosen = item["is_chosen"].x >= 0.99 - if is_chosen: - assert group_name not in chosen_items - chosen_items[group_name] = item_name - total_value += get_nested_objective(item, objective) - for constraint_key in constraints.keys(): - total_costs[constraint_key] += get_nested_key(item, constraint_key) - - if len(chosen_items) != len(groups): - in_groups_and_not_in_chosen_items = set(groups.keys()) - set(chosen_items.keys()) - in_chosen_items_and_not_in_groups = set(chosen_items.keys()) - set(groups.keys()) - missing_groups = [groups[key] for key in in_groups_and_not_in_chosen_items] - raise RuntimeError(f""" - Different number of 'chosen_items' and 'groups': {len(chosen_items)=} {len(groups)=} - {in_groups_and_not_in_chosen_items=} - {in_chosen_items_and_not_in_groups=} - {missing_groups=} - """) - - for constraint_key, max_cost in constraints.items(): - min_cost = None - if isinstance(max_cost, Iterable): - min_cost, max_cost = max_cost - - if max_cost is not None: - assert total_costs[constraint_key] < max_cost or math.isclose( - total_costs[constraint_key], max_cost, rel_tol=1e-9 - ), ( - f"This max_cost was violated {constraint_key} in the solution, sol val={total_costs[constraint_key]} > {max_cost=}" - ) - if min_cost is not None: - assert total_costs[constraint_key] > min_cost or math.isclose( - total_costs[constraint_key], min_cost, rel_tol=1e-9 - ), ( - f"This min_cost was violated {constraint_key} in the solution, sol val={total_costs[constraint_key]} < {min_cost=}" - ) - - for previous_chosen_items in previous_choices: - num_differences = 0 - for group_name in groups.keys(): - num_differences += previous_chosen_items[group_name] != chosen_items[group_name] - assert num_differences >= minimal_diversity - - return chosen_items, total_value, total_costs - - -def get_nested_objective(dictionary: dict[str, Any], nested_key: str) -> Any: - if nested_key.startswith("metrics."): - # handle metrics that have '.' in their name - metric = nested_key.split("metrics.")[1] - return dictionary["metrics"][metric] - else: - return get_nested_key(dictionary, nested_key) - - -def usage_example(): - num_layers = 32 - num_configs_per_block = 100 - groups = { - f"layer_{i_layer}": { - f"config_{i_config}": { - "metrics": {"accuracy": random()}, - "stats": {"memory_mib": random() * 100, "runtime_ms": random() * 10}, - } - for i_config in range(num_configs_per_block) - } - for i_layer in range(num_layers) - } - - minimal_diversity = 10 - constraints = {"stats.memory_mib": num_layers * 50.0, "stats.runtime_ms": num_layers * 5.0} - solutions = multi_solution_grouped_knapsack( - groups, - objective="metrics.accuracy", - constraints=constraints, - bigger_is_better=True, - num_solutions=10, - minimal_diversity=minimal_diversity, - ) - - print() - print(constraints) - - for i_run, solution in enumerate(solutions): - print() - print(f"run {i_run}") - print(solution) - - print(f"Checking differences, should be at least {minimal_diversity}:") - for a in range(len(solutions)): - for b in range(a + 1, len(solutions)): - num_differences = 0 - for group_name in groups.keys(): - num_differences += ( - solutions[a]["chosen_items"][group_name] - != solutions[b]["chosen_items"][group_name] - ) - print(a, "<>", b, "=", num_differences) - - -if __name__ == "__main__": - usage_example() diff --git a/modelopt/torch/_compress/mip/mip_and_realize_models.py b/modelopt/torch/_compress/mip/mip_and_realize_models.py index 83d8b23f5..f6d77d262 100644 --- a/modelopt/torch/_compress/mip/mip_and_realize_models.py +++ b/modelopt/torch/_compress/mip/mip_and_realize_models.py @@ -44,12 +44,19 @@ def launch_realize_model(cfg: DictConfig, runtime: IRuntime): def launch_mip_and_realize_model(cfg: DictConfig, runtime: IRuntime): + # Determine device for distributed operations (NCCL requires CUDA tensors) + device = "cpu" + if runtime.world_size > 1 and dist.is_initialized(): + backend = dist.get_backend() + if backend == "nccl": + device = torch.cuda.current_device() + if runtime.is_main_process: solution_paths = launch_mip(cfg) - length_tensor = torch.tensor([len(solution_paths)], dtype=torch.long) + length_tensor = torch.tensor([len(solution_paths)], dtype=torch.long, device=device) else: solution_paths = None - length_tensor = torch.tensor([0], dtype=torch.long) + length_tensor = torch.tensor([0], dtype=torch.long, device=device) if not cfg.skip_realize_model: if runtime.world_size > 1: @@ -75,7 +82,7 @@ def main(cfg: DictConfig) -> None: cfg = hydra.utils.instantiate(cfg) _runtime = ( - NativeDDP_Runtime( + NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=getattr(cfg, "nccl_timeout_minutes") ) if is_distributed() diff --git a/modelopt/torch/_compress/mip/mip_with_multi_layer_replacements.py b/modelopt/torch/_compress/mip/mip_with_multi_layer_replacements.py index 50525c846..438db3312 100644 --- a/modelopt/torch/_compress/mip/mip_with_multi_layer_replacements.py +++ b/modelopt/torch/_compress/mip/mip_with_multi_layer_replacements.py @@ -25,7 +25,12 @@ from mip import BINARY, Model, maximize, minimize, xsum -from .utils import InfeasibleError, consecutive_ngrams, get_nested_key, sort_replacements +from modelopt.torch._compress.mip.utils import ( + InfeasibleError, + consecutive_ngrams, + get_nested_key, + sort_replacements, +) ReplacementID: TypeAlias = Hashable Replacement: TypeAlias = dict[str, Any] diff --git a/modelopt/torch/_compress/mip/run_puzzle.py b/modelopt/torch/_compress/mip/run_puzzle.py index fd883e969..5773349c1 100644 --- a/modelopt/torch/_compress/mip/run_puzzle.py +++ b/modelopt/torch/_compress/mip/run_puzzle.py @@ -28,15 +28,11 @@ import yaml from omegaconf import DictConfig, ListConfig, OmegaConf -import modelopt.torch._compress.mip.constrain_search_space as css from modelopt.torch._compress.decilm.deci_lm_hf_code.block_config import ( AttentionConfig, BlockConfig, FFNConfig, ) -from modelopt.torch._compress.mip.greedy_search_with_multi_layer_replacements import ( - run_greedy_search, -) from modelopt.torch._compress.mip.mip_with_multi_layer_replacements import ( run_mip as run_multi_layer_replacement_mip, ) @@ -211,8 +207,6 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--human_constraints", type=parse_json) parser.add_argument("--report_additional_costs", type=str, action="append", default=[]) - parser.add_argument("--num_solutions", type=int) - parser.add_argument("--minimal_diversity", type=int) parser.add_argument( "--output_path", type=parse_path, @@ -227,21 +221,6 @@ def parse_args() -> argparse.Namespace: help="Set this if using accuracy objective, don't set if using loss objective", ) - parser.add_argument("--constrain_search_func", type=str, default=None) - parser.add_argument("--constrain_search_args", type=parse_json, default=dict()) - - parser.add_argument( - "--is_multi_layer_puzzle", - action="store_true", - default=True, - help="[DEPRECATED] This flag is now always True. Kept for backward compatibility.", - ) - parser.add_argument( - "--use_greedy_search", - action="store_true", - help="Use greedy search instead of mip. Only supported for multi-layer puzzle.", - ) - args = parser.parse_args() return args @@ -254,17 +233,14 @@ def run_single_puzzle_config( constraints: PuzzleConstraints, output_folder, ) -> None: - from modelopt.torch._compress.mip.grouped_knapsack import multi_solution_grouped_knapsack - - args = deepcopy( - args - ) # we override the constraints and subblock_stats_args for this run to keep reporting out the same way. + # we override the constraints and subblock_stats_args for this run to keep reporting out the same way. + args = deepcopy(args) subblock_stats = filter_subblock_stats_by_args(subblock_stats, subblock_stats_args) _add_block_stats_to_gathered_metrics(gathered_metrics, subblock_stats) output_folder.mkdir(parents=True, exist_ok=True) - _dump_gathered_metrics(gathered_metrics, output_folder, args.is_multi_layer_puzzle) + _dump_gathered_metrics(gathered_metrics, output_folder) non_block_stats = {"stats": _get_block_stats(subblock_stats, "non_block")} batch_size = subblock_stats["args"]["batch_size"] @@ -304,40 +280,13 @@ def run_single_puzzle_config( mprint(f"After non-block adjustments: {mip_constraints=}") - if args.is_multi_layer_puzzle: - if not args.use_greedy_search: - solutions = run_multi_layer_replacement_mip( - replacements=gathered_metrics, - objective=args.objective, - constraints=mip_constraints, - bigger_is_better=args.bigger_is_better, - max_seconds_per_solution=args.max_seconds_per_solution, - ) - else: - teacher_replacements, student_replacements = [], [] - for replacement in gathered_metrics.values(): - if replacement["is_teacher"]: - teacher_replacements.append(replacement) - else: - student_replacements.append(replacement) - - solutions = run_greedy_search( - teacher_replacements=teacher_replacements, - student_replacements=student_replacements, - objective=args.objective, - constraints=mip_constraints, - bigger_is_better=args.bigger_is_better, - ) - else: - solutions = multi_solution_grouped_knapsack( - groups=gathered_metrics, - objective=args.objective, - constraints=mip_constraints, - bigger_is_better=args.bigger_is_better, - num_solutions=args.num_solutions, - minimal_diversity=args.minimal_diversity, - max_seconds_per_solution=args.max_seconds_per_solution, - ) + solutions = run_multi_layer_replacement_mip( + replacements=gathered_metrics, + objective=args.objective, + constraints=mip_constraints, + bigger_is_better=args.bigger_is_better, + max_seconds_per_solution=args.max_seconds_per_solution, + ) for solution in solutions: for stat_name in set([*orig_mip_constraints.keys(), *args.report_additional_costs]): @@ -379,25 +328,10 @@ def run_single_puzzle_config( return solutions_file -def _dump_gathered_metrics( - gathered_metrics: PuzzleMetrics, output_folder: Path, is_multi_layer_puzzle: bool = False -) -> None: - if is_multi_layer_puzzle: - for replacement_id, replacement_info in gathered_metrics.items(): - replacement_info["block_repr"] = block_config_to_str(replacement_info["block_config"]) - gathered_metrics_for_dump = gathered_metrics - else: - gathered_metrics_for_dump = { - block_name: { - block_config_to_str(variant_config).strip(): { - **variant_metrics, - "block_config": variant_config, - "block_repr": block_config_to_str(variant_config).strip(), - } - for variant_config, variant_metrics in block_variants.items() - } - for block_name, block_variants in gathered_metrics.items() - } +def _dump_gathered_metrics(gathered_metrics: PuzzleMetrics, output_folder: Path) -> None: + for replacement_id, replacement_info in gathered_metrics.items(): + replacement_info["block_repr"] = block_config_to_str(replacement_info["block_config"]) + gathered_metrics_for_dump = gathered_metrics json_dump(gathered_metrics_for_dump, output_folder / "replacement_metrics_and_stats.json") @@ -451,17 +385,12 @@ def _override_args_from_profile(args, puzzle_profile): if arg_name in puzzle_profile: if arg_name not in ("mip_constraints", "human_constraints", "subblock_stats_args"): setattr(args, arg_name, puzzle_profile[arg_name]) - if isinstance(args.constrain_search_args, str): - args.constrain_search_args = parse_json(args.constrain_search_args) - assert args.is_multi_layer_puzzle, "multi-layer puzzle is now the only supported mode." def _assert_valid_config(args, puzzle_profile): required_args = ( "subblock_stats_path", "objective", - "num_solutions", - "minimal_diversity", "output_path", ) missing_args = [arg for arg in required_args if arg not in args or getattr(args, arg) is None] @@ -488,11 +417,6 @@ def _assert_valid_config(args, puzzle_profile): ) exit(1) - if args.use_greedy_search: - assert args.is_multi_layer_puzzle, ( - "--use_greedy_search is only supported for multi layer puzzle" - ) - def _get_minimal_unique_names(dicts: List[dict]) -> List[str]: all_keys = set(k for d in dicts for k in d.keys()) @@ -517,23 +441,13 @@ def run_puzzle(args: argparse.Namespace) -> List[str]: if args.gathered_metrics_path is not None: gathered_metrics = json.loads(args.gathered_metrics_path.read_text()) else: - gather_func = ( - gather_puzzle_metrics - if not args.is_multi_layer_puzzle - else gather_multi_layer_puzle_metrics + gathered_metrics = gather_multi_layer_puzle_metrics( + args.single_block_replacement_validation_dir ) - gathered_metrics = gather_func(args.single_block_replacement_validation_dir) if args.metric_overrides is not None: gathered_metrics = {**gathered_metrics, **args.metric_overrides} - if args.constrain_search_func is not None: - mprint(f"{args.constrain_search_args=}") - # assert not args.is_multi_layer_puzzle, "conditional search is not implementd yet for multi-layer puzzles, did you implement it?" - gathered_metrics = css.apply( - args.constrain_search_func, gathered_metrics, args.constrain_search_args - ) - subblock_stats = json.loads(args.subblock_stats_path.read_text()) all_subblock_args = _load_all_subblock_stats_args(args, puzzle_profile) diff --git a/modelopt/torch/_compress/sewing_kit/utils.py b/modelopt/torch/_compress/sewing_kit/utils.py index 16fe1b3fd..ff47c289b 100644 --- a/modelopt/torch/_compress/sewing_kit/utils.py +++ b/modelopt/torch/_compress/sewing_kit/utils.py @@ -447,13 +447,33 @@ def get_parent_module_names(module_name: str): return parent_module_names +def _get_device_for_distributed( + group: Optional[torch.distributed.ProcessGroup] = None, +) -> str: + """ + Determine the appropriate device for distributed communication based on the backend. + NCCL backend requires CUDA tensors, while Gloo supports both CPU and CUDA. + """ + if not torch.distributed.is_initialized(): + return "cpu" + + backend = torch.distributed.get_backend(group) + if backend == "nccl": + # NCCL requires CUDA tensors + return torch.cuda.current_device() + else: + # Gloo and other backends support CPU tensors + return "cpu" + + def distributed_isend_obj( obj: Any, dst: int = 0, group: Optional[torch.distributed.ProcessGroup] = None, ) -> list[Optional[torch.distributed.Work]]: + device = _get_device_for_distributed(group) obj_tensor, obj_size_tensor = torch.distributed.distributed_c10d._object_to_tensor( - obj, device="cpu", **_get_group_kwarg_if_necessary() + obj, device=device, **_get_group_kwarg_if_necessary() ) works: list[Optional[torch.distributed.Work]] = [ torch.distributed.isend(obj_size_tensor, dst, group), @@ -484,11 +504,12 @@ def distributed_recv_obj( src: Optional[int] = None, group: Optional[torch.distributed.ProcessGroup] = None, ) -> Any: - obj_size_tensor = torch.LongTensor(1, device="cpu") + device = _get_device_for_distributed(group) + obj_size_tensor = torch.LongTensor(1).to(device) torch.distributed.recv(obj_size_tensor, src=src, group=group) obj_size = int(obj_size_tensor.item()) - obj_tensor = torch.ByteTensor(obj_size, device="cpu") + obj_tensor = torch.ByteTensor(obj_size).to(device) torch.distributed.recv(obj_tensor, src=src, group=group) obj = torch.distributed.distributed_c10d._tensor_to_object( diff --git a/setup.py b/setup.py index d4077f709..20a271fe1 100644 --- a/setup.py +++ b/setup.py @@ -105,13 +105,13 @@ "compress": [ "fire", "hydra-core==1.3.2", - "omegaconf==2.3.0", - "wandb~=0.17.5", - "lru-dict", - "typeguard", - "pandas", "immutabledict", + "lru-dict", "mip", + "omegaconf==2.3.0", + "pandas", + "typeguard", + "wandb~=0.17.5", ], } diff --git a/tests/gpu/torch/_compress/compress_test_utils.py b/tests/gpu/torch/_compress/compress_test_utils.py index a1102e7fa..9df5f5bfc 100644 --- a/tests/gpu/torch/_compress/compress_test_utils.py +++ b/tests/gpu/torch/_compress/compress_test_utils.py @@ -29,11 +29,7 @@ def setup_test_model_and_data( tmp_path: Path, rank: int, runtime, -) -> tuple[ - Path, - Path, - Path, -]: +) -> tuple[Path, Path, Path]: """ Setup the test model and data for the compress NAS search. @@ -132,7 +128,7 @@ def setup_puzzle_dir(puzzle_dir: str): Path(puzzle_dir).mkdir(parents=True, exist_ok=True) -def save_dummy_dataset(dataset_path: str): +def save_dummy_dataset(dataset_path: Path | str): """ Save a dummy dataset for testing purposes. """ @@ -170,4 +166,4 @@ def save_dummy_dataset(dataset_path: str): # For train-val splits data_dict = DatasetDict({"train": Dataset.from_list(data), "valid": Dataset.from_list(data)}) - data_dict.save_to_disk(dataset_path) + data_dict.save_to_disk(str(dataset_path)) diff --git a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-attn-pruning.yaml b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-attn-pruning.yaml index 21a3486f0..473a5d418 100644 --- a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-attn-pruning.yaml +++ b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-attn-pruning.yaml @@ -9,7 +9,7 @@ defaults: puzzle_dir: ??? teacher_dir: ${puzzle_dir}/ckpts/teacher/ replacement_library_path: ${puzzle_dir}/replacement_library.json -dataset_path: ??? # path to v0.4_mini +dataset_path: ??? # path to v0.4_mini skip_realize_model: false @@ -21,10 +21,10 @@ calc_subblock_stats: batch_sizes: [64, 96, 128] prefill_seq_len: 4096 generation_seq_len: 4096 - num_active_tokens_override: # Optional override for sequence lengths + num_active_tokens_override: # Optional override for sequence lengths prefill_queue_size: 0 allocate_prefill_query: false - benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking merge_with_existing_stats: false subblock_stats_filename: "subblock_stats.json" moe_stats_filename: "moe_stats.json" @@ -54,8 +54,6 @@ mip: # puzzle_profile: objective: metrics.cosine_embedding_loss_hidden_states bigger_is_better: false - num_solutions: 1 - minimal_diversity: 2 subblock_stats_args: - batch_size: 96 @@ -79,10 +77,7 @@ mip: target_memory: 780_000 # 78_000 mip_constraints: - use_greedy_search: false - is_multi_layer_puzzle: true metric_overrides: - constrain_search_func: max_seconds_per_solution: 60 realize_model: @@ -90,10 +85,10 @@ realize_model: tokenizer_name: ${to_path:${teacher_dir}} replacement_library_path: ${replacement_library_path} save_models: true - solutions_path: # Filled dynamically + solutions_path: # Filled dynamically # Validate params - skip_validation: false # To enable validation of the model solution set `skip_validation` as False + skip_validation: false # To enable validation of the model solution set `skip_validation` as False eval_samples: 2 micro_batch_size: 1 dataset_path: ${dataset_path}/valid diff --git a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-ffn-pruning.yaml b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-ffn-pruning.yaml index 1d8fac655..8af352660 100644 --- a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-ffn-pruning.yaml +++ b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-ffn-pruning.yaml @@ -9,7 +9,7 @@ defaults: puzzle_dir: ??? teacher_dir: ${puzzle_dir}/ckpts/teacher/ replacement_library_path: ${puzzle_dir}/replacement_library.json -dataset_path: ??? # path to v0.4_mini +dataset_path: ??? # path to v0.4_mini skip_realize_model: false @@ -21,10 +21,10 @@ calc_subblock_stats: batch_sizes: [64, 96, 128] prefill_seq_len: 4096 generation_seq_len: 4096 - num_active_tokens_override: # Optional override for sequence lengths + num_active_tokens_override: # Optional override for sequence lengths prefill_queue_size: 0 allocate_prefill_query: false - benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking merge_with_existing_stats: false subblock_stats_filename: "subblock_stats.json" moe_stats_filename: "moe_stats.json" @@ -54,8 +54,6 @@ mip: # puzzle_profile: objective: metrics.cosine_embedding_loss_hidden_states bigger_is_better: false - num_solutions: 1 - minimal_diversity: 2 subblock_stats_args: - batch_size: 96 @@ -79,10 +77,7 @@ mip: target_memory: 780_000 # 78_000 mip_constraints: - use_greedy_search: false - is_multi_layer_puzzle: true metric_overrides: - constrain_search_func: max_seconds_per_solution: 60 realize_model: @@ -90,10 +85,10 @@ realize_model: tokenizer_name: ${to_path:${teacher_dir}} replacement_library_path: ${replacement_library_path} save_models: true - solutions_path: # Filled dynamically + solutions_path: # Filled dynamically # Validate params - skip_validation: false # To enable validation of the model solution set `skip_validation` as False + skip_validation: false # To enable validation of the model solution set `skip_validation` as False eval_samples: 2 micro_batch_size: 1 dataset_path: ${dataset_path}/valid diff --git a/tests/gpu/torch/_compress/test_compress.py b/tests/gpu/torch/_compress/test_compress.py index b00be2485..e40756602 100644 --- a/tests/gpu/torch/_compress/test_compress.py +++ b/tests/gpu/torch/_compress/test_compress.py @@ -33,20 +33,6 @@ # # Note: Bypass is disabled now in the test. -# How to run this test (currently only supported internally at Nvidia). -# -# Have both modelopt and puzzle source code in the same directory: -# /workspace/modelopt -# /workspace/puzzletron -# -# submit_job --partition interactive --time 0 \ -# --image gitlab-master.nvidia.com/deci/puzzletron:modelopt_main \ -# --workdir $MODELOPT SRC DIRECTORY --interactive --gpu 1 -# -# export PYTHONPATH=$PYTHONPATH:.:/workspace/puzzletron/v1 -# -# pytest -s -v ./tests/gpu/torch/_compress/test_compress.py::test_compress -o addopts="" - def test_compress(project_root_path: Path, tmp_path: Path): spawn_multiprocess_job(