NVIDIA · kevalmorabia97 · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025
@@ -13,7 +13,7 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg
 
 ## Environment
 
-- Install TensorRT-Model-Optimizer in editable mode with the corresponding dependencies:
+- Install Model-Optimizer in editable mode with the corresponding dependencies:
 
 ```bash
 pip install -e .[hf,compress]
@@ -94,7 +94,7 @@ pip install -e .[hf,compress]
    block_29:  attention  gqa_4   ffn  intermediate_14336
    block_30:  attention  gqa_4   ffn  intermediate_14336
    block_31:  attention  gqa_4   ffn  intermediate_14336
-   
+
    [2025-11-02 04:53:11,332][rank-0][run_puzzle.py:295] Total costs: {'stats.memory_mib': 75796.4140625, 'stats.ffn_num_params': 5637275648, 'stats.num_kv_heads': 160, 'stats.kv_cache_memory_mib': 61440.0, 'stats.ffn_memory_mib': 10752.25, 'stats.attention_memory_mib': 63040.15625, 'stats.attention_num_params': 838942720, 'stats.num_params': 7526895616, 'stats.has_attention': 20, 'stats.has_ffn': 32}
    ...
    ################################################################

@@ -9,7 +9,7 @@ defaults:
 puzzle_dir: ???
 teacher_dir: ${puzzle_dir}/ckpts/teacher/
 replacement_library_path: ${puzzle_dir}/replacement_library.json
-dataset_path: ???     # path to v0.4_mini
+dataset_path: ??? # path to v0.4_mini
 
 skip_realize_model: false
 
@@ -21,10 +21,10 @@ calc_subblock_stats:
   batch_sizes: [64, 96, 128]
   prefill_seq_len: 4096
   generation_seq_len: 4096
-  num_active_tokens_override:       # Optional override for sequence lengths
+  num_active_tokens_override: # Optional override for sequence lengths
   prefill_queue_size: 0
   allocate_prefill_query: false
-  benchmark_iterations:       # Set to a number (e.g., 1000) to enable runtime benchmarking
+  benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking
   merge_with_existing_stats: false
   subblock_stats_filename: "subblock_stats.json"
   moe_stats_filename: "moe_stats.json"
@@ -56,8 +56,6 @@ mip:
   # puzzle_profile:
   objective: metrics.cosine_embedding_loss_hidden_states
   bigger_is_better: false
-  num_solutions: 1
-  minimal_diversity: 2
 
   subblock_stats_args:
     - batch_size: 96
@@ -81,21 +79,18 @@ mip:
     target_memory: 78_000
 
   mip_constraints:
-  use_greedy_search: false
-  is_multi_layer_puzzle: true
   metric_overrides:
-  constrain_search_func:
   max_seconds_per_solution: 60
 
 realize_model:
   teacher_dir: ${to_path:${teacher_dir}}
   tokenizer_name: ${to_path:${teacher_dir}}
   replacement_library_path: ${replacement_library_path}
   save_models: true
-  solutions_path:     # Filled dynamically
+  solutions_path: # Filled dynamically
 
   # Validate params
-  skip_validation: false    # Set `skip_validation` to false to enable validation of the model solution
+  skip_validation: false # Set `skip_validation` to false to enable validation of the model solution
   eval_samples: 128
   micro_batch_size: 1
   seed: 42

@@ -23,7 +23,7 @@ This section focuses on applying Model Optimizer's state-of-the-art complementar
 
 </div>
 
-For more advanced pruning strategies, such as the [Puzzle methodology](https://arxiv.org/pdf/2411.19146), please see [Puzzle pruning example](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/feature/compress/examples/compress).
+For more advanced pruning strategies, such as the [Puzzle methodology](https://arxiv.org/pdf/2411.19146), please see [Puzzle pruning example](../compress/README.md).
 
 ## Pre-Requisites
-Original file line number
+Diff line change
@@ Expand Up @@
     </div>
-    For more advanced pruning strategies, such as the [Puzzle methodology](https://arxiv.org/pdf/2411.19146), please see [Puzzle pruning example](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/feature/compress/examples/compress).
+    For more advanced pruning strategies, such as the [Puzzle methodology](https://arxiv.org/pdf/2411.19146), please see [Puzzle pruning example](../compress/README.md).
     ## Pre-Requisites
@@ Expand Down @@