diff --git a/examples/compress/README.md b/examples/compress/README.md
index 3bd218aa4..755b6090e 100644
--- a/examples/compress/README.md
+++ b/examples/compress/README.md
@@ -13,7 +13,7 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg
 
 ## Environment
 
-- Install TensorRT-Model-Optimizer in editable mode with the corresponding dependencies:
+- Install Model-Optimizer in editable mode with the corresponding dependencies:
 
 ```bash
 pip install -e .[hf,compress]
@@ -94,7 +94,7 @@ pip install -e .[hf,compress]
    block_29:  attention  gqa_4   ffn  intermediate_14336
    block_30:  attention  gqa_4   ffn  intermediate_14336
    block_31:  attention  gqa_4   ffn  intermediate_14336
-   
+
    [2025-11-02 04:53:11,332]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 75796.4140625, 'stats.ffn_num_params': 5637275648, 'stats.num_kv_heads': 160, 'stats.kv_cache_memory_mib': 61440.0, 'stats.ffn_memory_mib': 10752.25, 'stats.attention_memory_mib': 63040.15625, 'stats.attention_num_params': 838942720, 'stats.num_params': 7526895616, 'stats.has_attention': 20, 'stats.has_ffn': 32}
    ...
    ################################################################
diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml
index 70b5304c5..133fe0b77 100644
--- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml
+++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml
@@ -9,7 +9,7 @@ defaults:
 puzzle_dir: ???
 teacher_dir: ${puzzle_dir}/ckpts/teacher/
 replacement_library_path: ${puzzle_dir}/replacement_library.json
-dataset_path: ???     # path to v0.4_mini
+dataset_path: ??? # path to v0.4_mini
 
 skip_realize_model: false
 
@@ -21,10 +21,10 @@ calc_subblock_stats:
   batch_sizes: [64, 96, 128]
   prefill_seq_len: 4096
   generation_seq_len: 4096
-  num_active_tokens_override:       # Optional override for sequence lengths
+  num_active_tokens_override: # Optional override for sequence lengths
   prefill_queue_size: 0
   allocate_prefill_query: false
-  benchmark_iterations:       # Set to a number (e.g., 1000) to enable runtime benchmarking
+  benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking
   merge_with_existing_stats: false
   subblock_stats_filename: "subblock_stats.json"
   moe_stats_filename: "moe_stats.json"
@@ -56,8 +56,6 @@ mip:
   # puzzle_profile:
   objective: metrics.cosine_embedding_loss_hidden_states
   bigger_is_better: false
-  num_solutions: 1
-  minimal_diversity: 2
 
   subblock_stats_args:
     - batch_size: 96
@@ -81,10 +79,7 @@ mip:
     target_memory: 78_000
 
   mip_constraints:
-  use_greedy_search: false
-  is_multi_layer_puzzle: true
   metric_overrides:
-  constrain_search_func:
   max_seconds_per_solution: 60
 
 realize_model:
@@ -92,10 +87,10 @@ realize_model:
   tokenizer_name: ${to_path:${teacher_dir}}
   replacement_library_path: ${replacement_library_path}
   save_models: true
-  solutions_path:     # Filled dynamically
+  solutions_path: # Filled dynamically
 
   # Validate params
-  skip_validation: false    # To enable validation of the model solution set `skip_validation` as False
+  skip_validation: false # To enable validation of the model solution, set `skip_validation` to false
   eval_samples: 128
   micro_batch_size: 1
   seed: 42
diff --git a/examples/pruning/README.md b/examples/pruning/README.md
index bbc0e7bde..9e5188e62 100644
--- a/examples/pruning/README.md
+++ b/examples/pruning/README.md
@@ -23,7 +23,7 @@ This section focuses on applying Model Optimizer's state-of-the-art complementar
 
 </div>
 
-For more advanced pruning strategies, such as the [Puzzle methodology](https://arxiv.org/pdf/2411.19146), please see [Puzzle pruning example](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/feature/compress/examples/compress).
+For more advanced pruning strategies, such as the [Puzzle methodology](https://arxiv.org/pdf/2411.19146), please see [Puzzle pruning example](../compress/README.md).
 
 ## Pre-Requisites
 
diff --git a/modelopt/torch/_compress/mip/constrain_search_space.py b/modelopt/torch/_compress/mip/constrain_search_space.py
deleted file mode 100644
index e30ee2478..000000000
--- a/modelopt/torch/_compress/mip/constrain_search_space.py
+++ /dev/null
@@ -1,407 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Constrains the search space for the MIP optimization."""
-
-import traceback
-
-from modelopt.torch._compress.decilm.deci_lm_hf_code.block_config import (
-    AttentionConfig,
-    BlockConfig,
-    FFNConfig,
-)
-from modelopt.torch._compress.utils.utils import load_json
-
-
-def drop_attentions_only(gathered_metrics, teacher_intermediate_size, teacher_n_heads_in_group):
-    """
-    changes the search space such that puzzle is not allowed to change the ffns
-    but is only allowed to drop or reduce attention.
-
-    Usage example:
-    add the following flags to your run_puzzle command:
-
-    --constrain_search_func drop_attentions_only --constrain_search_args {\"teacher_intermediate_size\": 14336, \"teacher_n_heads_in_group\": 16, \"above_layer\": 60}
-
-    """
-
-    for block_name, block_variants in gathered_metrics.items():
-        to_delete = []  # Collect keys to delete after the loop
-        for variant_config, variant_metrics in block_variants.items():
-            block_intermediate_size = variant_config.ffn.intermediate_size
-            block_attn_n_heads = variant_config.attention.n_heads_in_group
-            if (
-                (
-                    block_intermediate_size is not None
-                    and block_intermediate_size != teacher_intermediate_size
-                )
-                or variant_config.ffn.replace_with_linear
-                or variant_config.ffn.no_op  ## uncomment this line if you want to drop only attns
-                or variant_config.attention.replace_with_linear
-                or (
-                    block_attn_n_heads is not None
-                    and block_attn_n_heads != teacher_n_heads_in_group
-                )
-            ):
-                print(f"Marking for deletion: {block_name}-{variant_config}")
-                to_delete.append(variant_config)
-        for key in to_delete:
-            del block_variants[key]
-
-    print("new search space in block 0", gathered_metrics["block_0"])
-    return gathered_metrics
-
-
-def reduce_only_ffns(
-    gathered_metrics,
-    teacher_intermediate_size: int,
-    teacher_n_heads_in_group: int,
-    above_layer: int,
-    allow_no_ops: bool,
-):
-    """
-    only allows to reduce FFNs but not to completely drop them from layer 60 onwards
-    attention is only allowed to be like uniform teacher
-
-    Usage example:
-    add the following flags to your run_puzzle command:
-    constrain_search_args='{"teacher_intermediate_size": 14336, "teacher_n_heads_in_group": 16, "above_layer": 60, "allow_no_ops": false}'
-
-    sbatch puzzle/cli/run_puzzle ... --constrain_search_func reduce_only_ffns --constrain_search_args="$(echo "$constrain_search_args" | jq -c .)"
-    """
-    print(f"{teacher_n_heads_in_group=}")
-    for block_name, block_variants in gathered_metrics.items():
-        to_delete = []  # Collect keys to delete after the loop
-        block_id = int(block_name.split("_")[1])
-
-        for variant_config, variant_metrics in block_variants.items():
-            block_intermediate_size = variant_config.ffn.intermediate_size
-            block_attn_n_heads = variant_config.attention.n_heads_in_group
-
-            attn_no_op = variant_config.attention.no_op
-            attn_linear = variant_config.attention.replace_with_linear
-            if (
-                attn_no_op
-                or attn_linear
-                or (block_attn_n_heads != teacher_n_heads_in_group)  # keep attention as the teacher
-                or (
-                    block_id <= above_layer
-                    and (block_intermediate_size != teacher_intermediate_size)
-                )
-                or ((not allow_no_ops) and variant_config.ffn.no_op)
-            ):
-                # print(f"Marking for deletion: {block_name}-{variant_config}")
-                to_delete.append(variant_config)  # Add key to delete list
-
-        for key in to_delete:
-            del block_variants[key]
-
-    print("new search space in block 0", gathered_metrics["block_0"])
-    return gathered_metrics
-
-
-def drop_entire_blocks_only(gathered_metrics):
-    teacher_block_config = _infer_teacher_config(gathered_metrics)
-    for block_name, block_variants in gathered_metrics.items():
-        to_delete = []  # Collect keys to delete after the loop
-        for variant_config, variant_metrics in block_variants.items():
-            is_no_op_block = (
-                variant_config.ffn.no_op
-                and variant_config.attention.no_op
-                and getattr(variant_config, "parallel_blocks", None) is None
-            )
-            is_teacher = variant_config == teacher_block_config
-            if not is_no_op_block and not is_teacher:
-                to_delete.append(variant_config)
-        for key in to_delete:
-            del block_variants[key]
-
-    print("new search space in block 0", gathered_metrics["block_0"])
-    return gathered_metrics
-
-
-def css_to_reference_attention(gathered_metrics, attention_pruned_arch):
-    """
-    given a reference architecture we fix the search space to only include options that change the FFNs
-    but to never change the Attentions from the reference arch's Attentions.
-    """
-
-    attention_pruned_arch = load_json(attention_pruned_arch)[0]
-    attention_dropped_blocks = [
-        block_name
-        for block_name, block_config in attention_pruned_arch["chosen_items"].items()
-        if block_config["attention"]["no_op"]
-    ]
-
-    for block_name, block_variants in gathered_metrics.items():
-        to_delete = []  # Collect keys to delete after the loop
-        for variant_config, _ in block_variants.items():
-            # Uncomment and adjust this block if needed
-            # does drop only attention
-            block_attn_n_heads = variant_config.attention.n_heads_in_group
-
-            reference_arch_attn = attention_pruned_arch["chosen_items"][block_name]["attention"][
-                "n_heads_in_group"
-            ]
-            if (  # we reduce the search space by keeping the reference arch attention as is
-                (block_name in attention_dropped_blocks and not variant_config.attention.no_op)
-                or (
-                    block_name not in attention_dropped_blocks
-                    and block_attn_n_heads != reference_arch_attn
-                )
-            ):
-                print(f"Marking for deletion: {block_name}-{variant_config}")
-                to_delete.append(variant_config)
-
-        # Delete marked keys outside the loop
-        for key in to_delete:
-            del block_variants[key]
-
-    print("new search space in block 0", gathered_metrics["block_0"])
-    return gathered_metrics
-
-
-def css_to_reference_ffn(gathered_metrics, ffn_pruned_arch, allow_linear_attn=True):
-    """
-    given a reference architecture we fix the search space to only include options that change the Attentions
-    but to never change the FFNs from the reference arch's FFNs.
-    """
-
-    ffn_pruned_arch = load_json(ffn_pruned_arch)[0]
-
-    for block_name, block_variants in gathered_metrics.items():
-        to_delete = []  # Collect keys to delete after the loop
-        for variant_config, _ in block_variants.items():
-            block_ffn = variant_config.ffn
-            is_linear_attn = variant_config.attention.replace_with_linear
-
-            reference_arch_ffn = ffn_pruned_arch["chosen_items"][block_name]["ffn"]
-            reference_arch_ffn = FFNConfig(**reference_arch_ffn)
-
-            if (  # we reduce the search space by keeping the reference arch ffn as is
-                (block_ffn != reference_arch_ffn) or (not allow_linear_attn and is_linear_attn)
-            ):
-                # print(f"Marking for deletion: {block_name}-{variant_config}")
-                to_delete.append(variant_config)
-
-        # Delete marked keys outside the loop
-        for key in to_delete:
-            del block_variants[key]
-
-    print("new search space in block 0", gathered_metrics["block_0"])
-    return gathered_metrics
-
-
-def avoid_variable_gqa(
-    gathered_metrics,
-    allow_no_op_attn: bool = True,
-    allow_linear_attn: bool = False,
-    target_n_heads_in_group: int = None,
-):
-    """
-    Allow only the teacher n_heads_in_group,
-    and optionally also attention no-op (default allow)
-    and attention linear (default avoid).
-
-    This reducer affects only the attention layers: FFNs are allowed their entire search space.
-    """
-    is_multi_layer_puzzle = is_replacement_gathered_metrics(gathered_metrics)
-    if is_multi_layer_puzzle:
-        teacher_block_config = infer_teacher_replacement_config(gathered_metrics)
-    else:
-        teacher_block_config = _infer_teacher_config(gathered_metrics)
-
-    if target_n_heads_in_group is None:
-        target_n_heads_in_group = teacher_block_config.attention.n_heads_in_group
-
-    if not is_multi_layer_puzzle:
-        for block_name, block_variants in gathered_metrics.items():
-            to_delete = []  # Collect keys to delete after the loop
-
-            for variant_config, variant_metrics in block_variants.items():
-                if not (
-                    (variant_config.attention.n_heads_in_group == target_n_heads_in_group)
-                    or (variant_config.attention.no_op and allow_no_op_attn)
-                    or (variant_config.attention.replace_with_linear and allow_linear_attn)
-                ):
-                    to_delete.append(variant_config)
-
-            for key in to_delete:
-                del block_variants[key]
-    else:
-        to_delete = []  # Collect keys to delete after the loop
-        for replacement_id, replacement in gathered_metrics.items():
-            variant_config = replacement["block_config"]
-            if not (
-                (variant_config.attention.n_heads_in_group == target_n_heads_in_group)
-                or (variant_config.attention.no_op and allow_no_op_attn)
-                or (variant_config.attention.replace_with_linear and allow_linear_attn)
-            ):
-                to_delete.append(replacement_id)
-
-        for key in to_delete:
-            del gathered_metrics[key]
-    if not is_multi_layer_puzzle:
-        print("new search space in block 0", gathered_metrics["block_0"])
-    else:
-        parent_layer_idx = 0
-        print(
-            "new search space in block {parent_layer_idx}",
-            [
-                replacement["block_config"]
-                for replacement_id, replacement in gathered_metrics.items()
-                if replacement["parent_layer_indices"][0] == parent_layer_idx
-            ],
-        )
-    return gathered_metrics
-
-
-def reduce_in_range(
-    gathered_metrics,
-    layer_start: int,
-    layer_end: int,
-):
-    """
-    Allow only reduction of layers between layer_start and layer_end. Leyers before layers start, and after layer_end are kept as is (the teacher).
-
-    """
-    assert layer_start < layer_end, (
-        f"Wrong input arguments: {layer_start=} must be less than {layer_end=}"
-    )
-    is_multi_layer_puzzle = is_replacement_gathered_metrics(gathered_metrics)
-    if is_multi_layer_puzzle:
-        teacher_block_config = infer_teacher_replacement_config(gathered_metrics)
-    else:
-        teacher_block_config = _infer_teacher_config(gathered_metrics)
-
-    to_delete = []  # Collect keys to delete after the loop
-    for replacement_id, replacement in gathered_metrics.items():
-        block_id = max(replacement["parent_layer_indices"])
-        variant_config = replacement["block_config"]
-        is_teacher = variant_config == teacher_block_config
-        if (block_id < layer_start or block_id > layer_end) and not is_teacher:
-            to_delete.append(replacement_id)
-
-    for key in to_delete:
-        del gathered_metrics[key]
-
-    if not is_multi_layer_puzzle:
-        print("new search space in block 0", gathered_metrics["block_0"])
-    else:
-        parent_layer_idx = 0
-        print(
-            "new search space in block {parent_layer_idx}",
-            [
-                replacement["block_config"]
-                for replacement_id, replacement in gathered_metrics.items()
-                if replacement["parent_layer_indices"][0] == parent_layer_idx
-            ],
-        )
-    return gathered_metrics
-
-
-#############################################################################################
-
-
-# automatically builds a dictionary mapping method names in this module to their functions
-# this dictionary is used to dynamically dispatch functions
-dispatcher = {
-    method_name: method_callable
-    for method_name, method_callable in globals().items()
-    if callable(method_callable)
-}
-
-
-def is_replacement_gathered_metrics(gathered_metrics) -> bool:
-    # if the gathered metrics is a replacement, then it is a dictionary of the form {'replacement_{id}': replacement_metrics}
-
-    return isinstance(gathered_metrics, dict) and all(
-        key.startswith("replacement_") for key in gathered_metrics
-    )
-
-
-def _infer_teacher_config(gathered_metrics) -> BlockConfig:
-    n_heads_in_group, intermediate_size = zip(
-        *[
-            (variant_config.attention.n_heads_in_group, variant_config.ffn.intermediate_size)
-            for block_name, block_variants in gathered_metrics.items()
-            for variant_config, variant_metrics in block_variants.items()
-        ]
-    )
-    teacher_n_heads_in_group = min(filter(None, n_heads_in_group))
-    teacher_intermediate_size = max(filter(None, intermediate_size))
-
-    unique_teacher_candidates = set()
-    for block_name, block_variants in gathered_metrics.items():
-        for variant_config, variant_metrics in block_variants.items():
-            if (
-                variant_config.ffn.intermediate_size == teacher_intermediate_size
-                and variant_config.attention.n_heads_in_group == teacher_n_heads_in_group
-            ):
-                unique_teacher_candidates.add(variant_config)
-
-    assert len(unique_teacher_candidates) == 1, (
-        f"Woops, expected example one candidate to be the teacher block config, instead found: {unique_teacher_candidates=}"
-    )
-
-    teacher_block_config = unique_teacher_candidates.pop()
-    return teacher_block_config
-
-
-def infer_teacher_replacement_config(gathered_metrics) -> BlockConfig:
-    n_heads_in_group, intermediate_size = zip(
-        *[
-            (
-                replacement["block_config"].attention.n_heads_in_group,
-                replacement["block_config"].ffn.intermediate_size,
-            )
-            for replacement_id, replacement in gathered_metrics.items()
-        ]
-    )
-    teacher_intermediate_size = max(filter(None, intermediate_size))
-    teacher_n_heads_in_group = min(filter(None, n_heads_in_group))
-    unique_teacher_candidates = set()
-    for replacement_id, replacement in gathered_metrics.items():
-        if (
-            replacement["block_config"].ffn.intermediate_size == teacher_intermediate_size
-            and replacement["block_config"].attention.n_heads_in_group == teacher_n_heads_in_group
-        ):
-            unique_teacher_candidates.add(replacement["block_config"])
-
-    assert len(unique_teacher_candidates) == 1, (
-        f"Woops, expected example one candidate to be the teacher block config, instead found: {unique_teacher_candidates=}"
-    )
-
-    teacher_replacement_config = unique_teacher_candidates.pop()
-    return teacher_replacement_config
-
-
-def apply(css_func_name, gathered_metrics, method_kwargs):
-    search_space_reducer = dispatcher.get(css_func_name)
-    if search_space_reducer is None:
-        raise ValueError(
-            f"could not find a function called `{css_func_name}` in {__name__}.py to reduce search space "
-        )
-
-    try:
-        gathered_metrics = search_space_reducer(gathered_metrics, **method_kwargs)
-    except Exception as e:
-        traceback.print_exc()
-        raise ValueError(
-            f"something went wrong when trying to apply the following search space reducer `{css_func_name}` \
-                         with the folloing args: {method_kwargs}, here's the exception: {e}"
-        )
-
-    return gathered_metrics
diff --git a/modelopt/torch/_compress/mip/greedy_search_with_multi_layer_replacements.py b/modelopt/torch/_compress/mip/greedy_search_with_multi_layer_replacements.py
deleted file mode 100644
index 719643cc2..000000000
--- a/modelopt/torch/_compress/mip/greedy_search_with_multi_layer_replacements.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Performs greedy search to find optimal multi-layer replacements under resource constraints."""
-
-# mypy: ignore-errors
-import math
-from copy import deepcopy
-from random import random
-from typing import Any, Hashable, TypeAlias
-
-from .utils import InfeasibleError, consecutive_ngrams, get_nested_key, sort_replacements
-
-ReplacementID: TypeAlias = Hashable
-Replacement: TypeAlias = dict[str, Any]
-ChosenReplacements: TypeAlias = list[Replacement]
-
-
-def run_greedy_search(
-    teacher_replacements: list[Replacement],
-    student_replacements: list[Replacement],
-    objective: str,
-    constraints: dict[str, float],
-    bigger_is_better: bool,
-) -> tuple[ChosenReplacements, float, dict[str, float]]:
-    print("#######  running greedy search  #######")
-    teacher_replacements = deepcopy(teacher_replacements)
-    student_replacements = deepcopy(student_replacements)
-    chosen_replacements: ChosenReplacements = []
-
-    teacher_replacements = {
-        replacement["parent_layer_indices"][0]: replacement for replacement in teacher_replacements
-    }
-
-    all_parent_layers = set(teacher_replacements.keys())
-    uncovered_parent_layers = set(all_parent_layers)
-
-    while True:
-        if len(student_replacements) == 0:
-            raise InfeasibleError()
-
-        choice_func = max if bigger_is_better else min
-        best_replacement = choice_func(
-            student_replacements, key=lambda replacement: get_nested_key(replacement, objective)
-        )
-        chosen_replacements.append(best_replacement)
-        uncovered_parent_layers -= set(best_replacement["parent_layer_indices"])
-        student_replacements = _filter_overlapping_replacements(
-            student_replacements, uncovered_parent_layers
-        )
-
-        padded_chosen_replacements = list(chosen_replacements)
-        for uncovered_block_idx in uncovered_parent_layers:
-            padded_chosen_replacements.append(teacher_replacements[uncovered_block_idx])
-
-        all_constraints_satisfied = True
-        for constraint_key, max_cost in constraints.items():
-            total_cost = sum(
-                get_nested_key(replacement, constraint_key)
-                for replacement in padded_chosen_replacements
-            )
-            is_constraint_satisfied = total_cost < max_cost or math.isclose(
-                total_cost, max_cost, rel_tol=1e-9
-            )
-            if not is_constraint_satisfied:
-                all_constraints_satisfied = False
-
-        if all_constraints_satisfied:
-            chosen_replacements = padded_chosen_replacements
-            break
-
-    # Trust But Verify: calculate total value and costs, and check that all the constraints are filled
-    total_value = 0.0
-    total_costs = {constraint_key: 0 for constraint_key in constraints.keys()}
-    chosen_layers = set()
-    for replacement in chosen_replacements:
-        total_value += get_nested_key(replacement, objective)
-        for constraint_key in constraints.keys():
-            total_costs[constraint_key] += get_nested_key(replacement, constraint_key)
-        for parent_layer_idx in replacement["parent_layer_indices"]:
-            assert parent_layer_idx not in chosen_layers, (
-                f"Found duplicate chosen layer {parent_layer_idx}"
-            )
-            chosen_layers.add(parent_layer_idx)
-
-    missing_layers = all_parent_layers - set(chosen_layers)
-    assert len(missing_layers) == 0, (
-        f"The following layers were not chosen by any replacement:\n{missing_layers=}\n{chosen_replacements}"
-    )
-
-    for constraint_key, max_cost in constraints.items():
-        assert total_costs[constraint_key] < max_cost or math.isclose(
-            total_costs[constraint_key], max_cost, rel_tol=1e-9
-        ), (
-            f"this constraint was violated {constraint_key} in the solution, sol val={total_costs[constraint_key]} <= {max_cost=}"
-        )
-
-    chosen_replacements = sort_replacements(chosen_replacements)
-    for cr in chosen_replacements:
-        if "block_config" in cr:
-            cr["child_block_configs"] = cr["block_config"]
-
-    return [
-        {
-            "chosen_replacements": chosen_replacements,
-            "total_value": total_value,
-            "total_costs": total_costs,
-        }
-    ]
-
-
-def _filter_overlapping_replacements(
-    replacements: list[Replacement],
-    uncovered_parent_layers: set[int],
-) -> list[Replacement]:
-    return [
-        replacement
-        for replacement in replacements
-        if set(replacement["parent_layer_indices"]).issubset(uncovered_parent_layers)
-    ]
-
-
-def usage_example():
-    num_layers = 32
-    num_options_per_parent_replacement = 5
-
-    teacher_replacements = []
-    student_replacements = []
-    for num_layers_in_replacement in (1, 2, 3):
-        for i_option in range(num_options_per_parent_replacement):
-            for parent_layer_indices in consecutive_ngrams(num_layers, num_layers_in_replacement):
-                is_teacher = num_layers_in_replacement == 1 and i_option == 0
-                replacement_id = f"parent layers {parent_layer_indices}  child config {i_option}"
-                replacement = {
-                    "parent_layer_indices": parent_layer_indices,
-                    "metrics": {"loss": random() if not is_teacher else 0.0},
-                    "stats": {"cost": 1},
-                    "replacement_id": replacement_id,
-                }
-                if is_teacher:
-                    teacher_replacements.append(replacement)
-                else:
-                    student_replacements.append(replacement)
-
-    constraints = {"stats.cost": num_layers - 8}
-    (result,) = run_greedy_search(
-        teacher_replacements,
-        student_replacements,
-        objective="metrics.loss",
-        constraints=constraints,
-        bigger_is_better=False,
-    )
-    chosen_replacements = result["chosen_replacements"]
-    total_value = result["total_value"]
-    total_costs = result["total_costs"]
-
-    print()
-    print()
-    print(f"{total_value=}")
-    print(f"{total_costs=}")
-    print(f"{constraints=}")
-    print("chosen_replacements=")
-    print(chosen_replacements)
-    print("\n".join([rep["replacement_id"] for rep in chosen_replacements]))
-
-
-if __name__ == "__main__":
-    usage_example()
diff --git a/modelopt/torch/_compress/mip/grouped_knapsack.py b/modelopt/torch/_compress/mip/grouped_knapsack.py
deleted file mode 100644
index 5769ded3c..000000000
--- a/modelopt/torch/_compress/mip/grouped_knapsack.py
+++ /dev/null
@@ -1,231 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Solves the grouped knapsack problem using Mixed Integer Programming to find optimal item selections."""
-
-# mypy: ignore-errors
-import math
-import warnings
-from copy import deepcopy
-from random import random
-from typing import Any, Hashable, Iterable, Optional, TypeAlias, Union
-
-from mip import BINARY, Model, maximize, minimize, xsum
-from tqdm import tqdm
-
-from .utils import InfeasibleError, get_nested_key
-
-Item: TypeAlias = dict[str, float | dict[str, float]]
-Group: TypeAlias = dict[Hashable, Item]
-ChosenItems: TypeAlias = dict[Hashable, Hashable]
-
-
-def multi_solution_grouped_knapsack(
-    groups: dict[Hashable, Group],
-    objective: str,
-    constraints: dict[str, float],
-    bigger_is_better: bool,
-    num_solutions: int,
-    minimal_diversity: int = 1,
-    max_seconds_per_solution: Optional[float] = None,
-) -> list[dict[str, Union[ChosenItems, float]]]:
-    solutions = []
-    previous_choices = []
-    for i_run in tqdm(range(num_solutions), desc="multi_solution_grouped_knapsack"):
-        try:
-            chosen_items, total_value, total_costs = grouped_knapsack(
-                groups,
-                objective,
-                constraints,
-                bigger_is_better,
-                previous_choices,
-                minimal_diversity,
-                max_seconds_per_solution,
-            )
-        except InfeasibleError:
-            warnings.warn(f"Found only {i_run} feasible solutions (requested {num_solutions})")
-            break
-        previous_choices.append(chosen_items)
-        solutions.append(
-            {"chosen_items": chosen_items, "total_value": total_value, "total_costs": total_costs}
-        )
-    return solutions
-
-
-def grouped_knapsack(
-    groups: dict[Hashable, Group],
-    objective: str,
-    constraints: dict[str, float | tuple[float, float]],
-    bigger_is_better: bool,
-    previous_choices: Optional[list[ChosenItems]] = None,
-    minimal_diversity: int = 1,
-    max_seconds_per_solution: Optional[float] = None,
-) -> tuple[ChosenItems, float, dict[str, float]]:
-    groups = deepcopy(groups)
-    mip_model = Model()
-
-    objective_vars = []
-    constraint_vars = {constraint_key: [] for constraint_key in constraints.keys()}
-    for group_name, group_items in groups.items():
-        group_vars = []
-        for item_name, item in group_items.items():
-            is_chosen = mip_model.add_var(var_type=BINARY)
-            item["is_chosen"] = is_chosen
-            group_vars.append(is_chosen)
-            objective_vars.append(is_chosen * get_nested_objective(item, objective))
-            for constraint_key in constraints.keys():
-                constraint_vars[constraint_key].append(
-                    is_chosen * get_nested_key(item, constraint_key)
-                )
-
-        mip_model += xsum(group_vars) == 1
-
-    for constraint_key, max_cost in constraints.items():
-        min_cost = None
-        if isinstance(max_cost, Iterable):
-            min_cost, max_cost = max_cost
-
-        if max_cost is not None:
-            mip_model += xsum(constraint_vars[constraint_key]) <= max_cost
-        if min_cost is not None:
-            mip_model += xsum(constraint_vars[constraint_key]) >= min_cost
-
-    if previous_choices is not None:
-        for previous_chosen_items in previous_choices:
-            corresponding_vars = [
-                groups[group_name][item_name]["is_chosen"]
-                for group_name, item_name in previous_chosen_items.items()
-            ]
-            mip_model += xsum(corresponding_vars) <= len(groups) - minimal_diversity
-
-    mip_model.objective = (
-        maximize(xsum(objective_vars)) if bigger_is_better else minimize(xsum(objective_vars))
-    )
-
-    if max_seconds_per_solution is not None:
-        mip_model.max_seconds = max_seconds_per_solution
-
-    mip_model.optimize()
-
-    if is_chosen.x is None:
-        raise InfeasibleError()
-
-    total_value = 0.0
-    total_costs = {constraint_key: 0 for constraint_key in constraints.keys()}
-    chosen_items: ChosenItems = dict()
-    for group_name, group_items in groups.items():
-        for item_name, item in group_items.items():
-            is_chosen = item["is_chosen"].x >= 0.99
-            if is_chosen:
-                assert group_name not in chosen_items
-                chosen_items[group_name] = item_name
-                total_value += get_nested_objective(item, objective)
-                for constraint_key in constraints.keys():
-                    total_costs[constraint_key] += get_nested_key(item, constraint_key)
-
-    if len(chosen_items) != len(groups):
-        in_groups_and_not_in_chosen_items = set(groups.keys()) - set(chosen_items.keys())
-        in_chosen_items_and_not_in_groups = set(chosen_items.keys()) - set(groups.keys())
-        missing_groups = [groups[key] for key in in_groups_and_not_in_chosen_items]
-        raise RuntimeError(f"""
-        Different number of 'chosen_items' and 'groups': {len(chosen_items)=}  {len(groups)=}
-        {in_groups_and_not_in_chosen_items=}
-        {in_chosen_items_and_not_in_groups=}
-        {missing_groups=}
-        """)
-
-    for constraint_key, max_cost in constraints.items():
-        min_cost = None
-        if isinstance(max_cost, Iterable):
-            min_cost, max_cost = max_cost
-
-        if max_cost is not None:
-            assert total_costs[constraint_key] < max_cost or math.isclose(
-                total_costs[constraint_key], max_cost, rel_tol=1e-9
-            ), (
-                f"This max_cost was violated {constraint_key} in the solution, sol val={total_costs[constraint_key]} > {max_cost=}"
-            )
-        if min_cost is not None:
-            assert total_costs[constraint_key] > min_cost or math.isclose(
-                total_costs[constraint_key], min_cost, rel_tol=1e-9
-            ), (
-                f"This min_cost was violated {constraint_key} in the solution, sol val={total_costs[constraint_key]} < {min_cost=}"
-            )
-
-    for previous_chosen_items in previous_choices:
-        num_differences = 0
-        for group_name in groups.keys():
-            num_differences += previous_chosen_items[group_name] != chosen_items[group_name]
-        assert num_differences >= minimal_diversity
-
-    return chosen_items, total_value, total_costs
-
-
-def get_nested_objective(dictionary: dict[str, Any], nested_key: str) -> Any:
-    if nested_key.startswith("metrics."):
-        # handle metrics that have '.' in their name
-        metric = nested_key.split("metrics.")[1]
-        return dictionary["metrics"][metric]
-    else:
-        return get_nested_key(dictionary, nested_key)
-
-
-def usage_example():
-    num_layers = 32
-    num_configs_per_block = 100
-    groups = {
-        f"layer_{i_layer}": {
-            f"config_{i_config}": {
-                "metrics": {"accuracy": random()},
-                "stats": {"memory_mib": random() * 100, "runtime_ms": random() * 10},
-            }
-            for i_config in range(num_configs_per_block)
-        }
-        for i_layer in range(num_layers)
-    }
-
-    minimal_diversity = 10
-    constraints = {"stats.memory_mib": num_layers * 50.0, "stats.runtime_ms": num_layers * 5.0}
-    solutions = multi_solution_grouped_knapsack(
-        groups,
-        objective="metrics.accuracy",
-        constraints=constraints,
-        bigger_is_better=True,
-        num_solutions=10,
-        minimal_diversity=minimal_diversity,
-    )
-
-    print()
-    print(constraints)
-
-    for i_run, solution in enumerate(solutions):
-        print()
-        print(f"run {i_run}")
-        print(solution)
-
-    print(f"Checking differences, should be at least {minimal_diversity}:")
-    for a in range(len(solutions)):
-        for b in range(a + 1, len(solutions)):
-            num_differences = 0
-            for group_name in groups.keys():
-                num_differences += (
-                    solutions[a]["chosen_items"][group_name]
-                    != solutions[b]["chosen_items"][group_name]
-                )
-            print(a, "<>", b, "=", num_differences)
-
-
-if __name__ == "__main__":
-    usage_example()
diff --git a/modelopt/torch/_compress/mip/mip_and_realize_models.py b/modelopt/torch/_compress/mip/mip_and_realize_models.py
index 83d8b23f5..f6d77d262 100644
--- a/modelopt/torch/_compress/mip/mip_and_realize_models.py
+++ b/modelopt/torch/_compress/mip/mip_and_realize_models.py
@@ -44,12 +44,19 @@ def launch_realize_model(cfg: DictConfig, runtime: IRuntime):
 
 
 def launch_mip_and_realize_model(cfg: DictConfig, runtime: IRuntime):
+    # Determine device for distributed operations (NCCL requires CUDA tensors)
+    device = "cpu"
+    if runtime.world_size > 1 and dist.is_initialized():
+        backend = dist.get_backend()
+        if backend == "nccl":
+            device = torch.cuda.current_device()
+
     if runtime.is_main_process:
         solution_paths = launch_mip(cfg)
-        length_tensor = torch.tensor([len(solution_paths)], dtype=torch.long)
+        length_tensor = torch.tensor([len(solution_paths)], dtype=torch.long, device=device)
     else:
         solution_paths = None
-        length_tensor = torch.tensor([0], dtype=torch.long)
+        length_tensor = torch.tensor([0], dtype=torch.long, device=device)
 
     if not cfg.skip_realize_model:
         if runtime.world_size > 1:
@@ -75,7 +82,7 @@ def main(cfg: DictConfig) -> None:
     cfg = hydra.utils.instantiate(cfg)
 
     _runtime = (
-        NativeDDP_Runtime(
+        NativeDdpRuntime(
             dtype=torch.bfloat16, torch_distributed_timeout=getattr(cfg, "nccl_timeout_minutes")
         )
         if is_distributed()
diff --git a/modelopt/torch/_compress/mip/mip_with_multi_layer_replacements.py b/modelopt/torch/_compress/mip/mip_with_multi_layer_replacements.py
index 50525c846..438db3312 100644
--- a/modelopt/torch/_compress/mip/mip_with_multi_layer_replacements.py
+++ b/modelopt/torch/_compress/mip/mip_with_multi_layer_replacements.py
@@ -25,7 +25,12 @@
 
 from mip import BINARY, Model, maximize, minimize, xsum
 
-from .utils import InfeasibleError, consecutive_ngrams, get_nested_key, sort_replacements
+from modelopt.torch._compress.mip.utils import (
+    InfeasibleError,
+    consecutive_ngrams,
+    get_nested_key,
+    sort_replacements,
+)
 
 ReplacementID: TypeAlias = Hashable
 Replacement: TypeAlias = dict[str, Any]
diff --git a/modelopt/torch/_compress/mip/run_puzzle.py b/modelopt/torch/_compress/mip/run_puzzle.py
index fd883e969..5773349c1 100644
--- a/modelopt/torch/_compress/mip/run_puzzle.py
+++ b/modelopt/torch/_compress/mip/run_puzzle.py
@@ -28,15 +28,11 @@
 import yaml
 from omegaconf import DictConfig, ListConfig, OmegaConf
 
-import modelopt.torch._compress.mip.constrain_search_space as css
 from modelopt.torch._compress.decilm.deci_lm_hf_code.block_config import (
     AttentionConfig,
     BlockConfig,
     FFNConfig,
 )
-from modelopt.torch._compress.mip.greedy_search_with_multi_layer_replacements import (
-    run_greedy_search,
-)
 from modelopt.torch._compress.mip.mip_with_multi_layer_replacements import (
     run_mip as run_multi_layer_replacement_mip,
 )
@@ -211,8 +207,6 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--human_constraints", type=parse_json)
     parser.add_argument("--report_additional_costs", type=str, action="append", default=[])
 
-    parser.add_argument("--num_solutions", type=int)
-    parser.add_argument("--minimal_diversity", type=int)
     parser.add_argument(
         "--output_path",
         type=parse_path,
@@ -227,21 +221,6 @@ def parse_args() -> argparse.Namespace:
         help="Set this if using accuracy objective, don't set if using loss objective",
     )
 
-    parser.add_argument("--constrain_search_func", type=str, default=None)
-    parser.add_argument("--constrain_search_args", type=parse_json, default=dict())
-
-    parser.add_argument(
-        "--is_multi_layer_puzzle",
-        action="store_true",
-        default=True,
-        help="[DEPRECATED] This flag is now always True. Kept for backward compatibility.",
-    )
-    parser.add_argument(
-        "--use_greedy_search",
-        action="store_true",
-        help="Use greedy search instead of mip. Only supported for multi-layer puzzle.",
-    )
-
     args = parser.parse_args()
     return args
 
@@ -254,17 +233,14 @@ def run_single_puzzle_config(
     constraints: PuzzleConstraints,
     output_folder,
 ) -> None:
-    from modelopt.torch._compress.mip.grouped_knapsack import multi_solution_grouped_knapsack
-
-    args = deepcopy(
-        args
-    )  # we override the constraints and subblock_stats_args for this run to keep reporting out the same way.
+    # we override the constraints and subblock_stats_args for this run to keep reporting out the same way.
+    args = deepcopy(args)
 
     subblock_stats = filter_subblock_stats_by_args(subblock_stats, subblock_stats_args)
     _add_block_stats_to_gathered_metrics(gathered_metrics, subblock_stats)
 
     output_folder.mkdir(parents=True, exist_ok=True)
-    _dump_gathered_metrics(gathered_metrics, output_folder, args.is_multi_layer_puzzle)
+    _dump_gathered_metrics(gathered_metrics, output_folder)
 
     non_block_stats = {"stats": _get_block_stats(subblock_stats, "non_block")}
     batch_size = subblock_stats["args"]["batch_size"]
@@ -304,40 +280,13 @@ def run_single_puzzle_config(
 
     mprint(f"After non-block adjustments: {mip_constraints=}")
 
-    if args.is_multi_layer_puzzle:
-        if not args.use_greedy_search:
-            solutions = run_multi_layer_replacement_mip(
-                replacements=gathered_metrics,
-                objective=args.objective,
-                constraints=mip_constraints,
-                bigger_is_better=args.bigger_is_better,
-                max_seconds_per_solution=args.max_seconds_per_solution,
-            )
-        else:
-            teacher_replacements, student_replacements = [], []
-            for replacement in gathered_metrics.values():
-                if replacement["is_teacher"]:
-                    teacher_replacements.append(replacement)
-                else:
-                    student_replacements.append(replacement)
-
-            solutions = run_greedy_search(
-                teacher_replacements=teacher_replacements,
-                student_replacements=student_replacements,
-                objective=args.objective,
-                constraints=mip_constraints,
-                bigger_is_better=args.bigger_is_better,
-            )
-    else:
-        solutions = multi_solution_grouped_knapsack(
-            groups=gathered_metrics,
-            objective=args.objective,
-            constraints=mip_constraints,
-            bigger_is_better=args.bigger_is_better,
-            num_solutions=args.num_solutions,
-            minimal_diversity=args.minimal_diversity,
-            max_seconds_per_solution=args.max_seconds_per_solution,
-        )
+    solutions = run_multi_layer_replacement_mip(
+        replacements=gathered_metrics,
+        objective=args.objective,
+        constraints=mip_constraints,
+        bigger_is_better=args.bigger_is_better,
+        max_seconds_per_solution=args.max_seconds_per_solution,
+    )
 
     for solution in solutions:
         for stat_name in set([*orig_mip_constraints.keys(), *args.report_additional_costs]):
@@ -379,25 +328,10 @@ def run_single_puzzle_config(
     return solutions_file
 
 
-def _dump_gathered_metrics(
-    gathered_metrics: PuzzleMetrics, output_folder: Path, is_multi_layer_puzzle: bool = False
-) -> None:
-    if is_multi_layer_puzzle:
-        for replacement_id, replacement_info in gathered_metrics.items():
-            replacement_info["block_repr"] = block_config_to_str(replacement_info["block_config"])
-        gathered_metrics_for_dump = gathered_metrics
-    else:
-        gathered_metrics_for_dump = {
-            block_name: {
-                block_config_to_str(variant_config).strip(): {
-                    **variant_metrics,
-                    "block_config": variant_config,
-                    "block_repr": block_config_to_str(variant_config).strip(),
-                }
-                for variant_config, variant_metrics in block_variants.items()
-            }
-            for block_name, block_variants in gathered_metrics.items()
-        }
+def _dump_gathered_metrics(gathered_metrics: PuzzleMetrics, output_folder: Path) -> None:
+    for replacement_id, replacement_info in gathered_metrics.items():
+        replacement_info["block_repr"] = block_config_to_str(replacement_info["block_config"])
+    gathered_metrics_for_dump = gathered_metrics
 
     json_dump(gathered_metrics_for_dump, output_folder / "replacement_metrics_and_stats.json")
 
@@ -451,17 +385,12 @@ def _override_args_from_profile(args, puzzle_profile):
         if arg_name in puzzle_profile:
             if arg_name not in ("mip_constraints", "human_constraints", "subblock_stats_args"):
                 setattr(args, arg_name, puzzle_profile[arg_name])
-    if isinstance(args.constrain_search_args, str):
-        args.constrain_search_args = parse_json(args.constrain_search_args)
-    assert args.is_multi_layer_puzzle, "multi-layer puzzle is now the only supported mode."
 
 
 def _assert_valid_config(args, puzzle_profile):
     required_args = (
         "subblock_stats_path",
         "objective",
-        "num_solutions",
-        "minimal_diversity",
         "output_path",
     )
     missing_args = [arg for arg in required_args if arg not in args or getattr(args, arg) is None]
@@ -488,11 +417,6 @@ def _assert_valid_config(args, puzzle_profile):
         )
         exit(1)
 
-    if args.use_greedy_search:
-        assert args.is_multi_layer_puzzle, (
-            "--use_greedy_search is only supported for multi layer puzzle"
-        )
-
 
 def _get_minimal_unique_names(dicts: List[dict]) -> List[str]:
     all_keys = set(k for d in dicts for k in d.keys())
@@ -517,23 +441,13 @@ def run_puzzle(args: argparse.Namespace) -> List[str]:
     if args.gathered_metrics_path is not None:
         gathered_metrics = json.loads(args.gathered_metrics_path.read_text())
     else:
-        gather_func = (
-            gather_puzzle_metrics
-            if not args.is_multi_layer_puzzle
-            else gather_multi_layer_puzle_metrics
+        gathered_metrics = gather_multi_layer_puzle_metrics(
+            args.single_block_replacement_validation_dir
         )
-        gathered_metrics = gather_func(args.single_block_replacement_validation_dir)
 
     if args.metric_overrides is not None:
         gathered_metrics = {**gathered_metrics, **args.metric_overrides}
 
-    if args.constrain_search_func is not None:
-        mprint(f"{args.constrain_search_args=}")
-        # assert not args.is_multi_layer_puzzle, "conditional search is not implementd yet for multi-layer puzzles, did you implement it?"
-        gathered_metrics = css.apply(
-            args.constrain_search_func, gathered_metrics, args.constrain_search_args
-        )
-
     subblock_stats = json.loads(args.subblock_stats_path.read_text())
 
     all_subblock_args = _load_all_subblock_stats_args(args, puzzle_profile)
diff --git a/modelopt/torch/_compress/sewing_kit/utils.py b/modelopt/torch/_compress/sewing_kit/utils.py
index 16fe1b3fd..ff47c289b 100644
--- a/modelopt/torch/_compress/sewing_kit/utils.py
+++ b/modelopt/torch/_compress/sewing_kit/utils.py
@@ -447,13 +447,33 @@ def get_parent_module_names(module_name: str):
     return parent_module_names
 
 
+def _get_device_for_distributed(
+    group: Optional[torch.distributed.ProcessGroup] = None,
+) -> str:
+    """
+    Determine the appropriate device for distributed communication based on the backend.
+    NCCL backend requires CUDA tensors, while Gloo supports both CPU and CUDA.
+    """
+    if not torch.distributed.is_initialized():
+        return "cpu"
+
+    backend = torch.distributed.get_backend(group)
+    if backend == "nccl":
+        # NCCL requires CUDA tensors
+        return torch.cuda.current_device()
+    else:
+        # Gloo and other backends support CPU tensors
+        return "cpu"
+
+
 def distributed_isend_obj(
     obj: Any,
     dst: int = 0,
     group: Optional[torch.distributed.ProcessGroup] = None,
 ) -> list[Optional[torch.distributed.Work]]:
+    device = _get_device_for_distributed(group)
     obj_tensor, obj_size_tensor = torch.distributed.distributed_c10d._object_to_tensor(
-        obj, device="cpu", **_get_group_kwarg_if_necessary()
+        obj, device=device, **_get_group_kwarg_if_necessary()
     )
     works: list[Optional[torch.distributed.Work]] = [
         torch.distributed.isend(obj_size_tensor, dst, group),
@@ -484,11 +504,12 @@ def distributed_recv_obj(
     src: Optional[int] = None,
     group: Optional[torch.distributed.ProcessGroup] = None,
 ) -> Any:
-    obj_size_tensor = torch.LongTensor(1, device="cpu")
+    device = _get_device_for_distributed(group)
+    obj_size_tensor = torch.LongTensor(1).to(device)
     torch.distributed.recv(obj_size_tensor, src=src, group=group)
     obj_size = int(obj_size_tensor.item())
 
-    obj_tensor = torch.ByteTensor(obj_size, device="cpu")
+    obj_tensor = torch.ByteTensor(obj_size).to(device)
     torch.distributed.recv(obj_tensor, src=src, group=group)
 
     obj = torch.distributed.distributed_c10d._tensor_to_object(
diff --git a/setup.py b/setup.py
index d4077f709..20a271fe1 100644
--- a/setup.py
+++ b/setup.py
@@ -105,13 +105,13 @@
     "compress": [
         "fire",
         "hydra-core==1.3.2",
-        "omegaconf==2.3.0",
-        "wandb~=0.17.5",
-        "lru-dict",
-        "typeguard",
-        "pandas",
         "immutabledict",
+        "lru-dict",
         "mip",
+        "omegaconf==2.3.0",
+        "pandas",
+        "typeguard",
+        "wandb~=0.17.5",
     ],
 }
 
diff --git a/tests/gpu/torch/_compress/compress_test_utils.py b/tests/gpu/torch/_compress/compress_test_utils.py
index a1102e7fa..9df5f5bfc 100644
--- a/tests/gpu/torch/_compress/compress_test_utils.py
+++ b/tests/gpu/torch/_compress/compress_test_utils.py
@@ -29,11 +29,7 @@ def setup_test_model_and_data(
     tmp_path: Path,
     rank: int,
     runtime,
-) -> tuple[
-    Path,
-    Path,
-    Path,
-]:
+) -> tuple[Path, Path, Path]:
     """
     Setup the test model and data for the compress NAS search.
 
@@ -132,7 +128,7 @@ def setup_puzzle_dir(puzzle_dir: str):
         Path(puzzle_dir).mkdir(parents=True, exist_ok=True)
 
 
-def save_dummy_dataset(dataset_path: str):
+def save_dummy_dataset(dataset_path: Path | str):
     """
     Save a dummy dataset for testing purposes.
     """
@@ -170,4 +166,4 @@ def save_dummy_dataset(dataset_path: str):
 
     # For train-val splits
     data_dict = DatasetDict({"train": Dataset.from_list(data), "valid": Dataset.from_list(data)})
-    data_dict.save_to_disk(dataset_path)
+    data_dict.save_to_disk(str(dataset_path))
diff --git a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-attn-pruning.yaml b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-attn-pruning.yaml
index 21a3486f0..473a5d418 100644
--- a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-attn-pruning.yaml
+++ b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-attn-pruning.yaml
@@ -9,7 +9,7 @@ defaults:
 puzzle_dir: ???
 teacher_dir: ${puzzle_dir}/ckpts/teacher/
 replacement_library_path: ${puzzle_dir}/replacement_library.json
-dataset_path: ???     # path to v0.4_mini
+dataset_path: ??? # path to v0.4_mini
 
 skip_realize_model: false
 
@@ -21,10 +21,10 @@ calc_subblock_stats:
   batch_sizes: [64, 96, 128]
   prefill_seq_len: 4096
   generation_seq_len: 4096
-  num_active_tokens_override:       # Optional override for sequence lengths
+  num_active_tokens_override: # Optional override for sequence lengths
   prefill_queue_size: 0
   allocate_prefill_query: false
-  benchmark_iterations:       # Set to a number (e.g., 1000) to enable runtime benchmarking
+  benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking
   merge_with_existing_stats: false
   subblock_stats_filename: "subblock_stats.json"
   moe_stats_filename: "moe_stats.json"
@@ -54,8 +54,6 @@ mip:
   # puzzle_profile:
   objective: metrics.cosine_embedding_loss_hidden_states
   bigger_is_better: false
-  num_solutions: 1
-  minimal_diversity: 2
 
   subblock_stats_args:
     - batch_size: 96
@@ -79,10 +77,7 @@ mip:
     target_memory: 780_000 # 78_000
 
   mip_constraints:
-  use_greedy_search: false
-  is_multi_layer_puzzle: true
   metric_overrides:
-  constrain_search_func:
   max_seconds_per_solution: 60
 
 realize_model:
@@ -90,10 +85,10 @@ realize_model:
   tokenizer_name: ${to_path:${teacher_dir}}
   replacement_library_path: ${replacement_library_path}
   save_models: true
-  solutions_path:     # Filled dynamically
+  solutions_path: # Filled dynamically
 
   # Validate params
-  skip_validation: false    # To enable validation of the model solution set `skip_validation` as False
+  skip_validation: false # To enable validation of the model solution set `skip_validation` as False
   eval_samples: 2
   micro_batch_size: 1
   dataset_path: ${dataset_path}/valid
diff --git a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-ffn-pruning.yaml b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-ffn-pruning.yaml
index 1d8fac655..8af352660 100644
--- a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-ffn-pruning.yaml
+++ b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B-ffn-pruning.yaml
@@ -9,7 +9,7 @@ defaults:
 puzzle_dir: ???
 teacher_dir: ${puzzle_dir}/ckpts/teacher/
 replacement_library_path: ${puzzle_dir}/replacement_library.json
-dataset_path: ???     # path to v0.4_mini
+dataset_path: ??? # path to v0.4_mini
 
 skip_realize_model: false
 
@@ -21,10 +21,10 @@ calc_subblock_stats:
   batch_sizes: [64, 96, 128]
   prefill_seq_len: 4096
   generation_seq_len: 4096
-  num_active_tokens_override:       # Optional override for sequence lengths
+  num_active_tokens_override: # Optional override for sequence lengths
   prefill_queue_size: 0
   allocate_prefill_query: false
-  benchmark_iterations:       # Set to a number (e.g., 1000) to enable runtime benchmarking
+  benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking
   merge_with_existing_stats: false
   subblock_stats_filename: "subblock_stats.json"
   moe_stats_filename: "moe_stats.json"
@@ -54,8 +54,6 @@ mip:
   # puzzle_profile:
   objective: metrics.cosine_embedding_loss_hidden_states
   bigger_is_better: false
-  num_solutions: 1
-  minimal_diversity: 2
 
   subblock_stats_args:
     - batch_size: 96
@@ -79,10 +77,7 @@ mip:
     target_memory: 780_000 # 78_000
 
   mip_constraints:
-  use_greedy_search: false
-  is_multi_layer_puzzle: true
   metric_overrides:
-  constrain_search_func:
   max_seconds_per_solution: 60
 
 realize_model:
@@ -90,10 +85,10 @@ realize_model:
   tokenizer_name: ${to_path:${teacher_dir}}
   replacement_library_path: ${replacement_library_path}
   save_models: true
-  solutions_path:     # Filled dynamically
+  solutions_path: # Filled dynamically
 
   # Validate params
-  skip_validation: false    # To enable validation of the model solution set `skip_validation` as False
+  skip_validation: false # To enable validation of the model solution set `skip_validation` as False
   eval_samples: 2
   micro_batch_size: 1
   dataset_path: ${dataset_path}/valid
diff --git a/tests/gpu/torch/_compress/test_compress.py b/tests/gpu/torch/_compress/test_compress.py
index b00be2485..e40756602 100644
--- a/tests/gpu/torch/_compress/test_compress.py
+++ b/tests/gpu/torch/_compress/test_compress.py
@@ -33,20 +33,6 @@
 #
 # Note: Bypass is disabled now in the test.
 
-# How to run this test (currently only supported internally at Nvidia).
-#
-# Have both modelopt and puzzle source code in the same directory:
-# /workspace/modelopt
-# /workspace/puzzletron
-#
-# submit_job --partition interactive --time 0 \
-# --image gitlab-master.nvidia.com/deci/puzzletron:modelopt_main \
-# --workdir $MODELOPT SRC DIRECTORY --interactive --gpu 1
-#
-# export PYTHONPATH=$PYTHONPATH:.:/workspace/puzzletron/v1
-#
-# pytest -s -v ./tests/gpu/torch/_compress/test_compress.py::test_compress -o addopts=""
-
 
 def test_compress(project_root_path: Path, tmp_path: Path):
     spawn_multiprocess_job(