Skip to content
Merged
39 changes: 39 additions & 0 deletions modelopt/torch/quantization/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@
"algorithm": "max",
}


INT4_AWQ_CFG = {
"quant_cfg": {
"*weight_quantizer": {
Expand Down Expand Up @@ -1189,6 +1190,44 @@ class SVDQuantConfig(QuantizeAlgorithmConfig):
)


class GPTQLiteConfig(QuantizeAlgorithmConfig):
    """The config for GPTQ lite.

    GPTQ lite is a variant of GPTQ that does not exactly follow the official GPTQ
    implementation.

    GPTQ lite does not perform sequential quantization of layers. This means that the
    updated activations of an already-quantized layer are NOT used when processing the
    next layer, trading some potential accuracy for a simpler, non-sequential
    calibration pass.

    The default values are taken from the official GPTQ implementation:
    https://github.com/IST-DASLab/FP-Quant/blob/d2e3092f968262c4de5fb050e1aef568a280dadd/src/quantization/gptq.py#L35

    See the GPTQ paper (https://arxiv.org/abs/2210.17323) for the algorithm and the
    role of the Hessian damping factor.

    Note: This feature is currently experimental and may not translate to improved
    accuracy as expected.
    """

    # Discriminator selecting this algorithm; fixed literal, not user-tunable.
    method: Literal["gptq_lite"] = ModeloptField("gptq_lite")

    percdamp: float | None = ModeloptField(
        default=0.01,
        gt=0.0,
        le=1.0,
        title="Percentage damping factor.",
        description=(
            "The fraction of the average Hessian diagonal added to the diagonal as "
            "damping before inversion. Increasing it improves numerical stability on "
            "ill-conditioned layers at the cost of a less accurate weight update; "
            "decreasing it makes the update follow the Hessian more closely but may "
            "cause numerical failures. See https://arxiv.org/abs/2210.17323."
        ),
    )
    block_size: int | None = ModeloptField(
        default=128,
        title="Block size for GPTQ weight update.",
        description=(
            "The number of weight columns updated at a time. It must be a multiple of "
            "the quantization group size so that every quantization group falls "
            "entirely within a single GPTQ update block; otherwise the per-group "
            "scales computed during the update would span two blocks."
        ),
    )
    hessian_state_path: str | None = ModeloptField(
        default=None,
        title="Path to the Hessian state file.",
        description=(
            "The path to the Hessian state file. If the file exists, the Hessians are "
            "loaded from it instead of being recomputed from calibration data."
        ),
    )


QuantizeQuantCfgType = dict[
str | Callable,
QuantizerAttributeConfig
Expand Down
15 changes: 14 additions & 1 deletion modelopt/torch/quantization/mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
AWQFullCalibConfig,
AWQLiteCalibConfig,
CompressConfig,
GPTQLiteConfig,
MaxCalibConfig,
MseCalibConfig,
QuantizeAlgoCfgType,
Expand All @@ -55,7 +56,7 @@
restore_svdquant_model,
update_quantize_metadata,
)
from .model_calib import awq, max_calibrate, mse_calibrate, smoothquant, svdquant
from .model_calib import awq, gptq_lite, max_calibrate, mse_calibrate, smoothquant, svdquant

__all__ = ["BaseCalibrateModeDescriptor"]

Expand Down Expand Up @@ -439,3 +440,15 @@ def config_class(self) -> type[QuantizeAlgorithmConfig]:
def restore(self) -> RestoreEntrypoint:
"""The mode's entrypoint for restoring a model."""
return restore_svdquant_model


@CalibrateModeRegistry.register_mode
class GPTQLiteModeDescriptor(BaseCalibrateModeDescriptor):
    """Mode descriptor for the GPTQ lite calibration algorithm.

    Registers ``gptq_lite`` with the calibrate-mode registry, wiring the
    :class:`GPTQLiteConfig` config class to the ``gptq_lite`` calibration function.
    """

    @property
    def config_class(self) -> type[QuantizeAlgorithmConfig]:
        """Specifies the config class for the mode."""
        return GPTQLiteConfig

    # Calibration entrypoint invoked by the base descriptor machinery
    # (imported from .model_calib).
    _calib_func = gptq_lite
Loading
Loading