"""
AutoRound W4A16 quantization for GLM-4.7 REAP models.

This script quantizes a REAP-pruned GLM-4.7 model to INT4 weights
(4-bit weights, 16-bit activations) using Intel's AutoRound,
reducing model size by ~4x while maintaining quality.

Requirements:
    pip install auto-round

Usage:
    python run_autoround.py --model-path ./GLM-4.7-REAP-50 --output-dir ./GLM-4.7-REAP-50-W4A16
"""

import argparse
import subprocess
import sys
from pathlib import Path


def main():
    parser = argparse.ArgumentParser(description="AutoRound W4A16 quantization")
    parser.add_argument("--model-path", type=str, required=True,
                        help="Path to REAP-pruned model")
    parser.add_argument("--output-dir", type=str, default=None,
                        help="Output directory (default: {model-path}-W4A16)")
    parser.add_argument("--bits", type=int, default=4,
                        help="Weight bit width (default: 4)")
    parser.add_argument("--group-size", type=int, default=128,
                        help="Quantization group size (default: 128)")
    parser.add_argument("--format", type=str, default="auto_gptq",
                        choices=["auto_gptq", "auto_awq", "auto_round"],
                        help="Output format (default: auto_gptq)")
    parser.add_argument("--iters", type=int, default=200,
                        help="Optimization iterations (default: 200)")
    args = parser.parse_args()

    # Fail fast if the input model directory does not exist.
    if not Path(args.model_path).exists():
        print(f"ERROR: Model path not found: {args.model_path}")
        sys.exit(1)

    # Derive the output directory from the model path unless one was given.
    if args.output_dir is None:
        args.output_dir = f"{args.model_path}-W{args.bits}A16"
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    # Estimate the input size from the safetensors shards; a 4-bit checkpoint
    # typically lands near one quarter of the 16-bit size.
    model_size_gb = sum(f.stat().st_size for f in Path(args.model_path).rglob("*.safetensors")) / (1024**3)
    expected_output_gb = model_size_gb / 4

    print("=" * 60)
    print(f"AutoRound W{args.bits}A16 Quantization")
    print("=" * 60)
    print(f"Input Model: {args.model_path}")
    print(f"Input Size: {model_size_gb:.1f} GB")
    print(f"Output: {args.output_dir}")
    print(f"Expected Output Size: ~{expected_output_gb:.1f} GB")
    print(f"Config: {args.bits}-bit, group_size={args.group_size}, format={args.format}")
    print("=" * 60)
    print("\nThis will take ~2-3 hours for a 92-layer MoE model...")
    print()

    # Build the auto-round CLI invocation; flag names follow the auto-round
    # command-line interface (note the underscore spellings).
    cmd = [
        "auto-round",
        "--model", args.model_path,
        "--bits", str(args.bits),
        "--group_size", str(args.group_size),
        "--format", args.format,
        "--output_dir", args.output_dir,
        "--iters", str(args.iters),
    ]
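
    # For the docstring's example invocation, this expands to:
    #   auto-round --model ./GLM-4.7-REAP-50 --bits 4 --group_size 128 \
    #       --format auto_gptq --output_dir ./GLM-4.7-REAP-50-W4A16 --iters 200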

    # Stream AutoRound's own progress output directly to the console.
    result = subprocess.run(cmd)

    if result.returncode == 0:
        # Report the achieved compression from the on-disk shard sizes.
        output_size_gb = sum(f.stat().st_size for f in Path(args.output_dir).rglob("*.safetensors")) / (1024**3)
        compression = model_size_gb / output_size_gb if output_size_gb > 0 else 0

        print("\n" + "=" * 60)
        print("AutoRound quantization complete!")
        print(f"Output: {args.output_dir}")
        print(f"Output Size: {output_size_gb:.1f} GB ({compression:.1f}x compression)")
        print("=" * 60)
    else:
        print(f"\nERROR: AutoRound failed with code {result.returncode}")
        sys.exit(1)


if __name__ == "__main__":
    main()