#!/usr/bin/env python3
"""
AutoRound W4A16 Quantization for GLM-4.7 REAP models
This script quantizes a REAP-pruned GLM-4.7 model to INT4 weights using Intel's AutoRound.
Reduces model size by ~4x while maintaining quality.
Requirements:
pip install auto-round
Usage:
python run_autoround.py --model-path ./GLM-4.7-REAP-50 --output-dir ./GLM-4.7-REAP-50-W4A16
"""
import argparse
import subprocess
import sys
from pathlib import Path

def main():
    parser = argparse.ArgumentParser(description="AutoRound W4A16 quantization")
    parser.add_argument("--model-path", type=str, required=True,
                        help="Path to REAP-pruned model")
    parser.add_argument("--output-dir", type=str, default=None,
                        help="Output directory (default: {model-path}-W4A16)")
    parser.add_argument("--bits", type=int, default=4,
                        help="Weight bit width (default: 4)")
    parser.add_argument("--group-size", type=int, default=128,
                        help="Quantization group size (default: 128)")
    parser.add_argument("--format", type=str, default="auto_gptq",
                        choices=["auto_gptq", "auto_awq", "auto_round"],
                        help="Output format (default: auto_gptq)")
    parser.add_argument("--iters", type=int, default=200,
                        help="Optimization iterations (default: 200)")
    args = parser.parse_args()
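    # The --format choice selects the exported checkpoint layout: "auto_gptq"
    # and "auto_awq" emit GPTQ-/AWQ-compatible checkpoints that GPTQ/AWQ-aware
    # runtimes (e.g. vLLM, transformers) can load, while "auto_round" uses
    # AutoRound's own format.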
    # Validate
    if not Path(args.model_path).exists():
        print(f"ERROR: Model path not found: {args.model_path}")
        sys.exit(1)

    # Build output directory
    if args.output_dir is None:
        args.output_dir = f"{args.model_path}-W{args.bits}A16"
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
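    # "W{bits}A16" in the directory name: weights quantized to {bits} bits,
    # activations kept at 16-bit (FP16/BF16) precision.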
    # Get model size info
    model_size_gb = sum(f.stat().st_size for f in Path(args.model_path).rglob("*.safetensors")) / (1024**3)
    expected_output_gb = model_size_gb / 4  # ~4x compression for W4

    print("=" * 60)
    print(f"AutoRound W{args.bits}A16 Quantization")
    print("=" * 60)
    print(f"Input Model: {args.model_path}")
    print(f"Input Size: {model_size_gb:.1f} GB")
    print(f"Output: {args.output_dir}")
    print(f"Expected Output Size: ~{expected_output_gb:.1f} GB")
    print(f"Config: {args.bits}-bit, group_size={args.group_size}, format={args.format}")
    print("=" * 60)
    print("\nThis will take ~2-3 hours for a 92-layer MoE model...")
    print()
    # Build command
    cmd = [
        "auto-round",
        "--model", args.model_path,
        "--bits", str(args.bits),
        "--group_size", str(args.group_size),
        "--format", args.format,
        "--output_dir", args.output_dir,
        "--iters", str(args.iters),
    ]
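    # Run without capturing output so AutoRound's progress streams straight
    # to the terminal; subprocess.run inherits this process's stdout/stderr.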
    result = subprocess.run(cmd)

    if result.returncode == 0:
        # Calculate actual output size
        output_size_gb = sum(f.stat().st_size for f in Path(args.output_dir).rglob("*.safetensors")) / (1024**3)
        compression = model_size_gb / output_size_gb if output_size_gb > 0 else 0

        print("\n" + "=" * 60)
        print("AutoRound quantization complete!")
        print(f"Output: {args.output_dir}")
        print(f"Output Size: {output_size_gb:.1f} GB ({compression:.1f}x compression)")
        print("=" * 60)
    else:
        print(f"\nERROR: AutoRound failed with code {result.returncode}")
        sys.exit(1)

if __name__ == "__main__":
    main()