yangluo committed on
Commit
fafdd6c
·
verified ·
1 Parent(s): 72b4aee

Upload merge_hf_cloud.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. merge_hf_cloud.py +134 -0
merge_hf_cloud.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # /// script
3
+ # requires-python = ">=3.10"
4
+ # dependencies = [
5
+ # "torch",
6
+ # "peft>=0.18.0",
7
+ # "transformers>=5.1.0",
8
+ # "accelerate>=1.6.0",
9
+ # "huggingface_hub",
10
+ # ]
11
+ # ///
12
+ """
13
+ Merge LoRA Adapter and Push to Hub (HF Cloud)
14
+
15
+ This script is designed to run on Hugging Face Jobs infrastructure.
16
+ It loads a base model + LoRA adapter, merges them, and pushes the
17
+ standalone merged model to the Hub.
18
+
19
+ Usage:
20
+ # Submit via Makefile
21
+ make merge-cloud
22
+
23
+ # Or submit directly
24
+ python scripts/submit_cloud_training.py \
25
+ --flavor a100x4 --timeout 2h \
26
+ --script scripts/merge_hf_cloud.py
27
+ """
28
+
29
+ import os
30
+
31
+ import torch
32
+ from huggingface_hub import login
33
+ from peft import PeftModel
34
+ from transformers import AutoModelForCausalLM, AutoTokenizer
35
+
36
# ═══════════════════════════════════════════════════════════════
# CONFIGURATION
# ═══════════════════════════════════════════════════════════════

# Base checkpoint the LoRA adapter was trained on.
BASE_MODEL = "Qwen/Qwen3-Coder-Next"
# Hub repo holding the trained LoRA adapter weights.
ADAPTER_MODEL = "yangluo/univer-api-qwen3-next-coder"
# Destination Hub repo for the merged, standalone model.
MERGED_MODEL_ID = "yangluo/univer-api-qwen3-next-coder-merged"

# ═══════════════════════════════════════════════════════════════
# MERGE SCRIPT
# ═══════════════════════════════════════════════════════════════

48
+
49
def main():
    """Merge the LoRA adapter into the base model and push the result to the Hub.

    Steps: authenticate with HF_TOKEN, load tokenizer and base model in
    bfloat16, attach the LoRA adapter, merge it into the base weights, then
    push the merged model and tokenizer to MERGED_MODEL_ID.

    Returns:
        0 on success, 1 when HF_TOKEN is missing (pushing requires auth).
    """
    # Login to Hugging Face Hub — the push_to_hub calls below require a token.
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        print("Logging in to Hugging Face Hub...")
        login(token=hf_token, add_to_git_credential=False)
        print(" Login successful!")
    else:
        print("ERROR: HF_TOKEN not found in environment!")
        print("Cannot push merged model without authentication.")
        return 1

    print("=" * 60)
    print(" MERGE LoRA ADAPTER & PUSH TO HUB")
    print("=" * 60)
    print(f"Base model: {BASE_MODEL}")
    print(f"LoRA adapter: {ADAPTER_MODEL}")
    print(f"Merged output: {MERGED_MODEL_ID}")
    # Fixed: this was an f-string with no placeholders (F541).
    print("Dtype: bfloat16 (full precision, no quantization)")
    print("=" * 60)

    # ─────────────────────────────────────────────────────
    # Load Tokenizer
    # ─────────────────────────────────────────────────────
    print("\n[1/5] Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    print(f" Tokenizer loaded: {tokenizer.__class__.__name__}")

    # ─────────────────────────────────────────────────────
    # Load Base Model (bf16, no quantization)
    # ─────────────────────────────────────────────────────
    print(f"\n[2/5] Loading base model: {BASE_MODEL}")
    print(" Loading in bfloat16 with device_map='auto'...")

    # device_map='auto' shards the model across all visible GPUs.
    n_gpus = torch.cuda.device_count()
    print(f" Detected {n_gpus} GPU(s)")

    # NOTE(review): transformers deprecated `torch_dtype` in favor of `dtype`;
    # with the pinned `transformers>=5.1.0`, confirm `torch_dtype` is still
    # accepted and switch to `dtype=torch.bfloat16` if not.
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )

    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        print(f" Base model loaded, GPU memory: {allocated:.1f} GB")

    # ─────────────────────────────────────────────────────
    # Load LoRA Adapter
    # ─────────────────────────────────────────────────────
    print(f"\n[3/5] Loading LoRA adapter: {ADAPTER_MODEL}")
    model = PeftModel.from_pretrained(model, ADAPTER_MODEL)
    print(" Adapter loaded successfully")

    # ─────────────────────────────────────────────────────
    # Merge and Unload
    # ─────────────────────────────────────────────────────
    print("\n[4/5] Merging adapter into base model...")
    # merge_and_unload folds the LoRA deltas into the base weights and
    # returns a plain (non-PEFT) model suitable for standalone upload.
    model = model.merge_and_unload()
    print(" Merge complete!")

    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        print(f" GPU memory after merge: {allocated:.1f} GB")

    # ─────────────────────────────────────────────────────
    # Push to Hub
    # ─────────────────────────────────────────────────────
    print(f"\n[5/5] Pushing merged model to Hub: {MERGED_MODEL_ID}")
    print(" Pushing model (this may take a while for large models)...")
    model.push_to_hub(MERGED_MODEL_ID)
    print(" Model pushed!")

    print(" Pushing tokenizer...")
    tokenizer.push_to_hub(MERGED_MODEL_ID)
    print(" Tokenizer pushed!")

    print("\n" + "=" * 60)
    print(" MERGE COMPLETE!")
    print("=" * 60)
    print(f"Merged model: https://huggingface.co/{MERGED_MODEL_ID}")
    # Fixed: return an explicit 0 instead of falling off the end (None), so
    # main() consistently yields an int exit status on every path.
    return 0
if __name__ == "__main__":
    # main() returns 1 on auth failure and 0 (or None in older revisions) on
    # success; `or 0` normalises either success value to exit status 0.
    # Fixed: use `raise SystemExit` instead of the `exit()` builtin, which is
    # injected by the site module and not guaranteed to exist (e.g. `python -S`).
    raise SystemExit(main() or 0)