nvhshin committed
Commit aca6709 · verified · 1 Parent(s): fea6430

Update README.md

Files changed (1)
  1. README.md +19 -139
README.md CHANGED
@@ -104,151 +104,31 @@ This code has been tested on Transformers v4.51.2, torch 2.6.0+cu124 and 2 NVIDI
  The model was trained with the "nothink" instruction in order not to lose Qwen-3's reasoning ability.
  When prompting the model, please append ` /nothink` to the user query and an empty thinking trace `<think>\n\n</think>\n\n` to the model response.

- Server code to host the model:

  ```python
- # server.py
-
- import argparse
  import torch
- from fastapi import FastAPI
- from pydantic import BaseModel
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
- import uvicorn
-
- app = FastAPI()
-
- class GenerateRequest(BaseModel):
-     prompts: list[str]
-
- # Globals to hold our model/tokenizer once loaded
- tokenizer: AutoTokenizer
- model: AutoModelForSequenceClassification
-
- def load_model(model_path: str):
-     global tokenizer, model
-
-     print(f"Loading model from {model_path}…")
-     # load tokenizer as usual
-     tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-     # load the model in FP16 to save memory; drop .to() calls
-     model = AutoModelForSequenceClassification.from_pretrained(
-         model_path,
-         pad_token_id=tokenizer.pad_token_id,
-         torch_dtype=torch.float16,
-         device_map="auto",
-     )
-
-     # optional: disable dropout everywhere
-     disable_dropout_in_model(model)
-
-     print("Model loaded and dispatched across GPUs.")
-
- def disable_dropout_in_model(module: torch.nn.Module) -> None:
-     for m in module.modules():
-         if isinstance(m, torch.nn.Dropout):
-             m.p = 0
-
- @app.post("/generate")
- async def generate(req: GenerateRequest):
-     inputs = tokenizer(
-         req.prompts,
-         return_tensors="pt",
-         padding=True,
-         truncation=True,
-         max_length=8192,  # adjust as needed
-     )
-
-     # Forward pass - no .to() calls needed
-     with torch.no_grad():
-         outputs = model(**inputs)
-
-     # Return raw logits
-     return {"logits": outputs.logits.cpu().tolist()}
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser(description="Sharded RM FastAPI server")
-     parser.add_argument(
-         "--model-path",
-         type=str,
-         default='nvidia/Qwen-3-Nemotron-32B-Reward',
-         help="HuggingFace model ID or local checkpoint directory",
-     )
-     parser.add_argument(
-         "--host", type=str, default="0.0.0.0", help="host for the FastAPI server"
-     )
-     parser.add_argument(
-         "--port", type=int, default=9000, help="port for the FastAPI server"
-     )
-     args = parser.parse_args()
-
-     load_model(args.model_path)
-     uvicorn.run(app, host=args.host, port=args.port)
- ```
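As a usage sketch (assuming the script above is saved as `server.py`), the server can be started with `python server.py --model-path nvidia/Qwen-3-Nemotron-32B-Reward --port 9000`. The `/generate` endpoint then accepts a JSON body of the form `{"prompts": [...]}` and returns the raw reward logits.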

- Inference code example to get the reward scores:

- Please note the ` /nothink` at the end of the user query and the empty thinking trace `<think>\n\n</think>\n\n` at the beginning of the model response; this can also be implemented on the server side instead (a minimal sketch of that option follows the client example below).

- ```python
- # client.py
- import requests
- import argparse
- from transformers import AutoTokenizer
- # SERVER_URL = "http://localhost:9000/generate"
-
- def send_prompts(host: str, port: int, prompts: list[str]) -> dict:
-     resp = requests.post(f"http://{host}:{port}/generate", json={"prompts": prompts})
-     resp.raise_for_status()
-     return resp.json()
-
- if __name__ == "__main__":
-     # Example usage
-
-     parser = argparse.ArgumentParser(description="Qwen-3-RM client")
-     parser.add_argument(
-         "--model-path",
-         type=str,
-         default='nvidia/Qwen-3-Nemotron-32B-Reward',
-         help="HuggingFace model ID or local checkpoint directory",
-     )
-     parser.add_argument(
-         "--host", type=str, default="0.0.0.0", help="host for the FastAPI server"
-     )
-     parser.add_argument(
-         "--port", type=int, default=9000, help="port for the FastAPI server"
-     )
-     args = parser.parse_args()
-
-     tokenizer = AutoTokenizer.from_pretrained(args.model_path)
-
-     messages1 = [
-         {"role": "user", "content": 'Tell me something about large language models. /nothink'},
-         {"role": "assistant", "content": "<think>\n\n</think>\n\nI'm sorry, I can't answer that question."}
-     ]
-
-     messages2 = [
-         {"role": "user", "content": 'Tell me something about large language models. /nothink'},
-         {"role": "assistant", "content": "<think>\n\n</think>\n\nlarge language models are a type of machine learning model that are used to generate text."}
-     ]
-
-     messages1_chat = tokenizer.apply_chat_template(messages1, tokenize=False, add_generation_prompt=False)
-     messages2_chat = tokenizer.apply_chat_template(messages2, tokenize=False, add_generation_prompt=False)
-
-     prompts = [
-         messages1_chat,
-         messages2_chat,
-     ]
-     out = send_prompts(args.host, args.port, prompts)
-     print(f"Received rm_scores for {len(prompts)} prompts:")
-     for i, seq_logits in enumerate(out["logits"]):
-         print(f" Prompt {i}: {seq_logits[0]}")
-
-     # outputs:
-     # Prompt 0: -8.796875
-     # Prompt 1: 5.8359375
  ```
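As a minimal sketch of the server-side option mentioned above (illustrative only; the helper name and its placement are assumptions, not part of the original README), the ` /nothink` suffix and the empty thinking trace could be added where the server builds its inputs, so that clients send plain (query, response) pairs:

```python
# Hypothetical helper for server.py: wrap a raw (query, response) pair
# in the format the reward model expects.
def build_rm_messages(user_query: str, model_response: str) -> list[dict]:
    return [
        {"role": "user", "content": user_query + " /nothink"},
        {"role": "assistant", "content": "<think>\n\n</think>\n\n" + model_response},
    ]

# The server would then apply the chat template itself before tokenizing, e.g.:
# chat = tokenizer.apply_chat_template(build_rm_messages(query, response), tokenize=False, add_generation_prompt=False)
```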
 
  The model was trained with the "nothink" instruction in order not to lose Qwen-3's reasoning ability.
  When prompting the model, please append ` /nothink` to the user query and an empty thinking trace `<think>\n\n</think>\n\n` to the model response.

+ Please note the ` /nothink` at the end of the user query and the empty thinking trace `<think>\n\n</think>\n\n` at the beginning of the model response; this can also be implemented on the server side instead.

  ```python
  import torch
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer

+ model_name = "nvidia/Qwen-3-Nemotron-32B-Reward"

+ model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
+ tokenizer = AutoTokenizer.from_pretrained(model_name)

+ prompt = "What is 1+1?"
+ good_response = "1+1=2"
+ bad_response = "1+1=3"
+
+ for response in [good_response, bad_response]:
+     messages = [{'role': "user", "content": prompt + " /nothink"}, {'role': "assistant", "content": "<think>\n\n</think>\n\n" + response}]
+     tokenized_message = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=False, return_tensors="pt", return_dict=True)
+     reward = model(tokenized_message['input_ids'].cuda(), attention_mask=tokenized_message['attention_mask'].cuda()).logits[0][0].item()
+     print(reward)
+
+ # Example quality - note that higher scores mean higher quality, and scores can be negative.
+
+ # reward for good_response = 8.0234375
+ # reward for bad_response = -7.9765625
  ```
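The reward is a single scalar logit per (prompt, response) pair, and higher is better. As an illustrative post-processing sketch (an assumption, not something the model card prescribes), two rewards for the same prompt can be mapped to a Bradley-Terry style preference probability via a sigmoid of their difference:

```python
import math

def preference_probability(reward_a: float, reward_b: float) -> float:
    # P(response A is preferred over response B) = sigmoid(reward_A - reward_B)
    return 1.0 / (1.0 + math.exp(-(reward_a - reward_b)))

# Using the example rewards above:
print(preference_probability(8.0234375, -7.9765625))  # ~1.0, i.e. "1+1=2" is strongly preferred over "1+1=3"
```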

  ## Training Datasets: