Update README.md

README.md CHANGED

@@ -104,151 +104,31 @@ This code has been tested on Transformers v4.51.2, torch 2.6.0+cu124 and 2 NVIDI
The model was trained with the "nothink" instruction in order not to lose Qwen-3's reasoning ability.
When prompting the model, please append ` /nothink` to the user query and an empty thinking trace `<think>\n\n</think>\n\n` to the model response.

```python
-# server.py

-import argparse
-import torch
-from fastapi import FastAPI
-from pydantic import BaseModel
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-import uvicorn

-app = FastAPI()

-class GenerateRequest(BaseModel):
-    prompts: list[str]

-# Globals to hold our model/tokenizer once loaded
-tokenizer: AutoTokenizer
-model: AutoModelForSequenceClassification

-def load_model(model_path: str):
-    global tokenizer, model

-    print(f"Loading model from {model_path}…")
-    # load tokenizer as usual
-    tokenizer = AutoTokenizer.from_pretrained(model_path)

-    # load the model in FP16 to save memory; device_map="auto" shards it, so no .to() calls are needed
-    model = AutoModelForSequenceClassification.from_pretrained(
-        model_path,
-        pad_token_id=tokenizer.pad_token_id,
-        torch_dtype=torch.float16,
-        device_map="auto",
-    )

-    # optional: disable dropout everywhere
-    disable_dropout_in_model(model)

-    print("Model loaded and dispatched across GPUs.")

-def disable_dropout_in_model(module: torch.nn.Module) -> None:
-    for m in module.modules():
-        if isinstance(m, torch.nn.Dropout):
-            m.p = 0

-@app.post("/generate")
-async def generate(req: GenerateRequest):
-    inputs = tokenizer(
-        req.prompts,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=8192,  # adjust as needed
-    )

-    # Forward pass; no .to() calls needed
-    with torch.no_grad():
-        outputs = model(**inputs)

-    # Return raw logits
-    return {"logits": outputs.logits.cpu().tolist()}

-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Sharded RM FastAPI server")
-    parser.add_argument(
-        "--model-path",
-        type=str,
-        required=True,
-        default='nvidia/Qwen-3-Nemotron-32B-Reward',
-        help="HuggingFace model ID or local checkpoint directory",
-    )
-    parser.add_argument(
-        "--host", type=str, default="0.0.0.0", help="host for the FastAPI server"
-    )
-    parser.add_argument(
-        "--port", type=int, default=9000, help="port for the FastAPI server"
-    )
-    args = parser.parse_args()

-    load_model(args.model_path)
-    uvicorn.run(app, host=args.host, port=args.port)
```

```python
-parser = argparse.ArgumentParser(description="Qwen-3-RM FastAPI server")
-parser.add_argument(
-    "--model-path",
-    type=str,
-    default='nvidia/Qwen-3-Nemotron-32B-Reward',
-    help="HuggingFace model ID or local checkpoint directory",
-)
-parser.add_argument(
-    "--host", type=str, default="0.0.0.0", help="host for the FastAPI server"
-)
-parser.add_argument(
-    "--port", type=int, default=9000, help="port for the FastAPI server"
-)
-args = parser.parse_args()

-tokenizer = AutoTokenizer.from_pretrained(args.model_path)

-messages1 = [
-    {"role": "user", "content": 'Tell me something about large language models. /nothink'},
-    {"role": "assistant", "content": "<think>\n\n</think>\n\nI'm sorry, I can't answer that question."}
-]

-messages2 = [
-    {"role": "user", "content": 'Tell me something about large language models. /nothink'},
-    {"role": "assistant", "content": "<think>\n\n</think>\n\nlarge language models are a type of machine learning model that are used to generate text."}
-]

-messages1_chat = tokenizer.apply_chat_template(messages1, tokenize=False, add_generation_prompt=False)
-messages2_chat = tokenizer.apply_chat_template(messages2, tokenize=False, add_generation_prompt=False)

-prompts = [
-    messages1_chat,
-    messages2_chat,
-]
-out = send_prompts(args.host, args.port, prompts)
-print(f"Received rm_scores for {len(prompts)} prompts:")
-for i, seq_logits in enumerate(out["logits"]):
-    print(f" Prompt {i}: {seq_logits[0]}")

-# outputs:
-# Prompt 0: -8.796875
-# Prompt 1: 5.8359375
```
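
The removed client snippet above calls a `send_prompts` helper whose definition is not visible in this hunk. A minimal sketch of what such a helper might look like, assuming it simply POSTs the prompts to the server's `/generate` endpoint defined above and returns the JSON response (the helper body and the use of `requests` are assumptions, not part of the original README):

```python
import requests

# Hypothetical helper (not shown in the original diff): POST a batch of prompts to the
# FastAPI server above and return its JSON payload, e.g. {"logits": [[score], ...]}.
def send_prompts(host: str, port: int, prompts: list[str]) -> dict:
    # /generate expects a JSON body matching GenerateRequest: {"prompts": [...]}
    response = requests.post(f"http://{host}:{port}/generate", json={"prompts": prompts})
    response.raise_for_status()
    return response.json()
```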

+Please note the ` /nothink` at the end of the user query and the empty thinking trace `<think>\n\n</think>\n\n` at the beginning of the model response; this can also be implemented on the server side instead.

```python
import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer

+model_name = "nvidia/Qwen-3-Nemotron-32B-Reward"

+model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_name)

+prompt = "What is 1+1?"
+good_response = "1+1=2"
+bad_response = "1+1=3"

+for response in [good_response, bad_response]:
+    messages = [{'role': "user", "content": prompt + " /nothink"}, {'role': "assistant", "content": "<think>\n\n</think>\n\n" + response}]
+    tokenized_message = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=False, return_tensors="pt", return_dict=True)
+    reward = model(tokenized_message['input_ids'].cuda(), attention_mask=tokenized_message['attention_mask'].cuda()).logits[0][0].item()
+    print(reward)

+# Example rewards; note that higher scores mean higher quality, and scores can be negative.

+# reward for good_response = 8.0234375
+# reward for bad_response = -7.9765625
```
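
As the added note above says, the ` /nothink` suffix and the empty thinking trace can also be applied on the server side rather than by every client. A minimal sketch of one way to do that, assuming a small formatting helper in front of tokenization (the function `format_for_reward` is an illustration, not something prescribed by the README):

```python
# Hypothetical server-side helper: apply the required reward-model formatting to a raw
# (prompt, response) pair so that clients do not have to do it themselves.
def format_for_reward(prompt: str, response: str) -> list[dict]:
    return [
        # append the /nothink instruction to the user turn
        {"role": "user", "content": prompt + " /nothink"},
        # prepend an empty thinking trace to the assistant turn
        {"role": "assistant", "content": "<think>\n\n</think>\n\n" + response},
    ]

# Usage: build the messages, then tokenize with apply_chat_template exactly as in the example above.
messages = format_for_reward("What is 1+1?", "1+1=2")
```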

## Training Datasets: