| | """ |
| | Inference script for UnixCoder-MIL |
| | ===================================== |
| | Usage: Simply run this script with your code samples |
| | """ |
| |
|
| | import torch |
| | import torch.nn as nn |
| | import torch.nn.functional as F |
| | from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification |
| | from safetensors.torch import load_file |
| | import numpy as np |
| |
|
# Run inference on GPU when available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Label names, index-aligned with the classifier's 4 output logits.
CLASS_NAMES = ["Human", "AI-Generated", "Hybrid", "Adversarial"]
| |
|
class MilUnixCoder(nn.Module):
    """Multiple-instance-learning (MIL) classifier built on UnixCoder.

    Inputs longer than ``chunk_size`` are split into overlapping windows
    (``chunk_size`` tokens each, advancing by ``stride``). Every window is
    classified from its first-token ("CLS") representation, and the
    per-window logits are max-pooled so the most confident window decides
    each class score.

    NOTE(review): tokens past the last full window are silently dropped by
    ``unfold``, as are windows beyond ``max_chunks`` — presumably this
    mirrors how the checkpoint was trained; confirm against the training
    script.
    """

    def __init__(self, model_name="microsoft/unixcoder-base", chunk_size=512, stride=256, max_chunks=16):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.unixcoder = AutoModel.from_pretrained(model_name)
        self.chunk_size = chunk_size
        self.stride = stride
        self.max_chunks = max_chunks
        # 4-way head; index order must match the module-level CLASS_NAMES.
        self.classifier = nn.Linear(self.config.hidden_size, 4)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask=None):
        """Return (batch, 4) logits, max-pooled over chunk windows."""
        batch_size, seq_len = input_ids.size()
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        if seq_len > self.chunk_size:
            # (batch, n_windows, chunk_size) overlapping views of the input.
            id_windows = input_ids.unfold(1, self.chunk_size, self.stride)
            mask_windows = attention_mask.unfold(1, self.chunk_size, self.stride)
            n_chunks = min(id_windows.size(1), self.max_chunks)
            # Fold the kept windows into the batch dim for one encoder pass.
            flat_ids = id_windows[:, :n_chunks, :].contiguous().view(-1, self.chunk_size)
            flat_mask = mask_windows[:, :n_chunks, :].contiguous().view(-1, self.chunk_size)
        else:
            n_chunks = 1
            flat_ids, flat_mask = input_ids, attention_mask

        encoded = self.unixcoder(input_ids=flat_ids, attention_mask=flat_mask)
        cls_states = encoded.last_hidden_state[:, 0, :]
        chunk_logits = self.classifier(self.dropout(cls_states))
        # MIL pooling: per class, keep the strongest chunk's logit.
        return chunk_logits.view(batch_size, n_chunks, -1).max(dim=1).values
| |
|
def load_model():
    """Fetch the tokenizer and fine-tuned weights from the Hub.

    Builds the MIL wrapper around the base UnixCoder encoder, then replaces
    its weights with the published fine-tuned checkpoint.

    Returns:
        (model, tokenizer) — the model is on DEVICE and in eval mode.
    """
    from huggingface_hub import hf_hub_download

    repo = "YoungDSMLKZ/UnixCoder-MIL"
    tokenizer = AutoTokenizer.from_pretrained(repo)
    model = MilUnixCoder("microsoft/unixcoder-base")
    checkpoint_path = hf_hub_download(repo_id=repo, filename="model.safetensors")
    state_dict = load_file(checkpoint_path)
    model.load_state_dict(state_dict)
    model.to(DEVICE)
    model.eval()
    return model, tokenizer
| |
|
def predict(code: str, model, tokenizer) -> dict:
    """Classify a single code sample.

    Args:
        code: source code text to classify.
        model: a loaded MilUnixCoder (on DEVICE, in eval mode).
        tokenizer: the matching HuggingFace tokenizer.

    Returns:
        dict with the predicted class name and its softmax confidence.
    """
    encoded = tokenizer(
        code, return_tensors="pt", truncation=True, max_length=4096, padding=True
    ).to(DEVICE)
    with torch.no_grad():
        logits = model(encoded["input_ids"], encoded["attention_mask"])
    # Single sample in the batch → take row 0 of the softmax.
    probabilities = F.softmax(logits, dim=-1)[0]
    best = int(probabilities.argmax())
    return {
        "class": CLASS_NAMES[best],
        "confidence": probabilities[best].item(),
    }
| |
|
def main() -> None:
    """Load the model once and classify a demo snippet."""
    print("Loading model...")
    model, tokenizer = load_model()

    # Swap in your own code sample here.
    test_code = """
def hello_world():
    print("Hello, World!")
"""

    result = predict(test_code, model, tokenizer)
    print(f"Predicted: {result['class']} (confidence: {result['confidence']:.2%})")


if __name__ == "__main__":
    main()
| |
|