"""
Inference script for UnixCoder-MIL
=====================================
Usage: Simply run this script with your code samples
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification
from safetensors.torch import load_file
import numpy as np

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
CLASS_NAMES = ["Human", "AI-Generated", "Hybrid", "Adversarial"]


class MilUnixCoder(nn.Module):
    """Multiple-instance classifier built on a UnixCoder encoder.

    Inputs longer than ``chunk_size`` tokens are cut into overlapping windows
    of ``chunk_size`` tokens taken every ``stride`` tokens (at most
    ``max_chunks`` windows per sample). Each window is encoded independently,
    classified from its first-token hidden state, and the per-window logits
    are max-pooled into one logit vector per sample.
    """

    def __init__(self, model_name="microsoft/unixcoder-base", chunk_size=512, stride=256, max_chunks=16):
        """Build the encoder and the linear classification head.

        Args:
            model_name: Hugging Face model id (or local path) of the encoder.
            chunk_size: Number of tokens in each window.
            stride: Token offset between the starts of consecutive windows.
            max_chunks: Upper bound on the number of windows kept per sample.
        """
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.unixcoder = AutoModel.from_pretrained(model_name)
        self.chunk_size, self.stride, self.max_chunks = chunk_size, stride, max_chunks
        # One output logit per entry of CLASS_NAMES.
        self.classifier = nn.Linear(self.config.hidden_size, len(CLASS_NAMES))
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask=None):
        """Return max-pooled class logits for a batch of token sequences.

        Args:
            input_ids: Integer tensor of shape ``(batch, seq_len)``.
            attention_mask: Optional tensor of the same shape; non-zero marks
                real tokens. Defaults to all ones.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding, for every class,
            the maximum logit over the sample's windows.
        """
        batch_size, seq_len = input_ids.size()
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        if seq_len > self.chunk_size:
            # ``unfold`` only yields complete windows, so any tokens past the
            # last full window would be silently ignored. Right-pad to the
            # next stride boundary so the tail gets a window of its own.
            remainder = (seq_len - self.chunk_size) % self.stride
            if remainder:
                pad_len = self.stride - remainder
                pad_id = getattr(self.config, "pad_token_id", None)
                input_ids = F.pad(input_ids, (0, pad_len), value=0 if pad_id is None else pad_id)
                attention_mask = F.pad(attention_mask, (0, pad_len), value=0)

            chunk_ids = input_ids.unfold(1, self.chunk_size, self.stride)
            chunk_mask = attention_mask.unfold(1, self.chunk_size, self.stride)
            num_chunks = min(chunk_ids.size(1), self.max_chunks)
            flat_ids = chunk_ids[:, :num_chunks, :].contiguous().view(-1, self.chunk_size)
            flat_mask = chunk_mask[:, :num_chunks, :].contiguous().view(-1, self.chunk_size)
        else:
            num_chunks, flat_ids, flat_mask = 1, input_ids, attention_mask

        encoded = self.unixcoder(input_ids=flat_ids, attention_mask=flat_mask)
        first_token_state = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token_state))
        chunk_logits = logits.view(batch_size, num_chunks, -1)

        # Windows that contain only padding carry no evidence; exclude them
        # from the max-pool. The first window is always kept so that no row
        # ends up entirely -inf (which would turn the softmax into NaN).
        has_tokens = flat_mask.reshape(batch_size, num_chunks, -1).sum(dim=-1) > 0
        has_tokens[:, 0] = True
        chunk_logits = chunk_logits.masked_fill(~has_tokens.unsqueeze(-1), float("-inf"))

        return torch.max(chunk_logits, dim=1)[0]


def load_model():
    """Load the model and tokenizer.

    Loads the tokenizer from the ``YoungDSMLKZ/UnixCoder-MIL`` repository,
    builds a ``MilUnixCoder`` on top of ``microsoft/unixcoder-base``, loads the
    fine-tuned weights from the repository's ``model.safetensors`` file, and
    moves the model to ``DEVICE`` in eval mode.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    from huggingface_hub import hf_hub_download

    repo = "YoungDSMLKZ/UnixCoder-MIL"
    tokenizer = AutoTokenizer.from_pretrained(repo)
    model = MilUnixCoder("microsoft/unixcoder-base")
    weights_path = hf_hub_download(repo_id=repo, filename="model.safetensors")
    model.load_state_dict(load_file(weights_path))
    model.to(DEVICE).eval()
    return model, tokenizer


def predict(code: str, model, tokenizer) -> dict:
    """Predict class for a single code sample.

    Args:
        code: Source code to classify.
        model: Model returned by ``load_model``.
        tokenizer: Tokenizer returned by ``load_model``.

    Returns:
        A dict with ``"class"`` (an entry of ``CLASS_NAMES``) and
        ``"confidence"`` (softmax probability of that class, as a float).
    """
    # Follow the model's own device instead of the module-level DEVICE so a
    # model that was moved elsewhere (e.g. kept on CPU) still works.
    device = next(model.parameters()).device
    inputs = tokenizer(
        code,
        return_tensors="pt",
        truncation=True,
        max_length=4096,
        padding=True,
    ).to(device)

    with torch.no_grad():
        logits = model(inputs["input_ids"], inputs["attention_mask"])
        probs = F.softmax(logits, dim=-1)[0]

    pred = torch.argmax(probs).item()
    return {"class": CLASS_NAMES[pred], "confidence": probs[pred].item()}


def main():
    """Load the model and classify a small example snippet."""
    print("Loading model...")
    model, tokenizer = load_model()

    # Example usage
    test_code = """ def hello_world(): print("Hello, World!") """
    result = predict(test_code, model, tokenizer)
    print(f"Predicted: {result['class']} (confidence: {result['confidence']:.2%})")


if __name__ == "__main__":
    main()