# nlp_pipeline.py
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Load lighter/CPU-friendly models for HF Space (device=-1 forces CPU)
SUMMARIZER = pipeline(
    "summarization", model="sshleifer/distilbart-cnn-12-6", device=-1
)

# NER model (token-classification); aggregation_strategy="simple" merges
# sub-word tokens into whole-word entity spans
NER = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
    device=-1,
)

EMBED_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # small & fast


def summarize(text, max_length=120):
    # Short documents fit in one pass; longer ones are chunked below.
    if len(text) < 800:
        s = SUMMARIZER(text, max_length=max_length, min_length=40, do_sample=False)
        return s[0]["summary_text"]
    # Naive chunking: character count is only a rough proxy for the model's
    # token limit, and chunks may split mid-sentence.
    parts = []
    chunk_size = 700
    for i in range(0, len(text), chunk_size):
        chunk = text[i:i + chunk_size]
        parts.append(
            SUMMARIZER(chunk, max_length=60, min_length=20, do_sample=False)[0]["summary_text"]
        )
    return " ".join(parts)


def extract_entities(text):
    # With aggregation_strategy="simple" each entry looks like
    # {'entity_group', 'score', 'word', 'start', 'end'}; fall back to
    # 'entity' in case a non-aggregated pipeline is swapped in.
    grouped = {}
    for ent in NER(text):
        key = ent.get("entity_group") or ent.get("entity")
        grouped.setdefault(key, []).append(
            {"text": ent["word"], "score": float(ent["score"])}
        )
    return grouped


def embed_text(text):
    return EMBED_MODEL.encode(text, convert_to_numpy=True, normalize_embeddings=True)


def get_sentence_provenance(sentences, entities):
    # Map each entity string to the sentences that contain it (case-insensitive).
    prov = {}
    for t in entities:
        prov[t] = [s for s in sentences if t.lower() in s.lower()]
    return prov


def process_document(doc):
    text = doc["text"]
    summary = summarize(text)
    entities_grouped = extract_entities(text)
    # Flatten entity strings (unique) across all entity groups.
    entity_texts = set()
    for items in entities_grouped.values():
        for item in items:
            entity_texts.add(item["text"])
    provenance = get_sentence_provenance(doc["sentences"], entity_texts)
    embedding = embed_text(summary)  # index the summary embedding for compactness
    tags = []  # optional: simple tags from the most frequent NER labels
    return {
        "summary": summary,
        "entities": entities_grouped,
        "entity_texts": list(entity_texts),
        "provenance": provenance,
        "embedding": embedding,
        "tags": tags,
    }
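

# Minimal usage sketch (assumption: callers pass a dict carrying the raw
# "text" plus pre-split "sentences"; sentence splitting happens upstream,
# and the hand-written list below exists only for this demo). Real inputs
# would normally be much longer than this toy document.
if __name__ == "__main__":
    sample_text = (
        "Apple was founded by Steve Jobs in Cupertino. "
        "The company later released the iPhone in 2007."
    )
    doc = {
        "text": sample_text,
        "sentences": [
            "Apple was founded by Steve Jobs in Cupertino.",
            "The company later released the iPhone in 2007.",
        ],
    }
    result = process_document(doc)
    print(result["summary"])
    print(result["entity_texts"])     # e.g. names/places picked up by NER
    print(result["provenance"])       # entity -> sentences containing it
    print(result["embedding"].shape)  # all-MiniLM-L6-v2 embeds to (384,)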