Spaces:
Sleeping
Sleeping
feat (start): initial setup
Browse files- .gitignore +2 -0
- Dockerfile +13 -0
- app/clinical_ner.py +106 -0
- app/server_clinical_ner.py +114 -0
- app/test_clinical_ner.py +20 -0
- requirements.txt +13 -0
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
.venv
|
Dockerfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Base image pinned to Python 3.9, matching the pinned requirements.txt versions.
FROM python:3.9

# Run as an unprivileged user (uid 1000) — the Hugging Face Spaces convention.
RUN useradd -m -u 1000 user
USER user
# Make user-level pip installs (~/.local/bin) resolvable, e.g. the uvicorn binary.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy and install dependencies before the app code so this expensive layer
# is cached across source-only changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# App code lands directly in /app (server_clinical_ner.py at the import root).
COPY --chown=user ./app /app
# Port 7860 is the default port Hugging Face Spaces exposes.
CMD ["uvicorn", "server_clinical_ner:app", "--host", "0.0.0.0", "--port", "7860"]
|
app/clinical_ner.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import pipeline
|
| 2 |
+
|
| 3 |
+
class ClinicalNER:
    """
    Named Entity Recognition over clinical text using the
    samrawal/bert-base-uncased_clinical-ner model.
    """

    def __init__(self):
        """
        Initialize the HF token-classification pipeline.

        aggregation_strategy="simple" already groups most subword tokens;
        _merge_subwords() remains as a safety net for any '##' tokens the
        strategy leaves unmerged.
        """
        self.ner_pipeline = pipeline(
            "ner",
            model="samrawal/bert-base-uncased_clinical-ner",
            aggregation_strategy="simple"
        )

    def _merge_subwords(self, entities):
        """
        Merge subword tokens (those starting with ##) into complete words.

        Args:
            entities (list): Entity dicts from the pipeline, each with keys
                entity_group, score, word, start, end.

        Returns:
            list: Merged entities with complete words. Input dicts are
            copied, not mutated.
        """
        if not entities:
            return []

        merged = []
        i = 0

        while i < len(entities):
            current = entities[i].copy()
            word = current['word']
            end = current['end']

            # Absorb consecutive '##' continuations of the same entity type.
            j = i + 1
            while j < len(entities):
                next_entity = entities[j]
                if (next_entity['word'].startswith('##') and
                        next_entity['entity_group'] == current['entity_group']):
                    word += next_entity['word'][2:]  # drop the '##' prefix
                    end = next_entity['end']
                    j += 1
                else:
                    break

            current['word'] = word
            current['end'] = end
            merged.append(current)

            # Resume scanning after the last token we absorbed.
            i = j

        return merged

    def basic_ner(self, text):
        """
        Perform NER on the input text, merging leftover subword tokens.

        Args:
            text (str): Input text to analyze.

        Returns:
            list: Dicts with keys entity_group, score, word, start, end.
        """
        return self._merge_subwords(self.ner_pipeline(text))

    def prolog_ner(self, text):
        """
        Perform NER and return results as Tau Prolog compatible facts.

        Args:
            text (str): Input text to analyze.

        Returns:
            str: One fact per line, in the form
                 entity(Id, Type, Word, Start, End, Score).
        """
        # Reuse basic_ner instead of duplicating the pipeline + merge steps.
        merged_entities = self.basic_ner(text)

        prolog_facts = []
        for i, entity in enumerate(merged_entities):
            # Escape backslashes before single quotes so the word is always a
            # valid quoted Prolog atom.
            word = entity['word'].replace("\\", "\\\\").replace("'", "\\'")

            # Format: entity(Id, Type, Word, Start, End, Score)
            fact = (
                f"entity({i}, '{entity['entity_group']}', "
                f"'{word}', {entity['start']}, "
                f"{entity['end']}, {entity['score']:.4f})."
            )
            prolog_facts.append(fact)

        return "\n".join(prolog_facts)
|
app/server_clinical_ner.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
from clinical_ner import ClinicalNER
|
| 4 |
+
import uvicorn
|
| 5 |
+
|
| 6 |
+
# Initialize FastAPI app
app = FastAPI(
    title="Clinical NER API",
    # Fixed: the served model is bert-base-uncased_clinical-ner, not Bio_ClinicalBERT.
    description="Named Entity Recognition API using bert-base-uncased_clinical-ner",
    version="1.0.0"
)

# Model handle; populated exactly once at startup (singleton pattern).
ner_model = None

@app.on_event("startup")
async def startup_event():
    """Load the NER model once when the server starts.

    NOTE(review): @app.on_event is deprecated in newer FastAPI in favor of
    lifespan handlers; acceptable with the pinned fastapi==0.109.0.
    """
    global ner_model
    ner_model = ClinicalNER()
    print("NER model loaded successfully!")
|
| 22 |
+
|
| 23 |
+
# Request model
class TextRequest(BaseModel):
    # Clinical free text to run NER over.
    text: str

    class Config:
        # Example shown in the interactive /docs UI.
        json_schema_extra = {
            "example": {
                "text": "Patient presents with hypertension and diabetes. Prescribed metformin 500mg."
            }
        }
|
| 33 |
+
|
| 34 |
+
# Response models
class Entity(BaseModel):
    # Entity label assigned by the model.
    entity_group: str
    # Model confidence score.
    score: float
    # Surface form (subwords merged by ClinicalNER).
    word: str
    # Character offsets into the input text.
    start: int
    end: int

class BasicNERResponse(BaseModel):
    entities: list[Entity]
    # Number of entities found.
    count: int

class PrologNERResponse(BaseModel):
    # Facts, one per line: entity(Id, Type, Word, Start, End, Score).
    prolog_facts: str
    # Number of facts (lines).
    count: int
|
| 49 |
+
|
| 50 |
+
@app.get("/")
async def root():
    """Describe the API and list its available endpoints."""
    endpoint_index = {
        "/ner/basic": "POST - Get basic NER annotations",
        "/ner/prolog": "POST - Get Prolog facts",
        "/docs": "GET - Interactive API documentation",
    }
    return {"message": "Clinical NER API", "endpoints": endpoint_index}
|
| 61 |
+
|
| 62 |
+
@app.post("/ner/basic", response_model=BasicNERResponse)
async def ner_basic(request: TextRequest):
    """
    Perform basic NER on the input text.

    Returns a list of detected entities with their types, positions, and
    confidence scores.

    Raises:
        HTTPException: 400 if the text is empty/whitespace,
                       500 if the model fails.
    """
    # Validate OUTSIDE the try: previously the 400 raised inside it was
    # caught by `except Exception` and re-raised as a 500.
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    try:
        entities = ner_model.basic_ner(request.text)
    except Exception as e:
        # Surface model/pipeline failures as 500, preserving the cause chain.
        raise HTTPException(status_code=500, detail=f"Error processing text: {str(e)}") from e

    return {
        "entities": entities,
        "count": len(entities)
    }
|
| 81 |
+
|
| 82 |
+
@app.post("/ner/prolog", response_model=PrologNERResponse)
async def ner_prolog(request: TextRequest):
    """
    Perform NER and return results as Prolog facts.

    Returns Prolog facts in the format: entity(Id, Type, Word, Start, End, Score).

    Raises:
        HTTPException: 400 if the text is empty/whitespace,
                       500 if the model fails.
    """
    # Validate OUTSIDE the try: previously the 400 raised inside it was
    # caught by `except Exception` and re-raised as a 500.
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    try:
        prolog_facts = ner_model.prolog_ner(request.text)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing text: {str(e)}") from e

    # Count the number of facts (lines); empty output means zero facts.
    count = len(prolog_facts.split('\n')) if prolog_facts else 0

    return {
        "prolog_facts": prolog_facts,
        "count": count
    }
|
| 104 |
+
|
| 105 |
+
@app.get("/health")
async def health_check():
    """Liveness probe; also reports whether the NER model has loaded."""
    model_ready = ner_model is not None
    return {"status": "healthy", "model_loaded": model_ready}
|
| 112 |
+
|
| 113 |
+
# Local dev entry point; the Docker image runs uvicorn on port 7860 instead.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
app/test_clinical_ner.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Smoke test: run ClinicalNER on one sample sentence and print both formats."""
from clinical_ner import ClinicalNER

# Build the recognizer (loads the HF model).
recognizer = ClinicalNER()

sample = "Patient presents with hypertension and diabetes. Prescribed metformin 500mg."

# Plain annotation output.
print("Basic NER:")
for ent in recognizer.basic_ner(sample):
    print(f"  {ent['word']} -> {ent['entity_group']} (score: {ent['score']:.4f})")

print("\n" + "="*60 + "\n")

# Tau-Prolog-compatible fact output.
print("Prolog NER:")
print(recognizer.prolog_ner(sample))
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pinned dependencies for the Clinical NER service (Python 3.9 base image).
# FastAPI and server
fastapi==0.109.0
uvicorn[standard]==0.27.0
pydantic==2.5.3

# Transformers and ML
transformers==4.36.2
torch==2.1.2
tokenizers==0.15.0

# Additional dependencies
numpy==1.26.3
huggingface-hub==0.20.2
|