santanche committed
Commit 6f1db1f · 1 Parent(s): 4aa068b

feat (start): initial setup

.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__
+ .venv
Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.9
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user ./app /app
+ CMD ["uvicorn", "server_clinical_ner:app", "--host", "0.0.0.0", "--port", "7860"]
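
The image serves the API on port 7860 via uvicorn. As a quick sanity check of a running container, a sketch is shown below; it assumes the image was built and started with "docker build -t clinical-ner ." and "docker run -p 7860:7860 clinical-ner" (the clinical-ner tag is invented here) and that the requests package is available on the host.

# Hypothetical smoke test; assumes the container is running as described
# above and exposes the /health endpoint from app/server_clinical_ner.py.
import requests

resp = requests.get("http://localhost:7860/health", timeout=5)
resp.raise_for_status()
print(resp.json())  # e.g. {"status": "healthy", "model_loaded": True}
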
app/clinical_ner.py ADDED
@@ -0,0 +1,106 @@
+ from transformers import pipeline
+
+ class ClinicalNER:
+     """
+     A class for Named Entity Recognition using the bert-base-uncased_clinical-ner model.
+     """
+
+     def __init__(self):
+         """
+         Initialize the NER pipeline with the bert-base-uncased_clinical-ner model.
+         Note: Using aggregation_strategy="simple" to merge subword tokens.
+         """
+         self.ner_pipeline = pipeline(
+             "ner",
+             model="samrawal/bert-base-uncased_clinical-ner",
+             aggregation_strategy="simple"
+         )
+
+     def _merge_subwords(self, entities):
+         """
+         Merge subword tokens (those starting with ##) into complete words.
+
+         Args:
+             entities (list): List of entity dictionaries from the pipeline
+
+         Returns:
+             list: Merged entities with complete words
+         """
+         if not entities:
+             return []
+
+         merged = []
+         i = 0
+
+         while i < len(entities):
+             current = entities[i].copy()
+             word = current['word']
+             end = current['end']
+
+             # Look ahead for subword tokens (starting with ##)
+             j = i + 1
+             while j < len(entities):
+                 next_entity = entities[j]
+
+                 # Check if it's a subword of the same entity type
+                 if (next_entity['word'].startswith('##') and
+                         next_entity['entity_group'] == current['entity_group']):
+                     # Remove the ## prefix and append
+                     word += next_entity['word'][2:]
+                     end = next_entity['end']
+                     j += 1
+                 else:
+                     break
+
+             # Update the merged entity
+             current['word'] = word
+             current['end'] = end
+             merged.append(current)
+
+             # Skip the merged tokens
+             i = j
+
+         return merged
+
+     def basic_ner(self, text):
+         """
+         Performs NER on the input text and returns annotations with merged subwords.
+
+         Args:
+             text (str): Input text to analyze
+
+         Returns:
+             list: List of dictionaries containing entity annotations.
+                 Each dict has: entity_group, score, word, start, end
+         """
+         entities = self.ner_pipeline(text)
+         return self._merge_subwords(entities)
+
+     def prolog_ner(self, text):
+         """
+         Performs NER and returns the results as Prolog facts compatible with Tau Prolog.
+         Subword tokens are automatically merged.
+
+         Args:
+             text (str): Input text to analyze
+
+         Returns:
+             str: Prolog facts as a string, one per line
+         """
+         entities = self.ner_pipeline(text)
+         merged_entities = self._merge_subwords(entities)
+
+         prolog_facts = []
+         for i, entity in enumerate(merged_entities):
+             # Escape single quotes in words for Prolog
+             word = entity['word'].replace("'", "\\'")
+
+             # Format: entity(Id, Type, Word, Start, End, Score)
+             fact = (
+                 f"entity({i}, '{entity['entity_group']}', "
+                 f"'{word}', {entity['start']}, "
+                 f"{entity['end']}, {entity['score']:.4f})."
+             )
+             prolog_facts.append(fact)
+
+         return "\n".join(prolog_facts)
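
The subword-merging pass in _merge_subwords can be exercised without downloading the model. A minimal standalone sketch, assuming entity dicts that mirror the pipeline's output schema (the sample values below are invented for illustration):

# Standalone sketch of the merging logic above; the entity dicts are
# invented, mirroring the schema returned by the transformers pipeline.
entities = [
    {"entity_group": "problem", "score": 0.98, "word": "hyper", "start": 23, "end": 28},
    {"entity_group": "problem", "score": 0.97, "word": "##tension", "start": 28, "end": 35},
]

merged = []
i = 0
while i < len(entities):
    current = entities[i].copy()
    j = i + 1
    # Absorb following ##-tokens of the same entity type
    while j < len(entities) and (
        entities[j]["word"].startswith("##")
        and entities[j]["entity_group"] == current["entity_group"]
    ):
        current["word"] += entities[j]["word"][2:]  # drop the ## prefix
        current["end"] = entities[j]["end"]
        j += 1
    merged.append(current)
    i = j

print(merged)  # one entity: word "hypertension", start 23, end 35
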
app/server_clinical_ner.py ADDED
@@ -0,0 +1,114 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from clinical_ner import ClinicalNER
+ import uvicorn
+
+ # Initialize FastAPI app
+ app = FastAPI(
+     title="Clinical NER API",
+     description="Named Entity Recognition API using bert-base-uncased_clinical-ner",
+     version="1.0.0"
+ )
+
+ # Initialize the NER model (singleton pattern)
+ ner_model = None
+
+ @app.on_event("startup")
+ async def startup_event():
+     """Load the NER model on startup"""
+     global ner_model
+     ner_model = ClinicalNER()
+     print("NER model loaded successfully!")
+
+ # Request model
+ class TextRequest(BaseModel):
+     text: str
+
+     class Config:
+         json_schema_extra = {
+             "example": {
+                 "text": "Patient presents with hypertension and diabetes. Prescribed metformin 500mg."
+             }
+         }
+
+ # Response models
+ class Entity(BaseModel):
+     entity_group: str
+     score: float
+     word: str
+     start: int
+     end: int
+
+ class BasicNERResponse(BaseModel):
+     entities: list[Entity]
+     count: int
+
+ class PrologNERResponse(BaseModel):
+     prolog_facts: str
+     count: int
+
+ @app.get("/")
+ async def root():
+     """Root endpoint with API information"""
+     return {
+         "message": "Clinical NER API",
+         "endpoints": {
+             "/ner/basic": "POST - Get basic NER annotations",
+             "/ner/prolog": "POST - Get Prolog facts",
+             "/docs": "GET - Interactive API documentation"
+         }
+     }
+
+ @app.post("/ner/basic", response_model=BasicNERResponse)
+ async def ner_basic(request: TextRequest):
+     """
+     Perform basic NER on the input text.
+
+     Returns a list of detected entities with their types, positions, and confidence scores.
+     """
+     # Validate outside the try block so the 400 is not converted to a 500
+     if not request.text.strip():
+         raise HTTPException(status_code=400, detail="Text cannot be empty")
+
+     try:
+         entities = ner_model.basic_ner(request.text)
+         return {
+             "entities": entities,
+             "count": len(entities)
+         }
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Error processing text: {str(e)}")
+
+ @app.post("/ner/prolog", response_model=PrologNERResponse)
+ async def ner_prolog(request: TextRequest):
+     """
+     Perform NER and return the results as Prolog facts.
+
+     Returns Prolog facts in the format: entity(Id, Type, Word, Start, End, Score).
+     """
+     # Validate outside the try block so the 400 is not converted to a 500
+     if not request.text.strip():
+         raise HTTPException(status_code=400, detail="Text cannot be empty")
+
+     try:
+         prolog_facts = ner_model.prolog_ner(request.text)
+
+         # Count the number of facts (lines)
+         count = len(prolog_facts.split('\n')) if prolog_facts else 0
+         return {
+             "prolog_facts": prolog_facts,
+             "count": count
+         }
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Error processing text: {str(e)}")
+
+ @app.get("/health")
+ async def health_check():
+     """Health check endpoint"""
+     return {
+         "status": "healthy",
+         "model_loaded": ner_model is not None
+     }
+
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=8000)
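
With the server running (for example via the __main__ block on port 8000), both endpoints can be exercised from a small client. A sketch, assuming the requests package is installed:

# Hypothetical client; assumes the server is reachable on localhost:8000.
import requests

BASE = "http://localhost:8000"
payload = {"text": "Patient presents with hypertension and diabetes. Prescribed metformin 500mg."}

basic = requests.post(f"{BASE}/ner/basic", json=payload, timeout=60).json()
print(f"{basic['count']} entities")
for ent in basic["entities"]:
    print(f"  {ent['word']} -> {ent['entity_group']} (score: {ent['score']:.4f})")

prolog = requests.post(f"{BASE}/ner/prolog", json=payload, timeout=60).json()
print(prolog["prolog_facts"])
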
app/test_clinical_ner.py ADDED
@@ -0,0 +1,20 @@
+ from clinical_ner import ClinicalNER
+
+ # Initialize the NER system
+ ner = ClinicalNER()
+
+ # Sample clinical text
+ text = "Patient presents with hypertension and diabetes. Prescribed metformin 500mg."
+
+ # Get basic NER annotations
+ print("Basic NER:")
+ results = ner.basic_ner(text)
+ for entity in results:
+     print(f"  {entity['word']} -> {entity['entity_group']} (score: {entity['score']:.4f})")
+
+ print("\n" + "="*60 + "\n")
+
+ # Get Prolog facts
+ print("Prolog NER:")
+ prolog_output = ner.prolog_ner(text)
+ print(prolog_output)
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ # FastAPI and server
+ fastapi==0.109.0
+ uvicorn[standard]==0.27.0
+ pydantic==2.5.3
+
+ # Transformers and ML
+ transformers==4.36.2
+ torch==2.1.2
+ tokenizers==0.15.0
+
+ # Additional dependencies
+ numpy==1.26.3
+ huggingface-hub==0.20.2