MaziyarPanahi committed on
Commit
2f0e07f
·
verified ·
1 Parent(s): 86676d0

Upload PII detection model OpenMed-PII-BiomedBERT-Base-110M-v1

Browse files
README.md CHANGED
@@ -2,7 +2,7 @@
2
  language:
3
  - en
4
  license: apache-2.0
5
- base_model: microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract
6
  tags:
7
  - token-classification
8
  - ner
@@ -38,13 +38,13 @@ model-index:
38
  split: test
39
  metrics:
40
  - type: f1
41
- value: 0.9520
42
  name: F1 (micro)
43
  - type: precision
44
- value: 0.9563
45
  name: Precision
46
  - type: recall
47
- value: 0.9477
48
  name: Recall
49
  widget:
50
  - text: "Dr. Sarah Johnson (SSN: 123-45-6789) can be reached at sarah.johnson@hospital.org or 555-123-4567. She lives at 123 Oak Street, Boston, MA 02108."
@@ -55,7 +55,7 @@ widget:
55
 
56
  **PII Detection Model** | 110M Parameters | Open Source
57
 
58
- [![F1 Score](https://img.shields.io/badge/F1-95.20%25-brightgreen)]() [![Precision](https://img.shields.io/badge/Precision-95.63%25-blue)]() [![Recall](https://img.shields.io/badge/Recall-94.77%25-orange)]()
59
 
60
  ## Model Description
61
 
@@ -74,12 +74,12 @@ Evaluated on a stratified 2,000-sample test set from NVIDIA Nemotron-PII:
74
 
75
  | Metric | Score |
76
  |:---|:---:|
77
- | **Micro F1** | **0.9520** |
78
- | Precision | 0.9563 |
79
- | Recall | 0.9477 |
80
- | Macro F1 | 0.9557 |
81
- | Weighted F1 | 0.9510 |
82
- | Accuracy | 0.9936 |
83
 
84
  ### Top 10 PII Models
85
 
@@ -100,11 +100,11 @@ Evaluated on a stratified 2,000-sample test set from NVIDIA Nemotron-PII:
100
 
101
  | Entity | F1 | Precision | Recall | Support |
102
  |:---|:---:|:---:|:---:|:---:|
103
- | `blood_type` | 1.000 | 1.000 | 1.000 | 136 |
104
- | `ssn` | 1.000 | 1.000 | 1.000 | 141 |
105
- | `tax_id` | 1.000 | 1.000 | 1.000 | 43 |
106
- | `biometric_identifier` | 0.998 | 0.996 | 1.000 | 233 |
107
- | `credit_debit_card` | 0.998 | 0.995 | 1.000 | 214 |
108
 
109
  ### Challenging Entities
110
 
@@ -112,11 +112,11 @@ These entity types have lower performance and may benefit from additional post-p
112
 
113
  | Entity | F1 | Precision | Recall | Support |
114
  |:---|:---:|:---:|:---:|:---:|
115
- | `education_level` | 0.896 | 0.930 | 0.865 | 200 |
116
- | `time` | 0.866 | 0.878 | 0.855 | 470 |
117
- | `pin` | 0.861 | 0.855 | 0.868 | 136 |
118
- | `sexuality` | 0.849 | 0.800 | 0.905 | 84 |
119
- | `occupation` | 0.673 | 0.733 | 0.623 | 718 |
120
 
121
  ## Supported Entity Types
122
 
 
2
  language:
3
  - en
4
  license: apache-2.0
5
+ base_model: microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext
6
  tags:
7
  - token-classification
8
  - ner
 
38
  split: test
39
  metrics:
40
  - type: f1
41
+ value: 0.9497
42
  name: F1 (micro)
43
  - type: precision
44
+ value: 0.9537
45
  name: Precision
46
  - type: recall
47
+ value: 0.9457
48
  name: Recall
49
  widget:
50
  - text: "Dr. Sarah Johnson (SSN: 123-45-6789) can be reached at sarah.johnson@hospital.org or 555-123-4567. She lives at 123 Oak Street, Boston, MA 02108."
 
55
 
56
  **PII Detection Model** | 110M Parameters | Open Source
57
 
58
+ [![F1 Score](https://img.shields.io/badge/F1-94.97%25-brightgreen)]() [![Precision](https://img.shields.io/badge/Precision-95.37%25-blue)]() [![Recall](https://img.shields.io/badge/Recall-94.57%25-orange)]()
59
 
60
  ## Model Description
61
 
 
74
 
75
  | Metric | Score |
76
  |:---|:---:|
77
+ | **Micro F1** | **0.9497** |
78
+ | Precision | 0.9537 |
79
+ | Recall | 0.9457 |
80
+ | Macro F1 | 0.9489 |
81
+ | Weighted F1 | 0.9488 |
82
+ | Accuracy | 0.9932 |
83
 
84
  ### Top 10 PII Models
85
 
 
100
 
101
  | Entity | F1 | Precision | Recall | Support |
102
  |:---|:---:|:---:|:---:|:---:|
103
+ | `credit_debit_card` | 1.000 | 1.000 | 1.000 | 214 |
104
+ | `biometric_identifier` | 0.998 | 0.996 | 1.000 | 234 |
105
+ | `health_plan_beneficiary_number` | 0.998 | 0.995 | 1.000 | 216 |
106
+ | `email` | 0.995 | 0.997 | 0.992 | 761 |
107
+ | `ssn` | 0.993 | 1.000 | 0.986 | 141 |
108
 
109
  ### Challenging Entities
110
 
 
112
 
113
  | Entity | F1 | Precision | Recall | Support |
114
  |:---|:---:|:---:|:---:|:---:|
115
+ | `time` | 0.868 | 0.928 | 0.815 | 471 |
116
+ | `pin` | 0.851 | 0.842 | 0.860 | 136 |
117
+ | `sexuality` | 0.809 | 0.731 | 0.905 | 84 |
118
+ | `gender` | 0.776 | 0.712 | 0.852 | 189 |
119
+ | `occupation` | 0.676 | 0.741 | 0.622 | 719 |
120
 
121
  ## Supported Entity Types
122
 
all_results.json CHANGED
@@ -1,24 +1,24 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_accuracy": 0.9946379251588227,
4
- "eval_f1": 0.9578908824606371,
5
- "eval_loss": 0.021427959203720093,
6
- "eval_precision": 0.9607286259915777,
7
- "eval_recall": 0.9550698534780704,
8
- "eval_runtime": 14.9793,
9
- "eval_samples_per_second": 333.793,
10
- "eval_steps_per_second": 5.274,
11
- "test_accuracy": 0.994592134929861,
12
- "test_f1": 0.9575683536318046,
13
- "test_loss": 0.020539097487926483,
14
- "test_precision": 0.960039772620678,
15
- "test_recall": 0.9551096262601754,
16
- "test_runtime": 187.2157,
17
- "test_samples_per_second": 240.364,
18
- "test_steps_per_second": 3.76,
19
- "total_flos": 1.923768900984269e+16,
20
- "train_loss": 0.07172592910629114,
21
- "train_runtime": 850.4697,
22
- "train_samples_per_second": 176.373,
23
- "train_steps_per_second": 5.513
24
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_accuracy": 0.994305698056874,
4
+ "eval_f1": 0.9567553048616707,
5
+ "eval_loss": 0.023389853537082672,
6
+ "eval_precision": 0.961497877352703,
7
+ "eval_recall": 0.9520592880573442,
8
+ "eval_runtime": 14.9319,
9
+ "eval_samples_per_second": 334.853,
10
+ "eval_steps_per_second": 5.291,
11
+ "test_accuracy": 0.9944906320722552,
12
+ "test_f1": 0.9580974134172793,
13
+ "test_loss": 0.02194284088909626,
14
+ "test_precision": 0.9625433948474329,
15
+ "test_recall": 0.9536923150647733,
16
+ "test_runtime": 189.6478,
17
+ "test_samples_per_second": 237.282,
18
+ "test_steps_per_second": 3.712,
19
+ "total_flos": 1.8922726522850304e+16,
20
+ "train_loss": 0.1081566352736444,
21
+ "train_runtime": 846.98,
22
+ "train_samples_per_second": 177.1,
23
+ "train_steps_per_second": 5.536
24
  }
eval_results.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_accuracy": 0.9946379251588227,
4
- "eval_f1": 0.9578908824606371,
5
- "eval_loss": 0.021427959203720093,
6
- "eval_precision": 0.9607286259915777,
7
- "eval_recall": 0.9550698534780704,
8
- "eval_runtime": 14.9793,
9
- "eval_samples_per_second": 333.793,
10
- "eval_steps_per_second": 5.274
11
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_accuracy": 0.994305698056874,
4
+ "eval_f1": 0.9567553048616707,
5
+ "eval_loss": 0.023389853537082672,
6
+ "eval_precision": 0.961497877352703,
7
+ "eval_recall": 0.9520592880573442,
8
+ "eval_runtime": 14.9319,
9
+ "eval_samples_per_second": 334.853,
10
+ "eval_steps_per_second": 5.291
11
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a41b8e69ec9ddb7b922ea4ee7626a3310c8dbaf87a117593689d4634845da62d
3
  size 435915992
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c85e6ea27c17123ab4059e9f93861e27b260435705ac6459744b89c59dda74d
3
  size 435915992
test_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "test_accuracy": 0.994592134929861,
3
- "test_f1": 0.9575683536318046,
4
- "test_loss": 0.020539097487926483,
5
- "test_precision": 0.960039772620678,
6
- "test_recall": 0.9551096262601754,
7
- "test_runtime": 187.2157,
8
- "test_samples_per_second": 240.364,
9
- "test_steps_per_second": 3.76
10
  }
 
1
  {
2
+ "test_accuracy": 0.9944906320722552,
3
+ "test_f1": 0.9580974134172793,
4
+ "test_loss": 0.02194284088909626,
5
+ "test_precision": 0.9625433948474329,
6
+ "test_recall": 0.9536923150647733,
7
+ "test_runtime": 189.6478,
8
+ "test_samples_per_second": 237.282,
9
+ "test_steps_per_second": 3.712
10
  }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 3.0,
3
- "total_flos": 1.923768900984269e+16,
4
- "train_loss": 0.07172592910629114,
5
- "train_runtime": 850.4697,
6
- "train_samples_per_second": 176.373,
7
- "train_steps_per_second": 5.513
8
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "total_flos": 1.8922726522850304e+16,
4
+ "train_loss": 0.1081566352736444,
5
+ "train_runtime": 846.98,
6
+ "train_samples_per_second": 177.1,
7
+ "train_steps_per_second": 5.536
8
  }
vocab.txt CHANGED
The diff for this file is too large to render. See raw diff