MaziyarPanahi committed on
Commit
a6c3e20
·
verified ·
1 Parent(s): e25534e

Upload PII detection model OpenMed-PII-LiteClinical-Small-66M-v1

Browse files
README.md CHANGED
@@ -2,7 +2,7 @@
2
  language:
3
  - en
4
  license: apache-2.0
5
- base_model: distilbert/distilbert-base-cased
6
  tags:
7
  - token-classification
8
  - ner
@@ -38,13 +38,13 @@ model-index:
38
  split: test
39
  metrics:
40
  - type: f1
41
- value: 0.9483
42
  name: F1 (micro)
43
  - type: precision
44
- value: 0.9530
45
  name: Precision
46
  - type: recall
47
- value: 0.9436
48
  name: Recall
49
  widget:
50
  - text: "Dr. Sarah Johnson (SSN: 123-45-6789) can be reached at sarah.johnson@hospital.org or 555-123-4567. She lives at 123 Oak Street, Boston, MA 02108."
@@ -55,7 +55,7 @@ widget:
55
 
56
  **PII Detection Model** | 66M Parameters | Open Source
57
 
58
- [![F1 Score](https://img.shields.io/badge/F1-94.83%25-brightgreen)]() [![Precision](https://img.shields.io/badge/Precision-95.30%25-blue)]() [![Recall](https://img.shields.io/badge/Recall-94.36%25-orange)]()
59
 
60
  ## Model Description
61
 
@@ -74,12 +74,12 @@ Evaluated on a stratified 2,000-sample test set from NVIDIA Nemotron-PII:
74
 
75
  | Metric | Score |
76
  |:---|:---:|
77
- | **Micro F1** | **0.9483** |
78
- | Precision | 0.9530 |
79
- | Recall | 0.9436 |
80
- | Macro F1 | 0.9503 |
81
- | Weighted F1 | 0.9473 |
82
- | Accuracy | 0.9931 |
83
 
84
  ### Top 10 PII Models
85
 
@@ -100,11 +100,11 @@ Evaluated on a stratified 2,000-sample test set from NVIDIA Nemotron-PII:
100
 
101
  | Entity | F1 | Precision | Recall | Support |
102
  |:---|:---:|:---:|:---:|:---:|
103
- | `employee_id` | 0.997 | 0.994 | 1.000 | 163 |
104
- | `biometric_identifier` | 0.996 | 0.991 | 1.000 | 231 |
105
- | `email` | 0.995 | 0.995 | 0.995 | 750 |
106
- | `date_of_birth` | 0.995 | 0.989 | 1.000 | 273 |
107
- | `medical_record_number` | 0.994 | 0.989 | 1.000 | 262 |
108
 
109
  ### Challenging Entities
110
 
@@ -112,11 +112,11 @@ These entity types have lower performance and may benefit from additional post-p
112
 
113
  | Entity | F1 | Precision | Recall | Support |
114
  |:---|:---:|:---:|:---:|:---:|
115
- | `language` | 0.896 | 0.972 | 0.831 | 207 |
116
- | `pin` | 0.891 | 0.927 | 0.858 | 134 |
117
- | `time` | 0.852 | 0.859 | 0.844 | 463 |
118
- | `sexuality` | 0.833 | 0.773 | 0.904 | 83 |
119
- | `occupation` | 0.644 | 0.697 | 0.599 | 708 |
120
 
121
  ## Supported Entity Types
122
 
 
2
  language:
3
  - en
4
  license: apache-2.0
5
+ base_model: distilbert/distilbert-base-uncased
6
  tags:
7
  - token-classification
8
  - ner
 
38
  split: test
39
  metrics:
40
  - type: f1
41
+ value: 0.9485
42
  name: F1 (micro)
43
  - type: precision
44
+ value: 0.9554
45
  name: Precision
46
  - type: recall
47
+ value: 0.9418
48
  name: Recall
49
  widget:
50
  - text: "Dr. Sarah Johnson (SSN: 123-45-6789) can be reached at sarah.johnson@hospital.org or 555-123-4567. She lives at 123 Oak Street, Boston, MA 02108."
 
55
 
56
  **PII Detection Model** | 66M Parameters | Open Source
57
 
58
+ [![F1 Score](https://img.shields.io/badge/F1-94.85%25-brightgreen)]() [![Precision](https://img.shields.io/badge/Precision-95.54%25-blue)]() [![Recall](https://img.shields.io/badge/Recall-94.18%25-orange)]()
59
 
60
  ## Model Description
61
 
 
74
 
75
  | Metric | Score |
76
  |:---|:---:|
77
+ | **Micro F1** | **0.9485** |
78
+ | Precision | 0.9554 |
79
+ | Recall | 0.9418 |
80
+ | Macro F1 | 0.9484 |
81
+ | Weighted F1 | 0.9468 |
82
+ | Accuracy | 0.9930 |
83
 
84
  ### Top 10 PII Models
85
 
 
100
 
101
  | Entity | F1 | Precision | Recall | Support |
102
  |:---|:---:|:---:|:---:|:---:|
103
+ | `biometric_identifier` | 1.000 | 1.000 | 1.000 | 234 |
104
+ | `credit_debit_card` | 0.995 | 1.000 | 0.991 | 215 |
105
+ | `email` | 0.994 | 0.995 | 0.993 | 763 |
106
+ | `date_of_birth` | 0.993 | 0.986 | 1.000 | 273 |
107
+ | `health_plan_beneficiary_number` | 0.991 | 0.982 | 1.000 | 216 |
108
 
109
  ### Challenging Entities
110
 
 
112
 
113
  | Entity | F1 | Precision | Recall | Support |
114
  |:---|:---:|:---:|:---:|:---:|
115
+ | `pin` | 0.862 | 0.872 | 0.853 | 136 |
116
+ | `time` | 0.859 | 0.894 | 0.826 | 472 |
117
+ | `gender` | 0.829 | 0.806 | 0.853 | 190 |
118
+ | `sexuality` | 0.824 | 0.758 | 0.904 | 83 |
119
+ | `occupation` | 0.647 | 0.755 | 0.566 | 724 |
120
 
121
  ## Supported Entity Types
122
 
all_results.json CHANGED
@@ -1,24 +1,24 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_accuracy": 0.993924587244714,
4
- "eval_f1": 0.948089578626854,
5
- "eval_loss": 0.02283928357064724,
6
- "eval_precision": 0.9507276353697274,
7
- "eval_recall": 0.945466121409506,
8
- "eval_runtime": 11.3615,
9
- "eval_samples_per_second": 440.082,
10
- "eval_steps_per_second": 6.953,
11
- "test_accuracy": 0.9939600221131221,
12
- "test_f1": 0.949191080722605,
13
- "test_loss": 0.022294577211141586,
14
- "test_precision": 0.9512299574706483,
15
- "test_recall": 0.9471609255822436,
16
- "test_runtime": 162.5659,
17
- "test_samples_per_second": 276.811,
18
- "test_steps_per_second": 4.331,
19
- "total_flos": 9918920981578752.0,
20
- "train_loss": 0.09467728419150338,
21
- "train_runtime": 496.6001,
22
- "train_samples_per_second": 302.054,
23
- "train_steps_per_second": 9.442
24
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_accuracy": 0.9943964513221458,
4
+ "eval_f1": 0.9546176592780861,
5
+ "eval_loss": 0.023015499114990234,
6
+ "eval_precision": 0.9615498227648681,
7
+ "eval_recall": 0.9477847333430388,
8
+ "eval_runtime": 11.052,
9
+ "eval_samples_per_second": 452.409,
10
+ "eval_steps_per_second": 7.148,
11
+ "test_accuracy": 0.9943050269458373,
12
+ "test_f1": 0.9548047025061205,
13
+ "test_loss": 0.02243383601307869,
14
+ "test_precision": 0.961166670708998,
15
+ "test_recall": 0.9485264003318991,
16
+ "test_runtime": 164.7983,
17
+ "test_samples_per_second": 273.061,
18
+ "test_steps_per_second": 4.272,
19
+ "total_flos": 9393035107602432.0,
20
+ "train_loss": 0.10323514050322825,
21
+ "train_runtime": 488.206,
22
+ "train_samples_per_second": 307.247,
23
+ "train_steps_per_second": 9.605
24
  }
config.json CHANGED
@@ -229,12 +229,11 @@
229
  "model_type": "distilbert",
230
  "n_heads": 12,
231
  "n_layers": 6,
232
- "output_past": true,
233
  "pad_token_id": 0,
234
  "qa_dropout": 0.1,
235
  "seq_classif_dropout": 0.2,
236
  "sinusoidal_pos_embds": false,
237
  "tie_weights_": true,
238
  "transformers_version": "4.57.1",
239
- "vocab_size": 28996
240
  }
 
229
  "model_type": "distilbert",
230
  "n_heads": 12,
231
  "n_layers": 6,
 
232
  "pad_token_id": 0,
233
  "qa_dropout": 0.1,
234
  "seq_classif_dropout": 0.2,
235
  "sinusoidal_pos_embds": false,
236
  "tie_weights_": true,
237
  "transformers_version": "4.57.1",
238
+ "vocab_size": 30522
239
  }
eval_results.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_accuracy": 0.993924587244714,
4
- "eval_f1": 0.948089578626854,
5
- "eval_loss": 0.02283928357064724,
6
- "eval_precision": 0.9507276353697274,
7
- "eval_recall": 0.945466121409506,
8
- "eval_runtime": 11.3615,
9
- "eval_samples_per_second": 440.082,
10
- "eval_steps_per_second": 6.953
11
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_accuracy": 0.9943964513221458,
4
+ "eval_f1": 0.9546176592780861,
5
+ "eval_loss": 0.023015499114990234,
6
+ "eval_precision": 0.9615498227648681,
7
+ "eval_recall": 0.9477847333430388,
8
+ "eval_runtime": 11.052,
9
+ "eval_samples_per_second": 452.409,
10
+ "eval_steps_per_second": 7.148
11
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:377c6337610ab7721d205a502d9aafd0b27fe4ea759a1d4b514be38c45bfe751
3
- size 261102048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50f1b835752fec6d82a261035ff3a13f4a4aba8033a19904ac3524358bff1fa3
3
+ size 265789928
test_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "test_accuracy": 0.9939600221131221,
3
- "test_f1": 0.949191080722605,
4
- "test_loss": 0.022294577211141586,
5
- "test_precision": 0.9512299574706483,
6
- "test_recall": 0.9471609255822436,
7
- "test_runtime": 162.5659,
8
- "test_samples_per_second": 276.811,
9
- "test_steps_per_second": 4.331
10
  }
 
1
  {
2
+ "test_accuracy": 0.9943050269458373,
3
+ "test_f1": 0.9548047025061205,
4
+ "test_loss": 0.02243383601307869,
5
+ "test_precision": 0.961166670708998,
6
+ "test_recall": 0.9485264003318991,
7
+ "test_runtime": 164.7983,
8
+ "test_samples_per_second": 273.061,
9
+ "test_steps_per_second": 4.272
10
  }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -43,7 +43,7 @@
43
  },
44
  "clean_up_tokenization_spaces": false,
45
  "cls_token": "[CLS]",
46
- "do_lower_case": false,
47
  "extra_special_tokens": {},
48
  "mask_token": "[MASK]",
49
  "model_max_length": 512,
 
43
  },
44
  "clean_up_tokenization_spaces": false,
45
  "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
  "extra_special_tokens": {},
48
  "mask_token": "[MASK]",
49
  "model_max_length": 512,
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 3.0,
3
- "total_flos": 9918920981578752.0,
4
- "train_loss": 0.09467728419150338,
5
- "train_runtime": 496.6001,
6
- "train_samples_per_second": 302.054,
7
- "train_steps_per_second": 9.442
8
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "total_flos": 9393035107602432.0,
4
+ "train_loss": 0.10323514050322825,
5
+ "train_runtime": 488.206,
6
+ "train_samples_per_second": 307.247,
7
+ "train_steps_per_second": 9.605
8
  }
vocab.txt CHANGED
The diff for this file is too large to render. See raw diff