Update fincat_utils.py
fincat_utils.py  +4 -38
@@ -29,26 +29,7 @@ def extract_context_words(x, window = 6):
 """The following functions have been created with inspiration from https://github.com/arushiprakash/MachineLearning/blob/main/BERT%20Word%20Embeddings.ipynb"""
 
 def bert_text_preparation(text, tokenizer):
-    """
-
-    Takes a string argument and performs
-    pre-processing like adding special tokens,
-    tokenization, tokens to ids, and tokens to
-    segment ids. All tokens are mapped to seg-
-    ment id = 1.
-
-    Args:
-        text (str): Text to be converted
-        tokenizer (obj): Tokenizer object
-            to convert text into BERT-re-
-            adable tokens and ids
-
-    Returns:
-        list: List of BERT-readable tokens
-        obj: Torch tensor with token ids
-        obj: Torch tensor segment ids
-
-    """
+
     marked_text = "[CLS] " + text + " [SEP]"
     tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
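This hunk only drops the docstring; the function body is unchanged. For orientation, here is a minimal runnable sketch of the full function, reconstructed from the context lines and the notebook credited at the top of the file. The tensor-building lines sit outside the hunk, so their exact form is an assumption, as is the use of transformers.BertTokenizer.

import torch
from transformers import BertTokenizer

def bert_text_preparation(text, tokenizer):
    # Add BERT's special tokens, split into WordPiece tokens, map to vocab ids
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # Every token is mapped to segment id = 1, as the removed docstring noted
    segments_ids = [1] * len(tokenized_text)
    # Wrap the id lists in torch tensors with a batch dimension of 1
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    return tokenized_text, tokens_tensor, segments_tensors

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokens, tokens_tensor, segments_tensors = bert_text_preparation(
    "Revenue grew 12% in the third quarter.", tokenizer)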
@@ -61,22 +42,7 @@ def bert_text_preparation(text, tokenizer):
     return tokenized_text, tokens_tensor, segments_tensors
 
 def get_bert_embeddings(tokens_tensor, segments_tensors, model):
-    """
-
-    Args:
-        tokens_tensor (obj): Torch tensor size [n_tokens]
-            with token ids for each token in text
-        segments_tensors (obj): Torch tensor size [n_tokens]
-            with segment ids for each token in text
-        model (obj): Embedding model to generate embeddings
-            from token and segment ids
-
-    Returns:
-        list: List of list of floats of size
-            [n_tokens, n_embedding_dimensions]
-            containing embeddings for each token
-    """
-
+
     # Gradient calculation is disabled
     # Model is in inference mode
     with torch.no_grad():
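Again only the docstring is removed. A sketch of the surviving body follows, under the same assumptions: a transformers BertModel loaded with output_hidden_states=True, and token vectors read from the second-to-last hidden layer, as in the credited notebook. Everything past the with line lies outside the hunk and is reconstructed, not quoted from the repository.

from transformers import BertModel

def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    # Gradient calculation is disabled; the model runs in inference mode
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]  # one tensor per layer when output_hidden_states=True
    # Second-to-last layer, first (and only) batch element:
    # shape [n_tokens, n_embedding_dimensions]
    token_embeddings = hidden_states[-2][0]
    return [token_vec.tolist() for token_vec in token_embeddings]

model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
model.eval()
embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)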
@@ -106,5 +72,5 @@ def bert_embedding_extract(context_text, word):
             word_embedding_all.append(word_embedding)
         word_embedding_mean = np.array(word_embedding_all).mean(axis=0)
         return word_embedding_mean
-
-
+    except:
+        return ['None']
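This last hunk is the commit's only functional change: the body of bert_embedding_extract now ends in an except clause (the matching try: sits above the hunk), so any failure returns the sentinel ['None'] instead of raising. Callers therefore need to test for the sentinel. A hypothetical caller-side check, with illustrative inputs not taken from the repository:

embedding = bert_embedding_extract("profit rose ten percent year on year", "profit")
if isinstance(embedding, list):
    # ['None'] sentinel: extraction failed (e.g., the word never matched a token)
    print("embedding failed; skipping this example")
else:
    print(embedding.shape)  # numpy array holding the mean token embedding

One design note: a bare except: also swallows KeyboardInterrupt and SystemExit; except Exception: would give the same fallback while letting those propagate.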