Update fincat_utils.py
fincat_utils.py  +4 -38
@@ -29,26 +29,7 @@ def extract_context_words(x, window = 6):
 """The following functions have been created with inspiration from https://github.com/arushiprakash/MachineLearning/blob/main/BERT%20Word%20Embeddings.ipynb"""
 
 def bert_text_preparation(text, tokenizer):
-    """
-
-    Takes a string argument and performs
-    pre-processing like adding special tokens,
-    tokenization, tokens to ids, and tokens to
-    segment ids. All tokens are mapped to seg-
-    ment id = 1.
-
-    Args:
-        text (str): Text to be converted
-        tokenizer (obj): Tokenizer object
-            to convert text into BERT-re-
-            adable tokens and ids
-
-    Returns:
-        list: List of BERT-readable tokens
-        obj: Torch tensor with token ids
-        obj: Torch tensor segment ids
-
-    """
+
     marked_text = "[CLS] " + text + " [SEP]"
     tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
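This hunk only drops the docstring; the function body is unchanged. For orientation, here is a minimal runnable sketch of the full function, reconstructed from the context lines and the notebook credited at the top of the file. The tensor-building lines sit outside the hunk, so their exact form is an assumption, as is the use of transformers.BertTokenizer.

import torch
from transformers import BertTokenizer

def bert_text_preparation(text, tokenizer):
    # Add BERT's special tokens, split into WordPiece tokens, map to vocab ids
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # Every token is mapped to segment id = 1, as the removed docstring noted
    segments_ids = [1] * len(tokenized_text)
    # Wrap the id lists in torch tensors with a batch dimension of 1
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    return tokenized_text, tokens_tensor, segments_tensors

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokens, tokens_tensor, segments_tensors = bert_text_preparation(
    "Revenue grew 12% in the third quarter.", tokenizer)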
@@ -61,22 +42,7 @@ def bert_text_preparation(text, tokenizer):
     return tokenized_text, tokens_tensor, segments_tensors
 
 def get_bert_embeddings(tokens_tensor, segments_tensors, model):
-    """
-
-    Args:
-        tokens_tensor (obj): Torch tensor size [n_tokens]
-            with token ids for each token in text
-        segments_tensors (obj): Torch tensor size [n_tokens]
-            with segment ids for each token in text
-        model (obj): Embedding model to generate embeddings
-            from token and segment ids
-
-    Returns:
-        list: List of list of floats of size
-            [n_tokens, n_embedding_dimensions]
-            containing embeddings for each token
-    """
-
+
     # Gradient calculation is disabled
     # Model is in inference mode
     with torch.no_grad():
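Again only the docstring is removed. A sketch of the surviving body follows, under the same assumptions: a transformers BertModel loaded with output_hidden_states=True, and token vectors read from the second-to-last hidden layer, as in the credited notebook. Everything past the with line lies outside the hunk and is reconstructed, not quoted from the repository.

from transformers import BertModel

def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    # Gradient calculation is disabled; the model runs in inference mode
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]  # one tensor per layer when output_hidden_states=True
    # Second-to-last layer, first (and only) batch element:
    # shape [n_tokens, n_embedding_dimensions]
    token_embeddings = hidden_states[-2][0]
    return [token_vec.tolist() for token_vec in token_embeddings]

model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
model.eval()
embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)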
@@ -106,5 +72,5 @@ def bert_embedding_extract(context_text, word):
             word_embedding_all.append(word_embedding)
         word_embedding_mean = np.array(word_embedding_all).mean(axis=0)
         return word_embedding_mean
-
-
+    except:
+        return ['None']
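This last hunk is the commit's only functional change: the body of bert_embedding_extract now ends in an except clause (the matching try: sits above the hunk), so any failure returns the sentinel ['None'] instead of raising. Callers therefore need to test for the sentinel. A hypothetical caller-side check, with illustrative inputs not taken from the repository:

embedding = bert_embedding_extract("profit rose ten percent year on year", "profit")
if isinstance(embedding, list):
    # ['None'] sentinel: extraction failed (e.g., the word never matched a token)
    print("embedding failed; skipping this example")
else:
    print(embedding.shape)  # numpy array holding the mean token embedding

One design note: a bare except: also swallows KeyboardInterrupt and SystemExit; except Exception: would give the same fallback while letting those propagate.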