cnmoro/LexicalTriplets
Viewer • Updated • 77.6M • 415
How to use cnmoro/LexicalEmbed-Base with sentence-transformers:
from sentence_transformers import SentenceTransformer

# The checkpoint is loaded with trust_remote_code=True, i.e. the repository
# is allowed to supply its own model code (see the library docs before
# enabling this for repositories you do not trust).
model = SentenceTransformer("cnmoro/LexicalEmbed-Base", trust_remote_code=True)

# Example inputs to embed and compare against each other.
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# One embedding per sentence, then every sentence compared with every other.
embeddings = model.encode(sentences)
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]

This is a model trained on cnmoro/LexicalTriplets to produce lexical embeddings (not semantic!).
This can be used to compute lexical similarity between words or phrases.
Concept:
"Some text" will be similar to "Sm txt"
"King" will not be similar to "Queen" or "Royalty"
"Dog" will not be similar to "Animal"
"Doge" will be similar to "Dog"
import torch, re, unicodedata
from transformers import AutoModel, AutoTokenizer
# Repository identifier used for both the tokenizer and the model below.
model_name = "cnmoro/LexicalEmbed-Base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# trust_remote_code=True: the model is loaded with code shipped in the
# repository rather than a class bundled with transformers.
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
# Standard torch.nn.Module call to switch to evaluation (inference) mode.
# NOTE(review): assumes the remote model class follows nn.Module semantics.
model.eval()
def preprocess(text: str) -> str:
    """Normalize *text* before it is tokenized for the lexical model.

    Steps, in order:

    1. Decompose characters (Unicode NFD) and drop every nonspacing mark
       (category ``Mn``), which removes accents: ``"é"`` becomes ``"e"``.
    2. Lowercase the result.
    3. Replace each run of characters that are neither word characters
       nor whitespace with a single space. Underscores survive because
       ``\\w`` matches them.
    4. Collapse whitespace runs to one space and strip both ends.

    Args:
        text: Raw input string.

    Returns:
        The normalized string; empty if nothing but punctuation,
        whitespace or combining marks was present.
    """
    # Split accented characters into base character + combining mark so the
    # marks can be filtered out individually.
    text = unicodedata.normalize('NFD', text)
    text = ''.join(c for c in text if unicodedata.category(c) != 'Mn')
    # Punctuation becomes a separator rather than being deleted outright, so
    # "a-b" yields two tokens instead of being glued into "ab".
    text = re.sub(r'[^\w\s]+', ' ', text.lower())
    return re.sub(r'\s+', ' ', text).strip()
# Two inputs that share most of their characters.
texts = ["hello world", "hel wor"]
# Apply the same normalization to every input before tokenizing.
texts = [preprocess(s) for s in texts]
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Inference only: no gradients are needed.
with torch.no_grad():
    # NOTE(review): the indexing below assumes the model returns one
    # embedding vector per input text - confirm against the model code.
    embeddings = model(**inputs)

# Compare the two vectors along their only dimension (dim=0).
cosine_sim = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=0)
print(f"Cosine Similarity: {cosine_sim.item()}")  # 0.8966174125671387