| | |
| | import numpy as np |
| | import pandas as pd |
| |
|
| | import tensorflow |
| | import keras |
| | import torch |
| |
|
| | import re |
| |
|
| | from transformers import BertTokenizer |
| | from sklearn.preprocessing import LabelEncoder |
| |
|
| | |
def read_data(path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str or os.PathLike
        Path to the CSV file on disk.

    Returns
    -------
    pandas.DataFrame or None
        The parsed data, or None when the file does not exist.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        # Fixed typo ("File not exsists") and include the offending path so
        # the failure is diagnosable; return None explicitly for clarity.
        print(f"File does not exist: {path}")
        return None
| | |
# Load the transaction dataset; read_data returns None when the file is missing.
data = read_data(r"E:\transactify\Dataset\transaction_data.csv")

# Preview the first 15 rows only when loading succeeded.
if data is not None:
    preview = data.head(15)
    print(preview)
| | |
| | |
def clean_text(text):
    """Normalize a transaction description for tokenization.

    Lowercases the text, replaces each run of digits with a single space,
    replaces punctuation (anything that is not a word character or
    whitespace) with spaces, and trims surrounding whitespace.
    """
    lowered = text.lower()
    no_digits = re.sub(r"\d+", " ", lowered)
    words_only = re.sub(r"[^\w\s]", " ", no_digits)
    return words_only.strip()
| |
|
def preprocessing_data(df, max_length=20):
    """Tokenize transaction descriptions with BERT and encode category labels.

    Rows whose cleaned description is empty are skipped — and their category
    labels are skipped too, so inputs and labels stay aligned row-for-row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Transaction Description" and "Category" columns.
    max_length : int, optional
        Maximum token sequence length; shorter texts are padded, longer
        ones truncated. Defaults to 20.

    Returns
    -------
    tuple
        (input_ids, attention_masks, labels, labelencoder) where the first
        three are torch tensors of equal first-dimension length and
        labelencoder is the fitted sklearn LabelEncoder.

    Raises
    ------
    ValueError
        If every description cleans down to an empty string.
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    input_ids = []
    attention_masks = []
    kept_categories = []  # labels for the rows we actually encode

    # Iterate descriptions and categories together so a skipped row drops
    # both its inputs AND its label (previously labels were taken from the
    # whole column, desynchronizing inputs/labels whenever a row was skipped).
    for description, category in zip(df["Transaction Description"], df["Category"]):
        cleaned_text = clean_text(description)

        print(f"Original Description: {description}")
        print(f"Cleaned Text: {cleaned_text}")

        if cleaned_text:
            encoded_dict = tokenizer.encode_plus(
                cleaned_text,
                add_special_tokens=True,
                max_length=max_length,
                padding="max_length",  # modern replacement for deprecated pad_to_max_length=True
                return_attention_mask=True,
                return_tensors="pt",
                truncation=True,
            )
            input_ids.append(encoded_dict['input_ids'])
            attention_masks.append(encoded_dict['attention_mask'])
            kept_categories.append(category)
        else:
            print("Cleaned text is empty, skipping...")

    print(f"Total input_ids collected: {len(input_ids)}")
    print(f"Total attention_masks collected: {len(attention_masks)}")

    if not input_ids:
        raise ValueError("No input_ids were collected. Check the cleaning process.")

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Encode only the surviving rows' categories so labels align with inputs.
    labelencoder = LabelEncoder()
    labels = labelencoder.fit_transform(kept_categories)
    labels = torch.tensor(labels)

    return input_ids, attention_masks, labels, labelencoder
| |
|
# Tokenize every description and encode the labels for the whole dataset.
_prepared = preprocessing_data(data)
input_ids, attention_masks, labels, labelencoder = _prepared
| |
|