| | |
| | import numpy as np |
| | import pandas as pd |
| | import re |
| | from sklearn.preprocessing import LabelEncoder |
| | from sklearn.model_selection import train_test_split |
| | from tensorflow.keras.preprocessing.text import Tokenizer |
| | from tensorflow.keras.preprocessing.sequence import pad_sequences |
| |
|
| | |
def read_data(path):
    """Load a CSV file into a DataFrame.

    Parameters
    ----------
    path : str
        Path to the CSV file on disk.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data, or None when the file is missing, empty,
        or otherwise unreadable (an explanatory message is printed).
    """
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"File not found at: {path}")
        return None
    except pd.errors.EmptyDataError:
        # read_csv raises (rather than returning an empty frame) when the
        # file has no content at all — report it the same way as a frame
        # with zero rows instead of falling into the generic handler.
        print("The file is empty.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    if df.empty:
        print("The file is empty.")
        return None
    return df
| |
|
| | |
def clean_text(text):
    """Normalise a transaction description for tokenization.

    Lowercases the text, replaces digit runs and punctuation with spaces,
    collapses any resulting whitespace runs to a single space, and strips
    the ends.

    Parameters
    ----------
    text : str
        Raw description text; non-string values are coerced via ``str``
        so unexpected cells do not crash the pipeline.

    Returns
    -------
    str
        The cleaned, single-spaced, lowercase text.
    """
    text = str(text).lower()
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"[^\w\s]", " ", text)
    # Collapse the gaps left by the substitutions above so the tokenizer
    # sees consistent single-space-separated words.
    text = re.sub(r"\s+", " ", text)
    return text.strip()
| |
|
| | |
def preprocess_data(file_path, max_len=10, vocab_size=250):
    """Load, clean, tokenize and label-encode the transaction dataset.

    Parameters
    ----------
    file_path : str
        Path to the CSV with 'Transaction Description' and 'Category' columns.
    max_len : int
        Length every token sequence is padded/truncated to.
    vocab_size : int
        Maximum vocabulary size handed to the Keras tokenizer.

    Returns
    -------
    tuple
        (padded_sequences, labels, tokenizer, label_encoder), or four
        Nones when the file cannot be loaded.
    """
    frame = read_data(file_path)
    if frame is None:
        print("Data loading failed.")
        return None, None, None, None

    # Normalise the free-text descriptions before fitting the tokenizer.
    descriptions = frame['Transaction Description'].apply(clean_text)
    frame['Transaction Description'] = descriptions

    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(descriptions)

    # Convert to integer sequences and force a uniform length.
    padded_sequences = pad_sequences(
        tokenizer.texts_to_sequences(descriptions),
        maxlen=max_len,
        padding='post',
        truncating='post',
    )

    # Map category names to integer class ids.
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(frame['Category'])

    return padded_sequences, labels, tokenizer, label_encoder
| |
|
| | |
def split_data(sequences, labels, test_size=0.2, random_state=42):
    """Split features and labels into train/test partitions.

    Thin wrapper around sklearn's ``train_test_split`` with a fixed
    default seed so splits are reproducible across runs.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test).
    """
    return train_test_split(
        sequences,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
| |
|
| | |
def main():
    """Run the preprocessing pipeline end to end and report shapes."""
    # NOTE(review): hard-coded Windows path — works only on this machine.
    data_path = r"E:\transactify\transactify\Dataset\transaction_data.csv"

    sequences, labels, tokenizer, label_encoder = preprocess_data(data_path)

    # Guard clause: bail out early when preprocessing could not load data.
    if sequences is None:
        print("Data preprocessing failed.")
        return

    print("Data preprocessing successful!")
    X_train, X_test, y_train, y_test = split_data(sequences, labels)
    print(f"Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}")
    print(f"Testing data shape: {X_test.shape}, Testing labels shape: {y_test.shape}")
| |
|
| | |
# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
| |
|