| | import os |
| | from datasets import load_dataset |
| | from sklearn.model_selection import train_test_split |
| | import pandas as pd |
| |
|
| |
|
def prepare_data(
    dataset_name: str = "David-Egea/phishing-texts",
    *,
    output_dir: str = "data",
    label_column: str = "phishing",
    random_state: int = 42,
) -> None:
    """Load a dataset and write stratified train/val/test CSV splits.

    The ``"train"`` split of the loaded dataset is converted to a pandas
    DataFrame and divided 80/10/10 into train, validation and test sets,
    stratifying on ``label_column`` at both steps. The three frames are
    written to ``train.csv``, ``val.csv`` and ``test.csv`` in
    ``output_dir``, which is created if it does not exist.

    Args:
        dataset_name: Identifier passed to ``load_dataset``.
        output_dir: Directory that receives the three CSV files.
        label_column: Column used for stratification and for the
            class-distribution report.
        random_state: Seed passed to both ``train_test_split`` calls so the
            splits are reproducible.

    Raises:
        KeyError: If ``label_column`` is not a column of the loaded data.
    """
    print(f"Loading dataset: {dataset_name}...")
    ds = load_dataset(dataset_name)

    df: pd.DataFrame = ds["train"].to_pandas()

    # Fail early with a descriptive message instead of a bare KeyError
    # raised from inside the class-distribution report below.
    if label_column not in df.columns:
        raise KeyError(
            f"Label column {label_column!r} not found in dataset "
            f"{dataset_name!r}; available columns: {list(df.columns)}"
        )

    print(f"Total samples: {len(df)}")
    print(f"Class distribution:\n{df[label_column].value_counts(normalize=True)}")

    # First cut: keep 80% for training, hold out 20% for val + test.
    train_df, temp_df = train_test_split(
        df, test_size=0.2, random_state=random_state, stratify=df[label_column]
    )

    # Second cut: halve the held-out 20% into validation and test (10% each).
    val_df, test_df = train_test_split(
        temp_df,
        test_size=0.5,
        random_state=random_state,
        stratify=temp_df[label_column],
    )

    print(f"Train samples: {len(train_df)}")
    print(f"Val samples: {len(val_df)}")
    print(f"Test samples: {len(test_df)}")

    os.makedirs(output_dir, exist_ok=True)

    splits = {"train": train_df, "val": val_df, "test": test_df}
    for split_name, split_df in splits.items():
        # index=False: omit the DataFrame index from the CSV output.
        split_df.to_csv(os.path.join(output_dir, f"{split_name}.csv"), index=False)
    print(f"Splits saved to {output_dir}/ folder.")
| |
|
| |
|
# Script entry point: when executed directly (not imported), build the
# splits using prepare_data's default arguments.
if __name__ == "__main__":
    prepare_data()
| |
|