import os

from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split


def prepare_data(
    dataset_name: str = "David-Egea/phishing-texts",
    *,
    output_dir: str = "data",
    label_column: str = "phishing",
    seed: int = 42,
) -> None:
    """Load a dataset, split it 80/10/10 with stratification, and save CSVs.

    The ``"train"`` split of the dataset returned by ``load_dataset`` is
    converted to a pandas DataFrame, split into train / validation / test
    partitions stratified on ``label_column``, and written to
    ``train.csv``, ``val.csv`` and ``test.csv`` inside ``output_dir``.
    Progress and split sizes are printed to stdout.

    Args:
        dataset_name: Identifier passed straight to ``load_dataset``.
        output_dir: Directory the CSV files are written to; created if it
            does not exist.
        label_column: Column used both for the class-distribution report and
            for stratifying the two splits.
        seed: ``random_state`` given to both ``train_test_split`` calls so
            the partitions are reproducible.

    Raises:
        KeyError: If the loaded dataset has no ``"train"`` split, or if
            ``label_column`` is not a column of the resulting DataFrame.
    """
    print(f"Loading dataset: {dataset_name}...")
    # The dataset usually loads into a 'train' split if not specified
    ds = load_dataset(dataset_name)

    # Convert to pandas for easier manipulation/splitting
    df: pd.DataFrame = ds["train"].to_pandas()  # type: ignore

    # Fail early with a readable message instead of a bare KeyError raised
    # from deep inside pandas once the splitting has already started.
    if label_column not in df.columns:
        raise KeyError(
            f"Label column {label_column!r} not found in dataset "
            f"{dataset_name!r}; available columns: {list(df.columns)}"
        )

    print(f"Total samples: {len(df)}")
    print(
        f"Class distribution:\n{df[label_column].value_counts(normalize=True)}"
    )

    # 80% Train, 20% Temp (Val + Test)
    train_df, temp_df = train_test_split(
        df, test_size=0.2, random_state=seed, stratify=df[label_column]
    )

    # Split temp into 50% Val, 50% Test (results in 10% each of total)
    val_df, test_df = train_test_split(
        temp_df, test_size=0.5, random_state=seed, stratify=temp_df[label_column]
    )

    print(f"Train samples: {len(train_df)}")
    print(f"Val samples: {len(val_df)}")
    print(f"Test samples: {len(test_df)}")

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Save splits (index dropped so the CSVs contain only dataset columns)
    splits = {"train": train_df, "val": val_df, "test": test_df}
    for split_name, split_df in splits.items():
        split_df.to_csv(os.path.join(output_dir, f"{split_name}.csv"), index=False)

    print(f"Splits saved to {output_dir}/ folder.")


if __name__ == "__main__":
    prepare_data()