phish / data_loader.py
ggdpx's picture
Upload folder using huggingface_hub
0e038ee verified
import os
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd
def prepare_data(
    dataset_name: str = "David-Egea/phishing-texts",
    *,
    output_dir: str = "data",
    label_column: str = "phishing",
    seed: int = 42,
) -> None:
    """Load a dataset and write stratified train/val/test CSV splits.

    The ``train`` split of *dataset_name* is converted to a pandas DataFrame
    and divided 80/10/10 into train, validation and test sets, stratifying
    both splits on *label_column*. The frames are written to ``train.csv``,
    ``val.csv`` and ``test.csv`` inside *output_dir*.

    Args:
        dataset_name: Identifier passed to ``load_dataset``.
        output_dir: Directory receiving the CSV files; created if missing.
        label_column: Column used for the class report and for stratification.
        seed: ``random_state`` used for both splits so reruns are reproducible.

    Raises:
        KeyError: If the loaded dataset has no ``train`` split, or the
            resulting frame has no *label_column* column.
    """
    print(f"Loading dataset: {dataset_name}...")

    # No split is requested, so the loader returns a mapping of split names;
    # this script only knows how to work from the 'train' entry.
    ds = load_dataset(dataset_name)
    if "train" not in ds:
        raise KeyError(
            f"Dataset {dataset_name!r} has no 'train' split; "
            f"available splits: {list(ds)}"
        )

    # Convert to pandas for easier manipulation/splitting
    df: pd.DataFrame = ds["train"].to_pandas()  # type: ignore
    if label_column not in df.columns:
        raise KeyError(
            f"Column {label_column!r} not found in dataset; "
            f"available columns: {list(df.columns)}"
        )

    print(f"Total samples: {len(df)}")
    print(f"Class distribution:\n{df[label_column].value_counts(normalize=True)}")

    # 80% Train, 20% Temp (Val + Test)
    train_df, temp_df = train_test_split(
        df, test_size=0.2, random_state=seed, stratify=df[label_column]
    )

    # Split temp into 50% Val, 50% Test (results in 10% each of total)
    val_df, test_df = train_test_split(
        temp_df, test_size=0.5, random_state=seed, stratify=temp_df[label_column]
    )

    print(f"Train samples: {len(train_df)}")
    print(f"Val samples: {len(val_df)}")
    print(f"Test samples: {len(test_df)}")

    # Ensure the output directory exists before writing into it
    os.makedirs(output_dir, exist_ok=True)

    # Save splits; the row index is dropped so the CSVs hold only data columns
    splits = {"train": train_df, "val": val_df, "test": test_df}
    for split_name, frame in splits.items():
        frame.to_csv(os.path.join(output_dir, f"{split_name}.csv"), index=False)

    print(f"Splits saved to {output_dir}/ folder.")
# Script entry point: when executed directly (not imported), build the
# splits using prepare_data's default arguments.
if __name__ == "__main__":
    prepare_data()