""" EDA helpers for Streamlit: Stage 1 (sensors + labels) and Stage 2 (IMS + merged). """ from __future__ import annotations import sys from pathlib import Path PROJECT_ROOT = Path(__file__).resolve().parent.parent if str(PROJECT_ROOT) not in sys.path: sys.path.insert(0, str(PROJECT_ROOT)) import pandas as pd import numpy as np def get_stage1_eda(): """Load Stage 1 data and return dict with summary, labels df, optional sensor sample for plots.""" from config import settings out = {"labels": None, "labels_stats": None, "sensor_sample": None, "error": None} labels_path = settings.PROCESSED_DIR / "stage1_labels.csv" if not labels_path.exists(): out["error"] = "Stage 1 labels not found. Run Stage 1 first." return out labels = pd.read_csv(labels_path, index_col=0, parse_dates=True) labels.index = pd.to_datetime(labels.index, utc=True) out["labels"] = labels out["labels_stats"] = { "count": len(labels), "date_min": labels.index.min(), "date_max": labels.index.max(), "A_mean": labels.iloc[:, 0].mean(), "A_std": labels.iloc[:, 0].std(), "A_min": labels.iloc[:, 0].min(), "A_max": labels.iloc[:, 0].max(), } # Optional: load a sample of sensor data for PAR/T (limit rows for speed) sensor_path = settings.SENSORS_WIDE_PATH if not sensor_path.exists(): sensor_path = settings.SENSORS_WIDE_SAMPLE_PATH if sensor_path.exists(): try: cols = ["time", "Air1_PAR_ref", "Air1_leafTemperature_ref", "Air1_airTemperature_ref", "Air1_CO2_ref", "Air1_VPD_ref"] sensor = pd.read_csv(sensor_path, usecols=lambda c: c in cols, nrows=50000) if "time" in sensor.columns: sensor["time"] = pd.to_datetime(sensor["time"], utc=True) sensor = sensor[sensor["Air1_PAR_ref"] > 50] out["sensor_sample"] = sensor except Exception: out["sensor_sample"] = None return out def get_stage2_eda(): """Load IMS + labels, merge, return merged df and summary for EDA.""" from config import settings from src.ims_client import IMSClient from src.preprocessor import Preprocessor out = {"merged": None, "ims": None, "labels": None, "stats": None, "error": None} labels_path = settings.PROCESSED_DIR / "stage1_labels.csv" if not labels_path.exists(): out["error"] = "Stage 1 labels not found." return out labels = pd.read_csv(labels_path, index_col=0, parse_dates=True) labels.index = pd.to_datetime(labels.index, utc=True) labels = labels.iloc[:, 0] client = IMSClient() ims = client.load_cached() if ims.empty: out["error"] = "IMS cache not found. Run download_ims_data first." return out preproc = Preprocessor() merged = preproc.merge_ims_with_labels(ims, labels, timestamp_index_labels=True) if merged.empty: out["error"] = "No overlap between IMS and labels." return out merged = preproc.create_time_features(merged) out["merged"] = merged out["ims"] = ims out["labels"] = labels out["stats"] = { "ims_rows": len(ims), "ims_date_min": pd.to_datetime(ims["timestamp_utc"]).min(), "ims_date_max": pd.to_datetime(ims["timestamp_utc"]).max(), "merged_rows": len(merged), "feature_cols": [c for c in merged.select_dtypes(include=[np.number]).columns if c not in ("A",)], } return out