Create ts_preprocess.py
Browse files- tools/ts_preprocess.py +104 -0
tools/ts_preprocess.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# space/tools/ts_preprocess.py
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from typing import List
|
| 5 |
+
|
| 6 |
+
MONTH = "MS"  # pandas offset alias for month-start frequency; passed as `freq` to pd.date_range
|
| 7 |
+
|
| 8 |
+
def _emi(principal: float, annual_rate: float, n_months: int) -> float:
    """
    Return the equated monthly installment (EMI) with monthly compounding.

    r_m = annual_rate / 12
    EMI = P * r_m * (1+r_m)^n / ((1+r_m)^n - 1)
        = P * r_m / (1 - (1+r_m)^-n)

    The second (algebraically identical) form is the one evaluated, because
    the first is numerically fragile: for a very small positive rate
    ``(1+r_m)^n - 1`` rounds to 0 and divides by zero, and for a very long
    tenor ``(1+r_m)^n`` overflows.

    Args:
        principal: Amount borrowed.
        annual_rate: Nominal annual rate as a fraction (e.g. 0.12).
        n_months: Number of monthly installments.

    Returns:
        0.0 when ``n_months <= 0`` or ``principal <= 0``; the straight-line
        payment ``principal / n_months`` when the rate is zero or negative;
        otherwise the EMI.
    """
    # Function-scope import so this block does not rely on a file-level import.
    import math

    if n_months <= 0 or principal <= 0:
        return 0.0
    r = annual_rate / 12.0
    if r <= 0:
        # No (or negative) interest: repay the principal in equal parts.
        return principal / n_months
    # 1 - (1 + r)^-n, computed via log1p/expm1 so it stays accurate for tiny r
    # and cannot overflow for large n (the exponent is always negative).
    discount = -math.expm1(-n_months * math.log1p(r))
    return principal * r / discount
|
| 21 |
+
|
| 22 |
+
def _project_deposit(principal: float, annual_rate: float, months: int) -> pd.DataFrame:
    """
    Project a deposit's value month by month under monthly compounding.

    Step 0 carries the untouched principal; each later step multiplies the
    previous value by ``1 + annual_rate / 12``.

    Returns:
        DataFrame with one row per step in ``0..months`` and the columns
        'step' and 'portfolio_value'.
    """
    monthly_growth = 1.0 + annual_rate / 12.0
    records = []
    value = principal
    for step in range(months + 1):
        records.append(dict(step=step, portfolio_value=value))
        # Compound after recording, so the row for step k holds k growth periods.
        value = value * monthly_growth
    return pd.DataFrame(records)
|
| 33 |
+
|
| 34 |
+
def _project_asset(principal: float, annual_rate: float, tenor_months: int) -> pd.DataFrame:
    """
    Monthly amortization schedule for an asset/loan using EMI.

    One row is produced per step in ``0..tenor_months``. Each row describes
    the month that starts with the balance in 'portfolio_value': the interest
    accrued on it, the principal repaid, and the balance left afterwards.

    Columns:
        step: 0-based month index.
        portfolio_value: Balance at the start of the step.
        emi: Scheduled installment from ``_emi`` (constant on every row).
        interest_component: Opening balance times ``annual_rate / 12``.
        principal_component: Part of the installment that reduces the
            balance; never negative and never more than the opening balance.
        remaining_balance: Balance after the step's repayment, floored at 0.
    """
    installment = _emi(principal, annual_rate, tenor_months)
    monthly_rate = annual_rate / 12.0
    rows = []
    opening = principal
    for step in range(tenor_months + 1):
        interest = opening * monthly_rate
        # Cap the repayment at what is still owed. Without the cap, the
        # terminal row (opening balance already ~0 after `tenor_months`
        # installments) would report a full installment of principal repaid.
        principal_paid = min(max(0.0, installment - interest), max(0.0, opening))
        closing = max(0.0, opening - principal_paid)
        rows.append({
            "step": step,
            "portfolio_value": opening,
            "emi": installment,
            "interest_component": interest,
            "principal_component": principal_paid,
            "remaining_balance": closing,
        })
        opening = closing
    return pd.DataFrame(rows)
|
| 56 |
+
|
| 57 |
+
def build_timeseries(df: pd.DataFrame) -> pd.DataFrame:
    """
    Expand every portfolio row into a monthly projection and stack the results.

    Input df columns (example):
    - portfolio_date (datetime or str); only read when no 'timestamp'
      column is present
    - instrument_type: 'Deposit' or 'Asset' (compared case-insensitively,
      surrounding whitespace ignored); any other value is kept flat
    - balance: float
    - interest_rate: annual rate (e.g., 0.12)
    - time_to_maturity: months (int)
    - tenor_months: months (for Assets; if missing, fallback to time_to_maturity)

    Numeric fields that are absent, None/NaN, or falsy fall back to 0
    (tenor_months falls back to time_to_maturity).

    Output:
    Long time-series with monthly timestamps, projected 'portfolio_value'
    (and EMI breakdown for Assets), plus 'origin_portfolio_date' and
    'origin_balance' identifying the source row. Rows are sorted by
    timestamp, then by instrument_type when that column exists. An input
    with no rows yields an empty frame with the base output columns.
    """
    df = df.copy()
    if "timestamp" not in df.columns:
        df["timestamp"] = pd.to_datetime(df["portfolio_date"])

    # Identifier columns copied onto every projected row, when available.
    carried = [col for col in ("instrument_type", "interest_rate") if col in df.columns]

    out_frames: List[pd.DataFrame] = []
    for _, row in df.iterrows():
        itype = str(row.get("instrument_type", "")).strip().lower()
        start = pd.to_datetime(row["timestamp"])
        # A plain `x or 0` does not catch NaN (NaN is truthy), which made
        # int(NaN) raise for rows with missing values; _value_or handles it.
        months = int(_value_or(row.get("time_to_maturity"), 0))
        principal = float(_value_or(row.get("balance"), 0.0))
        annual_rate = float(_value_or(row.get("interest_rate"), 0.0))

        if itype == "deposit":
            sched = _project_deposit(principal, annual_rate, months)
        elif itype == "asset":
            tenor = int(_value_or(row.get("tenor_months"), months))
            sched = _project_asset(principal, annual_rate, tenor)
        else:
            # unknown types: keep flat
            sched = pd.DataFrame({"step": range(months + 1), "portfolio_value": principal})

        # Build timestamps: month-start frequency.
        # NOTE(review): with an anchored frequency, date_range starts at the
        # first month start on or after `start`, so a mid-month origin date
        # places step 0 at the following month start - confirm this is intended.
        sched["timestamp"] = pd.date_range(start=start, periods=len(sched), freq=MONTH)
        # Carry identifiers
        for col in carried:
            sched[col] = row.get(col)
        sched["origin_portfolio_date"] = start
        sched["origin_balance"] = principal

        out_frames.append(sched)

    if not out_frames:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(
            columns=[
                "step",
                "portfolio_value",
                "timestamp",
                *carried,
                "origin_portfolio_date",
                "origin_balance",
            ]
        )

    ts = pd.concat(out_frames, ignore_index=True)
    # instrument_type is optional on input, so only sort by it when present.
    sort_keys = ["timestamp"]
    if "instrument_type" in ts.columns:
        sort_keys.append("instrument_type")
    ts = ts.sort_values(sort_keys).reset_index(drop=True)
    return ts


def _value_or(value, default):
    """Return *default* when scalar *value* is missing (None/NaN/NaT/NA) or falsy, else *value*."""
    if pd.isna(value) or not value:
        return default
    return value
|