| | import pandas as pd |
| | import numpy as np |
| | import pickle |
| | import json |
| | import argparse |
| | from pathlib import Path |
| | import sys |
| |
|
| | from feature_engineering import FeatureEngineer |
| | from sklearn.model_selection import train_test_split |
| | from sklearn.preprocessing import LabelEncoder |
| | from sklearn.metrics import (accuracy_score, precision_score, recall_score, |
| | f1_score, roc_auc_score, confusion_matrix) |
| | import xgboost as xgb |
| | import lightgbm as lgb |
| | from imblearn.over_sampling import SMOTE |
| |
|
| |
|
def load_data(data_dir):
    """Read the three raw CSV inputs (store master, monthly usage, monthly customers).

    Files are cp949-encoded; malformed rows are skipped rather than aborting
    the whole load.

    Args:
        data_dir: directory containing the three CSV files.

    Returns:
        Tuple of DataFrames: (df_store, df_usage, df_customer).
    """
    print("๋ฐ์ดํฐ ๋ก๋ ์ค...")

    base = Path(data_dir)
    read_opts = {'encoding': 'cp949', 'on_bad_lines': 'skip'}

    df_store = pd.read_csv(base / 'big_data_set1_f.csv', **read_opts)
    df_usage = pd.read_csv(base / 'ds2_monthly_usage.csv', **read_opts)
    df_customer = pd.read_csv(base / 'ds3_monthly_customers.csv', **read_opts)

    print(f"๋งค์ฅ ์ ๋ณด: {df_store.shape}")
    print(f"์ด์ฉ ๋ฐ์ดํฐ: {df_usage.shape}")
    print(f"๊ณ ๊ฐ ๋ฐ์ดํฐ: {df_customer.shape}")

    return df_store, df_usage, df_customer
| |
|
| |
|
def create_features(df_store, df_usage, df_customer, max_stores=None):
    """Build the model feature matrix and closure target, one row per store.

    Only stores with at least 3 months of usage history are included.
    The target is 1 when the store has a closure date (MCT_ME_D), else 0.

    Args:
        df_store: store master table (one row per ENCODED_MCT).
        df_usage: monthly usage records keyed by ENCODED_MCT.
        df_customer: monthly customer records keyed by ENCODED_MCT.
        max_stores: optional cap on the number of stores processed (quick runs).

    Returns:
        Tuple (X, y): feature DataFrame and binary target Series.

    Raises:
        ValueError: if no store has enough usage history to build features.
    """
    print("\nํน์ง ์์ฑ ์ค...")

    engineer = FeatureEngineer(include_weather=False)

    all_features = []
    all_targets = []

    store_ids = df_store['ENCODED_MCT'].unique()
    if max_stores:
        store_ids = store_ids[:max_stores]

    for idx, store_id in enumerate(store_ids):
        store_info = df_store[df_store['ENCODED_MCT'] == store_id].iloc[0]
        usage_data = df_usage[df_usage['ENCODED_MCT'] == store_id]
        customer_data = df_customer[df_customer['ENCODED_MCT'] == store_id]

        # Require a minimum usage history so trend features are meaningful.
        if len(usage_data) >= 3:
            store_data = {
                'industry': store_info['HPSN_MCT_BZN_CD_NM'] if pd.notna(store_info['HPSN_MCT_BZN_CD_NM']) else '๊ธฐํ',
                'location': store_info['MCT_SIGUNGU_NM']
            }

            features = engineer.create_features(store_data, usage_data, customer_data)
            # A non-null closure date marks the store as closed (positive class).
            target = 1 if pd.notna(store_info['MCT_ME_D']) else 0

            all_features.append(features)
            all_targets.append(target)

        if (idx + 1) % 500 == 0:
            print(f" ์ฒ๋ฆฌ ์ค... {idx + 1}/{len(store_ids)}")

    # Fix: pd.concat([]) raises an opaque "No objects to concatenate" error;
    # fail fast with an actionable message instead.
    if not all_features:
        raise ValueError("No store has >= 3 months of usage data; cannot build features.")

    X = pd.concat(all_features, ignore_index=True)
    y = pd.Series(all_targets)

    print(f"์ด ์ํ: {len(X)}, ํน์ง ์: {X.shape[1]}")
    print(f"ํ์ ๋น์จ: {y.mean():.2%} ({y.sum()}๊ฐ)")

    return X, y
| |
|
| |
|
def preprocess_data(X, y):
    """Encode categoricals, impute missing values, and split train/test.

    Fixes vs. the previous version:
    - works on a copy so the caller's DataFrame is not mutated in place;
    - median imputation restricted to numeric columns (``DataFrame.median``
      raises on object columns in recent pandas);
    - medians are computed on the TRAIN split only and then applied to both
      splits, removing test-set leakage from the imputation statistics.

    Args:
        X: feature DataFrame (may contain a 'context_industry' string column).
        y: binary target Series aligned with X.

    Returns:
        (X_train, X_test, y_train, y_test, label_encoders) where
        label_encoders maps column name -> fitted LabelEncoder.
    """
    print("\n๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ์ค...")

    X = X.copy()

    label_encoders = {}
    if 'context_industry' in X.columns:
        le = LabelEncoder()
        X['context_industry'] = le.fit_transform(X['context_industry'].astype(str))
        label_encoders['context_industry'] = le

    # Stratified split keeps the class ratio comparable across splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )

    # Impute with train-set medians only (no leakage from the test split).
    train_medians = X_train.median(numeric_only=True)
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)

    print(f"Train: {X_train.shape}, Test: {X_test.shape}")
    print(f"Train ํ์: {y_train.mean():.2%}, Test ํ์: {y_test.mean():.2%}")

    return X_train, X_test, y_train, y_test, label_encoders
| |
|
| |
|
def apply_smote(X_train, y_train):
    """Oversample the minority class with SMOTE to balance the training set.

    ``k_neighbors`` is clamped to (minority size - 1) because SMOTE needs that
    many same-class neighbors for each synthetic sample.

    Args:
        X_train: training features.
        y_train: binary training labels (Series of 0/1).

    Returns:
        (X_train_balanced, y_train_balanced); the inputs are returned
        unchanged when the minority class is too small for SMOTE to run.
    """
    print("\nํด๋์ค ๋ถ๊ท ํ ์ฒ๋ฆฌ(SMOTE)...")

    min_samples = min(y_train.sum(), len(y_train) - y_train.sum())
    k_neighbors = min(5, min_samples - 1)

    # Fix: with <= 1 minority samples k_neighbors becomes 0 or negative and
    # SMOTE raises; fall back to the unbalanced data instead of crashing.
    if k_neighbors < 1:
        print("SMOTE skipped: minority class too small")
        return X_train, y_train

    smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

    print(f"SMOTE ํ: ์์ {(y_train_balanced == 0).sum()}๊ฐ, ํ์ {(y_train_balanced == 1).sum()}๊ฐ")

    return X_train_balanced, y_train_balanced
| |
|
| |
|
def train_models(X_train, y_train):
    """Fit the two gradient-boosting classifiers used by the ensemble.

    Both models share the same depth / learning-rate / tree-count settings;
    only library-specific options differ.

    Args:
        X_train: (balanced) training features.
        y_train: training labels.

    Returns:
        Tuple (xgb_model, lgb_model), both fitted.
    """
    print("\n๋ชจ๋ธ ํ์ต ์ค...")

    shared_params = dict(
        max_depth=6,
        learning_rate=0.1,
        n_estimators=200,
        random_state=42,
    )

    print(" - XGBoost ํ์ต...")
    xgb_model = xgb.XGBClassifier(eval_metric='logloss', **shared_params)
    xgb_model.fit(X_train, y_train)

    print(" - LightGBM ํ์ต...")
    lgb_model = lgb.LGBMClassifier(verbose=-1, **shared_params)
    lgb_model.fit(X_train, y_train)

    print("๋ชจ๋ธ ํ์ต ์๋ฃ")

    return xgb_model, lgb_model
| |
|
| |
|
def evaluate_models(xgb_model, lgb_model, X_test, y_test):
    """Score the 50/50 probability ensemble of both models on the test set.

    Prints accuracy / precision / recall / F1 / AUC and the confusion matrix,
    then returns the metrics as plain floats (JSON-serializable).

    Args:
        xgb_model: fitted XGBoost classifier.
        lgb_model: fitted LightGBM classifier.
        X_test: held-out features.
        y_test: held-out binary labels.

    Returns:
        Dict with keys accuracy, precision, recall, f1_score, auc_roc.
    """
    print("\n๋ชจ๋ธ ํ๊ฐ ์ค...")

    # Positive-class probabilities from each model, blended half-and-half.
    proba_xgb = xgb_model.predict_proba(X_test)[:, 1]
    proba_lgb = lgb_model.predict_proba(X_test)[:, 1]
    blended = 0.5 * proba_xgb + 0.5 * proba_lgb
    predicted = (blended > 0.5).astype(int)

    metrics = {
        'accuracy': float(accuracy_score(y_test, predicted)),
        'precision': float(precision_score(y_test, predicted, zero_division=0)),
        'recall': float(recall_score(y_test, predicted, zero_division=0)),
        'f1_score': float(f1_score(y_test, predicted, zero_division=0)),
        'auc_roc': float(roc_auc_score(y_test, blended)),
    }

    banner = "=" * 70
    print("\n" + banner)
    print("๋ชจ๋ธ ์ฑ๋ฅ (Test Set)")
    print(banner)
    print(f"Accuracy: {metrics['accuracy']:.4f} ({metrics['accuracy'] * 100:.1f}%)")
    print(f"Precision: {metrics['precision']:.4f} ({metrics['precision'] * 100:.1f}%)")
    print(f"Recall: {metrics['recall']:.4f} ({metrics['recall'] * 100:.1f}%)")
    print(f"F1-Score: {metrics['f1_score']:.4f}")
    print(f"AUC-ROC: {metrics['auc_roc']:.4f}")
    print(banner)

    cm = confusion_matrix(y_test, predicted)
    print(f"\nํผ๋ ํ๋ ฌ:")
    print(f" TN: {cm[0, 0]}, FP: {cm[0, 1]}")
    print(f" FN: {cm[1, 0]}, TP: {cm[1, 1]}")

    return metrics
| |
|
| |
|
def save_models(xgb_model, lgb_model, X, label_encoders, performance, output_dir):
    """Persist the trained models plus the metadata needed for inference.

    Writes the pickled models and label encoders, the ordered feature-name
    list, and a config.json describing ensemble weights, decision threshold,
    and measured performance.

    Args:
        xgb_model: fitted XGBoost classifier (picklable).
        lgb_model: fitted LightGBM classifier (picklable).
        X: feature DataFrame; only its column order is recorded.
        label_encoders: dict of fitted LabelEncoders keyed by column name.
        performance: metrics dict as returned by evaluate_models.
        output_dir: destination directory (created if missing).
    """
    print(f"\n๋ชจ๋ธ ์ ์ฅ ์ค... ({output_dir})")

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Pickle each binary artifact under its canonical file name.
    artifacts = {
        'xgboost_model.pkl': xgb_model,
        'lightgbm_model.pkl': lgb_model,
        'label_encoders.pkl': label_encoders,
    }
    for filename, obj in artifacts.items():
        with open(output_path / filename, 'wb') as f:
            pickle.dump(obj, f)

    feature_names = list(X.columns)
    (output_path / 'feature_names.json').write_text(
        json.dumps(feature_names, ensure_ascii=False, indent=2), encoding='utf-8'
    )

    config = {
        'model_version': '2.0',
        'ensemble_weights': [0.5, 0.5],
        'threshold': 0.5,
        'n_features': len(feature_names),
        'performance': performance
    }
    (output_path / 'config.json').write_text(
        json.dumps(config, ensure_ascii=False, indent=2), encoding='utf-8'
    )

    print("๋ชจ๋ธ ์ ์ฅ ์๋ฃ")
    print(f" - {output_path / 'xgboost_model.pkl'}")
    print(f" - {output_path / 'lightgbm_model.pkl'}")
    print(f" - {output_path / 'config.json'}")
|
| |
|
def main():
    """CLI entry point: load data, build features, train, evaluate, save."""
    parser = argparse.ArgumentParser(description='์์์ ์กฐ๊ธฐ๊ฒฝ๋ณด ๋ชจ๋ธ ํ์ต')
    parser.add_argument('--data', type=str, default='data/raw',
                        help='๋ฐ์ดํฐ ๋๋ ํ ๋ฆฌ ๊ฒฝ๋ก')
    parser.add_argument('--output', type=str, default='models',
                        help='๋ชจ๋ธ ์ ์ฅ ๊ฒฝ๋ก')
    parser.add_argument('--max-stores', type=int, default=None,
                        help='์ต๋ ๋งค์ฅ ์ (ํ์คํธ์ฉ)')
    args = parser.parse_args()

    banner = "=" * 70
    print(banner)
    print("์์์ ์กฐ๊ธฐ๊ฒฝ๋ณด ๋ชจ๋ธ v2.0 ํ์ต")
    print(banner)

    # Pipeline: load -> features -> split -> balance -> train -> evaluate -> save.
    df_store, df_usage, df_customer = load_data(args.data)
    X, y = create_features(df_store, df_usage, df_customer, args.max_stores)
    X_train, X_test, y_train, y_test, label_encoders = preprocess_data(X, y)
    X_balanced, y_balanced = apply_smote(X_train, y_train)
    xgb_model, lgb_model = train_models(X_balanced, y_balanced)
    performance = evaluate_models(xgb_model, lgb_model, X_test, y_test)
    save_models(xgb_model, lgb_model, X, label_encoders, performance, args.output)

    print("\n" + banner)
    print("ํ์ต ์๋ฃ!")
    print(banner)


if __name__ == "__main__":
    main()
| |
|