From ece22dc56450723b8b6f039503e2157104b2352c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9E=97=E5=98=89=E7=83=A8?=
Date: Fri, 16 Jan 2026 19:34:23 +0800
Subject: [PATCH] Delete train.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 train.py | 90 --------------------------------------------------------------
 1 file changed, 90 deletions(-)
 delete mode 100644 train.py

diff --git a/train.py b/train.py
deleted file mode 100644
index 862a0d1..0000000
--- a/train.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import polars as pl
-import pandas as pd
-import lightgbm as lgb
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import classification_report, roc_auc_score, f1_score
-from sklearn.preprocessing import LabelEncoder, StandardScaler
-import joblib
-import logging
-import os
-from src.data import load_and_clean_data
-
-# Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-def train_models(data_path="data/bank.csv", model_dir="models"):
-    # 1. Load data
-    df_pl = load_and_clean_data(data_path)
-    df = df_pl.to_pandas()  # Convert to Pandas for sklearn compatibility
-
-    # 2. Feature preprocessing
-    # Separate categorical and numerical features
-    target_col = "target"
-    X = df.drop(columns=[target_col])
-    y = df[target_col]
-
-    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
-    num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
-
-    # Label encoding (kept simple: LightGBM handles category dtype natively, but sklearn needs numeric codes)
-    encoders = {}
-    for col in cat_cols:
-        le = LabelEncoder()
-        X[col] = le.fit_transform(X[col].astype(str))
-        encoders[col] = le
-
-    # 3. Train/test split
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-    # 4. Train baseline model (Logistic Regression)
-    logger.info("Training baseline model (Logistic Regression)...")
-    # Logistic regression needs scaled numeric features
-    scaler = StandardScaler()
-    X_train_scaled = X_train.copy()
-    X_test_scaled = X_test.copy()
-    X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols])
-    X_test_scaled[num_cols] = scaler.transform(X_test[num_cols])
-
-    lr_model = LogisticRegression(max_iter=1000, random_state=42)
-    lr_model.fit(X_train_scaled, y_train)
-
-    lr_pred = lr_model.predict(X_test_scaled)
-    lr_prob = lr_model.predict_proba(X_test_scaled)[:, 1]
-
-    logger.info(f"Baseline F1: {f1_score(y_test, lr_pred):.4f}")
-    logger.info(f"Baseline AUC: {roc_auc_score(y_test, lr_prob):.4f}")
-
-    # 5. Train advanced model (LightGBM)
-    logger.info("Training advanced model (LightGBM)...")
-    lgb_model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.05, random_state=42, verbose=-1)
-    lgb_model.fit(X_train, y_train)
-
-    lgb_pred = lgb_model.predict(X_test)
-    lgb_prob = lgb_model.predict_proba(X_test)[:, 1]
-
-    logger.info(f"LightGBM F1: {f1_score(y_test, lgb_pred):.4f}")
-    logger.info(f"LightGBM AUC: {roc_auc_score(y_test, lgb_prob):.4f}")
-
-    # 6. Save models and metadata
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-
-    artifacts = {
-        "lgb_model": lgb_model,
-        "lr_model": lr_model,
-        "scaler": scaler,
-        "encoders": encoders,
-        "features": list(X.columns),
-        "cat_cols": cat_cols,
-        "num_cols": num_cols
-    }
-
-    joblib.dump(artifacts, os.path.join(model_dir, "model_artifacts.pkl"))
-    logger.info(f"Model artifacts saved to {model_dir}/model_artifacts.pkl")
-
-    return artifacts
-
-if __name__ == "__main__":
-    train_models()