# Model training and evaluation for Telco customer churn prediction.
# Standard library.
import os

# Third-party ML stack.
import joblib
import lightgbm as lgb
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (f1_score, roc_auc_score, accuracy_score,
                             precision_score, recall_score, classification_report)
from sklearn.model_selection import train_test_split, GridSearchCV

# Project-local data pipeline.
from data_processing import data_processing_pipeline, preprocess_data
# Model training and evaluation.
class ModelTrainer:
    """Train, evaluate, persist, and compare binary classifiers.

    Fitted estimators are kept in ``self.models`` and their evaluation
    metrics in ``self.metrics``, both keyed by model name (``"logreg"``,
    ``"lightgbm"``). Each trained model is also persisted with joblib to
    ``models/<name>_model.pkl``.
    """

    def __init__(self):
        # name -> fitted estimator
        self.models = {}
        # name -> metrics dict (see calculate_metrics)
        self.metrics = {}

        # Ensure the output directory for persisted models exists.
        os.makedirs("models", exist_ok=True)

    def _split_data(self, X, y):
        """Stratified 80/20 train/test split with a fixed seed.

        Shared by both trainers so every model is evaluated under the
        same split recipe.
        """
        return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Train a Logistic Regression model.
    def train_logreg(self, X, y):
        """Tune and train a Logistic Regression classifier.

        Runs 5-fold GridSearchCV (scored by F1) over the regularization
        strength, evaluates the best estimator on a held-out test split,
        persists it to ``models/logreg_model.pkl``, and records it under
        the key ``"logreg"``.

        Args:
            X: feature matrix.
            y: binary target vector.

        Returns:
            Tuple of (best fitted estimator, metrics dict).
        """
        print("训练Logistic Regression模型...")

        # Held-out evaluation split.
        X_train, X_test, y_train, y_test = self._split_data(X, y)

        # Only C is genuinely tuned; max_iter/solver are pinned so every
        # candidate converges with the same optimizer.
        param_grid = {
            'C': [0.01, 0.1, 1.0, 10.0, 100.0],
            'max_iter': [1000],
            'solver': ['lbfgs']
        }

        # Hyperparameter search with cross-validation.
        logreg = LogisticRegression(random_state=42)
        grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid,
                                   cv=5, scoring='f1', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_logreg = grid_search.best_estimator_

        # Evaluate on the held-out split.
        y_pred = best_logreg.predict(X_test)
        y_pred_proba = best_logreg.predict_proba(X_test)[:, 1]
        metrics = self.calculate_metrics(y_test, y_pred, y_pred_proba)

        # Persist and register the fitted model.
        joblib.dump(best_logreg, "models/logreg_model.pkl")
        self.models["logreg"] = best_logreg
        self.metrics["logreg"] = metrics

        print("Logistic Regression模型训练完成!")
        print(f"最佳参数: {grid_search.best_params_}")
        print(f"F1分数: {metrics['f1']:.4f}")
        print(f"ROC-AUC: {metrics['roc_auc']:.4f}")

        return best_logreg, metrics

    # Train a LightGBM model.
    def train_lightgbm(self, X, y):
        """Train a LightGBM classifier with fixed hyperparameters.

        No hyperparameter search is performed (deliberately simplified).
        The fitted model is evaluated on a held-out split, persisted to
        ``models/lightgbm_model.pkl``, and recorded under ``"lightgbm"``.

        Args:
            X: feature matrix.
            y: binary target vector.

        Returns:
            Tuple of (fitted estimator, metrics dict).
        """
        print("\n训练LightGBM模型...")

        # Held-out evaluation split (same recipe as train_logreg).
        X_train, X_test, y_train, y_test = self._split_data(X, y)

        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting_type': 'gbdt',
            'random_state': 42,
            'n_jobs': -1,
            'verbose': -1  # suppress LightGBM's per-iteration logging
        }

        # Use the sklearn-style wrapper via the module already imported at
        # file top (the original re-imported LGBMClassifier locally).
        lgbm = lgb.LGBMClassifier(**params)
        lgbm.fit(X_train, y_train)  # fit() returns self; no separate alias needed

        # Evaluate on the held-out split.
        y_pred_proba = lgbm.predict_proba(X_test)[:, 1]
        y_pred = lgbm.predict(X_test)
        metrics = self.calculate_metrics(y_test, y_pred, y_pred_proba)

        # Persist and register the fitted model.
        joblib.dump(lgbm, "models/lightgbm_model.pkl")
        self.models["lightgbm"] = lgbm
        self.metrics["lightgbm"] = metrics

        print("LightGBM模型训练完成!")
        print(f"F1分数: {metrics['f1']:.4f}")
        print(f"ROC-AUC: {metrics['roc_auc']:.4f}")

        return lgbm, metrics

    # Compute model evaluation metrics.
    def calculate_metrics(self, y_true, y_pred, y_pred_proba):
        """Return standard binary-classification metrics as a dict.

        Args:
            y_true: ground-truth labels.
            y_pred: hard class predictions.
            y_pred_proba: predicted probability of the positive class
                (used only for ROC-AUC).
        """
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'f1': f1_score(y_true, y_pred),
            'roc_auc': roc_auc_score(y_true, y_pred_proba)
        }

    # Compare model performance.
    def compare_models(self):
        """Print a metrics summary per trained model and pick the best.

        Returns:
            The name (key in ``self.metrics``) of the model with the
            highest F1 score.

        Raises:
            ValueError: if no model has been trained yet (previously this
                surfaced as an opaque ``max()`` error).
        """
        if not self.metrics:
            raise ValueError("No trained models to compare. Train a model first.")

        print("\n" + "="*50)
        print("模型性能对比")
        print("="*50)

        for model_name, metrics in self.metrics.items():
            print(f"\n{model_name.upper()} 性能:")
            print(f"  Accuracy: {metrics['accuracy']:.4f}")
            print(f"  Precision: {metrics['precision']:.4f}")
            print(f"  Recall: {metrics['recall']:.4f}")
            print(f"  F1 Score: {metrics['f1']:.4f}")
            print(f"  ROC-AUC: {metrics['roc_auc']:.4f}")

        # Best model = highest F1 on the held-out split.
        best_model = max(self.metrics, key=lambda name: self.metrics[name]['f1'])
        print(f"\n最佳模型: {best_model.upper()}")
        print(f"最佳F1分数: {self.metrics[best_model]['f1']:.4f}")

        return best_model

    # Load a model and predict.
    def predict(self, model_name, X):
        """Predict with a trained model, lazily loading it from disk.

        If the model is not in memory, attempts to load it from
        ``models/<model_name>_model.pkl`` and caches it.

        Args:
            model_name: key such as ``"logreg"`` or ``"lightgbm"``.
            X: feature matrix to score.

        Returns:
            Tuple of (hard predictions at a 0.5 threshold, positive-class
            probabilities).

        Raises:
            ValueError: if the model is neither in memory nor on disk.
        """
        if model_name not in self.models:
            # Fall back to the persisted copy.
            try:
                model = joblib.load(f"models/{model_name}_model.pkl")
                self.models[model_name] = model
            except FileNotFoundError as err:
                # Chain the cause so the missing-file detail is preserved.
                raise ValueError(f"Model {model_name} not found. Please train the model first.") from err

        model = self.models[model_name]
        y_pred_proba = model.predict_proba(X)[:, 1]
        # Threshold probabilities explicitly so both outputs come from the
        # same probability vector.
        y_pred = (y_pred_proba >= 0.5).astype(int)

        return y_pred, y_pred_proba
# Script entry point: process data, train both models, compare, and check
# whether the best model meets the performance requirement.
def _main():
    # 1. Data processing.
    print("正在处理数据...")
    features, target, frame = data_processing_pipeline("data/Telco-Customer-Churn.csv")
    feature_arr, target_arr = preprocess_data(features, target)

    # 2. Model training and evaluation.
    model_trainer = ModelTrainer()
    model_trainer.train_logreg(feature_arr, target_arr)
    model_trainer.train_lightgbm(feature_arr, target_arr)

    # Side-by-side comparison; returns the name of the best model by F1.
    winner = model_trainer.compare_models()

    # 3. Requirement check against the project's acceptance thresholds.
    print("\n" + "="*50)
    print("模型性能要求检查")
    print("="*50)

    best_f1 = model_trainer.metrics[winner]['f1']
    best_roc_auc = model_trainer.metrics[winner]['roc_auc']

    # Passing criterion: F1 >= 0.70 OR ROC-AUC >= 0.75.
    if best_f1 >= 0.70 or best_roc_auc >= 0.75:
        print(f"✓ 模型性能达标!最佳F1: {best_f1:.4f}, 最佳ROC-AUC: {best_roc_auc:.4f}")
        print("✓ 满足F1 ≥ 0.70 或 ROC-AUC ≥ 0.75 的要求")
    else:
        print(f"✗ 模型性能未达标!最佳F1: {best_f1:.4f}, 最佳ROC-AUC: {best_roc_auc:.4f}")
        print("✗ 未满足F1 ≥ 0.70 或 ROC-AUC ≥ 0.75 的要求")


if __name__ == "__main__":
    _main()