Add a complete customer churn prediction system, covering data processing, model training, prediction, and action recommendations. It consists of the following modules:
1. Data preprocessing pipeline (Polars + Pandera)
2. Machine learning model training (LightGBM + Logistic Regression)
3. AI agent prediction and recommendation tools
4. Interactive Streamlit web UI
5. Complete course design report
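The training module shown below persists both models under models/. As a minimal sketch of how the downstream modules (agent tools, Streamlit UI) could reuse those artifacts, assuming only the file paths written by this script (X_new is a hypothetical, already-preprocessed feature matrix):

    import joblib

    # Load the LightGBM artifact persisted by ModelTrainer (path taken from this file)
    model = joblib.load("models/lightgbm_model.pkl")
    churn_proba = model.predict_proba(X_new)[:, 1]  # X_new: placeholder for preprocessed features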
189 lines · 6.5 KiB · Python
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (f1_score, roc_auc_score, accuracy_score,
                             precision_score, recall_score, classification_report)
import lightgbm as lgb
import numpy as np
import joblib
import os

from data_processing import data_processing_pipeline, preprocess_data

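# NOTE (assumed interface, inferred from how these helpers are called in the
# main block below): data_processing_pipeline() returns the feature frame X,
# the target y, and the full dataframe df, while preprocess_data() converts
# them into the numeric arrays fed to the sklearn/LightGBM models.
# See data_processing.py for the authoritative signatures.
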
# Model training and evaluation class
class ModelTrainer:
    def __init__(self):
        self.models = {}
        self.metrics = {}

        # Create the models directory if it does not already exist
        os.makedirs("models", exist_ok=True)

    # Train a Logistic Regression model
    def train_logreg(self, X, y):
        print("Training Logistic Regression model...")

        # Split into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Hyperparameter grid
        param_grid = {
            'C': [0.01, 0.1, 1.0, 10.0, 100.0],
            'max_iter': [1000],
            'solver': ['lbfgs']
        }

        # Tune hyperparameters with GridSearchCV
        logreg = LogisticRegression(random_state=42)
        grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid,
                                   cv=5, scoring='f1', n_jobs=-1)

        grid_search.fit(X_train, y_train)
        best_logreg = grid_search.best_estimator_

        # Evaluate the model
        y_pred = best_logreg.predict(X_test)
        y_pred_proba = best_logreg.predict_proba(X_test)[:, 1]

        metrics = self.calculate_metrics(y_test, y_pred, y_pred_proba)

        # Save the model
        joblib.dump(best_logreg, "models/logreg_model.pkl")

        self.models["logreg"] = best_logreg
        self.metrics["logreg"] = metrics

        print("Logistic Regression training complete!")
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"F1 score: {metrics['f1']:.4f}")
        print(f"ROC-AUC: {metrics['roc_auc']:.4f}")

        return best_logreg, metrics

    # Train a LightGBM model
    def train_lightgbm(self, X, y):
        print("\nTraining LightGBM model...")

        # Split into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Use the sklearn-style LightGBM classifier
        from lightgbm import LGBMClassifier

        # Model parameters
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting_type': 'gbdt',
            'random_state': 42,
            'n_jobs': -1,
            'verbose': -1
        }

        lgbm = LGBMClassifier(**params)

        # Keep training simple: no GridSearchCV here (fit() trains in place and returns the estimator)
        lgbm.fit(X_train, y_train)

        # Evaluate the model
        y_pred_proba = lgbm.predict_proba(X_test)[:, 1]
        y_pred = lgbm.predict(X_test)

        metrics = self.calculate_metrics(y_test, y_pred, y_pred_proba)

        # Save the model
        joblib.dump(lgbm, "models/lightgbm_model.pkl")

        self.models["lightgbm"] = lgbm
        self.metrics["lightgbm"] = metrics

        print("LightGBM training complete!")
        print(f"F1 score: {metrics['f1']:.4f}")
        print(f"ROC-AUC: {metrics['roc_auc']:.4f}")

        return lgbm, metrics

    # Compute evaluation metrics
    def calculate_metrics(self, y_true, y_pred, y_pred_proba):
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'f1': f1_score(y_true, y_pred),
            # ROC-AUC is computed from predicted probabilities, not hard labels
            'roc_auc': roc_auc_score(y_true, y_pred_proba)
        }

    # Compare model performance
    def compare_models(self):
        print("\n" + "="*50)
        print("Model performance comparison")
        print("="*50)

        for model_name, metrics in self.metrics.items():
            print(f"\n{model_name.upper()} performance:")
            print(f"  Accuracy: {metrics['accuracy']:.4f}")
            print(f"  Precision: {metrics['precision']:.4f}")
            print(f"  Recall: {metrics['recall']:.4f}")
            print(f"  F1 Score: {metrics['f1']:.4f}")
            print(f"  ROC-AUC: {metrics['roc_auc']:.4f}")

        # Pick the best model by F1 score
        best_model = max(self.metrics.keys(), key=lambda x: self.metrics[x]['f1'])
        print(f"\nBest model: {best_model.upper()}")
        print(f"Best F1 score: {self.metrics[best_model]['f1']:.4f}")

        return best_model

    # Load a model (if needed) and make predictions
    def predict(self, model_name, X):
        if model_name not in self.models:
            # Try to load the model from disk
            try:
                model = joblib.load(f"models/{model_name}_model.pkl")
                self.models[model_name] = model
            except FileNotFoundError:
                raise ValueError(f"Model {model_name} not found. Please train the model first.")

        model = self.models[model_name]
        y_pred_proba = model.predict_proba(X)[:, 1]
        # Binarize with a fixed 0.5 probability threshold
        y_pred = (y_pred_proba >= 0.5).astype(int)

        return y_pred, y_pred_proba

# Main entry point
if __name__ == "__main__":
    # 1. Data processing
    print("Processing data...")
    X, y, df = data_processing_pipeline("data/Telco-Customer-Churn.csv")
    X_np, y_np = preprocess_data(X, y)

    # 2. Model training and evaluation
    trainer = ModelTrainer()

    # Train Logistic Regression
    logreg_model, logreg_metrics = trainer.train_logreg(X_np, y_np)

    # Train LightGBM
    lgbm_model, lgbm_metrics = trainer.train_lightgbm(X_np, y_np)

    # Compare the models
    best_model = trainer.compare_models()

    # 3. Check whether the performance targets are met
    print("\n" + "="*50)
    print("Model performance requirement check")
    print("="*50)

    best_f1 = trainer.metrics[best_model]['f1']
    best_roc_auc = trainer.metrics[best_model]['roc_auc']

    if best_f1 >= 0.70 or best_roc_auc >= 0.75:
        print(f"✓ Performance targets met! Best F1: {best_f1:.4f}, best ROC-AUC: {best_roc_auc:.4f}")
        print("✓ Satisfies the requirement F1 ≥ 0.70 or ROC-AUC ≥ 0.75")
    else:
        print(f"✗ Performance targets not met! Best F1: {best_f1:.4f}, best ROC-AUC: {best_roc_auc:.4f}")
        print("✗ Does not satisfy the requirement F1 ≥ 0.70 or ROC-AUC ≥ 0.75")
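
    # 4. Illustrative usage sketch: reload the best model through
    #    ModelTrainer.predict() and score the full feature matrix.
    #    predict() applies the fixed 0.5 threshold defined above.
    y_hat, y_hat_proba = trainer.predict(best_model, X_np)
    print(f"\nPredicted churn rate over all customers: {y_hat.mean():.2%}")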