akaAKR47/machine_learning.py

189 lines
6.5 KiB
Python
Raw Normal View History

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (f1_score, roc_auc_score, accuracy_score,
precision_score, recall_score, classification_report)
import lightgbm as lgb
import numpy as np
import joblib
import os
from data_processing import data_processing_pipeline, preprocess_data
# 模型训练和评估类
class ModelTrainer:
    """Train, evaluate, compare, and serve binary-classification models.

    Fitted estimators are cached in ``self.models`` (name -> estimator) and
    persisted to ``models/<name>_model.pkl``; held-out evaluation metrics are
    kept in ``self.metrics`` (name -> metric dict).
    """

    def __init__(self):
        self.models = {}   # model name -> fitted estimator
        self.metrics = {}  # model name -> dict of evaluation metrics
        # Make sure the output directory for serialized models exists.
        os.makedirs("models", exist_ok=True)

    def train_logreg(self, X, y):
        """Tune and train a logistic-regression classifier.

        Performs a stratified 80/20 train/test split, grid-searches the
        regularization strength ``C`` with 5-fold CV scored by F1, evaluates
        the best estimator on the held-out split, and persists it to
        ``models/logreg_model.pkl``.

        Args:
            X: Feature matrix (array-like, shape (n_samples, n_features)).
            y: Binary target labels (array-like, shape (n_samples,)).

        Returns:
            Tuple ``(best_estimator, metrics_dict)``.
        """
        print("训练Logistic Regression模型...")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        # Only C is actually tuned; solver/max_iter are pinned.
        param_grid = {
            'C': [0.01, 0.1, 1.0, 10.0, 100.0],
            'max_iter': [1000],
            'solver': ['lbfgs'],
        }
        logreg = LogisticRegression(random_state=42)
        grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid,
                                   cv=5, scoring='f1', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_logreg = grid_search.best_estimator_
        # Evaluate on the held-out test split.
        y_pred = best_logreg.predict(X_test)
        y_pred_proba = best_logreg.predict_proba(X_test)[:, 1]
        metrics = self.calculate_metrics(y_test, y_pred, y_pred_proba)
        # Persist and cache the tuned estimator.
        joblib.dump(best_logreg, "models/logreg_model.pkl")
        self.models["logreg"] = best_logreg
        self.metrics["logreg"] = metrics
        print("Logistic Regression模型训练完成")
        print(f"最佳参数: {grid_search.best_params_}")
        print(f"F1分数: {metrics['f1']:.4f}")
        print(f"ROC-AUC: {metrics['roc_auc']:.4f}")
        return best_logreg, metrics

    def train_lightgbm(self, X, y):
        """Train a LightGBM classifier with fixed hyperparameters.

        Uses the same stratified 80/20 split as ``train_logreg`` but no grid
        search; the fitted model is evaluated on the held-out split and
        persisted to ``models/lightgbm_model.pkl``.

        Args:
            X: Feature matrix (array-like, shape (n_samples, n_features)).
            y: Binary target labels (array-like, shape (n_samples,)).

        Returns:
            Tuple ``(fitted_estimator, metrics_dict)``.
        """
        print("\n训练LightGBM模型...")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting_type': 'gbdt',
            'random_state': 42,
            'n_jobs': -1,
            'verbose': -1,
        }
        # Use the module-level `lgb` import (previously re-imported locally);
        # fit() returns the estimator itself, so best_lgbm IS the fitted model.
        best_lgbm = lgb.LGBMClassifier(**params).fit(X_train, y_train)
        # Evaluate on the held-out test split.
        y_pred_proba = best_lgbm.predict_proba(X_test)[:, 1]
        y_pred = best_lgbm.predict(X_test)
        metrics = self.calculate_metrics(y_test, y_pred, y_pred_proba)
        # Persist and cache the fitted estimator.
        joblib.dump(best_lgbm, "models/lightgbm_model.pkl")
        self.models["lightgbm"] = best_lgbm
        self.metrics["lightgbm"] = metrics
        print("LightGBM模型训练完成")
        print(f"F1分数: {metrics['f1']:.4f}")
        print(f"ROC-AUC: {metrics['roc_auc']:.4f}")
        return best_lgbm, metrics

    def calculate_metrics(self, y_true, y_pred, y_pred_proba):
        """Compute the standard binary-classification metric dict.

        Args:
            y_true: Ground-truth labels.
            y_pred: Hard 0/1 predictions.
            y_pred_proba: Predicted probability of the positive class
                (used only for ROC-AUC).

        Returns:
            Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
            ``roc_auc``.
        """
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'f1': f1_score(y_true, y_pred),
            'roc_auc': roc_auc_score(y_true, y_pred_proba),
        }

    def compare_models(self):
        """Print a per-model metric report and return the best model's name.

        "Best" is the model with the highest F1 score in ``self.metrics``.

        Returns:
            The key (model name) of the best-performing model.
        """
        print("\n" + "=" * 50)
        print("模型性能对比")
        print("=" * 50)
        for model_name, metrics in self.metrics.items():
            print(f"\n{model_name.upper()} 性能:")
            print(f" Accuracy: {metrics['accuracy']:.4f}")
            print(f" Precision: {metrics['precision']:.4f}")
            print(f" Recall: {metrics['recall']:.4f}")
            print(f" F1 Score: {metrics['f1']:.4f}")
            print(f" ROC-AUC: {metrics['roc_auc']:.4f}")
        # Select the model with the highest F1.
        best_model = max(self.metrics.keys(), key=lambda x: self.metrics[x]['f1'])
        print(f"\n最佳模型: {best_model.upper()}")
        print(f"最佳F1分数: {self.metrics[best_model]['f1']:.4f}")
        return best_model

    def predict(self, model_name, X):
        """Predict with a cached (or lazily loaded) model.

        If ``model_name`` is not in the in-memory cache, the model is loaded
        from ``models/<model_name>_model.pkl``. Hard labels are derived from
        the positive-class probability with a fixed 0.5 threshold.

        Args:
            model_name: Key of the model ("logreg" or "lightgbm").
            X: Feature matrix to score.

        Returns:
            Tuple ``(y_pred, y_pred_proba)`` of hard labels and positive-class
            probabilities.

        Raises:
            ValueError: If the model is neither cached nor saved on disk.
        """
        if model_name not in self.models:
            # Fall back to the serialized model on disk.
            try:
                model = joblib.load(f"models/{model_name}_model.pkl")
                self.models[model_name] = model
            except FileNotFoundError as exc:
                # Chain the original error for easier debugging.
                raise ValueError(
                    f"Model {model_name} not found. Please train the model first."
                ) from exc
        model = self.models[model_name]
        y_pred_proba = model.predict_proba(X)[:, 1]
        y_pred = (y_pred_proba >= 0.5).astype(int)
        return y_pred, y_pred_proba
# 主函数
if __name__ == "__main__":
    # Step 1: load and preprocess the raw churn dataset.
    print("正在处理数据...")
    X, y, df = data_processing_pipeline("data/Telco-Customer-Churn.csv")
    X_np, y_np = preprocess_data(X, y)

    # Step 2: train both candidate models and pick the best by F1.
    trainer = ModelTrainer()
    logreg_model, logreg_metrics = trainer.train_logreg(X_np, y_np)
    lgbm_model, lgbm_metrics = trainer.train_lightgbm(X_np, y_np)
    best_model = trainer.compare_models()

    # Step 3: check the winner against the performance targets
    # (F1 >= 0.70 or ROC-AUC >= 0.75).
    print("\n" + "=" * 50)
    print("模型性能要求检查")
    print("=" * 50)
    best_f1 = trainer.metrics[best_model]['f1']
    best_roc_auc = trainer.metrics[best_model]['roc_auc']
    meets_target = best_f1 >= 0.70 or best_roc_auc >= 0.75
    if meets_target:
        print(f"✓ 模型性能达标最佳F1: {best_f1:.4f}, 最佳ROC-AUC: {best_roc_auc:.4f}")
        print("✓ 满足F1 ≥ 0.70 或 ROC-AUC ≥ 0.75 的要求")
    else:
        print(f"✗ 模型性能未达标最佳F1: {best_f1:.4f}, 最佳ROC-AUC: {best_roc_auc:.4f}")
        print("✗ 未满足F1 ≥ 0.70 或 ROC-AUC ≥ 0.75 的要求")