From ba1ce262013694b1a98d20b59d2a5f6c15fcc150 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=88=99=E6=96=87?= Date: Thu, 15 Jan 2026 23:50:11 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4=20src/train=5Ftweet=5Fultima?= =?UTF-8?q?te.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/train_tweet_ultimate.py | 334 ------------------------------------ 1 file changed, 334 deletions(-) delete mode 100644 src/train_tweet_ultimate.py diff --git a/src/train_tweet_ultimate.py b/src/train_tweet_ultimate.py deleted file mode 100644 index 6389149..0000000 --- a/src/train_tweet_ultimate.py +++ /dev/null @@ -1,334 +0,0 @@ -"""推文情感分析训练模块(最终优化版) - -使用多种算法组合 + 特征工程 + 超参数优化。 -目标:达到 Accuracy ≥ 0.82 或 Macro-F1 ≥ 0.75 -""" - -from pathlib import Path - -import joblib -import numpy as np -import polars as pl -from scipy.sparse import hstack -from sklearn.ensemble import RandomForestClassifier, VotingClassifier -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.linear_model import LogisticRegression -from sklearn.metrics import classification_report, accuracy_score, f1_score -from sklearn.model_selection import train_test_split -from sklearn.naive_bayes import MultinomialNB -from sklearn.preprocessing import LabelEncoder - -try: - import lightgbm as lgb - HAS_LIGHTGBM = True -except ImportError: - HAS_LIGHTGBM = False - -try: - import xgboost as xgb - HAS_XGBOOST = True -except ImportError: - HAS_XGBOOST = False - -try: - from catboost import CatBoostClassifier - HAS_CATBOOST = True -except ImportError: - HAS_CATBOOST = False - -from src.tweet_data import load_cleaned_tweets, print_data_summary - -MODELS_DIR = Path("models") -MODEL_PATH = MODELS_DIR / "tweet_sentiment_model_ultimate.pkl" -ENCODER_PATH = MODELS_DIR / "label_encoder_ultimate.pkl" -TFIDF_PATH = MODELS_DIR / "tfidf_vectorizer_ultimate.pkl" -AIRLINE_ENCODER_PATH = MODELS_DIR / "airline_encoder_ultimate.pkl" - - -class TweetSentimentModel: - """推文情感分析模型类(最终优化) - - 结合多种算法和特征工程进行分类。 - """ - - def __init__( - self, - max_features: int = 15000, - ngram_range: tuple = (1, 3), - ): - self.max_features = max_features - self.ngram_range = ngram_range - - self.tfidf_vectorizer = None - self.label_encoder = None - self.model = None - self.airline_encoder = None - - def _create_tfidf_vectorizer(self) -> TfidfVectorizer: - """创建 TF-IDF 向量化器""" - return TfidfVectorizer( - max_features=self.max_features, - ngram_range=self.ngram_range, - min_df=2, - max_df=0.95, - lowercase=False, - sublinear_tf=True, - ) - - def fit( - self, - X_text: np.ndarray, - X_airline: np.ndarray, - y: np.ndarray, - ) -> None: - """训练模型 - - Args: - X_text: 训练文本数据 - X_airline: 训练航空公司数据 - y: 训练标签 - """ - # 初始化编码器 - self.tfidf_vectorizer = self._create_tfidf_vectorizer() - self.label_encoder = LabelEncoder() - self.airline_encoder = LabelEncoder() - - # 编码标签 - y_encoded = self.label_encoder.fit_transform(y) - - # 编码航空公司 - X_airline_encoded = self.airline_encoder.fit_transform(X_airline) - - # TF-IDF 向量化 - X_tfidf = self.tfidf_vectorizer.fit_transform(X_text) - - # 合并特征 - X_combined = hstack([X_tfidf, X_airline_encoded.reshape(-1, 1)]) - - # 构建集成模型 - 使用不同的算法 - estimators = [] - - # Logistic Regression - 稳定的基线 - estimators.append(("lr", LogisticRegression( - random_state=42, - max_iter=2000, - class_weight="balanced", - C=1.0, - n_jobs=-1, - ))) - - # MultinomialNB - 适合文本分类 - estimators.append(("nb", MultinomialNB(alpha=0.3))) - - # Random Forest - 集成学习 - estimators.append(("rf", RandomForestClassifier( - random_state=42, - n_estimators=200, - max_depth=15, - min_samples_split=5, - class_weight="balanced", - n_jobs=-1, - ))) - - # LightGBM - 梯度提升 - if HAS_LIGHTGBM: - estimators.append(("lgbm", lgb.LGBMClassifier( - random_state=42, - n_estimators=300, - learning_rate=0.05, - max_depth=6, - num_leaves=31, - class_weight="balanced", - verbose=-1, - n_jobs=-1, - ))) - - # XGBoost - 梯度提升 - if HAS_XGBOOST: - estimators.append(("xgb", xgb.XGBClassifier( - random_state=42, - n_estimators=300, - learning_rate=0.05, - max_depth=6, - subsample=0.8, - colsample_bytree=0.8, - eval_metric="mlogloss", - n_jobs=-1, - ))) - - # 使用 VotingClassifier 进行集成 - self.model = VotingClassifier( - estimators=estimators, - voting="soft", # 使用软投票(概率平均) - n_jobs=-1, - ) - - print(f"使用 {len(estimators)} 个基学习器:") - for name, _ in estimators: - print(f" - {name}") - - # 训练模型 - self.model.fit(X_combined, y_encoded) - - def predict(self, X_text: np.ndarray, X_airline: np.ndarray) -> np.ndarray: - """预测 - - Args: - X_text: 文本数据 - X_airline: 航空公司数据 - - Returns: - np.ndarray: 预测的情感类别 - """ - X_tfidf = self.tfidf_vectorizer.transform(X_text) - X_airline_encoded = self.airline_encoder.transform(X_airline) - X_combined = hstack([X_tfidf, X_airline_encoded.reshape(-1, 1)]) - - y_pred_encoded = self.model.predict(X_combined) - return self.label_encoder.inverse_transform(y_pred_encoded) - - def predict_proba(self, X_text: np.ndarray, X_airline: np.ndarray) -> np.ndarray: - """预测概率 - - Args: - X_text: 文本数据 - X_airline: 航空公司数据 - - Returns: - np.ndarray: 预测的概率 - """ - X_tfidf = self.tfidf_vectorizer.transform(X_text) - X_airline_encoded = self.airline_encoder.transform(X_airline) - X_combined = hstack([X_tfidf, X_airline_encoded.reshape(-1, 1)]) - - return self.model.predict_proba(X_combined) - - def save(self, model_path: Path, encoder_path: Path, tfidf_path: Path, airline_encoder_path: Path) -> None: - """保存模型 - - Args: - model_path: 模型保存路径 - encoder_path: 编码器保存路径 - tfidf_path: TF-IDF 向量化器保存路径 - airline_encoder_path: 航空公司编码器保存路径 - """ - if self.model is None: - raise ValueError("模型未训练,无法保存") - - model_path.parent.mkdir(parents=True, exist_ok=True) - - joblib.dump(self.model, model_path) - joblib.dump(self.label_encoder, encoder_path) - joblib.dump(self.tfidf_vectorizer, tfidf_path) - joblib.dump(self.airline_encoder, airline_encoder_path) - - @classmethod - def load(cls, model_path: Path, encoder_path: Path, tfidf_path: Path, airline_encoder_path: Path) -> "TweetSentimentModel": - """加载模型 - - Args: - model_path: 模型路径 - encoder_path: 编码器路径 - tfidf_path: TF-IDF 向量化器路径 - airline_encoder_path: 航空公司编码器路径 - - Returns: - TweetSentimentModel: 加载的模型 - """ - instance = cls() - - instance.model = joblib.load(model_path) - instance.label_encoder = joblib.load(encoder_path) - instance.tfidf_vectorizer = joblib.load(tfidf_path) - instance.airline_encoder = joblib.load(airline_encoder_path) - - return instance - - -def train_ultimate_model() -> None: - """执行最终优化模型训练流程""" - print(">>> 1. 加载清洗后的数据") - df = load_cleaned_tweets("data/Tweets_cleaned.csv") - print(f"数据集大小: {len(df)}") - - print("\n>>> 2. 数据统计") - print_data_summary(df, "训练数据统计") - - # 转换为 numpy 数组 - df_pandas = df.to_pandas() - - X_text = df_pandas["text_cleaned"].values - X_airline = df_pandas["airline"].values - y = df_pandas["airline_sentiment"].values - - # 划分训练集和测试集 - X_train_text, X_test_text, X_train_airline, X_test_airline, y_train, y_test = train_test_split( - X_text, X_airline, y, test_size=0.2, random_state=42, stratify=y - ) - - print(f"\n训练集大小: {len(X_train_text)}") - print(f"测试集大小: {len(X_test_text)}") - - print("\n>>> 3. 训练最终优化模型") - model = TweetSentimentModel( - max_features=15000, - ngram_range=(1, 3), - ) - - model.fit(X_train_text, X_train_airline, y_train) - - print("\n>>> 4. 模型评估") - - # 预测 - y_pred = model.predict(X_test_text, X_test_airline) - - # 计算指标 - accuracy = accuracy_score(y_test, y_pred) - macro_f1 = f1_score(y_test, y_pred, average="macro") - - print(f"Accuracy: {accuracy:.4f}") - print(f"Macro-F1: {macro_f1:.4f}") - - # 检查是否达到目标(调整后的目标) - print("\n>>> 5. 目标检查(调整后)") - target_accuracy = 0.82 - target_macro_f1 = 0.75 - - if accuracy >= target_accuracy: - print(f"✅ Accuracy 达标: {accuracy:.4f} >= {target_accuracy}") - else: - print(f"❌ Accuracy 未达标: {accuracy:.4f} < {target_accuracy}") - - if macro_f1 >= target_macro_f1: - print(f"✅ Macro-F1 达标: {macro_f1:.4f} >= {target_macro_f1}") - else: - print(f"❌ Macro-F1 未达标: {macro_f1:.4f} < {target_macro_f1}") - - # 详细分类报告 - print("\n>>> 6. 详细分类报告") - print(classification_report(y_test, y_pred, target_names=["negative", "neutral", "positive"])) - - # 保存模型 - print("\n>>> 7. 保存模型") - model.save(MODEL_PATH, ENCODER_PATH, TFIDF_PATH, AIRLINE_ENCODER_PATH) - print(f"模型已保存至 {MODEL_PATH}") - print(f"编码器已保存至 {ENCODER_PATH}") - print(f"TF-IDF 向量化器已保存至 {TFIDF_PATH}") - print(f"航空公司编码器已保存至 {AIRLINE_ENCODER_PATH}") - - -def load_model() -> "TweetSentimentModel": - """加载训练好的模型 - - Returns: - TweetSentimentModel: 训练好的模型 - """ - if not MODEL_PATH.exists(): - raise FileNotFoundError( - f"未找到模型文件 {MODEL_PATH}。请先运行 uv run python src/train_tweet_ultimate.py" - ) - return TweetSentimentModel.load(MODEL_PATH, ENCODER_PATH, TFIDF_PATH, AIRLINE_ENCODER_PATH) - - -if __name__ == "__main__": - train_ultimate_model()