G05-Customer_Sentiment/src/agent.py

import logging
import os
from typing import Literal, Annotated

import joblib
import numpy as np
import pandas as pd
import requests
from pydantic import BaseModel, Field

# Module-level caches for the fitted artifacts. Each one stays None until
# _ensure_loaded() reads the corresponding file from the "artifacts" directory.
_model_lgb = None  # filled from lgb_pipeline.joblib; used by predict_risk()
_model_lr = None  # filled from lr_pipeline.joblib; used by explain_features()
_le = None  # filled from label_encoder.joblib; maps the "negative" label to an index

class CustomerFeatures(BaseModel):
    """Validated description of one customer interaction used as model input.

    Instances are flattened with ``model_dump()`` into a single-row DataFrame
    before being handed to the fitted pipelines.
    """

    # Closed vocabularies: any value outside the listed literals fails validation.
    gender: Literal["male", "female", "other"]
    age_group: Literal["18-25", "26-35", "36-45", "46-60", "60+"]
    region: Literal["north", "south", "east", "west", "central"]
    # Free-form string; no fixed set of categories is enforced here.
    product_category: str
    purchase_channel: Literal["online", "offline"]
    # Free-form string; no fixed set of platforms is enforced here.
    platform: str
    # Must be >= 0 (hours, per the field name).
    response_time_hours: Annotated[float, Field(ge=0)]
    issue_resolved: bool
    complaint_registered: bool
    # Must contain at least 3 characters.
    review_text: Annotated[str, Field(min_length=3)]

class RiskOutput(BaseModel):
    """Structured wrapper around the value returned by ``predict_risk``."""

    # Predicted probability of the "negative" label for the scored record.
    risk: float

class ExplanationOutput(BaseModel):
    """Structured wrapper around the list returned by ``explain_features``."""

    # One human-readable line per contributing feature, strongest first.
    factors: list[str]

def _ensure_loaded():
    """Populate the module-level model caches on first use.

    Each of the three artifacts is read from the ``artifacts`` directory
    (a relative path, so it is resolved against the current working
    directory) only while its cache slot is still ``None``. Slots that are
    already filled are left untouched.
    """
    global _model_lgb, _model_lr, _le

    def _read(filename):
        # NOTE: joblib files are pickle-based; only load artifacts you trust.
        return joblib.load(os.path.join("artifacts", filename))

    if _model_lgb is None:
        _model_lgb = _read("lgb_pipeline.joblib")
    if _model_lr is None:
        _model_lr = _read("lr_pipeline.joblib")
    if _le is None:
        _le = _read("label_encoder.joblib")

def _to_dataframe(features) -> pd.DataFrame:
    """Convert one feature record into a single-row DataFrame.

    A ``CustomerFeatures`` instance is dumped with ``model_dump()``; a plain
    dict is used as-is, without any validation.

    Raises:
        TypeError: if ``features`` is neither a ``CustomerFeatures`` nor a dict.
    """
    if isinstance(features, CustomerFeatures):
        return pd.DataFrame([features.model_dump()])
    if isinstance(features, dict):
        return pd.DataFrame([features])
    raise TypeError("features must be CustomerFeatures or dict")

def predict_risk(features: CustomerFeatures | dict) -> float:
    """Return the predicted probability of the "negative" label for one record.

    The record is scored with the cached ``_model_lgb`` pipeline; the column
    to read from ``predict_proba`` is the index the label encoder assigns to
    "negative".
    """
    _ensure_loaded()
    frame = _to_dataframe(features)
    # Only one row was submitted, so take the first (and only) probability row.
    row_probs = _model_lgb.predict_proba(frame)[0]
    encoded = _le.transform(["negative"])
    negative_col = int(encoded[0])
    return float(row_probs[negative_col])

def predict_risk_model(features: CustomerFeatures | dict) -> RiskOutput:
    """Score one record with ``predict_risk`` and wrap it in a ``RiskOutput``."""
    score = predict_risk(features)
    return RiskOutput(risk=score)

def explain_features(features: CustomerFeatures | dict, top_k: int = 8) -> list[str]:
    """List the features that most influence the "negative" score of one record.

    The record is passed through the preprocessor of the cached ``_model_lr``
    pipeline. Each transformed feature value is multiplied by the classifier
    coefficient for the "negative" label, and the ``top_k`` contributions with
    the largest absolute value are reported.

    Args:
        features: a ``CustomerFeatures`` instance or an equivalent dict.
        top_k: maximum number of factors to return (default 8, the value that
            was previously hard-coded). Negative values are treated as 0.

    Returns:
        Lines of the form
        ``"<feature> increase|decrease negative risk (weight=<contribution>)"``
        ordered from strongest to weakest. Features whose contribution is
        exactly zero are omitted, so the list can be shorter than ``top_k``.
    """
    _ensure_loaded()
    df = _to_dataframe(features)
    pre = _model_lr.named_steps["preprocessor"]
    Xv = pre.transform(df)
    clf = _model_lr.named_steps["classifier"]
    idx_neg = int(_le.transform(["negative"])[0])

    coef_matrix = clf.coef_
    if coef_matrix.shape[0] == 1:
        # Binary problem: scikit-learn linear classifiers keep a single
        # coefficient row that describes classes_[1]. The row for the other
        # class is its negation, so flip the sign when "negative" is class 0.
        coefs = coef_matrix[0] if idx_neg == 1 else -coef_matrix[0]
    else:
        # Multiclass: one coefficient row per class.
        # NOTE(review): like predict_risk(), this assumes the classifier's
        # class order matches the label encoder's indices - confirm.
        coefs = coef_matrix[idx_neg]

    # The preprocessor may return a scipy sparse matrix or a dense array,
    # depending on how sparse its combined output is; support both.
    dense = Xv.toarray() if hasattr(Xv, "toarray") else Xv
    vec = np.asarray(dense).ravel()

    contrib = vec * coefs
    names = pre.get_feature_names_out()
    order = np.argsort(-np.abs(contrib))[: max(top_k, 0)]

    out = []
    for i in order:
        weight = contrib[i]
        if weight == 0:
            # A zero contribution neither raises nor lowers the risk; listing
            # it as "decrease" would be misleading.
            continue
        direction = "increase" if weight > 0 else "decrease"
        out.append(f"{names[i]} {direction} negative risk (weight={weight:.3f})")
    return out

def explain_features_model(features: CustomerFeatures | dict) -> ExplanationOutput:
    """Run ``explain_features`` and wrap its lines in an ``ExplanationOutput``."""
    factor_lines = explain_features(features)
    return ExplanationOutput(factors=factor_lines)

_logger = logging.getLogger(__name__)

# Chinese display labels for transformed feature names. Feature names that are
# not listed here are kept as they are.
_FEATURE_LABELS = {
    'cat__gender_male': '男性',
    'cat__gender_female': '女性',
    'cat__gender_other': '其他性别',
    'cat__age_group_18-25': '18-25岁年龄段',
    'cat__age_group_26-35': '26-35岁年龄段',
    'cat__age_group_36-45': '36-45岁年龄段',
    'cat__age_group_46-60': '46-60岁年龄段',
    'cat__age_group_60+': '60岁以上年龄段',
    'cat__region_north': '北部地区',
    'cat__region_south': '南部地区',
    'cat__region_east': '东部地区',
    'cat__region_west': '西部地区',
    'cat__region_central': '中部地区',
    'cat__purchase_channel_online': '线上购买渠道',
    'cat__purchase_channel_offline': '线下购买渠道',
    'cat__issue_resolved_True': '问题已解决',
    'cat__issue_resolved_False': '问题未解决',
    'cat__complaint_registered_True': '已注册投诉',
    'cat__complaint_registered_False': '未注册投诉',
    'num__response_time_hours': '响应时间（小时）',
}

# English fragments produced by explain_features() and their Chinese wording.
_PHRASE_TRANSLATIONS = (
    ('increase negative risk', '增加了负面情绪风险'),
    ('decrease negative risk', '降低了负面情绪风险'),
    ('weight=', '权重为'),
)


def _humanize_explanation(exp: str) -> str:
    """Translate one ``explain_features`` line into Chinese wording."""
    # Same matching rule as before: the first known feature name that occurs
    # in the line is replaced by its label.
    matched = next((name for name in _FEATURE_LABELS if name in exp), None)
    if matched is not None:
        exp = exp.replace(matched, _FEATURE_LABELS[matched])
    for english, chinese in _PHRASE_TRANSLATIONS:
        exp = exp.replace(english, chinese)
    return exp


def explain_features_with_llm(
    features: CustomerFeatures | dict,
    api_key: str,
    timeout: float = 30.0,
) -> str:
    """Use an LLM to turn the risk factors into a natural-language explanation.

    The lines from ``explain_features`` are translated into Chinese wording and
    sent to the DeepSeek chat-completions endpoint. The call is best-effort:
    if it cannot be made or fails for any reason, a plain concatenation of the
    translated lines is returned instead.

    Args:
        features: a ``CustomerFeatures`` instance or an equivalent dict.
        api_key: bearer token for the DeepSeek API. When empty, no request is
            sent and the fallback text is returned directly.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        The text generated by the model, or the fallback text.
    """
    explanations = explain_features(features)

    # Every factor is kept, including those without a display label, so that
    # influential features are not silently dropped from the explanation.
    human_explanations = [_humanize_explanation(exp) for exp in explanations]
    joined = "\n".join(human_explanations)
    fallback = f"客户负面情绪风险分析：{joined}"

    # Without a key the request can only be rejected, and without factors
    # there is nothing to explain: skip the network round-trip.
    if not api_key or not human_explanations:
        return fallback

    prompt = (
        "请将以下客户负面情绪风险因素分析结果转化为一段自然、流畅的中文解释，用于向客服人员展示：\n\n"
        f"{joined}"
        "\n\n要求：\n1. 用简洁的语言说明主要风险因素\n2. 突出影响最大的几个因素\n"
        "3. 保持专业但易于理解\n4. 不要使用技术术语\n5. 总长度控制在100-200字之间"
    )

    try:
        response = requests.post(
            "https://api.deepseek.com/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": "deepseek-chat",
                "messages": [
                    {"role": "system", "content": "你是一位专业的客户分析专家，擅长将复杂的数据分析结果转化为通俗易懂的解释。"},
                    {"role": "user", "content": prompt},
                ],
                "max_tokens": 200,
                "temperature": 0.7,
            },
            # Without a timeout a stalled connection would block forever.
            timeout=timeout,
        )
        response.raise_for_status()
        content = response.json()["choices"][0]["message"]["content"]
    except Exception as exc:
        # Deliberately broad: this feature must never break the caller.
        # Only the exception type is logged, because some request errors
        # echo header values (and therefore the API key) in their message.
        _logger.warning(
            "LLM explanation request failed (%s); using fallback text",
            type(exc).__name__,
        )
        return fallback

    # Guard against an empty or non-text completion.
    if not isinstance(content, str) or not content.strip():
        return fallback
    return content
Initial commit: Customer Sentiment Analysis project 2026-01-14 20:43:02 +08:00			`import os`
			`import joblib`
			`import numpy as np`
			`import pandas as pd`
feat(agent): 添加LLM解释功能并本地化界面添加使用DeepSeek API生成自然语言解释的功能将streamlit界面从英文翻译为中文更新依赖项添加requests库移除.gitignore中不必要的数据文件排除规则 2026-01-15 20:12:33 +08:00			`import requests`
Initial commit: Customer Sentiment Analysis project 2026-01-14 20:43:02 +08:00			`from typing import Literal, Annotated`
			`from pydantic import BaseModel, Field`

			`_model_lgb = None`
			`_model_lr = None`
			`_le = None`

			`class CustomerFeatures(BaseModel):`
			`gender: Literal["male", "female", "other"]`
			`age_group: Literal["18-25", "26-35", "36-45", "46-60", "60+"]`
			`region: Literal["north", "south", "east", "west", "central"]`
			`product_category: str`
			`purchase_channel: Literal["online", "offline"]`
			`platform: str`
			`response_time_hours: Annotated[float, Field(ge=0)]`
			`issue_resolved: bool`
			`complaint_registered: bool`
			`review_text: Annotated[str, Field(min_length=3)]`

			`class RiskOutput(BaseModel):`
			`risk: float`

			`class ExplanationOutput(BaseModel):`
			`factors: list[str]`

			`def _ensure_loaded():`
			`global _model_lgb, _model_lr, _le`
			`if _model_lgb is None:`
			`_model_lgb = joblib.load(os.path.join("artifacts", "lgb_pipeline.joblib"))`
			`if _model_lr is None:`
			`_model_lr = joblib.load(os.path.join("artifacts", "lr_pipeline.joblib"))`
			`if _le is None:`
			`_le = joblib.load(os.path.join("artifacts", "label_encoder.joblib"))`

			`def _to_dataframe(features) -> pd.DataFrame:`
			`if isinstance(features, CustomerFeatures):`
			`payload = features.model_dump()`
			`elif isinstance(features, dict):`
			`payload = features`
			`else:`
			`raise TypeError("features must be CustomerFeatures or dict")`
			`return pd.DataFrame([payload])`

			`def predict_risk(features: CustomerFeatures \| dict) -> float:`
			`_ensure_loaded()`
			`df = _to_dataframe(features)`
			`probs = _model_lgb.predict_proba(df)[0]`
			`idx_neg = int(_le.transform(["negative"])[0])`
			`return float(probs[idx_neg])`

			`def predict_risk_model(features: CustomerFeatures \| dict) -> RiskOutput:`
			`return RiskOutput(risk=predict_risk(features))`

			`def explain_features(features: CustomerFeatures \| dict) -> list[str]:`
			`_ensure_loaded()`
			`df = _to_dataframe(features)`
			`pre = _model_lr.named_steps["preprocessor"]`
			`Xv = pre.transform(df)`
			`clf = _model_lr.named_steps["classifier"]`
			`idx_neg = int(_le.transform(["negative"])[0])`
			`coefs = clf.coef_[idx_neg]`
			`vec = Xv.toarray().ravel()`
			`contrib = vec * coefs`
			`names = pre.get_feature_names_out()`
			`order = np.argsort(-np.abs(contrib))[:8]`
			`out = []`
			`for i in order:`
			`direction = "increase" if contrib[i] > 0 else "decrease"`
			`out.append(f"{names[i]} {direction} negative risk (weight={contrib[i]:.3f})")`
			`return out`

			`def explain_features_model(features: CustomerFeatures \| dict) -> ExplanationOutput:`
			`return ExplanationOutput(factors=explain_features(features))`
feat(agent): 添加LLM解释功能并本地化界面添加使用DeepSeek API生成自然语言解释的功能将streamlit界面从英文翻译为中文更新依赖项添加requests库移除.gitignore中不必要的数据文件排除规则 2026-01-15 20:12:33 +08:00
			`def explain_features_with_llm(features: CustomerFeatures \| dict, api_key: str) -> str:`
			`"""Use LLM to generate natural language explanation for the risk factors"""`
			`_ensure_loaded()`
			`explanations = explain_features(features)`

			`# Map feature names to human-readable descriptions`
			`feature_mapping = {`
			`'cat__gender_male': '男性',`
			`'cat__gender_female': '女性',`
			`'cat__gender_other': '其他性别',`
			`'cat__age_group_18-25': '18-25岁年龄段',`
			`'cat__age_group_26-35': '26-35岁年龄段',`
			`'cat__age_group_36-45': '36-45岁年龄段',`
			`'cat__age_group_46-60': '46-60岁年龄段',`
			`'cat__age_group_60+': '60岁以上年龄段',`
			`'cat__region_north': '北部地区',`
			`'cat__region_south': '南部地区',`
			`'cat__region_east': '东部地区',`
			`'cat__region_west': '西部地区',`
			`'cat__region_central': '中部地区',`
			`'cat__purchase_channel_online': '线上购买渠道',`
			`'cat__purchase_channel_offline': '线下购买渠道',`
			`'cat__issue_resolved_True': '问题已解决',`
			`'cat__issue_resolved_False': '问题未解决',`
			`'cat__complaint_registered_True': '已注册投诉',`
			`'cat__complaint_registered_False': '未注册投诉',`
			`'num__response_time_hours': '响应时间（小时）'`
			`}`

			`# Convert explanations to human-readable format`
			`human_explanations = []`
			`for exp in explanations:`
			`for feature, desc in feature_mapping.items():`
			`if feature in exp:`
			`# Replace feature name with description`
			`human_exp = exp.replace(feature, desc)`
			`# Make the text more natural`
			`human_exp = human_exp.replace('increase negative risk', '增加了负面情绪风险')`
			`human_exp = human_exp.replace('decrease negative risk', '降低了负面情绪风险')`
			`human_exp = human_exp.replace('weight=', '权重为')`
			`human_explanations.append(human_exp)`
			`break`

			`if not human_explanations:`
			`# Fallback if no feature mappings found`
			`human_explanations = [exp.replace('increase negative risk', '增加了负面情绪风险').replace('decrease negative risk', '降低了负面情绪风险') for exp in explanations]`

			`# Use DeepSeek API to generate natural language explanation`
			`prompt = f"请将以下客户负面情绪风险因素分析结果转化为一段自然、流畅的中文解释，用于向客服人员展示：\n\n{chr(10).join(human_explanations)}\n\n要求：\n1. 用简洁的语言说明主要风险因素\n2. 突出影响最大的几个因素\n3. 保持专业但易于理解\n4. 不要使用技术术语\n5. 总长度控制在100-200字之间"`

			`try:`
			`response = requests.post(`
			`"https://api.deepseek.com/v1/chat/completions",`
			`headers={`
			`"Authorization": f"Bearer {api_key}",`
			`"Content-Type": "application/json"`
			`},`
			`json={`
			`"model": "deepseek-chat",`
			`"messages": [`
			`{"role": "system", "content": "你是一位专业的客户分析专家，擅长将复杂的数据分析结果转化为通俗易懂的解释。"},`
			`{"role": "user", "content": prompt}`
			`],`
			`"max_tokens": 200,`
			`"temperature": 0.7`
			`}`
			`)`

			`response.raise_for_status()`
			`result = response.json()`
			`return result["choices"][0]["message"]["content"]`
			`except Exception as e:`
			`# Fallback to simple concatenation if API call fails`
			`return f"客户负面情绪风险分析：{chr(10).join(human_explanations)}"`