feat(agent): 添加LLM解释功能并本地化界面

添加使用DeepSeek API生成自然语言解释的功能
将streamlit界面从英文翻译为中文
更新依赖项,添加requests库
移除.gitignore中不必要的数据文件排除规则
This commit is contained in:
于洋 2026-01-15 20:12:33 +08:00
parent 990323d5d6
commit 4553a1794d
7 changed files with 51023 additions and 933 deletions

3
.gitignore vendored
View File

@ -17,6 +17,3 @@ wheels/
# Artifacts and models
artifacts/
*.joblib
# Large data files
data/*.csv

File diff suppressed because it is too large Load Diff

25001
data/Customer_Sentiment.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -15,4 +15,8 @@ dependencies = [
"streamlit>=1.52.2",
"pydantic>=2.9.2",
"python-dotenv>=1.0.0",
"requests>=2.32.0",
]
[[tool.uv.index]]
name = "tencent"
url = "https://mirrors.cloud.tencent.com/pypi/simple/"

View File

@ -2,6 +2,7 @@ import os
import joblib
import numpy as np
import pandas as pd
import requests
from typing import Literal, Annotated
from pydantic import BaseModel, Field
@ -75,3 +76,78 @@ def explain_features(features: CustomerFeatures | dict) -> list[str]:
def explain_features_model(features: CustomerFeatures | dict) -> ExplanationOutput:
    """Return the explain_features factor strings wrapped in an ExplanationOutput model."""
    factors = explain_features(features)
    return ExplanationOutput(factors=factors)
def explain_features_with_llm(features: CustomerFeatures | dict, api_key: str) -> str:
    """Generate a natural-language (Chinese) explanation of risk factors via the DeepSeek API.

    Maps raw pipeline feature names to human-readable Chinese labels, then asks
    the DeepSeek chat-completions endpoint to summarize them for support staff.
    Falls back to a plain concatenation of the mapped factor strings when the
    API call fails for any reason (network error, bad status, unexpected payload).

    Args:
        features: Customer feature values (CustomerFeatures instance or plain dict).
        api_key: DeepSeek API bearer token.

    Returns:
        A short Chinese paragraph explaining the main risk factors.
    """
    _ensure_loaded()
    explanations = explain_features(features)
    # Map feature names to human-readable descriptions
    feature_mapping = {
        'cat__gender_male': '男性',
        'cat__gender_female': '女性',
        'cat__gender_other': '其他性别',
        'cat__age_group_18-25': '18-25岁年龄段',
        'cat__age_group_26-35': '26-35岁年龄段',
        'cat__age_group_36-45': '36-45岁年龄段',
        'cat__age_group_46-60': '46-60岁年龄段',
        'cat__age_group_60+': '60岁以上年龄段',
        'cat__region_north': '北部地区',
        'cat__region_south': '南部地区',
        'cat__region_east': '东部地区',
        'cat__region_west': '西部地区',
        'cat__region_central': '中部地区',
        'cat__purchase_channel_online': '线上购买渠道',
        'cat__purchase_channel_offline': '线下购买渠道',
        'cat__issue_resolved_True': '问题已解决',
        'cat__issue_resolved_False': '问题未解决',
        'cat__complaint_registered_True': '已注册投诉',
        'cat__complaint_registered_False': '未注册投诉',
        'num__response_time_hours': '响应时间(小时)'
    }
    # Convert explanations to human-readable format; only the first matching
    # feature name per explanation is substituted (hence the break).
    human_explanations = []
    for exp in explanations:
        for feature, desc in feature_mapping.items():
            if feature in exp:
                # Replace feature name with description
                human_exp = exp.replace(feature, desc)
                # Make the text more natural
                human_exp = human_exp.replace('increase negative risk', '增加了负面情绪风险')
                human_exp = human_exp.replace('decrease negative risk', '降低了负面情绪风险')
                human_exp = human_exp.replace('weight=', '权重为')
                human_explanations.append(human_exp)
                break
    if not human_explanations:
        # Fallback if no feature mappings found
        human_explanations = [
            exp.replace('increase negative risk', '增加了负面情绪风险')
               .replace('decrease negative risk', '降低了负面情绪风险')
            for exp in explanations
        ]
    # Join once up front; this also avoids the chr(10) workaround the original
    # used for newlines inside f-string expressions.
    factors_text = "\n".join(human_explanations)
    # Use DeepSeek API to generate natural language explanation
    prompt = (
        "请将以下客户负面情绪风险因素分析结果转化为一段自然、流畅的中文解释,用于向客服人员展示:\n\n"
        f"{factors_text}\n\n"
        "要求:\n1. 用简洁的语言说明主要风险因素\n2. 突出影响最大的几个因素\n3. 保持专业但易于理解\n4. 不要使用技术术语\n5. 总长度控制在100-200字之间"
    )
    try:
        response = requests.post(
            "https://api.deepseek.com/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            },
            json={
                "model": "deepseek-chat",
                "messages": [
                    {"role": "system", "content": "你是一位专业的客户分析专家,擅长将复杂的数据分析结果转化为通俗易懂的解释。"},
                    {"role": "user", "content": prompt}
                ],
                "max_tokens": 200,
                "temperature": 0.7
            },
            # Bug fix: the original call had no timeout, so a stalled connection
            # would hang the Streamlit request forever.
            timeout=30,
        )
        response.raise_for_status()
        result = response.json()
        return result["choices"][0]["message"]["content"]
    except Exception:
        # Deliberate best-effort: any failure (network, HTTP error, malformed
        # payload) degrades to a simple concatenation of the mapped factors.
        return f"客户负面情绪风险分析:{factors_text}"

View File

@ -2,10 +2,10 @@ import streamlit as st
import polars as pl
import os
from dotenv import load_dotenv
from agent import CustomerFeatures, predict_risk, explain_features
from agent import CustomerFeatures, predict_risk, explain_features, explain_features_with_llm
import altair as alt
st.set_page_config(page_title="Customer Sentiment Analysis", layout="wide")
st.set_page_config(page_title="客户情感分析", layout="wide")
load_dotenv()
st.markdown(
@ -116,10 +116,10 @@ def map_risk_to_advice(risk: float):
return level, decision, actions
st.title("📊 Customer Sentiment Analysis Dashboard")
st.title("📊 客户情感分析仪表板")
st.image(
"https://via.placeholder.com/1200x200/fffde7/e65100?text=Customer+Sentiment+Prediction+System",
"https://via.placeholder.com/1200x200/fffde7/e65100?text=客户情感预测系统",
use_container_width=True,
)
@ -128,7 +128,7 @@ st.image(
def load_data():
file_path = os.path.join("data", "Cleaned_Customer_Sentiment.csv")
if not os.path.exists(file_path):
st.error(f"Data file not found at {file_path}. Please run the data processing script first.")
st.error(f"{file_path}找不到数据文件。请先运行数据处理脚本。")
return None
return pl.read_csv(file_path)
@ -136,11 +136,11 @@ def load_data():
df = load_data()
if df is not None:
st.sidebar.header("Filters")
st.sidebar.header("筛选条件")
sentiments = df["sentiment"].unique().to_list()
selected_sentiment = st.sidebar.multiselect("Select Sentiment", sentiments, default=sentiments)
selected_sentiment = st.sidebar.multiselect("选择情感", sentiments, default=sentiments)
regions = df["region"].unique().to_list()
selected_region = st.sidebar.multiselect("Select Region", regions, default=regions)
selected_region = st.sidebar.multiselect("选择地区", regions, default=regions)
filtered_df = df.filter(
(pl.col("sentiment").is_in(selected_sentiment)) &
@ -149,19 +149,19 @@ if df is not None:
st.markdown('<div id="overview"></div>', unsafe_allow_html=True)
col1, col2, col3 = st.columns(3)
col1.metric("Total Reviews", filtered_df.height)
col2.metric("Positive Reviews", filtered_df.filter(pl.col("sentiment") == "positive").height)
col3.metric("Avg Rating", f"{filtered_df['customer_rating'].mean():.2f}")
col1.metric("总评论数", filtered_df.height)
col2.metric("正面评论数", filtered_df.filter(pl.col("sentiment") == "positive").height)
col3.metric("平均评分", f"{filtered_df['customer_rating'].mean():.2f}")
st.markdown('<div id="stats"></div>', unsafe_allow_html=True)
st.subheader("Sentiment Distribution")
st.subheader("情感分布")
sentiment_counts = filtered_df["sentiment"].value_counts()
sentiment_chart = (
alt.Chart(sentiment_counts.to_pandas())
.mark_bar(cornerRadiusTopLeft=6, cornerRadiusTopRight=6)
.encode(
x=alt.X("sentiment:N", title="Sentiment"),
y=alt.Y("count:Q", title="Count"),
x=alt.X("sentiment:N", title="情感"),
y=alt.Y("count:Q", title="数量"),
color=alt.Color(
"sentiment:N",
scale=alt.Scale(
@ -175,92 +175,92 @@ if df is not None:
)
st.altair_chart(sentiment_chart, use_container_width=True)
st.subheader("Data Preview")
st.subheader("数据预览")
st.dataframe(filtered_df.to_pandas())
banner_col1, banner_col2 = st.columns([2, 1])
with banner_col1:
st.subheader("Configuration Check")
st.subheader("配置检查")
api_key = os.getenv("API_KEY")
if api_key:
st.success(f"API Key loaded: {api_key[:4]}****")
st.success(f"API密钥已加载: {api_key[:4]}****")
else:
st.warning("API Key not found in environment variables.")
st.warning("在环境变量中未找到API密钥。")
with banner_col2:
st.image(
"https://via.placeholder.com/400x200/e8f5e9/1b5e20?text=Customer+Care",
"https://via.placeholder.com/400x200/e8f5e9/1b5e20?text=客户关怀",
use_container_width=True,
)
st.subheader("Cleaning Stats")
st.subheader("数据清洗统计")
nulls_df = filtered_df.select([pl.col(c).is_null().sum().alias(c) for c in filtered_df.columns]).melt()
nulls_pd = nulls_df.rename({"variable": "feature", "value": "nulls"}).to_pandas()
c1, c2 = st.columns(2)
c1.metric("Issue Resolved Rate", f"{float(filtered_df['issue_resolved'].cast(pl.Boolean).mean())*100:.2f}%")
c2.metric("Complaint Registered Rate", f"{float(filtered_df['complaint_registered'].cast(pl.Boolean).mean())*100:.2f}%")
st.subheader("Nulls Per Column")
c1.metric("问题解决率", f"{float(filtered_df['issue_resolved'].cast(pl.Boolean).mean())*100:.2f}%")
c2.metric("投诉登记率", f"{float(filtered_df['complaint_registered'].cast(pl.Boolean).mean())*100:.2f}%")
st.subheader("每列空值统计")
nulls_chart = (
alt.Chart(nulls_pd)
.mark_bar(cornerRadiusTopLeft=4, cornerRadiusTopRight=4)
.encode(
x=alt.X("feature:N", sort="-y", title="Feature"),
y=alt.Y("nulls:Q", title="Null Count"),
x=alt.X("feature:N", sort="-y", title="特征"),
y=alt.Y("nulls:Q", title="空值数量"),
color=alt.value("#80cbc4"),
tooltip=["feature", "nulls"],
)
)
st.altair_chart(nulls_chart, use_container_width=True)
st.subheader("Gender Distribution")
st.subheader("性别分布")
gender_counts = filtered_df["gender"].value_counts()
gender_chart = (
alt.Chart(gender_counts.to_pandas())
.mark_bar(cornerRadiusTopLeft=6, cornerRadiusTopRight=6)
.encode(
x=alt.X("gender:N", title="Gender"),
y=alt.Y("count:Q", title="Count"),
x=alt.X("gender:N", title="性别"),
y=alt.Y("count:Q", title="数量"),
color=alt.value("#90caf9"),
tooltip=["gender", "count"],
)
)
st.altair_chart(gender_chart, use_container_width=True)
st.subheader("Avg Response Time by Sentiment")
st.subheader("不同情感的平均响应时间")
avg_resp = filtered_df.group_by("sentiment").agg(pl.col("response_time_hours").mean()).sort("response_time_hours", descending=True)
avg_resp_chart = (
alt.Chart(avg_resp.to_pandas())
.mark_line(point=True)
.encode(
x=alt.X("sentiment:N", title="Sentiment"),
y=alt.Y("response_time_hours:Q", title="Avg Response Time (hours)"),
x=alt.X("sentiment:N", title="情感"),
y=alt.Y("response_time_hours:Q", title="平均响应时间(小时)"),
color=alt.value("#ffb74d"),
tooltip=["sentiment", "response_time_hours"],
)
)
st.altair_chart(avg_resp_chart, use_container_width=True)
st.subheader("Top Platforms")
st.subheader("主要平台分布")
top_platforms = filtered_df["platform"].value_counts().sort("count", descending=True).head(10)
platform_chart = (
alt.Chart(top_platforms.to_pandas())
.mark_bar(cornerRadiusTopLeft=6, cornerRadiusTopRight=6)
.encode(
x=alt.X("platform:N", sort="-y", title="Platform"),
y=alt.Y("count:Q", title="Count"),
x=alt.X("platform:N", sort="-y", title="平台"),
y=alt.Y("count:Q", title="数量"),
color=alt.value("#ce93d8"),
tooltip=["platform", "count"],
)
)
st.altair_chart(platform_chart, use_container_width=True)
st.subheader("Top Product Categories")
st.subheader("主要产品类别分布")
top_products = filtered_df["product_category"].value_counts().sort("count", descending=True).head(10)
product_chart = (
alt.Chart(top_products.to_pandas())
.mark_bar(cornerRadiusTopLeft=6, cornerRadiusTopRight=6)
.encode(
x=alt.X("product_category:N", sort="-y", title="Product Category"),
y=alt.Y("count:Q", title="Count"),
x=alt.X("product_category:N", sort="-y", title="产品类别"),
y=alt.Y("count:Q", title="数量"),
color=alt.value("#ffcc80"),
tooltip=["product_category", "count"],
)
@ -316,6 +316,15 @@ if df is not None:
st.progress(risk)
st.markdown("#### 分析")
# Get API key for LLM
api_key = os.getenv("API_KEY")
if api_key:
# Use LLM to generate natural language explanation
llm_explanation = explain_features_with_llm(customer, api_key)
st.write(llm_explanation)
else:
# Fallback to original explanation if no API key
st.warning("未找到API密钥使用原始分析结果")
for item in explanations:
st.write(f"- {item}")

1784
uv.lock generated

File diff suppressed because it is too large Load Diff