#!/usr/bin/env python3
import os
import uuid
import time
import chromadb
import pypdf
import docx2txt
import jieba
from chromadb.config import Settings
from typing import List, Dict, Any, Optional
from datetime import datetime
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sklearn.feature_extraction.text import TfidfVectorizer
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID, STORED
from whoosh.qparser import MultifieldParser
from whoosh.analysis import Tokenizer, Token
import tempfile
import shutil
# Custom Chinese tokenizer for Whoosh, backed by jieba word segmentation.
class ChineseTokenizer(Tokenizer):
    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        offset = 0
        for i, word in enumerate(jieba.cut(value)):
            t.original = t.text = word
            t.boost = 1.0
            if positions:
                # Token position index within the field
                t.pos = start_pos + i
            if chars:
                # Character offsets of the word within the source text
                start = value.find(word, offset)
                t.startchar = start_char + start
                t.endchar = t.startchar + len(word)
                offset = start + len(word)
            yield t


def ChineseAnalyzer():
    return ChineseTokenizer()

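# Illustrative only (hypothetical input): jieba segments a string such as
# "智能知识库管理文档" into words like ["智能", "知识库", "管理", "文档"], and the
# tokenizer above emits one Whoosh Token per word, which is what allows the
# TEXT fields declared below to be keyword-searched in Chinese.
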
class KnowledgeBase:
    """Core class of the smart knowledge base; manages documents and the vector database."""

    def __init__(self, persist_directory: str = "./knowledge_base"):
        """Initialize the knowledge base.

        Args:
            persist_directory: Directory where the vector database is persisted.
        """
        self.persist_directory = persist_directory
        # Initialize the ChromaDB client
        self.client = chromadb.PersistentClient(
            path=persist_directory
        )
        # Create or get the collection
        self.collection = self.client.get_or_create_collection(
            name="documents",
            metadata={"description": "Smart knowledge base document collection"}
        )
        # In-memory document metadata store
        self.document_metadata = {}
        self.load_metadata()
        # Initialize the text splitter; prefer paragraph breaks, then Chinese
        # sentence punctuation, then spaces.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", "。", "!", "?", ";", ",", " "]
        )
        # Initialize the Whoosh index for sparse (keyword) retrieval
        self.sparse_index_dir = os.path.join(persist_directory, "sparse_index")
        self.schema = Schema(
            doc_id=ID(stored=True),
            content=TEXT(stored=True, analyzer=ChineseAnalyzer()),
            title=TEXT(stored=True),
            file_path=STORED,
            timestamp=STORED
        )
        # Create or open the Whoosh index
        if not os.path.exists(self.sparse_index_dir):
            os.makedirs(self.sparse_index_dir)
            self.sparse_index = create_in(self.sparse_index_dir, self.schema)
        else:
            self.sparse_index = open_dir(self.sparse_index_dir)
        # Initialize the TF-IDF vectorizer
        self.tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba.cut, use_idf=True)

    def load_metadata(self):
        """Load document metadata."""
        # In a real application this should be loaded from persistent storage.
        self.document_metadata = {}

    def save_metadata(self):
        """Save document metadata."""
        # In a real application this should be written to persistent storage.
        pass

    def parse_document(self, file_path: str) -> Dict[str, Any]:
        """Parse documents in different formats.

        Args:
            file_path: Path of the file to parse.

        Returns:
            A dict containing the parsed content and metadata.
        """
        file_extension = os.path.splitext(file_path)[1].lower()
        content = ""
        metadata = {
            "file_path": file_path,
            "file_type": file_extension,
            "parsed_at": datetime.now().isoformat()
        }
        try:
            if file_extension == ".pdf":
                # Parse PDF
                with open(file_path, "rb") as f:
                    reader = pypdf.PdfReader(f)
                    for page in reader.pages:
                        content += page.extract_text() or ""
                    metadata["num_pages"] = len(reader.pages)
                    metadata["title"] = reader.metadata.title if reader.metadata and reader.metadata.title else os.path.basename(file_path)
            elif file_extension in [".doc", ".docx"]:
                # Parse Word documents
                content = docx2txt.process(file_path)
                metadata["title"] = os.path.basename(file_path)
            elif file_extension == ".txt":
                # Parse plain text
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                metadata["title"] = os.path.basename(file_path)
            else:
                raise ValueError(f"Unsupported file format: {file_extension}")
        except Exception as e:
            raise Exception(f"Failed to parse document: {str(e)}") from e
        return {"content": content, "metadata": metadata}

    def process_document(self, file_path: str, metadata: Dict = None) -> List[Dict[str, Any]]:
        """Full ETL pipeline for a document: parse, split, and prepare for vectorization.

        Args:
            file_path: Path of the file to process.
            metadata: Extra metadata to merge in.

        Returns:
            A list of processed document chunks.
        """
        # 1. Parse the document
        parsed_result = self.parse_document(file_path)
        content = parsed_result["content"]
        doc_metadata = parsed_result["metadata"]
        # Merge in extra metadata
        if metadata:
            doc_metadata.update(metadata)
        # 2. Split the text
        chunks = self.text_splitter.split_text(content)
        # 3. Build one record per chunk
        processed_chunks = []
        for i, chunk in enumerate(chunks):
            chunk_id = str(uuid.uuid4())
            chunk_metadata = {
                "id": chunk_id,
                "parent_file": file_path,
                "chunk_index": i,
                "total_chunks": len(chunks),
                "timestamp": datetime.now().isoformat(),
                "version": 1,
                **doc_metadata
            }
            processed_chunks.append({
                "id": chunk_id,
                "content": chunk,
                "metadata": chunk_metadata
            })
        return processed_chunks

    def add_document(self, content: str = None, file_path: str = None, metadata: Dict = None) -> List[str]:
        """Add a document to the knowledge base.

        Args:
            content: Document content (optional if file_path is provided).
            file_path: File path (optional if content is provided).
            metadata: Document metadata.

        Returns:
            A list of chunk IDs.
        """
        if not content and not file_path:
            raise ValueError("Either content or file_path must be provided")
        processed_chunks = []
        if file_path:
            # Process the document from its file path
            processed_chunks = self.process_document(file_path, metadata)
        else:
            # Process the raw content directly
            chunks = self.text_splitter.split_text(content)
            doc_metadata = {
                "timestamp": datetime.now().isoformat(),
                "version": 1,
                "file_type": "text",
                "title": metadata.get("title", "Direct input") if metadata else "Direct input",
                **(metadata or {})
            }
            for i, chunk in enumerate(chunks):
                chunk_id = str(uuid.uuid4())
                chunk_metadata = {
                    "id": chunk_id,
                    "parent_file": "direct_input",
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    **doc_metadata
                }
                processed_chunks.append({
                    "id": chunk_id,
                    "content": chunk,
                    "metadata": chunk_metadata
                })
        # Add to the vector database and the sparse index
        chunk_ids = []
        # 1. Collect everything for a batch insert
        all_documents = []
        all_metadatas = []
        all_ids = []
        for chunk in processed_chunks:
            chunk_id = chunk["id"]
            chunk_content = chunk["content"]
            chunk_metadata = chunk["metadata"]
            all_documents.append(chunk_content)
            all_metadatas.append(chunk_metadata)
            all_ids.append(chunk_id)
            # Keep the metadata in memory
            self.document_metadata[chunk_id] = chunk_metadata
            chunk_ids.append(chunk_id)
        # 2. Batch insert into the vector database
        if all_documents:
            self.collection.add(
                documents=all_documents,
                metadatas=all_metadatas,
                ids=all_ids
            )
        # 3. Batch insert into the sparse index (Whoosh)
        writer = self.sparse_index.writer()
        for doc_id, doc_content, doc_meta in zip(all_ids, all_documents, all_metadatas):
            writer.add_document(
                doc_id=doc_id,
                content=doc_content,
                title=doc_meta.get("title", ""),
                file_path=doc_meta.get("file_path", ""),
                timestamp=doc_meta.get("timestamp", "")
            )
        writer.commit()
        self.save_metadata()
        return chunk_ids

    def update_document(self, document_id: str, content: str = None, file_path: str = None, metadata: Dict = None) -> bool:
        """Update a document.

        Args:
            document_id: Chunk ID of the document.
            content: New document content (optional if file_path is provided).
            file_path: New file path (optional if content is provided).
            metadata: New metadata.

        Returns:
            Whether the update succeeded.
        """
        if document_id not in self.document_metadata:
            return False
        current_metadata = self.document_metadata[document_id]
        # 1. Delete the old chunk
        self.delete_document(document_id)
        # 2. Add the new content or file
        if content:
            chunks = self.text_splitter.split_text(content)
            writer = self.sparse_index.writer()
            for i, chunk in enumerate(chunks):
                chunk_id = document_id if i == 0 else str(uuid.uuid4())
                # Start from the old metadata, then overwrite the per-chunk fields
                # so the fresh timestamp and bumped version are not clobbered.
                new_metadata = {
                    **current_metadata,
                    **(metadata or {}),
                    "id": chunk_id,
                    "parent_file": current_metadata.get("parent_file"),
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "timestamp": datetime.now().isoformat(),
                    "version": current_metadata.get("version", 1) + 1
                }
                # Add to the vector database
                self.collection.add(
                    documents=[chunk],
                    metadatas=[new_metadata],
                    ids=[chunk_id]
                )
                # Add to the sparse index
                writer.add_document(
                    doc_id=chunk_id,
                    content=chunk,
                    title=new_metadata.get("title", ""),
                    file_path=new_metadata.get("file_path", ""),
                    timestamp=new_metadata.get("timestamp", "")
                )
                # Keep the metadata in memory
                self.document_metadata[chunk_id] = new_metadata
            writer.commit()
        elif file_path:
            # Re-process the file, carrying over the old metadata with a bumped version
            processed_chunks = self.process_document(file_path, {
                **current_metadata,
                **(metadata or {}),
                "version": current_metadata.get("version", 1) + 1
            })
            writer = self.sparse_index.writer()
            for chunk in processed_chunks:
                chunk_id = document_id if chunk["metadata"].get("chunk_index") == 0 else str(uuid.uuid4())
                chunk["metadata"]["id"] = chunk_id
                chunk["metadata"]["version"] = current_metadata.get("version", 1) + 1
                # Add to the vector database
                self.collection.add(
                    documents=[chunk["content"]],
                    metadatas=[chunk["metadata"]],
                    ids=[chunk_id]
                )
                # Add to the sparse index
                writer.add_document(
                    doc_id=chunk_id,
                    content=chunk["content"],
                    title=chunk["metadata"].get("title", ""),
                    file_path=chunk["metadata"].get("file_path", ""),
                    timestamp=chunk["metadata"].get("timestamp", "")
                )
                # Keep the metadata in memory
                self.document_metadata[chunk_id] = chunk["metadata"]
            writer.commit()
        self.save_metadata()
        return True

    def delete_document(self, document_id: str) -> bool:
        """Delete a document.

        Args:
            document_id: Chunk ID of the document.

        Returns:
            Whether the deletion succeeded.
        """
        if document_id not in self.document_metadata:
            return False
        # 1. Delete from the vector database
        self.collection.delete(ids=[document_id])
        # 2. Delete from the sparse index
        writer = self.sparse_index.writer()
        writer.delete_by_term("doc_id", document_id)
        writer.commit()
        # 3. Delete the in-memory metadata
        del self.document_metadata[document_id]
        self.save_metadata()
        return True

    def get_document(self, document_id: str) -> Optional[Dict[str, Any]]:
        """Get information about a document.

        Args:
            document_id: Document ID.

        Returns:
            The document's metadata, or None if it does not exist.
        """
        return self.document_metadata.get(document_id)

    def list_documents(self) -> List[Dict[str, Any]]:
        """List all documents.

        Returns:
            The list of document metadata records.
        """
        return list(self.document_metadata.values())

    def search(self, query: str, n_results: int = 5, hybrid_weight: float = 0.5) -> List[Dict[str, Any]]:
        """Hybrid document search (dense vectors + sparse keywords).

        Args:
            query: Search query.
            n_results: Number of results to return.
            hybrid_weight: Blend weight (0 = sparse only, 1 = dense only).

        Returns:
            The re-ranked list of search results.
        """
        # 1. Dense vector search (ChromaDB)
        dense_results = self.collection.query(
            query_texts=[query],
            n_results=n_results * 2,  # fetch extra candidates for re-ranking
            include=["documents", "metadatas", "distances"]
        )
        # Format the dense results
        dense_formatted = []
        for i in range(len(dense_results["ids"][0])):
            dense_formatted.append({
                "id": dense_results["ids"][0][i],
                "content": dense_results["documents"][0][i],
                "metadata": dense_results["metadatas"][0][i],
                # Convert distance to a similarity-style score
                "dense_score": 1.0 / (1.0 + dense_results["distances"][0][i]),
                "sparse_score": 0.0
            })
        # 2. Sparse keyword search (Whoosh)
        sparse_results = []
        with self.sparse_index.searcher() as searcher:
            parser = MultifieldParser(["content", "title"], schema=self.schema)
            whoosh_query = parser.parse(query)
            whoosh_results = searcher.search(whoosh_query, limit=n_results * 2)
            for result in whoosh_results:
                doc_id = result["doc_id"]
                if doc_id in self.document_metadata:
                    sparse_results.append({
                        "id": doc_id,
                        "content": result["content"],
                        "metadata": self.document_metadata[doc_id],
                        "dense_score": 0.0,
                        "sparse_score": result.score
                    })
        # 3. Merge the two result sets by chunk ID
        all_results = {}
        # Dense results first
        for result in dense_formatted:
            all_results[result["id"]] = result
        # Add or update with the sparse results
        for result in sparse_results:
            if result["id"] in all_results:
                all_results[result["id"]]["sparse_score"] = result["sparse_score"]
            else:
                all_results[result["id"]] = result
        # 4. Re-rank by the blended score
        def calculate_hybrid_score(result):
            return (hybrid_weight * result["dense_score"]) + ((1 - hybrid_weight) * result["sparse_score"])
        sorted_results = sorted(
            all_results.values(),
            key=calculate_hybrid_score,
            reverse=True
        )[:n_results]
        # 5. Format the final results
        final_results = []
        for result in sorted_results:
            final_results.append({
                "id": result["id"],
                "content": result["content"],
                "metadata": result["metadata"],
                "dense_score": result["dense_score"],
                "sparse_score": result["sparse_score"],
                "hybrid_score": calculate_hybrid_score(result)
            })
        return final_results

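    # Worked example of the blended score (illustrative numbers only): with
    # hybrid_weight=0.5, a chunk with dense_score=0.8 and sparse_score=2.0 ranks at
    # 0.5 * 0.8 + 0.5 * 2.0 = 1.4. Note the scales differ: dense_score is bounded to
    # (0, 1], while Whoosh's BM25 score is unbounded, so sparse hits tend to dominate
    # unless hybrid_weight is raised or the scores are normalized first.
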
    def clear(self):
        """Clear the knowledge base."""
        # collection.delete() requires ids or a filter, so drop and recreate the collection.
        self.client.delete_collection("documents")
        self.collection = self.client.get_or_create_collection(name="documents")
        # Recreate an empty sparse keyword index as well.
        self.sparse_index = create_in(self.sparse_index_dir, self.schema)
        self.document_metadata = {}
        self.save_metadata()

# Global knowledge base instance
global_knowledge_base = None


def get_knowledge_base() -> KnowledgeBase:
    """Get the knowledge base instance (singleton)."""
    global global_knowledge_base
    if global_knowledge_base is None:
        global_knowledge_base = KnowledgeBase()
    return global_knowledge_base
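

# Minimal usage sketch. Assumptions not taken from the original module: it is run
# as a script, the local "./knowledge_base" directory is writable, and the sample
# text and query below are made-up illustrations.
if __name__ == "__main__":
    kb = get_knowledge_base()
    # Index a short piece of text directly, without going through a file.
    ids = kb.add_document(
        content="ChromaDB 是一个向量数据库,Whoosh 提供关键词检索。",
        metadata={"title": "示例文档"}
    )
    print(f"Indexed {len(ids)} chunk(s)")
    # Hybrid search: a weight of 0.5 blends the dense and sparse scores equally.
    for hit in kb.search("向量数据库", n_results=3, hybrid_weight=0.5):
        print(hit["hybrid_score"], hit["metadata"].get("title"), hit["content"][:30])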