SPC/test_read.py

import os
from docx import Document

# Directory scanned for documents; resolved relative to the current working directory.
uploads_folder = 'uploads'

print("=== 测试文档读取功能 ===\n")

# Smoke test: for every entry in the uploads directory, print its name and,
# for .txt / .pdf / .docx files, the extracted text length and a 100-character
# preview. Entries with any other extension only get their name printed.
#
# os.listdir() returns names in arbitrary order, so sort them to make the
# output reproducible between runs.
for file in sorted(os.listdir(uploads_folder)):
    filepath = os.path.join(uploads_folder, file)
    print(f"文件: {file}")

    # Match extensions case-insensitively so that e.g. "REPORT.PDF" is read
    # instead of being silently skipped.
    name_lower = file.lower()

    if name_lower.endswith('.txt'):
        # Text files are assumed to be UTF-8; a different encoding raises
        # UnicodeDecodeError here.
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        print("类型: TXT")
        print(f"内容长度: {len(content)} 字符")
        print(f"前100字符: {content[:100]}...\n")

    elif name_lower.endswith('.pdf'):
        # Imported lazily so the script still runs without pypdf installed
        # when the folder contains no PDF files.
        import pypdf
        with open(filepath, 'rb') as f:
            reader = pypdf.PdfReader(f)
            # One newline-terminated chunk per page, joined in a single pass.
            text = ''.join(page.extract_text() + '\n' for page in reader.pages)
            # Read the page count while the underlying file is still open.
            page_count = len(reader.pages)
        print("类型: PDF")
        print(f"页数: {page_count}")
        print(f"内容长度: {len(text)} 字符")
        print(f"前100字符: {text[:100]}...\n")

    elif name_lower.endswith('.docx'):
        doc = Document(filepath)
        # Only body paragraphs are read here (doc.paragraphs).
        text = ''.join(paragraph.text + '\n' for paragraph in doc.paragraphs)
        print("类型: DOCX")
        print(f"段落数: {len(doc.paragraphs)}")
        print(f"内容长度: {len(text)} 字符")
        print(f"前100字符: {text[:100]}...\n")
http://hblu.top:3000/Python2025-CourseDesign/SPC.git 2026-01-08 16:10:25 +08:00			`import os`
			`from docx import Document`

			`uploads_folder = 'uploads'`

			`print("=== 测试文档读取功能 ===\n")`

			`for file in os.listdir(uploads_folder):`
			`filepath = os.path.join(uploads_folder, file)`
			`print(f"文件: {file}")`

			`if file.endswith('.txt'):`
			`with open(filepath, 'r', encoding='utf-8') as f:`
			`content = f.read()`
			`print(f"类型: TXT")`
			`print(f"内容长度: {len(content)} 字符")`
			`print(f"前100字符: {content[:100]}...\n")`

			`elif file.endswith('.pdf'):`
			`import pypdf`
			`with open(filepath, 'rb') as f:`
			`reader = pypdf.PdfReader(f)`
			`text = ''`
			`for page in reader.pages:`
			`text += page.extract_text() + '\n'`
			`print(f"类型: PDF")`
			`print(f"页数: {len(reader.pages)}")`
			`print(f"内容长度: {len(text)} 字符")`
			`print(f"前100字符: {text[:100]}...\n")`

			`elif file.endswith('.docx'):`
			`doc = Document(filepath)`
			`text = ''`
			`for paragraph in doc.paragraphs:`
			`text += paragraph.text + '\n'`
			`print(f"类型: DOCX")`
			`print(f"段落数: {len(doc.paragraphs)}")`
			`print(f"内容长度: {len(text)} 字符")`
			`print(f"前100字符: {text[:100]}...\n")`