SPC/test_read.py

40 lines
1.3 KiB
Python
Raw Normal View History

import os
from docx import Document
# Folder (relative to the current working directory) scanned for documents.
uploads_folder = 'uploads'


def extract_txt(filepath):
    """Return the entire content of the UTF-8 text file at *filepath*."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read()


def extract_pdf(filepath):
    """Return ``(text, page_count)`` for the PDF at *filepath*.

    The text is every page's extracted text, each followed by a newline.
    """
    # Imported lazily so that a folder without PDFs never needs pypdf.
    import pypdf

    with open(filepath, 'rb') as f:
        reader = pypdf.PdfReader(f)
        pages = reader.pages
        # Extract while the file handle is still open.
        text = ''.join(page.extract_text() + '\n' for page in pages)
        return text, len(pages)


def extract_docx(filepath):
    """Return ``(text, paragraph_count)`` for the DOCX at *filepath*.

    The text is every paragraph's text, each followed by a newline.
    """
    paragraphs = Document(filepath).paragraphs
    text = ''.join(paragraph.text + '\n' for paragraph in paragraphs)
    return text, len(paragraphs)


def describe_file(filepath):
    """Build the report lines for a single document.

    The file type is chosen from the (case-insensitive) file extension.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        A list of strings to print, one per output line, or an empty list
        when the extension is not one of ``.txt``, ``.pdf`` or ``.docx``.
    """
    # Compare in lowercase so 'REPORT.PDF' is handled like 'report.pdf'.
    lowered = filepath.lower()
    if lowered.endswith('.txt'):
        text = extract_txt(filepath)
        lines = ["类型: TXT"]
    elif lowered.endswith('.pdf'):
        text, page_count = extract_pdf(filepath)
        lines = ["类型: PDF", f"页数: {page_count}"]
    elif lowered.endswith('.docx'):
        text, paragraph_count = extract_docx(filepath)
        lines = ["类型: DOCX", f"段落数: {paragraph_count}"]
    else:
        return []
    lines.append(f"内容长度: {len(text)} 字符")
    lines.append(f"前100字符: {text[:100]}...\n")
    return lines


def main(folder=None):
    """Print a short read-back report for every entry in *folder*.

    Args:
        folder: Directory to scan; defaults to the module-level
            ``uploads_folder``.
    """
    if folder is None:
        folder = uploads_folder
    print("=== 测试文档读取功能 ===\n")
    if not os.path.isdir(folder):
        # Report a missing folder instead of dying with a traceback.
        print(f"目录不存在: {folder}")
        return
    # os.listdir() returns names in arbitrary order; sort for stable output.
    for name in sorted(os.listdir(folder)):
        filepath = os.path.join(folder, name)
        print(f"文件: {name}")
        if not os.path.isfile(filepath):
            # Sub-directories (even ones named like 'x.txt') cannot be read.
            continue
        for line in describe_file(filepath):
            print(line)


# Executed unconditionally, exactly as the original flat script was.
main()