40 lines
1.3 KiB
Python
40 lines
1.3 KiB
Python
import os
|
|
from docx import Document
|
|
|
|
# Folder (relative to the current working directory) whose files are inspected.
uploads_folder = 'uploads'


def _extract_txt(filepath):
    """Read a text file as UTF-8.

    Returns a ``(text, detail)`` pair; ``detail`` is always ``None`` because
    the TXT report has no extra line.  Raises ``UnicodeDecodeError`` when the
    file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read(), None


def _extract_pdf(filepath):
    """Extract the text of every page of a PDF.

    Returns ``(text, detail)`` where ``text`` is each page's extracted text
    followed by a newline, and ``detail`` is the ready-to-print page-count
    line.
    """
    # Imported lazily, as in the original script: pypdf is only required
    # once a PDF is actually encountered.
    import pypdf

    with open(filepath, 'rb') as f:
        reader = pypdf.PdfReader(f)
        pages = reader.pages
        # join() instead of repeated "+=" keeps the build linear.
        text = ''.join(page.extract_text() + '\n' for page in pages)
        return text, f"页数: {len(pages)}"


def _extract_docx(filepath):
    """Extract the paragraph text of a DOCX file.

    Returns ``(text, detail)`` where ``text`` is each paragraph's text
    followed by a newline, and ``detail`` is the ready-to-print
    paragraph-count line.  Relies on ``Document`` from the file-level
    ``from docx import Document`` import.
    """
    paragraphs = Document(filepath).paragraphs
    text = ''.join(paragraph.text + '\n' for paragraph in paragraphs)
    return text, f"段落数: {len(paragraphs)}"


# (lower-case suffix, label printed on the type line, extractor function)
_EXTRACTORS = (
    ('.txt', 'TXT', _extract_txt),
    ('.pdf', 'PDF', _extract_pdf),
    ('.docx', 'DOCX', _extract_docx),
)


def _find_extractor(filename):
    """Return ``(label, extractor)`` for *filename*, or ``None`` if unsupported.

    The suffix comparison is case-insensitive so that e.g. ``REPORT.PDF`` is
    handled the same way as ``report.pdf``.
    """
    lowered = filename.lower()
    for suffix, label, extractor in _EXTRACTORS:
        if lowered.endswith(suffix):
            return label, extractor
    return None


print("=== 测试文档读取功能 ===\n")

if not os.path.isdir(uploads_folder):
    # Report a missing folder instead of dying with a FileNotFoundError
    # traceback from os.listdir().
    print(f"目录不存在: {uploads_folder}")
else:
    # os.listdir() returns entries in arbitrary order; sort them so the
    # report is reproducible between runs.
    for file in sorted(os.listdir(uploads_folder)):
        filepath = os.path.join(uploads_folder, file)
        print(f"文件: {file}")

        # Sub-directories are listed but never opened, even when their name
        # happens to end in a supported suffix.
        if not os.path.isfile(filepath):
            continue

        found = _find_extractor(file)
        if found is None:
            # Unsupported type: only the file name line is printed.
            continue

        label, extractor = found
        text, detail = extractor(filepath)

        print(f"类型: {label}")
        if detail is not None:
            print(detail)
        print(f"内容长度: {len(text)} 字符")
        print(f"前100字符: {text[:100]}...\n")