- 添加MDF文件导出功能 - 集成阿里云OCR大模型识别 - 添加百度智能云AI照片评分 - 集成DeepSeek大模型创意文案生成 - 完善文档和配置管理 - 使用uv进行现代化依赖管理 - 添加完整的.gitignore配置
52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
import fitz # PyMuPDF
|
|
import pandas as pd
|
|
|
|
def extract_text_from_pdf(pdf_path):
    """Extract the text content of every page in a PDF.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        The text of all pages concatenated in page order.

    Raises:
        Exception: If opening or reading the document fails. The message
            starts with "PDF文本提取失败: " and the original error is attached
            as ``__cause__``.
    """
    try:
        # The context manager closes the document even when a page fails to
        # load or render; a trailing close() call would be skipped then.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        raise Exception(f"PDF文本提取失败: {str(e)}") from e
|
|
|
|
def extract_tables_from_pdf(pdf_path):
    """Extract table data from a PDF (placeholder implementation).

    Table detection is not implemented yet: each page's text is read, but
    nothing is parsed out of it, so the returned list is always empty when
    the document can be read.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        A list intended to hold the extracted tables; currently always ``[]``.

    Raises:
        Exception: If opening or reading the document fails. The message
            starts with "PDF表格提取失败: " and the original error is attached
            as ``__cause__``.
    """
    try:
        tables = []

        # The context manager closes the document even if reading a page
        # raises; a trailing close() call would be skipped in that case.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                # Simple stand-in for real table extraction: the page text is
                # still read so unreadable pages surface as errors, but the
                # result is not used yet.
                # TODO: add table detection and extraction logic here and
                # append the results to `tables`.
                page.get_text("text")

        return tables
    except Exception as e:
        raise Exception(f"PDF表格提取失败: {str(e)}") from e
|
|
|
|
def pdf_to_excel(pdf_path, output_path):
    """Export the text of a PDF to an Excel file, one paragraph per row.

    The text returned by ``extract_text_from_pdf`` is split on blank lines
    (``'\\n\\n'``); each non-empty piece becomes a row with a 1-based
    paragraph number ('段落编号') and its content ('内容').

    Args:
        pdf_path: Location of the source PDF.
        output_path: Destination handed to ``DataFrame.to_excel``.

    Returns:
        ``True`` once the file has been written.

    Raises:
        Exception: If text extraction or the Excel export fails. The message
            starts with "PDF转Excel失败: " and the original error is attached
            as ``__cause__``.
    """
    try:
        text = extract_text_from_pdf(pdf_path)

        # Split into paragraphs, trimming each piece once and dropping the
        # ones that are empty after trimming.
        trimmed = (piece.strip() for piece in text.split('\n\n'))
        paragraphs = [piece for piece in trimmed if piece]

        # Build the two-column sheet: running number plus paragraph text.
        df = pd.DataFrame({
            '段落编号': range(1, len(paragraphs) + 1),
            '内容': paragraphs,
        })

        df.to_excel(output_path, index=False)
        return True
    except Exception as e:
        raise Exception(f"PDF转Excel失败: {str(e)}") from e