|
@@ -0,0 +1,196 @@
|
|
|
+import os
|
|
|
+import fitz # PyMuPDF
|
|
|
+import argparse
|
|
|
+import logging
|
|
|
+from datetime import datetime
|
|
|
+from service.trunks_service import TrunksService
|
|
|
+
|
|
|
def setup_logging(log_dir):
    """Configure root logging to a timestamped file plus the console.

    The directory is created if it does not exist yet. A new log file
    named ``pdf_process_<YYYYmmdd_HHMMSS>.log`` is opened inside it on
    every call.

    Note: ``logging.basicConfig`` is a no-op when the root logger already
    has handlers, so only the first call in a process attaches handlers.

    Args:
        log_dir: Directory in which the log file is created.
    """
    # Make sure the target directory exists before opening the log file.
    os.makedirs(log_dir, exist_ok=True)

    # One log file per invocation, keyed by the current local time.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_path = os.path.join(log_dir, f'pdf_process_{stamp}.log')

    # Records go both to the UTF-8 encoded file and to the console stream.
    file_handler = logging.FileHandler(log_path, encoding='utf-8')
    console_handler = logging.StreamHandler()

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[file_handler, console_handler],
    )
|
|
|
+
|
|
|
def process_pdf_file(pdf_path, max_empty_pages=20):
    """Extract the text of each page of a PDF and store it via TrunksService.

    Every page with non-blank text is saved as one trunk record with the
    keys ``content``, ``file_path``, ``type`` ('page'), ``title`` (the PDF
    file name without extension) and ``page_no`` (1-based). Blank pages
    are skipped. A failure on a single page is logged and the remaining
    pages are still processed (best effort).

    Args:
        pdf_path: Path of the PDF file to process.
        max_empty_pages: Number of consecutive blank pages after which the
            file is treated as damaged and processing is aborted.

    Returns:
        bool: True when the file was opened and walked to the end, False
        when the file is missing, cannot be opened, or is aborted because
        of too many consecutive blank pages.
    """
    doc = None
    try:
        if not os.path.exists(pdf_path):
            logging.error(f"错误: 文件 {pdf_path} 不存在!")
            return False

        # The file name without extension is used as the record title.
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

        doc = fitz.open(pdf_path)
        total_pages = len(doc)
        logging.info(f"开始处理PDF文件: {pdf_path} (总页数: {total_pages})")

        empty_page_count = 0  # consecutive blank pages seen so far
        for page_index in range(total_pages):
            page_no = page_index + 1

            try:
                text = doc.load_page(page_index).get_text("text")
            except Exception as e:
                logging.error(f"处理 {pdf_path} 的第 {page_no} 页时出错: {str(e)}")
                continue

            if not text.strip():
                empty_page_count += 1
                if empty_page_count >= max_empty_pages:
                    # Raised outside the per-page handler on purpose, so
                    # that it aborts the whole file instead of being
                    # logged as an ordinary page error.
                    raise RuntimeError(
                        f"连续{max_empty_pages}页内容为空,文件可能已损坏: {pdf_path}"
                    )
                continue
            empty_page_count = 0

            try:
                # A fresh service instance per page, as in the original;
                # whether one instance can be reused safely is not visible
                # from this file.
                trunk_service = TrunksService()
                trunk_service.create_trunk({
                    'content': text,
                    'file_path': pdf_path,
                    'type': 'page',
                    'title': pdf_name,
                    'page_no': page_no,
                })
                logging.info(f"已处理 {pdf_path} 的第 {page_no} 页,并保存到数据库")
            except Exception as e:
                logging.error(f"处理 {pdf_path} 的第 {page_no} 页时出错: {str(e)}")

        logging.info(f"完成处理PDF文件: {pdf_path}")
        return True

    except Exception as e:
        logging.error(f"处理PDF文件时出错 {pdf_path}: {str(e)}")
        return False

    finally:
        # Always release the document, including on the abort path.
        if doc is not None:
            doc.close()
|
|
|
+
|
|
|
def _process_single_path(file_path):
    """Handle one file and return its outcome: 'success', 'failed' or 'skipped'."""
    if not file_path.lower().endswith('.pdf'):
        logging.info(f"跳过非PDF文件: {file_path}")
        return 'skipped'
    return 'success' if process_pdf_file(file_path) else 'failed'


def process_directory(input_path):
    """Process a single file, or every file found recursively under a directory.

    Files whose name ends with ``.pdf`` (case-insensitive) are passed to
    ``process_pdf_file``; all other files are logged and counted as
    skipped. A path that is neither a file nor a directory is reported as
    an error and yields all-zero counters.

    Args:
        input_path: Path of a file or of a directory.

    Returns:
        tuple: (total files, successfully processed, skipped, failed)
    """
    if os.path.isfile(input_path):
        candidates = [input_path]
    elif os.path.isdir(input_path):
        # Lazy generator: files are processed while the tree is walked.
        candidates = (
            os.path.join(root, name)
            for root, _, names in os.walk(input_path)
            for name in names
        )
    else:
        # Previously this case returned zeros without any diagnostic.
        logging.error(f"输入路径不存在或不是文件/目录: {input_path}")
        return 0, 0, 0, 0

    counts = {'success': 0, 'skipped': 0, 'failed': 0}
    total_files = 0
    for file_path in candidates:
        total_files += 1
        counts[_process_single_path(file_path)] += 1

    return total_files, counts['success'], counts['skipped'], counts['failed']
|
|
|
+
|
|
|
def process_pdf_files(input_path, log_dir='logs'):
    """Set up logging, process a PDF file or directory, and log a summary.

    Logging is configured through ``setup_logging(log_dir)``, the actual
    work is delegated to ``process_directory(input_path)``, and the four
    resulting counters are written to the log before being returned.

    Args:
        input_path: Path of the input file or directory.
        log_dir: Directory for the log file; defaults to 'logs'.

    Returns:
        tuple: (total files, successfully processed, skipped, failed)
    """
    setup_logging(log_dir)

    logging.info(f"开始处理输入路径: {input_path}")
    total, success, skipped, failed = process_directory(input_path)

    # Summary: a header line followed by one line per counter.
    logging.info("\n处理完成!统计信息:")
    summary = (
        ("总文件数", total),
        ("成功处理", success),
        ("跳过文件", skipped),
        ("处理失败", failed),
    )
    for label, value in summary:
        logging.info(f"{label}: {value}")

    return total, success, skipped, failed
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Raw string: the previous literal relied on '\急' and '\p' being
    # unrecognised escape sequences, which newer Python versions warn
    # about. The resulting path value is unchanged.
    default_input = r'E:\急诊科资料\pdf'

    parser = argparse.ArgumentParser(
        description="Extract the text of every PDF page and store it page by page."
    )
    parser.add_argument(
        'input_path',
        nargs='?',
        default=default_input,
        help="PDF file or directory to process (default: %(default)s)",
    )
    parser.add_argument(
        '--log-dir',
        default=None,
        help="directory for the log file (default: the input directory)",
    )
    args = parser.parse_args()

    # With no arguments the log file goes into the input directory, as
    # before. For a single-file input, use the directory containing it.
    if args.log_dir is not None:
        log_dir = args.log_dir
    elif os.path.isfile(args.input_path):
        log_dir = os.path.dirname(os.path.abspath(args.input_path))
    else:
        log_dir = args.input_path

    # process_pdf_files performs the same setup/log/process/summary steps
    # that were previously duplicated here.
    process_pdf_files(args.input_path, log_dir=log_dir)
|