# pdf_to_txt_mupdf.py
  1. import os
  2. import fitz # PyMuPDF
  3. import argparse
  4. import logging
  5. from datetime import datetime
  6. from service.trunks_service import TrunksService
  7. def setup_logging(log_dir):
  8. """
  9. 配置日志记录器
  10. Args:
  11. log_dir: 日志文件保存目录
  12. """
  13. # 创建日志目录
  14. os.makedirs(log_dir, exist_ok=True)
  15. # 创建带时间戳的日志文件名
  16. timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
  17. log_file = os.path.join(log_dir, f'pdf_process_{timestamp}.log')
  18. # 配置日志格式
  19. logging.basicConfig(
  20. level=logging.INFO,
  21. format='%(asctime)s - %(levelname)s - %(message)s',
  22. handlers=[
  23. logging.FileHandler(log_file, encoding='utf-8'),
  24. logging.StreamHandler()
  25. ]
  26. )
  27. def process_pdf_file(pdf_path):
  28. """
  29. 处理单个PDF文件,将每一页内容提取并保存到单独的txt文件中
  30. Args:
  31. pdf_path: PDF文件路径
  32. Returns:
  33. bool: 处理是否成功
  34. """
  35. try:
  36. # 检查文件是否存在
  37. if not os.path.exists(pdf_path):
  38. logging.error(f"错误: 文件 {pdf_path} 不存在!")
  39. return False
  40. # 获取PDF文件名(不包含扩展名)作为输出目录名
  41. pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
  42. output_dir = os.path.join(os.path.dirname(pdf_path), f"{pdf_name}_pages")
  43. # 创建输出目录
  44. os.makedirs(output_dir, exist_ok=True)
  45. # 打开PDF文件
  46. doc = fitz.open(pdf_path)
  47. total_pages = len(doc)
  48. logging.info(f"开始处理PDF文件: {pdf_path} (总页数: {total_pages})")
  49. # 处理每一页
  50. empty_page_count = 0 # 连续空页计数器
  51. for page_num in range(total_pages):
  52. try:
  53. # 提取页面文本
  54. page = doc.load_page(page_num)
  55. text = page.get_text("text")
  56. # 检查文本是否为空
  57. if not text.strip():
  58. empty_page_count += 1
  59. if empty_page_count >= 20:
  60. raise Exception(f"连续20页内容为空,文件可能已损坏: {pdf_path}")
  61. continue
  62. else:
  63. empty_page_count = 0 # 重置计数器
  64. # 使用trunks_service保存文本内容
  65. trunk_service = TrunksService()
  66. trunk_data = {
  67. 'content': text,
  68. 'file_path': pdf_path,
  69. 'type': 'page',
  70. 'title': pdf_name,
  71. 'page_no': page_num+1
  72. }
  73. trunk_service.create_trunk(trunk_data)
  74. logging.info(f"已处理 {pdf_path} 的第 {page_num+1 } 页,并保存到数据库")
  75. except Exception as e:
  76. logging.error(f"处理 {pdf_path} 的第 {page_num+1} 页时出错: {str(e)}")
  77. # 关闭PDF文件
  78. doc.close()
  79. logging.info(f"完成处理PDF文件: {pdf_path}")
  80. return True
  81. except Exception as e:
  82. logging.error(f"处理PDF文件时出错 {pdf_path}: {str(e)}")
  83. return False
  84. def process_directory(input_path):
  85. """
  86. 递归处理目录下的所有PDF文件
  87. Args:
  88. input_path: 输入路径(文件或目录)
  89. Returns:
  90. tuple: (总文件数, 成功处理数, 跳过文件数, 失败文件数)
  91. """
  92. # 初始化计数器
  93. total_files = 0
  94. success_count = 0
  95. skipped_count = 0
  96. failed_count = 0
  97. # 如果是文件,直接处理
  98. if os.path.isfile(input_path):
  99. total_files = 1
  100. if input_path.lower().endswith('.pdf'):
  101. if process_pdf_file(input_path):
  102. success_count = 1
  103. else:
  104. failed_count = 1
  105. else:
  106. logging.info(f"跳过非PDF文件: {input_path}")
  107. skipped_count = 1
  108. # 如果是目录,递归处理
  109. elif os.path.isdir(input_path):
  110. for root, _, files in os.walk(input_path):
  111. for file in files:
  112. total_files += 1
  113. file_path = os.path.join(root, file)
  114. if file.lower().endswith('.pdf'):
  115. if process_pdf_file(file_path):
  116. success_count += 1
  117. else:
  118. failed_count += 1
  119. else:
  120. logging.info(f"跳过非PDF文件: {file_path}")
  121. skipped_count += 1
  122. return total_files, success_count, skipped_count, failed_count
  123. def process_pdf_files(input_path, log_dir='logs'):
  124. """
  125. 处理PDF文件或目录,将每个PDF文件的每一页保存为单独的txt文件
  126. Args:
  127. input_path: 输入文件或目录路径
  128. log_dir: 日志文件保存目录,默认为'logs'
  129. Returns:
  130. tuple: (总文件数, 成功处理数, 跳过文件数, 失败文件数)
  131. """
  132. # 设置日志记录器
  133. setup_logging(log_dir)
  134. # 处理文件或目录
  135. logging.info(f"开始处理输入路径: {input_path}")
  136. total, success, skipped, failed = process_directory(input_path)
  137. # 输出统计信息
  138. logging.info(f"\n处理完成!统计信息:")
  139. logging.info(f"总文件数: {total}")
  140. logging.info(f"成功处理: {success}")
  141. logging.info(f"跳过文件: {skipped}")
  142. logging.info(f"处理失败: {failed}")
  143. return total, success, skipped, failed
  144. if __name__ == "__main__":
  145. directory = 'C:\\Users\\17664\\Desktop\\test'
  146. # 设置日志记录器
  147. setup_logging(directory)
  148. # 处理文件或目录
  149. logging.info(f"开始处理输入路径: {directory}")
  150. total, success, skipped, failed = process_directory(directory)
  151. # 输出统计信息
  152. logging.info(f"\n处理完成!统计信息:")
  153. logging.info(f"总文件数: {total}")
  154. logging.info(f"成功处理: {success}")
  155. logging.info(f"跳过文件: {skipped}")
  156. logging.info(f"处理失败: {failed}")