123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196 |
- import os
- import fitz # PyMuPDF
- import argparse
- import logging
- from datetime import datetime
- from service.trunks_service import TrunksService
def setup_logging(log_dir):
    """Configure root logging to write to a timestamped file and the console.

    Args:
        log_dir: Directory that receives the log file. It is created,
            including missing parents, when it does not exist yet.
    """
    os.makedirs(log_dir, exist_ok=True)

    # The file name carries the current local time (second precision).
    log_name = 'pdf_process_{}.log'.format(
        datetime.now().strftime('%Y%m%d_%H%M%S')
    )
    file_handler = logging.FileHandler(
        os.path.join(log_dir, log_name), encoding='utf-8'
    )
    console_handler = logging.StreamHandler()

    # INFO and above go to both handlers with a shared line format.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[file_handler, console_handler],
    )
def process_pdf_file(pdf_path, max_empty_pages=20):
    """Extract the text of each page of one PDF and store it via TrunksService.

    Every page that contains text is passed to ``TrunksService.create_trunk``
    as a dict with the keys ``content``, ``file_path``, ``type`` (always
    ``'page'``), ``title`` (the file name without its extension) and
    ``page_no`` (1-based). A page whose extraction or storage raises is
    logged and skipped; the remaining pages are still processed.

    Args:
        pdf_path: Path of the PDF file to process.
        max_empty_pages: Number of consecutive text-less pages after which
            the file is considered damaged and processing is aborted.

    Returns:
        bool: False when the file does not exist, cannot be opened, or is
        aborted because of too many consecutive empty pages; True otherwise
        (including when individual pages failed and were skipped).
    """
    doc = None
    try:
        if not os.path.exists(pdf_path):
            logging.error(f"错误: 文件 {pdf_path} 不存在!")
            return False

        # The file name without extension is stored as the trunk title.
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

        doc = fitz.open(pdf_path)
        total_pages = len(doc)
        logging.info(f"开始处理PDF文件: {pdf_path} (总页数: {total_pages})")

        empty_page_count = 0  # length of the current run of empty pages
        for page_num in range(total_pages):
            page_no = page_num + 1

            try:
                text = doc.load_page(page_num).get_text("text")
            except Exception as e:
                logging.error(f"处理 {pdf_path} 的第 {page_no} 页时出错: {str(e)}")
                continue

            if not text.strip():
                empty_page_count += 1
                if empty_page_count >= max_empty_pages:
                    # Raised outside the per-page handler on purpose, so the
                    # abort reaches the file-level handler instead of being
                    # swallowed as an ordinary page error.
                    raise RuntimeError(
                        f"连续{max_empty_pages}页内容为空,文件可能已损坏: {pdf_path}"
                    )
                continue
            empty_page_count = 0

            try:
                # A fresh service per page mirrors the original behavior.
                # NOTE(review): hoist out of the loop if TrunksService is
                # safe to reuse across calls.
                TrunksService().create_trunk({
                    'content': text,
                    'file_path': pdf_path,
                    'type': 'page',
                    'title': pdf_name,
                    'page_no': page_no,
                })
                logging.info(f"已处理 {pdf_path} 的第 {page_no} 页,并保存到数据库")
            except Exception as e:
                logging.error(f"处理 {pdf_path} 的第 {page_no} 页时出错: {str(e)}")

        logging.info(f"完成处理PDF文件: {pdf_path}")
        return True

    except Exception as e:
        logging.error(f"处理PDF文件时出错 {pdf_path}: {str(e)}")
        return False

    finally:
        # Release the document on every exit path, including the abort.
        if doc is not None:
            doc.close()
def process_directory(input_path):
    """Process a single file, or every file found recursively under a directory.

    Files whose name ends in ``.pdf`` (case-insensitive) are handed to
    ``process_pdf_file``; every other file is logged and counted as skipped.
    A path that is neither an existing file nor an existing directory is
    logged as an error and yields all-zero counts.

    Args:
        input_path: Path of a file or a directory.

    Returns:
        tuple: (total files seen, PDFs processed successfully,
        non-PDF files skipped, PDFs that failed).
    """
    total_files = 0
    success_count = 0
    skipped_count = 0
    failed_count = 0

    if os.path.isfile(input_path):
        candidates = [input_path]
    elif os.path.isdir(input_path):
        # Lazily walk the tree so files are processed as they are found.
        candidates = (
            os.path.join(root, name)
            for root, _, names in os.walk(input_path)
            for name in names
        )
    else:
        logging.error(f"输入路径不存在或既不是文件也不是目录: {input_path}")
        candidates = []

    for file_path in candidates:
        total_files += 1
        if not file_path.lower().endswith('.pdf'):
            logging.info(f"跳过非PDF文件: {file_path}")
            skipped_count += 1
        elif process_pdf_file(file_path):
            success_count += 1
        else:
            failed_count += 1

    return total_files, success_count, skipped_count, failed_count
def process_pdf_files(input_path, log_dir='logs'):
    """Set up logging, process a PDF file or directory, and log a summary.

    Args:
        input_path: Path of the input file or directory; forwarded to
            ``process_directory``.
        log_dir: Directory for the log file, ``'logs'`` by default;
            forwarded to ``setup_logging``.

    Returns:
        tuple: (total files, successfully processed, skipped, failed), as
        returned by ``process_directory``.
    """
    setup_logging(log_dir)

    logging.info(f"开始处理输入路径: {input_path}")
    total, success, skipped, failed = process_directory(input_path)

    # Emit the statistics one log line at a time, header first.
    summary = (
        "\n处理完成!统计信息:",
        f"总文件数: {total}",
        f"成功处理: {success}",
        f"跳过文件: {skipped}",
        f"处理失败: {failed}",
    )
    for line in summary:
        logging.info(line)

    return total, success, skipped, failed
def main(argv=None):
    """Command-line entry point: process a PDF file or a directory of PDFs.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        tuple: (total files, successfully processed, skipped, failed), as
        returned by ``process_pdf_files``.
    """
    parser = argparse.ArgumentParser(
        description="Extract the text of every PDF page and store it."
    )
    parser.add_argument(
        'input_path',
        nargs='?',
        # Previous hard-coded location, kept so running without arguments
        # still targets the same directory.
        default='C:\\Users\\17664\\Desktop\\test',
        help="PDF file or directory to process",
    )
    parser.add_argument(
        '--log-dir',
        default='logs',
        # Logs are kept out of the input directory; otherwise each run's
        # log file would be walked and counted as a skipped non-PDF file.
        help="directory for log files (default: logs)",
    )
    args = parser.parse_args(argv)

    # process_pdf_files configures logging, runs the job and logs the summary.
    return process_pdf_files(args.input_path, args.log_dir)


if __name__ == "__main__":
    main()
|