"""Extract text from PDF files page by page and persist each page via TrunksService."""

import argparse
import logging
import os
from datetime import datetime

import fitz  # PyMuPDF

from service.trunks_service import TrunksService

# Abort a file after this many consecutive blank pages (likely a corrupt scan).
MAX_CONSECUTIVE_EMPTY_PAGES = 20


class CorruptPdfError(Exception):
    """Raised when a PDF looks corrupt (too many consecutive empty pages)."""


def setup_logging(log_dir):
    """Configure root logging to a timestamped file plus the console.

    Args:
        log_dir: Directory for the log file; created if it does not exist.
    """
    os.makedirs(log_dir, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_file = os.path.join(log_dir, f'pdf_process_{timestamp}.log')
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file, encoding='utf-8'),
            logging.StreamHandler()
        ]
    )


def process_pdf_file(pdf_path):
    """Extract every page of one PDF and save each page through TrunksService.

    The file is aborted (and reported as failed) if
    MAX_CONSECUTIVE_EMPTY_PAGES pages in a row contain no text, which
    usually indicates a corrupt or image-only document.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        bool: True if the whole file was processed, False on failure.
    """
    try:
        if not os.path.exists(pdf_path):
            logging.error(f"错误: 文件 {pdf_path} 不存在!")
            return False

        # File name without extension, used as the trunk title.
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

        doc = fitz.open(pdf_path)
        try:
            total_pages = len(doc)
            logging.info(f"开始处理PDF文件: {pdf_path} (总页数: {total_pages})")

            # One service instance for all pages (was re-created per page).
            trunk_service = TrunksService()
            empty_page_count = 0  # consecutive empty pages seen so far

            for page_num in range(total_pages):
                try:
                    page = doc.load_page(page_num)
                    text = page.get_text("text")

                    if not text.strip():
                        empty_page_count += 1
                        if empty_page_count >= MAX_CONSECUTIVE_EMPTY_PAGES:
                            raise CorruptPdfError(
                                f"连续20页内容为空,文件可能已损坏: {pdf_path}")
                        continue
                    empty_page_count = 0  # reset on any non-empty page

                    trunk_service.create_trunk({
                        'content': text,
                        'file_path': pdf_path,
                        'type': 'page',
                        'title': pdf_name,
                        'page_no': page_num + 1
                    })
                    logging.info(f"已处理 {pdf_path} 的第 {page_num+1} 页,并保存到数据库")
                except CorruptPdfError:
                    # Bug fix: the abort used to be swallowed by the generic
                    # per-page handler below, so processing never stopped.
                    # Re-raise so the whole file is reported as failed.
                    raise
                except Exception as e:
                    # A single bad page should not abort the rest of the file.
                    logging.error(f"处理 {pdf_path} 的第 {page_num+1} 页时出错: {str(e)}")
        finally:
            # Always release the document, including on the abort path
            # (previously leaked when an exception escaped the loop).
            doc.close()

        logging.info(f"完成处理PDF文件: {pdf_path}")
        return True

    except Exception as e:
        logging.error(f"处理PDF文件时出错 {pdf_path}: {str(e)}")
        return False


def process_directory(input_path):
    """Recursively process all PDF files under a path.

    Args:
        input_path: A single file or a directory to walk.

    Returns:
        tuple: (total files, processed, skipped, failed). A path that is
        neither an existing file nor a directory yields (0, 0, 0, 0).
    """
    total_files = 0
    success_count = 0
    skipped_count = 0
    failed_count = 0

    if os.path.isfile(input_path):
        total_files = 1
        if input_path.lower().endswith('.pdf'):
            if process_pdf_file(input_path):
                success_count = 1
            else:
                failed_count = 1
        else:
            logging.info(f"跳过非PDF文件: {input_path}")
            skipped_count = 1
    elif os.path.isdir(input_path):
        for root, _, files in os.walk(input_path):
            for file in files:
                total_files += 1
                file_path = os.path.join(root, file)
                if file.lower().endswith('.pdf'):
                    if process_pdf_file(file_path):
                        success_count += 1
                    else:
                        failed_count += 1
                else:
                    logging.info(f"跳过非PDF文件: {file_path}")
                    skipped_count += 1

    return total_files, success_count, skipped_count, failed_count


def process_pdf_files(input_path, log_dir='logs'):
    """Set up logging, process a file or directory of PDFs, and log a summary.

    Args:
        input_path: Input file or directory path.
        log_dir: Directory for log files; defaults to 'logs'.

    Returns:
        tuple: (total files, processed, skipped, failed).
    """
    setup_logging(log_dir)

    logging.info(f"开始处理输入路径: {input_path}")
    total, success, skipped, failed = process_directory(input_path)

    logging.info(f"\n处理完成!统计信息:")
    logging.info(f"总文件数: {total}")
    logging.info(f"成功处理: {success}")
    logging.info(f"跳过文件: {skipped}")
    logging.info(f"处理失败: {failed}")

    return total, success, skipped, failed


if __name__ == "__main__":
    # The old __main__ duplicated process_pdf_files line-for-line with a
    # hard-coded path, and argparse was imported but never used. The original
    # path is kept as the default so running with no arguments behaves the same.
    parser = argparse.ArgumentParser(description='按页提取PDF文本并保存到数据库')
    parser.add_argument('input_path', nargs='?',
                        default='C:\\Users\\17664\\Desktop\\test',
                        help='PDF文件或目录路径')
    args = parser.parse_args()
    # The original script wrote its logs into the input directory itself.
    process_pdf_files(args.input_path, log_dir=args.input_path)