"""Back-fill missing page numbers for PDF-derived rows in ``public.trunks``.

For each row without a ``page_no`` the script locates the source PDF on disk,
searches it for the row's text (first as a whole, then sentence by sentence)
and writes the page it found back to the database.
"""

import logging
import os
import re
import time
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import unquote

import psycopg2

from find_text_in_pdf import PDFTextFinder

# Configure logging once for the whole script.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class TextSplitter:
    """Split free text into sentence-like fragments."""

    # Sentence delimiters: ASCII and CJK punctuation plus newlines.  The
    # full-width / half-width variants are listed explicitly because the
    # content being split is predominantly Chinese.
    _DELIMITERS = re.compile(r"[.!?;\n。!?;。]")

    @staticmethod
    def split_text(text: Optional[str]) -> List[str]:
        """Split *text* into trimmed, non-empty sentences.

        Args:
            text: The text to split. ``None`` or an empty string yields ``[]``.

        Returns:
            The fragments between delimiters, whitespace-stripped, with empty
            fragments removed, in their original order.
        """
        if not text:
            return []
        fragments = (piece.strip() for piece in TextSplitter._DELIMITERS.split(text))
        return [piece for piece in fragments if piece]


class PDFPageUpdater:
    """Find and store page numbers for ``trunks`` rows that lack one."""

    # Sentences shorter than this are too ambiguous to search for.
    MIN_SENTENCE_LENGTH = 10
    # Upper bound on sentence-level searches per record.
    MAX_SENTENCE_TRIES = 7

    def __init__(self, db_params: Dict[str, Any], books_dir: str) -> None:
        """Initialise the updater.

        Args:
            db_params: Keyword arguments passed to ``psycopg2.connect``.
            books_dir: Directory that contains the PDF files.
        """
        self.db_params = db_params
        self.books_dir = books_dir
        # Maps "<pdf file name>:<title>" to the page found for that pair.
        self._page_cache = {}

    def connect_db(self):
        """Open and return a new database connection."""
        return psycopg2.connect(**self.db_params)

    def get_records_without_page(self) -> List[Tuple[Any, ...]]:
        """Fetch the rows that still need a page number.

        Returns:
            A list of ``(id, content, referrence, title)`` tuples.
        """
        # No parameters are passed to execute(), so the literal '%' wildcards
        # below are sent to the server unchanged.
        query = """
            SELECT id, content, referrence, title
            FROM public.trunks
            WHERE page_no IS NULL
              AND referrence LIKE '%/books/%'
              AND referrence LIKE '%.pdf'
              AND type ='trunk'
              AND file_path LIKE '%《内科学 第10版》%'
        """
        conn = self.connect_db()
        try:
            # `with conn` only ends the transaction; it does not close the
            # connection, hence the explicit close() below.
            with conn:
                with conn.cursor() as cur:
                    cur.execute(query)
                    return cur.fetchall()
        finally:
            conn.close()

    def update_page_number(self, record_id, page_no) -> None:
        """Store *page_no* on the row identified by *record_id*.

        Args:
            record_id: Primary key of the row to update.
            page_no: Page number to store.
        """
        query = """
            UPDATE public.trunks
            SET page_no = %s
            WHERE id = %s
        """
        conn = self.connect_db()
        try:
            # Leaving the `with conn` block commits on success and rolls back
            # on error; the connection itself is closed in `finally`.
            with conn:
                with conn.cursor() as cur:
                    cur.execute(query, (page_no, record_id))
        finally:
            conn.close()

    @staticmethod
    def _search_pdf(pdf_path: str, text: str):
        """Return the pages on which *text* (spaces removed) is found."""
        # NOTE(review): a new PDFTextFinder is built for every search, as in
        # the original code; reuse it only if the finder is known to be
        # safe to call repeatedly.
        return PDFTextFinder(pdf_path).find_text(text.replace(' ', ''))

    def find_page_number(self, pdf_path, content, title):
        """Locate the page of *content* inside the PDF at *pdf_path*.

        The whole content is searched first. If that fails, individual
        sentences are searched and a page is accepted once it has been hit
        twice, or once a hit exists on a directly neighbouring page.

        Args:
            pdf_path: Path of the PDF file.
            content: Text to look for.
            title: Document title; together with the file name it forms the
                cache key.

        Returns:
            The page number, or ``None`` when no page could be determined.
        """
        file_name = os.path.basename(pdf_path)
        cache_key = f"{file_name}:{title}"

        if cache_key in self._page_cache:
            return self._page_cache[cache_key]

        # Nothing to search for; avoid querying the PDF with an empty string.
        if not content:
            return None

        # First attempt: match the complete content.
        start_time = time.perf_counter()
        pages = self._search_pdf(pdf_path, content)
        search_time = time.perf_counter() - start_time
        logger.info(f"PDF页码搜索耗时: {search_time * 1000:.2f}ms (文件: {file_name}, 内容: {content[:50]}...)")

        if pages:
            page_no = pages[0]
            self._page_cache[cache_key] = page_no
            return page_no

        # Fallback: search sentence by sentence and vote on the page.
        tried_sentences = 0
        page_counts = {}
        for sentence in TextSplitter.split_text(content):
            if len(sentence) < self.MIN_SENTENCE_LENGTH:
                continue
            if tried_sentences >= self.MAX_SENTENCE_TRIES:
                break

            pages = self._search_pdf(pdf_path, sentence)
            if pages:
                page_no = pages[0]
                page_counts[page_no] = page_counts.get(page_no, 0) + 1

                # Accept a page hit at least twice, or one with a hit on an
                # adjacent page.
                for p in page_counts:
                    has_neighbour = (
                        page_counts.get(p - 1, 0) + page_counts.get(p + 1, 0) > 0
                    )
                    if page_counts[p] >= 2 or has_neighbour:
                        self._page_cache[cache_key] = p
                        return p

            tried_sentences += 1

        return None

    def process_records(self) -> None:
        """Look up and store the page number of every pending record.

        Failures on a single record are logged and do not stop the run.
        """
        records = self.get_records_without_page()
        total = len(records)
        logger.info(f"找到 {total} 条需要更新页码的记录")

        for i, (record_id, content, referrence, title) in enumerate(records, 1):
            try:
                # Build the PDF path from the part of the reference after
                # "/books/", URL-decoded.
                file_name = unquote(referrence.split("/books/")[-1])
                pdf_path = os.path.join(self.books_dir, file_name)

                if not os.path.exists(pdf_path):
                    logger.warning(f"PDF文件不存在: {pdf_path}")
                    continue

                page_no = self.find_page_number(pdf_path, content, title)

                # Compare against None so that a page number of 0 is stored.
                if page_no is not None:
                    self.update_page_number(record_id, page_no)
                    logger.info(f"进度: [{i}/{total}] 已更新记录 {record_id} 的页码为 {page_no}")
                else:
                    logger.warning(f"进度: [{i}/{total}] 未找到记录 {record_id} 的页码")

            except Exception as e:
                # Per-record boundary: log with traceback and carry on.
                logger.exception(f"处理记录 {record_id} 时出错: {str(e)}")
                continue


def main():
    """Run the page-number update with the configured database and PDF folder."""
    # SECURITY: credentials should not live in source control.  Each value can
    # be overridden through the environment; the literals are only fallbacks
    # kept for backward compatibility.
    db_params = {
        'dbname': os.environ.get('MEDKG_DB_NAME', 'medkg'),
        'user': os.environ.get('MEDKG_DB_USER', 'knowledge'),
        'password': os.environ.get('MEDKG_DB_PASSWORD', 'qwer1234.'),
        'host': os.environ.get('MEDKG_DB_HOST', '173.18.12.203'),
        'port': os.environ.get('MEDKG_DB_PORT', '5432'),
    }

    # Directory that holds the PDF files.
    books_dir = os.environ.get('MEDKG_BOOKS_DIR', 'C:\\Users\\17664\\Desktop')

    try:
        updater = PDFPageUpdater(db_params, books_dir)
        updater.process_records()
        logger.info("页码更新完成")
    except Exception as e:
        logger.exception(f"程序执行出错: {str(e)}")


if __name__ == "__main__":
    main()