update_pdf_page_numbers.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187
  1. import os
  2. import time
  3. import logging
  4. import psycopg2
  5. from urllib.parse import unquote
  6. from find_text_in_pdf import PDFTextFinder
  7. # 配置日志
  8. logging.basicConfig(
  9. level=logging.INFO,
  10. format='%(asctime)s - %(levelname)s - %(message)s'
  11. )
  12. logger = logging.getLogger(__name__)
  13. class TextSplitter:
  14. @staticmethod
  15. def split_text(text):
  16. """将文本分割成句子"""
  17. # 使用常见的标点符号作为分隔符
  18. delimiters = ['.', '!', '?', '。', '!', '?', '\n', ';', '。', ';']
  19. sentences = [text]
  20. for delimiter in delimiters:
  21. new_sentences = []
  22. for sentence in sentences:
  23. new_sentences.extend(sentence.split(delimiter))
  24. sentences = [s.strip() for s in new_sentences if s.strip()]
  25. return sentences
  26. class PDFPageUpdater:
  27. def __init__(self, db_params, books_dir):
  28. """初始化更新器
  29. Args:
  30. db_params: 数据库连接参数字典
  31. books_dir: PDF文件所在目录
  32. """
  33. self.db_params = db_params
  34. self.books_dir = books_dir
  35. self._page_cache = {}
  36. def connect_db(self):
  37. """连接到数据库"""
  38. return psycopg2.connect(**self.db_params)
  39. def get_records_without_page(self):
  40. """获取没有页码的记录"""
  41. query = """
  42. SELECT id, content, referrence, title
  43. FROM public.trunks
  44. WHERE page_no IS NULL
  45. AND referrence LIKE '%/books/%'
  46. AND referrence LIKE '%.pdf'
  47. AND type ='trunk'
  48. AND file_path LIKE '%《内科学 第10版》%'
  49. """
  50. with self.connect_db() as conn:
  51. with conn.cursor() as cur:
  52. cur.execute(query)
  53. return cur.fetchall()
  54. def update_page_number(self, record_id, page_no):
  55. """更新记录的页码
  56. Args:
  57. record_id: 记录ID
  58. page_no: 页码
  59. """
  60. query = """
  61. UPDATE public.trunks
  62. SET page_no = %s
  63. WHERE id = %s
  64. """
  65. with self.connect_db() as conn:
  66. with conn.cursor() as cur:
  67. cur.execute(query, (page_no, record_id))
  68. conn.commit()
  69. def find_page_number(self, pdf_path, content, title):
  70. """在PDF中查找内容对应的页码
  71. Args:
  72. pdf_path: PDF文件路径
  73. content: 要查找的内容
  74. title: 文档标题(用于缓存)
  75. Returns:
  76. int or None: 找到的页码,未找到则返回None
  77. """
  78. # 使用文件名和标题作为缓存键
  79. file_name = os.path.basename(pdf_path)
  80. cache_key = f"{file_name}:{title}"
  81. # 检查缓存
  82. if cache_key in self._page_cache:
  83. return self._page_cache[cache_key]
  84. # 首先尝试完整内容匹配
  85. start_time = time.time()
  86. pages = PDFTextFinder(pdf_path).find_text(content.replace(' ', ''))
  87. search_time = time.time() - start_time
  88. logger.info(f"PDF页码搜索耗时: {search_time * 1000:.2f}ms (文件: {file_name}, 内容: {content[:50]}...)")
  89. if pages and len(pages) > 0:
  90. page_no = pages[0]
  91. self._page_cache[cache_key] = page_no
  92. return page_no
  93. # 如果完整匹配失败,尝试按句子搜索
  94. sentences = TextSplitter.split_text(content)
  95. tried_sentences = 0
  96. page_counts = {}
  97. for sentence in sentences:
  98. if len(sentence) < 10: # 跳过过短的句子
  99. continue
  100. if tried_sentences >= 7: # 最多尝试3句
  101. break
  102. pages = PDFTextFinder(pdf_path).find_text(sentence.replace(' ', ''))
  103. if pages and len(pages) > 0:
  104. page_no = pages[0]
  105. page_counts[page_no] = page_counts.get(page_no, 0) + 1
  106. # 如果某个页码出现两次或相邻页码各出现一次,使用该页码
  107. for p in page_counts:
  108. if page_counts[p] >= 2 or (page_counts.get(p-1, 0) + page_counts.get(p+1, 0) > 0):
  109. self._page_cache[cache_key] = p
  110. return p
  111. tried_sentences += 1
  112. return None
  113. def process_records(self):
  114. """处理所有没有页码的记录"""
  115. records = self.get_records_without_page()
  116. total = len(records)
  117. logger.info(f"找到 {total} 条需要更新页码的记录")
  118. for i, (record_id, content, referrence, title) in enumerate(records, 1):
  119. try:
  120. # 从referrence构建PDF路径
  121. file_name = unquote(referrence.split("/books/")[-1])
  122. pdf_path = os.path.join(self.books_dir, file_name)
  123. if not os.path.exists(pdf_path):
  124. logger.warning(f"PDF文件不存在: {pdf_path}")
  125. continue
  126. # 查找页码
  127. page_no = self.find_page_number(pdf_path, content, title)
  128. if page_no:
  129. # 更新数据库
  130. self.update_page_number(record_id, page_no)
  131. logger.info(f"进度: [{i}/{total}] 已更新记录 {record_id} 的页码为 {page_no}")
  132. else:
  133. logger.warning(f"进度: [{i}/{total}] 未找到记录 {record_id} 的页码")
  134. except Exception as e:
  135. logger.error(f"处理记录 {record_id} 时出错: {str(e)}")
  136. continue
  137. def main():
  138. # 数据库连接参数
  139. db_params = {
  140. 'dbname': 'medkg',
  141. 'user': 'knowledge',
  142. 'password': 'qwer1234.', # 请替换为实际密码
  143. 'host': '173.18.12.203',
  144. 'port': '5432'
  145. }
  146. # PDF文件目录路径
  147. books_dir = 'C:\\Users\\17664\\Desktop'
  148. try:
  149. updater = PDFPageUpdater(db_params, books_dir)
  150. updater.process_records()
  151. logger.info("页码更新完成")
  152. except Exception as e:
  153. logger.error(f"程序执行出错: {str(e)}")
  154. if __name__ == "__main__":
  155. main()