123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187 |
import logging
import os
import time
from contextlib import closing
from urllib.parse import unquote

import psycopg2

from find_text_in_pdf import PDFTextFinder
# Configure logging once at import time: INFO level, with a timestamp and
# severity prefix on every record. All classes below log through `logger`.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class TextSplitter:
    """Split free text into sentence-like fragments on common punctuation."""

    # Sentence terminators, each listed exactly once: ASCII '.', '!', '?',
    # ';', the newline, the CJK full stop (U+3002) and the full-width
    # '!', '?', ';' (U+FF01, U+FF1F, U+FF1B). The full-width forms are written
    # as escapes so they cannot be silently folded to their ASCII look-alikes.
    DELIMITERS = '.!?;\n\u3002\uff01\uff1f\uff1b'

    # Translation table that rewrites every delimiter to '\n', so the text
    # is scanned once instead of once per delimiter.
    _BREAK_TABLE = str.maketrans(DELIMITERS, '\n' * len(DELIMITERS))

    @staticmethod
    def split_text(text):
        """Split ``text`` into trimmed, non-empty fragments.

        The text is cut at every character in ``DELIMITERS``; the delimiters
        themselves are dropped. Each fragment is stripped of surrounding
        whitespace, and fragments that end up empty are discarded. Fragment
        order follows the order in the input.

        Args:
            text: The string to split. ``None`` or an empty string is
                accepted and yields an empty list.

        Returns:
            list[str]: The fragments, possibly empty.
        """
        if not text:
            return []

        raw_pieces = text.translate(TextSplitter._BREAK_TABLE).split('\n')
        trimmed = (piece.strip() for piece in raw_pieces)
        return [piece for piece in trimmed if piece]
class PDFPageUpdater:
    """Fill in missing ``page_no`` values for rows of ``public.trunks``.

    For each selected row the row's text is searched for in the PDF named by
    its ``referrence`` column, and the page that was found is written back to
    the row.
    """

    # Sentences shorter than this many characters are skipped during the
    # sentence-level fallback search.
    MIN_SENTENCE_LENGTH = 10

    # Maximum number of sentences searched per record in the fallback.
    MAX_SENTENCES_TRIED = 7

    def __init__(self, db_params, books_dir):
        """Create an updater.

        Args:
            db_params: Dict of keyword arguments passed to
                ``psycopg2.connect``.
            books_dir: Directory that contains the PDF files.
        """
        self.db_params = db_params
        self.books_dir = books_dir
        # Maps "<pdf file name>:<title>" to the page already resolved for it.
        self._page_cache = {}

    def connect_db(self):
        """Open and return a new database connection (caller must close it)."""
        return psycopg2.connect(**self.db_params)

    def get_records_without_page(self):
        """Fetch the rows that still need a page number.

        Returns:
            list[tuple]: ``(id, content, referrence, title)`` tuples for rows
            with a NULL ``page_no`` that match the filters in the query.
        """
        query = """
            SELECT id, content, referrence, title
            FROM public.trunks
            WHERE page_no IS NULL
            AND referrence LIKE '%/books/%'
            AND referrence LIKE '%.pdf'
            AND type ='trunk'
            AND file_path LIKE '%《内科学 第10版》%'
        """

        # `with conn` only ends the transaction; `closing` is what actually
        # releases the connection.
        with closing(self.connect_db()) as conn:
            with conn, conn.cursor() as cur:
                # No parameters are passed, so the '%' characters in the
                # LIKE patterns are sent to the server unchanged.
                cur.execute(query)
                return cur.fetchall()

    def update_page_number(self, record_id, page_no):
        """Store ``page_no`` on the row identified by ``record_id``.

        Args:
            record_id: Primary key of the row to update.
            page_no: Page number to write.
        """
        query = """
            UPDATE public.trunks
            SET page_no = %s
            WHERE id = %s
        """

        with closing(self.connect_db()) as conn:
            # Leaving the `with conn` block commits on success and rolls
            # back if the statement raised.
            with conn, conn.cursor() as cur:
                cur.execute(query, (page_no, record_id))

    @staticmethod
    def _confirmed_page(page_counts):
        """Return the first page that is corroborated, or ``None``.

        A page is corroborated when it was hit at least twice, or when an
        adjacent page (one before or one after) was hit at least once.
        Pages are examined in the order they were first recorded.
        """
        for page, hits in page_counts.items():
            neighbour_hits = page_counts.get(page - 1, 0) + page_counts.get(page + 1, 0)
            if hits >= 2 or neighbour_hits > 0:
                return page
        return None

    def find_page_number(self, pdf_path, content, title):
        """Locate the page of ``content`` inside the PDF at ``pdf_path``.

        The whole content (with spaces removed) is searched first. If that
        finds nothing, the content is split into sentences and up to
        ``MAX_SENTENCES_TRIED`` sentences of at least ``MIN_SENTENCE_LENGTH``
        characters are searched one by one; a page is accepted only once it
        is corroborated (see ``_confirmed_page``). Successful results are
        cached per file name and title, so later calls with the same pair
        return the cached page without searching.

        Args:
            pdf_path: Path of the PDF file.
            content: Text to look for.
            title: Document title; combined with the file name to form the
                cache key.

        Returns:
            The page found (the first element of the search result), or
            ``None`` when nothing was found or ``content`` is empty.
        """
        file_name = os.path.basename(pdf_path)
        cache_key = f"{file_name}:{title}"

        if cache_key in self._page_cache:
            return self._page_cache[cache_key]

        # Nothing to search for; avoid handing an empty needle to the finder.
        if not content:
            return None

        # First attempt: match the complete content.
        started = time.perf_counter()
        pages = PDFTextFinder(pdf_path).find_text(content.replace(' ', ''))
        search_time = time.perf_counter() - started
        logger.info(f"PDF页码搜索耗时: {search_time * 1000:.2f}ms (文件: {file_name}, 内容: {content[:50]}...)")

        if pages:
            page_no = pages[0]
            self._page_cache[cache_key] = page_no
            return page_no

        # Fallback: search sentence by sentence and vote on the page.
        page_counts = {}
        tried_sentences = 0

        for sentence in TextSplitter.split_text(content):
            if len(sentence) < self.MIN_SENTENCE_LENGTH:
                continue
            if tried_sentences >= self.MAX_SENTENCES_TRIED:
                break

            # NOTE(review): a new finder is built per sentence, as before;
            # reuse one instance only if PDFTextFinder is known to allow it.
            pages = PDFTextFinder(pdf_path).find_text(sentence.replace(' ', ''))
            if pages:
                first_hit = pages[0]
                page_counts[first_hit] = page_counts.get(first_hit, 0) + 1

                confirmed = self._confirmed_page(page_counts)
                if confirmed is not None:
                    self._page_cache[cache_key] = confirmed
                    return confirmed

            tried_sentences += 1

        return None

    def process_records(self):
        """Resolve and store the page number for every pending record.

        Records whose PDF file is missing, or whose page cannot be found,
        are logged and skipped. An exception raised while handling one
        record is logged and does not stop the remaining records.
        """
        records = self.get_records_without_page()
        total = len(records)
        logger.info(f"找到 {total} 条需要更新页码的记录")

        for i, (record_id, content, referrence, title) in enumerate(records, 1):
            try:
                # The part of `referrence` after "/books/" is the
                # URL-encoded file name relative to `books_dir`.
                file_name = unquote(referrence.split("/books/")[-1])
                pdf_path = os.path.join(self.books_dir, file_name)

                if not os.path.exists(pdf_path):
                    logger.warning(f"PDF文件不存在: {pdf_path}")
                    continue

                page_no = self.find_page_number(pdf_path, content, title)

                # Compare against None so that a page number of 0 is still
                # treated as a valid result.
                if page_no is None:
                    logger.warning(f"进度: [{i}/{total}] 未找到记录 {record_id} 的页码")
                    continue

                self.update_page_number(record_id, page_no)
                logger.info(f"进度: [{i}/{total}] 已更新记录 {record_id} 的页码为 {page_no}")

            except Exception as e:
                # Best effort: one bad record must not abort the batch.
                logger.error(f"处理记录 {record_id} 时出错: {str(e)}")
def main():
    """Run the page-number update once, using environment-based settings.

    Connection settings and the PDF directory are read from the
    ``MEDKG_DB_NAME``, ``MEDKG_DB_USER``, ``MEDKG_DB_PASSWORD``,
    ``MEDKG_DB_HOST``, ``MEDKG_DB_PORT`` and ``MEDKG_BOOKS_DIR`` environment
    variables. Any variable that is unset falls back to the value that was
    previously hard-coded here, so running without them behaves as before.

    Any exception from the update is logged with its traceback and not
    re-raised.
    """
    env = os.environ

    # NOTE(review): the fallback password is kept only for backward
    # compatibility; credentials should not live in source control. Set
    # MEDKG_DB_PASSWORD and remove the default once deployments provide it.
    db_params = {
        'dbname': env.get('MEDKG_DB_NAME', 'medkg'),
        'user': env.get('MEDKG_DB_USER', 'knowledge'),
        'password': env.get('MEDKG_DB_PASSWORD', 'qwer1234.'),
        'host': env.get('MEDKG_DB_HOST', '173.18.12.203'),
        'port': env.get('MEDKG_DB_PORT', '5432'),
    }

    # Directory that holds the PDF files referenced by the records.
    books_dir = env.get('MEDKG_BOOKS_DIR', 'C:\\Users\\17664\\Desktop')

    try:
        updater = PDFPageUpdater(db_params, books_dir)
        updater.process_records()
    except Exception as e:
        # Top-level boundary: keep the original message, add the traceback.
        logger.exception(f"程序执行出错: {str(e)}")
    else:
        logger.info("页码更新完成")
# Script entry point: run the update only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|