123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198 |
- import fitz # PyMuPDF
- from pdfminer.pdfparser import PDFParser
- from pdfminer.pdfdocument import PDFDocument
- from pdfminer.pdfpage import PDFPage
- from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
- from pdfminer.converter import PDFPageAggregator
- from pdfminer.layout import LAParams, LTTextBoxHorizontal, LTLine, LTRect, LTImage
- import chardet
def extract_text_from_pdf(pdf_path):
    """Extract the text of every horizontal text box in a PDF.

    Each page is laid out by pdfminer with default ``LAParams``; the
    whitespace-stripped text of every ``LTTextBoxHorizontal`` found on the
    page is collected in layout iteration order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The collected text-box contents joined with newlines; an empty
        string when no horizontal text box is found.
    """
    resource_manager = PDFResourceManager()
    device = PDFPageAggregator(resource_manager, laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_manager, device)
    extracted_text = []
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            interpreter.process_page(page)
            # The aggregator exposes the layout of the page just processed.
            layout = device.get_result()
            extracted_text.extend(
                element.get_text().strip()
                for element in layout
                if isinstance(element, LTTextBoxHorizontal)
            )
    return '\n'.join(extracted_text)
def detect_graphic_objects(pdf_path):
    """Detect straight lines and rectangles drawn on the pages of a PDF.

    PyMuPDF's ``Page.get_drawings()`` returns one dict per vector path; the
    individual drawing primitives live in that dict's ``"items"`` list as
    tuples whose first element is the operator (``"l"`` for a line,
    ``"re"`` for a rectangle). The path dict itself always carries a
    ``"rect"`` bounding box and never an ``"l"`` key, so the primitives have
    to be read from ``"items"`` rather than from the top-level keys.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        A list of ``(kind, data)`` tuples in page / path / item order.
        For ``'line'`` entries ``data`` is ``(x0, y0, x1, y1)``; for
        ``'rect'`` entries ``data`` is the rectangle object taken from the
        drawing item (indexable as ``x0, y0, x1, y1``).
    """
    graphic_objects = []
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as document:
        for page in document:
            for path in page.get_drawings():
                for item in path.get("items", ()):
                    operator = item[0]
                    if operator == 'l':  # line: ("l", start_point, end_point)
                        start, end = item[1], item[2]
                        graphic_objects.append(
                            ('line', (start.x, start.y, end.x, end.y))
                        )
                    elif operator == 're':  # rectangle: ("re", rect, ...)
                        graphic_objects.append(('rect', item[1]))
    return graphic_objects
def _shares_edge(text_box, rect, tolerance):
    """Return True if any edge of text_box is within tolerance of the same edge of rect."""
    return any(
        abs(getattr(text_box, edge) - getattr(rect, edge)) < tolerance
        for edge in ('y0', 'y1', 'x0', 'x1')
    )


def detect_tables(pdf_path, *, tolerance=10):
    """Find table candidates: rectangles that have text boxes aligned to them.

    For every page, each ``LTRect`` is compared against every
    ``LTTextBoxHorizontal`` on the same page. A text box counts as adjacent
    when at least one of its four edges (``y0``, ``y1``, ``x0``, ``x1``) lies
    strictly within ``tolerance`` of the corresponding edge of the rectangle.
    Only one edge has to match, so this is a loose heuristic: the text box
    need not overlap or touch the rectangle.

    Args:
        pdf_path: Path of the PDF file to inspect.
        tolerance: Maximum edge distance, in layout units, for a text box to
            count as adjacent. Defaults to 10.

    Returns:
        A list of dicts, one per rectangle with at least one adjacent text
        box, with keys ``'bbox'`` (the rectangle's bounding box) and
        ``'adjacent_text_boxes'`` (the matching text box objects).
    """
    resource_manager = PDFResourceManager()
    device = PDFPageAggregator(resource_manager, laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_manager, device)
    tables = []
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            interpreter.process_page(page)
            layout = device.get_result()
            # Split the page layout into rectangles and horizontal text boxes.
            rects = []
            text_boxes = []
            for element in layout:
                if isinstance(element, LTRect):
                    rects.append(element)
                elif isinstance(element, LTTextBoxHorizontal):
                    text_boxes.append(element)
            for rect in rects:
                adjacent_text_boxes = [
                    tb for tb in text_boxes if _shares_edge(tb, rect, tolerance)
                ]
                if adjacent_text_boxes:
                    tables.append({
                        'bbox': rect.bbox,
                        'adjacent_text_boxes': adjacent_text_boxes,
                    })
    return tables
def detect_images(pdf_path):
    """Collect the embedded images of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        A list of dicts, one per image reference on a page, with keys:
        ``'page'`` (1-based page number), ``'index'`` (position of the image
        in the page's image list), ``'bbox'`` (result of
        ``page.get_image_bbox``), ``'format'`` (the ``"ext"`` entry of the
        extracted image) and ``'data'`` (the raw image bytes, or ``b''`` when
        the extracted image has no ``'image'`` entry).
    """
    images = []
    # The context manager closes the document once all pages are processed.
    with fitz.open(pdf_path) as document:
        for page_num, page in enumerate(document, start=1):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first field of an image entry is its xref
                base_image = document.extract_image(xref)
                # Fall back to empty bytes when the 'image' payload is missing.
                image_bytes = base_image.get('image', b'')
                image_ext = base_image["ext"]
                image_bbox = page.get_image_bbox(img)
                images.append({
                    'page': page_num,
                    'index': img_index,
                    'bbox': image_bbox,
                    'format': image_ext,
                    'data': image_bytes,
                })
    return images
def main(path_of_job: str):
    """Extract the text of every PDF under a job's upload directory.

    Walks ``<path_of_job>/upload`` recursively and, for each file whose name
    ends with ``.pdf``, writes the text returned by ``extract_text_from_pdf``
    to ``<path_of_job>/ocr_output/<file name>.txt`` (UTF-8). The output
    directory is created if it does not exist. A failure to write one output
    file is reported on stdout and does not stop the remaining files.

    Args:
        path_of_job: Job directory containing the ``upload`` sub-directory.
    """
    import os

    output_dir = os.path.join(path_of_job, "ocr_output")
    os.makedirs(output_dir, exist_ok=True)
    for root, _dirs, files in os.walk(os.path.join(path_of_job, "upload")):
        for file_name in files:
            if not file_name.endswith(".pdf"):  # only PDF files are processed
                continue
            pdf_file = os.path.join(root, file_name)
            extracted_text = extract_text_from_pdf(pdf_file)
            print("Extracted Text:")
            output_file = os.path.join(output_dir, file_name + ".txt")
            try:
                # Use a distinct name for the handle so the file name stays
                # available for the error message below.
                with open(output_file, "w", encoding="utf-8") as out:
                    out.write(extracted_text)
            except (OSError, UnicodeError) as err:
                print(f"Error writing file {output_file}: {err}")
    # NOTE: detect_graphic_objects, detect_tables and detect_images are not
    # invoked here; only text extraction is performed.
if __name__ == "__main__":
    import sys

    # Exactly one command-line argument is expected: the job directory.
    if len(sys.argv) != 2:
        print("Usage: python script.py <path_of_job>")
        sys.exit(-1)
    path_of_job = sys.argv[1]
    main(path_of_job)
- # 解释
- # 导入必要的模块:
- # fitz(PyMuPDF)用于解析 PDF 的图形对象和图像。
- # PDFMiner.six 的相关模块用于解析 PDF 的文本内容。
- # 定义函数 extract_text_from_pdf:
- # 使用 PDFMiner.six 提取 PDF 文件中的文本内容。
- # 初始化 PDF 资源管理器、设备和解释器。
- # 打开 PDF 文件并获取所有页面。
- # 遍历每个页面,处理页面布局。
- # 对于每个文本框(LTTextBoxHorizontal),提取文本并存储。
- # 定义函数 detect_graphic_objects:
- # 使用 PyMuPDF 检测 PDF 文件中的图形对象(如直线、矩形)。
- # 打开 PDF 文件并加载每一页。
- # 获取每一页的绘图对象。
- # 检查绘图对象中是否有直线或矩形,并将其添加到 graphic_objects 列表中。
- # 定义函数 detect_tables:
- # 使用 PDFMiner.six 检测 PDF 文件中的表格。
- # 初始化 PDF 资源管理器、设备和解释器。
- # 打开 PDF 文件并获取所有页面。
- # 遍历每个页面,处理页面布局。
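# For each rectangle (LTRect), collect the horizontal text boxes that have an edge within 10 units of the matching rectangle edge; rectangles with at least one such text box are stored as table candidates.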
- # 定义函数 detect_images:
- # 使用 PyMuPDF 检测 PDF 文件中的图像。
- # 打开 PDF 文件并加载每一页。
- # 获取每一页的图像列表。
- # 提取每个图像的 XREF、格式、边界框和数据。
- # 将图像信息存储在 images 列表中。
- # 定义主函数 main:
# Calls extract_text_from_pdf for every .pdf file under <path_of_job>/upload and writes the text to <path_of_job>/ocr_output/<file name>.txt.
# detect_graphic_objects, detect_tables and detect_images are defined above but are not called from main.
|