从扫描的 PDF 中提取设计图案

标签：python opencv deep-learning object-detection layout-parser

我有一份大约 650 页的扫描版 PDF。每页都包含一些图案（花朵、几何图案等）。我的目标是从 PDF 中提取这些图案。供您参考，这是 PDF 中的一页。

我当前的解决方案是使用 OpenCV 检测轮廓并提取图案。但是，该解决方案并不能处理所有情况。该 PDF 每页包含的图像数量不同，布局也各不相同。每个图案下方都带有文字说明。我的代码如下所示。

import cv2
import numpy as np
from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError
import fitz  # PyMuPDF
import os

def _page_to_bgr(page):
    """Convert one rendered page (PIL image or PyMuPDF pixmap) to a BGR array."""
    if isinstance(page, fitz.Pixmap):
        # NOTE(review): assumes the pixmap is 3-channel RGB, which is what a
        # default get_pixmap() call is expected to return -- confirm if the
        # rendering options ever change (e.g. alpha=True).
        rgb = np.frombuffer(page.samples, dtype=np.uint8).reshape(
            page.height, page.width, 3
        )
    else:
        rgb = np.array(page)
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)


def _read_caption(ocr, img, x, y, w, h, max_height=30):
    """OCR the strip right below a bounding box, falling back to the strip above.

    Empty strips (box touching the top or bottom page edge) are skipped
    instead of being handed to the OCR engine.
    """
    # NumPy slicing clips at the image border, so no explicit min() is needed.
    below = img[y + h:y + h + max_height, x:x + w]
    above = img[max(0, y - max_height):y, x:x + w]
    for strip in (below, above):
        if strip.size == 0:
            continue
        text = ocr.image_to_string(strip, lang='chi_sim+eng').strip()
        if text:
            return text
    return ""


def extract_patterns(pdf_path, use_ocr=False, start_page=17):
    """Extract candidate pattern images from a scanned PDF.

    Each page is binarised with a fixed threshold, dilated/eroded, and its
    external contours are taken as candidates. A candidate is kept when its
    bounding box is between 5% and 50% of the page in both width and height.

    Args:
        pdf_path: Path of the PDF file to process.
        use_ocr: When true, a caption is read with pytesseract from the strip
            below (or, failing that, above) each candidate, and candidates
            without any recognised caption are dropped. Falls back to
            ``False`` if pytesseract cannot be imported.
        start_page: 1-based number of the first page to process. Defaults to
            17, which matches the previously hard-coded behaviour.

    Returns:
        A list of dicts with the keys ``'image'`` (BGR crop), ``'text'``
        (caption or a generated ``Pattern_<page>_<n>`` name), ``'page'``,
        ``'position'`` (``(x, y, w, h)``) and ``'pattern_number'``.

    Raises:
        ValueError: If ``start_page`` is smaller than 1.
    """
    if start_page < 1:
        raise ValueError("start_page must be >= 1")

    if use_ocr:
        try:
            import pytesseract
        except ImportError:
            print("Pytesseract is not working")
            use_ocr = False

    try:
        # Only render the pages that will actually be processed.
        pages = convert_from_path(pdf_path, first_page=start_page)
    except PDFPageCountError:
        print("Error.")
        # The context manager closes the document once the pixmaps exist.
        with fitz.open(pdf_path) as doc:
            pages = [
                page.get_pixmap()
                for index, page in enumerate(doc)
                if index >= start_page - 1
            ]

    kernel = np.ones((5, 5), np.uint8)
    patterns = []

    for page_num, page in enumerate(pages, start=start_page):
        img = _page_to_bgr(page)

        # Size limits depend only on the page, so compute them once per page.
        page_height, page_width = img.shape[:2]
        min_width, max_width = page_width * 0.05, page_width * 0.5
        min_height, max_height = page_height * 0.05, page_height * 0.5

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)

        # Dilate more than erode so nearby strokes of one pattern merge.
        binary = cv2.dilate(binary, kernel, iterations=2)
        binary = cv2.erode(binary, kernel, iterations=1)

        contours, _ = cv2.findContours(
            binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
        # Largest first: pattern numbers on a page follow contour area.
        contours = sorted(contours, key=cv2.contourArea, reverse=True)

        pattern_count = 0
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)

            if not (min_width <= w <= max_width
                    and min_height <= h <= max_height):
                continue

            text = ""
            if use_ocr:
                try:
                    text = _read_caption(pytesseract, img, x, y, w, h)
                except Exception as e:
                    # Best effort: an OCR failure must not abort the whole run.
                    print(f"OCR error on page {page_num}, pattern {pattern_count + 1}: {e}")
                if not text:
                    continue

            pattern_count += 1
            patterns.append({
                # Copy the crop so the full page array can be garbage-collected.
                'image': img[y:y + h, x:x + w].copy(),
                'text': text or f"Pattern_{page_num}_{pattern_count}",
                'page': page_num,
                'position': (x, y, w, h),
                'pattern_number': pattern_count,
            })

    return patterns

对于上图，此代码无法正确提取从左数第 4 个图案。

好的，我可以帮助你改进图案提取代码。你当前使用 OpenCV 的方法是一个不错的开始，但正如你所提到的，在处理不同布局以及将文字与图案分离时，它可能会遇到问题。

以下是我们可以改进你的方法的一些建议：

改进预处理： 目前的预处理步骤（灰度、阈值处理、膨胀、腐蚀）是一个很好的开始，但我们可以通过添加更多步骤来提高其稳健性：
- 调整大小/重新缩放： 在处理之前重新缩放图像可以提高轮廓检测的准确性，尤其是在处理小型复杂图案时。
- 去噪： 使用高斯模糊或中值滤波去除扫描文档中可能存在的噪声。
- 自适应阈值处理： 由于照明条件的变化，考虑使用自适应阈值处理而不是全局阈值处理。像 cv2.adaptiveThreshold 这样的函数可以帮助你实现这一点。
更强大的轮廓分析：
- 轮廓层次结构： 利用 OpenCV 的轮廓层次结构来区分图案和文本。图案很可能位于文本轮廓的父轮廓内。
- 纵横比和面积过滤： 使用更严格的纵横比和面积阈值过滤误报的轮廓，如那些可能由文本产生的轮廓。实验确定适合你的特定数据集的最佳阈值。
文本检测和移除：
- 专用文本检测： 对于更强大的文本检测，请考虑使用专用的 OCR 引擎（如 Tesseract）或文本检测模型（如 EAST）。
- 基于连接组件的分析： 分析连接组件并移除文本区域，这将有助于更清晰地分离图案。
布局分析：
- 行/列检测： 对于不同布局，请使用霍夫线变换或其他线检测技术检测页面上的行或列。这可以帮助你将页面分割成不同的区域，从而更轻松地提取图案。

改进代码示例：

import cv2
import numpy as np
from pdf2image import convert_from_path
# ...其他导入...

def extract_patterns(pdf_path, use_ocr=False):
    """Sketch of a revised pattern-extraction loop.

    This is an outline, not a complete function: steps marked with
    ``# ...`` comments are elided and have to be filled in from the
    original implementation. As written, ``pages`` and ``img`` are not
    defined inside this body.

    Returns:
        The ``patterns`` list built inside the loop.
    """
    # ... (OCR availability check: same as before) ...
    # NOTE(review): `pages` is used below but never assigned in this sketch;
    # presumably the PDF loading step of the original is meant to be kept too.

    patterns = []

    for page_num, page in enumerate(pages[16:], start=17):
        # ... (image loading: same as before) ...

        # Preprocessing
        # Upscale by 1.5x; every coordinate computed below refers to this
        # resized image, not to the originally rendered page.
        img = cv2.resize(img, None, fx=1.5, fy=1.5, interpolation=cv2.INTER_CUBIC) # resize the image
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (5, 5), 0) # apply Gaussian blur
        # THRESH_OTSU selects one global threshold automatically; this is
        # not the per-region cv2.adaptiveThreshold.
        _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU) # Otsu thresholding (inverted)

        # ... (dilation and erosion: same as before) ...

        contours, hierarchy = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) # retrieve the contour hierarchy

        pattern_count = 0
        for i, contour in enumerate(contours):
            x, y, w, h = cv2.boundingRect(contour)

            # ... (aspect-ratio and area filtering: same as before) ...

            # Keep only top-level contours (parent index == -1) whose bounding
            # box covers less than 80% of the page area.
            if hierarchy[0][i][3] == -1 and (w * h) < (img.shape[0] * img.shape[1] * 0.8): 
                pattern_region = img[y:y + h, x:x + w]

                # ... (caption extraction with OCR: same as before) ...

                pattern_count += 1
                patterns.append({
                    # ... (store the extracted information for this pattern) ...
                })

    return patterns

请记住，这些只是建议，你可能需要根据你的特定 PDF 文档调整代码。实验不同的预处理技术、轮廓分析方法和布局分析技术，以找到最适合你的需求的组合。

标签：python,opencv,deep-learning,object-detection,layout-parser
From： 78818607

从扫描的 PDF 中提取设计图案

相关文章

赞助商

阅读排行