From 414c39436ba6be3674396f10f9ccc6db80b9f0c6 Mon Sep 17 00:00:00 2001
From: zxstty
Date: Tue, 1 Jul 2025 10:55:39 +0800
Subject: [PATCH 1/2] =?UTF-8?q?=E5=AE=8C=E5=96=84=E5=AF=B9avx512=E7=9A=84?=
 =?UTF-8?q?=E9=80=82=E9=85=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (this section is not part of the commit message):
  * Line structure and indentation were reconstructed from a
    whitespace-collapsed copy of this series. Hunk line counts were checked
    against every @@ header and the diffstat, but the indentation of context
    and removed lines is a best effort -- TODO confirm against the tree; if a
    hunk is rejected, retry with `git apply --ignore-whitespace`.
  * instruct_scan_tool.py: the bare `except:` is narrowed to
    `except Exception:`, the local that shadowed the imported `cpuinfo`
    module is renamed, the unused `as e` binding is dropped, whitespace-only
    lines are emptied and comments are in English. Line counts are unchanged;
    the blob hashes on the `index` lines of instruct_scan_tool.py and
    ocr_tool.py no longer match the edited content.
  * check_avx512_support() can return the truthy string "Maybe"; OcrTool
    loads PaddleOCR in that case exactly as before (behaviour unchanged).
  * This patch calls the coroutine OcrTool.ocr_from_image_path() without
    `await`; patch 2/2 adds the missing `await`, so apply both.

 data_chain/parser/handler/deep_pdf_parser.py  | 19 +---
 data_chain/parser/tools/instruct_scan_tool.py | 95 +++++++++++++++++++
 data_chain/parser/tools/ocr_tool.py           | 64 +++++++------
 requirements.txt                              |  3 +-
 4 files changed, 133 insertions(+), 48 deletions(-)
 create mode 100644 data_chain/parser/tools/instruct_scan_tool.py

diff --git a/data_chain/parser/handler/deep_pdf_parser.py b/data_chain/parser/handler/deep_pdf_parser.py
index 365664e..f81a779 100644
--- a/data_chain/parser/handler/deep_pdf_parser.py
+++ b/data_chain/parser/handler/deep_pdf_parser.py
@@ -4,9 +4,6 @@ import io
 import fitz
 from fitz import Page, Document
 import numpy as np
-from PIL import Image
-from pandas import DataFrame
-from paddleocr import PaddleOCR
 from pydantic import BaseModel, Field
 import uuid
 import cv2
@@ -16,6 +13,7 @@ import shutil
 from data_chain.entities.enum import DocParseRelutTopology, ChunkParseTopology, ChunkType
 from data_chain.parser.parse_result import ParseNode, ParseResult
 from data_chain.parser.handler.base_parser import BaseParser
+from data_chain.parser.tools.ocr_tool import OcrTool
 from data_chain.logger.logger import logger as logging
 
 
@@ -51,16 +49,6 @@ class ParseNodeWithBbox(BaseModel):
 
 class DeepPdfParser(BaseParser):
     name = 'pdf.deep'
-    det_model_dir = 'data_chain/parser/model/ocr/ch_PP-OCRv4_det_infer'
-    rec_model_dir = 'data_chain/parser/model/ocr/ch_PP-OCRv4_rec_infer'
-    cls_model_dir = 'data_chain/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer'
-    ocr = PaddleOCR(
-        det_model_dir=det_model_dir,
-        rec_model_dir=rec_model_dir,
-        cls_model_dir=cls_model_dir,
-        use_angle_cls=True,
-        lang="ch"
-    )  # 使用中文语言模型
 
     @staticmethod
     async def extract_text_from_page(
@@ -111,8 +99,7 @@ class DeepPdfParser(BaseParser):
     async def extract_text_from_page_by_ocr(
             image_path: str, exclude_regions: list[Bbox] = None) -> list[ParseNodeWithBbox]:
         text_nodes_with_bbox = []
-        image = cv2.imread(image_path)
-        result = DeepPdfParser.ocr.ocr(image, cls=True)
+        result = OcrTool.ocr_from_image_path(image_path)
         if not result or not result[0]:
             return []
         for line in result[0]:
@@ -281,7 +268,7 @@ class DeepPdfParser(BaseParser):
                                     int(merged_bboxes[index].x0): int(merged_bboxes[index].x1)]
                 table_image_path = os.path.join(tmp_path, f"table_{uuid.uuid4()}.png")
                 cv2.imwrite(table_image_path, table_image)
-                result = DeepPdfParser.ocr.ocr(table_image_path, cls=True)
+                result = OcrTool.ocr_from_image_path(table_image_path)
                 if not result or not result[0]:
                     continue
 
diff --git a/data_chain/parser/tools/instruct_scan_tool.py b/data_chain/parser/tools/instruct_scan_tool.py
new file mode 100644
index 0000000..9a849f4
--- /dev/null
+++ b/data_chain/parser/tools/instruct_scan_tool.py
@@ -0,0 +1,95 @@
+import platform
+import cpuinfo
+
+class InstructScanTool:
+    @staticmethod
+    def check_avx512_support():
+        """
+        Detect whether the current machine supports the AVX-512 instruction set.
+
+        Returns:
+            True: AVX-512 is definitely supported
+            False: AVX-512 is definitely not supported
+            "Maybe": support could not be determined (note: this string is truthy)
+        """
+        try:
+            # Prefer the cpuinfo library for precise information
+            info = cpuinfo.get_cpu_info()
+            flags = info.get('flags', [])
+
+            # Common AVX-512 sub-extensions
+            avx512_flags = [
+                'avx512f', 'avx512cd', 'avx512er', 'avx512pf',
+                'avx512dq', 'avx512bw', 'avx512vl', 'avx512ifma',
+                'avx512vbmi'
+            ]
+
+            # A single AVX-512 related flag is enough to report support
+            if any(flag in flags for flag in avx512_flags):
+                return True
+
+            # For Intel CPUs, check whether the brand belongs to a family that may ship AVX-512
+            brand = info.get('brand_raw', '').lower()
+            if 'intel' in brand:
+                # Xeon, or a Core part whose brand string names the 10th-13th generation
+                if 'xeon' in brand or ('core' in brand and any(f' {gen}th' in brand for gen in range(10, 14))):
+                    return "Maybe"  # only some models support it; confirm manually
+
+            return False
+
+        except Exception:
+            # Fall back to the platform-based check (less accurate)
+            return InstructScanTool._fallback_check()
+
+    @staticmethod
+    def _fallback_check():
+        """
+        Fallback detection based on platform-specific sources (the original implementation).
+        """
+        system = platform.system()
+
+        if system == "Linux":
+            try:
+                with open('/proc/cpuinfo', 'r') as f:
+                    cpuinfo_text = f.read()  # distinct name: must not shadow the cpuinfo module
+                    avx512_flags = [
+                        'avx512f', 'avx512cd', 'avx512er', 'avx512pf',
+                        'avx512dq', 'avx512bw', 'avx512vl', 'avx512ifma',
+                        'avx512vbmi'
+                    ]
+                    for flag in avx512_flags:
+                        if flag in cpuinfo_text:
+                            return True
+                    return False
+            except Exception:
+                return False
+
+        elif system == "Windows":
+            try:
+                import subprocess
+                # Try PowerShell first for more accurate information
+                try:
+                    output = subprocess.check_output(
+                        "powershell -command \"Get-WmiObject -Class Win32_Processor | Select-Object -ExpandProperty Name\"",
+                        shell=True, stderr=subprocess.DEVNULL).decode().lower()
+                except Exception:
+                    # Older Windows versions: fall back to wmic
+                    output = subprocess.check_output("wmic cpu get name", shell=True).decode().lower()
+
+                if "avx-512" in output:
+                    return True
+
+                # Processor families that might support AVX-512
+                if "xeon" in output or "i9" in output or "i7" in output:
+                    return "Maybe"
+
+                return False
+            except Exception:
+                return False
+
+        elif system == "Darwin":  # macOS
+            # macOS is always reported as unsupported by this fallback
+            return False
+
+        else:
+            return False
diff --git a/data_chain/parser/tools/ocr_tool.py b/data_chain/parser/tools/ocr_tool.py
index bfc7aab..da2b449 100644
--- a/data_chain/parser/tools/ocr_tool.py
+++ b/data_chain/parser/tools/ocr_tool.py
@@ -1,11 +1,12 @@
 from PIL import Image, ImageEnhance
 import yaml
-from paddleocr import PaddleOCR
+import cv2
 import numpy as np
 from data_chain.parser.tools.token_tool import TokenTool
 from data_chain.logger.logger import logger as logging
 from data_chain.config.config import config
 from data_chain.llm.llm import LLM
+from data_chain.parser.tools.instruct_scan_tool import InstructScanTool
 
 
 class OcrTool:
@@ -13,15 +14,33 @@ class OcrTool:
     rec_model_dir = 'data_chain/parser/model/ocr/ch_PP-OCRv4_rec_infer'
     cls_model_dir = 'data_chain/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer'
     # 优化 OCR 参数配置
-    model = PaddleOCR(
-        det_model_dir=det_model_dir,
-        rec_model_dir=rec_model_dir,
-        cls_model_dir=cls_model_dir,
-        use_angle_cls=True,
-        use_space_char=True,
-        det_db_thresh=0.3,  # 降低文本检测阈值,提高敏感度
-        det_db_box_thresh=0.5,  # 调整文本框阈值
-    )
+    if InstructScanTool.check_avx512_support():  # NOTE: the "Maybe" result is truthy and also loads the model
+        from paddleocr import PaddleOCR
+        model = PaddleOCR(
+            det_model_dir=det_model_dir,
+            rec_model_dir=rec_model_dir,
+            cls_model_dir=cls_model_dir,
+            use_angle_cls=True,
+            lang="ch"
+        )
+    else:
+        model = None
+
+    @staticmethod
+    async def ocr_from_image_path(image_path: str) -> list:
+        try:
+            # Without a loaded model there is nothing to run; log and return None
+            if OcrTool.model is None:
+                err = "[OCRTool] 当前机器不支持 AVX-512,无法进行OCR识别"
+                logging.error(err)
+                return None
+            image = cv2.imread(image_path)
+            result = OcrTool.model.ocr(image, cls=True)
+            return result
+        except Exception as e:
+            err = f"[OCRTool] OCR识别失败: {e}"
+            logging.exception(err)
+            return None
 
     @staticmethod
     async def ocr_from_image(image: np.ndarray) -> list:
@@ -29,27 +48,6 @@ class OcrTool:
 
             # 尝试OCR识别
             ocr_result = OcrTool.model.ocr(image)
-
-            # 如果第一次尝试失败,尝试不同的参数配置
-            if ocr_result is None or len(ocr_result) == 0 or ocr_result[0] is None:
-                logging.warning("[OCRTool] 第一次OCR尝试失败,尝试降低阈值...")
-                # 创建临时OCR实例,使用更低的阈值
-                temp_ocr = PaddleOCR(
-                    det_model_dir=OcrTool.det_model_dir,
-                    rec_model_dir=OcrTool.rec_model_dir,
-                    cls_model_dir=OcrTool.cls_model_dir,
-                    use_angle_cls=True,
-                    use_space_char=True,
-                    det_db_thresh=0.2,  # 更低的检测阈值
-                    det_db_box_thresh=0.4,  # 更低的文本框阈值
-                )
-                ocr_result = temp_ocr.ocr(image)
-
-            # 记录OCR结果状态
-            if ocr_result is None or len(ocr_result) == 0 or ocr_result[0] is None:
-                logging.warning("[OCRTool] 图片无法识别文本")
-                return None
-
             return ocr_result
         except Exception as e:
             err = f"[OCRTool] OCR识别失败: {e}"
@@ -100,6 +98,10 @@ class OcrTool:
     @staticmethod
     async def image_to_text(image: np.ndarray, image_related_text: str = '', llm: LLM = None) -> str:
         try:
+            if OcrTool.model is None:
+                err = "[OCRTool] 当前机器不支持 AVX-512,无法进行OCR识别"
+                logging.error(err)
+                return ''
             ocr_result = await OcrTool.ocr_from_image(image)
             if ocr_result is None:
                 return ''
diff --git a/requirements.txt b/requirements.txt
index cd791a2..81e5e5c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -45,4 +45,5 @@ tika==2.6.0
 tiktoken==0.8.0
 urllib3==2.2.1
 uvicorn==0.21.0
-xlrd==2.0.1
\ No newline at end of file
+xlrd==2.0.1
+py-cpuinfo==9.0.0
\ No newline at end of file
-- 
Gitee


From 20e44f02c44728aabc1d5c0244376d02d47c5cbe Mon Sep 17 00:00:00 2001
From: zxstty
Date: Tue, 1 Jul 2025 11:41:33 +0800
Subject: [PATCH 2/2] =?UTF-8?q?=E5=AE=8C=E5=96=84=20deep=20pdf=20parse=20?=
 =?UTF-8?q?=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_chain/parser/handler/deep_pdf_parser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_chain/parser/handler/deep_pdf_parser.py b/data_chain/parser/handler/deep_pdf_parser.py
index f81a779..c66fc40 100644
--- a/data_chain/parser/handler/deep_pdf_parser.py
+++ b/data_chain/parser/handler/deep_pdf_parser.py
@@ -99,7 +99,7 @@ class DeepPdfParser(BaseParser):
     async def extract_text_from_page_by_ocr(
             image_path: str, exclude_regions: list[Bbox] = None) -> list[ParseNodeWithBbox]:
         text_nodes_with_bbox = []
-        result = OcrTool.ocr_from_image_path(image_path)
+        result = await OcrTool.ocr_from_image_path(image_path)
         if not result or not result[0]:
             return []
         for line in result[0]:
@@ -268,7 +268,7 @@ class DeepPdfParser(BaseParser):
                                     int(merged_bboxes[index].x0): int(merged_bboxes[index].x1)]
                 table_image_path = os.path.join(tmp_path, f"table_{uuid.uuid4()}.png")
                 cv2.imwrite(table_image_path, table_image)
-                result = OcrTool.ocr_from_image_path(table_image_path)
+                result = await OcrTool.ocr_from_image_path(table_image_path)
                 if not result or not result[0]:
                     continue
 
-- 
Gitee