From 414c39436ba6be3674396f10f9ccc6db80b9f0c6 Mon Sep 17 00:00:00 2001
From: zxstty <zhaojiaqi18@huawei.com>
Date: Tue, 1 Jul 2025 10:55:39 +0800
Subject: [PATCH 1/2] =?UTF-8?q?=E5=AE=8C=E5=96=84=E5=AF=B9avx512=E7=9A=84?=
 =?UTF-8?q?=E9=80=82=E9=85=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_chain/parser/handler/deep_pdf_parser.py  | 19 +---
 data_chain/parser/tools/instruct_scan_tool.py | 95 +++++++++++++++++++
 data_chain/parser/tools/ocr_tool.py           | 64 +++++++------
 requirements.txt                              |  3 +-
 4 files changed, 133 insertions(+), 48 deletions(-)
 create mode 100644 data_chain/parser/tools/instruct_scan_tool.py

diff --git a/data_chain/parser/handler/deep_pdf_parser.py b/data_chain/parser/handler/deep_pdf_parser.py
index 365664e..f81a779 100644
--- a/data_chain/parser/handler/deep_pdf_parser.py
+++ b/data_chain/parser/handler/deep_pdf_parser.py
@@ -4,9 +4,6 @@ import io
 import fitz
 from fitz import Page, Document
 import numpy as np
-from PIL import Image
-from pandas import DataFrame
-from paddleocr import PaddleOCR
 from pydantic import BaseModel, Field
 import uuid
 import cv2
@@ -16,6 +13,7 @@ import shutil
 from data_chain.entities.enum import DocParseRelutTopology, ChunkParseTopology, ChunkType
 from data_chain.parser.parse_result import ParseNode, ParseResult
 from data_chain.parser.handler.base_parser import BaseParser
+from data_chain.parser.tools.ocr_tool import OcrTool
 from data_chain.logger.logger import logger as logging
 
 
@@ -51,16 +49,6 @@ class ParseNodeWithBbox(BaseModel):
 
 class DeepPdfParser(BaseParser):
     name = 'pdf.deep'
-    det_model_dir = 'data_chain/parser/model/ocr/ch_PP-OCRv4_det_infer'
-    rec_model_dir = 'data_chain/parser/model/ocr/ch_PP-OCRv4_rec_infer'
-    cls_model_dir = 'data_chain/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer'
-    ocr = PaddleOCR(
-        det_model_dir=det_model_dir,
-        rec_model_dir=rec_model_dir,
-        cls_model_dir=cls_model_dir,
-        use_angle_cls=True,
-        lang="ch"
-    )  # 使用中文语言模型
 
     @staticmethod
     async def extract_text_from_page(
@@ -111,8 +99,7 @@ class DeepPdfParser(BaseParser):
     async def extract_text_from_page_by_ocr(
             image_path: str, exclude_regions: list[Bbox] = None) -> list[ParseNodeWithBbox]:
         text_nodes_with_bbox = []
-        image = cv2.imread(image_path)
-        result = DeepPdfParser.ocr.ocr(image, cls=True)
+        result = OcrTool.ocr_from_image_path(image_path)
         if not result or not result[0]:
             return []
         for line in result[0]:
@@ -281,7 +268,7 @@ class DeepPdfParser(BaseParser):
                                     int(merged_bboxes[index].x0): int(merged_bboxes[index].x1)]
                 table_image_path = os.path.join(tmp_path, f"table_{uuid.uuid4()}.png")
                 cv2.imwrite(table_image_path, table_image)
-                result = DeepPdfParser.ocr.ocr(table_image_path, cls=True)
+                result = OcrTool.ocr_from_image_path(table_image_path)
 
                 if not result or not result[0]:
                     continue
diff --git a/data_chain/parser/tools/instruct_scan_tool.py b/data_chain/parser/tools/instruct_scan_tool.py
new file mode 100644
index 0000000..9a849f4
--- /dev/null
+++ b/data_chain/parser/tools/instruct_scan_tool.py
@@ -0,0 +1,95 @@
+import platform
+import cpuinfo
+
+class InstructScanTool:
+    """Detect whether the host CPU supports the AVX-512 instruction set."""
+
+    # AVX-512 sub-feature flags looked for in py-cpuinfo and /proc/cpuinfo output.
+    _AVX512_FLAGS = (
+        'avx512f', 'avx512cd', 'avx512er', 'avx512pf',
+        'avx512dq', 'avx512bw', 'avx512vl', 'avx512ifma',
+        'avx512vbmi',
+    )
+
+    @staticmethod
+    def check_avx512_support():
+        """
+        Check whether the current system supports AVX-512.
+
+        Returns:
+            True: an AVX-512 flag is reported by the CPU.
+            False: no AVX-512 support was detected.
+            "Maybe": support could not be determined (note: this string is truthy).
+        """
+        try:
+            # Prefer py-cpuinfo, which reports the CPU feature flags directly.
+            info = cpuinfo.get_cpu_info()
+            flags = info.get('flags', [])
+
+            # Any single AVX-512 sub-feature flag counts as support.
+            if any(flag in flags for flag in InstructScanTool._AVX512_FLAGS):
+                return True
+
+            # For Intel CPUs without the flag, report "Maybe" for families known
+            # to include AVX-512 models: Xeon, or 10th-13th generation Core.
+            brand = info.get('brand_raw', '').lower()
+            if 'intel' in brand:
+                # Pad with a leading space so a brand that *starts* with the
+                # generation (e.g. "11th Gen Intel(R) Core(TM) ...") also matches.
+                padded = f' {brand}'
+                is_recent_core = 'core' in brand and any(
+                    f' {gen}th' in padded for gen in range(10, 14))
+                if 'xeon' in brand or is_recent_core:
+                    return "Maybe"  # only some models support it; confirm manually
+
+            return False
+
+        except Exception:
+            # py-cpuinfo failed: fall back to less accurate platform probing.
+            return InstructScanTool._fallback_check()
+
+    @staticmethod
+    def _fallback_check():
+        """
+        Platform-specific fallback detection used when py-cpuinfo fails.
+
+        Returns True, False or "Maybe", with the same meaning as in
+        check_avx512_support.
+        """
+        system = platform.system()
+
+        if system == "Linux":
+            try:
+                # Named cpu_text so the imported cpuinfo module is not shadowed.
+                with open('/proc/cpuinfo', 'r') as f:
+                    cpu_text = f.read()
+            except Exception:
+                return False
+            return any(flag in cpu_text for flag in InstructScanTool._AVX512_FLAGS)
+
+        if system == "Windows":
+            try:
+                import subprocess
+                # Try PowerShell first to read the processor name.
+                try:
+                    output = subprocess.check_output(
+                        "powershell -command \"Get-WmiObject -Class Win32_Processor | Select-Object -ExpandProperty Name\"",
+                        shell=True, stderr=subprocess.DEVNULL).decode().lower()
+                except Exception:
+                    # Older Windows versions: fall back to wmic. A bare except
+                    # here would also swallow KeyboardInterrupt/SystemExit.
+                    output = subprocess.check_output("wmic cpu get name", shell=True).decode().lower()
+
+                if "avx-512" in output:
+                    return True
+
+                # Processor families that may include AVX-512 capable models.
+                if "xeon" in output or "i9" in output or "i7" in output:
+                    return "Maybe"
+
+                return False
+            except Exception:
+                return False
+
+        # macOS is assumed not to support AVX-512; other platforms are not probed.
+        return False
diff --git a/data_chain/parser/tools/ocr_tool.py b/data_chain/parser/tools/ocr_tool.py
index bfc7aab..da2b449 100644
--- a/data_chain/parser/tools/ocr_tool.py
+++ b/data_chain/parser/tools/ocr_tool.py
@@ -1,11 +1,12 @@
 from PIL import Image, ImageEnhance
 import yaml
-from paddleocr import PaddleOCR
+import cv2
 import numpy as np
 from data_chain.parser.tools.token_tool import TokenTool
 from data_chain.logger.logger import logger as logging
 from data_chain.config.config import config
 from data_chain.llm.llm import LLM
+from data_chain.parser.tools.instruct_scan_tool import InstructScanTool
 
 
 class OcrTool:
@@ -13,15 +14,33 @@ class OcrTool:
     rec_model_dir = 'data_chain/parser/model/ocr/ch_PP-OCRv4_rec_infer'
     cls_model_dir = 'data_chain/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer'
     # 优化 OCR 参数配置
-    model = PaddleOCR(
-        det_model_dir=det_model_dir,
-        rec_model_dir=rec_model_dir,
-        cls_model_dir=cls_model_dir,
-        use_angle_cls=True,
-        use_space_char=True,
-        det_db_thresh=0.3,       # 降低文本检测阈值，提高敏感度
-        det_db_box_thresh=0.5,   # 调整文本框阈值
-    )
+    if InstructScanTool.check_avx512_support():  # NOTE: a "Maybe" result is a non-empty string, hence truthy, and also takes this branch
+        from paddleocr import PaddleOCR  # imported inside the branch, so paddleocr is only loaded when the check passes
+        model = PaddleOCR(
+            det_model_dir=det_model_dir,
+            rec_model_dir=rec_model_dir,
+            cls_model_dir=cls_model_dir,
+            use_angle_cls=True,
+            lang="ch"
+        )
+    else:
+        model = None  # OCR disabled; ocr_from_image_path and image_to_text test for None before using the model
+
+
+    @staticmethod
+    async def ocr_from_image_path(image_path: str) -> list:
+        """Run OCR on the image file at image_path; return None if OCR is unavailable or fails."""
+        try:
+            if OcrTool.model is None:
+                logging.error("[OCRTool] 当前机器不支持 AVX-512，无法进行OCR识别")
+                return None
+            image = cv2.imread(image_path)
+            if image is None:  # cv2.imread returns None instead of raising when the file cannot be read
+                logging.error(f"[OCRTool] OCR识别失败: 无法读取图片 {image_path}")
+                return None
+            return OcrTool.model.ocr(image, cls=True)
+        except Exception as e:
+            logging.exception(f"[OCRTool] OCR识别失败: {e}")
+            return None
 
     @staticmethod
     async def ocr_from_image(image: np.ndarray) -> list:
@@ -29,27 +48,6 @@ class OcrTool:
 
             # 尝试OCR识别
             ocr_result = OcrTool.model.ocr(image)
-
-            # 如果第一次尝试失败，尝试不同的参数配置
-            if ocr_result is None or len(ocr_result) == 0 or ocr_result[0] is None:
-                logging.warning("[OCRTool] 第一次OCR尝试失败，尝试降低阈值...")
-                # 创建临时OCR实例，使用更低的阈值
-                temp_ocr = PaddleOCR(
-                    det_model_dir=OcrTool.det_model_dir,
-                    rec_model_dir=OcrTool.rec_model_dir,
-                    cls_model_dir=OcrTool.cls_model_dir,
-                    use_angle_cls=True,
-                    use_space_char=True,
-                    det_db_thresh=0.2,       # 更低的检测阈值
-                    det_db_box_thresh=0.4,   # 更低的文本框阈值
-                )
-                ocr_result = temp_ocr.ocr(image)
-
-            # 记录OCR结果状态
-            if ocr_result is None or len(ocr_result) == 0 or ocr_result[0] is None:
-                logging.warning("[OCRTool] 图片无法识别文本")
-                return None
-
             return ocr_result
         except Exception as e:
             err = f"[OCRTool] OCR识别失败: {e}"
@@ -100,6 +98,10 @@ class OcrTool:
     @staticmethod
     async def image_to_text(image: np.ndarray, image_related_text: str = '', llm: LLM = None) -> str:
         try:
+            if OcrTool.model is None:
+                err = "[OCRTool] 当前机器不支持 AVX-512，无法进行OCR识别"
+                logging.error(err)
+                return ''
             ocr_result = await OcrTool.ocr_from_image(image)
             if ocr_result is None:
                 return ''
diff --git a/requirements.txt b/requirements.txt
index cd791a2..81e5e5c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -45,4 +45,5 @@ tika==2.6.0
 tiktoken==0.8.0
 urllib3==2.2.1
 uvicorn==0.21.0
-xlrd==2.0.1
\ No newline at end of file
+xlrd==2.0.1
+py-cpuinfo==9.0.0
\ No newline at end of file
-- 
Gitee


From 20e44f02c44728aabc1d5c0244376d02d47c5cbe Mon Sep 17 00:00:00 2001
From: zxstty <zhaojiaqi18@huawei.com>
Date: Tue, 1 Jul 2025 11:41:33 +0800
Subject: [PATCH 2/2] =?UTF-8?q?=E5=AE=8C=E5=96=84=20deep=20pdf=20parse=20?=
 =?UTF-8?q?=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_chain/parser/handler/deep_pdf_parser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_chain/parser/handler/deep_pdf_parser.py b/data_chain/parser/handler/deep_pdf_parser.py
index f81a779..c66fc40 100644
--- a/data_chain/parser/handler/deep_pdf_parser.py
+++ b/data_chain/parser/handler/deep_pdf_parser.py
@@ -99,7 +99,7 @@ class DeepPdfParser(BaseParser):
     async def extract_text_from_page_by_ocr(
             image_path: str, exclude_regions: list[Bbox] = None) -> list[ParseNodeWithBbox]:
         text_nodes_with_bbox = []
-        result = OcrTool.ocr_from_image_path(image_path)
+        result = await OcrTool.ocr_from_image_path(image_path)
         if not result or not result[0]:
             return []
         for line in result[0]:
@@ -268,7 +268,7 @@ class DeepPdfParser(BaseParser):
                                     int(merged_bboxes[index].x0): int(merged_bboxes[index].x1)]
                 table_image_path = os.path.join(tmp_path, f"table_{uuid.uuid4()}.png")
                 cv2.imwrite(table_image_path, table_image)
-                result = OcrTool.ocr_from_image_path(table_image_path)
+                result = await OcrTool.ocr_from_image_path(table_image_path)
 
                 if not result or not result[0]:
                     continue
-- 
Gitee