发票识别 checkpoints模型转inference模型后，识别效果不一致

#### 问题描述 / Problem Description
我的原始需求是做发票识别，使用的是ppstructure里面的关键信息抽取功能，具体步骤见复现代码。

#### 运行环境 / Runtime Environment
- OS: Windows10
- Paddle: 2.4.2
- PaddleOCR: 2.7.0

#### 复现代码 / Reproduction Code
1、文本检测模型使用自己训练的模型；
#训练det模型
python tools/train.py -c ./configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml
#导出inference模型
python tools/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml -o Global.pretrained_model=./output/ch_PP-OCR_V3_det/best_model/model  Global.save_inference_dir=./output/ch_PP-OCR_v3_det_inferer
使用的配置文件见附件。

2、训练识别发票的ser模型
#训练ser模型
python tools/train.py -c ./configs/kie/vi_layoutxlm/fapiao_putong_ser_vi_layoutxlm_xfund_zh_udml.yml
#预测ser模型
python tools/infer_kie_token_ser.py -c configs/kie/vi_layoutxlm/fapiao_putong_ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/fapiao_putong_ser_vi_layoutxlm_xfund_zh_udml/best_accuracy Global.infer_img=./train_data/fapiao_putong/test/imgs/b0.jpg
#预测结果正常，只识别了发票号码和开票日期
![输入图片说明](https://foruda.gitee.com/images/1719536689836205268/4618a71b_13255200.png "屏幕截图")

#转换成inference模型
python tools/export_model.py -c configs/kie/vi_layoutxlm/fapiao_putong_ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/fapiao_putong_ser_vi_layoutxlm_xfund_zh_udml/best_accuracy Global.save_inference_dir=./inference/fapiao_putong_ser_vi_layoutxlm_xfund_zh_udml

#预测inference模型
cd ppstructure
python kie/predict_kie_token_ser.py --kie_algorithm=LayoutXLM --ser_model_dir=../inference/fapiao_putong_ser_vi_layoutxlm_xfund_zh_udml --image_dir=../train_data/fapiao_putong/test/imgs/b0.jpg --ser_dict_path=../train_data/fapiao_putong/class_list.txt --vis_font_path=../doc/fonts/simfang.ttf --ocr_order_method="tb-yx" --det_model_dir=./output/ch_PP-OCR_v3_det_inferer
#预测inference结果：整张发票上的信息都识别出来了，感觉没有使用自己训练的文本检测模型，但是det_model_dir是传了值的
![inference模型预测结果截图](https://github.com/PaddlePaddle/PaddleOCR/assets/129714430/4f9c8b21-5cc0-4034-81d0-2650e075c581)

配置文件见附件。

#### 完整报错 / Complete Error Message
没有报错，只是识别效果不一致。

#### 可能解决方案 / Possible solutions

#### 附件 / Appendix
ch_PP-OCRv3_det_student.yml
# Run-level settings for training the DB text-detection student model.
Global:
  debug: false
  use_gpu: true
  epoch_num: 500
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/ch_PP-OCR_V3_det/
  save_epoch_step: 100
  # Two-element [start, interval] pair; presumably "evaluate from iteration 0,
  # then every 500 iterations" -- confirm against the trainer.
  eval_batch_step: [0, 500]
  cal_metric_during_train: false
  # Weights to start from; the path points at the student weights of a
  # distilled PP-OCRv3 detector.
  pretrained_model: ./pretrained_model/ch_PP-OCRv3_det_distill_train/student.pdparams
  checkpoints: null
  save_inference_dir: null
  use_visualdl: false
  infer_img: doc/imgs_en/img_10.jpg
  save_res_path: ./checkpoints/det_db/predicts_db.txt
  distributed: true

# Detector network: DB algorithm with a MobileNetV3 backbone, RSEFPN neck and
# DBHead head. No Transform stage is configured.
Architecture:
  model_type: det
  algorithm: DB
  Transform: null
  Backbone:
    name: MobileNetV3
    scale: 0.5
    model_name: large
    disable_se: true
  Neck:
    name: RSEFPN
    out_channels: 96
    shortcut: true
  Head:
    name: DBHead
    k: 50

# Training loss: DBLoss with DiceLoss as the main loss type.
# alpha / beta / ohem_ratio are passed through to DBLoss; presumably they weight
# the individual loss terms and the hard-example ratio -- TODO confirm.
Loss:
  name: DBLoss
  balance_loss: true
  main_loss_type: DiceLoss
  alpha: 5
  beta: 10
  ohem_ratio: 3
# Adam optimizer with a Cosine learning-rate schedule and L2 regularization.
Optimizer:
  name: Adam
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Cosine
    learning_rate: 5.0e-05
    warmup_epoch: 2
  regularizer:
    name: L2
    factor: 0.00005
# Post-processing applied to the detector output (DBPostProcess).
# NOTE(review): thresh / box_thresh / unclip_ratio are assumed to be the usual
# DB binarization, box-score and box-expansion settings -- confirm in the class.
PostProcess:
  name: DBPostProcess
  thresh: 0.3
  box_thresh: 0.6
  max_candidates: 1000
  unclip_ratio: 1.5
# Evaluation metric; hmean is named as the main indicator.
Metric:
  name: DetMetric
  main_indicator: hmean
# Training dataset, preprocessing/augmentation pipeline and data loader.
Train:
  dataset:
    name: SimpleDataSet
    data_dir: ./train_data/wenbenjiance_initdata
    label_file_list:
      - ./train_data/wenbenjiance_initdata/train.txt
    ratio_list: [1.0]
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: false
      - DetLabelEncode: null
      - IaaAugment:
          augmenter_args:
            - type: Fliplr
              args:
                p: 0.5
            - type: Affine
              args:
                rotate: [-10, 10]
            - type: Resize
              args:
                size: [0.5, 3]
      - EastRandomCropData:
          size: [960, 960]
          max_tries: 50
          keep_ratio: true
      - MakeBorderMap:
          shrink_ratio: 0.4
          thresh_min: 0.3
          thresh_max: 0.7
      - MakeShrinkMap:
          shrink_ratio: 0.4
          min_text_size: 8
      - NormalizeImage:
          # Plain string "1./255." (not a YAML number); left exactly as written.
          scale: 1./255.
          mean: [0.485, 0.456, 0.406]
          std: [0.229, 0.224, 0.225]
          order: hwc
      - ToCHWImage: null
      - KeepKeys:
          keep_keys: [image, threshold_map, threshold_mask, shrink_map, shrink_mask]
  loader:
    shuffle: true
    drop_last: false
    batch_size_per_card: 4
    num_workers: 0
# Validation dataset, preprocessing pipeline and data loader.
Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ./train_data/wenbenjiance_initdata
    label_file_list:
      - ./train_data/wenbenjiance_initdata/val.txt
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: false
      - DetLabelEncode: null
      - DetResizeForTest: null
      - NormalizeImage:
          # Plain string "1./255." (not a YAML number); left exactly as written.
          scale: 1./255.
          mean: [0.485, 0.456, 0.406]
          std: [0.229, 0.224, 0.225]
          order: hwc
      - ToCHWImage: null
      - KeepKeys:
          keep_keys: [image, shape, polys, ignore_tags]
  loader:
    shuffle: false
    drop_last: false
    batch_size_per_card: 1
    num_workers: 0
fapiao_putong_ser_vi_layoutxlm_xfund_zh_udml.yml
# Run-level settings for the distillation (UDML) SER training run.
Global:
  use_gpu: true
  # Anchored so the optimizer's LR schedule can reuse the same epoch count.
  epoch_num: &epoch_num 10
  log_smooth_window: 10
  print_batch_step: 10
  save_model_dir: ./output/fapiao_putong_ser_vi_layoutxlm_xfund_zh_udml
  save_epoch_step: 32
  # Two-element [start, interval] pair; presumably evaluation runs every 19
  # iterations after the 0th iteration -- confirm against the trainer.
  eval_batch_step:
    - 0
    - 19
  cal_metric_during_train: false
  save_inference_dir: null
  use_visualdl: false
  seed: 2022
  infer_img: ppstructure/docs/kie/input/zh_val_42.jpg
  save_res_path: ./output/fapiao_putong_ser_vi_layoutxlm_xfund_zh_udml_res

# Distillation model made of two identically configured LayoutXLMForSer
# sub-models, Teacher and Student. Anchors defined here (model_type, algorithm,
# num_classes) are reused further down the file.
Architecture:
  model_type: &model_type kie
  name: DistillationModel
  algorithm: Distillation
  Models:
    Teacher:
      pretrained: null
      freeze_params: false
      return_all_feats: true
      model_type: *model_type
      algorithm: &algorithm LayoutXLM
      Transform: null
      Backbone:
        name: LayoutXLMForSer
        pretrained: true
        # one of base or vi
        mode: vi
        checkpoints: null
        num_classes: &num_classes 5
    Student:
      pretrained: null
      freeze_params: false
      return_all_feats: true
      model_type: *model_type
      algorithm: *algorithm
      Transform: null
      Backbone:
        name: LayoutXLMForSer
        pretrained: true
        # one of base or vi
        mode: vi
        checkpoints: null
        num_classes: *num_classes

# Combined distillation loss: one token-classification loss, one DML loss and
# two distance losses on the hidden_states_5 / hidden_states_8 outputs.
Loss:
  name: CombinedLoss
  loss_config_list:
    - DistillationVQASerTokenLayoutLMLoss:
        weight: 1.0
        model_name_list:
          - Student
          - Teacher
        key: backbone_out
        num_classes: *num_classes
    - DistillationSERDMLLoss:
        weight: 1.0
        act: softmax
        use_log: true
        model_name_pairs:
          - [Student, Teacher]
        key: backbone_out
    - DistillationVQADistanceLoss:
        weight: 0.5
        mode: l2
        model_name_pairs:
          - [Student, Teacher]
        key: hidden_states_5
        name: loss_5
    - DistillationVQADistanceLoss:
        weight: 0.5
        mode: l2
        model_name_pairs:
          - [Student, Teacher]
        key: hidden_states_8
        name: loss_8

# AdamW optimizer with a Linear learning-rate schedule; L2 factor is zero.
Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Linear
    learning_rate: 5.0e-05
    epochs: *epoch_num
    # NOTE(review): warmup_epoch (10) equals the anchored epoch_num (10). If
    # warmup is counted in epochs, the whole run is spent in warmup -- confirm
    # this is intended.
    warmup_epoch: 10
  regularizer:
    name: L2
    factor: 0.0
    
# Post-processing for the distillation outputs; class_path is anchored for
# reuse by the label-encoding transforms.
PostProcess:
  name: DistillationSerPostProcess
  model_name:
    - Student
    - Teacher
  key: backbone_out
  class_path: &class_path train_data/fapiao_putong/class_list.txt

# Metric wrapper; the "Student" key selects which sub-model is scored, with
# hmean named as the main indicator.
Metric:
  name: DistillationMetric
  base_metric_name: VQASerTokenMetric
  main_indicator: hmean
  key: Student

# Training dataset, token/image preprocessing pipeline and data loader.
Train:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/fapiao_putong/train/imgs
    label_file_list:
      - train_data/fapiao_putong/train/train.json
    ratio_list: [1.0]
    transforms:
      # load image
      - DecodeImage:
          img_mode: RGB
          channel_first: false
      # Class handling label
      - VQATokenLabelEncode:
          contains_re: false
          algorithm: *algorithm
          class_path: *class_path
          # one of [None, "tb-yx"]
          order_method: &order_method tb-yx
      - VQATokenPad:
          max_seq_len: &max_seq_len 512
          return_attention_mask: true
      - VQASerTokenChunk:
          max_seq_len: *max_seq_len
      - Resize:
          size: [224, 224]
      - NormalizeImage:
          scale: 1
          mean: [123.675, 116.28, 103.53]
          std: [58.395, 57.12, 57.375]
          order: hwc
      - ToCHWImage: null
      - KeepKeys:
          # dataloader will return list in this order
          keep_keys:
            - input_ids
            - bbox
            - attention_mask
            - token_type_ids
            - image
            - labels
  loader:
    shuffle: true
    drop_last: false
    batch_size_per_card: 4
    num_workers: 0

# Validation dataset, token/image preprocessing pipeline and data loader.
Eval:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/fapiao_putong/val/imgs
    label_file_list:
      - train_data/fapiao_putong/val/val.json
    transforms:
      # load image
      - DecodeImage:
          img_mode: RGB
          channel_first: false
      # Class handling label
      - VQATokenLabelEncode:
          contains_re: false
          algorithm: *algorithm
          class_path: *class_path
          order_method: *order_method
      - VQATokenPad:
          max_seq_len: *max_seq_len
          return_attention_mask: true
      - VQASerTokenChunk:
          max_seq_len: *max_seq_len
      - Resize:
          size: [224, 224]
      - NormalizeImage:
          scale: 1
          mean: [123.675, 116.28, 103.53]
          std: [58.395, 57.12, 57.375]
          order: hwc
      - ToCHWImage: null
      - KeepKeys:
          # dataloader will return list in this order
          keep_keys:
            - input_ids
            - bbox
            - attention_mask
            - token_type_ids
            - image
            - labels
  loader:
    shuffle: false
    drop_last: false
    batch_size_per_card: 8
    num_workers: 0

fapiao_putong_ser_vi_layoutxlm_xfund_zh.yml
# Run-level settings for the single-model (non-distillation) SER config that is
# used for checkpoint prediction and inference-model export.
Global:
  use_gpu: true
  # Anchored so the optimizer's LR schedule can reuse the same epoch count.
  epoch_num: &epoch_num 200
  log_smooth_window: 10
  print_batch_step: 10
  save_model_dir: ./output/fapiao_putong_ser_vi_layoutxlm_xfund_zh
  save_epoch_step: 2000
  # Two-element [start, interval] pair; presumably evaluation runs every 19
  # iterations after the 0th iteration -- confirm against the trainer.
  eval_batch_step: [0, 19]
  cal_metric_during_train: false
  save_inference_dir: null
  use_visualdl: false
  seed: 2022
  infer_img: ppstructure/docs/kie/input/zh_val_42.jpg
  d2s_train_image_shape: [3, 224, 224]
  # if you want to predict using the groundtruth ocr info,
  # you can use the following config
  # infer_img: train_data/XFUND/zh_val/val.json
  # infer_mode: False

  # save_res_path must share the indentation of its siblings. At column 0 the
  # document no longer parses as one mapping and the kie_* keys below fall
  # outside Global.
  save_res_path: ./output/fapiao_putong_ser_vi_layoutxlm_xfund_zh_res
  kie_rec_model_dir: null
  # Custom detection inference model; presumably handed to the OCR engine that
  # supplies text boxes for SER prediction -- verify in infer_kie_token_ser.py.
  kie_det_model_dir: ./output/ch_PP-OCR_v3_det_inferer
  amp_custom_white_list: ['scale', 'concat', 'elementwise_add']

# Single LayoutXLMForSer model in "vi" mode. The algorithm and num_classes
# anchors defined here are reused further down the file.
Architecture:
  model_type: kie
  algorithm: &algorithm LayoutXLM
  Transform: null
  Backbone:
    name: LayoutXLMForSer
    pretrained: true
    checkpoints: null
    # one of base or vi
    mode: vi
    num_classes: &num_classes 5

# Token-classification loss computed on the "backbone_out" output.
Loss:
  name: VQASerTokenLayoutLMLoss
  num_classes: *num_classes
  key: backbone_out

# AdamW optimizer with a Linear learning-rate schedule; L2 factor is zero.
Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Linear
    learning_rate: 5.0e-05
    epochs: *epoch_num
    warmup_epoch: 2
  regularizer:
    name: L2
    factor: 0.0
    
# Post-processing for SER predictions; class_path is anchored here and reused
# by the VQATokenLabelEncode transforms in Train and Eval.
PostProcess:
  name: VQASerTokenLayoutLMPostProcess
  class_path: &class_path train_data/fapiao_putong/class_list.txt

# Evaluation metric; hmean is named as the main indicator.
Metric:
  name: VQASerTokenMetric
  main_indicator: hmean

# Training dataset, token/image preprocessing pipeline and data loader.
Train:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/fapiao_putong/train/imgs
    label_file_list:
      - train_data/fapiao_putong/train/train.json
    ratio_list: [1.0]
    transforms:
      # load image
      - DecodeImage:
          img_mode: RGB
          channel_first: false
      # Class handling label
      - VQATokenLabelEncode:
          contains_re: false
          algorithm: *algorithm
          class_path: *class_path
          use_textline_bbox_info: &use_textline_bbox_info true
          # one of [None, "tb-yx"]
          order_method: &order_method tb-yx
      - VQATokenPad:
          max_seq_len: &max_seq_len 512
          return_attention_mask: true
      - VQASerTokenChunk:
          max_seq_len: *max_seq_len
      - Resize:
          size: [224, 224]
      - NormalizeImage:
          scale: 1
          mean: [123.675, 116.28, 103.53]
          std: [58.395, 57.12, 57.375]
          order: hwc
      - ToCHWImage: null
      - KeepKeys:
          # dataloader will return list in this order
          keep_keys:
            - input_ids
            - bbox
            - attention_mask
            - token_type_ids
            - image
            - labels
  loader:
    shuffle: true
    drop_last: false
    batch_size_per_card: 8
    num_workers: 0

# Validation dataset, token/image preprocessing pipeline and data loader.
Eval:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/fapiao_putong/val/imgs
    label_file_list:
      - train_data/fapiao_putong/val/val.json
    transforms:
      # load image
      - DecodeImage:
          img_mode: RGB
          channel_first: false
      # Class handling label
      - VQATokenLabelEncode:
          contains_re: false
          algorithm: *algorithm
          class_path: *class_path
          use_textline_bbox_info: *use_textline_bbox_info
          order_method: *order_method
      - VQATokenPad:
          max_seq_len: *max_seq_len
          return_attention_mask: true
      - VQASerTokenChunk:
          max_seq_len: *max_seq_len
      - Resize:
          size: [224, 224]
      - NormalizeImage:
          scale: 1
          mean: [123.675, 116.28, 103.53]
          std: [58.395, 57.12, 57.375]
          order: hwc
      - ToCHWImage: null
      - KeepKeys:
          # dataloader will return list in this order
          keep_keys:
            - input_ids
            - bbox
            - attention_mask
            - token_type_ids
            - image
            - labels
  loader:
    shuffle: false
    drop_last: false
    batch_size_per_card: 8
    num_workers: 0

GVP PaddlePaddle/PaddleOCR

内容风险标识

评论 (1)

GVPPaddlePaddle/PaddleOCR

内容风险标识