From 830d7b1ed36e9d541d5c4ba4d7f8c4f2babb8357 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Wed, 10 Dec 2025 11:25:59 +0800 Subject: [PATCH 1/7] support yolox bs 1/32/64 --- .../cv/object_detection/yolox/ixrt/README.md | 27 +- .../yolox/ixrt/{python => }/build_engine.py | 0 .../build_engine_by_write_qparams.py | 0 .../ixrt/{python => }/calibration_dataset.py | 0 .../object_detection/yolox/ixrt/ci/prepare.sh | 13 +- .../yolox/ixrt/{python => }/cut_model.py | 0 .../ixrt/{python => }/datasets/__init__.py | 0 .../yolox/ixrt/{python => }/datasets/coco.py | 0 .../ixrt/{python => }/datasets/common.py | 0 .../{python => }/datasets/post_process.py | 0 .../ixrt/{python => }/datasets/pre_process.py | 0 .../ixrt/{python => }/datasets/vision.py | 0 .../yolox/ixrt/{python => }/deploy.py | 0 .../yolox/ixrt/{python => }/inference.py | 0 .../ixrt/{python => }/load_ixrt_plugin.py | 2 +- .../yolox/ixrt/plugin/CMakeLists.txt | 61 ----- .../ixrt/plugin/cmake/FindCompiler.cmake | 15 - .../yolox/ixrt/plugin/cmake/FindCuda.cmake | 57 ---- .../yolox/ixrt/plugin/cmake/FindIxrt.cmake | 20 -- .../ixrt/plugin/cmake/FindPluginFiles.cmake | 4 - .../plugin/src/common/checkMacrosPlugin.cpp | 46 ---- .../plugin/src/common/checkMacrosPlugin.h | 205 -------------- .../ixrt/plugin/src/common/common_def.cuh | 64 ----- .../plugin/src/common/kernels/cuda_helper.cuh | 23 -- .../yolox/ixrt/plugin/src/common/plugin.cpp | 47 ---- .../yolox/ixrt/plugin/src/common/plugin.h | 60 ---- .../yolox/ixrt/plugin/src/common/serialize.h | 132 --------- .../src/yolox_decoder/yoloxDecoderKernel.cu | 59 ---- .../src/yolox_decoder/yoloxDecoderKernel.h | 29 -- .../src/yolox_decoder/yoloxDecoderPlugin.cc | 258 ------------------ .../src/yolox_decoder/yoloxDecoderPlugin.h | 98 ------- .../src/yolox_decoder/yolox_decoder.cuh | 114 -------- .../yolox/ixrt/{python => }/quant.py | 2 +- .../yolox/ixrt/requirements.txt | 2 +- .../ixrt/scripts/infer_yolox_fp16_accuracy.sh | 14 +- .../scripts/infer_yolox_fp16_performance.sh | 11 +- .../ixrt/scripts/infer_yolox_int8_accuracy.sh | 9 +- .../scripts/infer_yolox_int8_performance.sh | 9 +- .../yolox/ixrt/{python => }/utils.py | 0 39 files changed, 36 insertions(+), 1345 deletions(-) rename models/cv/object_detection/yolox/ixrt/{python => }/build_engine.py (100%) rename models/cv/object_detection/yolox/ixrt/{python => }/build_engine_by_write_qparams.py (100%) rename models/cv/object_detection/yolox/ixrt/{python => }/calibration_dataset.py (100%) rename models/cv/object_detection/yolox/ixrt/{python => }/cut_model.py (100%) rename models/cv/object_detection/yolox/ixrt/{python => }/datasets/__init__.py (100%) rename models/cv/object_detection/yolox/ixrt/{python => }/datasets/coco.py (100%) rename models/cv/object_detection/yolox/ixrt/{python => }/datasets/common.py (100%) rename models/cv/object_detection/yolox/ixrt/{python => }/datasets/post_process.py (100%) rename models/cv/object_detection/yolox/ixrt/{python => }/datasets/pre_process.py (100%) rename models/cv/object_detection/yolox/ixrt/{python => }/datasets/vision.py (100%) rename models/cv/object_detection/yolox/ixrt/{python => }/deploy.py (100%) rename models/cv/object_detection/yolox/ixrt/{python => }/inference.py (100%) rename models/cv/object_detection/yolox/ixrt/{python => }/load_ixrt_plugin.py (93%) delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/CMakeLists.txt delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/cmake/FindCompiler.cmake delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/cmake/FindCuda.cmake 
delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/cmake/FindIxrt.cmake delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/cmake/FindPluginFiles.cmake delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/src/common/checkMacrosPlugin.cpp delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/src/common/checkMacrosPlugin.h delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/src/common/common_def.cuh delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/src/common/kernels/cuda_helper.cuh delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/src/common/plugin.cpp delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/src/common/plugin.h delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/src/common/serialize.h delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderKernel.cu delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderKernel.h delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderPlugin.cc delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderPlugin.h delete mode 100644 models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yolox_decoder.cuh rename models/cv/object_detection/yolox/ixrt/{python => }/quant.py (99%) rename models/cv/object_detection/yolox/ixrt/{python => }/utils.py (100%) diff --git a/models/cv/object_detection/yolox/ixrt/README.md b/models/cv/object_detection/yolox/ixrt/README.md index d5fad290..156eed0f 100644 --- a/models/cv/object_detection/yolox/ixrt/README.md +++ b/models/cv/object_detection/yolox/ixrt/README.md @@ -49,44 +49,31 @@ coco ### Install Dependencies -Contact the Iluvatar administrator to get the missing packages: - -- mmcv-2.1.0+corex.4.3.0-cp310-cp310-linux_x86_64.whl - ```bash +## CentOS +yum install -y numactl +## Ubuntu +apt install numactl + pip3 install -r requirements.txt -pip3 install mmcv-2.1.0+corex.4.3.0-cp310-cp310-linux_x86_64.whl ``` ### Model Conversion ```bash # install yolox -git clone https://github.com/Megvii-BaseDetection/YOLOX.git +git clone https://github.com/Megvii-BaseDetection/YOLOX.git --depth=1 cd YOLOX python3 setup.py install # export onnx model python3 tools/export_onnx.py --output-name ../yolox.onnx -n yolox-m -c yolox_m.pth --batch-size 32 -cd .. ``` ## Model Inference ```bash -# Set DATASETS_DIR -export DATASETS_DIR=/Path/to/coco/ - -# Build plugin on ILUVATAR env -cd plugin && mkdir build && cd build -cmake .. -DIXRT_HOME=/usr/local/corex -make -j12 -cd ../.. - -# Build plugin on NVIDIA env -cd plugin && mkdir build && cd build -cmake .. 
-DUSE_TRT=1 -make -j12 +export DATASETS_DIR=./coco/ ``` ### FP16 diff --git a/models/cv/object_detection/yolox/ixrt/python/build_engine.py b/models/cv/object_detection/yolox/ixrt/build_engine.py similarity index 100% rename from models/cv/object_detection/yolox/ixrt/python/build_engine.py rename to models/cv/object_detection/yolox/ixrt/build_engine.py diff --git a/models/cv/object_detection/yolox/ixrt/python/build_engine_by_write_qparams.py b/models/cv/object_detection/yolox/ixrt/build_engine_by_write_qparams.py similarity index 100% rename from models/cv/object_detection/yolox/ixrt/python/build_engine_by_write_qparams.py rename to models/cv/object_detection/yolox/ixrt/build_engine_by_write_qparams.py diff --git a/models/cv/object_detection/yolox/ixrt/python/calibration_dataset.py b/models/cv/object_detection/yolox/ixrt/calibration_dataset.py similarity index 100% rename from models/cv/object_detection/yolox/ixrt/python/calibration_dataset.py rename to models/cv/object_detection/yolox/ixrt/calibration_dataset.py diff --git a/models/cv/object_detection/yolox/ixrt/ci/prepare.sh b/models/cv/object_detection/yolox/ixrt/ci/prepare.sh index fb99838a..a69ef069 100644 --- a/models/cv/object_detection/yolox/ixrt/ci/prepare.sh +++ b/models/cv/object_detection/yolox/ixrt/ci/prepare.sh @@ -18,9 +18,9 @@ set -x ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') if [[ ${ID} == "ubuntu" ]]; then - apt install -y libgl1-mesa-glx + apt install -y numactl elif [[ ${ID} == "centos" ]]; then - yum install -y mesa-libGL + yum install -y numactl else echo "Not Support Os" fi @@ -28,11 +28,4 @@ fi pip install -r requirements.txt unzip -q /root/data/repos/yolox-f00a798c8bf59f43ab557a2f3d566afa831c8887.zip -d ./ ln -s /root/data/checkpoints/yolox_m.pth ./YOLOX/ -# install ixrt run -bash /root/data/install/ixrt-1.0.0.alpha+corex.4.3.0-linux_x86_64.run -cd YOLOX && python3 setup.py develop && python3 tools/export_onnx.py --output-name ../yolox.onnx -n yolox-m -c yolox_m.pth --batch-size 32 -if [ "$1" = "nvidia" ]; then - cd ../plugin && mkdir -p build && cd build && cmake .. -DUSE_TRT=1 && make -j12 -else - cd ../plugin && mkdir -p build && cd build && cmake .. 
-DIXRT_HOME=/usr/local/corex && make -j12 -fi \ No newline at end of file +cd YOLOX && python3 setup.py develop && python3 tools/export_onnx.py --output-name ../yolox.onnx -n yolox-m -c yolox_m.pth --batch-size 32 \ No newline at end of file diff --git a/models/cv/object_detection/yolox/ixrt/python/cut_model.py b/models/cv/object_detection/yolox/ixrt/cut_model.py similarity index 100% rename from models/cv/object_detection/yolox/ixrt/python/cut_model.py rename to models/cv/object_detection/yolox/ixrt/cut_model.py diff --git a/models/cv/object_detection/yolox/ixrt/python/datasets/__init__.py b/models/cv/object_detection/yolox/ixrt/datasets/__init__.py similarity index 100% rename from models/cv/object_detection/yolox/ixrt/python/datasets/__init__.py rename to models/cv/object_detection/yolox/ixrt/datasets/__init__.py diff --git a/models/cv/object_detection/yolox/ixrt/python/datasets/coco.py b/models/cv/object_detection/yolox/ixrt/datasets/coco.py similarity index 100% rename from models/cv/object_detection/yolox/ixrt/python/datasets/coco.py rename to models/cv/object_detection/yolox/ixrt/datasets/coco.py diff --git a/models/cv/object_detection/yolox/ixrt/python/datasets/common.py b/models/cv/object_detection/yolox/ixrt/datasets/common.py similarity index 100% rename from models/cv/object_detection/yolox/ixrt/python/datasets/common.py rename to models/cv/object_detection/yolox/ixrt/datasets/common.py diff --git a/models/cv/object_detection/yolox/ixrt/python/datasets/post_process.py b/models/cv/object_detection/yolox/ixrt/datasets/post_process.py similarity index 100% rename from models/cv/object_detection/yolox/ixrt/python/datasets/post_process.py rename to models/cv/object_detection/yolox/ixrt/datasets/post_process.py diff --git a/models/cv/object_detection/yolox/ixrt/python/datasets/pre_process.py b/models/cv/object_detection/yolox/ixrt/datasets/pre_process.py similarity index 100% rename from models/cv/object_detection/yolox/ixrt/python/datasets/pre_process.py rename to models/cv/object_detection/yolox/ixrt/datasets/pre_process.py diff --git a/models/cv/object_detection/yolox/ixrt/python/datasets/vision.py b/models/cv/object_detection/yolox/ixrt/datasets/vision.py similarity index 100% rename from models/cv/object_detection/yolox/ixrt/python/datasets/vision.py rename to models/cv/object_detection/yolox/ixrt/datasets/vision.py diff --git a/models/cv/object_detection/yolox/ixrt/python/deploy.py b/models/cv/object_detection/yolox/ixrt/deploy.py similarity index 100% rename from models/cv/object_detection/yolox/ixrt/python/deploy.py rename to models/cv/object_detection/yolox/ixrt/deploy.py diff --git a/models/cv/object_detection/yolox/ixrt/python/inference.py b/models/cv/object_detection/yolox/ixrt/inference.py similarity index 100% rename from models/cv/object_detection/yolox/ixrt/python/inference.py rename to models/cv/object_detection/yolox/ixrt/inference.py diff --git a/models/cv/object_detection/yolox/ixrt/python/load_ixrt_plugin.py b/models/cv/object_detection/yolox/ixrt/load_ixrt_plugin.py similarity index 93% rename from models/cv/object_detection/yolox/ixrt/python/load_ixrt_plugin.py rename to models/cv/object_detection/yolox/ixrt/load_ixrt_plugin.py index 34b6e018..eb221afd 100644 --- a/models/cv/object_detection/yolox/ixrt/python/load_ixrt_plugin.py +++ b/models/cv/object_detection/yolox/ixrt/load_ixrt_plugin.py @@ -17,7 +17,7 @@ import tensorrt from os.path import join, dirname, exists def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", 
dynamic_path=""): if not dynamic_path: - dynamic_path = join("./plugin/build/", "libixrt_plugin.so") + dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so") if not exists(dynamic_path): raise FileNotFoundError( f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!") diff --git a/models/cv/object_detection/yolox/ixrt/plugin/CMakeLists.txt b/models/cv/object_detection/yolox/ixrt/plugin/CMakeLists.txt deleted file mode 100644 index 91cbfc56..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/CMakeLists.txt +++ /dev/null @@ -1,61 +0,0 @@ -cmake_minimum_required(VERSION 3.10) - -project(ixrt_plugin_unittest) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake;${CMAKE_MODULE_PATH}") -set(CMAKE_CXX_EXTENSIONS OFF) - -set(IXRT_HOME) - -set(TARGET_NAME ixrt_plugin) -set(SHARED_TARGET ${TARGET_NAME}) -set(STATIC_TARGET ${TARGET_NAME}_static) - -set(PLUGIN_REPO_PATH ${PROJECT_SOURCE_DIR}) -set(PLUGIN_SOURCES) - -if (USE_TRT) - # cuda - find_package(CUDA) - include_directories(/usr/local/cuda/include) - link_directories(/usr/local/cuda/lib64) - # tensorrt - include_directories(/usr/include/x86_64-linux-gnu/) - link_directories(/usr/lib/x86_64-linux-gnu) - include_directories(/usr/local/TensorRT/include) - link_directories(/usr/local/TensorRT/lib) -else() - include(FindIxrt) - include(FindCompiler) - include(FindCuda) -endif () - -include(FindPluginFiles) -list(APPEND PLUGIN_FILES ${PLUGIN_SOURCES}) - -if (USE_TRT) - include_directories( - ${CUDA_PATH}/include) -else() - include_directories(${IXRT_INCLUDE_DIR} - ${CUDA_PATH}/include) - ################################## Compile Options ###################################### - # For cuda files - string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_PATH}") - string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=${CUDA_ARCH}") -endif () - -cuda_add_library(${SHARED_TARGET} SHARED - ${PLUGIN_FILES} -) - -if (USE_TRT) - target_link_libraries(${SHARED_TARGET} cublasLt cudart nvinfer) -else() - target_link_libraries(${SHARED_TARGET} PUBLIC cublasLt cudart ixrt) -endif () - -target_link_directories(${SHARED_TARGET} PUBLIC ${IXRT_LIB_DIR}) -target_include_directories(${SHARED_TARGET} PUBLIC src PUBLIC src/common PUBLIC src/common/kernels) -# add_subdirectory(unit_test) diff --git a/models/cv/object_detection/yolox/ixrt/plugin/cmake/FindCompiler.cmake b/models/cv/object_detection/yolox/ixrt/plugin/cmake/FindCompiler.cmake deleted file mode 100644 index 07c436f5..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/cmake/FindCompiler.cmake +++ /dev/null @@ -1,15 +0,0 @@ -if(NOT COMPILER_PATH) - if (EXISTS /opt/sw_home/local/bin/clang++) - set(COMPILER_PATH /opt/sw_home/local/bin) - elseif (EXISTS /usr/local/corex/bin/clang++) - set(COMPILER_PATH /usr/local/corex/bin) - else() - message(STATUS "COMPILER_PATH is not set and we couldn't find clang compiler neither, will use system C/C++ compiler") - endif() -endif() -if (COMPILER_PATH) - set(CMAKE_CXX_COMPILER ${COMPILER_PATH}/clang++) - set(CMAKE_C_COMPILER ${COMPILER_PATH}/clang) -endif() - -message(STATUS "Use ${CMAKE_CXX_COMPILER} and ${CMAKE_C_COMPILER} as C++ and C compiler") \ No newline at end of file diff --git a/models/cv/object_detection/yolox/ixrt/plugin/cmake/FindCuda.cmake b/models/cv/object_detection/yolox/ixrt/plugin/cmake/FindCuda.cmake deleted file mode 100644 index e8aa67dc..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/cmake/FindCuda.cmake +++ /dev/null @@ -1,57 +0,0 @@ -# This 
cmake does: -# - Set CUDA_PATH -# - Find libcudart -# - Util functions like cuda_add_library, cuda_add_executable - - -# CUDA_PATH can be specified through below means shown in priority order 1. -# cmake command line argument, -DCUDA_PATH=/path/to/cuda 2. bash environment -# variable, export CUDA_PATH=/path/to/cuda -if(DEFINED ENV{CUDA_PATH}) - set(CUDA_PATH "$ENV{CUDA_PATH}") -else() - set(CUDA_PATH - "/usr/local/corex" - CACHE PATH "cuda installation root path") -endif() -message(STATUS "Use CUDA_PATH=${CUDA_PATH} ") - -# GPU arch -if(NOT "${CUDA_ARCH}" STREQUAL "") - set(CUDA_ARCH - ${CUDA_ARCH} - CACHE STRING "GPU architecture tag, ivcore11") -else("${CUDA_ARCH}" STREQUAL "") - set(CUDA_ARCH - "ivcore11" - CACHE STRING "GPU architecture tag, ivcore11") -endif() -message(STATUS "Use CUDA_ARCH=${CUDA_ARCH}") - -macro(cuda_add_executable) - foreach(File ${ARGN}) - if(${File} MATCHES ".*\.cu$") - set_source_files_properties(${File} PROPERTIES LANGUAGE CXX) - endif() - endforeach() - add_executable(${ARGV}) -endmacro() - -macro(cuda_add_library) - foreach(File ${ARGN}) - if(${File} MATCHES ".*\.cu$") - set_source_files_properties(${File} PROPERTIES LANGUAGE CXX) - endif() - endforeach() - add_library(${ARGV}) -endmacro() - -find_library( - CUDART_LIBRARY cudart - PATHS ${CUDA_PATH} - PATH_SUFFIXES lib/x64 lib64 lib - NO_DEFAULT_PATH) - -if (NOT USE_TRT) - set(CUDA_LIBRARIES cudart) -endif() \ No newline at end of file diff --git a/models/cv/object_detection/yolox/ixrt/plugin/cmake/FindIxrt.cmake b/models/cv/object_detection/yolox/ixrt/plugin/cmake/FindIxrt.cmake deleted file mode 100644 index f80c45d9..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/cmake/FindIxrt.cmake +++ /dev/null @@ -1,20 +0,0 @@ -# This cmake file decides how to build with IxRT -# Custom IxRT Path -if(NOT "${IXRT_HOME}" STREQUAL "") - set(IXRT_INCLUDE_DIR ${IXRT_HOME}/include) - set(IXRT_LIB_DIR ${IXRT_HOME}/lib) - include_directories("${IXRT_HOME}/lib/python3/dist-packages/tensorrt/include") -# From default paths -else() - set(IXRT_INCLUDE_DIR /usr/local/corex/include) - set(IXRT_LIB_DIR /usr/local/corex/lib) -endif() - -message(STATUS "IXRT_INCLUDE_DIR: ${IXRT_INCLUDE_DIR}") -message(STATUS "IXRT_LIB_DIR: ${IXRT_LIB_DIR}") - -if(EXISTS ${IXRT_INCLUDE_DIR} AND EXISTS ${IXRT_LIB_DIR}) - include_directories(${IXRT_INCLUDE_DIR}) -else() - message( FATAL_ERROR "IxRT library doesn't exist!") -endif() diff --git a/models/cv/object_detection/yolox/ixrt/plugin/cmake/FindPluginFiles.cmake b/models/cv/object_detection/yolox/ixrt/plugin/cmake/FindPluginFiles.cmake deleted file mode 100644 index c0f71de9..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/cmake/FindPluginFiles.cmake +++ /dev/null @@ -1,4 +0,0 @@ - -file(GLOB_RECURSE PLUGIN_FILES ${PLUGIN_REPO_PATH}/src/*.cpp - ${PLUGIN_REPO_PATH}/src/*.cc - ${PLUGIN_REPO_PATH}/src/*.cu) \ No newline at end of file diff --git a/models/cv/object_detection/yolox/ixrt/plugin/src/common/checkMacrosPlugin.cpp b/models/cv/object_detection/yolox/ixrt/plugin/src/common/checkMacrosPlugin.cpp deleted file mode 100644 index 3147760d..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/src/common/checkMacrosPlugin.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. 
You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#include "checkMacrosPlugin.h" - -#include "NvInferRuntimeCommon.h" - -namespace nvinfer1 { -namespace plugin { - -ILogger* gLogger{}; - -template -int32_t LogStream::Buf::sync() { - std::string s = str(); - while (!s.empty() && s.back() == '\n') { - s.pop_back(); - } - if (gLogger != nullptr) { - gLogger->log(kSeverity, s.c_str()); - } - str(""); - return 0; -} - -// These use gLogger, and therefore require initLibNvInferPlugins() to be called with a logger -// (otherwise, it will not log) -LogStream gLogError; -LogStream gLogWarning; -LogStream gLogInfo; -LogStream gLogVerbose; - -} // namespace plugin -} // namespace nvinfer1 diff --git a/models/cv/object_detection/yolox/ixrt/plugin/src/common/checkMacrosPlugin.h b/models/cv/object_detection/yolox/ixrt/plugin/src/common/checkMacrosPlugin.h deleted file mode 100644 index c5448abd..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/src/common/checkMacrosPlugin.h +++ /dev/null @@ -1,205 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#pragma once -#include - -#include -#include -#include -#include - -#include "NvInfer.h" -#include "NvInferRuntime.h" - -// Logs failed assertion and aborts. -// Aborting is undesirable and will be phased-out from the plugin module, at which point -// PLUGIN_ASSERT will perform the same function as PLUGIN_VALIDATE. 
-using namespace std; - -namespace nvinfer1 { -namespace plugin { - -#ifdef _MSC_VER -#define FN_NAME __FUNCTION__ -#else -#define FN_NAME __func__ -#endif - -#define IXRT_PLUGIN_CHECK_VALUE(value, msg) \ - { \ - if (not(value)) { \ - std::cerr << __FILE__ << " (" << __LINE__ << ")" \ - << "-" << __FUNCTION__ << " : " \ - << " Plugin assert error: " << msg << std::endl; \ - std::exit(EXIT_FAILURE); \ - } \ - } - -#define IXRT_PLUGIN_ASSERT(value) \ - { \ - if (not(value)) { \ - std::cerr << __FILE__ << " (" << __LINE__ << ")" \ - << "-" << __FUNCTION__ << " : " \ - << " Plugin assert false" << std::endl; \ - std::exit(EXIT_FAILURE); \ - } \ - } - -#define IXRT_PLUGIN_CHECK_CUDA(call) \ - do { \ - const cudaError_t error_code = call; \ - if (error_code != cudaSuccess) { \ - printf("CUDA Error:\n"); \ - printf(" File: %s\n", __FILE__); \ - printf(" Line: %d\n", __LINE__); \ - printf(" Error code: %d\n", error_code); \ - printf(" Error text: %s\n", cudaGetErrorString(error_code)); \ - exit(1); \ - } \ - } while (0) - -inline void caughtError(const std::exception& e) { std::cerr << e.what() << std::endl; } - -#define IXRT_PLUGIN_FAIL(msg) \ - do { \ - std::ostringstream stream; \ - stream << "Assertion failed: " << msg << "\n" \ - << __FILE__ << ':' << __LINE__ << "\n" \ - << "Aborting..." \ - << "\n"; \ - IXRT_PLUGIN_CHECK_CUDA(cudaDeviceReset()); \ - abort; \ - } while (0) - -inline void throwCudaError(char const* file, char const* function, int32_t line, int32_t status, char const* msg) { - std::cerr << file << " (" << line << ")" - << "-" << function << " : " << msg << std::endl; - std::exit(EXIT_FAILURE); -} - -#define IXRT_PLUGIN_CUASSERT(status_) \ - { \ - auto s_ = status_; \ - if (s_ != cudaSuccess) { \ - const char* msg = cudaGetErrorString(s_); \ - throwCudaError(__FILE__, FN_NAME, __LINE__, s_, msg); \ - } \ - } - -#undef CUINFER_CHECK -#define CUINFER_CHECK(func) \ - do { \ - cuinferStatus_t status = (func); \ - if (status != CUINFER_STATUS_SUCCESS) { \ - std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \ - << cuinferGetErrorString(status) << std::endl; \ - std::exit(EXIT_FAILURE); \ - } \ - } while (0) - -static std::string _cudaGetErrorString(cublasStatus_t error) { - switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; - - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; - - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; - } - return "CUBLAS_UNKNOW"; -} - -template -void check_gpu_error(T result, char const* const func, const char* const file, int const line) { - if (result) { - throw std::runtime_error(std::string("[CUDA][ERROR] ") + +file + "(" + std::to_string(line) + - "): " + (_cudaGetErrorString(result)) + "\n"); - } -} - -#define CHECK_GPU_ERROR(val) check_gpu_error((val), #val, __FILE__, __LINE__) - -template -class LogStream : public std::ostream { - class Buf : public std::stringbuf { - public: - int32_t sync() 
override; - }; - - Buf buffer; - std::mutex mLogStreamMutex; - - public: - std::mutex& getMutex() { return mLogStreamMutex; } - LogStream() : std::ostream(&buffer){}; -}; - -// Use mutex to protect multi-stream write to buffer -template -LogStream& operator<<(LogStream& stream, T const& msg) { - std::lock_guard guard(stream.getMutex()); - auto& os = static_cast(stream); - os << msg; - return stream; -} - -// Special handling static numbers -template -inline LogStream& operator<<(LogStream& stream, int32_t num) { - std::lock_guard guard(stream.getMutex()); - auto& os = static_cast(stream); - os << num; - return stream; -} - -// Special handling std::endl -template -inline LogStream& operator<<(LogStream& stream, std::ostream& (*f)(std::ostream&)) { - std::lock_guard guard(stream.getMutex()); - auto& os = static_cast(stream); - os << f; - return stream; -} - -extern LogStream gLogError; -extern LogStream gLogWarning; -extern LogStream gLogInfo; -extern LogStream gLogVerbose; -} // namespace plugin -} // namespace nvinfer1 diff --git a/models/cv/object_detection/yolox/ixrt/plugin/src/common/common_def.cuh b/models/cv/object_detection/yolox/ixrt/plugin/src/common/common_def.cuh deleted file mode 100644 index 21c04694..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/src/common/common_def.cuh +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/ -#pragma once - -#include - -#include -namespace nvinfer1::plugin { -#ifdef __ILUVATAR__ -static const int kMaxThreadNbPerBlock = 1024; -static const int kMaxBlockNbPerSM = 8; -static const int kWarpSize = 64; -static const dim3 kMaxBlockDimension = {4096, 4096, 64}; -static const dim3 kMaxGridDimension = {4294967295, 65536, 65536}; -static const int kNbThreadsPerBlockGainBestPerformance = 1024; -static const int kMaxSharedMemSizePerBlock = (128 * 1024 * 4); -static const int kNbSmemLane = 64; -static const int kNbBytesPerSmemLane = 4; -#else -static const int kMaxThreadNbPerBlock = 1024; -static const int kMaxBlockNbPerSM = 8; -static const int kWarpSize = 32; -static const dim3 kMaxBlockDimension = {1024, 1024, 64}; -static const dim3 kMaxGridDimension = {2147483647, 65535, 65535}; -static const int kNbThreadsPerBlockGainBestPerformance = 256; -static const int kMaxSharedMemSizePerBlock = 48 * 1024 * 4; -static const int kNbSmemLane = 32; -static const int kNbBytesPerSmemLane = 4; -#endif - -static const int kNbCe = 4; -static const int kNbCuPerCe = 4; -static const int kNbSppPerCu = 4; - -static const float kLog2e = 1.442695040888963387; - -#define DivUp(x, y) (((x) + (y)-1) / (y)) - -__device__ __forceinline__ float floatExp(float x) { return __builtin_exp2f(kLog2e * x); } - -__device__ __forceinline__ float floatLog(float x) { return __logf(x); } - -__forceinline__ int nearest_num(int x, int value) { - if (x % value == 0) { - return x; - } else { - int padding = value - x % value; - return x + padding; - } -} -} // namespace nvinfer1::plugin diff --git a/models/cv/object_detection/yolox/ixrt/plugin/src/common/kernels/cuda_helper.cuh b/models/cv/object_detection/yolox/ixrt/plugin/src/common/kernels/cuda_helper.cuh deleted file mode 100644 index c85bdfac..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/src/common/kernels/cuda_helper.cuh +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#define DEVICE_FUNC __device__ __forceinline__ -namespace nvinfer1::plugin { -constexpr float LOG2E = 1.442695040888963387; - -DEVICE_FUNC float _exp(float x) { return __builtin_exp2f(LOG2E * x); } -DEVICE_FUNC float dequantize(int8_t x, float scale) { return scale * static_cast(x); } -DEVICE_FUNC float sigmoid(float x) { return 1/((1.f + _exp(0.f - x))); } -} // namespace nvinfer1::plugin \ No newline at end of file diff --git a/models/cv/object_detection/yolox/ixrt/plugin/src/common/plugin.cpp b/models/cv/object_detection/yolox/ixrt/plugin/src/common/plugin.cpp deleted file mode 100644 index 0085b94b..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/src/common/plugin.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. 
You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#include "plugin.h" -#include "checkMacrosPlugin.h" - -namespace nvinfer1 -{ -namespace plugin -{ - -void validateRequiredAttributesExist(std::set requiredFieldNames, PluginFieldCollection const* fc) -{ - for (int32_t i = 0; i < fc->nbFields; i++) - { - requiredFieldNames.erase(fc->fields[i].name); - } - if (!requiredFieldNames.empty()) - { - std::stringstream msg{}; - msg << "PluginFieldCollection missing required fields: {"; - char const* separator = ""; - for (auto const& field : requiredFieldNames) - { - msg << separator << field; - separator = ", "; - } - msg << "}"; - std::string msg_str = msg.str(); - IXRT_PLUGIN_CHECK_VALUE(false, msg_str.c_str()); - } -} - -} // namespace plugin -} // namespace nvinfer1 \ No newline at end of file diff --git a/models/cv/object_detection/yolox/ixrt/plugin/src/common/plugin.h b/models/cv/object_detection/yolox/ixrt/plugin/src/common/plugin.h deleted file mode 100644 index 110da352..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/src/common/plugin.h +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#pragma once - -#include -#include -#include -#include -#include -#include -#include "NvInferRuntimeCommon.h" - -typedef enum -{ - STATUS_SUCCESS = 0, - STATUS_FAILURE = 1, - STATUS_BAD_PARAM = 2, - STATUS_NOT_SUPPORTED = 3, - STATUS_NOT_INITIALIZED = 4 -} pluginStatus_t; - -namespace nvinfer1 { - -namespace plugin { - - -// Write values into buffer -template -void write(char*& buffer, const T& val) { - std::memcpy(buffer, &val, sizeof(T)); - buffer += sizeof(T); -} - -// Read values from buffer -template -T read(const char*& buffer) { - T val{}; - std::memcpy(&val, buffer, sizeof(T)); - buffer += sizeof(T); - return val; -} - -void validateRequiredAttributesExist(std::set requiredFieldNames, PluginFieldCollection const* fc); - -} // namespace plugin - -} // namespace nvinfer1 diff --git a/models/cv/object_detection/yolox/ixrt/plugin/src/common/serialize.h b/models/cv/object_detection/yolox/ixrt/plugin/src/common/serialize.h deleted file mode 100644 index a2ac72d7..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/src/common/serialize.h +++ /dev/null @@ -1,132 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. 
You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#pragma once - -#include -#include -#include -#include - -#include -using std::cerr; -using std::cout; -using std::endl; - -template -inline void serialize_value(void** buffer, T const& value); - -template -inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value); - -namespace -{ - -template -struct Serializer -{ -}; - -template -struct Serializer::value || std::is_enum::value || std::is_pod::value>::type> -{ - static size_t serialized_size(T const&) - { - return sizeof(T); - } - static void serialize(void** buffer, T const& value) - { - ::memcpy(*buffer, &value, sizeof(T)); - reinterpret_cast(*buffer) += sizeof(T); - } - static void deserialize(void const** buffer, size_t* buffer_size, T* value) - { - assert(*buffer_size >= sizeof(T)); - ::memcpy(value, *buffer, sizeof(T)); - reinterpret_cast(*buffer) += sizeof(T); - *buffer_size -= sizeof(T); - } -}; - -template <> -struct Serializer -{ - static size_t serialized_size(const char* value) - { - return strlen(value) + 1; - } - static void serialize(void** buffer, const char* value) - { - ::strcpy(static_cast(*buffer), value); - reinterpret_cast(*buffer) += strlen(value) + 1; - } - static void deserialize(void const** buffer, size_t* buffer_size, const char** value) - { - *value = static_cast(*buffer); - size_t data_size = strnlen(*value, *buffer_size) + 1; - assert(*buffer_size >= data_size); - reinterpret_cast(*buffer) += data_size; - *buffer_size -= data_size; - } -}; - -template -struct Serializer, - typename std::enable_if::value || std::is_enum::value || std::is_pod::value>::type> -{ - static size_t serialized_size(std::vector const& value) - { - return sizeof(value.size()) + value.size() * sizeof(T); - } - static void serialize(void** buffer, std::vector const& value) - { - serialize_value(buffer, value.size()); - size_t nbyte = value.size() * sizeof(T); - ::memcpy(*buffer, value.data(), nbyte); - reinterpret_cast(*buffer) += nbyte; - } - static void deserialize(void const** buffer, size_t* buffer_size, std::vector* value) - { - size_t size; - deserialize_value(buffer, buffer_size, &size); - value->resize(size); - size_t nbyte = value->size() * sizeof(T); - assert(*buffer_size >= nbyte); - ::memcpy(value->data(), *buffer, nbyte); - reinterpret_cast(*buffer) += nbyte; - *buffer_size -= nbyte; - } -}; - -} // namespace - -template -inline size_t serialized_size(T const& value) -{ - return Serializer::serialized_size(value); -} - -template -inline void serialize_value(void** buffer, T const& value) -{ - return Serializer::serialize(buffer, value); -} - -template -inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value) -{ - return Serializer::deserialize(buffer, buffer_size, value); -} \ No newline at end of file diff --git a/models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderKernel.cu b/models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderKernel.cu deleted file mode 100644 index a367f146..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderKernel.cu +++ /dev/null @@ -1,59 +0,0 @@ -/* 
Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#include "NvInfer.h" -#include "NvInferRuntime.h" -#include "NvInferRuntimeCommon.h" -#include "checkMacrosPlugin.h" -#include "common_def.cuh" -#include "cuda_fp16.h" -#include "yoloxDecoderKernel.h" -#include "yolox_decoder.cuh" - -namespace nvinfer1::plugin { -int32_t YoloxDecoderInference(cudaStream_t stream, void const *box_data, void const *conf_data, void const *class_data, - void *output_data, const float box_quant_factor, const float conf_quant_factor, - const float class_quant_factor, const int batch_size, const int input_h, - const int input_w, const int input_channel_0, const int input_channel_1, - const int input_channel_2, const int stride, const int num_class, const int faster_impl, - nvinfer1::DataType type) { - uint32_t kThreadPerBlock = 1024; - int32_t total_boxes = batch_size * input_h * input_w; - int32_t grid_ = (total_boxes + kThreadPerBlock - 1) / kThreadPerBlock; - int32_t block_ = kThreadPerBlock; - - switch (type) { - case DataType::kHALF: { - YOLOX_Decode_NHWC_FP16<<>>( - input_channel_0, input_channel_1, input_channel_2, batch_size, input_h, input_w, stride, num_class, - reinterpret_cast(box_data), reinterpret_cast(conf_data), - reinterpret_cast(class_data), reinterpret_cast<__half *>(output_data)); - break; - } - case DataType::kINT8: { - YOLOX_Decode_NHWC_INT8<<>>( - box_quant_factor, conf_quant_factor, class_quant_factor, input_channel_0, input_channel_1, - input_channel_2, batch_size, input_h, input_w, stride, num_class, - reinterpret_cast(box_data), reinterpret_cast(conf_data), - reinterpret_cast(class_data), reinterpret_cast<__half *>(output_data)); - break; - } - default: - IXRT_PLUGIN_FAIL("YoloxDecoderPlugin Unsupported datatype"); - } - - return 0; -} -} // namespace nvinfer1::plugin diff --git a/models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderKernel.h b/models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderKernel.h deleted file mode 100644 index 8f6cc30a..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderKernel.h +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/ -#pragma once -#include -#include - -#include "NvInfer.h" - -namespace nvinfer1::plugin { -int32_t YoloxDecoderInference(cudaStream_t stream, void const *box_data, void const *conf_data, void const *class_data, - void *output_data, const float box_quant_factor, const float conf_quant_factor, - const float class_quant_factor, const int batch_size, const int input_h, - const int input_w, const int input_channel_0, const int input_channel_1, - const int input_channel_2, const int stride, const int num_class, const int faster_impl, - nvinfer1::DataType type); -} diff --git a/models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderPlugin.cc b/models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderPlugin.cc deleted file mode 100644 index c1f9d1c9..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderPlugin.cc +++ /dev/null @@ -1,258 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#include "yoloxDecoderPlugin.h" - -#include "checkMacrosPlugin.h" -#include "plugin.h" -#include "serialize.h" -#include "yoloxDecoderKernel.h" - -using namespace nvinfer1; -using namespace nvinfer1::plugin; - -namespace { -char const *kYoloxDecoderPluginVersion{"1"}; -char const *kYoloxDecoderPluginName{"YoloXDecoder"}; -} // namespace - -PluginFieldCollection YoloxDecodePluginCreator::mFC{}; -std::vector YoloxDecodePluginCreator::mPluginAttributes; - -YoloxDecoderPlugin::YoloxDecoderPlugin(int32_t num_class, int32_t stride, int32_t faster_impl) - : nb_classes_(num_class), stride_(stride), faster_impl_(faster_impl) {} - -int32_t YoloxDecoderPlugin::getNbOutputs() const noexcept { return 1; } - -int32_t YoloxDecoderPlugin::initialize() noexcept { return 0; } - -void YoloxDecoderPlugin::terminate() noexcept {} - -void YoloxDecoderPlugin::destroy() noexcept { delete this; } - -size_t YoloxDecoderPlugin::getWorkspaceSize(PluginTensorDesc const *inputs, int32_t nbInputs, - PluginTensorDesc const *outputs, int32_t nbOutputs) const noexcept { - return 0; -} - -bool YoloxDecoderPlugin::supportsFormatCombination(int32_t pos, PluginTensorDesc const *inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(inOut != nullptr); - IXRT_PLUGIN_ASSERT(pos < 4); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - - bool condition = true; - switch (pos) { - case 0: { - condition &= (inOut[pos].type == DataType::kINT8 || inOut[pos].type == DataType::kHALF); - condition &= inOut[pos].format == TensorFormat::kLINEAR; - break; - } - case 1: { - condition &= (inOut[pos].type == DataType::kINT8 || inOut[pos].type == DataType::kHALF); - condition &= inOut[pos].format == TensorFormat::kLINEAR; - condition &= (inOut[0].type == inOut[1].type); - break; - } - case 2: { - condition &= (inOut[pos].type == DataType::kINT8 || inOut[pos].type == DataType::kHALF); - condition &= inOut[pos].format == TensorFormat::kLINEAR; - 
condition &= (inOut[0].type == inOut[2].type); - break; - } - case 3: { - condition &= inOut[pos].type == DataType::kHALF; - condition &= inOut[pos].format == TensorFormat::kLINEAR; - break; - } - default: { - IXRT_PLUGIN_ASSERT(false); - } - } - return condition; -} - -char const *YoloxDecoderPlugin::getPluginType() const noexcept { return kYoloxDecoderPluginName; } - -char const *YoloxDecoderPlugin::getPluginVersion() const noexcept { return kYoloxDecoderPluginVersion; } - -IPluginV2DynamicExt *YoloxDecoderPlugin::clone() const noexcept { - try { - auto plugin = new YoloxDecoderPlugin(*this); - plugin->setPluginNamespace(mNameSpace.c_str()); - return plugin; - } catch (std::exception const &e) { - caughtError(e); - } - return nullptr; -} - -void YoloxDecoderPlugin::setPluginNamespace(char const *libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNameSpace = libNamespace; - } catch (std::exception const &e) { - caughtError(e); - } -} - -char const *YoloxDecoderPlugin::getPluginNamespace() const noexcept { return mNameSpace.c_str(); } - -DimsExprs YoloxDecoderPlugin::getOutputDimensions(int32_t outputIndex, DimsExprs const *inputs, int32_t nbInputs, - IExprBuilder &exprBuilder) noexcept { - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - IXRT_PLUGIN_ASSERT(outputIndex == 0); // there is only one output - DimsExprs result; - result.nbDims = 3; - - // n - result.d[0] = inputs[0].d[0]; - // H*W*anchor_number - result.d[1] = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[2], *inputs[0].d[3]); - // box info - result.d[2] = exprBuilder.constant(6); - return result; -} - -int32_t YoloxDecoderPlugin::enqueue(PluginTensorDesc const *inputDesc, PluginTensorDesc const *outputDesc, - void const *const *inputs, void *const *outputs, void *workspace, - cudaStream_t stream) noexcept { - IXRT_PLUGIN_ASSERT(inputDesc != nullptr); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - IXRT_PLUGIN_ASSERT(outputDesc != nullptr); - - auto type = inputDesc[0].type; - float box_scale = 1.f, conf_scale = 1.f, class_scale = 1.f; - if (type == DataType::kINT8) { - box_scale = 1.f / inputDesc[0].scale; - conf_scale = 1.f / inputDesc[1].scale; - class_scale = 1.f / inputDesc[2].scale; - } - - int N = inputDesc[0].dims.d[0], H = inputDesc[0].dims.d[1], W = inputDesc[0].dims.d[2], - box_channel = inputDesc[0].dims.d[3], conf_channel = inputDesc[1].dims.d[3], - class_channel = inputDesc[2].dims.d[3]; - return YoloxDecoderInference(stream, inputs[0], inputs[1], inputs[2], outputs[0], box_scale, conf_scale, - class_scale, N, H, W, box_channel, conf_channel, class_channel, stride_, nb_classes_, - faster_impl_, type); -} - -size_t YoloxDecoderPlugin::getSerializationSize() const noexcept { - // Note:serialize_value and deserialize_value save/load vector as: vector size - // + vector data, - // remember to count the space of the size itself as well. 
- return /*num_class*/ sizeof(int32_t) + /*stride*/ sizeof(int32_t) + - /*faster_impl*/ sizeof(int32_t); -} - -void YoloxDecoderPlugin::serialize(void *buffer) const noexcept { - IXRT_PLUGIN_ASSERT(buffer != nullptr); - serialize_value(&buffer, nb_classes_); - serialize_value(&buffer, stride_); - serialize_value(&buffer, faster_impl_); -} - -YoloxDecoderPlugin::YoloxDecoderPlugin(void const *data, size_t length) { - deserialize_value(&data, &length, &nb_classes_); - deserialize_value(&data, &length, &stride_); - deserialize_value(&data, &length, &faster_impl_); -} - -DataType YoloxDecoderPlugin::getOutputDataType(int32_t index, DataType const *inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - IXRT_PLUGIN_ASSERT(index == 0); - return DataType::kHALF; -} - -void YoloxDecoderPlugin::configurePlugin(DynamicPluginTensorDesc const *in, int32_t nbInputs, - DynamicPluginTensorDesc const *out, int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(in != nullptr); - IXRT_PLUGIN_ASSERT(out != nullptr); -} - -YoloxDecodePluginCreator::YoloxDecodePluginCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("num_class", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("stride", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("faster_impl", nullptr, PluginFieldType::kINT32, 1)); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const *YoloxDecodePluginCreator::getPluginName() const noexcept { return kYoloxDecoderPluginName; } - -char const *YoloxDecodePluginCreator::getPluginVersion() const noexcept { return kYoloxDecoderPluginVersion; } - -PluginFieldCollection const *YoloxDecodePluginCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2DynamicExt *YoloxDecodePluginCreator::createPlugin(char const *name, - PluginFieldCollection const *fc) noexcept { - try { - IXRT_PLUGIN_ASSERT(fc != nullptr); - IXRT_PLUGIN_ASSERT(fc->nbFields == 3); - PluginField const *fields = fc->fields; - - int32_t num_class, stride, faster_impl; - float *anchor = nullptr; - for (int32_t i = 0; i < fc->nbFields; ++i) { - char const *attrName = fields[i].name; - if (!strcmp(attrName, "num_class")) { - IXRT_PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); - num_class = *static_cast(const_cast((fields[i].data))); - } else if (!strcmp(attrName, "stride")) { - IXRT_PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); - stride = *static_cast(const_cast((fields[i].data))); - } else if (!strcmp(attrName, "faster_impl")) { - IXRT_PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32); - faster_impl = *static_cast(const_cast(fields[i].data)); - } - } - IPluginV2DynamicExt *plugin = new YoloxDecoderPlugin(num_class, stride, faster_impl); - plugin->setPluginNamespace(mNamespace.c_str()); - return plugin; - } catch (std::exception const &e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2DynamicExt *YoloxDecodePluginCreator::deserializePlugin(char const *name, void const *data, - size_t length) noexcept { - try { - IXRT_PLUGIN_ASSERT(data != nullptr); - return new YoloxDecoderPlugin(data, length); - } catch (std::exception const &e) { - caughtError(e); - } - return nullptr; -} - -void YoloxDecodePluginCreator::setPluginNamespace(char const *libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const &e) { - 
caughtError(e); - } -} - -char const *YoloxDecodePluginCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -REGISTER_TENSORRT_PLUGIN(YoloxDecodePluginCreator); diff --git a/models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderPlugin.h b/models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderPlugin.h deleted file mode 100644 index 5449b145..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yoloxDecoderPlugin.h +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#pragma once - -#include -#include - -#include -#include - -#include "NvInfer.h" -#include "NvInferRuntime.h" - -namespace nvinfer1::plugin { - -class YoloxDecoderPlugin : public IPluginV2DynamicExt { - public: - YoloxDecoderPlugin(int32_t num_class, int32_t stride, int32_t faster_impl); - YoloxDecoderPlugin(void const *data, size_t length); - YoloxDecoderPlugin() noexcept = delete; - ~YoloxDecoderPlugin() override = default; - - // IPluginV2 methods - char const *getPluginType() const noexcept override; - char const *getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void *buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const *libNamespace) noexcept override; - char const *getPluginNamespace() const noexcept override; - - // IPluginV2Ext methods - DataType getOutputDataType(int32_t index, DataType const *inputType, int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt methods - IPluginV2DynamicExt *clone() const noexcept override; - DimsExprs getOutputDimensions(int32_t outputIndex, DimsExprs const *inputs, int32_t nbInputs, - IExprBuilder &exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, PluginTensorDesc const *inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(DynamicPluginTensorDesc const *in, int32_t nbInputs, DynamicPluginTensorDesc const *out, - int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(PluginTensorDesc const *inputs, int32_t nbInputs, PluginTensorDesc const *outputs, - int32_t nbOutputs) const noexcept override; - int32_t enqueue(PluginTensorDesc const *inputDesc, PluginTensorDesc const *outputDesc, void const *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; - - private: - std::string mNameSpace{}; - // from attributes: - int32_t nb_classes_; - int32_t stride_; - int32_t faster_impl_; -}; - -class YoloxDecodePluginCreator : public IPluginCreator { - public: - YoloxDecodePluginCreator(); - - ~YoloxDecodePluginCreator() override = default; - - char const *getPluginName() const noexcept override; - - char const 
*getPluginVersion() const noexcept override; - - PluginFieldCollection const *getFieldNames() noexcept override; - - IPluginV2DynamicExt *createPlugin(char const *name, PluginFieldCollection const *fc) noexcept override; - - IPluginV2DynamicExt *deserializePlugin(char const *name, void const *serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const *pluginNamespace) noexcept override; - char const *getPluginNamespace() const noexcept override; - - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -} // namespace nvinfer1::plugin diff --git a/models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yolox_decoder.cuh b/models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yolox_decoder.cuh deleted file mode 100644 index 0bf110fb..00000000 --- a/models/cv/object_detection/yolox/ixrt/plugin/src/yolox_decoder/yolox_decoder.cuh +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#pragma once -#include "NvInfer.h" -#include "NvInferRuntime.h" -#include "NvInferRuntimeCommon.h" -#include "common_def.cuh" -#include "kernels/cuda_helper.cuh" - -namespace nvinfer1::plugin { -__global__ void YOLOX_Decode_NHWC_INT8(const float inp_scale1, // 4 - const float inp_scale2, // 1 - const float inp_scale3, // 80 - const int in_channel1, const int in_channel2, const int in_channel3, - const int N, // batch size - const int H, const int W, const int stride, const int nb_classes, - const int8_t *inp_data1, const int8_t *inp_data2, const int8_t *inp_data3, - __half *oup) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= N * H * W) return; - - const int h_idx = (tid % (H * W)) / W; // y - const int w_idx = (tid % (H * W)) % W; // x - - // pointer to a featuremp - float xywh0 = inp_data1[tid * in_channel1 + 0]; - float xywh1 = inp_data1[tid * in_channel1 + 1]; - float xywh2 = inp_data1[tid * in_channel1 + 2]; - float xywh3 = inp_data1[tid * in_channel1 + 3]; - float conf0 = inp_data2[tid * in_channel2 + 0]; - - const float cx = (dequantize(xywh0, inp_scale1) + w_idx) * stride; - const float cy = (dequantize(xywh1, inp_scale1) + h_idx) * stride; - const float w = exp(dequantize(xywh2, inp_scale1)) * stride; - const float h = exp(dequantize(xywh3, inp_scale1)) * stride; - const float conf = sigmoid(dequantize(conf0, inp_scale2)); - - float max_prob = sigmoid(dequantize(inp_data3[tid * in_channel3], inp_scale3)); - int class_id = 1; - // #pragma unroll - for (int i = 1; i < nb_classes; ++i) { - float tmp_prob = sigmoid(dequantize(inp_data3[tid * in_channel3 + i], inp_scale3)); - if (tmp_prob > max_prob) { - max_prob = tmp_prob; - class_id = i + 1; - } - } - float x1 = cx - 0.5f * w; - float y1 = cy - 0.5f * h; - - oup[tid * 6 + 0] = __float2half(x1); - oup[tid * 6 + 1] = __float2half(y1); - oup[tid * 6 + 2] = __float2half(x1 + w); - oup[tid * 6 + 3] = 
__float2half(y1 + h); - oup[tid * 6 + 4] = __float2half(class_id); - oup[tid * 6 + 5] = __float2half(max_prob * conf); -} - -__global__ void YOLOX_Decode_NHWC_FP16(const int in_channel1, const int in_channel2, const int in_channel3, const int N, - const int H, const int W, const int stride, const int nb_classes, - const __half *inp_data1, const __half *inp_data2, const __half *inp_data3, - __half *oup) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= N * H * W) return; - - const int h_idx = (tid % (H * W)) / W; // y - const int w_idx = (tid % (H * W)) % W; // x - - // pointer to a featuremp - float xywh0 = __half2float(inp_data1[tid * in_channel1 + 0]); - float xywh1 = __half2float(inp_data1[tid * in_channel1 + 1]); - float xywh2 = __half2float(inp_data1[tid * in_channel1 + 2]); - float xywh3 = __half2float(inp_data1[tid * in_channel1 + 3]); - float conf0 = __half2float(inp_data2[tid * in_channel2 + 0]); - - const float cx = (xywh0 + w_idx) * stride; - const float cy = (xywh1 + h_idx) * stride; - const float w = exp(xywh2) * stride; - const float h = exp(xywh3) * stride; - const float conf = sigmoid(conf0); - float max_prob = sigmoid(__half2float(inp_data3[tid * nb_classes])); - int class_id = 1; - // #pragma unroll - for (int i = 1; i < nb_classes; ++i) { - float tmp_prob = sigmoid(__half2float(inp_data3[tid * nb_classes + i])); - if (tmp_prob > max_prob) { - max_prob = tmp_prob; - class_id = i + 1; - } - } - float x1 = cx - 0.5f * w; - float y1 = cy - 0.5f * h; - - oup[tid * 6 + 0] = __float2half(x1); - oup[tid * 6 + 1] = __float2half(y1); - oup[tid * 6 + 2] = __float2half(x1 + w); - oup[tid * 6 + 3] = __float2half(y1 + h); - oup[tid * 6 + 4] = __float2half(class_id); - oup[tid * 6 + 5] = __float2half(max_prob * conf); -} -} // namespace nvinfer1::plugin diff --git a/models/cv/object_detection/yolox/ixrt/python/quant.py b/models/cv/object_detection/yolox/ixrt/quant.py similarity index 99% rename from models/cv/object_detection/yolox/ixrt/python/quant.py rename to models/cv/object_detection/yolox/ixrt/quant.py index 41c49e5a..36fcc05e 100644 --- a/models/cv/object_detection/yolox/ixrt/python/quant.py +++ b/models/cv/object_detection/yolox/ixrt/quant.py @@ -56,7 +56,7 @@ WORKING_DIRECTORY = 'checkpoints' # choose your working directory TARGET_PLATFORM = TargetPlatform.TRT_INT8 # choose your target platform MODEL_TYPE = NetworkFramework.ONNX # or NetworkFramework.CAFFE INPUT_LAYOUT = 'chw' # input data layout, chw or hwc -NETWORK_INPUTSHAPE = [32, 3, 640, 640] # input shape of your network +NETWORK_INPUTSHAPE = [config.bsz, 3, 640, 640] # input shape of your network EXECUTING_DEVICE = 'cuda' # 'cuda' or 'cpu'. 
 REQUIRE_ANALYSE = False
-TRAINING_YOUR_NETWORK = False # 是否需要 Finetuning 一下你的网络
+TRAINING_YOUR_NETWORK = False # whether your network needs finetuning
diff --git a/models/cv/object_detection/yolox/ixrt/requirements.txt b/models/cv/object_detection/yolox/ixrt/requirements.txt
index 05e148b3..a64772f7 100644
--- a/models/cv/object_detection/yolox/ixrt/requirements.txt
+++ b/models/cv/object_detection/yolox/ixrt/requirements.txt
@@ -1,5 +1,5 @@
 tqdm
-onnx==1.18.0
+onnx
 onnxsim
 tabulate
 pycocotools
diff --git a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_accuracy.sh b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_accuracy.sh
index 1d9f3bf0..8c0ae7f1 100644
--- a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_accuracy.sh
+++ b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_accuracy.sh
@@ -35,26 +35,26 @@ echo "batch size is ${batchsize}"
 DECODER_INPUT_NAMES="/head/obj_preds.0/Conv_output_0 /head/cls_preds.0/Conv_output_0 /head/reg_preds.1/Conv_output_0 /head/cls_preds.1/Conv_output_0 /head/reg_preds.2/Conv_output_0 /head/obj_preds.2/Conv_output_0 /head/cls_preds.2/Conv_output_0"
 
 # cut onnx
-python3 python/cut_model.py \
+python3 cut_model.py \
     --input_model ${model_path}.onnx \
     --output_model ${model_path}_cut.onnx \
     --input_names images \
-    --output_names ${DECODER_INPUT_NAMES[@]}
+    --output_names ${DECODER_INPUT_NAMES[@]}
 
 # create onnx
-python3 python/deploy.py \
+python3 deploy.py \
     --src ${model_path}_cut.onnx \
     --dst ${model_path}_decoder.onnx
+
 # build engine
-python3 python/build_engine.py \
+python3 build_engine.py \
     --model ${model_path}.onnx \
     --precision float16 \
     --engine ${model_path}_decoder.engine
 
 # inference
-python3 python/inference.py \
+python3 inference.py \
     --engine ${model_path}_decoder.engine \
     --batchsize ${batchsize} \
-    --datasets ${datasets_path}
-
+    --datasets ${datasets_path}
\ No newline at end of file
diff --git a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_performance.sh b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_performance.sh
index 8f512d51..bc123cf7 100644
--- a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_performance.sh
+++ b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_performance.sh
@@ -35,25 +35,26 @@ echo "batch size is ${batchsize}"
 DECODER_INPUT_NAMES="/head/obj_preds.0/Conv_output_0 /head/cls_preds.0/Conv_output_0 /head/reg_preds.1/Conv_output_0 /head/cls_preds.1/Conv_output_0 /head/reg_preds.2/Conv_output_0 /head/obj_preds.2/Conv_output_0 /head/cls_preds.2/Conv_output_0"
 
 # cut onnx
-python3 python/cut_model.py \
+python3 cut_model.py \
    --input_model ${model_path}.onnx \
    --output_model ${model_path}_cut.onnx \
    --input_names images \
-    --output_names ${DECODER_INPUT_NAMES[@]}
+    --output_names ${DECODER_INPUT_NAMES[@]}
 
 # create onnx
-python3 python/deploy.py \
+python3 deploy.py \
    --src ${model_path}_cut.onnx \
    --dst ${model_path}_decoder.onnx
 
 # build engine
-python3 python/build_engine.py \
+python3 build_engine.py \
    --model ${model_path}.onnx \
    --precision float16 \
    --engine ${model_path}_decoder.engine
+
 # inference
-python3 python/inference.py \
+python3 inference.py \
    --engine ${model_path}_decoder.engine \
    --datasets ${datasets_path} \
    --batchsize ${batchsize} \
diff --git a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_int8_accuracy.sh b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_int8_accuracy.sh
index 0f58e9c2..495cb6c0 100644
--- a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_int8_accuracy.sh
+++ b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_int8_accuracy.sh
@@ -34,21 +34,22 @@ done
 echo "batch size is ${batchsize}"
 
 # quant
-python3 python/quant.py \
+python3 quant.py \
     --model_name ${model_path} \
     --model ${model_path}.onnx \
     --dataset_dir ${datasets_path}/val2017 \
     --ann_file ${datasets_path}/annotations/instances_val2017.json \
-    --save_dir ./
+    --save_dir ./ \
+    --bsz ${batchsize}
 
 # build engine
-python3 python/build_engine_by_write_qparams.py \
+python3 build_engine_by_write_qparams.py \
     --onnx quantized_yolox.onnx \
     --qparam_json quant_cfg.json \
     --engine ${model_path}_int8.engine
 
 # inference
-python3 python/inference.py \
+python3 inference.py \
     --engine ${model_path}_int8.engine \
     --batchsize ${batchsize} \
     --datasets ${datasets_path}
diff --git a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_int8_performance.sh b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_int8_performance.sh
index afc48038..fa9c1f2a 100644
--- a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_int8_performance.sh
+++ b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_int8_performance.sh
@@ -34,21 +34,22 @@ done
 echo "batch size is ${batchsize}"
 
 # quant
-python3 python/quant.py \
+python3 quant.py \
     --model_name ${model_path} \
     --model ${model_path}.onnx \
     --dataset_dir ${datasets_path}/val2017 \
     --ann_file ${datasets_path}/annotations/instances_val2017.json \
-    --save_dir ./
+    --save_dir ./ \
+    --bsz ${batchsize}
 
 # build engine
-python3 python/build_engine_by_write_qparams.py \
+python3 build_engine_by_write_qparams.py \
     --onnx quantized_yolox.onnx \
     --qparam_json quant_cfg.json \
     --engine ${model_path}_int8.engine
 
 # inference
-python3 python/inference.py \
+python3 inference.py \
     --engine ${model_path}_int8.engine \
     --batchsize ${batchsize} \
     --datasets ${datasets_path} \
diff --git a/models/cv/object_detection/yolox/ixrt/python/utils.py b/models/cv/object_detection/yolox/ixrt/utils.py
similarity index 100%
rename from models/cv/object_detection/yolox/ixrt/python/utils.py
rename to models/cv/object_detection/yolox/ixrt/utils.py
-- 
Gitee

From dc54220d93643851a635ec400683ff8e6c514cf6 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan"
Date: Wed, 10 Dec 2025 14:17:55 +0800
Subject: [PATCH 2/7] update yolov5

---
 .../cv/object_detection/yolov5/ixrt/README.md |  4 ++--
 .../yolov5/ixrt/ci/prepare.sh                 |  9 --------
 .../scripts/infer_yolov5_fp16_accuracy.sh     | 16 ++++++++------
 .../scripts/infer_yolov5_fp16_performance.sh  | 21 ++++++++++++-------
 .../scripts/infer_yolov5_int8_accuracy.sh     | 16 ++++++++------
 .../scripts/infer_yolov5_int8_performance.sh  | 18 +++++++++-------
 6 files changed, 46 insertions(+), 38 deletions(-)

diff --git a/models/cv/object_detection/yolov5/ixrt/README.md b/models/cv/object_detection/yolov5/ixrt/README.md
index 41bec0dc..a1812061 100644
--- a/models/cv/object_detection/yolov5/ixrt/README.md
+++ b/models/cv/object_detection/yolov5/ixrt/README.md
@@ -86,11 +86,10 @@ popd
 
 ```bash
 export PROJ_DIR=./
-export DATASETS_DIR=/Path/to/coco/
-export CHECKPOINTS_DIR=./checkpoints
+export DATASETS_DIR=./coco/
 export COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
 export EVAL_DIR=${DATASETS_DIR}/images/val2017
-export RUN_DIR=../../ixrt_common
+export RUN_DIR=../../ixrt_common/
 export CONFIG_DIR=../../ixrt_common/config/YOLOV5M_CONFIG
 ```
 
diff --git a/models/cv/object_detection/yolov5/ixrt/ci/prepare.sh b/models/cv/object_detection/yolov5/ixrt/ci/prepare.sh
index b99ab99d..23b04131 100644
--- a/models/cv/object_detection/yolov5/ixrt/ci/prepare.sh
+++ b/models/cv/object_detection/yolov5/ixrt/ci/prepare.sh
@@ -16,15 +16,6 @@
 
 set -x
 
-ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
-if [[ ${ID} == "ubuntu" ]]; then
-    apt install -y libgl1-mesa-glx
-elif [[ ${ID} == "centos" ]]; then
-    yum install -y mesa-libGL
-else
-    echo "Not Support Os"
-fi
-
 pip3 install -r ../../ixrt_common/requirements.txt
 
 mkdir checkpoints
diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh
index 52ec959f..f213d46e 100644
--- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh
+++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh
@@ -3,15 +3,16 @@ EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+        [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=-1
+TGT=0.626
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=float16
@@ -40,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -111,7 +115,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion_concat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "    "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -133,7 +137,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}_with_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "    "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -149,7 +153,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "    "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh
index 5e2f97fb..fcccca46 100644
--- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh
+++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh
@@ -3,16 +3,17 @@ EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+        [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=-1
-LOOP_COUNT=10
+TGT=735
+LOOP_COUNT=100
 RUN_MODE=FPS
 PRECISION=float16
@@ -40,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -76,6 +80,7 @@ else
 fi
 CURRENT_MODEL=${NO_DECODER_MODEL}
 
+
 # Quant Model
 if [ $PRECISION == "int8" ];then
     let step++
@@ -111,7 +116,7 @@
     let step++
     echo;
    echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion_no_concat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "    "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -133,7 +138,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}_without_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "    "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -149,7 +154,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "    "Build Engine Skip, $ENGINE_FILE has been existed
 else
@@ -176,7 +181,7 @@ let step++
 echo;
 echo [STEP ${step}] : Inference
 python3 ${RUN_DIR}/inference.py \
-    --model_engine=${ENGINE_FILE} \
+    --model_engine=${ENGINE_FILE}    \
     --nms_engine=${NMS_ENGINE} \
     --coco_gt=${COCO_GT} \
     --eval_dir=${EVAL_DIR} \
diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh
index 606fc94c..7661a194 100644
--- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh
+++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh
@@ -3,15 +3,16 @@ EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+        [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=-1
+TGT=0.626
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=int8
@@ -40,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -111,7 +115,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_concat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "    "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -133,7 +137,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_with_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "    "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -149,7 +153,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "    "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh
index b2983669..aa027317 100644
--- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh
+++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh
@@ -3,16 +3,17 @@ EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+        [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=-1
-LOOP_COUNT=10
+TGT=735
+LOOP_COUNT=100
 RUN_MODE=FPS
 PRECISION=int8
@@ -40,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -112,7 +116,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_no_concat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "    "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -134,7 +138,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_without_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "    "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -150,7 +154,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "    "Build Engine Skip, $ENGINE_FILE has been existed
 else
-- 
Gitee

From 535fd60497c98dc509571f4d769e6443cc6d06f0 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan"
Date: Wed, 10 Dec 2025 14:18:16 +0800
Subject: [PATCH 3/7] sync bert base squad

---
 .../plm/bert_base_squad/ixrt/CMakeLists.txt   |  49 --
 models/nlp/plm/bert_base_squad/ixrt/README.md |  60 +--
 .../ixrt/{python/ixrt => }/builder.py         |   1 -
 .../ixrt/{python/ixrt => }/builder_utils.py   |   0
 .../plm/bert_base_squad/ixrt/ci/prepare.sh    |  27 +-
 .../ixrt/cmake/FindCompiler.cmake             |  15 -
 .../bert_base_squad/ixrt/cmake/FindCuda.cmake |  57 --
 .../bert_base_squad/ixrt/cmake/FindIxrt.cmake |  19 -
 .../ixrt/cmake/FindPluginFiles.cmake          |   7 -
 .../ixrt/{python/ixrt => }/evaluate-v1.1.py   |   0
 .../ixrt/{python => }/helpers/__init__.py     |   0
 .../{python/ixrt => }/helpers/calibrator.py   |  13 +-
 .../{python => }/helpers/data_processing.py   |   0
 .../ixrt/{python => }/helpers/tokenization.py |   0
 .../ixrt/{python/ixrt => }/inference.py       |  97 ++--
 .../bert_base_squad/ixrt/load_ixrt_plugin.py  |  13 +
 .../ixrt/{python/ixrt => }/perf.py            |  76 ++-
 .../ixrt/python/helpers/calibrator.py         | 112 ----
 .../ixrt/python/ixrt/builder_int8.py          | 408 --------------
 .../ixrt/python/ixrt/builder_utils_int8.py    | 208 --------
 .../ixrt/python/ixrt/evaluate.py              | 131 -----
 .../ixrt/python/ixrt/helpers/__init__.py      |   0
 .../python/ixrt/helpers/data_processing.py    | 497 -----------------
 .../ixrt/python/ixrt/helpers/tokenization.py  | 446 ----------------
 .../ixrt/python/ixrt/load_ixrt_plugin.py      |  46 --
 .../ixrt/python/script/build_engine.sh        |  49 --
 .../ixrt/python/script/inference.sh           |  51 --
 .../ixrt/python/script/inference_squad.sh     |  51 --
 .../ixrt/python/script/mdb_infer_run.sh       |  73 ---
 .../ixrt/python/script/perf.sh                |  38 --
 .../ixrt/python/script/prepare.sh             |  63 ---
 .../infer_bert_base_squad_fp16_accuracy.sh}   |  12 +-
 ...infer_bert_base_squad_fp16_performance.sh} |  42 +-
 .../ixrt/src/api/plugin_loader.cc             | 168 ------
 .../ixrt/src/backend/bert/bert_helper.h       | 299 -----------
 .../ixrt/src/backend/cublas/cublas_helper.h   | 312
----------- .../backend/ixinfer/ixinfer_gemm_helper.cu | 416 --------------- .../src/backend/ixinfer/ixinfer_gemm_helper.h | 73 --- .../ixrt/src/common/bertCommon.h | 242 --------- .../ixrt/src/common/checkMacrosPlugin.cpp | 62 --- .../ixrt/src/common/checkMacrosPlugin.h | 221 -------- .../ixrt/src/common/common_def.cuh | 67 --- .../ixrt/src/common/plugin.cpp | 63 --- .../bert_base_squad/ixrt/src/common/plugin.h | 72 --- .../ixrt/src/common/serialize.h | 148 ------ .../ixrt/src/custom_fc/fcInt8Plugin.cpp | 431 --------------- .../ixrt/src/custom_fc/fcInt8Plugin.cu | 485 ----------------- .../ixrt/src/custom_fc/fcPlugin.cpp | 345 ------------ .../ixrt/src/custom_fc/fcPlugin.h | 246 --------- .../emb_layernorm/embLayerNormInt8Plugin.cpp | 503 ------------------ .../emb_layernorm/embLayerNormInt8Plugin.cu | 342 ------------ .../emb_layernorm/embLayerNormInt8Plugin.h | 128 ----- .../src/emb_layernorm/embLayerNormPlugin.cpp | 495 ----------------- .../src/emb_layernorm/embLayerNormPlugin.cu | 258 --------- .../src/emb_layernorm/embLayerNormPlugin.h | 142 ----- .../ixrt/src/ffn/ffnPlugin.cpp | 389 -------------- .../bert_base_squad/ixrt/src/ffn/ffnPlugin.h | 216 -------- .../ixrt/src/gelu/geluPlugin.cpp | 355 ------------ .../ixrt/src/gelu/geluPlugin.cu | 218 -------- .../ixrt/src/gelu/geluPlugin.h | 148 ------ .../qkv_to_context/qkvToContextInt8Plugin.cpp | 335 ------------ .../qkv_to_context/qkvToContextInt8Plugin.cu | 488 ----------------- .../qkv_to_context/qkvToContextInt8Plugin.h | 164 ------ .../src/qkv_to_context/qkvToContextPlugin.cpp | 388 -------------- .../src/qkv_to_context/qkvToContextPlugin.cu | 317 ----------- .../src/qkv_to_context/qkvToContextPlugin.h | 155 ------ .../skipLayerNormInt8Plugin.cpp | 404 -------------- .../skip_layernorm/skipLayerNormInt8Plugin.cu | 361 ------------- .../skip_layernorm/skipLayerNormInt8Plugin.h | 146 ----- .../skip_layernorm/skipLayerNormPlugin.cpp | 430 --------------- .../src/skip_layernorm/skipLayerNormPlugin.cu | 401 -------------- .../src/skip_layernorm/skipLayerNormPlugin.h | 133 ----- 72 files changed, 163 insertions(+), 13064 deletions(-) delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/CMakeLists.txt rename models/nlp/plm/bert_base_squad/ixrt/{python/ixrt => }/builder.py (99%) rename models/nlp/plm/bert_base_squad/ixrt/{python/ixrt => }/builder_utils.py (100%) delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/cmake/FindCompiler.cmake delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/cmake/FindCuda.cmake delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/cmake/FindIxrt.cmake delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/cmake/FindPluginFiles.cmake rename models/nlp/plm/bert_base_squad/ixrt/{python/ixrt => }/evaluate-v1.1.py (100%) rename models/nlp/plm/bert_base_squad/ixrt/{python => }/helpers/__init__.py (100%) rename models/nlp/plm/bert_base_squad/ixrt/{python/ixrt => }/helpers/calibrator.py (89%) rename models/nlp/plm/bert_base_squad/ixrt/{python => }/helpers/data_processing.py (100%) rename models/nlp/plm/bert_base_squad/ixrt/{python => }/helpers/tokenization.py (100%) rename models/nlp/plm/bert_base_squad/ixrt/{python/ixrt => }/inference.py (83%) create mode 100644 models/nlp/plm/bert_base_squad/ixrt/load_ixrt_plugin.py rename models/nlp/plm/bert_base_squad/ixrt/{python/ixrt => }/perf.py (71%) delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/python/helpers/calibrator.py delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/python/ixrt/builder_int8.py delete mode 100644 
models/nlp/plm/bert_base_squad/ixrt/python/ixrt/builder_utils_int8.py delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/python/ixrt/evaluate.py delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/python/ixrt/helpers/__init__.py delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/python/ixrt/helpers/data_processing.py delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/python/ixrt/helpers/tokenization.py delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/python/ixrt/load_ixrt_plugin.py delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/python/script/build_engine.sh delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/python/script/inference.sh delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/python/script/inference_squad.sh delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/python/script/mdb_infer_run.sh delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/python/script/perf.sh delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/python/script/prepare.sh rename models/nlp/plm/bert_base_squad/ixrt/{python/script/infer_bert_base_squad_fp16_ixrt.sh => scripts/infer_bert_base_squad_fp16_accuracy.sh} (89%) rename models/nlp/plm/bert_base_squad/ixrt/{python/script/infer_bert_base_squad_int8_ixrt.sh => scripts/infer_bert_base_squad_fp16_performance.sh} (56%) delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/api/plugin_loader.cc delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/backend/bert/bert_helper.h delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/backend/cublas/cublas_helper.h delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.h delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/common/bertCommon.h delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/common/checkMacrosPlugin.cpp delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/common/checkMacrosPlugin.h delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/common/common_def.cuh delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/common/plugin.cpp delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/common/plugin.h delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/common/serialize.h delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/custom_fc/fcInt8Plugin.cpp delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/custom_fc/fcInt8Plugin.cu delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/custom_fc/fcPlugin.cpp delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/custom_fc/fcPlugin.h delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cpp delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cu delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.h delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cpp delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cu delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.h delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/ffn/ffnPlugin.cpp delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/ffn/ffnPlugin.h delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/gelu/geluPlugin.cpp delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/gelu/geluPlugin.cu delete 
mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/gelu/geluPlugin.h delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cpp delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cu delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.h delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cpp delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cu delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.h delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cpp delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cu delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.h delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cpp delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cu delete mode 100644 models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h diff --git a/models/nlp/plm/bert_base_squad/ixrt/CMakeLists.txt b/models/nlp/plm/bert_base_squad/ixrt/CMakeLists.txt deleted file mode 100644 index 9a0e7a12..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/CMakeLists.txt +++ /dev/null @@ -1,49 +0,0 @@ -cmake_minimum_required(VERSION 3.10) - -project(nv_plugin) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake;${CMAKE_MODULE_PATH}") -set(CMAKE_CXX_EXTENSIONS OFF) - -set(TARGET_NAME ixrt_plugin) -set(SHARED_TARGET ${TARGET_NAME}) -set(STATIC_TARGET ${TARGET_NAME}_static) -set(PLUGIN_REPO_PATH ${PROJECT_SOURCE_DIR}) - -if(DEFINED USE_TENSORRT) - find_package(CUDA) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_75) - - include_directories( - ${CUDA_PATH}/include) - - message(STATUS "Plugin lib use TRT 8.6.1") - set(TRT_INC_PATH /usr/include/x86_64-linux-gnu/) - set(TRT_LIB_PATH /usr/lib/x86_64-linux-gnu/ /usr/local/cuda/targets/x86_64-linux/lib) - set(TRT_LIBRARY nvinfer cublasLt) - - message(STATUS "cuda_libs = ${CUDA_LIBRARIES}") - message(STATUS "cudadevrt_libs = ${CUDA_cudadevrt_LIBRARY}") -else() - include(FindIxrt) - include(FindCompiler) - include(FindCuda) - set(TRT_LIBRARY cublasLt cudart ixrt) - include_directories(${IXRT_INCLUDE_DIR} - ${CUDA_PATH}/include) - add_definitions(-D__ILUVATAR__) - - string(APPEND CMAKE_CXX_FLAGS " -std=c++17") -endif() - -include(FindPluginFiles) - -################################## Compile Options ###################################### -cuda_add_library(${SHARED_TARGET} SHARED - ${PLUGIN_FILES} -) - -target_link_libraries(${SHARED_TARGET} ${CUDA_LIBRARIES} ${CUDA_cudadevrt_LIBRARY} ${TRT_LIBRARY}) -target_link_directories(${SHARED_TARGET} PUBLIC ${CUDA_PATH}/lib64 ${TRT_LIB_PATH} ${IXRT_LIB_DIR}) -target_include_directories(${SHARED_TARGET} PUBLIC ${CUDA_PATH}/include ${TRT_INC_PATH} src PUBLIC src/common) diff --git a/models/nlp/plm/bert_base_squad/ixrt/README.md b/models/nlp/plm/bert_base_squad/ixrt/README.md index 8f595b3b..496528c0 100644 --- a/models/nlp/plm/bert_base_squad/ixrt/README.md +++ b/models/nlp/plm/bert_base_squad/ixrt/README.md @@ -16,69 +16,26 @@ BERT is designed to pre-train deep bidirectional representations from unlabeled ### Prepare Resources ```bash -cd python -bash script/prepare.sh v1_1 +mkdir -p 
data/datasets/bert_base_squad/squad
+mkdir -p data/checkpoints/bert_base_squad
+wget http://files.deepspark.org.cn:880/deepspark/data/checkpoints/bert_base_uncased_squad.tar
+tar -xvf bert_base_uncased_squad.tar -C data/checkpoints/bert_base_squad/
+wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O data/datasets/bert_base_squad/squad/dev-v1.1.json
 ```
 
 ### Install Dependencies
 
-Contact the Iluvatar administrator to get the missing packages:
-- ixrt-1.0.0a0+corex.4.3.0-cp310-cp310-linux_x86_64.whl
-- cuda_python-11.8.0+corex.4.3.0-cp310-cp310-linux_x86_64.whl
-
-#### Install on Iluvatar
-
-```bash
-cmake -S . -B build
-cmake --build build -j16
-```
-
-#### Install on NV
-
-Require tensorrt_version >= 8.6
-
-```bash
-# Get TensorRT docker image
-docker pull nvcr.io/nvidia/tensorrt:23.04-py3
-# Run TensorRT docker
-```
-
 ```bash
-# Install requirements.txt in TensorRT docker
 pip3 install -r requirements.txt
-
-# Build
-cmake -S . -B build -DUSE_TENSORRT=true
-cmake --build build -j16
 ```
 
 ## Model Inference
 
-### On Iluvatar
-
-#### FP16
-
-```bash
-cd script/
-
-# FP16
-bash infer_bert_base_squad_fp16_ixrt.sh
-
-# INT8
-bash infer_bert_base_squad_int8_ixrt.sh
-```
-
-### On NV
+### FP16
 
 ```bash
-# FP16
-# use --bs to set max_batch_size (dynamic)
-bash script/build_engine.sh --bs 32
-bash script/inference_squad.sh --bs 32
-
-# INT8
-bash script/build_engine.sh --bs 32 --int8
-bash script/inference_squad.sh --bs 32 --int8
+bash scripts/infer_bert_base_squad_fp16_accuracy.sh
+bash scripts/infer_bert_base_squad_fp16_performance.sh
 ```
 
 ## Model Results
 
@@ -86,7 +43,6 @@ bash script/inference_squad.sh --bs 32 --int8
 
 | Model           | BatchSize | Precision | Latency QPS | exact_match | f1    |
 | --------------- | --------- | --------- | ----------- | ----------- | ----- |
 | BERT Base SQuAD | 32        | FP16      | 1444.69     | 80.92       | 88.20 |
-| BERT Base SQuAD | 32        | INT8      | 2325.20     | 78.41       | 86.97 |
 
-## Referenece
+## Reference
diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/builder.py b/models/nlp/plm/bert_base_squad/ixrt/builder.py
similarity index 99%
rename from models/nlp/plm/bert_base_squad/ixrt/python/ixrt/builder.py
rename to models/nlp/plm/bert_base_squad/ixrt/builder.py
index 8632d95d..c921f535 100644
--- a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/builder.py
+++ b/models/nlp/plm/bert_base_squad/ixrt/builder.py
@@ -28,7 +28,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
- import os import argparse import json diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/builder_utils.py b/models/nlp/plm/bert_base_squad/ixrt/builder_utils.py similarity index 100% rename from models/nlp/plm/bert_base_squad/ixrt/python/ixrt/builder_utils.py rename to models/nlp/plm/bert_base_squad/ixrt/builder_utils.py diff --git a/models/nlp/plm/bert_base_squad/ixrt/ci/prepare.sh b/models/nlp/plm/bert_base_squad/ixrt/ci/prepare.sh index 0b82d655..930f87f6 100644 --- a/models/nlp/plm/bert_base_squad/ixrt/ci/prepare.sh +++ b/models/nlp/plm/bert_base_squad/ixrt/ci/prepare.sh @@ -16,27 +16,6 @@ set -x -ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') -if [[ ${ID} == "ubuntu" ]]; then - apt install -y libgl1-mesa-glx -elif [[ ${ID} == "centos" ]]; then - yum install -y mesa-libGL -else - echo "Not Support Os" -fi - -pip install -r requirements.txt - -# install ixrt run -bash /root/data/install/ixrt-1.0.0.alpha+corex.4.3.0-linux_x86_64.run - -if [ "$1" = "nvidia" ]; then - cmake -S . -B build -DUSE_TENSORRT=true - cmake --build build -j16 -else - cmake -S . -B build - cmake --build build -j16 -fi - -mkdir -p ./python/data -ln -s /root/data/checkpoints/bert_base_uncased_squad/ ./python/data && ln -s /root/data/datasets/squad/ ./python/data \ No newline at end of file +mkdir -p data/datasets/bert_base_squad/ +mkdir -p data/checkpoints/bert_base_squad +ln -s /root/data/checkpoints/bert_base_uncased_squad/ ./data/checkpoints/bert_base_squad && ln -s /root/data/datasets/squad ./data/datasets/bert_base_squad/ \ No newline at end of file diff --git a/models/nlp/plm/bert_base_squad/ixrt/cmake/FindCompiler.cmake b/models/nlp/plm/bert_base_squad/ixrt/cmake/FindCompiler.cmake deleted file mode 100644 index 07c436f5..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/cmake/FindCompiler.cmake +++ /dev/null @@ -1,15 +0,0 @@ -if(NOT COMPILER_PATH) - if (EXISTS /opt/sw_home/local/bin/clang++) - set(COMPILER_PATH /opt/sw_home/local/bin) - elseif (EXISTS /usr/local/corex/bin/clang++) - set(COMPILER_PATH /usr/local/corex/bin) - else() - message(STATUS "COMPILER_PATH is not set and we couldn't find clang compiler neither, will use system C/C++ compiler") - endif() -endif() -if (COMPILER_PATH) - set(CMAKE_CXX_COMPILER ${COMPILER_PATH}/clang++) - set(CMAKE_C_COMPILER ${COMPILER_PATH}/clang) -endif() - -message(STATUS "Use ${CMAKE_CXX_COMPILER} and ${CMAKE_C_COMPILER} as C++ and C compiler") \ No newline at end of file diff --git a/models/nlp/plm/bert_base_squad/ixrt/cmake/FindCuda.cmake b/models/nlp/plm/bert_base_squad/ixrt/cmake/FindCuda.cmake deleted file mode 100644 index e8aa67dc..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/cmake/FindCuda.cmake +++ /dev/null @@ -1,57 +0,0 @@ -# This cmake does: -# - Set CUDA_PATH -# - Find libcudart -# - Util functions like cuda_add_library, cuda_add_executable - - -# CUDA_PATH can be specified through below means shown in priority order 1. -# cmake command line argument, -DCUDA_PATH=/path/to/cuda 2. 
bash environment -# variable, export CUDA_PATH=/path/to/cuda -if(DEFINED ENV{CUDA_PATH}) - set(CUDA_PATH "$ENV{CUDA_PATH}") -else() - set(CUDA_PATH - "/usr/local/corex" - CACHE PATH "cuda installation root path") -endif() -message(STATUS "Use CUDA_PATH=${CUDA_PATH} ") - -# GPU arch -if(NOT "${CUDA_ARCH}" STREQUAL "") - set(CUDA_ARCH - ${CUDA_ARCH} - CACHE STRING "GPU architecture tag, ivcore11") -else("${CUDA_ARCH}" STREQUAL "") - set(CUDA_ARCH - "ivcore11" - CACHE STRING "GPU architecture tag, ivcore11") -endif() -message(STATUS "Use CUDA_ARCH=${CUDA_ARCH}") - -macro(cuda_add_executable) - foreach(File ${ARGN}) - if(${File} MATCHES ".*\.cu$") - set_source_files_properties(${File} PROPERTIES LANGUAGE CXX) - endif() - endforeach() - add_executable(${ARGV}) -endmacro() - -macro(cuda_add_library) - foreach(File ${ARGN}) - if(${File} MATCHES ".*\.cu$") - set_source_files_properties(${File} PROPERTIES LANGUAGE CXX) - endif() - endforeach() - add_library(${ARGV}) -endmacro() - -find_library( - CUDART_LIBRARY cudart - PATHS ${CUDA_PATH} - PATH_SUFFIXES lib/x64 lib64 lib - NO_DEFAULT_PATH) - -if (NOT USE_TRT) - set(CUDA_LIBRARIES cudart) -endif() \ No newline at end of file diff --git a/models/nlp/plm/bert_base_squad/ixrt/cmake/FindIxrt.cmake b/models/nlp/plm/bert_base_squad/ixrt/cmake/FindIxrt.cmake deleted file mode 100644 index 5b0f2729..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/cmake/FindIxrt.cmake +++ /dev/null @@ -1,19 +0,0 @@ -# This cmake file decides how to build with IxRT -# Custom IxRT Path -if(NOT "${IXRT_HOME}" STREQUAL "") - set(IXRT_INCLUDE_DIR ${IXRT_HOME}/include) - set(IXRT_LIB_DIR ${IXRT_HOME}/lib) -# From default paths -else() - set(IXRT_INCLUDE_DIR /usr/local/corex/include) - set(IXRT_LIB_DIR /usr/local/corex/lib) -endif() - -message(STATUS "IXRT_INCLUDE_DIR: ${IXRT_INCLUDE_DIR}") -message(STATUS "IXRT_LIB_DIR: ${IXRT_LIB_DIR}") - -if(EXISTS ${IXRT_INCLUDE_DIR} AND EXISTS ${IXRT_LIB_DIR}) - include_directories(${IXRT_INCLUDE_DIR}) -else() - message( FATAL_ERROR "IxRT library doesn't exist!") -endif() \ No newline at end of file diff --git a/models/nlp/plm/bert_base_squad/ixrt/cmake/FindPluginFiles.cmake b/models/nlp/plm/bert_base_squad/ixrt/cmake/FindPluginFiles.cmake deleted file mode 100644 index 60360699..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/cmake/FindPluginFiles.cmake +++ /dev/null @@ -1,7 +0,0 @@ -file(GLOB_RECURSE PLUGIN_FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc - ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cu) - -if(DEFINED USE_TENSORRT) - list(FILTER PLUGIN_FILES EXCLUDE REGEX "${CMAKE_CURRENT_SOURCE_DIR}/src/backend/ixinfer") -endif() \ No newline at end of file diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/evaluate-v1.1.py b/models/nlp/plm/bert_base_squad/ixrt/evaluate-v1.1.py similarity index 100% rename from models/nlp/plm/bert_base_squad/ixrt/python/ixrt/evaluate-v1.1.py rename to models/nlp/plm/bert_base_squad/ixrt/evaluate-v1.1.py diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/helpers/__init__.py b/models/nlp/plm/bert_base_squad/ixrt/helpers/__init__.py similarity index 100% rename from models/nlp/plm/bert_base_squad/ixrt/python/helpers/__init__.py rename to models/nlp/plm/bert_base_squad/ixrt/helpers/__init__.py diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/helpers/calibrator.py b/models/nlp/plm/bert_base_squad/ixrt/helpers/calibrator.py similarity index 89% rename from models/nlp/plm/bert_base_squad/ixrt/python/ixrt/helpers/calibrator.py rename to 
models/nlp/plm/bert_base_squad/ixrt/helpers/calibrator.py index beacc625..73084f39 100644 --- a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/helpers/calibrator.py +++ b/models/nlp/plm/bert_base_squad/ixrt/helpers/calibrator.py @@ -19,8 +19,8 @@ import tensorrt as trt import os -import pycuda.driver as cuda -import pycuda.autoinit +import cuda.cuda as cuda +import cuda.cudart as cudart import numpy as np import helpers.tokenization as tokenization import helpers.data_processing as dp @@ -80,9 +80,12 @@ class BertCalibrator(trt.IInt8LegacyCalibrator): segment_ids = features[0].segment_ids input_mask = features[0].input_mask - cuda.memcpy_htod(self.device_inputs[0], input_ids.ravel()) - cuda.memcpy_htod(self.device_inputs[1], segment_ids.ravel()) - cuda.memcpy_htod(self.device_inputs[2], input_mask.ravel()) + err, = cuda.cuMemcpyHtoD(self.device_inputs[0], input_ids.ravel(), input_ids.ravel().nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoD(self.device_inputs[1], segment_ids.ravel(), segment_ids.ravel().nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoD(self.device_inputs[2], input_mask.ravel(), input_mask.ravel().nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) self.current_index += self.batch_size return self.device_inputs diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/helpers/data_processing.py b/models/nlp/plm/bert_base_squad/ixrt/helpers/data_processing.py similarity index 100% rename from models/nlp/plm/bert_base_squad/ixrt/python/helpers/data_processing.py rename to models/nlp/plm/bert_base_squad/ixrt/helpers/data_processing.py diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/helpers/tokenization.py b/models/nlp/plm/bert_base_squad/ixrt/helpers/tokenization.py similarity index 100% rename from models/nlp/plm/bert_base_squad/ixrt/python/helpers/tokenization.py rename to models/nlp/plm/bert_base_squad/ixrt/helpers/tokenization.py diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/inference.py b/models/nlp/plm/bert_base_squad/ixrt/inference.py similarity index 83% rename from models/nlp/plm/bert_base_squad/ixrt/python/ixrt/inference.py rename to models/nlp/plm/bert_base_squad/ixrt/inference.py index 920d5b80..3c7f988e 100644 --- a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/inference.py +++ b/models/nlp/plm/bert_base_squad/ixrt/inference.py @@ -38,8 +38,8 @@ import argparse import collections import numpy as np import tensorrt as trt -import pycuda.driver as cuda -import pycuda.autoinit +import cuda.cuda as cuda +import cuda.cudart as cudart import helpers.tokenization as tokenization import helpers.data_processing as dp @@ -155,12 +155,13 @@ if __name__ == '__main__': raise RuntimeError("Could not find any profile that can run batch size {}.".format(args.batch_size)) # Create a stream in which to copy inputs/outputs and run inference. - stream = cuda.Stream() + err_dr, stream = cuda.cuStreamCreate(0) + assert(err_dr == cuda.CUresult.CUDA_SUCCESS) # if args.use_trt: # context.active_optimization_profile = selected_profile # else: - context.set_optimization_profile_async(selected_profile, stream.handle) + context.set_optimization_profile_async(selected_profile, stream) binding_idx_offset = selected_profile * num_binding_per_profile input_shape = (args.batch_size, max_seq_length) @@ -170,11 +171,17 @@ if __name__ == '__main__': assert context.all_binding_shapes_specified # Allocate device memory for inputs. 
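+    # NOTE: cuda-python driver-API calls return a (CUresult, value) tuple,
+    # so the migrated code below unpacks the status and asserts CUDA_SUCCESS after each call.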
-    d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)]
+    d_inputs = []
+    for binding in range(3):
+        err, ptr = cuda.cuMemAlloc(input_nbytes)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        d_inputs.append(ptr)
 
     # Allocate output buffer by querying the size from the context. This may be different for different input shapes.
-    h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(binding_idx_offset + 3)), dtype=np.float32)
-    d_output = cuda.mem_alloc(h_output.nbytes)
+    h_output = np.empty(tuple(context.get_binding_shape(binding_idx_offset + 3)), dtype=np.float32)
+
+    err, d_output = cuda.cuMemAlloc(h_output.nbytes)
+    assert(err == cuda.CUresult.CUDA_SUCCESS)
 
     def inference(features, tokens):
         global h_output
@@ -191,24 +198,33 @@
             segment_ids_batch = np.repeat(np.expand_dims(feature.segment_ids, 0), args.batch_size, axis=0)
             input_mask_batch = np.repeat(np.expand_dims(feature.input_mask, 0), args.batch_size, axis=0)
 
-            input_ids = cuda.register_host_memory(np.ascontiguousarray(input_ids_batch.ravel()))
-            segment_ids = cuda.register_host_memory(np.ascontiguousarray(segment_ids_batch.ravel()))
-            input_mask = cuda.register_host_memory(np.ascontiguousarray(input_mask_batch.ravel()))
+            input_ids = np.ascontiguousarray(input_ids_batch.ravel())
+            segment_ids = np.ascontiguousarray(segment_ids_batch.ravel())
+            input_mask = np.ascontiguousarray(input_mask_batch.ravel())
+            for host_buf in (input_ids, segment_ids, input_mask):
+                err, = cuda.cuMemHostRegister(host_buf, host_buf.nbytes, 0)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
 
             eval_start_time = time.time()
-            cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)
-            cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)
-            cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)
+            err, = cuda.cuMemcpyHtoDAsync(d_inputs[0], input_ids, input_ids.nbytes, stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            err, = cuda.cuMemcpyHtoDAsync(d_inputs[1], segment_ids, segment_ids.nbytes, stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            err, = cuda.cuMemcpyHtoDAsync(d_inputs[2], input_mask, input_mask.nbytes, stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
 
             # Run inference
-            context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
+            context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream)
 
             # Synchronize the stream
-            stream.synchronize()
+            err, = cuda.cuStreamSynchronize(stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
 
             eval_time_elapsed += (time.time() - eval_start_time)
 
             # Transfer predictions back from GPU
-            cuda.memcpy_dtoh_async(h_output, d_output, stream)
-            stream.synchronize()
+            err, = cuda.cuMemcpyDtoHAsync(h_output, d_output, h_output.nbytes, stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            err, = cuda.cuStreamSynchronize(stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
 
             # for x in h_output[0].reshape(-1,2):
             #     print(x)
             # Only retrieve and post-process the first batch
@@ -322,10 +335,13 @@
     for binding in range(3):
         context.set_binding_shape(binding, (args.batch_size, max_seq_length))
     assert context.all_binding_shapes_specified
-    cuda.memcpy_htod_async(d_inputs[0], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), stream)
-    cuda.memcpy_htod_async(d_inputs[1], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), stream)
-    context.execute_async_v2(bindings=[0 for i in
range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle) - stream.synchronize() + err, = cuda.cuMemcpyHtoDAsync(d_inputs[0], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel().nbytes, stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoDAsync(d_inputs[1], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel().nbytes, stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream) + err, = cuda.cuStreamSynchronize(stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) infer_toal_time = 0 output_index = 0 @@ -334,20 +350,24 @@ if __name__ == '__main__': context.set_binding_shape(binding, input_ids.shape) assert context.all_binding_shapes_specified - cuda.memcpy_htod_async(d_inputs[0], input_ids.ravel(), stream) - cuda.memcpy_htod_async(d_inputs[1], segment_ids.ravel(), stream) - stream.synchronize() - + err, = cuda.cuMemcpyHtoDAsync(d_inputs[0], input_ids.ravel(), input_ids.nbytes, stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoDAsync(d_inputs[1], segment_ids.ravel(), segment_ids.nbytes, stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuStreamSynchronize(stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) infer_start_time = time.time() - context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle) - stream.synchronize() + context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream) + err, = cuda.cuStreamSynchronize(stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) infer_end_time = time.time() infer_time = infer_end_time - infer_start_time infer_toal_time += infer_time - - cuda.memcpy_dtoh_async(h_output, d_output, stream) - stream.synchronize() - + err, = cuda.cuMemcpyDtoHAsync(h_output, d_output, h_output.nbytes, stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuStreamSynchronize(stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + new_h_output = np.array(h_output.reshape(-1)[:input_ids.shape[0]*input_ids.shape[1]*2]).reshape(input_ids.shape[0], input_ids.shape[1], 2) for index in range(input_ids.shape[0]): networkOutputs.append(_NetworkOutput( @@ -356,7 +376,12 @@ if __name__ == '__main__': feature_index = index )) output_index += 1 - + for i in range(3): + err, = cuda.cuMemFree(d_inputs[i]) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemFree(d_output) + assert(err == cuda.CUresult.CUDA_SUCCESS) + output_index = 0 for (be, bf) in zip(batch_example_list, batch_feature_list): for index in range(len(bf)): @@ -379,17 +404,11 @@ if __name__ == '__main__': lengths.append(len(features[0].input_ids)) sort_index = np.argsort(lengths) - infer_time, all_predictions = inference_all_dynamic(features_list, squad_examples, sort_index, all_predictions) - print(F"E2E time : {infer_time:.3f} seconds") + infer_time, all_predictions = inference_all_dynamic(features_list, squad_examples, sort_index, all_predictions) qps = math.ceil(len(squad_examples)/args.batch_size)*args.batch_size/infer_time print(f"Latency QPS: {qps} sentences/s") - metricResult = {"metricResult": 
{}} - metricResult["metricResult"]["E2E time"] = round(infer_time, 3) - metricResult["metricResult"]["Latency QPS"] = round(qps, 3) - print(metricResult) - with open(output_prediction_file, "w") as f: f.write(json.dumps(all_predictions, indent=4)) print("\nOutput dump to {}".format(output_prediction_file)) diff --git a/models/nlp/plm/bert_base_squad/ixrt/load_ixrt_plugin.py b/models/nlp/plm/bert_base_squad/ixrt/load_ixrt_plugin.py new file mode 100644 index 00000000..b40f6910 --- /dev/null +++ b/models/nlp/plm/bert_base_squad/ixrt/load_ixrt_plugin.py @@ -0,0 +1,13 @@ +from os.path import join, dirname, exists +import tensorrt as trt +import ctypes + +def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""): + if not dynamic_path: + dynamic_path = join(dirname(trt.__file__), "lib", "libixrt_plugin.so") + if not exists(dynamic_path): + raise FileNotFoundError( + f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!") + ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL) + trt.init_libnvinfer_plugins(logger, namespace) + print(f"Loaded plugin from {dynamic_path}") \ No newline at end of file diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/perf.py b/models/nlp/plm/bert_base_squad/ixrt/perf.py similarity index 71% rename from models/nlp/plm/bert_base_squad/ixrt/python/ixrt/perf.py rename to models/nlp/plm/bert_base_squad/ixrt/perf.py index 968a3943..e275194e 100644 --- a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/perf.py +++ b/models/nlp/plm/bert_base_squad/ixrt/perf.py @@ -28,14 +28,13 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import argparse import ctypes import time import numpy as np import tensorrt as trt -import pycuda.driver as cuda -import pycuda.autoinit +import cuda.cuda as cuda +import cuda.cudart as cudart import numpy as np @@ -44,17 +43,19 @@ from load_ixrt_plugin import load_ixrt_plugin class DeviceBuffer(object): def __init__(self, shape, dtype=trt.int32): - self.buf = cuda.mem_alloc(trt.volume(shape) * 4) + _, self.buf = cuda.cuMemAlloc(trt.volume(shape) * 4) def binding(self): return int(self.buf) def free(self): - self.buf.free() + err, = cuda.cuMemFree(self.buf) + assert(err == cuda.CUresult.CUDA_SUCCESS) def main(): parser = argparse.ArgumentParser(description='BERT Inference Benchmark') + parser.add_argument("-z", "--use_trt", action="store_false", help="Whether to use tensorRT or IxRT") parser.add_argument("-e", "--engine", help='Path to BERT TensorRT engine') parser.add_argument('-b', '--batch-size', default=[], action="append", help='Batch size(s) to benchmark. Can be specified multiple times for more than one batch size. 
This script assumes that the engine has been built with one optimization profile for each batch size, and that these profiles are in order of increasing batch size.', type=int) parser.add_argument('-s', '--sequence-length', default=128, help='Sequence length of the BERT model', type=int) @@ -62,11 +63,12 @@ def main(): parser.add_argument('-w', '--warm-up-runs', default=10, help='Number of iterations to run prior to benchmarking.', type=int) parser.add_argument('-d', '--duration', default=0.0, help='Minimal number of seconds to run when benchmarking each batch size.', type=float) parser.add_argument('-r', '--random-seed', required=False, default=12345, help='Random seed.', type=int) + parser.add_argument('-t', '--target-qps', default=0, help='Target QPS', type=int) args, _ = parser.parse_known_args() args.batch_size = args.batch_size or [1] # Import necessary plugins for BERT TensorRT - load_ixrt_plugin(TRT_LOGGER, dynamic_path="../build/libixrt_plugin.so") + load_ixrt_plugin(TRT_LOGGER) with open(args.engine, 'rb') as f: runtime = trt.Runtime(TRT_LOGGER) @@ -92,15 +94,19 @@ def main(): test_input_mask = np.ones((max(args.batch_size), args.sequence_length), dtype=np.int32) # Copy input h2d - cuda.memcpy_htod(buffers[0].buf, test_word_ids.ravel()) - cuda.memcpy_htod(buffers[1].buf, test_segment_ids.ravel()) - cuda.memcpy_htod(buffers[2].buf, test_input_mask.ravel()) + err, = cuda.cuMemcpyHtoD(buffers[0].buf, test_word_ids.ravel(), test_word_ids.ravel().nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoD(buffers[1].buf, test_segment_ids.ravel(), test_segment_ids.ravel().nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoD(buffers[2].buf, test_input_mask.ravel(), test_input_mask.ravel().nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) num_binding_per_profile = engine.num_bindings // engine.num_optimization_profiles bench_times = {} - stream = cuda.Stream() + err_dr, stream = cuda.cuStreamCreate(0) + assert(err_dr == cuda.CUresult.CUDA_SUCCESS) for batch_size in sorted(args.batch_size): # # Select engine profile selected_profile = -1 @@ -111,7 +117,7 @@ def main(): break if selected_profile == -1: raise RuntimeError("None of the dynamic shape profiles meets the requirement batch = {} and sequence = {}.".format(batch_size, args.sequence_length)) - context.set_optimization_profile_async(selected_profile, stream.handle) + context.set_optimization_profile_async(selected_profile, stream) # Each profile has unique bindings binding_idx_offset = selected_profile * num_binding_per_profile @@ -129,41 +135,57 @@ def main(): # Inference total_time = 0 - start = cuda.Event() - end = cuda.Event() + err_dr, start = cuda.cuEventCreate(0) + assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + err_dr, end = cuda.cuEventCreate(0) + assert(err_dr == cuda.CUresult.CUDA_SUCCESS) # Warmup for _ in range(args.warm_up_runs): - context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) - stream.synchronize() + context.execute_async_v2(bindings=bindings, stream_handle=stream) + err, = cuda.cuStreamSynchronize(stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) # Timing loop times = [] actual_iterations = 0 start_time = time.time() while actual_iterations < args.iterations or (time.time() - start_time) < args.duration: - start.record(stream) - context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) - end.record(stream) - stream.synchronize() - times.append(end.time_since(start)) + cuda.cuEventRecord(start, stream) + 
context.execute_async_v2(bindings=bindings, stream_handle=stream) + cuda.cuEventRecord(end, stream) + err, = cuda.cuStreamSynchronize(stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + _, ms = cuda.cuEventElapsedTime(start, end) + times.append(ms) actual_iterations += 1 # Compute average time, 95th percentile time and 99th percentile time. bench_times[batch_size] = times - + err_rt, = cudart.cudaStreamDestroy(stream) + assert(err_rt == cudart.cudaError_t.cudaSuccess) [b.free() for b in buffers] + del context + del engine for batch_size, times in bench_times.items(): total_time = sum(times) avg_time = total_time / float(actual_iterations) - times.sort() - percentile95 = times[int(actual_iterations * 0.95)] - percentile99 = times[int(actual_iterations * 0.99)] - print("Running {:} iterations with Batch Size: {:}\n\tTotal Time: {:} ms \tAverage Time: {:} ms\t95th Percentile Time: {:} ms\t99th Percentile Time: {:}".format(actual_iterations, batch_size, total_time, avg_time, percentile95, percentile99)) + # times.sort() + # percentile95 = times[int(actual_iterations * 0.95)] + # percentile99 = times[int(actual_iterations * 0.99)] + # print("Running {:} iterations with Batch Size: {:}\n\tTotal Time: {:} ms \tAverage Time: {:} ms\t95th Percentile Time: {:} ms\t99th Percentile Time: {:}".format(actual_iterations, batch_size, total_time, avg_time, percentile95, percentile99)) + QPS = 1000.0 / (avg_time / batch_size) + print("BatchSize = {:d}, QPS = {:.3f}".format(batch_size, QPS)) + if QPS >= args.target_qps: + print("Performance Check : Test {:.3f} >= target {:.3f}".format(QPS, args.target_qps)) + print("pass!") + exit() + else: + print("failed!") + exit(1) - del context - del engine + if __name__ == '__main__': main() diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/helpers/calibrator.py b/models/nlp/plm/bert_base_squad/ixrt/python/helpers/calibrator.py deleted file mode 100644 index beacc625..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/python/helpers/calibrator.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python3 -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import tensorrt as trt -import os - -import pycuda.driver as cuda -import pycuda.autoinit -import numpy as np -import helpers.tokenization as tokenization -import helpers.data_processing as dp - -class BertCalibrator(trt.IInt8LegacyCalibrator): - def __init__(self, squad_json, vocab_file, cache_file, batch_size, max_seq_length, num_inputs): - # Whenever you specify a custom constructor for a TensorRT class, - # you MUST call the constructor of the parent explicitly. - trt.IInt8LegacyCalibrator.__init__(self) - - self.cache_file = cache_file - - # Every time get_batch is called, the next batch of size batch_size will be copied to the device and returned. 
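The calibrator deleted below still allocates and fills its device buffers with pycuda (`cuda.mem_alloc` / `cuda.memcpy_htod`), while the perf.py rewrite above moves to the cuda-python bindings, where every driver call returns a tuple beginning with a `CUresult` and the patch repeats the `err, = ...; assert(...)` idiom at each call site. For reference only, a minimal sketch of how that idiom can be factored out and what the calibrator's allocate/copy/free contract would look like under cuda-python; `check` is an illustrative helper name, not something this patch defines:

import cuda.cuda as cuda

def check(result):
    # cuda-python driver calls return (CUresult, *outputs): verify the status
    # and hand back whatever outputs the call produced.
    err, outputs = result[0], result[1:]
    assert err == cuda.CUresult.CUDA_SUCCESS, f"CUDA driver call failed: {err}"
    return outputs[0] if len(outputs) == 1 else (outputs or None)

# One device buffer per input binding, as the calibrator allocates:
#     d_input = check(cuda.cuMemAlloc(max_seq_length * 4 * batch_size))
# Copy the next host batch in before get_batch returns the bindings:
#     check(cuda.cuMemcpyHtoD(d_input, host_batch, host_batch.nbytes))
# Release it in free():
#     check(cuda.cuMemFree(d_input))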
- self.data = dp.read_squad_json(squad_json) - self.max_seq_length = max_seq_length - self.batch_size = batch_size - self.current_index = 0 - self.num_inputs = num_inputs - self.tokenizer = tokenization.BertTokenizer(vocab_file=vocab_file, do_lower_case=True) - self.doc_stride = 128 - self.max_query_length = 64 - - # Allocate enough memory for a whole batch. - self.device_inputs = [cuda.mem_alloc(self.max_seq_length * trt.int32.itemsize * self.batch_size) for binding in range(3)] - - def free(self): - for dinput in self.device_inputs: - dinput.free() - - def get_batch_size(self): - return self.batch_size - - # TensorRT passes along the names of the engine bindings to the get_batch function. - # You don't necessarily have to use them, but they can be useful to understand the order of - # the inputs. The bindings list is expected to have the same ordering as 'names'. - def get_batch(self, names): - if self.current_index + self.batch_size > self.num_inputs: - print("Calibrating index {:} batch size {:} exceed max input limit {:} sentences".format(self.current_index, self.batch_size, self.num_inputs)) - return None - - current_batch = int(self.current_index / self.batch_size) - if current_batch % 10 == 0: - print("Calibrating batch {:}, containing {:} sentences".format(current_batch, self.batch_size)) - - input_ids = [] - segment_ids = [] - input_mask = [] - for i in range(self.batch_size): - example = self.data[self.current_index + i] - features = dp.convert_example_to_features(example.doc_tokens, example.question_text, self.tokenizer, self.max_seq_length, self.doc_stride, self.max_query_length) - if len(input_ids) and len(segment_ids) and len(input_mask): - input_ids = np.concatenate((input_ids, features[0].input_ids)) - segment_ids = np.concatenate((segment_ids, features[0].segment_ids)) - input_mask = np.concatenate((input_mask, features[0].input_mask)) - else: - input_ids = features[0].input_ids - segment_ids = features[0].segment_ids - input_mask = features[0].input_mask - - cuda.memcpy_htod(self.device_inputs[0], input_ids.ravel()) - cuda.memcpy_htod(self.device_inputs[1], segment_ids.ravel()) - cuda.memcpy_htod(self.device_inputs[2], input_mask.ravel()) - - self.current_index += self.batch_size - return self.device_inputs - - def read_calibration_cache(self): - # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. - if os.path.exists(self.cache_file): - with open(self.cache_file, "rb") as f: - return f.read() - - def write_calibration_cache(self, cache): - with open(self.cache_file, "wb") as f: - f.write(cache) - f.flush() - os.fsync(f) - - def get_quantile(self): - return 0.9999 - - def get_regression_cutoff(self): - return 1.0 - - def read_histogram_cache(self, length): - return None - - def write_histogram_cache(self, ptr, length): - return None diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/builder_int8.py b/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/builder_int8.py deleted file mode 100644 index 7167882b..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/builder_int8.py +++ /dev/null @@ -1,408 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import argparse -import json -import tensorrt as trt -import time -import sys -import ctypes -import os -import numpy as np -from builder_utils_int8 import load_pytorch_weights_and_quant -from builder_utils_int8 import WQKV, BQKV # Attention Keys -from builder_utils_int8 import W_AOUT, B_AOUT, W_MID, B_MID, W_LOUT, B_LOUT # Transformer Keys -from builder_utils_int8 import SQD_W, SQD_B # SQuAD Output Keys -from builder import custom_fc as custom_fc_fp16 - -trt_version = [int(n) for n in trt.__version__.split('.')] - -TRT_LOGGER = trt.Logger(trt.Logger.ERROR) -from load_ixrt_plugin import load_ixrt_plugin -load_ixrt_plugin(TRT_LOGGER) - -plg_registry = trt.get_plugin_registry() -registry_list = plg_registry.plugin_creator_list -print("registry_list: ", [registry.name + '/' + registry.plugin_version for registry in registry_list]) -emln_plg_creator = plg_registry.get_plugin_creator("CustomEmbLayerNormPluginDynamic_IxRT", "2", "") -qkv2_plg_creator = plg_registry.get_plugin_creator("CustomQKVToContextPluginDynamic_IxRT", "3", "") -skln_plg_creator = plg_registry.get_plugin_creator("CustomSkipLayerNormPluginDynamic_IxRT", "3", "") -gelu_plg_creator = plg_registry.get_plugin_creator("CustomGeluPluginDynamic_IxRT", "1", "") -fc_plg_creator = plg_registry.get_plugin_creator("CustomFCPluginDynamic_IxRT", "2", "") - -# -class BertConfig: - def __init__(self, bert_config_path, use_int8): - with open(bert_config_path, "r") as f: - data = json.load(f) - self.num_attention_heads = data["num_attention_heads"] - self.hidden_size = data["hidden_size"] - self.intermediate_size = data["intermediate_size"] - self.num_hidden_layers = data["num_hidden_layers"] - self.head_size = self.hidden_size // self.num_attention_heads - self.use_int8 = use_int8 - -def set_tensor_name(tensor, prefix, name): - tensor.name = prefix + name - -def set_output_name(layer, prefix, name, out_idx = 0): - set_tensor_name(layer.get_output(out_idx), prefix, name) - -def set_output_range(layer, maxval, out_idx = 0): - layer.get_output(out_idx).set_dynamic_range(-maxval, maxval) - -def get_mha_dtype(config): - dtype = trt.float32 - if config.use_int8: - dtype = trt.int8 - return int(dtype) - -def custom_fc(prefix, config, init_dict, network, input_tensor, out_dims, W, B): - pf_out_dims = trt.PluginField("out_dims", np.array([out_dims], dtype=np.int32), trt.PluginFieldType.INT32) - pf_W = trt.PluginField("W", W, 
trt.PluginFieldType.FLOAT32) - - fields = [pf_out_dims, pf_W] - - if config.use_int8: - amax_vec = [init_dict[prefix + "wei_amax"]] - if B is not None: - pf_B = trt.PluginField("Bias", B, trt.PluginFieldType.FLOAT32) - amax_vec.append(init_dict[prefix + "out_amax"]) - pf_amax = trt.PluginField("fc_amax", np.array(amax_vec, np.float32), trt.PluginFieldType.FLOAT32) - fields.append(pf_B) - fields.append(pf_amax) - else: - pf_amax = trt.PluginField("fc_amax", np.array(amax_vec, np.float32), trt.PluginFieldType.FLOAT32) - fields.append(pf_amax) - - pfc = trt.PluginFieldCollection(fields) - fc_plugin = fc_plg_creator.create_plugin("fcplugin", pfc) - plug_inputs = [input_tensor] - out_dense = network.add_plugin_v2(plug_inputs, fc_plugin) - return out_dense - -def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask): - """ - Add the attention layer - """ - B, S, hidden_size = input_tensor.shape - num_heads = config.num_attention_heads - head_size = int(hidden_size / num_heads) - - Wall = init_dict[prefix + WQKV] - Ball = init_dict[prefix + BQKV] - - # FC_attention - mult_all = custom_fc(prefix + "self_qkv_", config, init_dict, network, input_tensor, 3*hidden_size, Wall, Ball) - set_output_range(mult_all, init_dict[prefix + "self_qkv_out_amax"]) - - has_mask = imask is not None - - # QKV2CTX - pf_hidden_size = trt.PluginField("hidden_size", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32) - pf_num_heads = trt.PluginField("num_heads", np.array([num_heads], np.int32), trt.PluginFieldType.INT32) - fields = [pf_hidden_size, pf_num_heads] - dq_probs = [ - init_dict[prefix + "arrange_qkv_amax"], - init_dict[prefix + "softmax_in_amax"], - init_dict[prefix + "softmax_out_amax"] - ] - pf_dq = trt.PluginField("dq_probs", np.array(dq_probs, np.float32), trt.PluginFieldType.FLOAT32) - fields.append(pf_dq) - - pfc = trt.PluginFieldCollection(fields) - qkv2ctx_plug = qkv2_plg_creator.create_plugin("qkv2ctx", pfc) - - qkv_in = [mult_all.get_output(0)] - if has_mask: - qkv_in.append(imask) - qkv2ctx = network.add_plugin_v2(qkv_in, qkv2ctx_plug) - if config.use_int8: - set_output_range(qkv2ctx, init_dict[prefix + "output_dense_in_amax"]) - return qkv2ctx - - -def skipln(prefix, config, init_dict, network, input_tensor, skip, residual, is_last_layer, bias=None): - """ - Add the skip layer - """ - idims = input_tensor.shape - hidden_size = idims[2] - - dtype = trt.float32 - if config.use_int8: - dtype = trt.int8 - - wbeta = init_dict[prefix + "beta"] - wgamma = init_dict[prefix + "gamma"] - - pf_ld = trt.PluginField("ld", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32) - pf_beta = trt.PluginField("beta", wbeta, trt.PluginFieldType.FLOAT32) - pf_gamma = trt.PluginField("gamma", wgamma, trt.PluginFieldType.FLOAT32) - pf_type = trt.PluginField("type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32) - - fields = [pf_ld, pf_beta, pf_gamma, pf_type ] - if bias is not None: - pf_bias = trt.PluginField("bias", bias, trt.PluginFieldType.FLOAT32) - fields.append(pf_bias) - if is_last_layer: - pf_fp32 = trt.PluginField("output_fp32", np.array([1], np.int32), trt.PluginFieldType.INT32) - fields.append(pf_fp32) - - pfc = trt.PluginFieldCollection(fields) - skipln_plug = skln_plg_creator.create_plugin("skipln", pfc) - - skipln_inputs = [input_tensor, skip] - if config.use_int8: - skipln_inputs.append(residual) - layer = network.add_plugin_v2(skipln_inputs, skipln_plug) - return layer - -def ffn(prefix, config, init_dict, network, input_tensor, residual, 
is_last_layer): - # FC1 + GELU - B_mid = init_dict[prefix + B_MID] - W_mid = init_dict[prefix + W_MID] - - mid_dense = custom_fc(prefix + "intermediate_dense_", config, init_dict, network, input_tensor, config.intermediate_size, W_mid, None) - set_output_range(mid_dense, init_dict[prefix + "intermediate_dense_out_amax"]) - - dtype = trt.float32 - - if config.use_int8: - dtype = trt.int8 - - pf_type = trt.PluginField("type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32) - pf_ld = trt.PluginField("ld", np.array([int(config.intermediate_size)], np.int32), trt.PluginFieldType.INT32) - fields = [pf_type, pf_ld] - if config.use_int8: - pf_bias = trt.PluginField("bias", B_mid, trt.PluginFieldType.FLOAT32) - fields.append(pf_bias) - - pfc = trt.PluginFieldCollection(fields) - gelu_plug = gelu_plg_creator.create_plugin("gelu", pfc) - - gelu_inputs = [mid_dense.get_output(0)] - gelu_layer = network.add_plugin_v2(gelu_inputs, gelu_plug) - - if config.use_int8: - set_output_range(gelu_layer, init_dict[prefix + "output_dense_in_amax"]) - - intermediate_act = gelu_layer.get_output(0) - # set_tensor_name(intermediate_act, prefix, "gelu") - - # FC2 - # Dense to hidden size - B_lout = init_dict[prefix + B_LOUT] - W_lout = init_dict[prefix + W_LOUT] - out_dense = custom_fc(prefix + "output_dense_", config, init_dict, network, intermediate_act, config.hidden_size, W_lout, None) - set_output_range(out_dense, init_dict[prefix + "output_dense_out_amax"]) - - out_layer = skipln(prefix + "output_layernorm_", config, init_dict, network, out_dense.get_output(0), input_tensor, residual, is_last_layer, B_lout) - return out_layer - -def transformer_layer_opt(prefix, config, init_dict, network, input_tensor, imask, residual, is_last_layer): - """ - Add the transformer layer - """ - idims = input_tensor.shape - hidden_size = idims[2] - - context_transposed = attention_layer_opt(prefix + "attention_", config, init_dict, network, input_tensor, imask) - attention_heads = context_transposed.get_output(0) - - # FC0 - B_aout = init_dict[prefix + B_AOUT] - W_aout = init_dict[prefix + W_AOUT] - attention_out_fc = custom_fc(prefix + "attention_output_dense_", config, init_dict, network, attention_heads, hidden_size, W_aout, None) - set_output_range(attention_out_fc, init_dict[prefix + "attention_output_dense_out_amax"]) - - skiplayer = skipln(prefix + "attention_output_layernorm_", config, init_dict, network, attention_out_fc.get_output(0), input_tensor, residual, False, B_aout) - if config.use_int8: - set_output_range(skiplayer, init_dict[prefix + "intermediate_dense_in_amax"]) - - ffn_layer = ffn(prefix, config, init_dict, network, skiplayer.get_output(0), skiplayer.get_output(1), is_last_layer) - return ffn_layer - -def bert_model(config, init_dict, network, input_tensor, input_mask, residual): - """ - Create the bert model - """ - prev_input = input_tensor - for layer in range(0, config.num_hidden_layers): - ss = "l{}_".format(layer) - out_layer = transformer_layer_opt(ss, config, init_dict, network, prev_input, input_mask, residual, - True if config.use_int8 and layer == config.num_hidden_layers - 1 else False) - prev_input = out_layer.get_output(0) - residual = None - if config.use_int8: - residual = out_layer.get_output(1) - if layer < config.num_hidden_layers - 1: - set_output_range(out_layer, init_dict["l{}_".format(layer+1) + "attention_self_qkv_in_amax"]) - else: - set_output_range(out_layer, 1) - - return prev_input - -def squad_output(prefix, config, init_dict, network, input_tensor): - """ - 
Create the squad output - """ - - idims = input_tensor.shape - B, S, hidden_size = idims - - W_out = init_dict[prefix + SQD_W] - B_out = init_dict[prefix + SQD_B] - - dense = custom_fc_fp16(network, input_tensor, 2, W_out, B_out) - return dense - -def emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes): - input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0])) - segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0])) - input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0])) - - if len(sequence_lengths) > 1: - profile = builder.create_optimization_profile() - min_shape = (batch_sizes[0], sequence_lengths[0]) - opt_shape = (batch_sizes[1], sequence_lengths[1]) - max_shape = (batch_sizes[2], sequence_lengths[2]) - assert(sequence_lengths[0] <= sequence_lengths[1] and sequence_lengths[1] <= sequence_lengths[2]) - - print('set dynamic shape -> ', min_shape, opt_shape, max_shape) - profile.set_shape("input_ids", min_shape, opt_shape, max_shape) - profile.set_shape("segment_ids", min_shape, opt_shape, max_shape) - profile.set_shape("input_mask", min_shape, opt_shape, max_shape) - builder_config.add_optimization_profile(profile) - - wbeta = trt.PluginField("bert_embeddings_layernorm_beta", weights_dict["bert_embeddings_layernorm_beta"], trt.PluginFieldType.FLOAT32) - wgamma = trt.PluginField("bert_embeddings_layernorm_gamma", weights_dict["bert_embeddings_layernorm_gamma"], trt.PluginFieldType.FLOAT32) - wwordemb = trt.PluginField("bert_embeddings_word_embeddings", weights_dict["bert_embeddings_word_embeddings"], trt.PluginFieldType.FLOAT32) - wtokemb = trt.PluginField("bert_embeddings_token_type_embeddings", weights_dict["bert_embeddings_token_type_embeddings"], trt.PluginFieldType.FLOAT32) - wposemb = trt.PluginField("bert_embeddings_position_embeddings", weights_dict["bert_embeddings_position_embeddings"], trt.PluginFieldType.FLOAT32) - - output_fp16 = trt.PluginField("output_fp16", np.array([1]).astype(np.int32), trt.PluginFieldType.INT32) - mha_type = trt.PluginField("mha_type_id", np.array([get_mha_dtype(config)], np.int32), trt.PluginFieldType.INT32) - - pfc = trt.PluginFieldCollection([wbeta, wgamma, wwordemb, wtokemb, wposemb, output_fp16, mha_type]) - fn = emln_plg_creator.create_plugin("embeddings", pfc) - - inputs = [input_ids, segment_ids, input_mask] - emb_layer = network.add_plugin_v2(inputs, fn) - - if config.use_int8: - set_output_range(emb_layer, weights_dict["l0_attention_self_qkv_in_amax"]) - set_output_range(emb_layer, 1.0, 1) - return emb_layer - -def build_engine(batch_sizes, sequence_lengths, config, weights_dict): - explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) - - builder = trt.Builder(TRT_LOGGER) - with builder.create_network(explicit_batch_flag) as network, builder.create_builder_config() as builder_config: - network = builder.create_network(explicit_batch_flag) - builder_config = builder.create_builder_config() - builder_config.set_flag(trt.BuilderFlag.INT8) - - # Create the network - emb_layer = emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes) - embeddings = 
emb_layer.get_output(0) - mask_idx = emb_layer.get_output(1) - - residual_buffer = None - if config.use_int8: - residual_buffer = emb_layer.get_output(2) - - bert_out = bert_model(config, weights_dict, network, embeddings, mask_idx, residual_buffer) - - squad_logits = squad_output("cls_", config, weights_dict, network, bert_out) - squad_logits_out = squad_logits.get_output(0) - - network.mark_output(squad_logits_out) - - build_start_time = time.time() - plan = builder.build_serialized_network(network, builder_config) - build_time_elapsed = (time.time() - build_start_time) - TRT_LOGGER.log(TRT_LOGGER.INFO, "build engine in {:.3f} Sec".format(build_time_elapsed)) - return plan - -def main(): - parser = argparse.ArgumentParser(description="TensorRT BERT Sample", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("-x", "--onnx", required=False, help="The ONNX model file path.") - parser.add_argument("-pt", "--pytorch", required=False, help="The PyTorch checkpoint file path.") - parser.add_argument("-o", "--output", required=True, default="bert_base_384.engine", help="The bert engine file, ex bert.engine") - parser.add_argument("-b", "--batch-size", nargs='+', help="Batch size(s) to optimize for. The engine will be usable with any batch size below this, but may not be optimal for smaller sizes. Can be specified multiple times to optimize for more than one batch size.", type=int) - parser.add_argument("-s", "--sequence-length", nargs='+', help="Sequence length of the BERT model", type=int) - parser.add_argument("-c", "--config-dir", required=True, - help="The folder containing the bert_config.json, which can be downloaded e.g. from https://github.com/google-research/bert#pre-trained-models or by running download_models.py in dle/TensorFlow/LanguageModeling/BERT/data/pretrained_models_google") - parser.add_argument("-f", "--fp16", action="store_true", help="Indicates that inference should be run in FP16 precision", required=False) - parser.add_argument("-i", "--int8", action="store_true", help="Indicates that inference should be run in INT8 precision", required=False) - parser.add_argument("-j", "--squad-json", default="squad/dev-v1.1.json", help="squad json dataset used for int8 calibration", required=False) - parser.add_argument("-v", "--vocab-file", default="./pre-trained_model/uncased_L-24_H-1024_A-16/vocab.txt", help="Path to file containing entire understandable vocab", required=False) - parser.add_argument("--verbose", action="store_true", help="Turn on verbose logger and set profiling verbosity to DETAILED", required=False) - - args, _ = parser.parse_known_args() - args.batch_size = args.batch_size or [1] - args.sequence_length = args.sequence_length or [128] - - if len(args.sequence_length) not in [1, 3]: - print("Error: You must provide either one or three integers.") - sys.exit(1) - - if len(args.batch_size) not in [1, 3]: - print("Error: You must provide either one or three integers.") - sys.exit(1) - - if args.verbose: - TRT_LOGGER.min_severity = TRT_LOGGER.VERBOSE - - bert_config_path = os.path.join(args.config_dir, "config.json") - TRT_LOGGER.log(TRT_LOGGER.INFO, "Using configuration file: {:}".format(bert_config_path)) - - config = BertConfig(bert_config_path, args.int8) - - if args.onnx != None: - if args.int8: - raise RuntimeError("int8 onnx not supported now!!!") - elif args.pytorch != None: - weights_dict = load_pytorch_weights_and_quant(args.pytorch, config) - else: - raise RuntimeError("You need either specify TF checkpoint using option --ckpt or 
ONNX using option --onnx to build TRT BERT model.") - - # engine = build_engine(args.batch_size, args.workspace_size, args.sequence_length, config, weights_dict, args.squad_json, args.vocab_file, None, args.calib_num, args.verbose) - with build_engine(args.batch_size, args.sequence_length, config, weights_dict) as serialized_engine: - TRT_LOGGER.log(TRT_LOGGER.INFO, "Saving Engine to {:}".format(args.output)) - with open(args.output, "wb") as fout: - fout.write(serialized_engine) - TRT_LOGGER.log(TRT_LOGGER.INFO, "Done.") - -if __name__ == "__main__": - main() diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/builder_utils_int8.py b/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/builder_utils_int8.py deleted file mode 100644 index 67a53f05..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/builder_utils_int8.py +++ /dev/null @@ -1,208 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
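Before the weight-name mapping utilities below: the `build_engine`/`main` pair deleted above is a standard TensorRT INT8 build-and-serialize flow, obscured slightly by the fact that the network and builder config are created twice (once in the `with` statement and again immediately inside it). Stripped to its essentials it looks roughly like the sketch below, with `populate_network` standing in for the plugin-based graph construction; this is a reading aid, not part of the patch:

import time
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.ERROR)

def build_and_save(populate_network, output_path):
    # Explicit-batch network in INT8 mode; per-tensor ranges come from the
    # QAT amax values applied via set_dynamic_range in the deleted builder.
    flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(flag)
    builder_config = builder.create_builder_config()
    builder_config.set_flag(trt.BuilderFlag.INT8)

    populate_network(network)  # embeddings -> transformer layers -> SQuAD head

    start = time.time()
    plan = builder.build_serialized_network(network, builder_config)
    TRT_LOGGER.log(TRT_LOGGER.INFO, "build engine in {:.3f} Sec".format(time.time() - start))
    with open(output_path, "wb") as fout:
        fout.write(plan)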
-import numpy as np -import tensorrt as trt -import json -import struct -import torch - -TRT_LOGGER = trt.Logger(trt.Logger.INFO) - -""" -Attentions Keys -""" -WQ = "self_query_kernel" -BQ = "self_query_bias" -WK = "self_key_kernel" -BK = "self_key_bias" -WV = "self_value_kernel" -BV = "self_value_bias" -WQKV = "self_qkv_kernel" -BQKV = "self_qkv_bias" - -""" -Transformer Keys -""" -W_AOUT = "attention_output_dense_kernel" -B_AOUT = "attention_output_dense_bias" -AOUT_LN_BETA = "attention_output_layernorm_beta" -AOUT_LN_GAMMA = "attention_output_layernorm_gamma" -W_MID = "intermediate_dense_kernel" -B_MID = "intermediate_dense_bias" -W_LOUT = "output_dense_kernel" -B_LOUT = "output_dense_bias" -LOUT_LN_BETA = "output_layernorm_beta" -LOUT_LN_GAMMA = "output_layernorm_gamma" - -""" -Squad Output Keys -""" -SQD_W = "squad_output_weights" -SQD_B = "squad_output_bias" - -ixrt_name_map = { - "bert.embeddings.LayerNorm.bias": "bert_embeddings_layernorm_beta", - "bert.embeddings.LayerNorm.weight" : "bert_embeddings_layernorm_gamma", - "bert.embeddings.word_embeddings.weight" : "bert_embeddings_word_embeddings", - "bert.embeddings.token_type_embeddings.weight" : "bert_embeddings_token_type_embeddings", - "bert.embeddings.position_embeddings.weight" : "bert_embeddings_position_embeddings", - "qa_outputs.weight" : "cls_squad_output_weights", - "qa_outputs.bias" : "cls_squad_output_bias" -} - -ixrt_atten_name_map = { - "bert.encoder.layer.{}.self_attn.qkv_proj.weight" : "l{}_attention_self_qkv_kernel", - "bert.encoder.layer.{}.self_attn.qkv_proj.bias" : "l{}_attention_self_qkv_bias", - "bert.encoder.layer.{}.self_attn.out_proj.bias" : "l{}_attention_output_dense_bias", - "bert.encoder.layer.{}.self_attn.out_proj.weight" : "l{}_attention_output_dense_kernel", - "bert.encoder.layer.{}.fc1.weight" : "l{}_intermediate_dense_kernel", - "bert.encoder.layer.{}.fc1.bias" : "l{}_intermediate_dense_bias", - "bert.encoder.layer.{}.fc2.weight" : "l{}_output_dense_kernel", - "bert.encoder.layer.{}.fc2.bias" : "l{}_output_dense_bias", - "bert.encoder.layer.{}.self_attn_layer_norm.weight" : "l{}_attention_output_layernorm_gamma", - "bert.encoder.layer.{}.self_attn_layer_norm.bias" : "l{}_attention_output_layernorm_beta", - "bert.encoder.layer.{}.final_layer_norm.weight" : "l{}_output_layernorm_gamma", - "bert.encoder.layer.{}.final_layer_norm.bias" : "l{}_output_layernorm_beta", - "bert.encoder.layer.{}.self_attn.qkv_proj.weight_quant.clip.clip_value_max" : "l{}_attention_self_qkv_wei_amax", - "bert.encoder.layer.{}.self_attn.qkv_proj.input_quant.clip.clip_value_max" : "l{}_attention_self_qkv_in_amax", - "bert.encoder.layer.{}.self_attn.qkv_proj.output_quant.clip.clip_value_max" : "l{}_attention_self_qkv_out_amax", - "bert.encoder.layer.{}.self_attn.attention_quant.clip.clip_value_max" : "l{}_attention_arrange_qkv_amax", - "bert.encoder.layer.{}.self_attn.softmax_in_quant.clip.clip_value_max" : "l{}_attention_softmax_in_amax", - "bert.encoder.layer.{}.self_attn.atten_score_out_quant.clip.clip_value_max" : "l{}_attention_softmax_out_amax", - "bert.encoder.layer.{}.self_attn.out_proj.input_quant.clip.clip_value_max" : "l{}_attention_output_dense_in_amax", - "bert.encoder.layer.{}.self_attn.out_proj.output_quant.clip.clip_value_max" : "l{}_attention_output_dense_out_amax", - "bert.encoder.layer.{}.self_attn.out_proj.weight_quant.clip.clip_value_max" : "l{}_attention_output_dense_wei_amax", - "bert.encoder.layer.{}.fc1.input_quant.clip.clip_value_max" : "l{}_intermediate_dense_in_amax", - 
"bert.encoder.layer.{}.fc1.output_quant.clip.clip_value_max" : "l{}_intermediate_dense_out_amax", - "bert.encoder.layer.{}.fc1.weight_quant.clip.clip_value_max" : "l{}_intermediate_dense_wei_amax", - "bert.encoder.layer.{}.fc2.input_quant.clip.clip_value_max" : "l{}_output_dense_in_amax", - "bert.encoder.layer.{}.fc2_out_quant.clip.clip_value_max" : "l{}_output_dense_out_amax", - "bert.encoder.layer.{}.fc2.weight_quant.clip.clip_value_max" : "l{}_output_dense_wei_amax" -} - -def get_weight_dict(tensor_dict, config): - N = config.num_attention_heads - H = config.head_size - hidden_size = config.hidden_size - - weights_dict = dict() - for outname, tensor in tensor_dict.items(): - if outname.find("_amax") != -1: - weights_dict[outname] = tensor.item() - elif outname.find(BQ) != -1: - prefix = outname[:outname.find(BQ)] - - Wqkv = np.zeros((3, hidden_size, hidden_size), np.float32) - Bqkv = np.zeros((3, hidden_size), np.float32) - - Wqkv[0,:,:] = tensor_dict[prefix + WQ] - Wqkv[1,:,:] = tensor_dict[prefix + WK] - Wqkv[2,:,:] = tensor_dict[prefix + WV] - Bqkv[0,:] = tensor - Bqkv[1,:] = tensor_dict[prefix + BK] - Bqkv[2,:] = tensor_dict[prefix + BV] - - weights_dict[prefix + WQKV] = Wqkv.flatten() - weights_dict[prefix + BQKV] = Bqkv.flatten() - elif outname.find(BK) != -1 or outname.find(BV) != -1 or outname.find(WQ) != -1 or outname.find(WK) != -1 or outname.find(WV) != -1: - pass - else: - flat_tensor = np.ascontiguousarray(tensor).flatten() - weights_dict[outname] = flat_tensor - - return weights_dict - -def pytorch_to_trt_name(state_dict, num_layer): - tensor_dict = {} - for name in ixrt_name_map.keys(): - tensor_dict[ixrt_name_map[name]] = state_dict[name] - - for name in ixrt_atten_name_map.keys(): - for layer_id in range(num_layer): - key_name = name.format(layer_id) - value_name = ixrt_atten_name_map[name].format(layer_id) - tensor_dict[value_name] = state_dict[key_name] - return tensor_dict - -def load_pytorch_weights_and_quant(path, config): - """ - Load the weights from the pytorch checkpoint - """ - state_dict = torch.load(path, map_location='cpu') - tensor_dict = pytorch_to_trt_name(state_dict, config.num_hidden_layers) - return get_weight_dict(tensor_dict, config) - -class BertConfig: - def __init__(self, bert_config_path, use_fp16, use_int8=False, use_trt=False): - with open(bert_config_path, "r") as f: - data = json.load(f) - self.num_attention_heads = data["num_attention_heads"] - self.hidden_size = data["hidden_size"] - self.intermediate_size = data["intermediate_size"] - self.num_hidden_layers = data["num_hidden_layers"] - self.head_size = self.hidden_size // self.num_attention_heads - self.use_fp16 = use_fp16 - self.use_int8 = use_int8 - self.use_trt = use_trt - -if __name__ == '__main__': - bert_config_path = './data/bert-large-uncased/bert_config.json' - pytorch_model_path = './data/bert-large-uncased/bert_large_int8_qat.bin' - weight_save_path = "./data/bert-large-uncased/bert_large_v1_1_int8.wts" - config = BertConfig(bert_config_path, True) - weights_dict = load_pytorch_weights_and_quant(pytorch_model_path, config) - f = open(weight_save_path, "w") - num = 0 - for key, value in weights_dict.items(): - if key.find('_amax') == -1: - num += 1 - - f.write('{}\n'.format(num)) - for key, value in weights_dict.items(): - if key.find('_amax') != -1: - continue - print('key: ', key) - f.write("{} {}".format(key, len(value))) - print(len(value)) - for v in value: - f.write(" ") - f.write(struct.pack('>f', float(v)).hex()) - f.write("\n") - - 
f.write('{}\n'.format(len(weights_dict) - num)) - for key, value in weights_dict.items(): - if key.find('_amax') == -1: - continue - print('key: ', key) - print('value: ', value) - f.write('{} '.format(key)) - f.write(struct.pack('>f', float(weights_dict[key])).hex()) - f.write('\n') diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/evaluate.py b/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/evaluate.py deleted file mode 100644 index 49b0dede..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/evaluate.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Official evaluation script for v1.1 of the SQuAD dataset. 
""" - -import argparse -import json -import re -import string -import sys -from collections import Counter - - -def normalize_answer(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - - def remove_articles(text): - return re.sub(r"\b(a|an|the)\b", " ", text) - - def white_space_fix(text): - return " ".join(text.split()) - - def remove_punc(text): - exclude = set(string.punctuation) - return "".join(ch for ch in text if ch not in exclude) - - def lower(text): - return text.lower() - - return white_space_fix(remove_articles(remove_punc(lower(s)))) - - -def f1_score(prediction, ground_truth): - prediction_tokens = normalize_answer(prediction).split() - ground_truth_tokens = normalize_answer(ground_truth).split() - common = Counter(prediction_tokens) & Counter(ground_truth_tokens) - num_same = sum(common.values()) - if num_same == 0: - return 0 - precision = 1.0 * num_same / len(prediction_tokens) - recall = 1.0 * num_same / len(ground_truth_tokens) - f1 = (2 * precision * recall) / (precision + recall) - return f1 - - -def exact_match_score(prediction, ground_truth): - return normalize_answer(prediction) == normalize_answer(ground_truth) - - -def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): - scores_for_ground_truths = [] - for ground_truth in ground_truths: - score = metric_fn(prediction, ground_truth) - scores_for_ground_truths.append(score) - return max(scores_for_ground_truths) - - -def evaluate(dataset, predictions): - f1 = exact_match = total = 0 - for article in dataset: - for paragraph in article["paragraphs"]: - for qa in paragraph["qas"]: - total += 1 - if qa["id"] not in predictions: - message = ( - "Unanswered question " + qa["id"] + " will receive score 0." - ) - print(message, file=sys.stderr) - continue - ground_truths = list(map(lambda x: x["text"], qa["answers"])) - prediction = predictions[qa["id"]] - exact_match += metric_max_over_ground_truths( - exact_match_score, prediction, ground_truths - ) - f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths) - - exact_match = 100.0 * exact_match / total - f1 = 100.0 * f1 / total - - return {"exact_match": exact_match, "f1": f1} - - -if __name__ == "__main__": - expected_version = "1.1" - parser = argparse.ArgumentParser( - description="Evaluation for SQuAD " + expected_version - ) - parser.add_argument("dataset_file", help="Dataset file") - parser.add_argument("prediction_file", help="Prediction File") - args = parser.parse_args() - with open(args.dataset_file) as dataset_file: - dataset_json = json.load(dataset_file) - if dataset_json["version"] != expected_version: - print( - "Evaluation expects v-" - + expected_version - + ", but got dataset with v-" - + dataset_json["version"], - file=sys.stderr, - ) - dataset = dataset_json["data"] - with open(args.prediction_file) as prediction_file: - predictions = json.load(prediction_file) - print(json.dumps(evaluate(dataset, predictions))) diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/helpers/__init__.py b/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/helpers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/helpers/data_processing.py b/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/helpers/data_processing.py deleted file mode 100644 index 712e1a61..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/helpers/data_processing.py +++ /dev/null @@ -1,497 +0,0 @@ -#!/usr/bin/env python3 -# -# SPDX-FileCopyrightText: 
Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import helpers.tokenization as tokenization -import collections -import numpy as np -import six -import math -import json - - -def convert_doc_tokens(paragraph_text): - - """ Return the list of tokens from the doc text """ - def is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - doc_tokens = [] - prev_is_whitespace = True - for c in paragraph_text: - if is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - - return doc_tokens - - -def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - - # Because of the sliding window approach taken to scoring documents, a single - # token can appear in multiple documents. E.g. - # Doc: the man went to the store and bought a gallon of milk - # Span A: the man went to the - # Span B: to the store and bought - # Span C: and bought a gallon of - # ... - # - # Now the word 'bought' will have two scores from spans B and C. We only - # want to consider the score with "maximum context", which we define as - # the *minimum* of its left and right context (the *sum* of left and - # right context will always be the same, of course). - # - # In the example the maximum context for 'bought' would be span C since - # it has 1 left context and 3 right context, while span B has 4 left context - # and 0 right context. - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - -def convert_example_to_features(doc_tokens, question_text, tokenizer, max_seq_length, - doc_stride, max_query_length): - """Loads a data file into a list of `InputBatch`s.""" - - query_tokens = tokenizer.tokenize(question_text) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - # The -3 accounts for [CLS], [SEP] and [SEP] - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - - # We can have documents that are longer than the maximum sequence length. 
- # To deal with this we do a sliding window approach, where we take chunks - # of the up to our max length with a stride of `doc_stride`. - _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - _Feature = collections.namedtuple( # pylint: disable=invalid-name - "Feature", - ["input_ids", "input_mask", "segment_ids", "tokens", "token_to_orig_map", "token_is_max_context"]) - - - features = [] - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in query_tokens: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. - # while len(input_ids) < max_seq_length: - # input_ids.append(0) - # input_mask.append(0) - # segment_ids.append(0) - - # assert len(input_ids) == max_seq_length - # assert len(input_mask) == max_seq_length - # assert len(segment_ids) == max_seq_length - - def create_int_feature(values): - feature = np.asarray(values, dtype=np.int32, order=None) - return feature - - - features.append(_Feature( - input_ids = create_int_feature(input_ids), - input_mask = create_int_feature(input_mask), - segment_ids = create_int_feature(segment_ids), - tokens = tokens, - token_to_orig_map = token_to_orig_map, - token_is_max_context = token_is_max_context - )) - return features - - -def read_squad_json(input_file): - """read from squad json into a list of examples""" - with open(input_file, "r", encoding='utf-8') as reader: - input_data = json.load(reader)["data"] - - _Example = collections.namedtuple( # pylint: disable=invalid-name - "Example", - ["id", "question_text", "doc_tokens"]) - - examples = [] - for entry in input_data: - for paragraph in entry["paragraphs"]: - paragraph_text = paragraph["context"] - doc_tokens = convert_doc_tokens(paragraph_text) - - for qa in paragraph["qas"]: - examples.append(_Example( - id = qa["id"], - question_text = qa["question"], - doc_tokens = doc_tokens - )) - - return examples - - -def _get_best_indexes(logits, n_best_size): - """Get the n-best logits from a list.""" - - index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) - - best_indexes = [] - for i in range(len(index_and_score)): - if i >= n_best_size: - break - best_indexes.append(index_and_score[i][0]) - return best_indexes - - -def get_final_text(pred_text, orig_text, do_lower_case): - """Project the tokenized prediction back to the original text.""" - 
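Stepping back for a moment to `convert_example_to_features` above: its sliding-window chunking is easiest to see with concrete numbers. A toy reproduction of just the span computation follows (illustrative, not part of the patch):

import collections

DocSpan = collections.namedtuple("DocSpan", ["start", "length"])

def make_doc_spans(num_doc_tokens, max_tokens_for_doc, doc_stride):
    # Chunks of at most max_tokens_for_doc tokens, advancing by doc_stride,
    # exactly as the deleted helper builds its doc_spans list.
    spans, start_offset = [], 0
    while start_offset < num_doc_tokens:
        length = min(num_doc_tokens - start_offset, max_tokens_for_doc)
        spans.append(DocSpan(start=start_offset, length=length))
        if start_offset + length == num_doc_tokens:
            break
        start_offset += min(length, doc_stride)
    return spans

# With a 1000-token document, max_seq_length 384, an empty query (384 - 0 - 3
# = 381 usable positions) and the doc_stride of 128 used by the deleted code:
#     make_doc_spans(1000, 381, 128)
#     -> spans starting at 0, 128, 256, 384, 512, 640 (the last of length 360)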
- # When we created the data, we kept track of the alignment between original - # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So - # now `orig_text` contains the span of our original text corresponding to the - # span that we predicted. - # - # However, `orig_text` may contain extra characters that we don't want in - # our prediction. - # - # For example, let's say: - # pred_text = steve smith - # orig_text = Steve Smith's - # - # We don't want to return `orig_text` because it contains the extra "'s". - # - # We don't want to return `pred_text` because it's already been normalized - # (the SQuAD eval script also does punctuation stripping/lower casing but - # our tokenizer does additional normalization like stripping accent - # characters). - # - # What we really want to return is "Steve Smith". - # - # Therefore, we have to apply a semi-complicated alignment heruistic between - # `pred_text` and `orig_text` to get a character-to-charcter alignment. This - # can fail in certain cases in which case we just return `orig_text`. - - def _strip_spaces(text): - ns_chars = [] - ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): - if c == " ": - continue - ns_to_s_map[len(ns_chars)] = i - ns_chars.append(c) - ns_text = "".join(ns_chars) - return (ns_text, ns_to_s_map) - - # We first tokenize `orig_text`, strip whitespace from the result - # and `pred_text`, and check if they are the same length. If they are - # NOT the same length, the heuristic has failed. If they are the same - # length, we assume the characters are one-to-one aligned. - tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case) - - tok_text = " ".join(tokenizer.tokenize(orig_text)) - - start_position = tok_text.find(pred_text) - if start_position == -1: - return orig_text - end_position = start_position + len(pred_text) - 1 - - (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) - (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) - - if len(orig_ns_text) != len(tok_ns_text): - return orig_text - - # We then project the characters in `pred_text` back to `orig_text` using - # the character-to-character alignment. 
- tok_s_to_ns_map = {} - for (i, tok_index) in six.iteritems(tok_ns_to_s_map): - tok_s_to_ns_map[tok_index] = i - - orig_start_position = None - if start_position in tok_s_to_ns_map: - ns_start_position = tok_s_to_ns_map[start_position] - if ns_start_position in orig_ns_to_s_map: - orig_start_position = orig_ns_to_s_map[ns_start_position] - - if orig_start_position is None: - return orig_text - - orig_end_position = None - if end_position in tok_s_to_ns_map: - ns_end_position = tok_s_to_ns_map[end_position] - if ns_end_position in orig_ns_to_s_map: - orig_end_position = orig_ns_to_s_map[ns_end_position] - - if orig_end_position is None: - return orig_text - - output_text = orig_text[orig_start_position:(orig_end_position + 1)] - return output_text - - -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs - - -def get_predictions(doc_tokens, features, results, n_best_size, max_answer_length): - _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) - - prediction = "" - scores_diff_json = 0.0 - - prelim_predictions = [] - # keep track of the minimum score of null start+end of position 0 - score_null = 1000000 # large and positive - min_null_feature_index = 0 # the paragraph slice with min mull score - null_start_logit = 0 # the start logit at the slice with min null score - null_end_logit = 0 # the end logit at the slice with min null score - version_2_with_negative = False - - for result in results: - start_indexes = _get_best_indexes(result.start_logits, n_best_size) - end_indexes = _get_best_indexes(result.end_logits, n_best_size) - feature = features[result.feature_index] - - # if we could have irrelevant answers, get the min score of irrelevant - if version_2_with_negative: - feature_null_score = result.start_logits[0] + result.end_logits[0] - if feature_null_score < score_null: - score_null = feature_null_score - min_null_feature_index = 0 - null_start_logit = result.start_logits[0] - null_end_logit = result.end_logits[0] - - for start_index in start_indexes: - for end_index in end_indexes: - # We could hypothetically create invalid predictions, e.g., predict - # that the start of the span is in the question. We throw out all - # invalid predictions. 
- if start_index >= len(feature.tokens): - continue - if end_index >= len(feature.tokens): - continue - if start_index not in feature.token_to_orig_map: - continue - if end_index not in feature.token_to_orig_map: - continue - if not feature.token_is_max_context.get(start_index, False): - continue - if end_index < start_index: - continue - length = end_index - start_index + 1 - if length > max_answer_length: - continue - prelim_predictions.append( - _PrelimPrediction( - feature_index=result.feature_index, - start_index=start_index, - end_index=end_index, - start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) - - if version_2_with_negative: - prelim_predictions.append( - _PrelimPrediction( - feature_index=result.feature_index, - start_index=0, - end_index=0, - start_logit=null_start_logit, - end_logit=null_end_logit)) - - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True) - - _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"]) - - seen_predictions = {} - nbest = [] - for pred in prelim_predictions: - if len(nbest) >= n_best_size: - break - - if pred.start_index > 0: # this is a non-null prediction - feature = features[pred.feature_index] - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] - orig_doc_start = feature.token_to_orig_map[pred.start_index] - orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = doc_tokens[orig_doc_start:(orig_doc_end + 1)] - tok_text = " ".join(tok_tokens) - - # De-tokenize WordPieces that have been split off. - tok_text = tok_text.replace(" ##", "") - tok_text = tok_text.replace("##", "") - - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) - - final_text = get_final_text(tok_text, orig_text, True) - if final_text in seen_predictions: - continue - - seen_predictions[final_text] = True - else: - final_text = "" - seen_predictions[final_text] = True - - if len(final_text): - nbest.append( - _NbestPrediction( - text=final_text, - start_logit=pred.start_logit, - end_logit=pred.end_logit)) - - # if we didn't inlude the empty option in the n-best, inlcude it - if version_2_with_negative: - if "" not in seen_predictions: - nbest.append( - _NbestPrediction( - text="", start_logit=null_start_logit, - end_logit=null_end_logit)) - # In very rare edge cases we could have no valid predictions. So we - # just create a nonce prediction in this case to avoid failure. 
- if not nbest: - nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) - - assert len(nbest) >= 1 - - total_scores = [] - best_non_null_entry = None - for entry in nbest: - total_scores.append(entry.start_logit + entry.end_logit) - if not best_non_null_entry: - if entry.text: - best_non_null_entry = entry - - probs = _compute_softmax(total_scores) - - nbest_json = [] - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_logit"] = entry.start_logit - output["end_logit"] = entry.end_logit - nbest_json.append(output) - - assert len(nbest_json) >= 1 - - null_score_diff_threshold = 0.0 - if not version_2_with_negative: - prediction = nbest_json[0]["text"] - else: - # predict "" iff the null score - the score of best non-null > threshold - score_diff = score_null - best_non_null_entry.start_logit - ( - best_non_null_entry.end_logit) - scores_diff_json = score_diff - if score_diff > null_score_diff_threshold: - prediction = "" - else: - prediction = best_non_null_entry.text - - return prediction, nbest_json, scores_diff_json diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/helpers/tokenization.py b/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/helpers/tokenization.py deleted file mode 100644 index 434f411d..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/helpers/tokenization.py +++ /dev/null @@ -1,446 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections -import re -import unicodedata -import six - - -def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): - """Checks whether the casing config is consistent with the checkpoint name.""" - - # The casing has to be passed in by the user and there is no explicit check - # as to whether it matches the checkpoint. The casing information probably - # should have been stored in the bert_config.json file, but it's not, so - # we have to heuristically detect it to validate. 
-
-    if not init_checkpoint:
-        return
-
-    m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
-    if m is None:
-        return
-
-    model_name = m.group(1)
-
-    lower_models = [
-        "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
-        "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
-    ]
-
-    cased_models = [
-        "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
-        "multi_cased_L-12_H-768_A-12"
-    ]
-
-    is_bad_config = False
-    if model_name in lower_models and not do_lower_case:
-        is_bad_config = True
-        actual_flag = "False"
-        case_name = "lowercased"
-        opposite_flag = "True"
-
-    if model_name in cased_models and do_lower_case:
-        is_bad_config = True
-        actual_flag = "True"
-        case_name = "cased"
-        opposite_flag = "False"
-
-    if is_bad_config:
-        raise ValueError(
-            "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
-            "However, `%s` seems to be a %s model, so you "
-            "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
-            "how the model was pre-trained. If this error is wrong, please "
-            "just comment out this check." % (actual_flag, init_checkpoint,
-                                              model_name, case_name, opposite_flag))
-
-
-def convert_to_unicode(text):
-    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
-    if six.PY3:
-        if isinstance(text, str):
-            return text
-        elif isinstance(text, bytes):
-            return text.decode("utf-8", "ignore")
-        else:
-            raise ValueError("Unsupported string type: %s" % (type(text)))
-    elif six.PY2:
-        if isinstance(text, str):
-            return text.decode("utf-8", "ignore")
-        elif isinstance(text, unicode):
-            return text
-        else:
-            raise ValueError("Unsupported string type: %s" % (type(text)))
-    else:
-        raise ValueError("Not running on Python 2 or Python 3?")
-
-
-def printable_text(text):
-    """Returns text encoded in a way suitable for print or `tf.logging`."""
-
-    # These functions want `str` for both Python 2 and Python 3, but in one case
-    # it's a Unicode string and in the other it's a byte string.
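Since the six-based branches above exist only for Python 2 compatibility, on Python 3 both helpers collapse to the same str/bytes conversion; a minimal sketch (the name `to_text` is illustrative):

```python
def to_text(text):
    """Python 3-only equivalent of convert_to_unicode / printable_text."""
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % type(text))
```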
-    if six.PY3:
-        if isinstance(text, str):
-            return text
-        elif isinstance(text, bytes):
-            return text.decode("utf-8", "ignore")
-        else:
-            raise ValueError("Unsupported string type: %s" % (type(text)))
-    elif six.PY2:
-        if isinstance(text, str):
-            return text
-        elif isinstance(text, unicode):
-            return text.encode("utf-8")
-        else:
-            raise ValueError("Unsupported string type: %s" % (type(text)))
-    else:
-        raise ValueError("Not running on Python 2 or Python 3?")
-
-
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    index = 0
-    with open(vocab_file, "r", encoding='utf-8') as reader:
-        while True:
-            token = convert_to_unicode(reader.readline())
-            if not token:
-                break
-            token = token.strip()
-            vocab[token] = index
-            index += 1
-    return vocab
-
-
-def convert_by_vocab(vocab, items):
-    """Converts a sequence of [tokens|ids] using the vocab."""
-    output = []
-    for item in items:
-        output.append(vocab[item])
-    return output
-
-
-def convert_tokens_to_ids(vocab, tokens):
-    return convert_by_vocab(vocab, tokens)
-
-
-def convert_ids_to_tokens(inv_vocab, ids):
-    return convert_by_vocab(inv_vocab, ids)
-
-
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a piece of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-
-
-class FullTokenizer(object):
-    """Runs end-to-end tokenization."""
-
-    def __init__(self, vocab_file, do_lower_case=True):
-        self.vocab = load_vocab(vocab_file)
-        self.inv_vocab = {v: k for k, v in self.vocab.items()}
-        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
-
-    def tokenize(self, text):
-        split_tokens = []
-        for token in self.basic_tokenizer.tokenize(text):
-            for sub_token in self.wordpiece_tokenizer.tokenize(token):
-                split_tokens.append(sub_token)
-
-        return split_tokens
-
-    def convert_tokens_to_ids(self, tokens):
-        return convert_by_vocab(self.vocab, tokens)
-
-    def convert_ids_to_tokens(self, ids):
-        return convert_by_vocab(self.inv_vocab, ids)
-
-
-class BertTokenizer(object):
-    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
-
-    def __init__(self, vocab_file, do_lower_case=True):
-        self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict(
-            [(ids, tok) for tok, ids in self.vocab.items()])
-        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
-
-    def tokenize(self, text):
-        split_tokens = []
-        for token in self.basic_tokenizer.tokenize(text):
-            for sub_token in self.wordpiece_tokenizer.tokenize(token):
-                split_tokens.append(sub_token)
-        return split_tokens
-
-    def convert_tokens_to_ids(self, tokens):
-        """Converts a sequence of tokens into ids using the vocab."""
-        ids = []
-        for token in tokens:
-            ids.append(self.vocab[token])
-        return ids
-
-    def convert_ids_to_tokens(self, ids):
-        """Converts a sequence of ids into wordpiece tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            tokens.append(self.ids_to_tokens[i])
-        return tokens
-
-class BasicTokenizer(object):
-    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
-
-    def __init__(self, do_lower_case=True):
-        """Constructs a BasicTokenizer.
-
-        Args:
-            do_lower_case: Whether to lower case the input.
- """ - self.do_lower_case = do_lower_case - - def tokenize(self, text): - """Tokenizes a piece of text.""" - text = convert_to_unicode(text) - text = self._clean_text(text) - - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). - text = self._tokenize_chinese_chars(text) - - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. 
-        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
-            (cp >= 0x3400 and cp <= 0x4DBF) or  #
-            (cp >= 0x20000 and cp <= 0x2A6DF) or  #
-            (cp >= 0x2A700 and cp <= 0x2B73F) or  #
-            (cp >= 0x2B740 and cp <= 0x2B81F) or  #
-            (cp >= 0x2B820 and cp <= 0x2CEAF) or
-            (cp >= 0xF900 and cp <= 0xFAFF) or  #
-            (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xfffd or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-class WordpieceTokenizer(object):
-    """Runs WordPiece tokenization."""
-
-    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.max_input_chars_per_word = max_input_chars_per_word
-
-    def tokenize(self, text):
-        """Tokenizes a piece of text into its word pieces.
-
-        This uses a greedy longest-match-first algorithm to perform tokenization
-        using the given vocabulary.
-
-        For example:
-            input = "unaffable"
-            output = ["un", "##aff", "##able"]
-
-        Args:
-            text: A single token or whitespace separated tokens. This should have
-                already been passed through `BasicTokenizer`.
-
-        Returns:
-            A list of wordpiece tokens.
-        """
-
-        text = convert_to_unicode(text)
-
-        output_tokens = []
-        for token in whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
-
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = "".join(chars[start:end])
-                    if start > 0:
-                        substr = "##" + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
-
-            if is_bad:
-                output_tokens.append(self.unk_token)
-            else:
-                output_tokens.extend(sub_tokens)
-        return output_tokens
-
-
-def _is_whitespace(char):
-    """Checks whether `chars` is a whitespace character."""
-    # \t, \n, and \r are technically control characters but we treat them
-    # as whitespace since they are generally considered as such.
-    if char == " " or char == "\t" or char == "\n" or char == "\r":
-        return True
-    cat = unicodedata.category(char)
-    if cat == "Zs":
-        return True
-    return False
-
-
-def _is_control(char):
-    """Checks whether `chars` is a control character."""
-    # These are technically control characters but we count them as whitespace
-    # characters.
-    if char == "\t" or char == "\n" or char == "\r":
-        return False
-    cat = unicodedata.category(char)
-    if cat.startswith("C"):
-        return True
-    return False
-
-
-def _is_punctuation(char):
-    """Checks whether `chars` is a punctuation character."""
-    cp = ord(char)
-    # We treat all non-letter/number ASCII as punctuation.
-    # Characters such as "^", "$", and "`" are not in the Unicode
-    # Punctuation class but we treat them as punctuation anyways, for
-    # consistency.
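The greedy longest-match-first loop above is the heart of WordPiece; stripped of the class plumbing, it can be sketched as:

```python
def wordpiece(token, vocab, unk="[UNK]"):
    """Greedy longest-match-first WordPiece over one whitespace token (sketch)."""
    chars = list(token)
    start, pieces = 0, []
    while start < len(chars):
        end = len(chars)
        cur = None
        while start < end:
            sub = "".join(chars[start:end])
            if start > 0:
                sub = "##" + sub  # continuation-piece marker
            if sub in vocab:
                cur = sub
                break
            end -= 1
        if cur is None:
            return [unk]  # no piece matches at this position
        pieces.append(cur)
        start = end
    return pieces

print(wordpiece("unaffable", {"un", "##aff", "##able"}))  # ['un', '##aff', '##able']
```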
-    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
-        (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
-        return True
-    cat = unicodedata.category(char)
-    if cat.startswith("P"):
-        return True
-    return False
diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/load_ixrt_plugin.py b/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/load_ixrt_plugin.py
deleted file mode 100644
index ed2939c6..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/python/ixrt/load_ixrt_plugin.py
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-from os.path import join, dirname, exists
-import tensorrt as trt
-import ctypes
-
-current_directory = os.getcwd()
-
-def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""):
-    if not dynamic_path:
-        dynamic_path = join(dirname(trt.__file__), "lib", "libixrt_plugin.so")
-    if not exists(dynamic_path):
-        raise FileNotFoundError(
-            f"The ixrt_plugin lib {dynamic_path} does not exist, please provide a valid plugin path!")
-    ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
-    trt.init_libnvinfer_plugins(logger, namespace)
-    print(f"Loaded plugin from {dynamic_path}")
\ No newline at end of file
diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/script/build_engine.sh b/models/nlp/plm/bert_base_squad/ixrt/python/script/build_engine.sh
deleted file mode 100644
index bd983dbb..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/python/script/build_engine.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
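The deleted load_ixrt_plugin above follows the usual TensorRT plugin-loading pattern: dlopen the shared object with RTLD_GLOBAL, then register its creators with the runtime. A minimal sketch; the default path here is an assumption for illustration only:

```python
import ctypes
import os

import tensorrt as trt

def load_plugin_lib(path="/usr/local/lib/libixrt_plugin.so"):
    """Load a plugin .so and register its creators with TensorRT (sketch)."""
    if not os.path.exists(path):
        raise FileNotFoundError(f"plugin library {path} does not exist")
    # RTLD_GLOBAL so the plugin's symbols are visible to the runtime.
    ctypes.CDLL(path, mode=ctypes.RTLD_GLOBAL)
    trt.init_libnvinfer_plugins(trt.Logger(trt.Logger.WARNING), "")
```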
- -BSZ=1 -USE_FP16=True - -# Update arguments -index=0 -options=$@ -arguments=($options) -for argument in $options -do - index=`expr $index + 1` - case $argument in - --bs) BSZ=${arguments[index]};; - --int8) USE_FP16=False;; - esac -done - -if [ "$USE_FP16" = "True" ]; then - echo 'USE_FP16=True' - python3 builder.py -x ./data/bert_base_uncased_squad/bert_base_squad.onnx \ - -w 4096 \ - -o ./data/bert_base_384.engine \ - -s 1 384 384 \ - -b 1 ${BSZ} ${BSZ} \ - --fp16 \ - -c ./data/bert_base_uncased_squad/config.json -else - echo 'USE_INT8=True' - python3 builder_int8.py -pt ./data/bert_base_uncased_squad/bert_base_int8_qat.bin \ - -o ./data/bert_base_384_int8.engine \ - -s 1 384 384 \ - -b 1 ${BSZ} ${BSZ} \ - -i \ - -c ./data/bert_base_uncased_squad/config.json -fi \ No newline at end of file diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/script/inference.sh b/models/nlp/plm/bert_base_squad/ixrt/python/script/inference.sh deleted file mode 100644 index 7018333f..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/python/script/inference.sh +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -PASSAGE='TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps such as recommenders, -speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops and layers before applying optimizations -for inference. Today NVIDIA is open-sourcing parsers and plugins in TensorRT so that the deep learning community can customize and extend these components -to take advantage of powerful TensorRT optimizations for your apps.' -QUESTION="What is TensorRT?" - -USE_FP16=True - -# Update arguments -index=0 -options=$@ -arguments=($options) -for argument in $options -do - index=`expr $index + 1` - case $argument in - --int8) USE_FP16=False;; - esac -done - -if [ "$USE_FP16" = "True" ]; then - echo 'USE_FP16=True' - python3 inference.py -e ./data/bert_base_384.engine \ - -s 384 \ - -p $PASSAGE \ - -q $QUESTION \ - -v ./data/bert_base_uncased_squad/vocab.txt -else - echo 'USE_INT8=True' - python3 inference.py -e ./data/bert_base_384_int8.engine \ - -s 384 \ - -p $PASSAGE \ - -q $QUESTION \ - -v ./data/bert_base_uncased_squad/vocab.txt -fi - diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/script/inference_squad.sh b/models/nlp/plm/bert_base_squad/ixrt/python/script/inference_squad.sh deleted file mode 100644 index 9473ee77..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/python/script/inference_squad.sh +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-BSZ=1
-USE_FP16=True
-
-# Update arguments
-index=0
-options=$@
-arguments=($options)
-for argument in $options
-do
-    index=`expr $index + 1`
-    case $argument in
-      --bs) BSZ=${arguments[index]};;
-      --int8) USE_FP16=False;;
-    esac
-done
-
-if [ "$USE_FP16" = "True" ]; then
-    echo 'USE_FP16=True'
-    UMD_ENABLEDCPRINGNUM=16 python3 inference.py -e ./data/bert_base_384.engine \
-        -b ${BSZ} \
-        -s 384 \
-        -sq ./data/squad/dev-v1.1.json \
-        -v ./data/bert_base_uncased_squad/vocab.txt \
-        -o ./data/predictions-bert_base_384.json
-    python3 evaluate-v1.1.py ./data/squad/dev-v1.1.json ./data/predictions-bert_base_384.json 87
-else
-    echo 'USE_INT8=True'
-    UMD_ENABLEDCPRINGNUM=16 python3 inference.py -e ./data/bert_base_384_int8.engine \
-        -b ${BSZ} \
-        -s 384 \
-        -sq ./data/squad/dev-v1.1.json \
-        -v ./data/bert_base_uncased_squad/vocab.txt \
-        -o ./data/predictions-bert_base_384_int8.json \
-        -i
-    python3 evaluate-v1.1.py ./data/squad/dev-v1.1.json ./data/predictions-bert_base_384_int8.json 86
-fi
\ No newline at end of file
diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/script/mdb_infer_run.sh b/models/nlp/plm/bert_base_squad/ixrt/python/script/mdb_infer_run.sh
deleted file mode 100644
index 6386fe83..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/python/script/mdb_infer_run.sh
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-index=0
-options=("$@") # store all arguments in an array
-PRECISION=fp16
-BSZ=32
-
-# loop over all arguments
-while [[ $index -lt ${#options[@]} ]]; do
-    argument=${options[$index]}
-    case $argument in
-      --bs)
-        ((index++))
-        BSZ=${options[$index]}
-        ;;
-      --prec)
-        ((index++))
-        PRECISION=${options[$index]}
-        ;;
-    esac
-    ((index++))
-done
-
-# set INT8_FLAG
-INT8_FLAG=""
-if [[ "$PRECISION" == "int8" ]]; then
-    INT8_FLAG="--int8"
-fi
-
-# set BSZ_FLAG
-BSZ_FLAG=""
-if [[ "$BSZ" -ne 32 ]]; then
-    BSZ_FLAG="--bs $BSZ"
-fi
-
-echo "PREC_FLAG=$INT8_FLAG"
-echo "PRECISION=$PRECISION"
-echo "BSZ=$BSZ"
-echo "BSZ_FLAG=$BSZ_FLAG"
-
-# check the environment and run the matching script
-if command -v ixsmi &>/dev/null; then
-    echo "MR env"
-    cmake -S . -B build
-    cmake --build build -j16
-    cd ./python/script/
-    bash infer_bert_base_squad_${PRECISION}_ixrt.sh $BSZ_FLAG
-
-elif command -v nvidia-smi &>/dev/null; then
-    echo "NV env"
-    cmake -S .
-B build -DUSE_TENSORRT=true - cmake --build build -j16 - cd ./python/ - bash script/build_engine.sh --bs $BSZ $INT8_FLAG - bash script/inference_squad.sh --bs $BSZ $INT8_FLAG -else - echo "No driver detected" - exit 1 -fi diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/script/perf.sh b/models/nlp/plm/bert_base_squad/ixrt/python/script/perf.sh deleted file mode 100644 index f73d8fb8..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/python/script/perf.sh +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -BSZ=1 -USE_FP16=True - -# Update arguments -index=0 -options=$@ -arguments=($options) -for argument in $options -do - index=`expr $index + 1` - case $argument in - --bs) BSZ=${arguments[index]};; - --int8) USE_FP16=False;; - esac -done - -if [ "$USE_FP16" = "True" ]; then - echo 'USE_FP16=True' - python3 perf.py -e ./data/bert_base_384.engine -b ${BSZ} -s 384 -else - echo 'USE_INT8=True' - python3 perf.py -e ./data/bert_base_384_int8.engine -b ${BSZ} -s 384 -fi \ No newline at end of file diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/script/prepare.sh b/models/nlp/plm/bert_base_squad/ixrt/python/script/prepare.sh deleted file mode 100644 index 1da4aea4..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/python/script/prepare.sh +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -VERSION='v1.1' - -while test $# -gt 0 -do - case "$1" in - -h) echo "Usage: sh download_squad.sh [v2_0|v1_1]" - exit 0 - ;; - v2_0) VERSION='v2.0' - ;; - v1_1) VERSION='v1.1' - ;; - *) echo "Invalid argument $1...exiting" - exit 0 - ;; - esac - shift -done - -# Download the SQuAD training and dev datasets -echo "Step 1: Downloading SQuAD-${VERSION} training and dev datasets to ./data/squad" -if [ ! -d "./data" ]; then - mkdir -p data -else - echo 'data directory existed' -fi - -pushd data -if [ ! -d "./squad" ]; then - mkdir -p squad - pushd squad - wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-${VERSION}.json - wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-${VERSION}.json - popd -else - echo 'squad directory existed' -fi - -echo "Step 2: Downloading model file and config to ./data/bert_base_uncased_squad" - -if [ ! 
-d "./bert_base_uncased_squad" ]; then - wget http://files.deepspark.org.cn:880/deepspark/data/checkpoints/bert_base_uncased_squad.tar - tar -xvf bert_base_uncased_squad.tar -C ./ - rm -f bert_base_uncased_squad.tar -else - echo 'bert_base_uncased_squad directory existed' -fi -popd diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/script/infer_bert_base_squad_fp16_ixrt.sh b/models/nlp/plm/bert_base_squad/ixrt/scripts/infer_bert_base_squad_fp16_accuracy.sh similarity index 89% rename from models/nlp/plm/bert_base_squad/ixrt/python/script/infer_bert_base_squad_fp16_ixrt.sh rename to models/nlp/plm/bert_base_squad/ixrt/scripts/infer_bert_base_squad_fp16_accuracy.sh index 4da5ac8f..48592984 100644 --- a/models/nlp/plm/bert_base_squad/ixrt/python/script/infer_bert_base_squad_fp16_ixrt.sh +++ b/models/nlp/plm/bert_base_squad/ixrt/scripts/infer_bert_base_squad_fp16_accuracy.sh @@ -14,8 +14,8 @@ # under the License. set -eo pipefail -BSZ=32 -TGT=87 +BSZ=1 +TGT=88.5 USE_TRT=False # Update arguments @@ -32,16 +32,14 @@ do esac done -current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) -project_path=$(realpath ${current_path}/..) -checkpoints_path=${project_path}/data/bert_base_uncased_squad -datasets_path=${project_path}/data/ +project_path=./ +checkpoints_path=${project_path}/data/checkpoints/bert_base_squad/bert_base_uncased_squad +datasets_path=${project_path}/data/datasets/bert_base_squad/ echo 'USE_TRT='${USE_TRT} export USE_TRT=$USE_TRT echo "Step1 Build Engine FP16(bert base squad)!" -cd ${project_path}/ixrt python3 builder.py -x ${checkpoints_path}/bert_base_squad.onnx \ -w 4096 \ -o ${checkpoints_path}/bert_base_b${BSZ}.engine \ diff --git a/models/nlp/plm/bert_base_squad/ixrt/python/script/infer_bert_base_squad_int8_ixrt.sh b/models/nlp/plm/bert_base_squad/ixrt/scripts/infer_bert_base_squad_fp16_performance.sh similarity index 56% rename from models/nlp/plm/bert_base_squad/ixrt/python/script/infer_bert_base_squad_int8_ixrt.sh rename to models/nlp/plm/bert_base_squad/ixrt/scripts/infer_bert_base_squad_fp16_performance.sh index b5596c1c..178615bb 100644 --- a/models/nlp/plm/bert_base_squad/ixrt/python/script/infer_bert_base_squad_int8_ixrt.sh +++ b/models/nlp/plm/bert_base_squad/ixrt/scripts/infer_bert_base_squad_fp16_performance.sh @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -15,7 +15,7 @@ set -eo pipefail BSZ=32 -TGT=86 +TGT=900 USE_TRT=False # Update arguments @@ -32,33 +32,29 @@ do esac done -current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) -project_path=$(realpath ${current_path}/..) -echo ${project_path} -checkpoints_path=${project_path}/data/bert_base_uncased_squad/ -datasets_path=${project_path}/data/ +project_path=./ +checkpoints_path=${project_path}/data/checkpoints/bert_base_squad/bert_base_uncased_squad +datasets_path=${project_path}/data/datasets/bert_base_squad/ echo 'USE_TRT='${USE_TRT} export USE_TRT=$USE_TRT -echo "Step1 Build Engine Int8(bert base squad)!" -cd ${project_path}/ixrt -python3 builder_int8.py -pt ${checkpoints_path}/bert_base_int8_qat.bin \ - -o ${checkpoints_path}/bert_base_int8_b${BSZ}.engine \ - -b 1 ${BSZ} ${BSZ} \ - -s 1 384 384 \ - -i \ - -c ${checkpoints_path} +echo "Step1 Build Engine FP16(bert base squad)!" 
+python3 builder.py -x ${checkpoints_path}/bert_base_squad.onnx \ + -w 4096 \ + -o ${checkpoints_path}/bert_base_b${BSZ}.engine \ + -s 1 384 384 \ + -b 1 ${BSZ} ${BSZ} \ + --fp16 \ + -c ${checkpoints_path}/config.json \ + -z ${USE_TRT} -echo "Step2 Run dev.json and generate json" -python3 inference.py -e ${checkpoints_path}/bert_base_int8_b${BSZ}.engine \ - -b ${BSZ} \ +echo "Step2 Inference(test QPS)" +UMD_ENABLEDCPRINGNUM=16 python3 inference.py -e ${checkpoints_path}/bert_base_b${BSZ}.engine \ -s 384 \ + -b ${BSZ} \ -sq ${datasets_path}/squad/dev-v1.1.json \ -v ${checkpoints_path}/vocab.txt \ - -o ${checkpoints_path}/predictions-bert_base_int8_b${BSZ}.json \ + -o ${checkpoints_path}/predictions-bert_base_b${BSZ}.json \ -z ${USE_TRT} \ - -i - -echo "Step3 Inference(test F1-score)" -python3 evaluate-v1.1.py ${datasets_path}/squad/dev-v1.1.json ${checkpoints_path}/predictions-bert_base_int8_b${BSZ}.json ${TGT} \ No newline at end of file + --target_qps ${TGT} \ No newline at end of file diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/api/plugin_loader.cc b/models/nlp/plm/bert_base_squad/ixrt/src/api/plugin_loader.cc deleted file mode 100644 index ceea8d8b..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/api/plugin_loader.cc +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -#include -#include -#include -#include - -#include "NvInfer.h" -#include "NvInferPlugin.h" -#include "NvInferRuntimeCommon.h" -#include "custom_fc/fcPlugin.h" -#include "emb_layernorm/embLayerNormPlugin.h" -#include "emb_layernorm/embLayerNormInt8Plugin.h" -#include "gelu/geluPlugin.h" -#include "qkv_to_context/qkvToContextInt8Plugin.h" -#include "qkv_to_context/qkvToContextPlugin.h" -#include "skip_layernorm/skipLayerNormInt8Plugin.h" -#include "skip_layernorm/skipLayerNormPlugin.h" -#include "ffn/ffnPlugin.h" - -using namespace nvinfer1; - -namespace nvinfer1 { -namespace ixrt_plugin { - -extern ILogger* gLogger; - -} // namespace plugin -} // namespace nvinfer1 - -namespace { -// This singleton ensures that each plugin is only registered once for a given -// namespace and type, and attempts of duplicate registration are ignored. 
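The register-once behavior the comment above describes can be expressed compactly outside C++; a Python sketch of the same singleton pattern (all names illustrative):

```python
class CreatorRegistry:
    """Singleton that registers each plugin creator at most once,
    keyed by namespace/name/version."""
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._seen = set()
        return cls._instance

    @staticmethod
    def _key(namespace, name, version):
        head = f"{namespace}/" if namespace else ""
        tail = f"/{version}" if version else ""
        return f"{head}{name}{tail}"

    def add(self, namespace, name, version):
        key = self._key(namespace, name, version)
        if key in self._seen:
            return False  # duplicate registration is ignored
        self._seen.add(key)
        return True
```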
-class PluginCreatorRegistry {
-   public:
-    static PluginCreatorRegistry& getInstance() {
-        static PluginCreatorRegistry instance;
-        return instance;
-    }
-
-    string GetPluginUniqKey(const AsciiChar* const plugin_namespace, const AsciiChar* const plugin_name,
-                            const AsciiChar* const plugin_version) {
-        stringstream os;
-        if (plugin_namespace[0] != '\0') {
-            os << plugin_namespace << "/";
-        }
-        os << plugin_name;
-        if (plugin_version[0] != '\0') {
-            os << "/" << plugin_version;
-        }
-        return os.str();
-    }
-
-    template <typename CreatorType>
-    void addPluginCreator(void* logger, char const* libNamespace) {
-        printf("start addPluginCreator %s\n", libNamespace);
-        // Make accesses to the plugin creator registry thread safe
-        std::lock_guard<std::mutex> lock(mRegistryLock);
-
-        std::string errorMsg;
-        std::string verboseMsg;
-
-        std::unique_ptr<CreatorType> pluginCreator{new CreatorType{}};
-        pluginCreator->setPluginNamespace(libNamespace);
-
-        nvinfer1::ixrt_plugin::gLogger = static_cast<ILogger*>(logger);
-        std::string pluginType = GetPluginUniqKey(pluginCreator->getPluginNamespace(), pluginCreator->getPluginName(),
-                                                  pluginCreator->getPluginVersion());
-
-        if (mRegistryList.find(pluginType) == mRegistryList.end()) {
-            bool status = getPluginRegistry()->registerCreator(*pluginCreator, libNamespace);
-            if (status) {
-                mRegistry.push(std::move(pluginCreator));
-                mRegistryList.insert(pluginType);
-                printf("Registered plugin creator - %s\n", pluginType.c_str());
-                verboseMsg = "Registered plugin creator - " + pluginType;
-            } else {
-                printf("Could not register plugin creator - %s\n", pluginType.c_str());
-                errorMsg = "Could not register plugin creator - " + pluginType;
-            }
-        } else {
-            printf("Plugin creator already registered - %s\n", pluginType.c_str());
-            verboseMsg = "Plugin creator already registered - " + pluginType;
-        }
-
-        if (logger) {
-            if (!errorMsg.empty()) {
-                nvinfer1::ixrt_plugin::gLogger->log(ILogger::Severity::kERROR, errorMsg.c_str());
-            }
-            if (!verboseMsg.empty()) {
-                nvinfer1::ixrt_plugin::gLogger->log(ILogger::Severity::kVERBOSE, verboseMsg.c_str());
-            }
-        }
-    }
-
-    ~PluginCreatorRegistry() {
-        std::lock_guard<std::mutex> lock(mRegistryLock);
-
-        // Release pluginCreators in LIFO order of registration.
-        while (!mRegistry.empty()) {
-            mRegistry.pop();
-        }
-        mRegistryList.clear();
-    }
-
-   private:
-    PluginCreatorRegistry() {}
-
-    std::mutex mRegistryLock;
-    std::stack<std::unique_ptr<IPluginCreator>> mRegistry;
-    std::unordered_set<std::string> mRegistryList;
-
-   public:
-    PluginCreatorRegistry(PluginCreatorRegistry const&) = delete;
-    void operator=(PluginCreatorRegistry const&) = delete;
-};
-
-template <typename CreatorType>
-void initializePlugin(void* logger, char const* libNamespace) {
-    PluginCreatorRegistry::getInstance().addPluginCreator<CreatorType>(logger, libNamespace);
-}
-
-} // namespace
-
-extern "C" {
-bool initLibNvInferPlugins(void* logger, const char* libNamespace) {
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    return true;
-}
-}
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/backend/bert/bert_helper.h b/models/nlp/plm/bert_base_squad/ixrt/src/backend/bert/bert_helper.h
deleted file mode 100644
index bd094b40..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/backend/bert/bert_helper.h
+++ /dev/null
@@ -1,299 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*/
-#pragma once
-#include
-#include
-
-#include
-
-#ifndef C10_WARP_SIZE
-
-#ifdef __ILUVATAR__
-#define C10_WARP_SIZE 64
-#else
-#define C10_WARP_SIZE 32
-#endif
-
-#endif
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-namespace backend {
-
-const float epsilon = 0.000000000001;
-const unsigned int WARP_REDUCE_MASK = 0xffffffff;
-const float CUDA_FLOAT_INF_NEG = -100000000.f;  // FIXME later
-const float CUDA_FLOAT_INF_POS = 100000000.f;   // FIXME later
-const int CUDA_INT_INF = 2147483647;
-const int MAX_THREADS = 1024;
-
-__forceinline__ __device__ int8_t float2int8(float x, float quant_scale) {
-    float i8_f = x * quant_scale;
-    int32_t i8 = floorf(i8_f + 0.5);
-    i8 = i8 < -127 ? -127 : (i8 > 127 ?
127 : i8);
-    return int8_t(i8);
-}
-
-inline __device__ void WelfordCombine(float val, float *mean, float *m2, float *count) {
-    // Use Welford's online algorithm to compute mean and variance
-    // For more details you can refer to:
-    // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
-    *count += 1;
-    float delta1 = val - *mean;
-    *mean += delta1 / *count;
-    float delta2 = val - *mean;
-    *m2 += delta1 * delta2;
-}
-
-inline __device__ void WelfordCombine(float b_mean, float b_m2, float b_count, float *mean, float *m2, float *count) {
-    if (b_count == 0) {
-        return;
-    }
-    float new_count = *count + b_count;
-    float nb_over_n = b_count / new_count;
-    float delta = b_mean - *mean;
-    *mean += delta * nb_over_n;
-    *m2 += b_m2 + delta * delta * (*count) * nb_over_n;
-    *count = new_count;
-}
-
-__inline__ __device__ void WelfordWarpReduce(float thread_mean, float thread_m2, float thread_count, float *mean,
-                                             float *m2, float *count) {
-    *mean = thread_mean;
-    *m2 = thread_m2;
-    *count = thread_count;
-    for (int mask = C10_WARP_SIZE / 2; mask > 0; mask /= 2) {
-        float b_mean = __shfl_down_sync(0xffffffff, *mean, mask);
-        float b_m2 = __shfl_down_sync(0xffffffff, *m2, mask);
-        float b_count = __shfl_down_sync(0xffffffff, *count, mask);
-        WelfordCombine(b_mean, b_m2, b_count, mean, m2, count);
-    }
-}
-// added by pxl
-// reduce over all data within the block
-// template
-__inline__ __device__ void WelfordBlockAllReduce(float thread_mean, float thread_m2, float thread_count,
-                                                 float *result_mean, float *result_m2, float *result_count) {
-    __shared__ float mean_shared[C10_WARP_SIZE];
-    __shared__ float m2_shared[C10_WARP_SIZE];
-    __shared__ float count_shared[C10_WARP_SIZE];
-    __shared__ float mean_result_broadcast;
-    __shared__ float m2_result_broadcast;
-    __shared__ float count_result_broadcast;
-
-    const int lid = threadIdx.x % C10_WARP_SIZE;
-    const int wid = threadIdx.x / C10_WARP_SIZE;
-    float warp_mean = 0;
-    float warp_m2 = 0;
-    float warp_count = 0;
-    WelfordWarpReduce(thread_mean, thread_m2, thread_count, &warp_mean, &warp_m2, &warp_count);
-    __syncthreads();
-
-    if (lid == 0) {
-        mean_shared[wid] = warp_mean;
-        m2_shared[wid] = warp_m2;
-        count_shared[wid] = warp_count;
-    }
-    __syncthreads();
-
-    if (wid == 0) {
-        if (threadIdx.x < blockDim.x / C10_WARP_SIZE) {
-            warp_mean = mean_shared[lid];
-            warp_m2 = m2_shared[lid];
-            warp_count = count_shared[lid];
-
-        } else {
-            warp_mean = 0.f;
-            warp_m2 = 0.f;
-            warp_count = 0.f;
-        }
-        __syncwarp();
-
-        float block_mean = 0;
-        float block_m2 = 0;
-        float block_count = 0;
-
-        WelfordWarpReduce(warp_mean, warp_m2, warp_count, &block_mean, &block_m2, &block_count);
-
-        if (lid == 0) {
-            mean_result_broadcast = block_mean;
-            m2_result_broadcast = block_m2;
-            count_result_broadcast = block_count;
-        }
-    }
-    __syncthreads();
-    *result_mean = mean_result_broadcast;
-    *result_m2 = m2_result_broadcast;
-    *result_count = count_result_broadcast;
-}
-__forceinline__ __device__ char4 float42char4(float4 vals, float quant_scale) {
-    char4 res;
-    res.x = float2int8(vals.x, quant_scale);
-    res.y = float2int8(vals.y, quant_scale);
-    res.z = float2int8(vals.z, quant_scale);
-    res.w = float2int8(vals.w, quant_scale);
-    return res;
-}
-
-// load two half2 values and store them into a float4
-__forceinline__ __device__ void load_float4_from_half(float4 &vals, __half2 *input, int index) {
-    __half2 i1 = input[index * 2];
-    __half2 i2 = input[index * 2 + 1];
-
-    vals.x = __half2float(i1.x);
-    vals.y = __half2float(i1.y);
-    vals.z = __half2float(i2.x);
-    vals.w =
__half2float(i2.y); -} - -/* Convert vector index to 3-dim tensor index */ -__forceinline__ __host__ __device__ void decompose_3dim(int src, int dim1, int dim2, int *id0, int *id1, int *id2) { - *id2 = src % dim2; - src /= dim2; - - *id1 = src % dim1; - *id0 = src / dim1; -} - -__forceinline__ __device__ float4 compute_float4_norm_value(float4 vals, float mean, float m2, int hidden_size, - float epsilon, float4 scale, float4 bias) { - float4 norm_value; - norm_value.x = - (vals.x - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.x + bias.x; - norm_value.y = - (vals.y - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.y + bias.y; - norm_value.z = - (vals.z - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.z + bias.z; - norm_value.w = - (vals.w - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.w + bias.w; - return norm_value; -} - -// for layer norm -__forceinline__ __device__ float4 compute_float4_norm_value(float4 vals, float mean, float m2, int hidden_size, - float epsilon, half2 scale_1, half2 scale_2, half2 bias_1, - half2 bias_2) { - float4 norm_value; - norm_value.x = - (vals.x - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.x) + __half2float(bias_1.x); - norm_value.y = - (vals.y - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.y) + __half2float(bias_1.y); - norm_value.z = - (vals.z - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_2.x) + __half2float(bias_2.x); - norm_value.w = - (vals.w - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_2.y) + __half2float(bias_2.y); - return norm_value; -} -/* Convert half2 into float2, mask inf and -inf */ -__forceinline__ __host__ __device__ float safe_half_to_float(half hval) { - return fmax(fmin(100000.f, __half2float(hval)), -100000.f); -} -__forceinline__ __device__ float4 char4addfloat4_dequant(char4 input_4, float4 residual, - float dequant_scale) { - float4 res; - res.x = __int2float_rn(input_4.x) * dequant_scale + residual.x; - res.y = __int2float_rn(input_4.y) * dequant_scale + residual.y; - res.z = __int2float_rn(input_4.z) * dequant_scale + residual.z; - res.w = __int2float_rn(input_4.w) * dequant_scale + residual.w; - return res; -} - -__forceinline__ __device__ float4 char4addhalf2_dequant(char4 input_4, half2 residual_1, half2 residual_2, - float dequant_scale) { - float4 res; - res.x = __int2float_rn(input_4.x) * dequant_scale + safe_half_to_float(residual_1.x); - res.y = __int2float_rn(input_4.y) * dequant_scale + safe_half_to_float(residual_1.y); - res.z = __int2float_rn(input_4.z) * dequant_scale + safe_half_to_float(residual_2.x); - res.w = __int2float_rn(input_4.w) * dequant_scale + safe_half_to_float(residual_2.y); - return res; -} - -// gelu -// IxinferBiasGeluI8II8OKernel -template -__forceinline__ __device__ T tanhf_exp(T x) { - // float e1 = __expf(x); - // float e2 = 1.0f / e1; - // return (e1 - e2) / (e1 + e2); - - return (2.f / (1.f + __expf(-2.f * x)) - 1.f); -} - -template -__forceinline__ __device__ T gelu(T x) { - float cdf = 0.5f * (1.0f + tanhf_exp((0.7978845608028654f * (x + 0.044715f * x * x * x)))); - return x * cdf; -} - -// softmax -__forceinline__ __host__ __device__ int log2_ceil(int value) { - int log2_value = 0; - while ((1 << log2_value) < value) ++log2_value; - return log2_value; -} -template -__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width, unsigned int mask = 0xffffffff) { -#if !(defined(__HIP_PLATFORM_HCC__) || defined(__ILUVATAR__)) - return __shfl_xor_sync(mask, value, laneMask, width); 
-#else - return __shfl_xor(value, laneMask, width); -#endif -} - -template -struct Add { - __device__ __forceinline__ T operator()(T a, T b) const { return a + b; } -}; - -template -struct Max { - __device__ __forceinline__ T operator()(T a, T b) const { return a < b ? b : a; } -}; -template class ReduceOp> -__device__ __forceinline__ void warp_reduce(acc_t *sum) { - ReduceOp r; -#pragma unroll - for (int offset = REDUCE_WARP_SIZE / 2; offset > 0; offset /= 2) { - acc_t b = WARP_SHFL_XOR(*sum, offset, REDUCE_WARP_SIZE); - *sum = r(*sum, b); - } -} -/* Convert 3-dim tensor index into vector index */ -__forceinline__ __host__ __device__ int targetid_3dim(int id1, int id2, int id3, int dim2, int dim3) { - return id1 * dim2 * dim3 + id2 * dim3 + id3; -} - -/* Convert 4-dim tensor index into vector index */ -__forceinline__ __host__ __device__ int targetid_4dim(int id1, int id2, int id3, int id4, int dim2, int dim3, - int dim4) { - // return id1*(dim2*dim3*dim4) + id2*(dim3*dim4) + id3*dim4 + id4; - int res = id4; - - int ld = dim4; - res += id3 * ld; - - ld *= dim3; - res += id2 * ld; - - ld *= dim2; - res += id1 * ld; - - return res; -} - -} // namespace backend -} // namespace ixrt_plugin -} // namespace nvinfer1 diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/backend/cublas/cublas_helper.h b/models/nlp/plm/bert_base_squad/ixrt/src/backend/cublas/cublas_helper.h deleted file mode 100644 index c0f34842..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/backend/cublas/cublas_helper.h +++ /dev/null @@ -1,312 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/ -#pragma once -#include -#include -#include -#include - -#include - -#include "checkMacrosPlugin.h" - -namespace nvinfer1 { -namespace ixrt_plugin { -namespace backend { - -/* GPU function guard */ - -/** - * @brief cublasLt gemm without imma - * - * @tparam OutType output dtype - * @tparam ScaleType scale dtype - * @param input_a - * @param input_b - * @param output_c - * @param batch_count - * @param m - * @param n - * @param k - * @param stridea - * @param strideb - * @param stridec - * @param alpha - * @param cublasLt_handle - * @param stream - */ -template -void cublaslt_gemm(const int8_t* input_a, const int8_t* input_b, OutType* output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const ScaleType alpha, - cublasLtHandle_t cublasLt_handle, cudaStream_t stream) { - cublasOperation_t transpose = CUBLAS_OP_T; -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - cublasComputeType_t compute_type = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t compute_type = CUDA_R_32I; -#endif - cublasLtMatmulDesc_t matmul_desc; - cublasLtMatrixLayout_t desc_a = NULL; - cublasLtMatrixLayout_t desc_b = NULL; - cublasLtMatrixLayout_t desc_c = NULL; - - cudaDataType_t out_dtype; - cudaDataType_t scale_dtype; - if (std::is_same::value) { - out_dtype = CUDA_R_32I; - scale_dtype = CUDA_R_32I; - } else if (std::is_same::value) { - out_dtype = CUDA_R_8I; - scale_dtype = CUDA_R_32F; - } else { - throw std::runtime_error("Unsupported output type"); - } - -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype)); -#else - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type)); - CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype, - sizeof(scale_dtype))); -#endif - CHECK_GPU_ERROR( - cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transpose, sizeof(transpose))); - - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_8I, k, m, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_8I, k, n, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m)); - - if (batch_count > 1) { - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, - sizeof(stridea))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, - sizeof(strideb))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, - sizeof(stridec))); - } - - ScaleType beta = ScaleType(0); - CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta, - output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream)); - - CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c)); -} - -inline void cublaslt_gemm(const half* input_a, 
const half* input_b, half* output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - cublasLtHandle_t cublasLt_handle, cudaStream_t stream) { - cublasOperation_t transpose = CUBLAS_OP_T; -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; -#else - cudaDataType_t compute_type = CUDA_R_32F; -#endif - cublasLtMatmulDesc_t matmul_desc; - cublasLtMatrixLayout_t desc_a = NULL; - cublasLtMatrixLayout_t desc_b = NULL; - cublasLtMatrixLayout_t desc_c = NULL; - - cudaDataType_t out_dtype = CUDA_R_16F; - cudaDataType_t scale_dtype = CUDA_R_32F; - -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype)); -#else - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type)); - CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype, - sizeof(scale_dtype))); -#endif - CHECK_GPU_ERROR( - cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transpose, sizeof(transpose))); - - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_16F, k, m, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_16F, k, n, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m)); - - if (batch_count > 1) { - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, - sizeof(stridea))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, - sizeof(strideb))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, - sizeof(stridec))); - } - - float beta = 0.0; - CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta, - output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream)); - - CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c)); -} - -template void cublaslt_gemm(const int8_t* input_a, const int8_t* input_b, int32_t* output_c, - int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb, - int64_t stridec, const int32_t alpha, cublasLtHandle_t cublasLt_handle, - cudaStream_t stream); - -template void cublaslt_gemm(const int8_t* input_a, const int8_t* input_b, int8_t* output_c, - int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb, - int64_t stridec, const float alpha, cublasLtHandle_t cublasLt_handle, - cudaStream_t stream); - -/************add by pxl *************/ -template -void cublaslt_gemm_nn(const int8_t* input_a, const int8_t* input_b, OutType* output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const ScaleType alpha, - cublasLtHandle_t cublasLt_handle, cudaStream_t stream) { -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - 
cublasComputeType_t compute_type = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t compute_type = CUDA_R_32I; -#endif - cublasLtMatmulDesc_t matmul_desc; - cublasLtMatrixLayout_t desc_a = NULL; - cublasLtMatrixLayout_t desc_b = NULL; - cublasLtMatrixLayout_t desc_c = NULL; - - cudaDataType_t out_dtype; - cudaDataType_t scale_dtype; - if (std::is_same::value) { - out_dtype = CUDA_R_32I; - scale_dtype = CUDA_R_32I; - } else if (std::is_same::value) { - out_dtype = CUDA_R_8I; - scale_dtype = CUDA_R_32F; - } else { - throw std::runtime_error("Unsupported output type"); - } - -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype)); -#else - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type)); - CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype, - sizeof(scale_dtype))); -#endif - - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_8I, m, k, m)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_8I, k, n, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m)); - - if (batch_count > 1) { - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, - sizeof(stridea))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, - sizeof(strideb))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, - sizeof(stridec))); - } - - ScaleType beta = ScaleType(0); - CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta, - output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream)); - - CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c)); -} - -template void cublaslt_gemm_nn(const int8_t* input_a, const int8_t* input_b, int32_t* output_c, - int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb, - int64_t stridec, const int32_t alpha, cublasLtHandle_t cublasLt_handle, - cudaStream_t stream); - -template void cublaslt_gemm_nn(const int8_t* input_a, const int8_t* input_b, int8_t* output_c, - int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb, - int64_t stridec, const float alpha, cublasLtHandle_t cublasLt_handle, - cudaStream_t stream); - -inline void cublaslt_gemm_nn(const half* input_a, const half* input_b, half* output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - cublasLtHandle_t cublasLt_handle, cudaStream_t stream) { -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; -#else - cudaDataType_t compute_type = CUDA_R_32F; -#endif - cublasLtMatmulDesc_t matmul_desc; - cublasLtMatrixLayout_t desc_a = NULL; - cublasLtMatrixLayout_t desc_b = NULL; - cublasLtMatrixLayout_t 
desc_c = NULL; - - cudaDataType_t out_dtype = CUDA_R_16F; - cudaDataType_t scale_dtype = CUDA_R_32F; - -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype)); -#else - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type)); - CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype, - sizeof(scale_dtype))); -#endif - - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_16F, m, k, m)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_16F, k, n, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m)); - - if (batch_count > 1) { - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, - sizeof(stridea))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, - sizeof(strideb))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, - sizeof(stridec))); - } - - float beta = 0.0; - CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta, - output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream)); - - CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c)); -} - -} // namespace backend -} // namespace ixrt_plugin -} // namespace nvinfer1 diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu b/models/nlp/plm/bert_base_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu deleted file mode 100644 index b3f0bbcb..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu +++ /dev/null @@ -1,416 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu b/models/nlp/plm/bert_base_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu
deleted file mode 100644
index b3f0bbcb..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu
+++ /dev/null
@@ -1,416 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*/
-#include "ixinfer_gemm_helper.h"
-
-namespace nvinfer1::ixrt_plugin {
-namespace backend {
-
-void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                     int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     cuinferHandle_t cuinfer_handle, cudaStream_t stream) {
-    /* TN: input_a: m,k input_b: n,k output_c: n,m */
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_N;
-
-    cudaDataType_t Atype = CUDA_R_8I;
-    cudaDataType_t Btype = CUDA_R_8I;
-    cudaDataType_t Ctype = CUDA_R_8I;
-    cudaDataType_t computeType = CUDA_R_32I;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-
-    int lda = k;
-    int ldb = k;
-    int ldc = m;
-
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, nullptr, customOption);
-
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error!, error type: " + std::to_string((int)status) + " !");
-    }
-}
-
-void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, const float *bias, int8_t *output_c, int batch_count,
-                     int m, int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const float beta, const int act_type, cuinferHandle_t &cuinfer_handle, cudaStream_t &stream) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_N;
-    cudaDataType_t Atype = CUDA_R_8I;
-    cudaDataType_t Btype = CUDA_R_8I;
-    cudaDataType_t Ctype = CUDA_R_8I;
-    cudaDataType_t computeType = CUDA_R_32I;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-    }
-
-    int lda = k;
-    int ldb = k;
-    int ldc = m;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-
-void cuinfer_nn_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream) {
-    /* TN: input_a: k,m input_b: n,k output_c: n,m */
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_N;
-    cuinferOperation_t transb = CUINFER_OP_N;
-
-    cudaDataType_t Atype = CUDA_R_8I;
-    cudaDataType_t Btype = CUDA_R_8I;
-    cudaDataType_t Ctype = CUDA_R_8I;
-    cudaDataType_t computeType = CUDA_R_32I;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-
-    int lda = m;
-    int ldb = k;
-    int ldc = m;
-
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, nullptr, customOption);
-
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error!");
-    }
-}
-
-void cuinfer_nt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream) {
-    /* TN: input_a: k,m input_b: k,n output_c: n,m */
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_N;
-    cuinferOperation_t transb = CUINFER_OP_T;
-
-    cudaDataType_t Atype = CUDA_R_8I;
-    cudaDataType_t Btype = CUDA_R_8I;
-    cudaDataType_t Ctype = CUDA_R_8I;
-    cudaDataType_t computeType = CUDA_R_32I;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-
-    int lda = m;
-    int ldb = n;
-    int ldc = m;
-
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, nullptr, customOption);
-
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error!");
-    }
-}
-
-void cuinfer_tt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream) {
-    /* TN: input_a: k,m input_b: k,n output_c: n,m */
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_T;
-
-    cudaDataType_t Atype = CUDA_R_8I;
-    cudaDataType_t Btype = CUDA_R_8I;
-    cudaDataType_t Ctype = CUDA_R_8I;
-    cudaDataType_t computeType = CUDA_R_32I;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-
-    int lda = k;
-    int ldb = n;
-    int ldc = m;
-
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, nullptr, customOption);
-
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error!");
-    }
-}
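The `/* TN: ... */` comments above encode how a row-major fully connected layer lands on these column-major kernels. A hedged sketch of the dimension bookkeeping for the plain TN case (all names hypothetical, not from the patch):

    // Y[rows, out] = X[rows, in] * W[out, in]^T, row-major, maps onto the TN
    // call as A = W (m x k), B = X (k x n), C = Y (m x n), lda = ldb = k, ldc = m.
    int m = out_features;          // output hidden size
    int n = batch_size * seq_len;  // number of row vectors
    int k = in_features;           // reduction dimension
    cuinfer_i8_gemm(d_weight, d_input, d_output, /*batch_count=*/1, m, n, k,
                    0, 0, 0, /*alpha=*/dequant_scale, cuinfer_handle, stream);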
-void cuinfer_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k,
-                  int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t handle,
-                  cudaStream_t stream) {
-    /* Performs operation using cublas */
-    float beta = 0.0f;
-    cublasSetStream(handle, stream);
-    cublasStatus_t status;
-    if (batch_count <= 1) {
-        status = cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, k, input_b,
-                              CUDA_R_16F, k, &beta, output_c, CUDA_R_16F, m, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
-    } else {
-        status = cublasGemmStridedBatchedEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, k,
-                                            stridea, input_b, CUDA_R_16F, k, strideb, &beta, output_c, CUDA_R_16F, m,
-                                            stridec, batch_count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
-    }
-    if (status != CUBLAS_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinfer_gemm error!");
-    }
-}
-
-void cuinfer_nn_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k,
-                     int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t handle,
-                     cudaStream_t stream) {
-    /* Performs operation using cublas */
-    float beta = 0.0f;
-    cublasSetStream(handle, stream);
-    cublasStatus_t status;
-    if (batch_count <= 1) {
-        // k,m n,k
-        status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, m, input_b,
-                              CUDA_R_16F, k, &beta, output_c, CUDA_R_16F, m, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
-    } else {
-        status = cublasGemmStridedBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, m,
-                                            stridea, input_b, CUDA_R_16F, k, strideb, &beta, output_c, CUDA_R_16F, m,
-                                            stridec, batch_count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
-    }
-    if (status != CUBLAS_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinfer_gemm error!");
-    }
-}
-
-void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                  int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                  const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_N;
-    cudaDataType_t Atype = CUDA_R_16F;
-    cudaDataType_t Btype = CUDA_R_16F;
-    cudaDataType_t Ctype = CUDA_R_16F;
-    cudaDataType_t computeType = CUDA_R_32F;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-        // std::cout << "CUINFER_BLAS_GEMM_CUSTOM_NONE" << std::endl;
-    }
-
-    int lda = k;
-    int ldb = k;
-    int ldc = m;
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                  int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, const float beta,
-                  const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_N;
-    cudaDataType_t Atype = CUDA_R_16F;
-    cudaDataType_t Btype = CUDA_R_16F;
-    cudaDataType_t Ctype = CUDA_R_16F;
-    cudaDataType_t computeType = CUDA_R_32F;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-        // std::cout << "CUINFER_BLAS_GEMM_CUSTOM_NONE" << std::endl;
-    }
-
-    int lda = k;
-    int ldb = k;
-    int ldc = m;
-    // float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-void cuinfer_nn_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_N;
-    cuinferOperation_t transb = CUINFER_OP_N;
-    cudaDataType_t Atype = CUDA_R_16F;
-    cudaDataType_t Btype = CUDA_R_16F;
-    cudaDataType_t Ctype = CUDA_R_16F;
-    cudaDataType_t computeType = CUDA_R_32F;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-    }
-
-    int lda = m;
-    int ldb = k;
-    int ldc = m;
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-void cuinfer_nt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_N;
-    cuinferOperation_t transb = CUINFER_OP_T;
-    cudaDataType_t Atype = CUDA_R_16F;
-    cudaDataType_t Btype = CUDA_R_16F;
-    cudaDataType_t Ctype = CUDA_R_16F;
-    cudaDataType_t computeType = CUDA_R_32F;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-    }
-
-    int lda = m;
-    int ldb = n;
-    int ldc = m;
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-
-void cuinfer_tt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_T;
-    cudaDataType_t Atype = CUDA_R_16F;
-    cudaDataType_t Btype = CUDA_R_16F;
-    cudaDataType_t Ctype = CUDA_R_16F;
-    cudaDataType_t computeType = CUDA_R_32F;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-    }
-
-    int lda = k;
-    int ldb = n;
-    int ldc = m;
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-
-} // namespace backend
-} // namespace nvinfer1::ixrt_plugin
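These wrappers throw `std::runtime_error` on any non-success status; TensorRT plugin `enqueue()` bodies, which must not let exceptions escape, typically guard them as below (a sketch, assuming the handles and device buffers already exist):

    try {
        cuinfer_gemm(d_a, d_b, d_c, /*batch_count=*/1, m, n, k, 0, 0, 0,
                     /*alpha=*/1.0f, cublas_handle, stream);
    } catch (std::exception const& e) {
        gLogError << e.what() << std::endl;
        return STATUS_FAILURE;  // pluginStatus_t, declared in plugin.h later in this patch
    }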
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.h b/models/nlp/plm/bert_base_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.h
deleted file mode 100644
index 2433b3a1..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*/
-#pragma once
-#include
-#include
-#include
-#include
-
-#include
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-namespace backend {
-
-void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                     int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     cuinferHandle_t cuinfer_handle, cudaStream_t stream);
-
-void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, const float *bias, int8_t *output_c, int batch_count,
-                     int m, int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const float beta, const int act_type, cuinferHandle_t &cuinfer_handle, cudaStream_t &stream);
-
-void cuinfer_nn_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream);
-
-void cuinfer_nt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream);
-
-void cuinfer_tt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream);
-
-void cuinfer_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k,
-                  int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t cublas_handle,
-                  cudaStream_t stream);
-
-void cuinfer_nn_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k,
-                     int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t cublas_handle,
-                     cudaStream_t stream);
-
-void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                  int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                  const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle);
-void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                  int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, const float beta,
-                  const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle);
-void cuinfer_nn_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle);
-void cuinfer_nt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle);
-void cuinfer_tt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle);
-} // namespace bert
-} // namespace ixrt_plugin
-} // namespace nvinfer1
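From the overload bodies above, the `act_type` convention appears to be 3 = fused GELU and 4 = fused ReLU epilogues, with any other value plus a non-null bias falling back to a plain half-bias add. A hedged call sketch (constant name hypothetical):

    const int kActTypeGelu = 3;  // hypothetical name, value taken from the branches above
    cuinfer_gemm(d_weight, d_input, d_bias, d_output, /*batch_count=*/1, m, n, k,
                 0, 0, 0, /*alpha=*/1.0f, kActTypeGelu, stream, cuinfer_handle);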
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/common/bertCommon.h b/models/nlp/plm/bert_base_squad/ixrt/src/common/bertCommon.h
deleted file mode 100644
index a75d902f..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/common/bertCommon.h
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma once
-#include
-
-#include
-#include
-#include
-#include
-
-#include "NvInfer.h"
-#include "NvInferRuntime.h"
-#include "NvInferRuntimeCommon.h"
-#include "checkMacrosPlugin.h"
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-namespace bert {
-
-constexpr uint32_t BDIM = 0;  // batch dimension
-constexpr uint32_t SDIM = 1;  // seq len dimension
-constexpr uint32_t HDIM = 2;  // hidden dimension
-
-#define TRT_UNUSED (void)
-
-template <typename T>
-struct CudaDeleter {
-    void operator()(T* buf) { IXRT_PLUGIN_CUASSERT(cudaFree(buf)); }
-};
-
-template <typename T>
-using cuda_unique_ptr = std::unique_ptr<T, CudaDeleter<T>>;
-
-inline uint32_t getElementSize(nvinfer1::DataType t) noexcept {
-    switch (t) {
-        case nvinfer1::DataType::kINT32:
-            return 4;
-        case nvinfer1::DataType::kFLOAT:
-            return 4;
-        case nvinfer1::DataType::kHALF:
-            return 2;
-        case nvinfer1::DataType::kBOOL:
-        // case nvinfer1::DataType::kUINT8:
-        case nvinfer1::DataType::kINT8:
-            return 1;
-        default:
-            break;
-            // case DataType::kUNKNOWN:
-            // case DataType::kINT64:
-            // case DataType::kFLOAT64:
-            // break;
-    }
-    return 0;
-}
-
-inline int64_t getWeightsSize(nvinfer1::Weights const& w, nvinfer1::DataType type) {
-    return w.count * getElementSize(type);
-}
-
-template <typename T>
-using cuda_shared_ptr = std::shared_ptr<T>;
-
-template <typename T>
-void make_cuda_shared(cuda_shared_ptr<T>& ptr, void* cudaMem) {
-    ptr.reset(static_cast<T*>(cudaMem), bert::CudaDeleter<T>());
-}
-
-struct WeightsWithOwnership : public nvinfer1::Weights {
-    ILogger* logger_;
-    WeightsWithOwnership() {
-        values = nullptr;
-        count = 0;
-    }
-    ~WeightsWithOwnership() { operator delete[](const_cast<void*>(values)); }
-
-    WeightsWithOwnership(WeightsWithOwnership const&) = delete;
-    WeightsWithOwnership operator=(WeightsWithOwnership const&) = delete;
-    WeightsWithOwnership(WeightsWithOwnership const&&) = delete;
-    WeightsWithOwnership operator=(WeightsWithOwnership const&&) = delete;
-
-    void convertAndCopy(nvinfer1::Weights const& src, nvinfer1::DataType type, float scale = 1) {
-        this->type = type;
-        this->count = src.count;
-
-        if (type == nvinfer1::DataType::kFLOAT) {
-            auto destBuf = new float[src.count];
-            this->values = destBuf;
-
-            if (src.type == nvinfer1::DataType::kFLOAT) {
-                ixrt_plugin::gLogInfo << "Float Weights(Host) => Float Array(Host)" << endl;
-                std::copy_n(static_cast<float const*>(src.values), src.count, destBuf);
-            } else {
-                IXRT_PLUGIN_ASSERT(src.type == nvinfer1::DataType::kHALF);
-
-                ixrt_plugin::gLogInfo << "Half Weights(Host) => Float Array(Host)" << endl;
-                auto const s = static_cast<half const*>(src.values);
-                auto d = static_cast<float*>(const_cast<void*>(this->values));
-
-                for (auto it = 0; it < src.count; it++) {
-                    d[it] = __half2float(s[it]);
-                }
-            }
-        } else if (type == nvinfer1::DataType::kHALF) {
-            auto destBuf = new half[src.count];
-            this->values = destBuf;
-
-            if (src.type == nvinfer1::DataType::kHALF) {
-                ixrt_plugin::gLogInfo << "Half Weights(Host) => Half Array(Host)" << endl;
-                std::copy_n(static_cast<half const*>(src.values), src.count, destBuf);
-            } else {
-                IXRT_PLUGIN_ASSERT(src.type == nvinfer1::DataType::kFLOAT);
-
-                ixrt_plugin::gLogInfo << "Float Weights(Host) => Half Array(Host)" << endl;
-                auto const s = static_cast<float const*>(src.values);
-                auto d = static_cast<half*>(const_cast<void*>(this->values));
-
-                for (auto it = 0; it < src.count; it++) {
-                    d[it] = __float2half(s[it]);
-                }
-            }
-        } else if (type == nvinfer1::DataType::kINT8) {
-            auto destBuf = new int8_t[src.count];
-            this->values = destBuf;
-
-            if (src.type == nvinfer1::DataType::kFLOAT) {
-                ixrt_plugin::gLogInfo << "Float Weights(Host) => Int8 Array(Host)" << endl;
-                auto const s = static_cast<float const*>(src.values);
-                auto d = static_cast<int8_t*>(const_cast<void*>(this->values));
-
-                for (auto it = 0; it < src.count; it++) {
-                    int32_t v = static_cast<int32_t>(std::roundf(s[it] / scale));
-                    d[it] = v <= -127 ? -127 : (v >= 127 ? 127 : v);
-                }
-            } else if (src.type == nvinfer1::DataType::kINT8) {
-                ixrt_plugin::gLogInfo << "Int8 Weights(Host) => Int8 Array(Host)" << endl;
-                std::copy_n(static_cast<int8_t const*>(src.values), src.count, destBuf);
-            } else {
-                throw std::runtime_error("Unsupported DataType specified for plugin.");
-            }
-        } else {
-            throw std::runtime_error("Unsupported DataType specified for plugin.");
-        }
-    }
-
-    void convertAndCopy(char const*& srcBuf, size_t count, nvinfer1::DataType type) noexcept {
-        this->type = type;
-        this->count = count;
-        auto const nbBytes = getWeightsSize(*this, type);
-        auto destBuf = new char[nbBytes];
-        this->values = destBuf;
-
-        std::copy_n(srcBuf, nbBytes, destBuf);
-        srcBuf += nbBytes;
-    }
-};
-
-template <typename T>
-inline void copyToDevice(WeightsWithOwnership& hostWeights, size_t nbBytes, cuda_unique_ptr<T>& cudaWeights) {
-    if (hostWeights.values) {
-        void* cudaMem{nullptr};
-        IXRT_PLUGIN_CUASSERT(cudaMalloc(&cudaMem, nbBytes));
-        IXRT_PLUGIN_CUASSERT(cudaMemcpy(cudaMem, hostWeights.values, nbBytes, cudaMemcpyHostToDevice));
-        cudaWeights.reset(static_cast<T*>(cudaMem));
-    }
-}
-
-template <typename T>
-inline void serFromDev(char*& buffer, T const* data, size_t nbElem) {
-    const size_t len = sizeof(T) * nbElem;
-    IXRT_PLUGIN_CUASSERT(cudaMemcpy(buffer, static_cast<void const*>(data), len, cudaMemcpyDeviceToHost));
-    buffer += len;
-}
-
-template <typename T>
-inline T* deserToDev(char const*& buffer, size_t nbElem) {
-    void* dev{nullptr};
-    const size_t len = sizeof(T) * nbElem;
-    IXRT_PLUGIN_CUASSERT(cudaMalloc(&dev, len));
-    IXRT_PLUGIN_CUASSERT(cudaMemcpy(dev, buffer, len, cudaMemcpyHostToDevice));
-
-    buffer += len;
-    return static_cast<T*>(dev);
-}
-
-inline nvinfer1::DataType fieldTypeToDataType(const nvinfer1::PluginFieldType ftype) {
-    switch (ftype) {
-        case nvinfer1::PluginFieldType::kFLOAT32: {
-            gLogInfo << "PluginFieldType is Float32" << endl;
-            return nvinfer1::DataType::kFLOAT;
-        }
-        case nvinfer1::PluginFieldType::kFLOAT16: {
-            gLogInfo << "PluginFieldType is Float16" << endl;
-            return nvinfer1::DataType::kHALF;
-        }
-        case nvinfer1::PluginFieldType::kINT32: {
-            gLogInfo << "PluginFieldType is Int32" << endl;
-            return nvinfer1::DataType::kINT32;
-        }
-        case nvinfer1::PluginFieldType::kINT8: {
-            gLogInfo << "PluginFieldType is Int8" << endl;
-            return nvinfer1::DataType::kINT8;
-        }
-        default:
-            throw std::invalid_argument("No corresponding datatype for plugin field type");
-    }
-}
-
-inline int64_t volume(nvinfer1::Dims const& d) {
-    return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies<int64_t>{});
-}
-} // namespace bert
-} // namespace ixrt_plugin
-} // namespace nvinfer1
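A short sketch of the intended flow of the helpers above: wrap raw float weights, convert them on the host, then mirror them to the device (buffer names hypothetical):

    nvinfer1::Weights raw{nvinfer1::DataType::kFLOAT, host_ptr, count};  // host_ptr is hypothetical
    nvinfer1::ixrt_plugin::bert::WeightsWithOwnership w;
    w.convertAndCopy(raw, nvinfer1::DataType::kHALF);  // float -> half on the host
    nvinfer1::ixrt_plugin::bert::cuda_unique_ptr<void> dev_w;
    copyToDevice(w, getWeightsSize(w, nvinfer1::DataType::kHALF), dev_w);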
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/common/checkMacrosPlugin.cpp b/models/nlp/plm/bert_base_squad/ixrt/src/common/checkMacrosPlugin.cpp
deleted file mode 100644
index 8e705d6c..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/common/checkMacrosPlugin.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include "checkMacrosPlugin.h"
-
-#include "NvInferRuntimeCommon.h"
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-
-ILogger* gLogger{};
-
-template <ILogger::Severity kSeverity>
-int32_t LogStream<kSeverity>::Buf::sync() {
-    std::string s = str();
-    while (!s.empty() && s.back() == '\n') {
-        s.pop_back();
-    }
-    if (gLogger != nullptr) {
-        gLogger->log(kSeverity, s.c_str());
-    }
-    str("");
-    return 0;
-}
-
-// These use gLogger, and therefore require initLibNvInferPlugins() to be called with a logger
-// (otherwise, it will not log)
-LogStream<ILogger::Severity::kERROR> gLogError;
-LogStream<ILogger::Severity::kWARNING> gLogWarning;
-LogStream<ILogger::Severity::kINFO> gLogInfo;
-LogStream<ILogger::Severity::kVERBOSE> gLogVerbose;
-
-} // namespace ixrt_plugin
-} // namespace nvinfer1
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/common/checkMacrosPlugin.h b/models/nlp/plm/bert_base_squad/ixrt/src/common/checkMacrosPlugin.h
deleted file mode 100644
index 76d87a92..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/common/checkMacrosPlugin.h
+++ /dev/null
@@ -1,221 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma once
-#include
-
-#include
-#include
-#include
-#include
-
-#include "NvInfer.h"
-#include "NvInferRuntime.h"
-
-// Logs failed assertion and aborts.
-// Aborting is undesirable and will be phased-out from the plugin module, at which point
-// PLUGIN_ASSERT will perform the same function as PLUGIN_VALIDATE.
-using namespace std;
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-
-#ifdef _MSC_VER
-#define FN_NAME __FUNCTION__
-#else
-#define FN_NAME __func__
-#endif
-
-#define IXRT_PLUGIN_CHECK_VALUE(value, msg)                                \
-    {                                                                      \
-        if (not(value)) {                                                  \
-            std::cerr << __FILE__ << " (" << __LINE__ << ")"               \
-                      << "-" << __FUNCTION__ << " : "                      \
-                      << " Plugin assert error: " << msg << std::endl;     \
-            std::exit(EXIT_FAILURE);                                       \
-        }                                                                  \
-    }
-
-#define IXRT_PLUGIN_ASSERT(value)                                          \
-    {                                                                      \
-        if (not(value)) {                                                  \
-            std::cerr << __FILE__ << " (" << __LINE__ << ")"               \
-                      << "-" << __FUNCTION__ << " : "                      \
-                      << " Plugin assert false" << std::endl;              \
-            std::exit(EXIT_FAILURE);                                       \
-        }                                                                  \
-    }
-
-#define IXRT_PLUGIN_CHECK_CUDA(call)                                       \
-    do {                                                                   \
-        const cudaError_t error_code = call;                               \
-        if (error_code != cudaSuccess) {                                   \
-            printf("CUDA Error:\n");                                       \
-            printf("    File:       %s\n", __FILE__);                      \
-            printf("    Line:       %d\n", __LINE__);                      \
-            printf("    Error code: %d\n", error_code);                    \
-            printf("    Error text: %s\n", cudaGetErrorString(error_code)); \
-            exit(1);                                                       \
-        }                                                                  \
-    } while (0)
-
-inline void caughtError(const std::exception& e) { std::cerr << e.what() << std::endl; }
-
-#define IXRT_PLUGIN_FAIL(msg)                          \
-    do {                                               \
-        std::ostringstream stream;                     \
-        stream << "Assertion failed: " << msg << "\n"  \
-               << __FILE__ << ':' << __LINE__ << "\n"  \
-               << "Aborting..."                        \
-               << "\n";                                \
-        IXRT_PLUGIN_CHECK_CUDA(cudaDeviceReset());     \
-        abort;                                         \
-    } while (0)
-
-inline void throwCudaError(char const* file, char const* function, int32_t line, int32_t status, char const* msg) {
-    std::cerr << file << " (" << line << ")"
-              << "-" << function << " : " << msg << std::endl;
-    std::exit(EXIT_FAILURE);
-}
-
-#define IXRT_PLUGIN_CUASSERT(status_)                                  \
-    {                                                                  \
-        auto s_ = status_;                                             \
-        if (s_ != cudaSuccess) {                                       \
-            const char* msg = cudaGetErrorString(s_);                  \
-            throwCudaError(__FILE__, FN_NAME, __LINE__, s_, msg);      \
-        }                                                              \
-    }
-
-#undef CUINFER_CHECK
-#define CUINFER_CHECK(func)                                                                 \
-    do {                                                                                    \
-        cuinferStatus_t status = (func);                                                    \
-        if (status != CUINFER_STATUS_SUCCESS) {                                             \
-            std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": "    \
-                      << cuinferGetErrorString(status) << std::endl;                        \
-            std::exit(EXIT_FAILURE);                                                        \
-        }                                                                                   \
-    } while (0)
-
-static std::string _cudaGetErrorString(cublasStatus_t error) {
-    switch (error) {
-        case CUBLAS_STATUS_SUCCESS:
-            return "CUBLAS_STATUS_SUCCESS";
-
-        case CUBLAS_STATUS_NOT_INITIALIZED:
-            return "CUBLAS_STATUS_NOT_INITIALIZED";
-
-        case CUBLAS_STATUS_ALLOC_FAILED:
-            return "CUBLAS_STATUS_ALLOC_FAILED";
-
-        case CUBLAS_STATUS_INVALID_VALUE:
-            return "CUBLAS_STATUS_INVALID_VALUE";
-
-        case CUBLAS_STATUS_ARCH_MISMATCH:
-            return "CUBLAS_STATUS_ARCH_MISMATCH";
-
-        case CUBLAS_STATUS_MAPPING_ERROR:
-            return "CUBLAS_STATUS_MAPPING_ERROR";
-
-        case CUBLAS_STATUS_EXECUTION_FAILED:
-            return "CUBLAS_STATUS_EXECUTION_FAILED";
-
-        case CUBLAS_STATUS_INTERNAL_ERROR:
-            return "CUBLAS_STATUS_INTERNAL_ERROR";
-
-        case CUBLAS_STATUS_NOT_SUPPORTED:
-            return "CUBLAS_STATUS_NOT_SUPPORTED";
-
-        case CUBLAS_STATUS_LICENSE_ERROR:
-            return "CUBLAS_STATUS_LICENSE_ERROR";
-    }
-    return "CUBLAS_UNKNOW";
-}
-
-template <typename T>
-void check_gpu_error(T result, char const* const func, const char* const file, int const line) {
-    if (result) {
-        throw std::runtime_error(std::string("[CUDA][ERROR] ") + +file + "(" + std::to_string(line) +
-                                 "): " + (_cudaGetErrorString(result)) + "\n");
-    }
-}
-
-#define CHECK_GPU_ERROR(val) check_gpu_error((val), #val, __FILE__, __LINE__)
-
-template <ILogger::Severity kSeverity>
-class LogStream : public std::ostream {
-    class Buf : public std::stringbuf {
-       public:
-        int32_t sync() override;
-    };
-
-    Buf buffer;
-    std::mutex mLogStreamMutex;
-
-   public:
-    std::mutex& getMutex() { return mLogStreamMutex; }
-    LogStream() : std::ostream(&buffer){};
-};
-
-// Use mutex to protect multi-stream write to buffer
-template <ILogger::Severity kSeverity, typename T>
-LogStream<kSeverity>& operator<<(LogStream<kSeverity>& stream, T const& msg) {
-    std::lock_guard<std::mutex> guard(stream.getMutex());
-    auto& os = static_cast<std::ostream&>(stream);
-    os << msg;
-    return stream;
-}
-
-// Special handling static numbers
-template <ILogger::Severity kSeverity>
-inline LogStream<kSeverity>& operator<<(LogStream<kSeverity>& stream, int32_t num) {
-    std::lock_guard<std::mutex> guard(stream.getMutex());
-    auto& os = static_cast<std::ostream&>(stream);
-    os << num;
-    return stream;
-}
-
-// Special handling std::endl
-template <ILogger::Severity kSeverity>
-inline LogStream<kSeverity>& operator<<(LogStream<kSeverity>& stream, std::ostream& (*f)(std::ostream&)) {
-    std::lock_guard<std::mutex> guard(stream.getMutex());
-    auto& os = static_cast<std::ostream&>(stream);
-    os << f;
-    return stream;
-}
-
-extern LogStream<ILogger::Severity::kERROR> gLogError;
-extern LogStream<ILogger::Severity::kWARNING> gLogWarning;
-extern LogStream<ILogger::Severity::kINFO> gLogInfo;
-extern LogStream<ILogger::Severity::kVERBOSE> gLogVerbose;
-} // namespace ixrt_plugin
-} // namespace nvinfer1
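A sketch of how the three error paths above differ in practice (assumes a live CUDA context; handle name hypothetical):

    cublasLtHandle_t handle;
    IXRT_PLUGIN_CUASSERT(cudaSetDevice(0));    // CUDA errors: prints file/line, then exits
    CHECK_GPU_ERROR(cublasLtCreate(&handle));  // cuBLAS status: throws std::runtime_error instead
    gLogInfo << "plugin ready" << std::endl;   // routed to the TensorRT ILogger once gLogger is set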
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/common/common_def.cuh b/models/nlp/plm/bert_base_squad/ixrt/src/common/common_def.cuh
deleted file mode 100644
index b9b9eb8e..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/common/common_def.cuh
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*/
-
-#pragma once
-
-#include
-
-#include
-namespace nvinfer1 {
-namespace ixrt_plugin {
-#ifdef __ILUVATAR__
-static const int kMaxThreadNbPerBlock = 1024;
-static const int kMaxBlockNbPerSM = 8;
-static const int kWarpSize = 64;
-static const dim3 kMaxBlockDimension = {4096, 4096, 64};
-static const dim3 kMaxGridDimension = {4294967295, 65536, 65536};
-static const int kNbThreadsPerBlockGainBestPerformance = 1024;
-static const int kMaxSharedMemSizePerBlock = (128 * 1024 * 4);
-static const int kNbSmemLane = 64;
-static const int kNbBytesPerSmemLane = 4;
-#else
-static const int kMaxThreadNbPerBlock = 1024;
-static const int kMaxBlockNbPerSM = 8;
-static const int kWarpSize = 32;
-static const dim3 kMaxBlockDimension = {1024, 1024, 64};
-static const dim3 kMaxGridDimension = {2147483647, 65535, 65535};
-static const int kNbThreadsPerBlockGainBestPerformance = 256;
-static const int kMaxSharedMemSizePerBlock = 48 * 1024 * 4;
-static const int kNbSmemLane = 32;
-static const int kNbBytesPerSmemLane = 4;
-#endif
-
-static const int kNbCe = 4;
-static const int kNbCuPerCe = 4;
-static const int kNbSppPerCu = 4;
-
-static const float kLog2e = 1.442695040888963387;
-
-#define DivUp(x, y) (((x) + (y)-1) / (y))
-
-__device__ __forceinline__ float floatExp(float x) { return __builtin_exp2f(kLog2e * x); }
-
-__device__ __forceinline__ float floatLog(float x) { return __logf(x); }
-
-__forceinline__ int nearest_num(int x, int value) {
-    if (x % value == 0) {
-        return x;
-    } else {
-        int padding = value - x % value;
-        return x + padding;
-    }
-}
-} // namespace nvinfer1::ixrt_plugin
-}
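The constants above feed launch configuration; a typical sizing sketch using `DivUp` (the kernel name is hypothetical, shown only for the launch shape):

    int total = batch * seq_len * hidden;
    dim3 block(kNbThreadsPerBlockGainBestPerformance);
    dim3 grid(DivUp(total, (int)block.x));
    // my_elementwise_kernel is a hypothetical kernel used to illustrate the launch.
    my_elementwise_kernel<<<grid, block, 0, stream>>>(d_in, d_out, total);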
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/common/plugin.cpp b/models/nlp/plm/bert_base_squad/ixrt/src/common/plugin.cpp
deleted file mode 100644
index 29908ff1..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/common/plugin.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include "plugin.h"
-#include "checkMacrosPlugin.h"
-
-namespace nvinfer1
-{
-namespace ixrt_plugin
-{
-
-void validateRequiredAttributesExist(std::set<std::string> requiredFieldNames, PluginFieldCollection const* fc)
-{
-    for (int32_t i = 0; i < fc->nbFields; i++)
-    {
-        requiredFieldNames.erase(fc->fields[i].name);
-    }
-    if (!requiredFieldNames.empty())
-    {
-        std::stringstream msg{};
-        msg << "PluginFieldCollection missing required fields: {";
-        char const* separator = "";
-        for (auto const& field : requiredFieldNames)
-        {
-            msg << separator << field;
-            separator = ", ";
-        }
-        msg << "}";
-        std::string msg_str = msg.str();
-        IXRT_PLUGIN_CHECK_VALUE(false, msg_str.c_str());
-    }
-}
-
-} // namespace ixrt_plugin
-} // namespace nvinfer1
\ No newline at end of file
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/common/plugin.h b/models/nlp/plm/bert_base_squad/ixrt/src/common/plugin.h
deleted file mode 100644
index b24ef300..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/common/plugin.h
+++ /dev/null
@@ -1,72 +0,0 @@
-
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma once
-#include
-#include
-#include
-#include "NvInferRuntimeCommon.h"
-
-typedef enum
-{
-    STATUS_SUCCESS = 0,
-    STATUS_FAILURE = 1,
-    STATUS_BAD_PARAM = 2,
-    STATUS_NOT_SUPPORTED = 3,
-    STATUS_NOT_INITIALIZED = 4
-} pluginStatus_t;
-
-namespace nvinfer1 {
-
-namespace ixrt_plugin {
-
-
-// Write values into buffer
-template <typename T>
-void write(char*& buffer, const T& val) {
-    std::memcpy(buffer, &val, sizeof(T));
-    buffer += sizeof(T);
-}
-
-// Read values from buffer
-template <typename T>
-T read(const char*& buffer) {
-    T val{};
-    std::memcpy(&val, buffer, sizeof(T));
-    buffer += sizeof(T);
-    return val;
-}
-
-void validateRequiredAttributesExist(std::set<std::string> requiredFieldNames, PluginFieldCollection const* fc);
-
-} // namespace ixrt_plugin
-} // namespace nvinfer1
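A round-trip sketch for the `write`/`read` helpers above, mirroring how the plugins serialize POD fields:

    char storage[sizeof(int32_t) + sizeof(float)];
    char* w = storage;
    write(w, int32_t{768});          // advances w by 4 bytes
    write(w, 0.125f);
    const char* r = storage;
    auto hidden = read<int32_t>(r);  // 768
    auto scale  = read<float>(r);    // 0.125f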
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/common/serialize.h b/models/nlp/plm/bert_base_squad/ixrt/src/common/serialize.h
deleted file mode 100644
index 11ef7eca..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/common/serialize.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma once
-
-#include
-#include
-#include
-#include
-
-#include
-using std::cerr;
-using std::cout;
-using std::endl;
-
-template <typename T>
-inline void serialize_value(void** buffer, T const& value);
-
-template <typename T>
-inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value);
-
-namespace
-{
-
-template <typename T, class Enable = void>
-struct Serializer
-{
-};
-
-template <typename T>
-struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value || std::is_enum<T>::value || std::is_pod<T>::value>::type>
-{
-    static size_t serialized_size(T const&)
-    {
-        return sizeof(T);
-    }
-    static void serialize(void** buffer, T const& value)
-    {
-        ::memcpy(*buffer, &value, sizeof(T));
-        reinterpret_cast<char*&>(*buffer) += sizeof(T);
-    }
-    static void deserialize(void const** buffer, size_t* buffer_size, T* value)
-    {
-        assert(*buffer_size >= sizeof(T));
-        ::memcpy(value, *buffer, sizeof(T));
-        reinterpret_cast<char const*&>(*buffer) += sizeof(T);
-        *buffer_size -= sizeof(T);
-    }
-};
-
-template <>
-struct Serializer<const char*>
-{
-    static size_t serialized_size(const char* value)
-    {
-        return strlen(value) + 1;
-    }
-    static void serialize(void** buffer, const char* value)
-    {
-        ::strcpy(static_cast<char*>(*buffer), value);
-        reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
-    }
-    static void deserialize(void const** buffer, size_t* buffer_size, const char** value)
-    {
-        *value = static_cast<char const*>(*buffer);
-        size_t data_size = strnlen(*value, *buffer_size) + 1;
-        assert(*buffer_size >= data_size);
-        reinterpret_cast<char const*&>(*buffer) += data_size;
-        *buffer_size -= data_size;
-    }
-};
-
-template <typename T>
-struct Serializer<std::vector<T>,
-                  typename std::enable_if<std::is_arithmetic<T>::value || std::is_enum<T>::value || std::is_pod<T>::value>::type>
-{
-    static size_t serialized_size(std::vector<T> const& value)
-    {
-        return sizeof(value.size()) + value.size() * sizeof(T);
-    }
-    static void serialize(void** buffer, std::vector<T> const& value)
-    {
-        serialize_value(buffer, value.size());
-        size_t nbyte = value.size() * sizeof(T);
-        ::memcpy(*buffer, value.data(), nbyte);
-        reinterpret_cast<char*&>(*buffer) += nbyte;
-    }
-    static void deserialize(void const** buffer, size_t* buffer_size, std::vector<T>* value)
-    {
-        size_t size;
-        deserialize_value(buffer, buffer_size, &size);
-        value->resize(size);
-        size_t nbyte = value->size() * sizeof(T);
-        assert(*buffer_size >= nbyte);
-        ::memcpy(value->data(), *buffer, nbyte);
-        reinterpret_cast<char const*&>(*buffer) += nbyte;
-        *buffer_size -= nbyte;
-    }
-};
-
-} // namespace
-
-template <typename T>
-inline size_t serialized_size(T const& value)
-{
-    return Serializer<T>::serialized_size(value);
-}
-
-template <typename T>
-inline void serialize_value(void** buffer, T const& value)
-{
-    return Serializer<T>::serialize(buffer, value);
-}
-
-template <typename T>
-inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value)
-{
-    return Serializer<T>::deserialize(buffer, buffer_size, value);
-}
\ No newline at end of file
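The vector specialization above stores a `size()` length prefix followed by the raw payload; a hedged round-trip sketch:

    std::vector<float> scales{0.5f, 0.25f};
    std::vector<char> buf(serialized_size(scales));  // sizeof(size_t) + 2 * sizeof(float)
    void* wp = buf.data();
    serialize_value(&wp, scales);
    void const* rp = buf.data();
    size_t remaining = buf.size();
    std::vector<float> restored;
    deserialize_value(&rp, &remaining, &restored);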
-*/ - -#include "NvInferRuntimeCommon.h" -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "cuda_runtime_api.h" -#include "driver_types.h" -#include "fcPlugin.h" -#include "plugin.h" -#include "serialize.h" -#include - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; -using namespace nvinfer1::ixrt_plugin::backend; - -namespace { -char const* const kFC_VERSION{"2"}; -char const* const kFC_NAME{"CustomFCPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection FCInt8PluginDynamicCreator::mFC{}; -std::vector FCInt8PluginDynamicCreator::mPluginAttributes; - -FCInt8PluginDynamicCreator::FCInt8PluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("out_dims", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("W", nullptr, PluginFieldType::kINT8, 1)); - mPluginAttributes.emplace_back(PluginField("fc_amax", nullptr, PluginFieldType::kFLOAT32, 2)); - - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* FCInt8PluginDynamicCreator::getPluginName() const noexcept { return kFC_NAME; } - -char const* FCInt8PluginDynamicCreator::getPluginVersion() const noexcept { return kFC_VERSION; } - -PluginFieldCollection const* FCInt8PluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2* FCInt8PluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept { - try { - gLogInfo << "Creating FCInt8PluginDynamicCreator..." << endl; - IXRT_PLUGIN_ASSERT(name != nullptr); - IXRT_PLUGIN_ASSERT(fc != nullptr); - - int32_t outDims = 0; - Weights W{DataType::kINT8, nullptr, 0LL}; - Weights Bias{DataType::kFLOAT, nullptr, 0LL}; - ixrt_plugin::validateRequiredAttributesExist({"out_dims", "W", "fc_amax"}, fc); - vector weight_scale; - - for (int32_t i = 0; i < fc->nbFields; i++) { - std::string fieldName(fc->fields[i].name); - if (fieldName.compare("out_dims") == 0) { - outDims = static_cast(fc->fields[i].data)[0]; - gLogInfo << "Building outDims: " << outDims << endl; - } - - if (fieldName.compare("W") == 0) { - gLogInfo << "Building W..." << endl; - W.values = fc->fields[i].data; - W.count = fc->fields[i].length; - W.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is W int8: " << (W.type == DataType::kINT8) << endl; - } - - if (fieldName.compare("Bias") == 0) { - gLogInfo << "Building Bias..." << endl; - Bias.values = fc->fields[i].data; - Bias.count = fc->fields[i].length; - Bias.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is Bias float32: " << (Bias.type == DataType::kFLOAT) << endl; - } - - if (fieldName.compare("fc_amax") == 0) { - gLogInfo << "Building fc_amax..." 
<< endl; - for (auto j = 0; j < fc->fields[i].length; j++) { - auto value = static_cast(fc->fields[i].data)[j]; - weight_scale.emplace_back(value / 127.0); - } - } - } - - if (outDims <= 0) { - gLogInfo << "Invalid output dimension" << endl; - } - if (W.count == 0 || W.values == nullptr || W.count < outDims) { - gLogInfo << "Invalid weights" << endl; - } - - DataType type = DataType::kINT8; - return new FCInt8PluginDynamic(name, type, outDims, W, Bias, weight_scale); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* FCInt8PluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - // This object will be deleted when the network is destroyed, which will - // call FCInt8PluginDynamic::destroy() - try { - return new FCInt8PluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void FCInt8PluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FCInt8PluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(FCInt8PluginDynamicCreator); -//#########################################################################// -FCInt8PluginDynamic::FCInt8PluginDynamic(std::string const name, DataType const type, int32_t const outDim, - Weights const& W, Weights const& Bias, vector const& scale) - : mLayerName(name), - mType(type), - mOutDim(outDim), - mNumParams(W.count), - mNmax(0), - mK(0), - mWdev(nullptr), - mNumBias(Bias.count), - mScale(scale), - mBiasdev(nullptr) { - if (W.type == nvinfer1::DataType::kFLOAT) { - float weight_max = std::numeric_limits::min(); - for (int64_t wb = 0, we = W.count; wb < we; ++wb) { - float val = static_cast(W.values)[wb]; - weight_max = std::max(weight_max, std::abs(val)); - } - // mWeightScale = 127 / weight_max; - } - - mW.convertAndCopy(W, DataType::kINT8, scale[0]); - copyToDevice(mW, getWeightsSize(mW, DataType::kINT8), mWdev); - if (Bias.values != nullptr) { - mBias.convertAndCopy(Bias, DataType::kFLOAT); - copyToDevice(mBias, getWeightsSize(mBias, DataType::kFLOAT), mBiasdev); - } -} - -FCInt8PluginDynamic::FCInt8PluginDynamic(std::string const name, void const* data, size_t length) - : mLayerName(name), mWdev(nullptr), mBiasdev(nullptr) { - gLogInfo << "FCInt8PluginDynamic deserialize" << endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mType); - deserialize_value(&data, &length, &mOutDim); - deserialize_value(&data, &length, &mNumParams); - deserialize_value(&data, &length, &mNmax); - deserialize_value(&data, &length, &mK); - deserialize_value(&data, &length, &mNumBias); - deserialize_value(&data, &length, &mScale); - - char const* d = static_cast(data); - - mW.convertAndCopy(d, mNumParams, DataType::kINT8); - copyToDevice(mW, getWeightsSize(mW, DataType::kINT8), mWdev); - if (mNumBias > 0) { - mBias.convertAndCopy(d, mNumBias, DataType::kFLOAT); - copyToDevice(mBias, getWeightsSize(mBias, DataType::kFLOAT), mBiasdev); - } -} - -// IPluginV2 Methods -char const* FCInt8PluginDynamic::getPluginType() const noexcept { return kFC_NAME; } - -char const* FCInt8PluginDynamic::getPluginVersion() const noexcept { return kFC_VERSION; } - -int32_t FCInt8PluginDynamic::getNbOutputs() const noexcept { return 1; } 
-
-int32_t FCInt8PluginDynamic::initialize() noexcept {
-    gLogInfo << "FCInt8PluginDynamic initialize" << endl;
-    return 0;
-}
-
-void FCInt8PluginDynamic::terminate() noexcept { gLogInfo << "FCInt8PluginDynamic terminate" << endl; }
-
-size_t FCInt8PluginDynamic::getSerializationSize() const noexcept {
-    return sizeof(mType) + sizeof(mOutDim) + sizeof(mNumParams) + sizeof(mNmax) + sizeof(mK) + sizeof(mNumBias) +
-           mScale.size() * sizeof(float) + sizeof(mScale.size()) + getElementSize(DataType::kINT8) * mNumParams +
-           getElementSize(DataType::kFLOAT) * mNumBias;
-}
-
-void FCInt8PluginDynamic::serialize(void* buffer) const noexcept {
-    serialize_value(&buffer, mType);
-    serialize_value(&buffer, mOutDim);
-    serialize_value(&buffer, mNumParams);
-    serialize_value(&buffer, mNmax);
-    serialize_value(&buffer, mK);
-    serialize_value(&buffer, mNumBias);
-    serialize_value(&buffer, mScale);
-
-    char* d = static_cast<char*>(buffer);
-    serFromDev(d, static_cast<char*>(mWdev.get()), mNumParams * getElementSize(DataType::kINT8));
-
-    if (mNumBias > 0) {
-        serFromDev(d, static_cast<char*>(mBiasdev.get()), mNumBias * getElementSize(DataType::kFLOAT));
-    }
-}
-
-void FCInt8PluginDynamic::destroy() noexcept {
-    gLogInfo << "FCInt8PluginDynamic destroy" << endl;
-    mWdev.reset(nullptr);
-    if (mNumBias > 0) {
-        mBiasdev.reset(nullptr);
-    }
-    delete this;
-}
-
-void FCInt8PluginDynamic::setPluginNamespace(char const* libNamespace) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(libNamespace != nullptr);
-        mNamespace = libNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* FCInt8PluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// IPluginV2Ext Methods
-DataType FCInt8PluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes,
-                                                int32_t nbInputs) const noexcept {
-    IXRT_PLUGIN_ASSERT(index == 0);
-    IXRT_PLUGIN_ASSERT(nbInputs == 1);
-    IXRT_PLUGIN_ASSERT(inputTypes != nullptr);
-    // IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kINT8);
-    return inputTypes[0];
-}
-
-// IPluginV2DynamicExt Methods
-IPluginV2DynamicExt* FCInt8PluginDynamic::clone() const noexcept {
-    try {
-        gLogInfo << "FCInt8PluginDynamic clone" << endl;
-
-        auto* p = new FCInt8PluginDynamic(mLayerName, mType, mOutDim, mW, mBias, mScale);
-        p->setPluginNamespace(mNamespace.c_str());
-
-        return p;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-DimsExprs FCInt8PluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs,
-                                                   IExprBuilder& exprBuilder) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(nbInputs == 1);
-        IXRT_PLUGIN_ASSERT(outputIndex == 0);
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        DimsExprs ret;
-        ret.nbDims = 5;
-        ret.d[0] = inputs[0].d[0];
-        ret.d[1] = inputs[0].d[1];
-        ret.d[2] = exprBuilder.constant(mOutDim);
-        ret.d[3] = exprBuilder.constant(1);
-        ret.d[4] = exprBuilder.constant(1);
-        return ret;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return DimsExprs{};
-}
-
-bool FCInt8PluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs,
-                                                    int32_t nbOutputs) noexcept {
-    IXRT_PLUGIN_ASSERT(nbInputs == 1);
-    IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-    IXRT_PLUGIN_ASSERT(inOut != nullptr);
-
-    PluginTensorDesc const& in = inOut[pos];
-    if (pos == 0) {
-        return (in.type == mType) && (in.format == TensorFormat::kLINEAR);
-    }
-    PluginTensorDesc const& prev = inOut[pos - 1];
-
-    // output
-    return in.type == prev.type && in.format == prev.format;
-}
-
-void FCInt8PluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs,
-                                          DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept {
-    try {
-        // Validate input arguments
-        IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-        IXRT_PLUGIN_ASSERT(nbInputs == 1);
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        IXRT_PLUGIN_ASSERT(outputs != nullptr);
-        IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type);
-        auto const& inDims0 = inputs[0].desc.dims;
-
-        IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5);
-        mK = inDims0.d[HDIM];  // hiddensize
-        // IXRT_PLUGIN_ASSERT(hiddenSize * mOutDim == mNumParams);
-        IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1);
-        IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1);
-#ifdef __ILUVATAR__
-        CUINFER_CHECK(cuinferCreate(&cuinfer_handle));
-#else
-        CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle));
-#endif
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-size_t FCInt8PluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs,
-                                             PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept {
-    int32_t const B = inputs[0].dims.d[BDIM];
-    int32_t const S = inputs[0].dims.d[SDIM];
-    int32_t const oE = outputs[0].dims.d[HDIM];
-#ifdef __ILUVATAR__
-    return B * S * oE * sizeof(int8_t);
-#else
-    return B * S * oE * sizeof(int32_t);
-#endif
-}
-
-int32_t FCInt8PluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc,
-                                     void const* const* inputs, void* const* outputs, void* workSpace,
-                                     cudaStream_t stream) noexcept {
-    try {
-#ifdef __ILUVATAR__
-        CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream));
-#endif
-        int32_t const S = inputDesc->dims.d[SDIM];
-        int32_t const B = inputDesc->dims.d[BDIM];
-        int32_t const E = inputDesc->dims.d[HDIM];
-        int32_t const oE = outputDesc->dims.d[HDIM];
-        int32_t const n = S * B;
-        IXRT_PLUGIN_ASSERT(n >= 0);
-
-        float qkv_in_scale = inputDesc[0].scale;
-        float qkv_wei_scale = mScale[0];
-        float output_scale = outputDesc[0].scale;
-        float qkv_out_scale;
-        if (mScale.size() == 2) {
-            qkv_out_scale = mScale[1];
-        } else {
-            qkv_out_scale = output_scale;
-        }
-#ifdef __ILUVATAR__
-        int8_t* buffer = static_cast<int8_t*>(workSpace);
-#else
-        int32_t* buffer = static_cast<int32_t*>(workSpace);
-#endif
-        if (mType == DataType::kINT8) {
-            auto const* const input = static_cast<int8_t const*>(inputs[0]);
-            auto* output = static_cast<int8_t*>(outputs[0]);
-            auto weight = static_cast<int8_t*>(mWdev.get());
-
-            float dequant_scale = (qkv_in_scale * qkv_wei_scale) / qkv_out_scale;
-
-            if (mBiasdev.get() != nullptr) {
-#ifdef __ILUVATAR__
-                cuinfer_i8_gemm(weight, input, nullptr, buffer, 1, oE, n, E, 0, 0, 0, dequant_scale, 0.0, 0,
-                                cuinfer_handle, stream);
-                dequantGemmWithBias(buffer, static_cast<float*>(mBiasdev.get()), output, B * S, oE, qkv_out_scale,
-                                    1.0 / output_scale, stream);
-#else
-                cublaslt_gemm(weight, input, buffer, 1, oE, n, E, 0, 0, 0, 1, blaslt_handle, stream);
-                dequantGemmWithBias(buffer, static_cast<float*>(mBiasdev.get()), output, B * S, oE, dequant_scale,
-                                    qkv_out_scale, 1.0 / output_scale, stream);
-#endif
-            } else {
-#ifdef __ILUVATAR__
-                cuinfer_i8_gemm(weight, input, nullptr, output, 1, oE, n, E, 0, 0, 0, dequant_scale, 0.0, 0,
-                                cuinfer_handle, stream);
-#else
-                cublaslt_gemm(weight, input, buffer, 1, oE, n, E, 0, 0, 0, 1, blaslt_handle, stream);
-                quantGemm(buffer, output, B * S, oE, dequant_scale, stream);
-#endif
-            }
-        } else {
-            gLogError << "Unsupported type error, expected [kINT8], but received " << static_cast<int32_t>(mType)
-                      << endl;
-            return STATUS_FAILURE;
-        }
-        return STATUS_SUCCESS;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return STATUS_FAILURE;
-}
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/custom_fc/fcInt8Plugin.cu b/models/nlp/plm/bert_base_squad/ixrt/src/custom_fc/fcInt8Plugin.cu
deleted file mode 100644
index 7e233c87..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/custom_fc/fcInt8Plugin.cu
+++ /dev/null
@@ -1,485 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*/
-#include "backend/bert/bert_helper.h"
-#include "fcPlugin.h"
-using namespace nvinfer1::ixrt_plugin::backend;
-namespace nvinfer1 {
-namespace ixrt_plugin {
-namespace bert {
-
-template <int THREAD_DATA_LEN>
-__global__ void dequant_gemm_without_bias(const int8_t* input, int8_t* output, int hidden_size, float dequant_scale,
-                                          float quant_scale, int num_per_tca) {
-    float4 val[THREAD_DATA_LEN];
-
-    int block_start = blockIdx.x * hidden_size;
-    input += block_start;
-    output += block_start;
-
-    char4* p_input = (char4*)input;
-    char4* p_output = (char4*)output;
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * num_per_tca;
-
-        val[it].x = __int2float_rn(p_input[element_index].x) * dequant_scale;
-        val[it].y = __int2float_rn(p_input[element_index].y) * dequant_scale;
-        val[it].z = __int2float_rn(p_input[element_index].z) * dequant_scale;
-        val[it].w = __int2float_rn(p_input[element_index].w) * dequant_scale;
-
-        char4 res = float42char4(val[it], quant_scale);
-        p_output[element_index] = res;
-    }
-}
-
-template <int THREAD_DATA_LEN>
-__global__ void dequant_gemm_with_bias(const int8_t* input, const float* bias, int8_t* output, int hidden_size,
-                                       float dequant_scale, float quant_scale, int num_per_tca) {
-    float4 val[THREAD_DATA_LEN];
-
-    int block_start = blockIdx.x * hidden_size;
-    input += block_start;
-    output += block_start;
-
-    char4* p_input = (char4*)input;
-    float4* p_bias = (float4*)bias;
-    char4* p_output = (char4*)output;
-
-    float4 bias_val;
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * num_per_tca;
-        bias_val.x = p_bias[element_index].x;
-        bias_val.y = p_bias[element_index].y;
-        bias_val.z = p_bias[element_index].z;
-        bias_val.w = p_bias[element_index].w;
-
-        val[it].x = __int2float_rn(p_input[element_index].x) * dequant_scale + bias_val.x;
-        val[it].y = __int2float_rn(p_input[element_index].y) * dequant_scale + bias_val.y;
-        val[it].z = __int2float_rn(p_input[element_index].z) * dequant_scale + bias_val.z;
-        val[it].w = __int2float_rn(p_input[element_index].w) * dequant_scale + bias_val.w;
-
-        char4 res = float42char4(val[it], quant_scale);
-        p_output[element_index] = res;
-    }
-}
-
-template <int THREAD_DATA_LEN>
-__global__ void dequant_gemm_with_bias(const int32_t* input, const float* bias, int8_t* output, int hidden_size,
-                                       float quant_scale1, float dequant_scale, float quant_scale2, int num_per_tca) {
-    float4 val[THREAD_DATA_LEN];
-
-    int block_start = blockIdx.x * hidden_size;
-    input += block_start;
-    output += block_start;
-
-    int4*
p_input = (int4*)input; - float4* p_bias = (float4*)bias; - char4* p_output = (char4*)output; - - float4 bias_val; -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * num_per_tca; - bias_val.x = p_bias[element_index].x; - bias_val.y = p_bias[element_index].y; - bias_val.z = p_bias[element_index].z; - bias_val.w = p_bias[element_index].w; - - char4 q_input; - q_input.x = float2int8(p_input[element_index].x*1.0, quant_scale1); - q_input.y = float2int8(p_input[element_index].y*1.0, quant_scale1); - q_input.z = float2int8(p_input[element_index].z*1.0, quant_scale1); - q_input.w = float2int8(p_input[element_index].w*1.0, quant_scale1); - - val[it].x = __int2float_rn(q_input.x) * dequant_scale + bias_val.x; - val[it].y = __int2float_rn(q_input.y) * dequant_scale + bias_val.y; - val[it].z = __int2float_rn(q_input.z) * dequant_scale + bias_val.z; - val[it].w = __int2float_rn(q_input.w) * dequant_scale + bias_val.w; - - char4 res = float42char4(val[it], quant_scale2); - p_output[element_index] = res; - } -} - -void dequantGemmWithoutBias(int8_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale, - float quant_scale, cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size / 4 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - int num_per_tca = 64; - dim3 gridSize(batch_seq_len); - dim3 blockSize(num_per_tca); - - int num_warp = hidden_size / num_per_tca / 4; - - switch (num_warp) { - case 1: - dequant_gemm_without_bias<1> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 2: - dequant_gemm_without_bias<2> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 3: - dequant_gemm_without_bias<3> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 4: - dequant_gemm_without_bias<4> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 5: - dequant_gemm_without_bias<5> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 6: - dequant_gemm_without_bias<6> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 7: - dequant_gemm_without_bias<7> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 8: - dequant_gemm_without_bias<8> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 9: - dequant_gemm_without_bias<9> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 10: - dequant_gemm_without_bias<10> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 11: - dequant_gemm_without_bias<11> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 12: - dequant_gemm_without_bias<12> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 13: - dequant_gemm_without_bias<13> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 14: - dequant_gemm_without_bias<14> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 15: - dequant_gemm_without_bias<15> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 16: - 
dequant_gemm_without_bias<16> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - default: - throw std::runtime_error("dequantGemmWithoutBias"); - break; - } -} - -void dequantGemmWithBias(int8_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size, - float dequant_scale, float quant_scale, cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size / 4 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - int num_per_tca = 64; - dim3 gridSize(batch_seq_len); - dim3 blockSize(num_per_tca); - - int num_warp = hidden_size / num_per_tca / 4; - - switch (num_warp) { - case 1: - dequant_gemm_with_bias<1> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 2: - dequant_gemm_with_bias<2> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 3: - dequant_gemm_with_bias<3> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 4: - dequant_gemm_with_bias<4> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 5: - dequant_gemm_with_bias<5> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 6: - dequant_gemm_with_bias<6> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 7: - dequant_gemm_with_bias<7> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 8: - dequant_gemm_with_bias<8> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 9: - dequant_gemm_with_bias<9> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 10: - dequant_gemm_with_bias<10> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 11: - dequant_gemm_with_bias<11> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 12: - dequant_gemm_with_bias<12> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 13: - dequant_gemm_with_bias<13> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 14: - dequant_gemm_with_bias<14> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 15: - dequant_gemm_with_bias<15> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 16: - dequant_gemm_with_bias<16> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - default: - throw std::runtime_error("dequantGemmWithBias with int8_t input"); - break; - } -} - -void dequantGemmWithBias(int32_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size, - float quant_scale1, float dequant_scale, float quant_scale2, cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size / 4 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - int num_per_tca = 64; - dim3 gridSize(batch_seq_len); - dim3 blockSize(num_per_tca); - - int num_warp = hidden_size / num_per_tca / 4; - - switch (num_warp) { - case 1: - dequant_gemm_with_bias<1> - <<>>(input, bias, output, 
hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 2: - dequant_gemm_with_bias<2> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 3: - dequant_gemm_with_bias<3> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 4: - dequant_gemm_with_bias<4> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 5: - dequant_gemm_with_bias<5> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 6: - dequant_gemm_with_bias<6> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 7: - dequant_gemm_with_bias<7> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 8: - dequant_gemm_with_bias<8> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 9: - dequant_gemm_with_bias<9> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 10: - dequant_gemm_with_bias<10> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 11: - dequant_gemm_with_bias<11> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 12: - dequant_gemm_with_bias<12> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 13: - dequant_gemm_with_bias<13> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 14: - dequant_gemm_with_bias<14> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 15: - dequant_gemm_with_bias<15> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 16: - dequant_gemm_with_bias<16> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - default: - throw std::runtime_error("dequantGemmWithBias with int32_t input"); - break; - } -} - -template -__global__ void quant_gemm(const int32_t* input, int8_t* output, int hidden_size, float quant_scale, int num_per_tca) { - float4 val[THREAD_DATA_LEN]; - - int block_start = blockIdx.x * hidden_size; - input += block_start; - output += block_start; - - int4* p_input = (int4*)input; - char4* p_output = (char4*)output; - - float4 bias_val; -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * num_per_tca; - char4 q_input; - q_input.x = float2int8(p_input[element_index].x*1.0, quant_scale); - q_input.y = float2int8(p_input[element_index].y*1.0, quant_scale); - q_input.z = float2int8(p_input[element_index].z*1.0, quant_scale); - q_input.w = float2int8(p_input[element_index].w*1.0, quant_scale); - - p_output[element_index] = q_input; - } -} - -void quantGemm(int32_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale, cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size / 4 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - int num_per_tca = 64; - dim3 
gridSize(batch_seq_len); - dim3 blockSize(num_per_tca); - - int num_warp = hidden_size / num_per_tca / 4; - - switch (num_warp) { - case 1: - quant_gemm<1> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 2: - quant_gemm<2> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 3: - quant_gemm<3> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 4: - quant_gemm<4> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 5: - quant_gemm<5> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 6: - quant_gemm<6> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 7: - quant_gemm<7> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 8: - quant_gemm<8> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 9: - quant_gemm<9> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 10: - quant_gemm<10> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 11: - quant_gemm<11> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 12: - quant_gemm<12> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 13: - quant_gemm<13> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 14: - quant_gemm<14> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 15: - quant_gemm<15> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 16: - quant_gemm<16> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - default: - throw std::runtime_error("quantGemm"); - break; - } -} - -} // namespace bert -} // namespace ixrt_plugin -} // namespace nvinfer1 diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/custom_fc/fcPlugin.cpp b/models/nlp/plm/bert_base_squad/ixrt/src/custom_fc/fcPlugin.cpp deleted file mode 100644 index 67541535..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/custom_fc/fcPlugin.cpp +++ /dev/null @@ -1,345 +0,0 @@ -#include "fcPlugin.h" - -#include "NvInferRuntimeCommon.h" -#ifdef __ILUVATAR__ -#include "backend/ixinfer/ixinfer_gemm_helper.h" -#endif -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; -using namespace nvinfer1::ixrt_plugin::backend; - -namespace { -char const* const kFC_VERSION{"1"}; -char const* const kFC_NAME{"CustomFCPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection FCPluginDynamicCreator::mFC{}; -std::vector FCPluginDynamicCreator::mPluginAttributes; - -FCPluginDynamicCreator::FCPluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("out_dims", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("W", nullptr, PluginFieldType::kFLOAT32, 1)); - - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* FCPluginDynamicCreator::getPluginName() const noexcept { return kFC_NAME; } - -char const* FCPluginDynamicCreator::getPluginVersion() const noexcept { return kFC_VERSION; } - -PluginFieldCollection const* 
FCPluginDynamicCreator::getFieldNames() noexcept { return &mFC; }
-
-IPluginV2* FCPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept {
-    try {
-        gLogInfo << "Creating FCPluginDynamicCreator..." << endl;
-        IXRT_PLUGIN_ASSERT(name != nullptr);
-        IXRT_PLUGIN_ASSERT(fc != nullptr);
-
-        int32_t outDims = 0;
-        int32_t typeId = -1;
-        Weights W{DataType::kFLOAT, nullptr, 0LL};
-        Weights B{DataType::kFLOAT, nullptr, 0LL};
-        ixrt_plugin::validateRequiredAttributesExist({"out_dims", "type_id", "W"}, fc);
-
-        for (int32_t i = 0; i < fc->nbFields; i++) {
-            std::string fieldName(fc->fields[i].name);
-            if (fieldName.compare("out_dims") == 0) {
-                outDims = static_cast<int32_t const*>(fc->fields[i].data)[0];
-                gLogInfo << "Building outDims: " << outDims << endl;
-            }
-
-            if (fieldName.compare("type_id") == 0) {
-                typeId = static_cast<int32_t const*>(fc->fields[i].data)[0];
-                gLogInfo << "Building typeId: " << typeId << endl;
-            }
-
-            if (fieldName.compare("W") == 0) {
-                gLogInfo << "Building W..." << endl;
-                W.values = fc->fields[i].data;
-                W.count = fc->fields[i].length;
-                W.type = fieldTypeToDataType(fc->fields[i].type);
-                gLogInfo << "Is W float32: " << (W.type == DataType::kFLOAT) << endl;
-            }
-
-            if (fieldName.compare("B") == 0) {
-                gLogInfo << "Building B..." << endl;
-                B.values = fc->fields[i].data;
-                B.count = fc->fields[i].length;
-                B.type = fieldTypeToDataType(fc->fields[i].type);
-                gLogInfo << "Is B float32: " << (B.type == DataType::kFLOAT) << endl;
-            }
-        }
-
-        if (outDims <= 0) {
-            gLogInfo << "Invalid output dimension" << endl;
-        }
-        if (typeId < 0 || typeId > 1) {
-            gLogInfo << "Invalid type id" << typeId << endl;
-        }
-        if (W.count == 0 || W.values == nullptr || W.count < outDims) {
-            gLogInfo << "Invalid weights" << endl;
-        }
-
-        DataType type = typeId == 0 ?
DataType::kFLOAT : DataType::kHALF; - return new FCPluginDynamic(name, type, outDims, W, B); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* FCPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - // This object will be deleted when the network is destroyed, which will - // call FCPluginDynamic::destroy() - try { - return new FCPluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void FCPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FCPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(FCPluginDynamicCreator); -//#########################################################################// -FCPluginDynamic::FCPluginDynamic(std::string const name, DataType const type, int32_t const outDim, Weights const& W, - Weights const& B) - : mLayerName(name), - mType(type), - mOutDim(outDim), - mNumParams(W.count), - mNumBias(B.count), - mWdev(nullptr), - mBdev(nullptr) { - mW.convertAndCopy(W, mType); - copyToDevice(mW, getWeightsSize(mW, mType), mWdev); - if (mNumBias) { - mB.convertAndCopy(B, mType); - copyToDevice(mB, getWeightsSize(mB, mType), mBdev); - } -} - -FCPluginDynamic::FCPluginDynamic(std::string const name, void const* data, size_t length) - : mLayerName(name), mWdev(nullptr) { - gLogInfo << "FCPluginDynamic deserialize" << endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mType); - deserialize_value(&data, &length, &mOutDim); - deserialize_value(&data, &length, &mNumParams); - deserialize_value(&data, &length, &mNumBias); - - char const* d = static_cast(data); - - mW.convertAndCopy(d, mNumParams, mType); - copyToDevice(mW, getWeightsSize(mW, mType), mWdev); - if (mNumBias) { - mB.convertAndCopy(d, mNumBias, mType); - copyToDevice(mB, getWeightsSize(mB, mType), mBdev); - } -} - -// IPluginV2 Methods -char const* FCPluginDynamic::getPluginType() const noexcept { return kFC_NAME; } - -char const* FCPluginDynamic::getPluginVersion() const noexcept { return kFC_VERSION; } - -int32_t FCPluginDynamic::getNbOutputs() const noexcept { return 1; } - -int32_t FCPluginDynamic::initialize() noexcept { - gLogInfo << "FCPluginDynamic initialize" << endl; - return 0; -} - -void FCPluginDynamic::terminate() noexcept { gLogInfo << "FCPluginDynamic terminate" << endl; } - -size_t FCPluginDynamic::getSerializationSize() const noexcept { - size_t wordSize = getElementSize(mType); - return wordSize * (mNumParams + mNumBias) + sizeof(mType) + sizeof(mOutDim) + sizeof(mNumParams) + sizeof(mNumBias); -} - -void FCPluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mType); - serialize_value(&buffer, mOutDim); - serialize_value(&buffer, mNumParams); - serialize_value(&buffer, mNumBias); - - size_t wordSize = getElementSize(mType); - char* d = static_cast(buffer); - serFromDev(d, static_cast(mWdev.get()), mNumParams * wordSize); - if (mNumBias) { - serFromDev(d, static_cast(mBdev.get()), mNumBias * wordSize); - } -} - -void FCPluginDynamic::destroy() noexcept { - gLogInfo << "FCPluginDynamic destroy" << endl; - mWdev.reset(nullptr); - if (mNumBias) { - mBdev.reset(nullptr); - } - delete this; -} - 
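
serialize() and the deserializing constructor above must walk the exact same field order (type, out dim, parameter count, bias count, then the raw weight bytes), or the engine will misread every subsequent field. A simplified sketch of the helper contract they rely on, assuming plain POD fields only (the real serialize.h used here also covers std::vector; this rendering is an assumption for illustration):

    #include <cstddef>
    #include <cstring>

    // Write one POD value and advance the write cursor.
    template <typename T>
    void serialize_value(void** buffer, T const& value) {
        std::memcpy(*buffer, &value, sizeof(T));
        *buffer = static_cast<char*>(*buffer) + sizeof(T);
    }

    // Read one POD value, advance the read cursor, and shrink the
    // remaining-length counter so over-reads can be detected.
    template <typename T>
    void deserialize_value(void const** data, size_t* length, T* value) {
        std::memcpy(value, *data, sizeof(T));
        *data = static_cast<char const*>(*data) + sizeof(T);
        *length -= sizeof(T);
    }

Because the device-side weights are appended after the scalar fields, both sides also advance a raw char pointer past the scalars before copying the weight blob, which is what the serFromDev / convertAndCopy pairing does.
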
-void FCPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FCPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType FCPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF); - return inputTypes[0]; -} - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* FCPluginDynamic::clone() const noexcept { - try { - gLogInfo << "FCPluginDynamic clone" << endl; - - auto* p = new FCPluginDynamic(mLayerName, mType, mOutDim, mW, mB); - p->setPluginNamespace(mNamespace.c_str()); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs FCPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(outputIndex == 0); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - ret.d[2] = exprBuilder.constant(mOutDim); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool FCPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(inOut != nullptr); - - PluginTensorDesc const& in = inOut[pos]; - if (pos == 0) { - return (in.type == mType) && (in.format == TensorFormat::kLINEAR); - } - PluginTensorDesc const& prev = inOut[pos - 1]; - - // output - return in.type == prev.type && in.format == prev.format; -} - -void FCPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - try { - // Validate input arguments - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type); - auto const& inDims0 = inputs[0].desc.dims; - - IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5); - // IXRT_PLUGIN_ASSERT(hiddenSize * mOutDim == mNumParams); - IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1); - IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1); -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferCreate(&cuinfer_handle)); -#else - CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle)); -#endif - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t FCPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { - return 0; -} - -int32_t FCPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workSpace, - cudaStream_t stream) noexcept { - gLogInfo << "in FCPluginDynamic.." 
<< endl; - try { -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream)); -#endif - int32_t const S = inputDesc->dims.d[SDIM]; - int32_t const B = inputDesc->dims.d[BDIM]; - int32_t const E = inputDesc->dims.d[HDIM]; - int32_t const n = S * B; - IXRT_PLUGIN_ASSERT(n >= 0); - - if (mType == DataType::kHALF) { - auto const* const input = static_cast(inputs[0]); - auto* output = static_cast(outputs[0]); - auto weight = static_cast(mWdev.get()); - half* bias = nullptr; - if (mNumBias) { - bias = static_cast(mBdev.get()); - } - -#ifdef __ILUVATAR__ - cuinfer_gemm(weight, input, bias, output, 1, mOutDim, n, E, 0, 0, 0, 1.0f, -1, stream, cuinfer_handle); -#else - cublaslt_gemm(weight, input, output, 1, mOutDim, n, E, 0, 0, 0, 1.0f, blaslt_handle, stream); -#endif - } else { - gLogError << "Unsupported type error, expected [kHALF,kFLOAT], but received " << static_cast(mType) - << endl; - return STATUS_FAILURE; - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/custom_fc/fcPlugin.h b/models/nlp/plm/bert_base_squad/ixrt/src/custom_fc/fcPlugin.h deleted file mode 100644 index 2f9115dc..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/custom_fc/fcPlugin.h +++ /dev/null @@ -1,246 +0,0 @@ - -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -#pragma once -#include -#include - -#include "NvInferRuntime.h" -#include "NvInferRuntimeCommon.h" -#include "bertCommon.h" -#include "driver_types.h" - -#ifdef __ILUVATAR__ -#include "backend/ixinfer/ixinfer_gemm_helper.h" -#else -#include "backend/cublas/cublas_helper.h" -#endif - -namespace nvinfer1 { -namespace ixrt_plugin { -namespace bert { - -void quantGemm(int32_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale, - cudaStream_t stream); - -void dequantGemmWithBias(int32_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size, - float dequant_scale1, float dequant_scale2, float quant_scale, cudaStream_t stream); - -void dequantGemmWithBias(int8_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size, - float dequant_scale, float quant_scale, cudaStream_t stream); - -void dequantGemmWithoutBias(int8_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale, - float quant_scale, cudaStream_t stream); - -class FCPluginDynamic : public nvinfer1::IPluginV2DynamicExt { - public: - FCPluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim, - nvinfer1::Weights const& W, nvinfer1::Weights const& B); - - FCPluginDynamic(std::string const name, void const* data, size_t length); - - // It doesn't make sense to make FCPluginDynamic without arguments, so we - // delete default constructor. - FCPluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept override; - - private: - std::string const mLayerName; - std::string mNamespace; - - nvinfer1::DataType mType; - size_t mOutDim; // leading dim - size_t mNumParams; - size_t mNumBias; - - bert::WeightsWithOwnership mW; - bert::cuda_unique_ptr mWdev; - bert::WeightsWithOwnership mB; - bert::cuda_unique_ptr mBdev; - -#ifdef __ILUVATAR__ - cuinferHandle_t cuinfer_handle; -#else - cublasLtHandle_t 
blaslt_handle; -#endif - cudaStream_t stream; -}; - -class FCPluginDynamicCreator : public nvinfer1::IPluginCreator { - public: - FCPluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - - private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -class FCInt8PluginDynamic : public nvinfer1::IPluginV2DynamicExt { - public: - FCInt8PluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim, - nvinfer1::Weights const& W, nvinfer1::Weights const& Bias, vector const& scale); - - FCInt8PluginDynamic(std::string const name, void const* data, size_t length); - - // It doesn't make sense to make FCInt8PluginDynamic without arguments, so we - // delete default constructor. - FCInt8PluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept override; - - private: - std::string const mLayerName; - std::string mNamespace; - - nvinfer1::DataType mType; - size_t mOutDim; // leading dim - size_t mNumParams; - int32_t mNmax; - int32_t mK; - int32_t mNumBias; - - vector mScale; - - bert::WeightsWithOwnership mW; - bert::cuda_unique_ptr mWdev; - - bert::WeightsWithOwnership mBias; - bert::cuda_unique_ptr mBiasdev; - -#ifdef __ILUVATAR__ - cuinferHandle_t cuinfer_handle; -#else - cublasLtHandle_t blaslt_handle; -#endif - cudaStream_t stream; -}; 
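
With the creator registered (the REGISTER_TENSORRT_PLUGIN line earlier is commented out, so registration evidently happens elsewhere), a network obtains the plugin through the TensorRT plugin registry by name and version. A hedged sketch of that lookup, using the kFC_NAME / kFC_VERSION strings and the out_dims / type_id / W attributes declared above; the wrapper function itself is hypothetical:

    #include <NvInferRuntime.h>
    using namespace nvinfer1;

    // Look up the FC plugin creator and build one instance from a field
    // collection. Illustrative sketch only; error handling omitted.
    IPluginV2* makeCustomFC(int32_t outDims, int32_t typeId, Weights const& w) {
        auto* creator = getPluginRegistry()->getPluginCreator(
            "CustomFCPluginDynamic_IxRT", "1");  // kFC_NAME / kFC_VERSION
        PluginField fields[] = {
            {"out_dims", &outDims, PluginFieldType::kINT32, 1},
            {"type_id", &typeId, PluginFieldType::kINT32, 1},
            {"W", w.values, PluginFieldType::kFLOAT32, static_cast<int32_t>(w.count)},
        };
        PluginFieldCollection fc{3, fields};
        return creator->createPlugin("custom_fc", &fc);  // caller owns the plugin
    }

The creator then routes type_id 0/1 to the FP32/FP16 FCPluginDynamic path, while the INT8 variant additionally consumes the per-tensor scale fields shown earlier.
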
- -class FCInt8PluginDynamicCreator : public nvinfer1::IPluginCreator { - public: - FCInt8PluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - - private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -} // namespace bert -} // namespace ixrt_plugin -} // namespace nvinfer1 diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cpp b/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cpp deleted file mode 100644 index 292e8a63..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cpp +++ /dev/null @@ -1,503 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/ -#include "NvInferImpl.h" -#include "NvInferRuntimeCommon.h" -#include "checkMacrosPlugin.h" -#include "common_def.cuh" -#include "driver_types.h" -#include "embLayerNormInt8Plugin.h" -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* EMB_LAYER_NORM_INT8_VERSION{"2"}; -char const* EMB_LAYER_NORM_INT8_NAME{"CustomEmbLayerNormPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection EmbLayerNormInt8PluginDynamicCreator::mFC{}; -std::vector EmbLayerNormInt8PluginDynamicCreator::mPluginAttributes; - -EmbLayerNormInt8PluginDynamicCreator::EmbLayerNormInt8PluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_beta")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_gamma")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_word_embeddings")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_token_type_embeddings")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_position_embeddings")); - mPluginAttributes.emplace_back(PluginField("output_fp16")); - mPluginAttributes.emplace_back(PluginField("full_mask")); - mPluginAttributes.emplace_back(PluginField("mha_type_id")); - mPluginAttributes.emplace_back(PluginField("pad_id")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* EmbLayerNormInt8PluginDynamicCreator::getPluginName() const noexcept { return EMB_LAYER_NORM_INT8_NAME; } - -char const* EmbLayerNormInt8PluginDynamicCreator::getPluginVersion() const noexcept { - return EMB_LAYER_NORM_INT8_VERSION; -} - -PluginFieldCollection const* EmbLayerNormInt8PluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2DynamicExt* EmbLayerNormInt8PluginDynamicCreator::createPlugin(char const* name, - PluginFieldCollection const* fc) noexcept { - try { - IXRT_PLUGIN_ASSERT(fc != nullptr); - gLogInfo << "EmbLayerNormInt8PluginDynamic createPlugin." << endl; - std::set const requiredAttributes{ - "bert_embeddings_layernorm_beta", "bert_embeddings_layernorm_gamma", - "bert_embeddings_word_embeddings", "bert_embeddings_token_type_embeddings", - "bert_embeddings_position_embeddings", - }; - - bool output_fp16 = false; - bool useFullMask = false; - Weights beta{}; - Weights gamma{}; - Weights word_emb{}; - Weights pos_emb{}; - Weights tok_emb{}; - int32_t mhaTypeId = 0; - int32_t pad_id = 0; - - for (auto i = 0; i < fc->nbFields; i++) { - std::string field_name(fc->fields[i].name); - if (field_name.compare("bert_embeddings_layernorm_beta") == 0) { - gLogInfo << "Building bert_embeddings_layernorm_beta..." << endl; - beta.values = fc->fields[i].data; - beta.count = fc->fields[i].length; - beta.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_layernorm_gamma") == 0) { - gLogInfo << "Building bert_embeddings_layernorm_gamma..." << endl; - gamma.values = fc->fields[i].data; - gamma.count = fc->fields[i].length; - gamma.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_word_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_word_embeddings..." 
<< endl; - word_emb.values = fc->fields[i].data; - word_emb.count = fc->fields[i].length; - word_emb.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_token_type_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_token_type_embeddings..." << endl; - tok_emb.values = fc->fields[i].data; - tok_emb.count = fc->fields[i].length; - tok_emb.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_position_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_position_embeddings..." << endl; - pos_emb.values = fc->fields[i].data; - pos_emb.count = fc->fields[i].length; - pos_emb.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("output_fp16") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32); - output_fp16 = static_cast(fc->fields[i].data)[0] != 0; - gLogInfo << "Building output_fp16: " << output_fp16 << endl; - } - - if (field_name.compare("full_mask") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32); - useFullMask = static_cast(fc->fields[i].data)[0] != 0; - gLogInfo << "Building full_mask: " << useFullMask << endl; - } - - if (field_name.compare("mha_type_id") == 0) { - mhaTypeId = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_ASSERT(mhaTypeId >= 0 && mhaTypeId < 3); - gLogInfo << "Building mha typeId: " << mhaTypeId << endl; - } - - if (field_name.compare("pad_id") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32) - pad_id = *static_cast(fc->fields[i].data); - } - } - gLogInfo << "Building EmbLayerNormInt8PluginDynamic Plugin..." << endl; - DataType mhaType = static_cast(mhaTypeId); - EmbLayerNormInt8PluginDynamic* p = - new EmbLayerNormInt8PluginDynamic(name, output_fp16 ? 
DataType::kHALF : DataType::kFLOAT, mhaType, beta, - gamma, word_emb, pos_emb, tok_emb, useFullMask, pad_id); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2DynamicExt* EmbLayerNormInt8PluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - try { - IXRT_PLUGIN_ASSERT(serialData != nullptr); - return new EmbLayerNormInt8PluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void EmbLayerNormInt8PluginDynamicCreator::setPluginNamespace(char const* pluginNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(pluginNamespace != nullptr); - mNamespace = pluginNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* EmbLayerNormInt8PluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(EmbLayerNormInt8PluginDynamicCreator); - -//#########################################################################// -EmbLayerNormInt8PluginDynamic::EmbLayerNormInt8PluginDynamic(std::string const& name, DataType const type, - DataType const mhaType, Weights const& beta, - Weights const& gamma, Weights const& wordEmb, - Weights const& posEmb, Weights const& tokEmb, - bool const useFullMask, int32_t padId) - : mLayerName(name), - mHiddenSize(beta.count), - mEmbType(type), - mUseFullMask(useFullMask), - mMhaType(mhaType), - mPadId(padId) { - IXRT_PLUGIN_ASSERT(beta.count == gamma.count); - IXRT_PLUGIN_ASSERT(mHiddenSize > 0U); - IXRT_PLUGIN_ASSERT(wordEmb.count % mHiddenSize == 0); - IXRT_PLUGIN_ASSERT(posEmb.count % mHiddenSize == 0); - IXRT_PLUGIN_ASSERT(tokEmb.count % mHiddenSize == 0); - mWordVocabSize = wordEmb.count / mHiddenSize; - mPosVocabSize = posEmb.count / mHiddenSize; - mTokVocabSize = tokEmb.count / mHiddenSize; - - mBeta.convertAndCopy(beta, nvinfer1::DataType::kFLOAT); - mGamma.convertAndCopy(gamma, nvinfer1::DataType::kFLOAT); - mWordEmb.convertAndCopy(wordEmb, mEmbType); - mTokEmb.convertAndCopy(tokEmb, mEmbType); - mPosEmb.convertAndCopy(posEmb, mEmbType); - - copyToDevice(mGamma, sizeof(float) * mGamma.count, mGammaDev); - copyToDevice(mBeta, sizeof(float) * mBeta.count, mBetaDev); - copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev); - copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev); - copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev); -} - -EmbLayerNormInt8PluginDynamic::EmbLayerNormInt8PluginDynamic(std::string const& name, void const* data, size_t length) - : mLayerName(name), - mGammaDev(nullptr), - mBetaDev(nullptr), - mWordEmbDev(nullptr), - mTokEmbDev(nullptr), - mPosEmbDev(nullptr) { - gLogInfo << "EmbLayerNormInt8PluginDynamic deserialize." 
<< endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mEmbType); - deserialize_value(&data, &length, &mMhaType); - deserialize_value(&data, &length, &mHiddenSize); - deserialize_value(&data, &length, &mSeqLen); - deserialize_value(&data, &length, &mPadId); - deserialize_value(&data, &length, &mWordVocabSize); - deserialize_value(&data, &length, &mPosVocabSize); - deserialize_value(&data, &length, &mTokVocabSize); - deserialize_value(&data, &length, &mUseFullMask); - - char const* d = static_cast(data); - mBeta.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kFLOAT); - mGamma.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kFLOAT); - mWordEmb.convertAndCopy(d, mHiddenSize * mWordVocabSize, mEmbType); - mPosEmb.convertAndCopy(d, mHiddenSize * mPosVocabSize, mEmbType); - mTokEmb.convertAndCopy(d, mHiddenSize * mTokVocabSize, mEmbType); - - copyToDevice(mGamma, sizeof(float) * mGamma.count, mGammaDev); - copyToDevice(mBeta, sizeof(float) * mBeta.count, mBetaDev); - copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev); - copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev); - copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev); -} - -// IPluginV2 Methods -char const* EmbLayerNormInt8PluginDynamic::getPluginType() const noexcept { return EMB_LAYER_NORM_INT8_NAME; } - -char const* EmbLayerNormInt8PluginDynamic::getPluginVersion() const noexcept { return EMB_LAYER_NORM_INT8_VERSION; } - -int32_t EmbLayerNormInt8PluginDynamic::getNbOutputs() const noexcept { return 3; } - -int32_t EmbLayerNormInt8PluginDynamic::initialize() noexcept { return 0; } - -void EmbLayerNormInt8PluginDynamic::terminate() noexcept { - gLogInfo << "EmbLayerNormInt8PluginDynamic terminate." << endl; -} - -size_t EmbLayerNormInt8PluginDynamic::getSerializationSize() const noexcept { - size_t const wordSize = getElementSize(mEmbType); - return sizeof(mEmbType) * 2 // mEmbType, mMhaType - + sizeof(mHiddenSize) * 6 // mHiddenSize, mSeqLen, 3*VocabSize, mPadId - + sizeof(mUseFullMask) // mask type - + 2 * sizeof(float) * mHiddenSize // beta + gamma - + wordSize * mHiddenSize * mWordVocabSize // word emb - + wordSize * mHiddenSize * mPosVocabSize // pos emb - + wordSize * mHiddenSize * mTokVocabSize // tok emb - ; -} - -void EmbLayerNormInt8PluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mEmbType); - serialize_value(&buffer, mMhaType); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mSeqLen); - serialize_value(&buffer, mPadId); - serialize_value(&buffer, mWordVocabSize); - serialize_value(&buffer, mPosVocabSize); - serialize_value(&buffer, mTokVocabSize); - serialize_value(&buffer, mUseFullMask); - - char* d = static_cast(buffer); - serFromDev(d, mBetaDev.get(), mHiddenSize); - serFromDev(d, mGammaDev.get(), mHiddenSize); - size_t const wordSize = getElementSize(mEmbType); - serFromDev(d, static_cast(mWordEmbDev.get()), mHiddenSize * mWordVocabSize * wordSize); - serFromDev(d, static_cast(mPosEmbDev.get()), mHiddenSize * mPosVocabSize * wordSize); - serFromDev(d, static_cast(mTokEmbDev.get()), mHiddenSize * mTokVocabSize * wordSize); -} - -void EmbLayerNormInt8PluginDynamic::destroy() noexcept { - gLogInfo << "EmbLayerNormInt8PluginDynamic destroy." 
<< endl; - // This gets called when the network containing plugin is destroyed - mGammaDev.reset(nullptr); - mBetaDev.reset(nullptr); - mWordEmbDev.reset(nullptr); - mPosEmbDev.reset(nullptr); - mTokEmbDev.reset(nullptr); - delete this; -} - -void EmbLayerNormInt8PluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* EmbLayerNormInt8PluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType EmbLayerNormInt8PluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index >= 0 && index <= 2); - if (index == 0) { - return mMhaType; - } - if (index == 1) { - return DataType::kINT8; - } - return DataType::kFLOAT; -} - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* EmbLayerNormInt8PluginDynamic::clone() const noexcept { - try { - gLogInfo << "EmbLayerNormInt8PluginDynamic clone." << endl; - - auto p = new EmbLayerNormInt8PluginDynamic(mLayerName, mEmbType, mMhaType, mBeta, mGamma, mWordEmb, mPosEmb, - mTokEmb, mUseFullMask); - p->mSeqLen = mSeqLen; - p->setPluginNamespace(mNamespace.c_str()); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs EmbLayerNormInt8PluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, - int32_t nbInputs, IExprBuilder& exprBuilder) noexcept { - try { - // Input should be input ids and token ids and the input mask - // Output should be the embeddings tensor and mask indices - IXRT_PLUGIN_ASSERT(nbInputs == 3); - - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == 2); // BxS - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims); - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[2].nbDims); - - IXRT_PLUGIN_ASSERT(outputIndex >= 0 || outputIndex <= 2); - - if (outputIndex == 0) { - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[BDIM]; - ret.d[1] = inputs[0].d[SDIM]; - ret.d[2] = exprBuilder.constant(mHiddenSize); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - } - if (outputIndex == 1) { - DimsExprs ret; - ret.nbDims = 2; - ret.d[0] = inputs[0].d[BDIM]; - ret.d[1] = inputs[0].d[SDIM]; - return ret; - } - - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[BDIM]; - ret.d[1] = inputs[0].d[SDIM]; - ret.d[2] = exprBuilder.constant(mHiddenSize); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool EmbLayerNormInt8PluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, - int32_t nbInputs, int32_t nbOutputs) noexcept { - // 3 inputs of size BxS - IXRT_PLUGIN_ASSERT(nbInputs == 3); - IXRT_PLUGIN_ASSERT(nbOutputs == 3); - - PluginTensorDesc const& desc = inOut[pos]; - if (desc.format != TensorFormat::kLINEAR) { - return false; - } - if (pos == 0) { - return desc.type == DataType::kINT32; - } - - PluginTensorDesc const& prev = inOut[pos - 1]; - if (pos == 1 || pos == 2) { - return desc.type == DataType::kINT32 && desc.format == prev.format; - } - - // emb_out - if (pos == 3 || pos == 4) { - return desc.type == DataType::kINT8 && desc.format == prev.format; - } - // residual - return desc.type == DataType::kFLOAT; -} - -void EmbLayerNormInt8PluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t 
nbInputs, - DynamicPluginTensorDesc const* outputs, - int32_t nbOutputs) noexcept { - gLogInfo << "EmbLayerNormInt8PluginDynamic configurePlugin." << endl; - - // Validate input arguments - IXRT_PLUGIN_ASSERT(nbOutputs == 3); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - - IXRT_PLUGIN_ASSERT(inputs[0].desc.dims.nbDims == 2); - int32_t const S = inputs[0].desc.dims.d[SDIM]; - mSeqLen = S; - int32_t const B = inputs[0].desc.dims.d[BDIM]; - TRT_UNUSED B; - IXRT_PLUGIN_ASSERT(mSeqLen == static_cast(inputs[1].desc.dims.d[SDIM])); - IXRT_PLUGIN_ASSERT(B == inputs[1].desc.dims.d[BDIM]); - IXRT_PLUGIN_ASSERT(mSeqLen == static_cast(inputs[2].desc.dims.d[SDIM])); - IXRT_PLUGIN_ASSERT(B == inputs[2].desc.dims.d[BDIM]); - - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.nbDims == 5); - IXRT_PLUGIN_ASSERT(static_cast(outputs[0].desc.dims.d[SDIM]) == mSeqLen); - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[BDIM] == B); - IXRT_PLUGIN_ASSERT(static_cast(outputs[0].desc.dims.d[2]) == mHiddenSize); - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[4] == 1); - - IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.nbDims == 2); - IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[0] == B); - IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[1] == S); - - IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.nbDims == 5); - IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[SDIM] == outputs[0].desc.dims.d[SDIM]); - IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[BDIM] == outputs[0].desc.dims.d[BDIM]); - IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[2] == outputs[0].desc.dims.d[2]); - IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[4] == 1); -} - -size_t EmbLayerNormInt8PluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, - int32_t nbOutputs) const noexcept { - int32_t const B = inputs[0].dims.d[BDIM]; - int32_t const S = inputs[0].dims.d[SDIM]; - return B * S * sizeof(int32_t); -} - -int32_t EmbLayerNormInt8PluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept { - try { - int32_t const B = inputDesc->dims.d[BDIM]; - int32_t const S = inputDesc->dims.d[SDIM]; - int32_t status = STATUS_SUCCESS; - int32_t fmha_S = S; - int32_t batch_tokens = B * fmha_S; - - // Our plugin outputs only one tensor - auto const inputIds = static_cast(inputs[0]); - auto const segmentIds = static_cast(inputs[1]); - - float const* beta = mBetaDev.get(); - float const* gamma = mGammaDev.get(); - auto output = static_cast(outputs[0]); - auto mNewMask = static_cast(outputs[1]); - auto residual = static_cast(outputs[2]); - auto const wordEmb = static_cast(mWordEmbDev.get()); - auto const tokEmb = static_cast(mTokEmbDev.get()); - auto const posEmb = static_cast(mPosEmbDev.get()); - - float l0_qkv_in_amax = outputDesc[0].scale * 127; - - auto mask_idx = static_cast(workspace); - status = embLayerNorm(stream, static_cast(mHiddenSize), B, S, inputIds, segmentIds, beta, gamma, - wordEmb, posEmb, tokEmb, mWordVocabSize, mTokVocabSize, residual, output, mask_idx, - mPadId, l0_qkv_in_amax); - - IxinferMaskPad(mask_idx, mNewMask, B, S, mHiddenSize, fmha_S, batch_tokens, stream); - - if (status != cudaSuccess) { - return STATUS_FAILURE; - } - - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} diff --git 
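For reference, the enqueue above derives an activation amax from the engine-assigned output scale (l0_qkv_in_amax = outputDesc[0].scale * 127), and the residual kernel then quantizes with 127.0 / amax. A minimal host-side sketch of that symmetric int8 convention; the helper name is illustrative, not from this patch:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Symmetric int8 quantization: quant scale = 127 / amax; dequantize with amax / 127,
// which is exactly the per-tensor scale TensorRT attaches to the int8 output.
inline int8_t quantize_sym_int8(float x, float amax) {
    float q = std::nearbyint(x * (127.0f / amax));
    q = std::max(-127.0f, std::min(127.0f, q));  // clamp to the symmetric range
    return static_cast<int8_t>(q);
}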
a/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cu b/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cu deleted file mode 100644 index 3aa0cd86..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cu +++ /dev/null @@ -1,342 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#include "embLayerNormInt8Plugin.h" -#include "backend/bert/bert_helper.h" - -namespace nvinfer1::ixrt_plugin { -using namespace backend; -namespace bert { - -template -__global__ void IxinferResidualI8O(const float *input, int8_t *output, int hidden_size, float quant_scale) { - float4 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_size; - - input += block_start; - output += block_start; - - float4 *p_input = (float4 *)input; - char4 *p_output = (char4 *)output; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - vals[it].x = p_input[element_index].x; - vals[it].y = p_input[element_index].y; - vals[it].z = p_input[element_index].z; - vals[it].w = p_input[element_index].w; - - char4 res = float42char4(vals[it], quant_scale); - p_output[element_index] = res; - } -} - -template -void IxinferResidualI8OLauncher(const T *input, int8_t *output, int batch_tokens, int hidden_size, float quant_scale, - cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size / 4 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int num_warp = hidden_size / C10_WARP_SIZE / 4; - - switch (num_warp) { - case 1: - IxinferResidualI8O<1><<>>(input, output, hidden_size, quant_scale); - break; - case 2: - IxinferResidualI8O<2><<>>(input, output, hidden_size, quant_scale); - break; - case 3: - IxinferResidualI8O<3><<>>(input, output, hidden_size, quant_scale); - break; - case 4: - IxinferResidualI8O<4><<>>(input, output, hidden_size, quant_scale); - break; - case 5: - IxinferResidualI8O<5><<>>(input, output, hidden_size, quant_scale); - break; - case 6: - IxinferResidualI8O<6><<>>(input, output, hidden_size, quant_scale); - break; - case 7: - IxinferResidualI8O<7><<>>(input, output, hidden_size, quant_scale); - break; - case 8: - IxinferResidualI8O<8><<>>(input, output, hidden_size, quant_scale); - break; - case 9: - IxinferResidualI8O<9><<>>(input, output, hidden_size, quant_scale); - break; - case 10: - IxinferResidualI8O<10><<>>(input, output, hidden_size, quant_scale); - break; - case 11: - IxinferResidualI8O<11><<>>(input, output, hidden_size, quant_scale); - break; - case 12: - IxinferResidualI8O<12><<>>(input, output, hidden_size, quant_scale); - break; - case 13: - IxinferResidualI8O<13><<>>(input, output, hidden_size, quant_scale); - break; - case 14: - 
IxinferResidualI8O<14><<>>(input, output, hidden_size, quant_scale); - break; - case 15: - IxinferResidualI8O<15><<>>(input, output, hidden_size, quant_scale); - break; - case 16: - IxinferResidualI8O<16><<>>(input, output, hidden_size, quant_scale); - break; - default: - throw std::runtime_error("IxinferResidualI8OLauncher"); - break; - } -} - -template -__global__ void IxinferBertEmbedLnKernel(const float *token_emb, const float *pos_emb, const float *type_emb, const int *tokens, - float *output, int *pad_mask, int *type_ids, int pad_id, int batch_size, - int seq_len, int hidden_dim, const float *scale, const float *bias) { - float4 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_dim; - int batch_idx, seq_idx; - batch_idx = blockIdx.x / seq_len; - seq_idx = blockIdx.x % seq_len; - - int tokens_idx = blockIdx.x; - int token = tokens[tokens_idx]; - int token_type = type_ids[tokens_idx]; - - output += block_start; - - float4 *p_output = (float4 *)output; - - float4 *p_scale = (float4 *)scale; - float4 *p_bias = (float4 *)bias; - float4 *p_value = (float4 *)(token_emb + token * hidden_dim); - float4 *p_pemb = (float4 *)(pos_emb + seq_idx * hidden_dim); - float4 *p_temb = (float4 *)(type_emb + token_type * hidden_dim); - - float thread_m2 = 0; - float thread_mean = 0; - float thread_count = 0; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - - if (token == pad_id) { - if (element_index == 0) { - pad_mask[tokens_idx] = 1; - } - vals[it] = make_float4(0.f, 0.f, 0.f, 0.f); - - } else { - if (element_index == 0) { - pad_mask[tokens_idx] = 0; - } - - vals[it].x = p_value[element_index].x + p_pemb[element_index].x + p_temb[element_index].x; - vals[it].y = p_value[element_index].y + p_pemb[element_index].y + p_temb[element_index].y; - vals[it].z = p_value[element_index].z + p_pemb[element_index].z + p_temb[element_index].z; - vals[it].w = p_value[element_index].w + p_pemb[element_index].w + p_temb[element_index].w; - WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].z, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].w, &thread_mean, &thread_m2, &thread_count); - } - } - float mean = 0; - float m2 = 0; - float count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count); - mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE); - m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE); - count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE); - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - float4 scale_value = p_scale[element_index]; - float4 bias_value = p_bias[element_index]; - float4 norm_value = compute_float4_norm_value(vals[it], mean, m2, hidden_dim, epsilon, - scale_value, bias_value); - int tokens_idx = blockIdx.x; - - int token = tokens[tokens_idx]; - if (token == pad_id) { - p_output[element_index] = make_float4(0.f, 0.f, 0.f, 0.f); - } else { - p_output[element_index] = norm_value; - } - } -} - - -void IxinferBertEmbedLn(const float *token_emb, const float *pos_emb, const float *type_emb, const int *tokens, float *output, - int *pad_mask, int *type_ids, int pad_id, int batch_size, int seq_len, int hidden_size, - const float *scale, const float *bias, cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if 
(hidden_size % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - int batch_tokens = batch_size * seq_len; - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - int num_warp = hidden_size / C10_WARP_SIZE / 4; - - switch (num_warp) { - case 1: - IxinferBertEmbedLnKernel<1> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 2: - IxinferBertEmbedLnKernel<2> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 3: - IxinferBertEmbedLnKernel<3> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 4: - IxinferBertEmbedLnKernel<4> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 5: - IxinferBertEmbedLnKernel<5> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 6: - IxinferBertEmbedLnKernel<6> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 7: - IxinferBertEmbedLnKernel<7> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 8: - IxinferBertEmbedLnKernel<8> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 9: - IxinferBertEmbedLnKernel<9> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 10: - IxinferBertEmbedLnKernel<10> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 11: - IxinferBertEmbedLnKernel<11> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 12: - IxinferBertEmbedLnKernel<12> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 13: - IxinferBertEmbedLnKernel<13> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 14: - IxinferBertEmbedLnKernel<14> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 15: - IxinferBertEmbedLnKernel<15> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 16: - IxinferBertEmbedLnKernel<16> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - default: - throw std::runtime_error("IxinferBertEmbedLn"); - break; - } -} - -cudaError_t embLayerNorm(cudaStream_t stream, int E, int B, int S, int32_t const* inputIds, int32_t const* segmentIds, - float const* beta, float const* gamma, float const* wordEmb, float const* posEmb, float const* tokEmb, int32_t const wordSize, - int32_t const tokSize, float* buffer, 
int8_t* output, int32_t* maskIdx, int32_t padId, float l0_qkv_in_amax)
-{
-    IxinferBertEmbedLn(wordEmb, posEmb, tokEmb, inputIds, buffer, maskIdx, (int*)segmentIds,
-                       padId, B, S, E, gamma, beta, stream);
-
-    IxinferResidualI8OLauncher(buffer, output, B*S, E, 127.0 / l0_qkv_in_amax, stream);
-    return cudaSuccess;
-}
-
-void __global__ IxinferMaskPadKernel(const int32_t* mask, int8_t* new_mask, int bsz,
-                                     int ori_seq_len, int hsz, int fmha_seq_len) {
-    int batch_idx = blockIdx.x;
-    int seq_idx = blockIdx.y;
-
-    if (seq_idx < ori_seq_len) {
-        if (threadIdx.x == 0) {
-            new_mask[batch_idx * fmha_seq_len + seq_idx] = mask[batch_idx * ori_seq_len + seq_idx];
-        }
-    } else {
-        new_mask[batch_idx * fmha_seq_len + seq_idx] = 1;
-    }
-}
-
-void IxinferMaskPad(int32_t* mask, int8_t* new_mask, int bsz, int ori_seq_len, int hsz,
-                    int fmha_seq_len, int batch_tokens, cudaStream_t stream) {
-    if (hsz / 2 > 4096) {
-        throw std::runtime_error("hsz/2>4096");
-    }
-    if (hsz % 2 != 0) {
-        throw std::runtime_error("hsz % 2 !=0");
-    }
-    if (ori_seq_len > fmha_seq_len) {
-        throw std::runtime_error("ori_seq_len > fmha_seq_len");
-    }
-    if (bsz * ori_seq_len > batch_tokens) {
-        throw std::runtime_error("bsz*ori_seq_len > batch_tokens");
-    }
-    dim3 blockSize(bsz, fmha_seq_len);
-    IxinferMaskPadKernel<<<blockSize, 1, 0, stream>>>(mask, new_mask, bsz, ori_seq_len, hsz,
-                                                      fmha_seq_len);
-}
-
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.h b/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.h
deleted file mode 100644
index 5fee7a43..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
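For reference, a host-side equivalent of the IxinferMaskPad launch above, assuming the same layout (mask is bsz x ori_seq_len, new_mask is bsz x fmha_seq_len, and 1 marks a padded position); the function name is illustrative:

#include <cstdint>
#include <vector>

std::vector<int8_t> mask_pad_ref(const std::vector<int32_t>& mask, int bsz,
                                 int ori_seq_len, int fmha_seq_len) {
    // The padded tail beyond ori_seq_len defaults to 1 ("padded"),
    // matching the kernel's else-branch.
    std::vector<int8_t> new_mask(bsz * fmha_seq_len, 1);
    for (int b = 0; b < bsz; ++b)
        for (int s = 0; s < ori_seq_len; ++s)
            new_mask[b * fmha_seq_len + s] = static_cast<int8_t>(mask[b * ori_seq_len + s]);
    return new_mask;
}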
-*/ -#pragma once -#include -#include -#include -#include -#include - -#include "bertCommon.h" - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -void IxinferBertEmbedLn(const float *token_emb, const float *pos_emb, const float *type_emb, const int *tokens, float *output, - int *pad_mask, int *type_ids, int pad_id, int batch_size, int seq_len, int hidden_size, - const float *scale, const float *bias, cudaStream_t stream); - -cudaError_t embLayerNorm(cudaStream_t stream, int E, int B, int S, int32_t const* inputIds, int32_t const* segmentIds, - float const* beta, float const* gamma, float const* wordEmb, float const* posEmb, float const* tokEmb, int32_t const wordSize, - int32_t const tokSize, float* buffer, int8_t* output, int32_t* maskIdx, int32_t padId, float token_embed_amax_); - -void IxinferMaskPad(int32_t* mask, int8_t* new_mask, int bsz, int ori_seq_len, int hsz, - int fmha_seq_len, int batch_tokens, cudaStream_t stream); - -class EmbLayerNormInt8PluginDynamic : public IPluginV2DynamicExt { - public: - EmbLayerNormInt8PluginDynamic(std::string const& name, nvinfer1::DataType const type, nvinfer1::DataType const mhaType, - nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& word_emb, - nvinfer1::Weights const& pos_emb, nvinfer1::Weights const& tok_emb, bool const useFullMask, int32_t padId = 0); - EmbLayerNormInt8PluginDynamic(std::string const& name, void const* data, size_t length); - EmbLayerNormInt8PluginDynamic() noexcept = delete; - ~EmbLayerNormInt8PluginDynamic() override = default; - - // IPluginV2 methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* libNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext methods - DataType getOutputDataType(int32_t index, DataType const* inputType, int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt methods - IPluginV2DynamicExt* clone() const noexcept override; - DimsExprs getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, - int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, - int32_t nbOutputs) const noexcept override; - int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; - - private: - const std::string mLayerName; - std::string mNamespace; - size_t mHiddenSize; - size_t mSeqLen; - size_t mPadId; - DataType mEmbType; - bool mUseFullMask; - DataType mMhaType; - size_t mWordVocabSize, mPosVocabSize, mTokVocabSize; - cuda_unique_ptr mGammaDev; - cuda_unique_ptr mBetaDev; - cuda_unique_ptr mWordEmbDev; - cuda_unique_ptr mTokEmbDev; - cuda_unique_ptr mPosEmbDev; - // cuda_unique_ptr mNewMask; 
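For reference, the serialize() method and the deserializing constructor declared above must write and read fields in exactly the same order. A minimal standalone sketch of that contract for plain POD fields (the real plugin additionally appends the device-side weights with serFromDev); all names here are illustrative:

#include <cstddef>
#include <cstdint>
#include <cstring>

template <typename T>
void ser(char*& p, const T& v) { std::memcpy(p, &v, sizeof(T)); p += sizeof(T); }
template <typename T>
void des(const char*& p, T& v) { std::memcpy(&v, p, sizeof(T)); p += sizeof(T); }

struct Params { int32_t hiddenSize; int32_t seqLen; int32_t padId; };

// Round trip: write and read back in the same field order, as the plugin does.
void roundTrip(const Params& in, Params& out, char* buf) {
    char* w = buf;
    ser(w, in.hiddenSize); ser(w, in.seqLen); ser(w, in.padId);
    const char* r = buf;
    des(r, out.hiddenSize); des(r, out.seqLen); des(r, out.padId);
}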
- WeightsWithOwnership mBeta; - WeightsWithOwnership mGamma; - WeightsWithOwnership mWordEmb; - WeightsWithOwnership mTokEmb; - WeightsWithOwnership mPosEmb; -}; - -class EmbLayerNormInt8PluginDynamicCreator : public IPluginCreator { - public: - EmbLayerNormInt8PluginDynamicCreator(); - - ~EmbLayerNormInt8PluginDynamicCreator() override = default; - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - PluginFieldCollection const* getFieldNames() noexcept override; - - IPluginV2DynamicExt* createPlugin(char const* name, PluginFieldCollection const* fc) noexcept override; - - IPluginV2DynamicExt* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; - -}; - - -} // namespace bert -} //namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cpp b/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cpp deleted file mode 100644 index 499b2eef..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cpp +++ /dev/null @@ -1,495 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -#include "embLayerNormPlugin.h" - -#include "NvInferImpl.h" -#include "checkMacrosPlugin.h" -#include "common_def.cuh" -#include "driver_types.h" - -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* EMB_LAYER_NORM_VERSION{"1"}; -char const* EMB_LAYER_NORM_NAME{"CustomEmbLayerNormPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection EmbLayerNormPluginDynamicCreator::mFC{}; -std::vector EmbLayerNormPluginDynamicCreator::mPluginAttributes; - -EmbLayerNormPluginDynamicCreator::EmbLayerNormPluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_beta")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_gamma")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_word_embeddings")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_token_type_embeddings")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_position_embeddings")); - mPluginAttributes.emplace_back(PluginField("output_fp16")); - mPluginAttributes.emplace_back(PluginField("full_mask")); - mPluginAttributes.emplace_back(PluginField("mha_type_id")); - mPluginAttributes.emplace_back(PluginField("pad_id")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* EmbLayerNormPluginDynamicCreator::getPluginName() const noexcept { return EMB_LAYER_NORM_NAME; } - -char const* EmbLayerNormPluginDynamicCreator::getPluginVersion() const noexcept { return EMB_LAYER_NORM_VERSION; } - -PluginFieldCollection const* EmbLayerNormPluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2DynamicExt* EmbLayerNormPluginDynamicCreator::createPlugin(char const* name, - PluginFieldCollection const* fc) noexcept { - try { - IXRT_PLUGIN_ASSERT(fc != nullptr); - gLogInfo << "EmbLayerNormPluginDynamic createPlugin." << endl; - std::set const requiredAttributes{ - "bert_embeddings_layernorm_beta", "bert_embeddings_layernorm_gamma", - "bert_embeddings_word_embeddings", "bert_embeddings_token_type_embeddings", - "bert_embeddings_position_embeddings", - }; - - bool output_fp16 = false; - bool useFullMask = false; - Weights beta{}; - Weights gamma{}; - Weights word_emb{}; - Weights pos_emb{}; - Weights tok_emb{}; - int32_t mhaTypeId = 0; - int32_t pad_id = 0; - - for (auto i = 0; i < fc->nbFields; i++) { - std::string field_name(fc->fields[i].name); - if (field_name.compare("bert_embeddings_layernorm_beta") == 0) { - gLogInfo << "Building bert_embeddings_layernorm_beta..." << endl; - beta.values = fc->fields[i].data; - beta.count = fc->fields[i].length; - beta.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_layernorm_gamma") == 0) { - gLogInfo << "Building bert_embeddings_layernorm_gamma..." << endl; - gamma.values = fc->fields[i].data; - gamma.count = fc->fields[i].length; - gamma.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_word_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_word_embeddings..." 
<< endl; - word_emb.values = fc->fields[i].data; - word_emb.count = fc->fields[i].length; - word_emb.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_token_type_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_token_type_embeddings..." << endl; - tok_emb.values = fc->fields[i].data; - tok_emb.count = fc->fields[i].length; - tok_emb.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_position_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_position_embeddings..." << endl; - pos_emb.values = fc->fields[i].data; - pos_emb.count = fc->fields[i].length; - pos_emb.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("output_fp16") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32); - output_fp16 = static_cast(fc->fields[i].data)[0] != 0; - gLogInfo << "Building output_fp16: " << output_fp16 << endl; - } - - if (field_name.compare("full_mask") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32); - useFullMask = static_cast(fc->fields[i].data)[0] != 0; - gLogInfo << "Building full_mask: " << useFullMask << endl; - } - - if (field_name.compare("mha_type_id") == 0) { - mhaTypeId = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_ASSERT(mhaTypeId >= 0 && mhaTypeId < 3); - gLogInfo << "Building mha typeId: " << mhaTypeId << endl; - } - - if (field_name.compare("pad_id") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32) - pad_id = *static_cast(fc->fields[i].data); - } - } - gLogInfo << "Building EmbLayerNormPluginDynamic Plugin..." << endl; - DataType mhaType = static_cast(mhaTypeId); - EmbLayerNormPluginDynamic* p = - new EmbLayerNormPluginDynamic(name, output_fp16 ? 
DataType::kHALF : DataType::kFLOAT, mhaType, beta, gamma, - word_emb, pos_emb, tok_emb, useFullMask, pad_id); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2DynamicExt* EmbLayerNormPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - try { - IXRT_PLUGIN_ASSERT(serialData != nullptr); - return new EmbLayerNormPluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void EmbLayerNormPluginDynamicCreator::setPluginNamespace(char const* pluginNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(pluginNamespace != nullptr); - mNamespace = pluginNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* EmbLayerNormPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(EmbLayerNormPluginDynamicCreator); - -//#########################################################################// -EmbLayerNormPluginDynamic::EmbLayerNormPluginDynamic(std::string const& name, DataType const type, - DataType const mhaType, Weights const& beta, Weights const& gamma, - Weights const& wordEmb, Weights const& posEmb, - Weights const& tokEmb, bool const useFullMask, int32_t padId) - : mLayerName(name), - mHiddenSize(beta.count), - mEmbType(type), - mUseFullMask(useFullMask), - mMhaType(mhaType), - mPadId(padId) { - IXRT_PLUGIN_ASSERT(beta.count == gamma.count); - IXRT_PLUGIN_ASSERT(mHiddenSize > 0U); - IXRT_PLUGIN_ASSERT(wordEmb.count % mHiddenSize == 0); - IXRT_PLUGIN_ASSERT(posEmb.count % mHiddenSize == 0); - IXRT_PLUGIN_ASSERT(tokEmb.count % mHiddenSize == 0); - mWordVocabSize = wordEmb.count / mHiddenSize; - mPosVocabSize = posEmb.count / mHiddenSize; - mTokVocabSize = tokEmb.count / mHiddenSize; - - mBeta.convertAndCopy(beta, nvinfer1::DataType::kHALF); - mGamma.convertAndCopy(gamma, nvinfer1::DataType::kHALF); - mWordEmb.convertAndCopy(wordEmb, mEmbType); - mTokEmb.convertAndCopy(tokEmb, mEmbType); - mPosEmb.convertAndCopy(posEmb, mEmbType); - - copyToDevice(mGamma, sizeof(half) * mGamma.count, mGammaDev); - copyToDevice(mBeta, sizeof(half) * mBeta.count, mBetaDev); - copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev); - copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev); - copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev); -} - -EmbLayerNormPluginDynamic::EmbLayerNormPluginDynamic(std::string const& name, void const* data, size_t length) - : mLayerName(name), - mGammaDev(nullptr), - mBetaDev(nullptr), - mWordEmbDev(nullptr), - mTokEmbDev(nullptr), - mPosEmbDev(nullptr) { - gLogInfo << "EmbLayerNormPluginDynamic deserialize." 
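For reference, a sketch of how a caller could assemble the PluginFieldCollection that createPlugin above consumes. The field names and plugin name/version are the ones registered by the creator; the wrapper function and buffer pointers are illustrative:

#include <NvInfer.h>
#include <cstdint>
#include <vector>

nvinfer1::IPluginV2* buildEmbLn(const float* beta, const float* gamma, const float* word,
                                const float* pos, const float* tok, int32_t hidden,
                                int32_t wordVocab, int32_t posVocab, int32_t tokVocab,
                                int32_t outputFp16) {
    using nvinfer1::PluginField;
    using nvinfer1::PluginFieldType;
    // length is an element count, matching how createPlugin reads fields[i].length.
    std::vector<PluginField> f{
        {"bert_embeddings_layernorm_beta", beta, PluginFieldType::kFLOAT32, hidden},
        {"bert_embeddings_layernorm_gamma", gamma, PluginFieldType::kFLOAT32, hidden},
        {"bert_embeddings_word_embeddings", word, PluginFieldType::kFLOAT32, wordVocab * hidden},
        {"bert_embeddings_token_type_embeddings", tok, PluginFieldType::kFLOAT32, tokVocab * hidden},
        {"bert_embeddings_position_embeddings", pos, PluginFieldType::kFLOAT32, posVocab * hidden},
        {"output_fp16", &outputFp16, PluginFieldType::kINT32, 1},
    };
    nvinfer1::PluginFieldCollection fc{static_cast<int32_t>(f.size()), f.data()};
    auto* creator = getPluginRegistry()->getPluginCreator("CustomEmbLayerNormPluginDynamic_IxRT", "1");
    return creator ? creator->createPlugin("embeddings", &fc) : nullptr;
}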
<< endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mEmbType); - deserialize_value(&data, &length, &mMhaType); - deserialize_value(&data, &length, &mHiddenSize); - deserialize_value(&data, &length, &mSeqLen); - deserialize_value(&data, &length, &mPadId); - deserialize_value(&data, &length, &mWordVocabSize); - deserialize_value(&data, &length, &mPosVocabSize); - deserialize_value(&data, &length, &mTokVocabSize); - deserialize_value(&data, &length, &mUseFullMask); - - char const* d = static_cast(data); - mBeta.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kHALF); - mGamma.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kHALF); - mWordEmb.convertAndCopy(d, mHiddenSize * mWordVocabSize, mEmbType); - mPosEmb.convertAndCopy(d, mHiddenSize * mPosVocabSize, mEmbType); - mTokEmb.convertAndCopy(d, mHiddenSize * mTokVocabSize, mEmbType); - - copyToDevice(mGamma, sizeof(half) * mGamma.count, mGammaDev); - copyToDevice(mBeta, sizeof(half) * mBeta.count, mBetaDev); - copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev); - copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev); - copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev); -} - -// IPluginV2 Methods -char const* EmbLayerNormPluginDynamic::getPluginType() const noexcept { return EMB_LAYER_NORM_NAME; } - -char const* EmbLayerNormPluginDynamic::getPluginVersion() const noexcept { return EMB_LAYER_NORM_VERSION; } - -int32_t EmbLayerNormPluginDynamic::getNbOutputs() const noexcept { return 2; } - -int32_t EmbLayerNormPluginDynamic::initialize() noexcept { return 0; } - -void EmbLayerNormPluginDynamic::terminate() noexcept { gLogInfo << "EmbLayerNormPluginDynamic terminate." << endl; } - -size_t EmbLayerNormPluginDynamic::getSerializationSize() const noexcept { - size_t const wordSize = getElementSize(mEmbType); - return sizeof(mEmbType) * 2 // mEmbType, mMhaType - + sizeof(mHiddenSize) * 6 // mHiddenSize, mSeqLen, 3*VocabSize, mPadId - + sizeof(mUseFullMask) // mask type - + 2 * sizeof(half) * mHiddenSize // beta + gamma - + wordSize * mHiddenSize * mWordVocabSize // word emb - + wordSize * mHiddenSize * mPosVocabSize // pos emb - + wordSize * mHiddenSize * mTokVocabSize // tok emb - ; -} - -void EmbLayerNormPluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mEmbType); - serialize_value(&buffer, mMhaType); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mSeqLen); - serialize_value(&buffer, mPadId); - serialize_value(&buffer, mWordVocabSize); - serialize_value(&buffer, mPosVocabSize); - serialize_value(&buffer, mTokVocabSize); - serialize_value(&buffer, mUseFullMask); - - char* d = static_cast(buffer); - serFromDev(d, mBetaDev.get(), mHiddenSize); - serFromDev(d, mGammaDev.get(), mHiddenSize); - size_t const wordSize = getElementSize(mEmbType); - serFromDev(d, static_cast(mWordEmbDev.get()), mHiddenSize * mWordVocabSize * wordSize); - serFromDev(d, static_cast(mPosEmbDev.get()), mHiddenSize * mPosVocabSize * wordSize); - serFromDev(d, static_cast(mTokEmbDev.get()), mHiddenSize * mTokVocabSize * wordSize); -} - -void EmbLayerNormPluginDynamic::destroy() noexcept { - gLogInfo << "EmbLayerNormPluginDynamic destroy." 
<< endl; - // This gets called when the network containing plugin is destroyed - mGammaDev.reset(nullptr); - mBetaDev.reset(nullptr); - mWordEmbDev.reset(nullptr); - mPosEmbDev.reset(nullptr); - mTokEmbDev.reset(nullptr); - delete this; -} - -void EmbLayerNormPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* EmbLayerNormPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType EmbLayerNormPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0 || index == 1); - if (index == 0) { - IXRT_PLUGIN_ASSERT(mMhaType == DataType::kHALF || mMhaType == DataType::kFLOAT); - return mMhaType; - } - return DataType::kINT32; -} - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* EmbLayerNormPluginDynamic::clone() const noexcept { - try { - gLogInfo << "EmbLayerNormPluginDynamic clone." << endl; - - auto p = new EmbLayerNormPluginDynamic(mLayerName, mEmbType, mMhaType, mBeta, mGamma, mWordEmb, mPosEmb, - mTokEmb, mUseFullMask); - p->mSeqLen = mSeqLen; - p->setPluginNamespace(mNamespace.c_str()); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs EmbLayerNormPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept { - try { - // Input should be input ids and token ids and the input mask - // Output should be the embeddings tensor and mask indices - IXRT_PLUGIN_ASSERT(nbInputs == 3); - - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == 2); // BxS - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims); - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[2].nbDims); - - IXRT_PLUGIN_ASSERT(outputIndex == 0 || outputIndex == 1); - - if (outputIndex == 0) { - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - ret.d[2] = exprBuilder.constant(mHiddenSize); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - } - - DimsExprs ret; - ret.nbDims = 2; - ret.d[0] = inputs[0].d[BDIM]; - ret.d[1] = inputs[0].d[SDIM]; - return ret; - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool EmbLayerNormPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept { - // 3 inputs of size BxS - IXRT_PLUGIN_ASSERT(nbInputs == 3); - IXRT_PLUGIN_ASSERT(nbOutputs == 2); - - PluginTensorDesc const& desc = inOut[pos]; - if (desc.format != TensorFormat::kLINEAR) { - return false; - } - if (pos == 0) { - return desc.type == DataType::kINT32; - } - - PluginTensorDesc const& prev = inOut[pos - 1]; - if (pos == 1 || pos == 2) { - return desc.type == DataType::kINT32 && desc.format == prev.format; - } - - // embedded sequence - if (pos == 3) { - return desc.type == mMhaType && desc.format == prev.format; - } - // mask - return desc.type == ((mMhaType == DataType::kHALF) ? DataType::kINT32 : mMhaType); -} - -void EmbLayerNormPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - gLogInfo << "EmbLayerNormPluginDynamic configurePlugin." 
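/* For reference, supportsFormatCombination above encodes this position-to-requirement
 * table for the fp16 variant (all tensors must be TensorFormat::kLINEAR):
 *   pos 0..2  input ids / segment ids / input mask : DataType::kINT32
 *   pos 3     embedded sequence                    : mMhaType (kHALF or kFLOAT)
 *   pos 4     mask output                          : kINT32 when mMhaType is kHALF,
 *                                                    otherwise mMhaType
 */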
<< endl; - - // Validate input arguments - IXRT_PLUGIN_ASSERT(nbOutputs == 2); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - - IXRT_PLUGIN_ASSERT(inputs[0].desc.dims.nbDims == 2); - int32_t const S = inputs[0].desc.dims.d[SDIM]; - mSeqLen = S; - int32_t const B = inputs[0].desc.dims.d[BDIM]; - TRT_UNUSED B; - IXRT_PLUGIN_ASSERT(mSeqLen == static_cast(inputs[1].desc.dims.d[SDIM])); - IXRT_PLUGIN_ASSERT(B == inputs[1].desc.dims.d[BDIM]); - IXRT_PLUGIN_ASSERT(mSeqLen == static_cast(inputs[2].desc.dims.d[SDIM])); - IXRT_PLUGIN_ASSERT(B == inputs[2].desc.dims.d[BDIM]); - - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.nbDims == 5); - IXRT_PLUGIN_ASSERT(mSeqLen == outputs[0].desc.dims.d[SDIM]) - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[BDIM] == B); - IXRT_PLUGIN_ASSERT(static_cast(outputs[0].desc.dims.d[2]) == mHiddenSize); - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[4] == 1); - - IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.nbDims == 2); - IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[0] == B); - IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[1] == mSeqLen); -} - -size_t EmbLayerNormPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { - return 0; -} - -int32_t EmbLayerNormPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept { - gLogInfo << "enqueue EmbLayerNormPluginDynamic.." << endl; - try { - int32_t const B = inputDesc->dims.d[BDIM]; - int32_t const S = inputDesc->dims.d[SDIM]; - int32_t status = STATUS_SUCCESS; - int32_t fmha_S = S; - int32_t batch_tokens = B * fmha_S; - - // Our plugin outputs only one tensor - auto const inputIds = static_cast(inputs[0]); - auto const segmentIds = static_cast(inputs[1]); - - half const* beta = mBetaDev.get(); - half const* gamma = mGammaDev.get(); - if (mMhaType == DataType::kFLOAT) { - gLogError << "embLayerNormPlugin float type not supported!" << endl; - return STATUS_NOT_SUPPORTED; - } else if (mMhaType == DataType::kHALF) { - auto output = static_cast(outputs[0]); - auto mNewMask = static_cast(outputs[1]); - auto const wordEmb = static_cast(mWordEmbDev.get()); - auto const tokEmb = static_cast(mTokEmbDev.get()); - auto const posEmb = static_cast(mPosEmbDev.get()); - - status = - embLayerNorm(stream, static_cast(mHiddenSize), B, S, inputIds, segmentIds, beta, gamma, - wordEmb, posEmb, tokEmb, mWordVocabSize, mTokVocabSize, output, mNewMask, mPadId); - if (status != cudaSuccess) { - return STATUS_FAILURE; - } - } - else { - gLogError << "Unsupported type error, expected [kHALF,kFLOAT], but received " - << static_cast(mMhaType) << endl; - - return STATUS_NOT_SUPPORTED; - } - - return status; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cu b/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cu deleted file mode 100644 index 5766d382..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cu +++ /dev/null @@ -1,258 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. 
You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#include "embLayerNormPlugin.h" -#include "backend/bert/bert_helper.h" - -namespace nvinfer1::ixrt_plugin { -using namespace backend; -namespace bert { - -template -__global__ void IxinferBertEmbedLnKernel(const __half *token_emb, const __half *pos_emb, const __half *type_emb, - const int *tokens, __half *output, int *pad_mask, int *type_ids, int pad_id, - int batch_size, int seq_len, int hidden_dim, const __half *scale, - const __half *bias) { - float2 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_dim; - output += block_start; - - __half2 *p_output = (__half2 *)output; - __half2 *p_scale = (__half2 *)scale; - __half2 *p_bias = (__half2 *)bias; - - float thread_m2 = 0; - float thread_mean = 0; - float thread_count = 0; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - - int batch_idx, seq_idx, dim_idx; - batch_idx = blockIdx.x / seq_len; - seq_idx = blockIdx.x % seq_len; - dim_idx = element_index; - int tokens_idx = blockIdx.x; - int token = tokens[tokens_idx]; - int token_type = type_ids[tokens_idx]; - - half2 value; - - if (token == pad_id) { - if (dim_idx == 0) { - pad_mask[tokens_idx] = 1; - } - value.x = __float2half(0.f); - value.y = __float2half(0.f); - - } else { - if (dim_idx == 0) { - pad_mask[tokens_idx] = 0; - } - value = ((half2 *)(token_emb + token * hidden_dim + dim_idx * 2))[0]; - half2 pemb = ((half2 *)(pos_emb + seq_idx * hidden_dim + dim_idx * 2))[0]; - half2 temb = ((half2 *)(type_emb + token_type * hidden_dim + dim_idx * 2))[0]; - - vals[it].x = __half2float(value.x) + __half2float(pemb.x) + __half2float(temb.x); - vals[it].y = __half2float(value.y) + __half2float(pemb.y) + __half2float(temb.y); - - WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count); - } - - float mean = 0; - float m2 = 0; - float count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count); - mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE); - m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE); - count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE); - m2 = rsqrtf(m2 / hidden_dim + epsilon); - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - - __half2 scale_value = p_scale[element_index]; - __half2 bias_value = p_bias[element_index]; - - float2 norm_value; - norm_value.x = (vals[it].x - mean) * m2 * __half2float(scale_value.x) + __half2float(bias_value.x); - norm_value.y = (vals[it].y - mean) * m2 * __half2float(scale_value.y) + __half2float(bias_value.y); - - __half2 res; - res.x = __float2half(norm_value.x); - res.y = __float2half(norm_value.y); - - int token = tokens[tokens_idx]; - if (token == pad_id) { - res.x = __float2half(0.f); - res.y = __float2half(0.f); - p_output[element_index] = res; - } else { - p_output[element_index] = res; - } - } - } -} - -void IxinferBertEmbedLn(const half *token_emb, const half *pos_emb, const half *type_emb, - const int *tokens, half *output, int *pad_mask, int 
*type_ids, int pad_id, - int batch_size, int seq_len, int hidden_size, const half *scale, const half *bias, - cudaStream_t stream) { - if (hidden_size > 2048) { - throw std::runtime_error("hidden_size should <= 2048"); - } - if (hidden_size / 2 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size / 2 // C10_WARP_SIZE != 0"); - } - int batch_tokens = batch_size * seq_len; - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int num_warp = hidden_size / C10_WARP_SIZE / 2; - - switch (num_warp) { - case 1: - IxinferBertEmbedLnKernel<1> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 2: - IxinferBertEmbedLnKernel<2> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 3: - IxinferBertEmbedLnKernel<3> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 4: - IxinferBertEmbedLnKernel<4> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 5: - IxinferBertEmbedLnKernel<5> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 6: - IxinferBertEmbedLnKernel<6> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 7: - IxinferBertEmbedLnKernel<7> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 8: - IxinferBertEmbedLnKernel<8> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 9: - IxinferBertEmbedLnKernel<9> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 10: - IxinferBertEmbedLnKernel<10> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 11: - IxinferBertEmbedLnKernel<11> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 12: - IxinferBertEmbedLnKernel<12> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 13: - IxinferBertEmbedLnKernel<13> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 14: - IxinferBertEmbedLnKernel<14> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 15: - IxinferBertEmbedLnKernel<15> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 16: - IxinferBertEmbedLnKernel<16> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - default: - throw std::runtime_error("IxinferBertEmbedLn"); - break; - } -} - -cudaError_t embLayerNorm(cudaStream_t stream, int E, 
int B, int S, int32_t const* inputIds, int32_t const* segmentIds,
-                         half const* beta, half const* gamma, half const* wordEmb, half const* posEmb, half const* tokEmb, int32_t const wordSize,
-                         int32_t const tokSize, half* output, int32_t* maskIdx, int32_t padId)
-{
-    IxinferBertEmbedLn(wordEmb, posEmb, tokEmb, inputIds, output, maskIdx, (int*)segmentIds,
-                       padId, B, S, E, gamma, beta, stream);
-    return cudaSuccess;
-}
-
-void __global__ IxinferMaskPadKernel(const int32_t* mask, int32_t* new_mask, int bsz,
-                                     int ori_seq_len, int hsz, int fmha_seq_len) {
-    int batch_idx = blockIdx.x;
-    int seq_idx = blockIdx.y;
-
-    if (seq_idx < ori_seq_len) {
-        if (threadIdx.x == 0) {
-            new_mask[batch_idx * fmha_seq_len + seq_idx] = mask[batch_idx * ori_seq_len + seq_idx];
-        }
-    } else {
-        new_mask[batch_idx * fmha_seq_len + seq_idx] = 1;
-    }
-}
-
-void IxinferMaskPad(int32_t* mask, int32_t* new_mask, int bsz, int ori_seq_len, int hsz,
-                    int fmha_seq_len, int batch_tokens, cudaStream_t stream) {
-    if (hsz / 2 > 4096) {
-        throw std::runtime_error("hsz/2>4096");
-    }
-    if (hsz % 2 != 0) {
-        throw std::runtime_error("hsz % 2 !=0");
-    }
-    if (ori_seq_len > fmha_seq_len) {
-        throw std::runtime_error("ori_seq_len > fmha_seq_len");
-    }
-    if (bsz * ori_seq_len > batch_tokens) {
-        throw std::runtime_error("bsz*ori_seq_len > batch_tokens");
-    }
-    dim3 blockSize(bsz, fmha_seq_len);
-    IxinferMaskPadKernel<<<blockSize, 1, 0, stream>>>(mask, new_mask, bsz, ori_seq_len, hsz,
-                                                      fmha_seq_len);
-}
-
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.h b/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.h
deleted file mode 100644
index f96e7d73..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
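For reference, a scalar version of the streaming mean/variance recurrence that WelfordCombine and WelfordWarpReduce implement per warp in the kernels above; the struct and helper names here are illustrative, the recurrence itself is the textbook Welford update:

#include <cmath>

struct Welford { float mean = 0.f, m2 = 0.f, count = 0.f; };

inline void welford_combine(float x, Welford& w) {
    w.count += 1.f;
    float delta = x - w.mean;
    w.mean += delta / w.count;
    w.m2 += delta * (x - w.mean);  // running sum of squared deviations
}

// After feeding hidden_dim values: variance = w.m2 / hidden_dim, and the kernel
// applies rsqrtf(variance + epsilon) as the LayerNorm normalization factor.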
-*/ -#pragma once -#include -#include -#include -#include -#include - - -#include "bertCommon.h" - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -cudaError embLayerNorm(cudaStream_t stream, int E, int B, int S, int32_t const* inputIds, int32_t const* segmentIds, - half const* beta, half const* gamma, half const* wordEmb, half const* posEmb, half const* tokEmb, int32_t const wordSize, - int32_t const tokSize, half* output, int32_t* maskIdx, int32_t padId); - -void IxinferMaskPad(int32_t* mask, int32_t* new_mask, int bsz, int ori_seq_len, int hsz, - int fmha_seq_len, int batch_tokens, cudaStream_t stream); - -void IxinferBertEmbedLn(const half *token_emb, const half *pos_emb, const half *type_emb, const int *tokens, half *output, - int *pad_mask, int *type_ids, int pad_id, int batch_size, int seq_len, int hidden_size, - const half *scale, const half *bias, cudaStream_t stream);; - -class EmbLayerNormPluginDynamic : public IPluginV2DynamicExt { - public: - EmbLayerNormPluginDynamic(std::string const& name, nvinfer1::DataType const type, nvinfer1::DataType const mhaType, - nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& word_emb, - nvinfer1::Weights const& pos_emb, nvinfer1::Weights const& tok_emb, bool const useFullMask, int32_t padId = 0); - EmbLayerNormPluginDynamic(std::string const& name, void const* data, size_t length); - EmbLayerNormPluginDynamic() noexcept = delete; - ~EmbLayerNormPluginDynamic() override = default; - - // IPluginV2 methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* libNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext methods - DataType getOutputDataType(int32_t index, DataType const* inputType, int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt methods - IPluginV2DynamicExt* clone() const noexcept override; - DimsExprs getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, - int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, - int32_t nbOutputs) const noexcept override; - int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; - - private: - const std::string mLayerName; - std::string mNamespace; - size_t mHiddenSize; - size_t mSeqLen; - size_t mPadId; - DataType mEmbType; - bool mUseFullMask; - DataType mMhaType; - size_t mWordVocabSize, mPosVocabSize, mTokVocabSize; - cuda_unique_ptr mGammaDev; - cuda_unique_ptr mBetaDev; - cuda_unique_ptr mWordEmbDev; - cuda_unique_ptr mTokEmbDev; - cuda_unique_ptr mPosEmbDev; - WeightsWithOwnership mBeta; - WeightsWithOwnership mGamma; - WeightsWithOwnership mWordEmb; - 
WeightsWithOwnership mTokEmb;
-    WeightsWithOwnership mPosEmb;
-};
-
-class EmbLayerNormPluginDynamicCreator : public IPluginCreator {
-   public:
-    EmbLayerNormPluginDynamicCreator();
-
-    ~EmbLayerNormPluginDynamicCreator() override = default;
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    PluginFieldCollection const* getFieldNames() noexcept override;
-
-    IPluginV2DynamicExt* createPlugin(char const* name, PluginFieldCollection const* fc) noexcept override;
-
-    IPluginV2DynamicExt* deserializePlugin(char const* name, void const* serialData,
-                                           size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-   private:
-    static PluginFieldCollection mFC;
-    static std::vector<PluginField> mPluginAttributes;
-    std::string mNamespace;
-
-};
-
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/ffn/ffnPlugin.cpp b/models/nlp/plm/bert_base_squad/ixrt/src/ffn/ffnPlugin.cpp
deleted file mode 100644
index 30b47f88..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/ffn/ffnPlugin.cpp
+++ /dev/null
@@ -1,389 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
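For reference, the FFN plugin whose sources follow stores W1 (hidden x 4*hidden), B1 (4*hidden) and W2 (4*hidden x hidden) and fuses x*W1 + B1 -> activation -> *W2. A naive float reference of that computation, assuming row-major buffers and the erf-form GELU for the activation (both layout and activation choice are assumptions, selected in the real plugin by act_type):

#include <cmath>
#include <vector>

std::vector<float> ffn_ref(const std::vector<float>& x,   // [tokens, H]
                           const std::vector<float>& W1,  // [H, 4H]
                           const std::vector<float>& B1,  // [4H]
                           const std::vector<float>& W2,  // [4H, H]
                           int tokens, int H) {
    int inner = 4 * H;
    std::vector<float> h(tokens * inner, 0.f), y(tokens * H, 0.f);
    for (int t = 0; t < tokens; ++t) {
        for (int j = 0; j < inner; ++j) {
            float acc = B1[j];
            for (int k = 0; k < H; ++k) acc += x[t * H + k] * W1[k * inner + j];
            h[t * inner + j] = 0.5f * acc * (1.f + std::erf(acc * 0.70710678f));  // GELU
        }
        for (int k = 0; k < H; ++k) {
            float acc = 0.f;
            for (int j = 0; j < inner; ++j) acc += h[t * inner + j] * W2[j * H + k];
            y[t * H + k] = acc;
        }
    }
    return y;
}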
-*/ -#include "ffnPlugin.h" - -#include "NvInferRuntime.h" -#include "NvInferRuntimeCommon.h" -#ifdef __ILUVATAR__ -#include "backend/ixinfer/ixinfer_gemm_helper.h" -#endif -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "gelu/geluPlugin.h" -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; -using namespace nvinfer1::ixrt_plugin::backend; - -namespace { -char const* const kFFN_VERSION{"1"}; -char const* const kFFN_NAME{"CustomFFNPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection FFNPluginDynamicCreator::mFFN{}; -std::vector FFNPluginDynamicCreator::mPluginAttributes; - -FFNPluginDynamicCreator::FFNPluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("out_dims", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("act_type", nullptr, PluginFieldType::kINT32, 1)); - - mFFN.nbFields = mPluginAttributes.size(); - mFFN.fields = mPluginAttributes.data(); -} - -char const* FFNPluginDynamicCreator::getPluginName() const noexcept { return kFFN_NAME; } - -char const* FFNPluginDynamicCreator::getPluginVersion() const noexcept { return kFFN_VERSION; } - -PluginFieldCollection const* FFNPluginDynamicCreator::getFieldNames() noexcept { return &mFFN; } - -IPluginV2* FFNPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept { - try { - gLogInfo << "Creating FFNPluginDynamicCreator..." << endl; - IXRT_PLUGIN_ASSERT(name != nullptr); - IXRT_PLUGIN_ASSERT(fc != nullptr); - - int32_t outDims = 0; - int32_t typeId = -1; - int32_t act_type = -1; - Weights W1{DataType::kFLOAT, nullptr, 0LL}; - Weights W2{DataType::kFLOAT, nullptr, 0LL}; - Weights B1{DataType::kFLOAT, nullptr, 0LL}; - ixrt_plugin::validateRequiredAttributesExist({"out_dims", "type_id", "W1", "W2", "B1"}, fc); - - for (int32_t i = 0; i < fc->nbFields; i++) { - std::string fieldName(fc->fields[i].name); - if (fieldName.compare("out_dims") == 0) { - outDims = static_cast(fc->fields[i].data)[0]; - gLogInfo << "Building outDims: " << outDims << endl; - } - - if (fieldName.compare("type_id") == 0) { - typeId = static_cast(fc->fields[i].data)[0]; - gLogInfo << "Building typeId: " << typeId << endl; - } - - if (fieldName.compare("W1") == 0) { - gLogInfo << "Building W1..." << endl; - W1.values = fc->fields[i].data; - W1.count = fc->fields[i].length; - W1.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is W1 float32: " << (W1.type == DataType::kFLOAT) << endl; - } - - if (fieldName.compare("W2") == 0) { - gLogInfo << "Building W2..." << endl; - W2.values = fc->fields[i].data; - W2.count = fc->fields[i].length; - W2.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is W2 float32: " << (W2.type == DataType::kFLOAT) << endl; - } - - if (fieldName.compare("B1") == 0) { - gLogInfo << "Building B1..." << endl; - B1.values = fc->fields[i].data; - B1.count = fc->fields[i].length; - B1.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is B1 float32: " << (B1.type == DataType::kFLOAT) << endl; - } - - if (fieldName.compare("act_type") == 0) { - gLogInfo << "Building act_type..." 
<< endl; - act_type = static_cast(fc->fields[i].data)[0]; - gLogInfo << "Building act_type: " << act_type << endl; - } - } - - if (outDims <= 0) { - gLogInfo << "Invalid output dimension" << endl; - } - if (typeId < 0 || typeId > 1) { - gLogInfo << "Invalid type id" << typeId << endl; - } - if (W1.count == 0 || W1.values == nullptr) { - gLogInfo << "Invalid weights W1" << endl; - } - if (W2.count == 0 || W2.values == nullptr) { - gLogInfo << "Invalid weights W2" << endl; - } - if (B1.count == 0 || B1.values == nullptr) { - gLogInfo << "Invalid weights B1" << endl; - } - - DataType type = typeId == 0 ? DataType::kFLOAT : DataType::kHALF; - return new FFNPluginDynamic(name, type, outDims, act_type, W1, W2, B1); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* FFNPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - // This object will be deleted when the network is destroyed, which will - // call FFNPluginDynamic::destroy() - try { - return new FFNPluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void FFNPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FFNPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(FFNPluginDynamicCreator); -//#########################################################################// -FFNPluginDynamic::FFNPluginDynamic(std::string const name, DataType const type, int32_t const outDim, - int32_t const act_type, Weights const& W1, Weights const& W2, Weights const& B1) - : mLayerName(name), - mType(type), - mHiddenSize(outDim), - mActType(act_type), - mWdev1(nullptr), - mWdev2(nullptr), - mBdev1(nullptr) { - mW1.convertAndCopy(W1, mType); - mW2.convertAndCopy(W2, mType); - mB1.convertAndCopy(B1, mType); - copyToDevice(mW1, getWeightsSize(mW1, mType), mWdev1); - copyToDevice(mW2, getWeightsSize(mW2, mType), mWdev2); - copyToDevice(mB1, getWeightsSize(mB1, mType), mBdev1); -} - -FFNPluginDynamic::FFNPluginDynamic(std::string const name, void const* data, size_t length) - : mLayerName(name), mWdev1(nullptr), mWdev2(nullptr), mBdev1(nullptr) { - gLogInfo << "FFNPluginDynamic deserialize" << endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mType); - deserialize_value(&data, &length, &mHiddenSize); - deserialize_value(&data, &length, &mActType); - - char const* d = static_cast(data); - - mW1.convertAndCopy(d, mHiddenSize * mHiddenSize * 4, mType); - copyToDevice(mW1, getWeightsSize(mW1, mType), mWdev1); - - mW2.convertAndCopy(d, mHiddenSize * mHiddenSize * 4, mType); - copyToDevice(mW2, getWeightsSize(mW2, mType), mWdev2); - - mB1.convertAndCopy(d, mHiddenSize * 4, mType); - copyToDevice(mB1, getWeightsSize(mB1, mType), mBdev1); -} - -// IPluginV2 Methods -char const* FFNPluginDynamic::getPluginType() const noexcept { return kFFN_NAME; } - -char const* FFNPluginDynamic::getPluginVersion() const noexcept { return kFFN_VERSION; } - -int32_t FFNPluginDynamic::getNbOutputs() const noexcept { return 1; } - -int32_t FFNPluginDynamic::initialize() noexcept { - gLogInfo << "FFNPluginDynamic initialize" << endl; - return 0; -} - -void FFNPluginDynamic::terminate() noexcept { gLogInfo << 
"FFNPluginDynamic terminate" << endl; } - -size_t FFNPluginDynamic::getSerializationSize() const noexcept { - size_t wordSize = getElementSize(mType); - return wordSize * (mHiddenSize * mHiddenSize * 8 + mHiddenSize * 4) + sizeof(mType) + sizeof(mHiddenSize) + - sizeof(mActType); -} - -void FFNPluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mType); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mActType); - - size_t wordSize = getElementSize(mType); - char* d = static_cast(buffer); - serFromDev(d, static_cast(mWdev1.get()), 4 * mHiddenSize * mHiddenSize * wordSize); - serFromDev(d, static_cast(mWdev2.get()), 4 * mHiddenSize * mHiddenSize * wordSize); - serFromDev(d, static_cast(mBdev1.get()), 4 * mHiddenSize * wordSize); -} - -void FFNPluginDynamic::destroy() noexcept { - gLogInfo << "FFNPluginDynamic destroy" << endl; - mWdev1.reset(nullptr); - mWdev2.reset(nullptr); - mBdev1.reset(nullptr); - delete this; -} - -void FFNPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FFNPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType FFNPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF); - return inputTypes[0]; -} - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* FFNPluginDynamic::clone() const noexcept { - try { - gLogInfo << "FFNPluginDynamic clone" << endl; - - auto* p = new FFNPluginDynamic(mLayerName, mType, mHiddenSize, mActType, mW1, mW2, mB1); - p->setPluginNamespace(mNamespace.c_str()); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs FFNPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(outputIndex == 0); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - ret.d[2] = exprBuilder.constant(mHiddenSize); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool FFNPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(inOut != nullptr); - - PluginTensorDesc const& in = inOut[pos]; - if (pos == 0) { - return (in.type == mType) && (in.format == TensorFormat::kLINEAR); - } - PluginTensorDesc const& prev = inOut[pos - 1]; - - // output - return in.type == prev.type && in.format == prev.format; -} - -void FFNPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - try { - // Validate input arguments - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - 
IXRT_PLUGIN_ASSERT(outputs != nullptr); - IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type); - auto const& inDims0 = inputs[0].desc.dims; - - IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5); - IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1); - IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1); -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferCreate(&cuinfer_handle)); -#else - CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle)); -#endif - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t FFNPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { - int32_t const S = inputs[0].dims.d[SDIM]; - int32_t const B = inputs[0].dims.d[BDIM]; - return B * S * 4 * mHiddenSize * sizeof(half); -} - -int32_t FFNPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workSpace, - cudaStream_t stream) noexcept { - gLogInfo << "in FFNPluginDynamic.." << endl; - try { -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream)); -#endif - int32_t const S = inputDesc->dims.d[SDIM]; - int32_t const B = inputDesc->dims.d[BDIM]; - int32_t const n = S * B; - IXRT_PLUGIN_ASSERT(n >= 0); - - if (mType == DataType::kHALF) { - auto const* const input = static_cast(inputs[0]); - auto* output = static_cast(outputs[0]); - auto weight1 = static_cast(mWdev1.get()); - auto weight2 = static_cast(mWdev2.get()); - auto bias1 = static_cast(mBdev1.get()); - auto buffer = static_cast(workSpace); - -#ifdef __ILUVATAR__ - cuinfer_gemm(weight1, input, bias1, buffer, 1, mHiddenSize * 4, n, mHiddenSize, 0, 0, 0, 1.0f, mActType, - stream, cuinfer_handle); - cuinfer_gemm(weight2, buffer, nullptr, output, 1, mHiddenSize, n, 4 * mHiddenSize, 0, 0, 0, 1.0f, -1, - stream, cuinfer_handle); -#else - cublaslt_gemm(weight1, input, buffer, 1, mHiddenSize * 4, n, mHiddenSize, 0, 0, 0, 1.0f, blaslt_handle, - stream); - computeGeluBias(buffer, buffer, bias1, 4 * mHiddenSize, n, stream); - cublaslt_gemm(weight2, buffer, output, 1, mHiddenSize, n, mHiddenSize * 4, 0, 0, 0, 1.0f, blaslt_handle, - stream); -#endif - } else { - gLogError << "Unsupported type error, expected [kHALF], but received " << static_cast(mType) - << endl; - return STATUS_FAILURE; - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} \ No newline at end of file diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/ffn/ffnPlugin.h b/models/nlp/plm/bert_base_squad/ixrt/src/ffn/ffnPlugin.h deleted file mode 100644 index 21459c9b..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/ffn/ffnPlugin.h +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/ -#pragma once -#ifdef __ILUVATAR__ -#include -#endif - -#include - -#include "NvInferRuntime.h" -#include "NvInferRuntimeCommon.h" -#include "backend/cublas/cublas_helper.h" -#include "bertCommon.h" -#include - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -class FFNPluginDynamic : public nvinfer1::IPluginV2DynamicExt { - public: - FFNPluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim, - int32_t const out_type, nvinfer1::Weights const& W1, nvinfer1::Weights const& W2, - nvinfer1::Weights const& B1); - - FFNPluginDynamic(std::string const name, void const* data, size_t length); - - // It doesn't make sense to make FFNPluginDynamic without arguments, so we - // delete default constructor. - FFNPluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept override; - - private: - std::string const mLayerName; - std::string mNamespace; - - nvinfer1::DataType mType; - size_t mHiddenSize; - size_t mActType; - - bert::WeightsWithOwnership mW1; - bert::WeightsWithOwnership mB1; - bert::WeightsWithOwnership mW2; - bert::cuda_unique_ptr mWdev1; - bert::cuda_unique_ptr mWdev2; - bert::cuda_unique_ptr mBdev1; - -#ifdef __ILUVATAR__ - cuinferHandle_t cuinfer_handle; -#else - cublasLtHandle_t blaslt_handle; -#endif - cudaStream_t stream; -}; - -class FFNPluginDynamicCreator : public nvinfer1::IPluginCreator { - public: - FFNPluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* 
pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - - private: - static nvinfer1::PluginFieldCollection mFFN; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -class FFNInt8PluginDynamic : public nvinfer1::IPluginV2DynamicExt { - public: - FFNInt8PluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim, - nvinfer1::Weights const& W, nvinfer1::Weights const& Bias, vector const& scale); - - FFNInt8PluginDynamic(std::string const name, void const* data, size_t length); - - // It doesn't make sense to make FFNInt8PluginDynamic without arguments, so we - // delete default constructor. - FFNInt8PluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept override; - - private: - std::string const mLayerName; - std::string mNamespace; - - nvinfer1::DataType mType; - size_t mOutDim; // leading dim - size_t mNumParams; - int32_t mNmax; - int32_t mK; - int32_t mNumBias; - - vector mScale; - - bert::WeightsWithOwnership mW; - bert::cuda_unique_ptr mWdev; - - bert::WeightsWithOwnership mBias; - bert::cuda_unique_ptr mBiasdev; - -#ifdef __ILUVATAR__ - cuinferHandle_t cuinfer_handle; -#else - cublasLtHandle_t blaslt_handle; -#endif - cudaStream_t stream; -}; - -class FFNInt8PluginDynamicCreator : public nvinfer1::IPluginCreator { - public: - FFNInt8PluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept 
override; - - char const* getPluginNamespace() const noexcept override; - - private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/gelu/geluPlugin.cpp b/models/nlp/plm/bert_base_squad/ixrt/src/gelu/geluPlugin.cpp deleted file mode 100644 index 897052de..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/gelu/geluPlugin.cpp +++ /dev/null @@ -1,355 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -#include "geluPlugin.h" -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "plugin.h" -#include "serialize.h" - -#include - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* const kGELU_IXRT_PLUGIN_VERSION{"1"}; -char const* const kGELU_IXRT_PLUGIN_NAME{"CustomGeluPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection GeluPluginDynamicCreator::mFC{}; -std::vector GeluPluginDynamicCreator::mPluginAttributes; - -GeluPluginDynamicCreator::GeluPluginDynamicCreator() { - mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("bias", nullptr, PluginFieldType::kFLOAT32, 1)); - - // Fill PluginFieldCollection with PluginField arguments metadata - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* GeluPluginDynamicCreator::getPluginName() const noexcept { return kGELU_IXRT_PLUGIN_NAME; } - -char const* GeluPluginDynamicCreator::getPluginVersion() const noexcept { return kGELU_IXRT_PLUGIN_VERSION; } - -PluginFieldCollection const* GeluPluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2* GeluPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept { - try { - gLogVerbose << "GeluPluginDynamicCreator createPlugin\n"; - IXRT_PLUGIN_ASSERT(fc != nullptr); - - Weights bias{DataType::kFLOAT, nullptr, 0}; - int32_t typeId = -1; - ixrt_plugin::validateRequiredAttributesExist({"type_id", "ld"}, fc); - int32_t ld = 0; - - for (int32_t i = 0; i < fc->nbFields; i++) { - IXRT_PLUGIN_ASSERT(fc->fields[i].name != nullptr); - std::string fieldName(fc->fields[i].name); - - if (fieldName.compare("type_id") == 0) { - typeId = *static_cast(fc->fields[i].data); - } - if (fieldName.compare("bias") == 0) { - bias.values = fc->fields[i].data; - bias.count = fc->fields[i].length; - bias.type = fieldTypeToDataType(fc->fields[i].type); - if (ld == 0) { - ld = bias.count; - } - } - if (fieldName.compare("ld") == 0) { - ld = *static_cast(fc->fields[i].data); - } - } - - if (typeId < 0 || typeId > 3) { - gLogError << "GeluPluginDynamicCreator: invalid typeId " << typeId << std::endl; - return nullptr; - } - - return new GeluPluginDynamic(name, static_cast(typeId), bias, ld); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* GeluPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - // This object will be deleted when the network is destroyed, which will - // call GeluPluginDynamic::destroy() - try { - return new GeluPluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void GeluPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* GeluPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(GeluPluginDynamicCreator); -//#########################################################################// -GeluPluginDynamic::GeluPluginDynamic(const std::string name, const DataType type, Weights const& bias, const int ld) - : mLayerName(name), mType(type), mLd(ld), 
mNumBias(bias.count) { - if (mNumBias > 0) { - mBias.convertAndCopy(bias, DataType::kHALF); - copyToDevice(mBias, getWeightsSize(mBias, DataType::kHALF), mBiasDev); - } -} - -GeluPluginDynamic::GeluPluginDynamic(const std::string name, void const* data, size_t length) : mLayerName(name) { - gLogVerbose << "GeluPluginDynamic deserialize\n"; - deserialize_value(&data, &length, &mType); - deserialize_value(&data, &length, &mLd); - deserialize_value(&data, &length, &mNumBias); - - if (mNumBias > 0) { - IXRT_PLUGIN_ASSERT(mLd > 0); - char const* d = static_cast(data); - mBias.convertAndCopy(d, mNumBias, DataType::kHALF); - copyToDevice(mBias, getWeightsSize(mBias, DataType::kHALF), mBiasDev); - } -} - -// IPluginV2 Methods - -char const* GeluPluginDynamic::getPluginType() const noexcept { return kGELU_IXRT_PLUGIN_NAME; } - -char const* GeluPluginDynamic::getPluginVersion() const noexcept { return kGELU_IXRT_PLUGIN_VERSION; } - -int32_t GeluPluginDynamic::getNbOutputs() const noexcept { return 1; } - -int32_t GeluPluginDynamic::initialize() noexcept { - gLogVerbose << "GeluPluginDynamic initalize\n"; - return 0; -} - -void GeluPluginDynamic::terminate() noexcept { gLogVerbose << "GeluPluginDynamic terminate\n"; } - -size_t GeluPluginDynamic::getSerializationSize() const noexcept { - const size_t wordSize = getElementSize(mType); - return sizeof(mType) + sizeof(mLd) + sizeof(mNumBias) + mNumBias * sizeof(half); -} - -void GeluPluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mType); - serialize_value(&buffer, mLd); - serialize_value(&buffer, mNumBias); - if (mNumBias > 0) { - IXRT_PLUGIN_ASSERT(mLd > 0); - char* d = static_cast(buffer); - - serFromDev(d, static_cast(mBiasDev.get()), mLd * getElementSize(DataType::kHALF)); - } -} - -void GeluPluginDynamic::destroy() noexcept { - gLogVerbose << "GeluPluginDynamic destroy\n"; - // This gets called when the network containing plugin is destroyed - if (mNumBias > 0) { - mBiasDev.reset(); - } - delete this; -} - -void GeluPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* GeluPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -nvinfer1::DataType GeluPluginDynamic::getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept { - try { - IXRT_PLUGIN_ASSERT(index == 0); - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF || - inputTypes[0] == DataType::kINT8); - return inputTypes[0]; - } catch (std::exception const& e) { - caughtError(e); - } - return DataType{}; -} - -// IPluginV2DynamicExt Methods -nvinfer1::IPluginV2DynamicExt* GeluPluginDynamic::clone() const noexcept { - try { - gLogVerbose << "GeluPluginDynamic clone\n"; - auto* plugin = new GeluPluginDynamic(mLayerName, mType, mBias, mLd); - plugin->setPluginNamespace(mNamespace.c_str()); - return plugin; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -nvinfer1::DimsExprs GeluPluginDynamic::getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, - int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(outputIndex == 0); - return 
inputs[0]; - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool GeluPluginDynamic::supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, - int32_t nbInputs, int32_t nbOutputs) noexcept { - try { - IXRT_PLUGIN_ASSERT(inOut != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(pos >= 0); - IXRT_PLUGIN_ASSERT(pos < nbInputs + nbOutputs); - } catch (std::exception const& e) { - caughtError(e); - return false; - } - - PluginTensorDesc const& input = inOut[0]; - if (pos == 0) { - return (input.type == mType) && (input.format == TensorFormat::kLINEAR); - } - if (pos == 1) { - PluginTensorDesc const& output = inOut[1]; - return (input.type == output.type) && (output.format == TensorFormat::kLINEAR) && (output.type == mType); - } - return false; -} - -void GeluPluginDynamic::configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept { - gLogVerbose << "GeluPluginDynamic configurePlugin\n"; - - try { - IXRT_PLUGIN_ASSERT(in != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(mType == in[0].desc.type); - IXRT_PLUGIN_ASSERT(mType == DataType::kHALF || mType == DataType::kINT8); - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t GeluPluginDynamic::getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, - int32_t nbOutputs) const noexcept { - return 0; -} - -template -int32_t GeluPluginDynamic::enqueueTyped(void const* input_, void* output_, int32_t const inputVolume, - cudaStream_t stream) noexcept { - TDataType const* input = static_cast(input_); - TDataType* output = static_cast(output_); - int32_t const cols = inputVolume / mLd; - int32_t const rows = mLd; - - if (mNumBias > 0) { - TDataType const* bias = static_cast(mBiasDev.get()); - return computeGeluBias(output, input, bias, rows, cols, stream); - } else { - return computeGelu(stream, inputVolume, input, output); - } -} - -int32_t GeluPluginDynamic::enqueueInt8(void const* input_, void* output_, float dequant_scale, float quant_scale, - int32_t const inputVolume, cudaStream_t stream) noexcept { - int8_t const* input = static_cast(input_); - int8_t* output = static_cast(output_); - int32_t const cols = inputVolume / mLd; - int32_t const rows = mLd; - - if (mNumBias > 0) { - half const* bias = static_cast(mBiasDev.get()); - return computeGeluI8O8Bias(output, input, bias, rows, cols, dequant_scale, quant_scale, stream); - } else { - return computeGeluI8O8(stream, inputVolume, input, output, dequant_scale, quant_scale); - } -} - -int32_t GeluPluginDynamic::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, - nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) noexcept { - try { - IXRT_PLUGIN_ASSERT(inputDesc != nullptr); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - } catch (std::exception const& e) { - caughtError(e); - return STATUS_FAILURE; - } - - int32_t const inputVolume = volume(inputDesc[0].dims); - int32_t batch_token_num = inputDesc[0].dims.d[BDIM] * inputDesc[0].dims.d[SDIM]; - - // Our plugin outputs only one tensor. - // Launch CUDA kernel wrapper and save its return value. 
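// [Editorial sketch; not part of the original patch.]
// For the INT8 branch below, TensorRT attaches a per-tensor scale to each
// tensor descriptor such that real_value ~= int8_value * scale. The call
// therefore passes inputDesc[0].scale as the dequantization factor and
// 1.0 / outputDesc[0].scale as the requantization factor, i.e. conceptually:
//
//   float x  = int8_in * s_in;                        // dequantize
//   float y  = gelu(x);                               // activate
//   int8_out = clamp(round(y / s_out), -127, 127);    // requantize
//
// which matches the float-to-int8 rounding done kernel-side in geluPlugin.cu.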
- switch (mType) { - case DataType::kFLOAT: - return enqueueTyped<float>(inputs[0], outputs[0], inputVolume, stream); - case DataType::kHALF: - return enqueueTyped<half>(inputs[0], outputs[0], inputVolume, stream); - case DataType::kINT8: { - int8_t* input = (int8_t*)(inputs[0]); - int8_t* output = (int8_t*)(outputs[0]); - return enqueueInt8(input, output, inputDesc[0].scale, 1.0/outputDesc[0].scale, inputVolume, stream); - } - default: - return STATUS_FAILURE; - } -} diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/gelu/geluPlugin.cu b/models/nlp/plm/bert_base_squad/ixrt/src/gelu/geluPlugin.cu deleted file mode 100644 index c36cac15..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/gelu/geluPlugin.cu +++ /dev/null @@ -1,218 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License
-*/ -#include "backend/bert/bert_helper.h" -#include "geluPlugin.h" - -namespace nvinfer1::ixrt_plugin { -using namespace backend; -namespace bert { -// constants for approximating the normal cdf -constexpr float A = 0.5f; -constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) -constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI) - - -template -__global__ void IxinferBiasGeluI8II8OKernel(int8_t *input, int8_t *output, const T *bias, int feature_dim, - float dequant_scale, float quant_scale) { - int block_start = blockIdx.x * feature_dim; - int start = block_start + threadIdx.x; - int end = block_start + feature_dim; - for (int i = start; i < end; i += blockDim.x) { - int input_index = i; - - float fout = gelu(float(input[input_index]) * dequant_scale + __ldg(&bias[i - block_start])); - - int output_index = i; - output[output_index] = float2int8(fout, quant_scale); - } -} - -template <> -__global__ void IxinferBiasGeluI8II8OKernel<__half>(int8_t *input, int8_t *output, const __half *bias, int feature_dim, - float dequant_scale, float quant_scale) { - // #pragma unroll - for (int block_index = 0; block_index < 2; block_index++) { - int block_start = (blockIdx.x * 2 + block_index) * feature_dim; - int start = block_start + threadIdx.x * 4; - int input_index = start; - char4 *p_input = (char4 *)(input + input_index); - half2 *p_bias = (half2 *)(bias + input_index - block_start); - float fout1 = gelu(float(p_input[0].x) * dequant_scale + __half2float(p_bias[0].x)); - float fout2 = gelu(float(p_input[0].y) * dequant_scale + __half2float(p_bias[0].y)); - float fout3 = gelu(float(p_input[0].z) * dequant_scale + __half2float(p_bias[1].x)); - float fout4 = gelu(float(p_input[0].w) * dequant_scale + __half2float(p_bias[1].y)); - - int output_index = start; - char4 out; - out.x = float2int8(fout1, quant_scale); - out.y = float2int8(fout2, quant_scale); - out.z = float2int8(fout3, quant_scale); - out.w = float2int8(fout4, quant_scale); - char4 *p_output = (char4 *)(output + output_index); - - p_output[0] = out; - } -} - -template -void IxinferBiasGeluI8II8O(int batch_token_num, cudaStream_t stream, int8_t *input, int8_t *output, const T *bias, - int feature_dim, float dequant_scale, float quant_scale) { - IxinferBiasGeluI8II8OKernel - <<>>(input, output, bias, feature_dim, dequant_scale, quant_scale); -} - -template void IxinferBiasGeluI8II8O(int, cudaStream_t, int8_t*, int8_t *, const half *, int, float, float); - -template -__global__ void geluKernel(const half a, const half b, const half c, int n, const half* input, half* output) { - const int idx = blockIdx.x * TPB + threadIdx.x; - - if (idx < n) { - const half in = input[idx]; - const half cdf = a + a * __float2half(tanh(__half2float(in * (c * in * in + b)))); - output[idx] = in * cdf; - } -} - -template -__global__ void geluKernel(const float a, const float b, const float c, int n, const float* input, float* output) { - const int idx = blockIdx.x * TPB + threadIdx.x; - - if (idx < n) { - const float in = input[idx]; - const float cdf = a + a * tanh(in * (c * in * in + b)); - output[idx] = in * cdf; - } -} - -template -__global__ void geluKernel(const float a, const float b, const float c, int n, const int8_t* input, int8_t* output, - float dequant_scale, float quant_scale) { - const int idx = blockIdx.x * TPB + threadIdx.x; - - if (idx < n) { - const float in = float(input[idx]) * dequant_scale; - const float cdf = a + a * tanh(in * (c * in * in + b)); - float i8_f = in * cdf * quant_scale; - int32_t i8 = floorf(i8_f + 
0.5); - i8 = i8 < -127 ? -127 : (i8 > 127 ? 127 : i8); - output[idx] = int8_t(i8); - } -} - -int computeGelu(cudaStream_t stream, int n, const float* input, float* output) { - constexpr int blockSize = 256; - const int gridSize = (n + blockSize - 1) / blockSize; - geluKernel<<>>(A, B, C, n, input, output); - - return 0; -} - -int computeGelu(cudaStream_t stream, int n, const half* input, half* output) { - constexpr int blockSize = 256; - const int gridSize = (n + blockSize - 1) / blockSize; - geluKernel<<>>(A, B, C, n, input, output); - - return 0; -} - -int32_t computeGeluI8O8(cudaStream_t stream, int n, const int8_t* input, int8_t* output, float dequant_scale, - float quant_scale) { - constexpr int blockSize = 256; - const int gridSize = (n + blockSize - 1) / blockSize; - geluKernel<<>>(A, B, C, n, input, output, dequant_scale, quant_scale); - - return 0; -} - -template -__global__ void geluBiasKernel(const half a, const half b, const half c, half* output, const half* input, - const half* bias, const int ld) { - const int offset = blockIdx.x * ld; - - for (int it = threadIdx.x; it < ld; it += TPB) { - const int idx = it + offset; - const half in = input[idx] + bias[it]; - const half cdf = a + a * __float2half(tanh(__half2float(in * (c * in * in + b)))); - output[idx] = in * cdf; - } -} - -template -__global__ void geluBiasKernel(const float a, const float b, const float c, float* output, const float* input, - const float* bias, const int ld) { - const int offset = blockIdx.x * ld; - - for (int it = threadIdx.x; it < ld; it += TPB) { - const int idx = it + offset; - const float in = input[idx] + bias[it]; - const float cdf = a + a * tanh(in * (c * in * in + b)); - output[idx] = in * cdf; - } -} - -template -__global__ void geluBiasKernel(const float a, const float b, const float c, int8_t* output, const int8_t* input, - const half* bias, float dequant_scale, float quant_scale, const int ld) { - const int offset = blockIdx.x * ld; - - for (int it = threadIdx.x; it < ld; it += TPB) { - const int idx = it + offset; - const float in = float(input[idx]) * dequant_scale + __half2float(bias[it]); - const float cdf = a + a * tanh(in * (c * in * in + b)); - float i8_f = in * cdf * quant_scale; - int32_t i8 = floorf(i8_f + 0.5); - i8 = i8 < -127 ? -127 : (i8 > 127 ? 127 : i8); - output[idx] = int8_t(i8); - } -} - -int computeGeluBias(float* output, const float* input, const float* bias, const int ld, const int cols, - cudaStream_t stream) { - geluBiasKernel<256><<>>(A, B, C, output, input, bias, ld); - return cudaPeekAtLastError(); -} - -int computeGeluBias(half* output, const half* input, const half* bias, const int ld, const int cols, - cudaStream_t stream) { - geluBiasKernel<256><<>>(A, B, C, output, input, bias, ld); - return cudaPeekAtLastError(); -} - -int32_t computeGeluI8O8Bias(int8_t* output, const int8_t* input, const half* bias, const int ld, const int cols, - float dequant_scale, float quant_scale, cudaStream_t stream) { - geluBiasKernel<256><<>>(A, B, C, output, input, bias, dequant_scale, quant_scale, ld); - return cudaPeekAtLastError(); -} - -} // namespace bert -} // namespace nvinfer1::plugin diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/gelu/geluPlugin.h b/models/nlp/plm/bert_base_squad/ixrt/src/gelu/geluPlugin.h deleted file mode 100644 index 182fe7f3..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/gelu/geluPlugin.h +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. 
-* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -#pragma once -#ifdef __ILUVATAR__ -#include -#endif - -#include - -#include "NvInferRuntime.h" -#include "bertCommon.h" - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -template -void IxinferBiasGeluI8II8O(int batch_token_num, cudaStream_t stream, int8_t *input, int8_t *output, const T *bias, - int feature_dim, float dequant_scale, float quant_scale); - -int32_t computeGelu(cudaStream_t stream, int32_t n, float const* input, float* output); - -int32_t computeGelu(cudaStream_t stream, int32_t n, half const* input, half* output); - -int32_t computeGeluI8O8(cudaStream_t stream, int n, const int8_t* input, int8_t* output, float dequant_scale, - float quant_scale); - -int32_t computeGeluBias(float* output, float const* input, float const* bias, int32_t const ld, int32_t const cols, - cudaStream_t stream); - -int32_t computeGeluBias(half* output, half const* input, half const* bias, int32_t const ld, int32_t const cols, - cudaStream_t stream); - -int32_t computeGeluI8O8Bias(int8_t* output, const int8_t* input, const half* bias, const int ld, const int cols, - float dequant_scale, float quant_scale, cudaStream_t stream); - -class GeluPluginDynamic : public nvinfer1::IPluginV2DynamicExt { - public: - GeluPluginDynamic(const std::string name, const nvinfer1::DataType type, nvinfer1::Weights const& bias, - const int ld); - - GeluPluginDynamic(const std::string name, void const* data, size_t length); - - // It doesn't make sense to make GeluPluginDynamic without arguments, so we delete - // default constructor. 
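// [Editorial note; not part of the original patch.]
// As with the other plugins in this file set, an instance is only ever built
// from explicit attributes or reconstructed from a serialized blob; a
// default-constructed plugin would carry no data type, bias, or leading
// dimension to run with, hence the deleted default constructor below. The two
// supported construction paths, per the declarations above:
//
//   // build time (createPlugin):
//   new GeluPluginDynamic(name, type, biasWeights, ld);
//   // engine load (deserializePlugin):
//   new GeluPluginDynamic(name, blobPtr, blobLen);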
- GeluPluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept override; - - private: - // Helper method for enqueue() - template - int32_t enqueueTyped(void const* input, void* output, int32_t const inputVolume, cudaStream_t stream) noexcept; - int32_t enqueueInt8(void const* input_, void* output_, float dequant_scale, float quant_scale, - int32_t const inputVolume, cudaStream_t stream) noexcept; - - const std::string mLayerName; - std::string mNamespace; - - nvinfer1::DataType mType; - bert::WeightsWithOwnership mBias; - bert::cuda_unique_ptr mBiasDev; - size_t mLd; - size_t mNumBias; -}; - -class GeluPluginDynamicCreator : public nvinfer1::IPluginCreator { - public: - GeluPluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - - private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -} // namespace bert -} // namespace nvinfer1::plugin diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cpp b/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cpp deleted file mode 100644 index c3a25ba1..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cpp +++ /dev/null @@ -1,335 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
-* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -#include "qkvToContextInt8Plugin.h" - -#include "NvInferRuntime.h" -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "driver_types.h" -#include "plugin.h" -#include "serialize.h" -#include - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* const kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_VERSION{"3"}; -char const* const kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_NAME{"CustomQKVToContextPluginDynamic_IxRT"}; -} // namespace - -PluginFieldCollection QKVToContextInt8PluginDynamicCreator::mFC{}; -std::vector QKVToContextInt8PluginDynamicCreator::mPluginAttributes; - -constexpr uint32_t IIDX = 0; // index of the input tensor -constexpr uint32_t MIDX = 1; // index of the mask -/* -dq_probs: -_arrange_qkv_amax -_softmax_in_amax -_softmax_out_amax -*/ -QKVToContextInt8PluginDynamicCreator::QKVToContextInt8PluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("hidden_size", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("num_heads", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("dq_probs", nullptr, PluginFieldType::kFLOAT32, 3)); - - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* QKVToContextInt8PluginDynamicCreator::getPluginName() const noexcept { - return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_NAME; -} - -char const* QKVToContextInt8PluginDynamicCreator::getPluginVersion() const noexcept { - return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_VERSION; -} - -PluginFieldCollection const* QKVToContextInt8PluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2* QKVToContextInt8PluginDynamicCreator::createPlugin(char const* name, - PluginFieldCollection const* fc) noexcept { - try { - int32_t hiddenSize = 0; - // Since numHeads must always exist or validateRequiredAttributes will fail, - // we can set numHeads to -1 so that static analysis tools don't warn about - // a division by zero in QKVToContextInt8PluginDynamic constructor. 
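// [Editorial sketch; not part of the original patch.]
// Per the dq_probs comment above, the three floats parsed here are the
// calibration amax values that this plugin's enqueue() later consumes:
//   dqProbs[0] -> arrange_qkv_amax   (rearranged QKV activations)
//   dqProbs[1] -> softmax_in_amax    (attention logits)
//   dqProbs[2] -> softmax_out_amax   (attention probabilities)
// Under symmetric INT8 quantization an amax relates to a TensorRT per-tensor
// scale as amax = scale * 127, which is why enqueue() computes, e.g.,
// qkv_out_amax_ = inputDesc[0].scale * 127.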
- int32_t numHeads{-1}; - - vector dqProbs; - - ixrt_plugin::validateRequiredAttributesExist({"hidden_size", "num_heads"}, fc); - - for (int32_t i = 0; i < fc->nbFields; i++) { - std::string field_name(fc->fields[i].name); - - if (field_name.compare("hidden_size") == 0) { - hiddenSize = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_CHECK_VALUE(hiddenSize > 0, - ("QKV: Invalid hiddenSize " + std::to_string(hiddenSize)).c_str()); - gLogInfo << "Building hiddenSize: " << hiddenSize << endl; - } - if (field_name.compare("num_heads") == 0) { - numHeads = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_CHECK_VALUE(numHeads > 0, ("QKV: Invalid numHeads " + std::to_string(numHeads)).c_str()); - gLogInfo << "Building numHeads: " << numHeads << endl; - } - if (field_name.compare("dq_probs") == 0) { - IXRT_PLUGIN_CHECK_VALUE(fc->fields[i].length > 0, - ("QKV: dpProbs can not be empty, error: [dpProbs.length == 0]!")); - gLogInfo << "Building dqProbs: ["; - for (auto j = 0; j < fc->fields[i].length; j++) { - dqProbs.emplace_back(static_cast((fc->fields[i].data))[j]); - gLogInfo << std::setprecision(5) << dqProbs[j]; - } - gLogInfo << "]" << endl; - } - } - - QKVToContextInt8PluginDynamic* p = new QKVToContextInt8PluginDynamic(name, hiddenSize, numHeads, dqProbs); - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* QKVToContextInt8PluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - try { - // This object will be deleted when the network is destroyed, which will - // call QKVToContextInt8PluginDynamic::destroy() noexcept - return new QKVToContextInt8PluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void QKVToContextInt8PluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept { - mNamespace = libNamespace; -} - -char const* QKVToContextInt8PluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(QKVToContextInt8PluginDynamicCreator); -//#########################################################################// -QKVToContextInt8PluginDynamic::QKVToContextInt8PluginDynamic(std::string const& name, int32_t const hiddenSize, - int32_t const numHeads, vector const dqProbs) - : mLayerName(name), - mS(0), - mB(0), - mHeadSize(hiddenSize / numHeads), - mHiddenSize(hiddenSize), - mNumHeads(numHeads), - mDqProbs(dqProbs) {} - -QKVToContextInt8PluginDynamic::QKVToContextInt8PluginDynamic(std::string const& name, void const* data, size_t length) - : mLayerName(name) { - gLogInfo << "deserialize QKVToContextInt8PluginDynamic" << endl; - deserialize_value(&data, &length, &mNumHeads); - deserialize_value(&data, &length, &mHeadSize); - deserialize_value(&data, &length, &mHiddenSize); - deserialize_value(&data, &length, &mDqProbs); -} - -// IPluginV2 Methods -char const* QKVToContextInt8PluginDynamic::getPluginType() const noexcept { - return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_NAME; -} - -char const* QKVToContextInt8PluginDynamic::getPluginVersion() const noexcept { - return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_VERSION; -} - -int32_t QKVToContextInt8PluginDynamic::getNbOutputs() const noexcept { return 1; } - -int32_t QKVToContextInt8PluginDynamic::initialize() noexcept { return 0; } - -void QKVToContextInt8PluginDynamic::terminate() noexcept {} - -size_t QKVToContextInt8PluginDynamic::getSerializationSize() const noexcept { - return 
sizeof(mNumHeads) + sizeof(mHeadSize) + sizeof(mHiddenSize) + mDqProbs.size() * sizeof(float) + - sizeof(mDqProbs.size()); -} - -void QKVToContextInt8PluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mNumHeads); - serialize_value(&buffer, mHeadSize); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mDqProbs); -} - -void QKVToContextInt8PluginDynamic::destroy() noexcept { delete this; } - -void QKVToContextInt8PluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { mNamespace = libNamespace; } - -char const* QKVToContextInt8PluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType QKVToContextInt8PluginDynamic::getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0) - return DataType::kINT8; -} - -// IPluginV2DynamicExt Methods -nvinfer1::IPluginV2DynamicExt* QKVToContextInt8PluginDynamic::clone() const noexcept { - try { - QKVToContextInt8PluginDynamic* ret = - new QKVToContextInt8PluginDynamic(mLayerName, mHiddenSize, mNumHeads, mDqProbs); - - ret->setPluginNamespace(mNamespace.c_str()); - return ret; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs QKVToContextInt8PluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, - int32_t nbInputs, IExprBuilder& exprBuilder) noexcept { - // input [B, S, 3*E] int8 - // pad_mask [B, S] int8 - - // output [B, S, E] int8 - IXRT_PLUGIN_ASSERT(outputIndex == 0); - // Copy over everything - DimsExprs output(inputs[IIDX]); - // Divide last dim by three - auto const* three = exprBuilder.constant(3); - output.d[HDIM] = exprBuilder.constant(mHiddenSize); - return output; -} -bool QKVToContextInt8PluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, - int32_t nbInputs, int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 2); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - return (inOut[pos].type == DataType::kINT8) && (inOut[pos].format == TensorFormat::kLINEAR); -} - -void QKVToContextInt8PluginDynamic::configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, - DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 2); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - PluginTensorDesc const& inDesc = in[IIDX].desc; - PluginTensorDesc const& outDesc = out[0].desc; - IXRT_PLUGIN_ASSERT(inDesc.dims.nbDims == 5) - IXRT_PLUGIN_ASSERT(inDesc.dims.d[HDIM] == 3 * mHiddenSize); - IXRT_PLUGIN_ASSERT(inDesc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(inDesc.dims.d[4] == 1); - - PluginTensorDesc const& maskDesc = in[MIDX].desc; - IXRT_PLUGIN_ASSERT(maskDesc.dims.nbDims == 2); - IXRT_PLUGIN_ASSERT(maskDesc.dims.d[0] == inDesc.dims.d[0]); - IXRT_PLUGIN_ASSERT(maskDesc.dims.d[1] == inDesc.dims.d[1]); - - const int32_t S = inDesc.dims.d[SDIM]; - - IXRT_PLUGIN_ASSERT(outDesc.dims.nbDims == 5); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[BDIM] == inDesc.dims.d[BDIM]); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[SDIM] == S); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[HDIM] == mHiddenSize); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[4] == 1); - -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferCreate(&cuinfer_handle)); -#else - CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle)); -#endif -} - -size_t QKVToContextInt8PluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* 
outputs, - int32_t nbOutputs) const noexcept { - const int32_t B = inputs[0].dims.d[BDIM]; - const int32_t S = inputs->dims.d[SDIM]; - const int32_t E = inputs->dims.d[HDIM]; - IXRT_PLUGIN_ASSERT(E == 3 * mHiddenSize); - int64_t buffer_size = B * S * E * sizeof(int8_t) + B * S * S * mNumHeads * sizeof(int8_t); -#ifndef __ILUVATAR__ - buffer_size += B * S * S * mNumHeads * sizeof(int32_t); -#endif - return buffer_size; -} - -int32_t QKVToContextInt8PluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept { - try { -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferSetStream(cuinfer_handle, 0)); -#endif - int32_t const B = inputDesc[0].dims.d[BDIM]; - int32_t const S = inputDesc[0].dims.d[SDIM]; - - float qkv_out_amax_ = inputDesc[0].scale * 127; - float linear_in_amax_ = outputDesc[0].scale * 127; - float arrange_qkv_amax_ = mDqProbs[0]; - float softmax_in_amax_ = mDqProbs[1]; - float softmax_out_amax_ = mDqProbs[2]; - - int8_t* qkv_buffer_ = (int8_t*)inputs[0]; - int8_t* qkv_out_ = (int8_t*)outputs[0]; - int8_t* mask_ = (int8_t*)inputs[1]; - - int64_t buffer_size = B * S * mHiddenSize; - int64_t buffer_size2 = B * S * S * mNumHeads; - int8_t* q_buffer_ = static_cast(workspace); - int8_t* k_buffer_ = q_buffer_ + buffer_size; - int8_t* v_buffer_ = k_buffer_ + buffer_size; - int8_t* qk_buffer_ = v_buffer_ + buffer_size; - -#ifdef __ILUVATAR__ - auto status = - fused_multihead_attetion_int8(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qkv_out_, - qk_buffer_, B, S, mHeadSize, mNumHeads, mHiddenSize, arrange_qkv_amax_, - softmax_in_amax_, softmax_out_amax_, linear_in_amax_, cuinfer_handle, stream); -#else - int32_t* qk_out_ = reinterpret_cast(qk_buffer_ + buffer_size2); - auto status = - fused_multihead_attetion_int8(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qk_out_, qkv_out_, - qk_buffer_, B, S, mHeadSize, mNumHeads, mHiddenSize, arrange_qkv_amax_, - softmax_in_amax_, softmax_out_amax_, linear_in_amax_, blaslt_handle, stream); -#endif - if (status != cudaSuccess) { - return STATUS_FAILURE; - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - return STATUS_FAILURE; - } -} diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cu b/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cu deleted file mode 100644 index 2330debf..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cu +++ /dev/null @@ -1,488 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/
-#include "backend/bert/bert_helper.h"
-#include "backend/cublas/cublas_helper.h"
-#ifdef __ILUVATAR__
-#include "backend/ixinfer/ixinfer_gemm_helper.h"
-#endif
-#include "qkvToContextInt8Plugin.h"
-
-using namespace nvinfer1::ixrt_plugin::backend;
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-const int _max_thread_per_block = 1024;
-const float _quant_range = 127.0;
-
-__global__ void IxinferArrangeEncselfQkvI8II8ONoBias(const int8_t *ori_qkv, int8_t *new_qkv, int max_batch_dim,
-                                                     int batch_seq_len, int dim_per_head, int head_num) {
-    int hidden_size = dim_per_head * head_num;
-    int batch_id = blockIdx.x / batch_seq_len;
-    int token_id = blockIdx.x % batch_seq_len;
-
-    int i = threadIdx.x;  // each thread handles 4 elements
-
-    int head_id = (i * 4) / dim_per_head;
-    int dim_id = (i * 4) % dim_per_head;
-    int target_id = targetid_4dim(batch_id, head_id, token_id, dim_id, head_num, batch_seq_len, dim_per_head);
-
-#pragma unroll
-    for (int qkv_idx = 0; qkv_idx < 3; qkv_idx++) {
-        char4 *p_ori_qkv = (char4 *)(ori_qkv + (blockIdx.x * 3 + qkv_idx) * hidden_size);
-        int qkv_offset = max_batch_dim * qkv_idx;
-        char4 *p_new_qkv = (char4 *)(new_qkv + qkv_offset + target_id);
-        p_new_qkv[0] = p_ori_qkv[i];
-    }
-}
-
-template <int log2_elements, int WARP_BATCH>
-__global__ void IxinferCorrelationSoftmaxEncselfI8II8OKernel(int8_t *correlation, const int8_t *src_padding_mask,
-                                                             int batch_seq_len, float quant_scale,
-                                                             float dequant_scale) {
-    constexpr int next_power_of_two = 1 << log2_elements;
-    constexpr int SOFT_WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
-    constexpr int WARP_ITERATIONS = next_power_of_two / SOFT_WARP_SIZE;
-    int local_idx = threadIdx.x;
-
-    for (int warp_idx = 0; warp_idx < WARP_BATCH; ++warp_idx) {
-        int start_idx = (blockIdx.x * gridDim.y * WARP_BATCH * gridDim.z * batch_seq_len +
-                         (blockIdx.y + gridDim.y * warp_idx) * gridDim.z * batch_seq_len + blockIdx.z * batch_seq_len);
-
-        char4 *p_correlation = (char4 *)(correlation + start_idx);
-        char4 *p_src_padding_mask = (char4 *)(src_padding_mask + blockIdx.x * batch_seq_len);
-
-        // load data from global memory
-        // float
-        float4 elements[WARP_ITERATIONS];
-#pragma unroll
-        for (int it = 0; it < WARP_ITERATIONS; ++it) {
-            int element_index = local_idx + it * SOFT_WARP_SIZE;
-            if (element_index < batch_seq_len / 4) {
-                char4 mask = p_src_padding_mask[element_index];
-                char4 correlation_value = p_correlation[element_index];
-
-                elements[it].x = mask.x ? -INFINITY : (float)correlation_value.x * dequant_scale;
-                elements[it].y = mask.y ? -INFINITY : (float)correlation_value.y * dequant_scale;
-                elements[it].z = mask.z ? -INFINITY : (float)correlation_value.z * dequant_scale;
-                elements[it].w = mask.w ? -INFINITY : (float)correlation_value.w * dequant_scale;
-
-            } else {
-                elements[it].x = -INFINITY;
-                elements[it].y = -INFINITY;
-                elements[it].z = -INFINITY;
-                elements[it].w = -INFINITY;
-            }
-        }
-
-        // compute max_value
-        float max_value = elements[0].x;
-        max_value = (max_value > elements[0].y) ? max_value : elements[0].y;
-        max_value = (max_value > elements[0].z) ? max_value : elements[0].z;
-        max_value = (max_value > elements[0].w) ? max_value : elements[0].w;
-
-#pragma unroll
-        for (int it = 1; it < WARP_ITERATIONS; ++it) {
-            max_value = (max_value > elements[it].x) ? max_value : elements[it].x;
-            max_value = (max_value > elements[it].y) ? max_value : elements[it].y;
-            max_value = (max_value > elements[it].z) ? max_value : elements[it].z;
-            max_value = (max_value > elements[it].w) ? max_value : elements[it].w;
-        }
-
-        warp_reduce(&max_value);
-
-        // exp sum
-        float sum = 0.0f;
-#pragma unroll
-        for (int it = 0; it < WARP_ITERATIONS; ++it) {
-            elements[it].x = __expf(elements[it].x - max_value);
-            elements[it].y = __expf(elements[it].y - max_value);
-            elements[it].z = __expf(elements[it].z - max_value);
-            elements[it].w = __expf(elements[it].w - max_value);
-
-            sum += (elements[it].x + elements[it].y + elements[it].z + elements[it].w);
-        }
-
-        warp_reduce(&sum);
-        sum = 1.0f / sum;
-        // store result
-#pragma unroll
-        for (int it = 0; it < WARP_ITERATIONS; ++it) {
-            int element_index = local_idx + it * SOFT_WARP_SIZE;
-            char4 correlation_value;
-            if (element_index < batch_seq_len / 4) {
-                correlation_value.x = float2int8(elements[it].x * sum, quant_scale);
-                correlation_value.y = float2int8(elements[it].y * sum, quant_scale);
-                correlation_value.z = float2int8(elements[it].z * sum, quant_scale);
-                correlation_value.w = float2int8(elements[it].w * sum, quant_scale);
-
-                p_correlation[element_index] = correlation_value;
-
-            } else {
-                break;
-            }
-        }
-    }
-}
-
-void IxinferCorrelationSoftmaxEncselfI8II8O(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
-                                            int8_t *correlation, const int8_t *src_padding_mask, float quant_scale,
-                                            float dequant_scale) {
-    const int NUM_INT8_SOFTMAX_BATCH_WARP = 4;
-    if (batch_seq_len > 512) {
-        throw std::runtime_error("batch_seq_len should <= 512");
-    }
-    if (head_num % NUM_INT8_SOFTMAX_BATCH_WARP != 0) {
-        throw std::runtime_error("head_num % NUM_INT8_SOFTMAX_BATCH_WARP !0");
-    }
-    if (batch_seq_len % 4 != 0) {
-        throw std::runtime_error("batch_seq_len % 4 != 0");
-    }
-
-    int log2_elements = log2_ceil(batch_seq_len / 4);
-    int next_power_of_two = 1 << log2_elements;
-    int SOFT_WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
-    // dim3 blockSize(batch_size, head_num / NUM_INT8_SOFTMAX_BATCH_WARP,
-    // batch_seq_len);
-    //
-    dim3 grid(batch_size, head_num / NUM_INT8_SOFTMAX_BATCH_WARP, batch_seq_len);
-
-    dim3 block(SOFT_WARP_SIZE);
-
-    switch (log2_elements) {
-        case 0:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<0, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-
-            break;
-
-        case 1:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<1, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-
-        case 2:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<2, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-
-        case 3:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<3, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-
-        case 4:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<4, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-
-        case 5:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<5, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-
-        case 6:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<6, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-        case 7:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<7, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-        case 8:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<8, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-        case 9:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<9, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-        default:
-            throw std::runtime_error(
-                "ker_correlation_softmax_encself_i8I_i8O_ix_ "
-                "NotImplementedError");
-            break;
-    }
-}
-
-
-__global__ void IxinferArrangeAttenOutputI8II8OKernel(const int8_t *ori_q, int8_t *new_q, int beam_size,
-                                                      int dim_per_head, int head_num, float quant_scale,
-                                                      float dequant_scale) {
-    int hidden_size = dim_per_head * head_num;
-
-#pragma unroll
-    for (int blockin = 0; blockin < 4; blockin++) {
-        int batch_id = (blockIdx.x * 4 + blockin) / beam_size;
-        // note, for encoder, beam_id is token_id; for decoder, beam_id is beam_id
-        int beam_id = (blockIdx.x * 4 + blockin) % beam_size;
-        int i = threadIdx.x;
-        int out_index = (blockIdx.x * 4 + blockin) * hidden_size + i;
-        int head_id = i / dim_per_head;
-        int dim_id = i % dim_per_head;
-
-        char4 *p_ori_q = (char4 *)ori_q;
-        char4 *p_new_q = (char4 *)new_q;
-        char4 value;
-
-        value = p_ori_q[targetid_4dim(batch_id, head_id, beam_id, dim_id, head_num, beam_size, dim_per_head)];
-        value.x = float2int8(value.x * dequant_scale, quant_scale);
-        value.y = float2int8(value.y * dequant_scale, quant_scale);
-        value.z = float2int8(value.z * dequant_scale, quant_scale);
-        value.w = float2int8(value.w * dequant_scale, quant_scale);
-        p_new_q[out_index] = value;
-    }
-}
-
-void IxinferArrangeAttenOutputI8II8O(int batch_token_num, int hidden_size, cudaStream_t stream, const int8_t *ori_q,
-                                     int8_t *new_q, int beam_size, int dim_per_head, int head_num,
-                                     int max_thread_per_block, float quant_scale, float dequant_scale) {
-    int qual_hidden_size = hidden_size >> 2;
-    int qual_dim_per_head = dim_per_head >> 2;
-    IxinferArrangeAttenOutputI8II8OKernel<<<batch_token_num / 4, qual_hidden_size, 0, stream>>>(
-        ori_q, new_q, beam_size, qual_dim_per_head, head_num, quant_scale, dequant_scale);
-}
-
-#ifdef __ILUVATAR__
-cudaError_t fused_multihead_attetion_int8(int8_t* qkv_buffer, int8_t* mask, int8_t* q_buffer, int8_t* k_buffer,
-                                          int8_t* v_buffer, int8_t* qkv_out, int8_t* qk_buffer,
-                                          int batch_size, int batch_seq_len, int head_dim, int head_num,
-                                          int hidden_size, float arrange_qkv_amax, float softmax_in_amax,
-                                          float softmax_out_amax, float linear_in_amax, cuinferHandle_t& cuinfer_handle,
-                                          cudaStream_t& stream) {
-    int batch_token_num = batch_size * batch_seq_len;
-    int max_batch_dim = batch_token_num * hidden_size;
-
-    float scaleCtx = linear_in_amax / _quant_range;
-    float scaleArrange = arrange_qkv_amax / _quant_range;
-    float scaleSoftin = softmax_in_amax / _quant_range;
-    float scaleSoftout = softmax_out_amax / _quant_range;
-
-    float scaleBmm1 = scaleArrange * scaleArrange / scaleSoftin * sqrt(1.f / head_dim);
-    float scaleBmm2 = scaleSoftout * scaleArrange / scaleCtx;
-
-    IxinferArrangeEncselfQkvI8II8ONoBias<<<batch_token_num, hidden_size / 4, 0, stream>>>(
-        qkv_buffer, q_buffer, max_batch_dim, batch_seq_len, head_dim, head_num);
-
-    switch (head_dim) {
-        case 64:
-        case 128:
-        case 192:
-        case 256: {
-            cuinferFlashAttnConfigInfo flashAttnInfo;
-            flashAttnInfo.scaling = sqrt(1.f / (head_dim * 1.0));
-            flashAttnInfo.quantParam.q_amax = arrange_qkv_amax;
-            flashAttnInfo.quantParam.k_amax = arrange_qkv_amax;
-            flashAttnInfo.quantParam.v_amax = arrange_qkv_amax;
-            flashAttnInfo.quantParam.p_amax = softmax_out_amax;
-            flashAttnInfo.quantParam.o_amax = linear_in_amax;
-
-            cuinferTensorDescriptor_t qDesc, kDesc, vDesc, maskDesc, oDesc;
-            CUINFER_CHECK(cuinferCreateTensorDescriptor(&qDesc));
-            CUINFER_CHECK(cuinferCreateTensorDescriptor(&kDesc));
-            CUINFER_CHECK(cuinferCreateTensorDescriptor(&vDesc));
-            CUINFER_CHECK(cuinferCreateTensorDescriptor(&maskDesc));
-            CUINFER_CHECK(cuinferCreateTensorDescriptor(&oDesc));
-
-            CUINFER_CHECK(cuinferSetTensor4dDescriptor(qDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW,
-                                                       CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len,
-                                                       head_dim));
-            CUINFER_CHECK(cuinferSetTensor4dDescriptor(kDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW,
-                                                       CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len,
-                                                       head_dim));
-            CUINFER_CHECK(cuinferSetTensor4dDescriptor(vDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW,
-                                                       CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len,
-                                                       head_dim));
-            CUINFER_CHECK(cuinferSetTensor4dDescriptor(maskDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW,
-                                                       CUINFER_DATA_INT8, batch_size, 1, 1, batch_seq_len));
-            CUINFER_CHECK(cuinferSetTensor4dDescriptor(oDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW,
-                                                       CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len,
-                                                       head_dim));
-
-            CUINFER_CHECK(cuinferFMHAForwardEx(cuinfer_handle, flashAttnInfo, qDesc, q_buffer, kDesc, k_buffer, vDesc,
-                                               v_buffer, maskDesc, mask, oDesc, qk_buffer));
-            break;
-        }
-        default: {
-            cuinfer_i8_gemm(k_buffer, q_buffer, nullptr, qkv_buffer, batch_size * head_num, batch_seq_len,
-                            batch_seq_len, head_dim, batch_seq_len * head_dim, batch_seq_len * head_dim,
-                            batch_seq_len * batch_seq_len, scaleBmm1, 0.0, 0, cuinfer_handle, stream);
-
-            IxinferCorrelationSoftmaxEncselfI8II8O(batch_size, batch_seq_len, head_num, stream, qkv_buffer, mask,
-                                                   1.0 / scaleSoftout, scaleSoftin);
-
-            cuinfer_nn_i8_gemm(v_buffer, qkv_buffer, qk_buffer, batch_size * head_num, head_dim, batch_seq_len,
-                               batch_seq_len, batch_seq_len * head_dim, batch_seq_len * batch_seq_len,
-                               batch_seq_len * head_dim, scaleBmm2, cuinfer_handle, stream);
-            break;
-        }
-    }
-
-    IxinferArrangeAttenOutputI8II8O(batch_token_num, hidden_size, stream, qk_buffer, qkv_out, batch_seq_len, head_dim,
-                                    head_num, _max_thread_per_block, 1.f, 1.f);
-    return cudaSuccess;
-}
-#else
-template <int THREAD_DATA_LEN>
-__global__ void quant_qkv_gemm(const int32_t* input, int8_t* output, int hidden_size, float quant_scale, int num_per_tca) {
-    float4 val[THREAD_DATA_LEN];
-
-    int block_id = blockIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z;
-    int block_start = block_id * hidden_size;
-    input += block_start;
-    output += block_start;
-
-    int4* p_input = (int4*)input;
-    char4* p_output = (char4*)output;
-
-    float4 bias_val;
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * num_per_tca;
-        char4 q_input;
-        q_input.x = float2int8(p_input[element_index].x*1.0, quant_scale);
-        q_input.y = float2int8(p_input[element_index].y*1.0, quant_scale);
-        q_input.z = float2int8(p_input[element_index].z*1.0, quant_scale);
-        q_input.w = float2int8(p_input[element_index].w*1.0, quant_scale);
-
-        p_output[element_index] = q_input;
-    }
-}
-
-void quantQKVGemm(int32_t* input, int8_t* output, int batch_size, int head_num, int batch_seq_len, int hidden_size, float dequant_scale, cudaStream_t stream) {
-    if (hidden_size > 4096) {
-        throw std::runtime_error("hidden_size should <= 4096");
-    }
-    int num_per_tca = min(hidden_size / 4, C10_WARP_SIZE);
-    dim3 gridSize(batch_size, head_num, batch_seq_len);
-    dim3 blockSize(num_per_tca);
-
-    int num_warp = hidden_size / num_per_tca / 4;
-    switch (num_warp) {
-        case 1:
-            quant_qkv_gemm<1>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 2:
-            quant_qkv_gemm<2>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 3:
-            quant_qkv_gemm<3>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 4:
-            quant_qkv_gemm<4>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 5:
-            quant_qkv_gemm<5>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 6:
-            quant_qkv_gemm<6>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 7:
-            quant_qkv_gemm<7>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 8:
-            quant_qkv_gemm<8>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 9:
-            quant_qkv_gemm<9>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 10:
-            quant_qkv_gemm<10>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 11:
-            quant_qkv_gemm<11>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 12:
-            quant_qkv_gemm<12>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 13:
-            quant_qkv_gemm<13>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 14:
-            quant_qkv_gemm<14>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 15:
-            quant_qkv_gemm<15>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 16:
-            quant_qkv_gemm<16>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        default:
-            throw std::runtime_error("quantQKVGemm");
-            break;
-    }
-}
-
-
-cudaError_t fused_multihead_attetion_int8(int8_t *qkv_buffer, int8_t *mask, int8_t *q_buffer, int8_t *k_buffer,
-                                          int8_t *v_buffer, int32_t *qk_out, int8_t *qkv_out, int8_t *qk_buffer, int batch_size,
-                                          int batch_seq_len, int head_dim, int head_num, int hidden_size,
-                                          float arrange_qkv_amax, float softmax_in_amax, float softmax_out_amax,
-                                          float linear_in_amax, cublasLtHandle_t &cublas_lt_handle,
-                                          cudaStream_t &stream) {
-    int batch_token_num = batch_size * batch_seq_len;
-    int max_batch_dim = batch_token_num * hidden_size;
-
-    float scaleCtx = linear_in_amax / _quant_range;
-    float scaleArrange = arrange_qkv_amax / _quant_range;
-    float scaleSoftin = softmax_in_amax / _quant_range;
-    float scaleSoftout = softmax_out_amax / _quant_range;
-
-    float scaleBmm1 = scaleArrange * scaleArrange / scaleSoftin * sqrt(1.f / head_dim);
-    float scaleBmm2 = scaleSoftout * scaleArrange / scaleCtx;
-
-    IxinferArrangeEncselfQkvI8II8ONoBias<<<batch_token_num, hidden_size / 4, 0, stream>>>(
-        qkv_buffer, q_buffer, max_batch_dim, batch_seq_len, head_dim, head_num);
-
-    cublaslt_gemm(k_buffer, q_buffer, qk_out, batch_size * head_num, batch_seq_len, batch_seq_len, head_dim,
-                  batch_seq_len * head_dim, batch_seq_len * head_dim, batch_seq_len * batch_seq_len, 1,
-                  cublas_lt_handle, stream);
-    quantQKVGemm(qk_out, qk_buffer, batch_size, head_num, batch_seq_len, batch_seq_len, scaleBmm1, stream);
-
-    IxinferCorrelationSoftmaxEncselfI8II8O(batch_size, batch_seq_len, head_num, stream, qk_buffer, mask,
-                                           1.0 / scaleSoftout, scaleSoftin);
-
-    cublaslt_gemm_nn(v_buffer, qk_buffer, qk_out, batch_size * head_num, head_dim, batch_seq_len, batch_seq_len,
-                     batch_seq_len * head_dim, batch_seq_len * batch_seq_len, batch_seq_len * head_dim, 1,
-                     cublas_lt_handle, stream);
-    quantQKVGemm(qk_out, q_buffer, batch_size, head_num, batch_seq_len, head_dim, scaleBmm2, stream);
-
-    IxinferArrangeAttenOutputI8II8O(batch_token_num, hidden_size, stream, q_buffer, qkv_out, batch_seq_len, head_dim,
-                                    head_num, _max_thread_per_block, 1.f, 1.f);
-    return cudaSuccess;
-}
-#endif
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.h b/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.h
deleted file mode 100644
index b5c501fc..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#pragma once
-#include <cuda.h>
-#include "NvInferRuntime.h"
-#include "bertCommon.h"
-#include <string>
-#include <vector>
-#ifdef __ILUVATAR__
-#include "ixinfer.h"
-#endif
-
-namespace nvinfer1::ixrt_plugin
-{
-namespace bert
-{
-
-#ifdef __ILUVATAR__
-cudaError_t fused_multihead_attetion_int8(int8_t* qkv_buffer, int8_t* mask, int8_t* q_buffer, int8_t* k_buffer,
-                                          int8_t* v_buffer, int8_t* qkv_out, int8_t* qk_buffer,
-                                          int batch_size, int batch_seq_len, int head_dim, int head_num,
-                                          int hidden_size, float arrange_qkv_amax, float softmax_in_amax,
-                                          float softmax_out_amax, float linear_in_amax, cuinferHandle_t& cuinfer_handle,
-                                          cudaStream_t& stream);
-#else
-cudaError_t fused_multihead_attetion_int8(int8_t* qkv_buffer, int8_t* mask, int8_t* q_buffer, int8_t* k_buffer,
-                                          int8_t* v_buffer, int32_t* qk_out, int8_t* qkv_out, int8_t* qk_buffer,
-                                          int batch_size, int batch_seq_len, int head_dim, int head_num,
-                                          int hidden_size, float arrange_qkv_amax, float softmax_in_amax,
-                                          float softmax_out_amax, float linear_in_amax,
-                                          cublasLtHandle_t& cublas_lt_handle, cudaStream_t& stream);
-#endif
-
-void IxinferCorrelationSoftmaxEncselfI8II8O(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
-                                            int8_t *correlation, const int8_t *src_padding_mask, float quant_scale,
-                                            float dequant_scale);
-
-void IxinferArrangeAttenOutputI8II8O(int batch_token_num, int hidden_size, cudaStream_t stream, const int8_t *ori_q,
-                                     int8_t *new_q, int beam_size, int dim_per_head, int head_num,
-                                     int max_thread_per_block, float quant_scale, float dequant_scale);
-class QKVToContextInt8PluginDynamic : public nvinfer1::IPluginV2DynamicExt
-{
-public:
-    QKVToContextInt8PluginDynamic(std::string const& name, int32_t const hiddenSize, int32_t const numHeads,
-                                  vector<float> const dqProbs);
-
-    QKVToContextInt8PluginDynamic(std::string const& name, void const* data, size_t length);
-
-    // It doesn't make sense to make QKVToContextInt8PluginDynamic without arguments, so we
-    // delete default constructor.
-    QKVToContextInt8PluginDynamic() = delete;
-
-    // IPluginV2 Methods
-    char const* getPluginType() const noexcept override;
-    char const* getPluginVersion() const noexcept override;
-    int32_t getNbOutputs() const noexcept override;
-    int32_t initialize() noexcept override;
-    void terminate() noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext Methods
-    nvinfer1::DataType getOutputDataType(
-        int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt Methods
-    nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
-    nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs,
-                                            nvinfer1::IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(
-        int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override;
-    void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs,
-                         nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs,
-                            nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override;
-    int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc,
-                    void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;
-
-protected:
-    void createMHARunner() noexcept;
-    int32_t getSMVersion() const noexcept;
-
-private:
-    std::string const& mLayerName;
-    std::string mNamespace;
-
-    int32_t mS;
-    int32_t mB;
-    int32_t mSM;
-    int32_t mHeadSize;
-    int32_t mHiddenSize;
-    int32_t mNumHeads;
-
-    cuda_unique_ptr<void> mQkvBias;
-
-    vector<float> mDqProbs;
-    bool mUseInt8ScaleMax{true};
-
-#ifdef __ILUVATAR__
-    cuinferHandle_t cuinfer_handle;
-#else
-    cublasLtHandle_t blaslt_handle;
-#endif
-};
-
-class QKVToContextInt8PluginDynamicCreator : public nvinfer1::IPluginCreator
-{
-public:
-    QKVToContextInt8PluginDynamicCreator();
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override;
-
-    nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;
-
-    nvinfer1::IPluginV2* deserializePlugin(
-        char const* name, void const* serialData, size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-
-    char const* getPluginNamespace() const noexcept override;
-
-private:
-    static nvinfer1::PluginFieldCollection mFC;
-    static std::vector<PluginField> mPluginAttributes;
-    std::string mNamespace;
-};
-
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cpp b/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cpp
deleted file mode 100644
index a69fb957..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cpp
+++ /dev/null
@@ -1,388 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#include "qkvToContextPlugin.h"
-
-#include "bertCommon.h"
-#include "checkMacrosPlugin.h"
-#include "common_def.cuh"
-#include "cuda_runtime_api.h"
-#include "driver_types.h"
-#include "plugin.h"
-#include "serialize.h"
-#include <cstring>
-#include <string>
-
-using namespace nvinfer1;
-using namespace nvinfer1::ixrt_plugin;
-using namespace nvinfer1::ixrt_plugin::bert;
-
-namespace {
-char const* const kQKV_TO_CONTEXT_IXRT_PLUGIN_VERSION{"1"};
-char const* const kQKV_TO_CONTEXT_VAR_SEQLEN_IXRT_PLUGIN_VERSION{"2"};
-char const* const kQKV_TO_CONTEXT_IXRT_PLUGIN_NAME{"CustomQKVToContextPluginDynamic_IxRT"};
-}  // namespace
-
-// Static class fields initialization
-PluginFieldCollection QKVToContextPluginDynamicCreator::mFC{};
-std::vector<PluginField> QKVToContextPluginDynamicCreator::mPluginAttributes;
-
-constexpr uint32_t IIDX = 0;  // index of the input tensor
-constexpr uint32_t MIDX = 1;  // index of the mask
-
-QKVToContextPluginDynamicCreator::QKVToContextPluginDynamicCreator() {
-    mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("hidden_size", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("num_heads", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("has_mask", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("dq_probs", nullptr, PluginFieldType::kFLOAT32, 1));
-
-    mFC.nbFields = mPluginAttributes.size();
-    mFC.fields = mPluginAttributes.data();
-}
-
-char const* QKVToContextPluginDynamicCreator::getPluginName() const noexcept {
-    return kQKV_TO_CONTEXT_IXRT_PLUGIN_NAME;
-}
-
-char const* QKVToContextPluginDynamicCreator::getPluginVersion() const noexcept {
-    return kQKV_TO_CONTEXT_IXRT_PLUGIN_VERSION;
-}
-
-PluginFieldCollection const* QKVToContextPluginDynamicCreator::getFieldNames() noexcept { return &mFC; }
-
-IPluginV2* QKVToContextPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept {
-    try {
-        gLogInfo << "Creating QKV2ContextPlugin..." << endl;
-        IXRT_PLUGIN_ASSERT(fc != nullptr);
-        int32_t hiddenSize = 0;
-        // Since numHeads must always exist or validateRequiredAttributes will fail,
-        // we can set numHeads to -1 so that static analysis tools don't warn about
-        // a division by zero in QKVToContextPluginDynamic constructor.
-        int32_t numHeads{-1};
-        bool hasMask = false;
-        int32_t typeId = -1;
-
-        float dqProbs = -1;
-
-        IXRT_PLUGIN_ASSERT(fc->fields != nullptr);
-        ixrt_plugin::validateRequiredAttributesExist({"type_id", "hidden_size", "num_heads", "has_mask"}, fc);
-
-        for (int32_t i = 0; i < fc->nbFields; i++) {
-            IXRT_PLUGIN_ASSERT(fc->fields[i].name != nullptr);
-            IXRT_PLUGIN_ASSERT(fc->fields[i].data != nullptr);
-            std::string field_name(fc->fields[i].name);
-
-            if (field_name.compare("type_id") == 0) {
-                typeId = *static_cast<int32_t const*>(fc->fields[i].data);
-                IXRT_PLUGIN_CHECK_VALUE(typeId >= 0 && typeId <= 2,
-                                        ("QKV: Invalid TypeId " + std::to_string(typeId)).c_str());
-                gLogInfo << "Building typeId: " << typeId << endl;
-            }
-            if (field_name.compare("hidden_size") == 0) {
-                hiddenSize = *static_cast<int32_t const*>(fc->fields[i].data);
-                IXRT_PLUGIN_CHECK_VALUE(hiddenSize > 0,
-                                        ("QKV: Invalid hiddenSize " + std::to_string(hiddenSize)).c_str());
-                gLogInfo << "Building hiddenSize: " << hiddenSize << endl;
-            }
-            if (field_name.compare("num_heads") == 0) {
-                numHeads = *static_cast<int32_t const*>(fc->fields[i].data);
-                IXRT_PLUGIN_CHECK_VALUE(numHeads > 0, ("QKV: Invalid numHeads " + std::to_string(numHeads)).c_str());
-                gLogInfo << "Building numHeads: " << numHeads << endl;
-            }
-            if (field_name.compare("has_mask") == 0) {
-                auto hasMaskValue = *static_cast<int32_t const*>(fc->fields[i].data);
-                IXRT_PLUGIN_CHECK_VALUE(hasMaskValue == 0 || hasMaskValue == 1,
-                                        ("QKV: Invalid hasMask " + std::to_string(hasMaskValue)).c_str());
-                hasMask = static_cast<bool>(hasMaskValue);
-                gLogInfo << "Building hasMask: " << hasMask << endl;
-            }
-        }
-
-        gLogInfo << "Building the Plugin..." << endl;
-        auto type = static_cast<DataType>(typeId);
-        auto* p = new QKVToContextPluginDynamic(name, type, hiddenSize, numHeads, dqProbs, hasMask);
-        return p;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-IPluginV2* QKVToContextPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData,
-                                                               size_t serialLength) noexcept {
-    // This object will be deleted when the network is destroyed, which will
-    // call QKVToContextPluginDynamic::destroy()
-    return new QKVToContextPluginDynamic(name, serialData, serialLength);
-}
-
-void QKVToContextPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept {
-    mNamespace = libNamespace;
-}
-
-char const* QKVToContextPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// REGISTER_TENSORRT_PLUGIN(QKVToContextPluginDynamicCreator);
-//#########################################################################//
-QKVToContextPluginDynamic::QKVToContextPluginDynamic(const std::string name, const DataType type,
-                                                     const int32_t hiddenSize, const int32_t numHeads,
-                                                     float const dqProbs, bool hasImask)
-    : mLayerName(name),
-      mS(0),
-      mB(0),
-      mHeadSize(hiddenSize / numHeads),
-      mHiddenSize(hiddenSize),
-      mNumHeads(numHeads),
-      mHasImask(hasImask),
-      mType(type)
-
-{
-    //
-}
-
-QKVToContextPluginDynamic::QKVToContextPluginDynamic(const std::string name, void const* data, size_t length)
-    : mLayerName(name) {
-    gLogInfo << "QKV Deser Start" << endl;
-    deserialize_value(&data, &length, &mType);
-    deserialize_value(&data, &length, &mNumHeads);
-    deserialize_value(&data, &length, &mHeadSize);
-    deserialize_value(&data, &length, &mHasImask);
-    deserialize_value(&data, &length, &mHiddenSize);
-    deserialize_value(&data, &length, &mS);
-    deserialize_value(&data, &length, &mB);
-
-    gLogInfo << "QKV Deser done" << endl;
-}
-
-// IPluginV2 Methods
-char const* QKVToContextPluginDynamic::getPluginType() const noexcept { return kQKV_TO_CONTEXT_IXRT_PLUGIN_NAME; }
-
-char const* QKVToContextPluginDynamic::getPluginVersion() const noexcept { return kQKV_TO_CONTEXT_IXRT_PLUGIN_VERSION; }
-
-int32_t QKVToContextPluginDynamic::getNbOutputs() const noexcept { return 1; }
-
-int32_t QKVToContextPluginDynamic::initialize() noexcept { return 0; }
-
-void QKVToContextPluginDynamic::terminate() noexcept {}
-
-size_t QKVToContextPluginDynamic::getSerializationSize() const noexcept {
-    return sizeof(mNumHeads) + sizeof(mHeadSize) + sizeof(DataType) + sizeof(mHasImask) + sizeof(mHiddenSize) +
-           sizeof(mS) + sizeof(mB);
-}
-
-void QKVToContextPluginDynamic::serialize(void* buffer) const noexcept {
-    serialize_value(&buffer, mType);
-    serialize_value(&buffer, mNumHeads);
-    serialize_value(&buffer, mHeadSize);
-    serialize_value(&buffer, mHasImask);
-    serialize_value(&buffer, mHiddenSize);
-    serialize_value(&buffer, mS);
-    serialize_value(&buffer, mB);
-}
-
-void QKVToContextPluginDynamic::destroy() noexcept { delete this; }
-
-void QKVToContextPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { mNamespace = libNamespace; }
-
-char const* QKVToContextPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// IPluginV2Ext Methods
-DataType QKVToContextPluginDynamic::getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
-                                                      int32_t /*nbInputs*/) const noexcept {
-    IXRT_PLUGIN_ASSERT(index == 0);
-    IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF ||
-                       inputTypes[0] == DataType::kINT8);
-    return inputTypes[0];
-}
-
-// IPluginV2DynamicExt Methods
-nvinfer1::IPluginV2DynamicExt* QKVToContextPluginDynamic::clone() const noexcept {
-    gLogInfo << "QKV Clone" << endl;
-
-    QKVToContextPluginDynamic* ret = nullptr;
-    ret = new QKVToContextPluginDynamic(mLayerName, mType, mHiddenSize, mNumHeads, mDqProbs, mHasImask);
-
-    ret->setPluginNamespace(mNamespace.c_str());
-    gLogInfo << "QKV Clone done" << endl;
-    return ret;
-}
-
-DimsExprs QKVToContextPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs,
-                                                         int32_t /*nbInputs*/, IExprBuilder& exprBuilder) noexcept {
-    // Input is BxSx3*N*H, output should be BxSxN*H
-    IXRT_PLUGIN_ASSERT(outputIndex == 0);
-    // Copy over everything
-    DimsExprs output(inputs[IIDX]);
-    // Divide last dim by three
-    auto const* three = exprBuilder.constant(3);
-    output.d[HDIM] = exprBuilder.constant(mHiddenSize);
-    return output;
-}
-bool QKVToContextPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs,
-                                                          int32_t /*nbOutputs*/) noexcept {
-    IXRT_PLUGIN_ASSERT(pos >= 0);
-    IXRT_PLUGIN_ASSERT(pos < 2 + mHasImask);
-    IXRT_PLUGIN_ASSERT(nbInputs == 1 + mHasImask);
-    auto const* in = inOut;
-    auto const* out = inOut + nbInputs;
-
-    if (pos == 0) {
-        return (in->type == mType) && (in->format == TensorFormat::kLINEAR);
-    }
-
-    // pos==1
-    if ((mHasImask && pos == 1))  // pos 1 is the mask
-    {
-        auto const* inMask = &inOut[1];
-
-        // detect full mask and check that it was produced
-        return (inMask->type == DataType::kINT32) &&      // precision
-               (inMask->format == TensorFormat::kLINEAR); // format
-    }
-
-    if (!mHasImask || pos == 2)  // output pos
-    {
-        return (in->type == out->type) && (out->format == TensorFormat::kLINEAR);
-    }
-
-    return false;
-}
-void QKVToContextPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs,
-                                                DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept {
-    IXRT_PLUGIN_ASSERT(nbInputs == 1 + mHasImask);
-    IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-    PluginTensorDesc const& inDesc = in[IIDX].desc;
-    TRT_UNUSED inDesc;
-    PluginTensorDesc const& outDesc = out->desc;
-    TRT_UNUSED outDesc;
-    IXRT_PLUGIN_ASSERT(mType == inDesc.type);
-    IXRT_PLUGIN_ASSERT(mType == outDesc.type);
-    IXRT_PLUGIN_ASSERT(inDesc.dims.nbDims == 5)
-    IXRT_PLUGIN_ASSERT(inDesc.dims.d[HDIM] == 3 * mHiddenSize);
-    IXRT_PLUGIN_ASSERT(inDesc.dims.d[3] == 1);
-    IXRT_PLUGIN_ASSERT(inDesc.dims.d[4] == 1);
-    if (mHasImask) {
-        PluginTensorDesc const& maskDesc = in[MIDX].desc;
-        TRT_UNUSED maskDesc;
-        IXRT_PLUGIN_ASSERT(maskDesc.dims.nbDims == 2);
-        IXRT_PLUGIN_ASSERT(maskDesc.dims.d[0] == inDesc.dims.d[0]);
-        IXRT_PLUGIN_ASSERT(maskDesc.dims.d[1] == inDesc.dims.d[1]);
-    }
-
-    const int32_t S = inDesc.dims.d[SDIM];
-    const int32_t B = inDesc.dims.d[BDIM] <= 0 ? in->max.d[BDIM] : inDesc.dims.d[BDIM];
-    mS = S;
-    mB = B;
-
-    IXRT_PLUGIN_ASSERT(outDesc.dims.nbDims == 5);
-    IXRT_PLUGIN_ASSERT(outDesc.dims.d[BDIM] == inDesc.dims.d[BDIM]);
-    IXRT_PLUGIN_ASSERT(outDesc.dims.d[SDIM] == mS);
-    IXRT_PLUGIN_ASSERT(outDesc.dims.d[HDIM] == mHiddenSize);
-    IXRT_PLUGIN_ASSERT(outDesc.dims.d[3] == 1);
-    IXRT_PLUGIN_ASSERT(outDesc.dims.d[4] == 1);
-#ifdef __ILUVATAR__
-    CUINFER_CHECK(cuinferCreate(&cuinfer_handle));
-#else
-    CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle));
-#endif
-}
-
-size_t QKVToContextPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs,
-                                                   PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept {
-    const int32_t B = inputs->dims.d[BDIM];
-    const int32_t S = inputs->dims.d[SDIM];
-    const int32_t E = inputs->dims.d[2];
-    int32_t fmha_S = S;
-    int64_t buffer_size = B * fmha_S * E;
-#ifndef __ILUVATAR__
-    buffer_size += B * S * S * mNumHeads;
-#endif
-    return 4 * buffer_size * sizeof(mType);
-}
-
-inline void print_element(half* x, int num, string name) {
-    printf("%s: \n", name.c_str());
-    half* out = (half*)malloc(num * sizeof(half));
-    cudaMemcpy(out, x, num * sizeof(half), cudaMemcpyDeviceToHost);
-    for (auto i = 0; i < num; i++) {
-        printf("%f\n", __half2float(out[i]));
-    }
-    printf("\n");
-}
-
-int32_t QKVToContextPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc,
-                                           void const* const* inputs, void* const* outputs, void* workspace,
-                                           cudaStream_t stream) noexcept {
-    gLogInfo << "in QKVToContextPluginDynamic.." << endl;
-    int32_t S = inputDesc->dims.d[SDIM];
-    int32_t B = inputDesc->dims.d[BDIM];
-    int32_t status = STATUS_SUCCESS;
-#ifdef __ILUVATAR__
-    CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream));
-#endif
-
-    try {
-        if (mType != DataType::kHALF) {
-            gLogError << "embLayerNormPlugin infer type{" << int(mType) << "} not supported!" << endl;
-            return STATUS_NOT_SUPPORTED;
-        }
-        half* qkv_buffer_ = (half*)inputs[0];
-        half* qkv_out_ = (half*)outputs[0];
-        // [B, fmha_S]
-        int32_t* mask_ = mHasImask ? (int32_t*)inputs[1] : nullptr;
-        int fmha_seq_len = S;
-
-        int64_t buffer_size = B * fmha_seq_len * mHiddenSize;
-        half* q_buffer_ = reinterpret_cast<half*>(workspace);
-        half* k_buffer_ = q_buffer_ + buffer_size;
-        half* v_buffer_ = k_buffer_ + buffer_size;
-
-
-        // [B, S, 3*E, 1, 1] [B, fmha_S]
-#ifdef __ILUVATAR__
-        auto status =
-            fused_multihead_attetion(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qkv_out_, B, mHeadSize,
-                                     mNumHeads, mHiddenSize, S, fmha_seq_len, cuinfer_handle, stream);
-#else
-        half* qk_out_ = v_buffer_ + buffer_size;
-        auto status =
-            fused_multihead_attetion(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qk_out_, qkv_out_, B, mHeadSize,
-                                     mNumHeads, mHiddenSize, S, fmha_seq_len, blaslt_handle, stream);
-#endif
-        if (status != cudaSuccess) {
-            return STATUS_FAILURE;
-        }
-        return STATUS_SUCCESS;
-
-    } catch (std::exception const& e) {
-        caughtError(e);
-        return STATUS_FAILURE;
-    }
-}
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cu b/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cu
deleted file mode 100644
index fb9455c6..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cu
+++ /dev/null
@@ -1,317 +0,0 @@
-#include "qkvToContextPlugin.h"
-#include "backend/bert/bert_helper.h"
-#ifdef __ILUVATAR__
-#include "backend/ixinfer/ixinfer_gemm_helper.h"
-#else
-#include "backend/cublas/cublas_helper.h"
-#endif
-
-using namespace nvinfer1::ixrt_plugin::backend;
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-void __global__ IxinferArrangeEncQkvKernel(half *ori_qkv, half *new_q, half *new_k, half *new_v,
-                                           int head_dim, int head_num, int batch_seq_len, int fmha_seq_len) {
-    int hidden_size = head_dim * head_num;
-    int batch_id = blockIdx.x;
-    int token_id = blockIdx.y;
-
-    int i = threadIdx.x;  // each thread handles 2 elements
-    int head_id = (i * 2) / head_dim;
-    int dim_id = (i * 2) % head_dim;
-
-    half2 *p_ori_qkv = (half2 *)(ori_qkv + batch_id * batch_seq_len * hidden_size * 3 + token_id * hidden_size * 3);
-    half2 *p_new_qkv;
-
-    int target_id = batch_id * head_num * fmha_seq_len * head_dim + head_id * fmha_seq_len * head_dim +
-                    token_id * head_dim + dim_id;
-    /* q */
-    p_new_qkv = (half2 *)(new_q + target_id);
-    p_new_qkv[0] = p_ori_qkv[i];
-    /* k */
-    p_ori_qkv += hidden_size / 2;
-    p_new_qkv = (half2 *)(new_k + target_id);
-    p_new_qkv[0] = p_ori_qkv[i];
-    /* v */
-    p_ori_qkv += hidden_size / 2;
-    p_new_qkv = (half2 *)(new_v + target_id);
-    p_new_qkv[0] = p_ori_qkv[i];
-}
-
-void IxinferArrangeEncQkv(half *ori_qkv, half *new_q, half *new_k, half *new_v, int bsz,
-                          int head_num, int head_dim, int ori_seq_len, int fmha_seq_len, cudaStream_t stream) {
-    int hsz = head_num * head_dim;
-    if (hsz / 2 > 4096) {
-        throw std::runtime_error("hidden_size / 2 > 4096");
-    }
-    if (hsz % 2 != 0) {
-        throw std::runtime_error("hsz % 2 != 0");
-    }
-    if (head_dim % 2 != 0) {
-        throw std::runtime_error("head_dim %2 != 0");
-    }
-    dim3 blockSize(bsz, ori_seq_len);
-    IxinferArrangeEncQkvKernel<<<blockSize, hsz / 2, 0, stream>>>(ori_qkv, new_q, new_k, new_v, head_dim,
-                                                                  head_num, ori_seq_len, fmha_seq_len);
-}
-
-__global__ void IxinferEncAttnOutArrangeKernel(const half *ori_q, half *new_q, const int bsz, const int ori_seq_len,
-                                               const int fmha_seq_len, const int head_num, const int head_dim) {
-    half2 *p_ori_q = (half2 *)ori_q;
-    half2 *p_new_q = (half2 *)new_q;
-
-    int batch_token_num = ori_seq_len * head_dim * head_num;
-    int hidden_size = head_dim * head_num;
-    int date_length = bsz * ori_seq_len * head_num * head_dim;
-
-    int elem_idx = threadIdx.x + blockIdx.x * blockDim.x;
-    while (elem_idx < date_length / 2) {
-        int half_elem_idx = elem_idx * 2;
-
-        int bsz_idx = half_elem_idx / batch_token_num;
-        int seq_idx = half_elem_idx % batch_token_num / hidden_size;
-        int head_idx = half_elem_idx % batch_token_num % hidden_size / head_dim;
-        int dim_idx = half_elem_idx % batch_token_num % hidden_size % head_dim;
-
-        int src_index = bsz_idx * head_num * fmha_seq_len * head_dim + head_idx * fmha_seq_len * head_dim +
-                        seq_idx * head_dim + dim_idx;
-
-        p_new_q[elem_idx] = p_ori_q[src_index / 2];
-
-        elem_idx += gridDim.x * blockDim.x;
-    }
-}
-
-void IxinferEncAttnOutArrange(half *ori_q, half *new_q, int bsz, int ori_seq_len, int fmha_seq_len, int head_num,
-                              int head_dim, cudaStream_t stream) {
-    if (bsz * ori_seq_len * head_num * head_dim % 2 != 0) {
-        throw std::runtime_error("bsz * ori_seq_len * head_num * head_dim % 2 != 0");
-    }
-    int data_length = bsz * ori_seq_len * head_num * head_dim / 2;
-    int num_threads = 512;
-    int num_blocks = ((data_length - 1 + num_threads) / num_threads);
-    num_blocks = std::min(num_blocks, 128);
-    IxinferEncAttnOutArrangeKernel<<<num_blocks, num_threads, 0, stream>>>(ori_q, new_q, bsz, ori_seq_len, fmha_seq_len,
-                                                                           head_num, head_dim);
-}
-
-
-template <int log2_elements>
-__global__ void IxinferCorrelationSoftmaxEncselfKernel(__half *correlation, const int *src_padding_mask,
-                                                       const int batch_seq_len) {
-    constexpr int next_power_of_two = 1 << log2_elements;
-    constexpr int SOFT_WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
-    constexpr int WARP_ITERATIONS = next_power_of_two / SOFT_WARP_SIZE;
-
-    int head_num = blockDim.y;
-    int seq_len = gridDim.y;
-    int start_idx = (blockIdx.x * head_num * seq_len * batch_seq_len + threadIdx.y * seq_len * batch_seq_len +
-                     blockIdx.y * batch_seq_len);
-
-    half2 *p_correlation = (half2 *)(correlation + start_idx);
-    int32_t *p_mask = (int32_t *)(src_padding_mask + blockIdx.x * batch_seq_len);
-
-    int local_idx = threadIdx.x;
-
-    float2 elements[WARP_ITERATIONS];
-#pragma unroll
-    for (int it = 0; it < WARP_ITERATIONS; ++it) {
-        int element_index = local_idx + it * SOFT_WARP_SIZE;
-        if (element_index < batch_seq_len / 2) {
-            half2 correlation_value = p_correlation[element_index];
-
-            elements[it].x = p_mask[element_index * 2] ? -INFINITY : __half2float(correlation_value.x);
-            elements[it].y = p_mask[element_index * 2 + 1] ? -INFINITY
-                                                           : __half2float(correlation_value.y);
-
-        } else {
-            elements[it].x = -INFINITY;
-            elements[it].y = -INFINITY;
-        }
-    }
-
-    float max_value = elements[0].x;
-    max_value = (max_value > elements[0].y) ? max_value : elements[0].y;
-
-#pragma unroll
-    for (int it = 1; it < WARP_ITERATIONS; ++it) {
-        max_value = (max_value > elements[it].x) ? max_value : elements[it].x;
-        max_value = (max_value > elements[it].y) ? max_value : elements[it].y;
-    }
-
-    warp_reduce(&max_value);
-
-    float sum = 0.0f;
-#pragma unroll
-    for (int it = 0; it < WARP_ITERATIONS; ++it) {
-        elements[it].x = __expf(elements[it].x - max_value);
-        elements[it].y = __expf(elements[it].y - max_value);
-
-        sum += (elements[it].x + elements[it].y);
-    }
-
-    warp_reduce(&sum);
-    sum = 1.0f / sum;
-
-#pragma unroll
-    for (int it = 0; it < WARP_ITERATIONS; ++it) {
-        int element_index = local_idx + it * SOFT_WARP_SIZE;
-        half2 correlation_value;
-        if (element_index < batch_seq_len / 2) {
-            correlation_value.x = __float2half(elements[it].x * sum);
-            correlation_value.y = __float2half(elements[it].y * sum);
-
-            p_correlation[element_index] = correlation_value;
-
-        } else {
-            break;
-        }
-    }
-}
-
-void IxinferCorrelationSoftmaxEncself(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
-                                      __half *correlation, const int *src_padding_mask) {
-    if (batch_seq_len > 4096) {
-        throw std::runtime_error("batch_seq_len should <= 4096");
-    }
-    if (batch_seq_len % 2 != 0) {
-        throw std::runtime_error("batch_seq_len % 2 != 0");
-    }
-
-    int log2_elements = log2_ceil(batch_seq_len / 2);
-    int next_power_of_two = 1 << log2_elements;
-    int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
-
-    dim3 grid(batch_size, batch_seq_len);
-
-    dim3 block(WARP_SIZE, head_num);
-
-    switch (log2_elements) {
-        case 0:
-            IxinferCorrelationSoftmaxEncselfKernel<0>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-
-        case 1:
-            IxinferCorrelationSoftmaxEncselfKernel<1>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-
-        case 2:
-            IxinferCorrelationSoftmaxEncselfKernel<2>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-
-        case 3:
-            IxinferCorrelationSoftmaxEncselfKernel<3>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-
-        case 4:
-            IxinferCorrelationSoftmaxEncselfKernel<4>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-
-        case 5:
-            IxinferCorrelationSoftmaxEncselfKernel<5>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-
-        case 6:
-            IxinferCorrelationSoftmaxEncselfKernel<6>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-        case 7:
-            IxinferCorrelationSoftmaxEncselfKernel<7>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-        case 8:
-            IxinferCorrelationSoftmaxEncselfKernel<8>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-        case 9:
-            IxinferCorrelationSoftmaxEncselfKernel<9>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-        case 10:
-            IxinferCorrelationSoftmaxEncselfKernel<10>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-        case 11:
-            IxinferCorrelationSoftmaxEncselfKernel<11>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-        case 12:
-            IxinferCorrelationSoftmaxEncselfKernel<12>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-        default:
-            throw std::runtime_error("IxinferCorrelationSoftmaxEncself NotImplementedError");
-            break;
-    }
-}
-
-#ifdef __ILUVATAR__
-cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask,
-                                     half* q_buffer, half* k_buffer, half* v_buffer, half* qkv_out,
-                                     int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len,
-                                     cuinferHandle_t &cuinfer_handle, cudaStream_t &stream) {
-    /* qkv arrange*/
-    // bsz,ori_seq_len,3*hsz -> 3*(bsz,head_num,fmha_seq_len,head_dim)
-    IxinferArrangeEncQkv(qkv_buffer, q_buffer, k_buffer, v_buffer, bsz, head_num, head_dim, ori_seq_len,
-                         fmha_seq_len, stream);
-
-    cuinferTensorDescriptor_t qDesc, kDesc, vDesc, maskDesc, oDesc;
-    cuinferDataType_t _cuinferCompType = cuinferDataType_t::CUINFER_DATA_FLOAT;
-    cuinferDataType_t _cuinferDataType = cuinferDataType_t::CUINFER_DATA_HALF;
-    cuinferDataType_t _cuinferMaskType = cuinferDataType_t::CUINFER_DATA_INT32;
-    cuinferCreateTensorDescriptor(&qDesc);
-    cuinferCreateTensorDescriptor(&kDesc);
-    cuinferCreateTensorDescriptor(&vDesc);
-    cuinferCreateTensorDescriptor(&maskDesc);
-    cuinferCreateTensorDescriptor(&oDesc);
-
-    cuinferSetTensor4dDescriptor(qDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num,
-                                 fmha_seq_len, head_dim);
-    cuinferSetTensor4dDescriptor(kDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num,
-                                 fmha_seq_len, head_dim);
-    cuinferSetTensor4dDescriptor(vDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num,
-                                 fmha_seq_len, head_dim);
-    cuinferSetTensor4dDescriptor(maskDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferMaskType, bsz, 1, 1,
-                                 fmha_seq_len);
-    cuinferSetTensor4dDescriptor(oDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num,
-                                 fmha_seq_len, head_dim);
-
-    cuinferFMHAParam fmha_param;
-    cuinferFMHAForward(cuinfer_handle, fmha_param, _cuinferCompType, _cuinferDataType, _cuinferMaskType, qDesc,
-                       q_buffer, kDesc, k_buffer, vDesc, v_buffer, maskDesc, mask, oDesc, q_buffer, true);
-
-    IxinferEncAttnOutArrange(q_buffer, qkv_out, bsz, ori_seq_len, fmha_seq_len, head_num, head_dim, stream);
-    return cudaSuccess;
-}
-#else
-cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask,
-                                     half* q_buffer, half* k_buffer, half* v_buffer, half* qk_out, half* qkv_out,
-                                     int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len,
-                                     cublasLtHandle_t &blaslt_handle, cudaStream_t &stream) {
-    /* qkv arrange*/
-    // bsz,ori_seq_len,3*hsz -> 3*(bsz,head_num,fmha_seq_len,head_dim)
-    IxinferArrangeEncQkv(qkv_buffer, q_buffer, k_buffer, v_buffer, bsz, head_num, head_dim, ori_seq_len,
-                         fmha_seq_len, stream);
-
-    cublaslt_gemm(k_buffer, q_buffer, qk_out, bsz * head_num, fmha_seq_len, fmha_seq_len, head_dim,
-                  fmha_seq_len * head_dim, fmha_seq_len * head_dim, fmha_seq_len * fmha_seq_len,
-                  1.0/sqrt(head_dim*1.0), blaslt_handle, stream);
-
-    IxinferCorrelationSoftmaxEncself(bsz, fmha_seq_len, head_num, stream, qk_out, mask);
-
-    cublaslt_gemm_nn(v_buffer, qk_out, q_buffer, bsz * head_num, head_dim, fmha_seq_len, fmha_seq_len,
-                     fmha_seq_len * head_dim, fmha_seq_len * fmha_seq_len, fmha_seq_len * head_dim, 1.0f,
-                     blaslt_handle, stream);
-
-    IxinferEncAttnOutArrange(q_buffer, qkv_out, bsz, ori_seq_len, fmha_seq_len, head_num, head_dim, stream);
-    return cudaSuccess;
-}
-#endif
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.h b/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.h
deleted file mode 100644
index aaee52b7..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#pragma once
-#ifdef __ILUVATAR__
-#include <cuinfer.h>
-#endif
-#include <cuda_fp16.h>
-
-#include "NvInferRuntime.h"
-#include "bertCommon.h"
-
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-#ifdef __ILUVATAR__
-cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask,
-                                     half* q_buffer, half* k_buffer, half* v_buffer, half* qkv_out,
-                                     int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len,
-                                     cuinferHandle_t &cuinfer_handle, cudaStream_t &stream);
-#else
-cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask,
-                                     half* q_buffer, half* k_buffer, half* v_buffer, half* qk_out, half* qkv_out,
-                                     int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len,
-                                     cublasLtHandle_t &blaslt_handle, cudaStream_t &stream);
-#endif
-
-void IxinferArrangeEncQkv(half *ori_qkv, half *new_q, half *new_k, half *new_v, int bsz,
-                          int head_num, int head_dim, int ori_seq_len, int fmha_seq_len, cudaStream_t stream);
-
-void IxinferEncAttnOutArrange(half *ori_q, half *new_q, int bsz, int ori_seq_len, int fmha_seq_len, int head_num,
-                              int head_dim, cudaStream_t stream);
-
-void IxinferCorrelationSoftmaxEncself(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
-                                      half *correlation, const int *src_padding_mask);
-
-class QKVToContextPluginDynamic : public nvinfer1::IPluginV2DynamicExt
-{
-public:
-    QKVToContextPluginDynamic(const std::string name, const nvinfer1::DataType type, const int32_t hiddenSize,
-                              const int32_t numHeads, float const dqProbs, bool hasImask = false);
-
-    QKVToContextPluginDynamic(const std::string name, void const* data, size_t length);
-
-    // It doesn't make sense to make QKVToContextPluginDynamic without arguments, so we
-    // delete default constructor.
-    QKVToContextPluginDynamic() = delete;
-
-    // IPluginV2 Methods
-    char const* getPluginType() const noexcept override;
-    char const* getPluginVersion() const noexcept override;
-    int32_t getNbOutputs() const noexcept override;
-    int32_t initialize() noexcept override;
-    void terminate() noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext Methods
-    nvinfer1::DataType getOutputDataType(
-        int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt Methods
-    nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
-    nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs,
-                                            nvinfer1::IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(
-        int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override;
-    void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs,
-                         nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs,
-                            nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override;
-    int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc,
-                    void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;
-
-private:
-    const std::string mLayerName;
-    std::string mNamespace;
-
-    int32_t mS;
-    int32_t mB;
-    int32_t mSM;
-    int32_t mHeadSize;
-    int32_t mHiddenSize;
-    int32_t mNumHeads;
-    bool mHasImask;
-    nvinfer1::DataType mType;
-    float mDqProbs;
-#ifdef __ILUVATAR__
-    cuinferHandle_t cuinfer_handle;
-#else
-    cublasLtHandle_t blaslt_handle;
-#endif
-    cudaStream_t stream;
-
-    half* query_;
-};
-
-class QKVToContextPluginDynamicCreator : public nvinfer1::IPluginCreator
-{
-public:
-    QKVToContextPluginDynamicCreator();
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override;
-
-    nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;
-
-    nvinfer1::IPluginV2* deserializePlugin(
-        char const* name, void const* serialData, size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-
-    char const* getPluginNamespace() const noexcept override;
-
-private:
-    static nvinfer1::PluginFieldCollection mFC;
-    static vector<nvinfer1::PluginField> mPluginAttributes;
-    std::string mNamespace;
-};
-
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cpp b/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cpp
deleted file mode 100644
index 6e4e5a37..00000000
--- a/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cpp
+++ /dev/null
@@ -1,404 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*/
-#include "skipLayerNormInt8Plugin.h"
-
-#include "NvInferRuntime.h"
-#include "checkMacrosPlugin.h"
-#include "driver_types.h"
-#include "plugin.h"
-#include "serialize.h"
-
-using namespace nvinfer1;
-using namespace nvinfer1::ixrt_plugin;
-using namespace nvinfer1::ixrt_plugin::bert;
-
-// Clip plugin specific constants
-namespace {
-char const* kSKIP_LAYER_NORM_INT8_VERSION_HFACE{"3"};
-char const* kSKIP_LAYER_NORM_INT8_VERSION_MTRON{"4"};
-char const* kSKIP_LAYER_NORM_INT8_NAME{"CustomSkipLayerNormPluginDynamic_IxRT"};
-}  // namespace
-
-// Static class fields initialization
-PluginFieldCollection SkipLayerNormInt8PluginBaseCreator::mFC{};
-std::vector<PluginField> SkipLayerNormInt8PluginBaseCreator::mPluginAttributes;
-
-constexpr auto param_type = DataType::kFLOAT;
-
-SkipLayerNormInt8PluginBaseCreator::SkipLayerNormInt8PluginBaseCreator() {
-    mPluginAttributes.clear();
-    mPluginAttributes.emplace_back(PluginField("beta"));
-    mPluginAttributes.emplace_back(PluginField("gamma"));
-    mPluginAttributes.emplace_back(PluginField("bias"));
-    mPluginAttributes.emplace_back(PluginField("output_fp32"));
-    mFC.nbFields = mPluginAttributes.size();
-    mFC.fields = mPluginAttributes.data();
-}
-
-SkipLayerNormInt8PluginHFaceCreator::SkipLayerNormInt8PluginHFaceCreator() : SkipLayerNormInt8PluginBaseCreator() {}
-
-char const* SkipLayerNormInt8PluginBaseCreator::getPluginName() const noexcept { return kSKIP_LAYER_NORM_INT8_NAME; }
-
-PluginFieldCollection const* SkipLayerNormInt8PluginBaseCreator::getFieldNames() noexcept { return &mFC; }
-
-void SkipLayerNormInt8PluginBaseCreator::setPluginNamespace(char const* libNamespace) noexcept {
-    mNamespace = libNamespace;
-}
-
-char const* SkipLayerNormInt8PluginBaseCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-char const* SkipLayerNormInt8PluginHFaceCreator::getPluginVersion() const noexcept {
-    return kSKIP_LAYER_NORM_INT8_VERSION_HFACE;
-}
-
-bool buildBetaAndGamma(PluginFieldCollection const* fc, Weights& beta, Weights& gamma, Weights& bias) {
-    ixrt_plugin::validateRequiredAttributesExist({"beta", "gamma"}, fc);
-
-    bool output_fp32 = false;
-
-    for (int32_t i = 0; i < fc->nbFields; i++) {
-        std::string field_name(fc->fields[i].name);
-
-        if (field_name.compare("beta") == 0) {
-            gLogInfo << "Building beta..." << endl;
-            beta.values = fc->fields[i].data;
-            beta.count = fc->fields[i].length;
-            beta.type = fieldTypeToDataType(fc->fields[i].type);
-        }
-
-        if (field_name.compare("gamma") == 0) {
-            gLogInfo << "Building gamma..." << endl;
-            gamma.values = fc->fields[i].data;
-            gamma.count = fc->fields[i].length;
-            gamma.type = fieldTypeToDataType(fc->fields[i].type);
-        }
-
-        if (field_name.compare("bias") == 0) {
-            gLogInfo << "Building bias..." << endl;
-            bias.values = fc->fields[i].data;
-            bias.count = fc->fields[i].length;
-            bias.type = fieldTypeToDataType(fc->fields[i].type);
-        }
-
-        if (field_name.compare("output_fp32") == 0) {
-            IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32);
-            output_fp32 = (static_cast<int32_t const*>(fc->fields[i].data)[0] == 1);
-            gLogInfo << "Building output_fp32" << output_fp32 << endl;
-        }
-    }
-
-    IXRT_PLUGIN_CHECK_VALUE(beta.values != nullptr, "SkipLayerNorm: invalid beta");
-    IXRT_PLUGIN_CHECK_VALUE(beta.count > 0, "SkipLayerNorm: invalid beta");
-
-    IXRT_PLUGIN_CHECK_VALUE(gamma.values != nullptr, "SkipLayerNorm: invalid gamma");
-    IXRT_PLUGIN_CHECK_VALUE(gamma.count > 0, "SkipLayerNorm: invalid gamma");
-    return output_fp32;
-}
-
-IPluginV2* SkipLayerNormInt8PluginHFaceCreator::createPlugin(char const* name,
-                                                             PluginFieldCollection const* fc) noexcept {
-    try {
-        gLogInfo << "SkipLayerNormInt8PluginHFaceCreator createPlugin" << endl;
-
-        Weights beta{DataType::kFLOAT, nullptr, 0};
-        Weights gamma{DataType::kFLOAT, nullptr, 0};
-        Weights bias{DataType::kFLOAT, nullptr, 0};
-        bool output_fp32 = buildBetaAndGamma(fc, beta, gamma, bias);
-        return new SkipLayerNormInt8PluginHFace(name, beta, gamma, bias, output_fp32);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-IPluginV2* SkipLayerNormInt8PluginHFaceCreator::deserializePlugin(char const* name, void const* serialData,
-                                                                  size_t serialLength) noexcept {
-    // This object will be deleted when the network is destroyed, which will
-    // call SkipLayerNormInterleavedPlugin::destroy()
-    try {
-        gLogInfo << "SkipLayerNormInterleavedPluginHFaceCreator deserializePlugin" << endl;
-        return new SkipLayerNormInt8PluginHFace(name, serialData, serialLength);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-// REGISTER_TENSORRT_PLUGIN(SkipLayerNormInt8PluginHFaceCreator);
-//#########################################################################//
-SkipLayerNormInt8PluginBase::SkipLayerNormInt8PluginBase(std::string const& name, Weights const& beta,
-                                                         Weights const& gamma, Weights const& bias, bool output_fp32)
-    : mLayerName(name),
-      mGammaDev(nullptr),
-      mBetaDev(nullptr),
-      mBiasDev(nullptr),
-      mLd(beta.count),
-      mParamsOnDevice(false),
-      output_fp32(output_fp32) {
-    IXRT_PLUGIN_ASSERT(mLd > 0);
-    IXRT_PLUGIN_ASSERT(beta.count == gamma.count);
-    // dataType for beta, gamma weights is always fp16
-    mParamWordsize = getElementSize(param_type);
-
-    mBeta.convertAndCopy(beta, param_type);
-    mGamma.convertAndCopy(gamma, param_type);
-
-    mHasBias = (bias.values != nullptr);
-    if (mHasBias) {
-        mBias.convertAndCopy(bias, param_type);
-    }
-
-    copyToDevice(mGamma, getWeightsSize(mGamma, param_type), mGammaDev);
-    copyToDevice(mBeta, getWeightsSize(mBeta, param_type), mBetaDev);
-    if (mHasBias) {
-        copyToDevice(mBias, getWeightsSize(mBias, param_type), mBiasDev);
-    }
-}
-
-SkipLayerNormInt8PluginBase::SkipLayerNormInt8PluginBase(std::string const& name, void const* data, size_t length)
-    : mLayerName(name), mGammaDev(nullptr), mBetaDev(nullptr), mParamsOnDevice(false) {
-    // Deserialize in the same order as serialization
-    deserialize_value(&data, &length, &mLd);
-    deserialize_value(&data, &length, &mHasBias);
-    deserialize_value(&data, &length, &output_fp32);
-
-    mParamWordsize = getElementSize(param_type);
-
-    char const* d = static_cast<char const*>(data);
-    mBeta.convertAndCopy(d, mLd, param_type);
-    mGamma.convertAndCopy(d, mLd, param_type);
-
-    if (mHasBias) {
-        mBias.convertAndCopy(d, mLd, param_type);
-
} - - copyToDevice(mGamma, getWeightsSize(mGamma, param_type), mGammaDev); - copyToDevice(mBeta, getWeightsSize(mBeta, param_type), mBetaDev); - if (mHasBias) { - copyToDevice(mBias, getWeightsSize(mBias, param_type), mBiasDev); - } -} - -SkipLayerNormInt8PluginHFace::SkipLayerNormInt8PluginHFace(std::string const& name, Weights const& beta, - Weights const& gamma, Weights const& bias, bool output_fp32) - : SkipLayerNormInt8PluginBase(name, beta, gamma, bias, output_fp32) {} - -SkipLayerNormInt8PluginHFace::SkipLayerNormInt8PluginHFace(std::string const& name, void const* data, size_t length) - : SkipLayerNormInt8PluginBase(name, data, length) { - gLogInfo << "SkipLayerNormInt8PluginHFace deserialize" << endl; -} - -// IPluginV2 Methods -char const* SkipLayerNormInt8PluginBase::getPluginType() const noexcept { return kSKIP_LAYER_NORM_INT8_NAME; } - -size_t SkipLayerNormInt8PluginBase::getSerializationSize() const noexcept { - const size_t biasSize = mHasBias ? (mLd * mParamWordsize) : 0; - return 2 * mParamWordsize * mLd + sizeof(mLd) + sizeof(mHasBias) + sizeof(output_fp32) + biasSize; -} - -void SkipLayerNormInt8PluginBase::serialize(void* buffer) const noexcept { - try { - serialize_value(&buffer, mLd); - serialize_value(&buffer, mHasBias); - serialize_value(&buffer, output_fp32); - - char* d = static_cast(buffer); - serFromDev(d, static_cast(mBetaDev.get()), mLd * mParamWordsize); - serFromDev(d, static_cast(mGammaDev.get()), mLd * mParamWordsize); - if (mHasBias) { - serFromDev(d, static_cast(mBiasDev.get()), mLd * mParamWordsize); - } - } catch (std::exception const& e) { - caughtError(e); - } -} - -void SkipLayerNormInt8PluginBase::destroy() noexcept { - try { - // This gets called when the network containing plugin is destroyed - mGammaDev.reset(nullptr); - mBetaDev.reset(nullptr); - if (mHasBias) { - mBiasDev.reset(nullptr); - } - delete this; - } catch (std::exception const& e) { - caughtError(e); - } -} - -void SkipLayerNormInt8PluginBase::setPluginNamespace(char const* libNamespace) noexcept { mNamespace = libNamespace; } - -char const* SkipLayerNormInt8PluginBase::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// HFace -int32_t SkipLayerNormInt8PluginHFace::initialize() noexcept { - gLogInfo << "SkipLayerNormInterleavedPluginHFace initialize" << endl; - return 0; -} - -void SkipLayerNormInt8PluginHFace::terminate() noexcept { - gLogInfo << "SkipLayerNormInterleavedPluginHFace terminate" << endl; -} - -void SkipLayerNormInt8PluginHFace::destroy() noexcept { - gLogInfo << "SkipLayerNormInterleavedPluginHFace destroy" << endl; - SkipLayerNormInt8PluginBase::destroy(); -} - -char const* SkipLayerNormInt8PluginHFace::getPluginVersion() const noexcept { - return kSKIP_LAYER_NORM_INT8_VERSION_HFACE; -} - -int32_t SkipLayerNormInt8PluginHFace::getNbOutputs() const noexcept { return 2; } - -// IPluginV2Ext Methods -DataType SkipLayerNormInt8PluginBase::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - try { - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - IXRT_PLUGIN_ASSERT(index >= 0 && index < getNbOutputs()); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - if (index == 0) { - return output_fp32 ? 
DataType::kFLOAT : DataType::kINT8; - } - return DataType::kFLOAT; - } catch (std::exception const& e) { - caughtError(e); - } - return DataType{}; -} - -// IPluginV2DynamicExt Methods -DimsExprs SkipLayerNormInt8PluginBase::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, - int32_t nbInputs, IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - IXRT_PLUGIN_ASSERT(outputIndex >= 0 && outputIndex < getNbOutputs()); - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims); - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims); - return inputs[0]; - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool SkipLayerNormInt8PluginBase::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, - int32_t nbInputs, int32_t nbOutputs) noexcept { - try { - IXRT_PLUGIN_ASSERT(inOut != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - IXRT_PLUGIN_ASSERT(nbOutputs == getNbOutputs()); - IXRT_PLUGIN_ASSERT(pos >= 0 && pos < (nbInputs + nbOutputs)); - - PluginTensorDesc const& desc = inOut[pos]; - if (pos == 2 || pos == 4 || (output_fp32 && pos == 3)) { - return desc.type == DataType::kFLOAT && desc.format == TensorFormat::kLINEAR; - } - return desc.type == DataType::kINT8 && desc.format == TensorFormat::kLINEAR; - } catch (std::exception const& e) { - caughtError(e); - } - return false; -} - -void SkipLayerNormInt8PluginBase::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - try { - // Validate input arguments - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - IXRT_PLUGIN_ASSERT(nbOutputs == getNbOutputs()); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - - auto const& inDims0 = inputs[0].desc.dims; - auto const& inDims1 = inputs[1].desc.dims; - auto const& inDims2 = inputs[2].desc.dims; - TRT_UNUSED inDims1; - TRT_UNUSED inDims2; - - IXRT_PLUGIN_ASSERT(inDims0.nbDims == inDims1.nbDims); - IXRT_PLUGIN_ASSERT(std::equal(inDims0.d, inDims0.d + inDims0.nbDims, inDims1.d)); - IXRT_PLUGIN_ASSERT(inDims0.nbDims == inDims2.nbDims); - IXRT_PLUGIN_ASSERT(std::equal(inDims0.d, inDims0.d + inDims0.nbDims, inDims2.d)); - - mParamWordsize = getElementSize(param_type); - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t SkipLayerNormInt8PluginBase::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, - int32_t nbOutputs) const noexcept { - return 0; -} - -// HFace IPluginV2DynamicExt Methods -IPluginV2DynamicExt* SkipLayerNormInt8PluginHFace::clone() const noexcept { - try { - gLogInfo << "SkipLayerNormInterleavedPluginHFace clone" << endl; - auto* p = new SkipLayerNormInt8PluginHFace(mLayerName, mBeta, mGamma, mBias, output_fp32); - p->initialize(); - p->setPluginNamespace(mNamespace.c_str()); - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -int32_t SkipLayerNormInt8PluginHFace::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept { - try { - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - auto const iDesc = inputDesc[0]; - auto const oDesc = outputDesc[0]; - - const int32_t B = iDesc.dims.d[0]; - const int32_t S = iDesc.dims.d[1]; - const int32_t E = iDesc.dims.d[2]; - int 
batch_token_num = B * S; - float const dqScaleIn = iDesc.scale; - IXRT_PLUGIN_ASSERT(dqScaleIn > 1e-9); - float const qScale = oDesc.scale; - int8_t const* input = static_cast(inputs[0]); - int8_t const* skip = static_cast(inputs[1]); - float* residual = (float*)inputs[2]; - float const* gamma = static_cast(mGammaDev.get()); - float const* beta = static_cast(mBetaDev.get()); - float const* bias = static_cast(mBiasDev.get()); - float* residual_out = static_cast(outputs[1]); - - if (!output_fp32) { - int8_t* output = static_cast(outputs[0]); - skipLayerNormI8II8O(input, gamma, beta, bias, output, residual, residual_out, batch_token_num, E, - dqScaleIn, 1.0 / qScale, 1024, stream, true); - } else { - float* output = static_cast(outputs[0]); - skipLayerNormI8IF32O(input, gamma, beta, bias, output, residual, residual_out, batch_token_num, E, - 1.0 / dqScaleIn, 1.0 / qScale, 1024, stream, true); - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cu b/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cu deleted file mode 100644 index 7cd3e564..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cu +++ /dev/null @@ -1,361 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
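The INT8 kernels in the next file implement the operation enqueue() above dispatches: dequantize the INT8 input, add the fp32 residual (plus bias), layer-normalize, then either requantize to INT8 or emit fp32, with the pre-norm sum also written to residual_out. A plain CPU reference for one token row, useful for checking the kernels; the scale convention (real = int8 * scale on input, multiply-by-reciprocal on output) is an assumption inferred from the scales enqueue() passes, not confirmed by this patch:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // y = quant(layernorm(dequant(x) + residual + bias)), one token row.
    std::vector<int8_t> skipLayerNormI8Ref(const std::vector<int8_t>& x,
                                           const std::vector<float>& residual,
                                           const std::vector<float>& bias,
                                           const std::vector<float>& gamma,
                                           const std::vector<float>& beta,
                                           float dequantScale, float quantScale,
                                           float eps = 1e-5f) {
        const size_t h = x.size();
        std::vector<float> v(h);
        float mean = 0.f;
        for (size_t i = 0; i < h; ++i) {              // dequant + residual + bias
            v[i] = x[i] * dequantScale + residual[i] + bias[i];
            mean += v[i];
        }
        mean /= h;
        float var = 0.f;
        for (size_t i = 0; i < h; ++i) var += (v[i] - mean) * (v[i] - mean);
        var /= h;
        const float rstd = 1.f / std::sqrt(var + eps);
        std::vector<int8_t> y(h);
        for (size_t i = 0; i < h; ++i) {              // normalize, affine, requant
            const float n = (v[i] - mean) * rstd * gamma[i] + beta[i];
            const int q = static_cast<int>(std::nearbyint(n * quantScale));
            y[i] = static_cast<int8_t>(std::clamp(q, -128, 127));
        }
        return y;
    }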
-*/
-#include "backend/bert/bert_helper.h"
-#include "skipLayerNormInt8Plugin.h"
-using namespace nvinfer1::ixrt_plugin::backend;
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-template <int THREAD_DATA_LEN>
-__global__ void skipLayernormI8II8OKernel(const int8_t *input, const float *scale, const float *bias,
-                                          const float *residual_bias, int8_t *output, float *residual, float* residual_out,
-                                          int hidden_size, float dequant_scale, float quant_scale,
-                                          bool is_post_ln) {
-    // register
-    // process 2 data
-    float4 vals[THREAD_DATA_LEN];
-    int block_start = blockIdx.x * hidden_size / 4;
-    char4 *p_input = (char4 *)input;
-    char4 *p_output = (char4 *)output;
-    float4 *p_residual = (float4 *)residual;
-    float4 *p_residual_out = (float4 *)residual_out;
-    float4 *p_scale = (float4 *)scale;
-    float4 *p_bias = (float4 *)bias;
-    float4 *p_residual_bias = (float4 *)residual_bias;
-    // one line start
-    p_input += block_start;
-    p_output += block_start;
-    p_residual += block_start;
-    p_residual_out += block_start;
-
-    float thread_m2 = 0;
-    float thread_mean = 0;
-    float thread_count = 0;
-
-    // load data from global memory
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        // vals = dequant(input) + residual + bias
-        p_residual_out[element_index].x = p_residual[element_index].x + p_residual_bias[element_index].x;
-        p_residual_out[element_index].y = p_residual[element_index].y + p_residual_bias[element_index].y;
-        p_residual_out[element_index].z = p_residual[element_index].z + p_residual_bias[element_index].z;
-        p_residual_out[element_index].w = p_residual[element_index].w + p_residual_bias[element_index].w;
-        vals[it] = char4addfloat4_dequant(p_input[element_index], p_residual_out[element_index], dequant_scale);
-        WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].z, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].w, &thread_mean, &thread_m2, &thread_count);
-    }
-
-    // mean var
-    float mean = 0;
-    float m2 = 0;
-    float count = 0;
-    WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count);
-    mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE);
-    m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE);
-    count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE);
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        float4 norm_value = compute_float4_norm_value(vals[it], mean, m2, hidden_size, epsilon,
-                                                      p_scale[element_index], p_bias[element_index]);
-
-        p_residual_out[element_index].x = norm_value.x;
-        p_residual_out[element_index].y = norm_value.y;
-        p_residual_out[element_index].z = norm_value.z;
-        p_residual_out[element_index].w = norm_value.w;
-
-        char4 res = float42char4(norm_value, quant_scale);
-        p_output[element_index] = res;
-    }
-}
-
-template <int THREAD_DATA_LEN>
-__global__ void skipLayernormI8IF32OKernel(const int8_t *input, const float *scale, const float *bias,
-                                           const float *residual_bias, float *output, float *residual, float* residual_out,
-                                           int hidden_size, float dequant_scale, float quant_scale,
-                                           bool is_post_ln) {
-    // register
-    // process 2 data
-    float4 vals[THREAD_DATA_LEN];
-    int block_start = blockIdx.x * hidden_size / 4;
-    char4 *p_input = (char4 *)input;
-    float4 *p_output = (float4 *)output;
-    float4 *p_residual = (float4 *)residual;
-    float4 *p_residual_out = (float4 *)residual_out;
-    float4 *p_scale = (float4
*)scale; - float4 *p_bias = (float4 *)bias; - float4 *p_residual_bias = (float4 *)residual_bias; - // one line start - p_input += block_start; - p_output += block_start; - p_residual += block_start; - p_residual_out += block_start; - - float thread_m2 = 0; - float thread_mean = 0; - float thread_count = 0; - - // load data from global memory -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - // vals = dequant(input) + residual + bias - p_residual_out[element_index].x = p_residual[element_index].x + p_residual_bias[element_index].x; - p_residual_out[element_index].y = p_residual[element_index].y + p_residual_bias[element_index].y; - p_residual_out[element_index].z = p_residual[element_index].z + p_residual_bias[element_index].z; - p_residual_out[element_index].w = p_residual[element_index].w + p_residual_bias[element_index].w; - vals[it] = char4addfloat4_dequant(p_input[element_index], p_residual_out[element_index], dequant_scale); - WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].z, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].w, &thread_mean, &thread_m2, &thread_count); - } - - // mean var - float mean = 0; - float m2 = 0; - float count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count); - mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE); - m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE); - count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE); - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - float4 norm_value = compute_float4_norm_value(vals[it], mean, m2, hidden_size, epsilon, - p_scale[element_index], p_bias[element_index]); - - p_output[element_index].x = norm_value.x; - p_output[element_index].y = norm_value.y; - p_output[element_index].z = norm_value.z; - p_output[element_index].w = norm_value.w; - } -} - - -void skipLayerNormI8II8O(const int8_t *input, const float *scale, const float *bias, const float *residual_bias, - int8_t *output, float *residual, float* residual_out, int batch_tokens, int hidden_size, float dequant_scale, - float quant_scale, int max_thread_per_block, cudaStream_t stream, - bool is_post_ln) { - - if (hidden_size > 1024) { - throw std::runtime_error("hidden_size should <= 1024"); - } - if (hidden_size % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int num_warp = hidden_size / C10_WARP_SIZE / 4; - - switch (num_warp) { - case 1: - skipLayernormI8II8OKernel<1> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 2: - skipLayernormI8II8OKernel<2> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 3: - skipLayernormI8II8OKernel<3> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 4: - skipLayernormI8II8OKernel<4> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 5: - skipLayernormI8II8OKernel<5> - <<>>(input, scale, bias, residual_bias, output, 
residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 6: - skipLayernormI8II8OKernel<6> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 7: - skipLayernormI8II8OKernel<7> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 8: - skipLayernormI8II8OKernel<8> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 9: - skipLayernormI8II8OKernel<9> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 10: - skipLayernormI8II8OKernel<10> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 11: - skipLayernormI8II8OKernel<11> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 12: - skipLayernormI8II8OKernel<12> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 13: - skipLayernormI8II8OKernel<13> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 14: - skipLayernormI8II8OKernel<14> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 15: - skipLayernormI8II8OKernel<15> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 16: - skipLayernormI8II8OKernel<16> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - default: - throw std::runtime_error("skipLayernormI8II8OKernel"); - break; - } -} - -void skipLayerNormI8IF32O(const int8_t *input, const float *scale, const float *bias, const float *residual_bias, - float *output, float *residual, float* residual_out, int batch_tokens, int hidden_size, float dequant_scale, - float quant_scale, int max_thread_per_block, cudaStream_t stream, - bool is_post_ln) { - if (hidden_size > 1024) { - throw std::runtime_error("hidden_size should <= 1024"); - } - if (hidden_size % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int num_warp = hidden_size / C10_WARP_SIZE / 4; - - switch (num_warp) { - case 1: - skipLayernormI8IF32OKernel<1> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 2: - skipLayernormI8IF32OKernel<2> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 3: - skipLayernormI8IF32OKernel<3> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 4: - skipLayernormI8IF32OKernel<4> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - 
case 5: - skipLayernormI8IF32OKernel<5> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 6: - skipLayernormI8IF32OKernel<6> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 7: - skipLayernormI8IF32OKernel<7> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 8: - skipLayernormI8IF32OKernel<8> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 9: - skipLayernormI8IF32OKernel<9> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 10: - skipLayernormI8IF32OKernel<10> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 11: - skipLayernormI8IF32OKernel<11> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 12: - skipLayernormI8IF32OKernel<12> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 13: - skipLayernormI8IF32OKernel<13> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 14: - skipLayernormI8IF32OKernel<14> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 15: - skipLayernormI8IF32OKernel<15> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 16: - skipLayernormI8IF32OKernel<16> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - default: - throw std::runtime_error("skipLayernormI8II8OKernel"); - break; - } -} - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.h b/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.h deleted file mode 100644 index f752f59f..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.h +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
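WelfordCombine and WelfordWarpReduce come from backend/bert/bert_helper.h, which this patch does not include. The kernels above and below presumably rely on the standard one-pass Welford recurrence for mean and M2, with per-lane partials merged by the Chan et al. combination; a scalar sketch of both steps:

    // One-pass update: push a sample into a (mean, m2, count) aggregate.
    struct Welford { float mean = 0.f, m2 = 0.f, count = 0.f; };

    inline void welford_push(Welford& w, float x) {
        w.count += 1.f;
        const float d1 = x - w.mean;
        w.mean += d1 / w.count;
        w.m2 += d1 * (x - w.mean);   // uses the *updated* mean
    }

    // Chan et al. merge of two partial aggregates, as a warp reduction would.
    inline Welford welford_merge(const Welford& a, const Welford& b) {
        Welford r;
        r.count = a.count + b.count;
        if (r.count == 0.f) return r;
        const float delta = b.mean - a.mean;
        r.mean = a.mean + delta * (b.count / r.count);
        r.m2 = a.m2 + b.m2 + delta * delta * (a.count * b.count / r.count);
        return r;
    }
    // variance = m2 / count; the kernels then apply rsqrtf(m2 / hidden_size + epsilon).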
-*/
-#pragma once
-
-#include
-#include
-#include "NvInferRuntime.h"
-#include "bertCommon.h"
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-
-void skipLayerNormI8II8O(const int8_t *input, const float *scale, const float *bias, const float *residual_bias,
-                         int8_t *output, float *residual, float* residual_out, int batch_tokens, int hidden_size, float dequant_scale,
-                         float quant_scale, int max_thread_per_block, cudaStream_t stream,
-                         bool is_post_ln);
-
-void skipLayerNormI8IF32O(const int8_t *input, const float *scale, const float *bias, const float *residual_bias,
-                          float *output, float *residual, float* residual_out, int batch_tokens, int hidden_size, float dequant_scale,
-                          float quant_scale, int max_thread_per_block, cudaStream_t stream,
-                          bool is_post_ln);
-
-class SkipLayerNormInt8PluginBase : public nvinfer1::IPluginV2DynamicExt
-{
-public:
-    SkipLayerNormInt8PluginBase(
-        std::string const& name, nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& bias, bool output_fp32);
-
-    SkipLayerNormInt8PluginBase(std::string const& name, void const* data, size_t length);
-
-    // It doesn't make sense to make SkipLayerNormInterleavedPlugin without
-    // arguments, so we delete default constructor.
-    SkipLayerNormInt8PluginBase() = delete;
-
-    // IPluginV2 Methods
-    char const* getPluginType() const noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext Methods
-    nvinfer1::DataType getOutputDataType(
-        int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt Methods
-    nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs,
-                                            nvinfer1::IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(
-        int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override;
-    void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs,
-                         nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs,
-                            nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override;
-
-protected:
-    std::string const mLayerName;
-    std::string mNamespace;
-
-    bert::cuda_unique_ptr<void> mGammaDev;
-    bert::cuda_unique_ptr<void> mBetaDev;
-    size_t mLd{}; // leading dim
-    bert::WeightsWithOwnership mGamma;
-    bert::WeightsWithOwnership mBeta;
-
-    size_t mParamWordsize{};
-    bool mParamsOnDevice{};
-    bool mHasBias{};
-    cuda_unique_ptr<void> mBiasDev;
-    WeightsWithOwnership mBias;
-    bool output_fp32{};
-};
-
-class SkipLayerNormInt8PluginHFace : public SkipLayerNormInt8PluginBase
-{
-public:
-    SkipLayerNormInt8PluginHFace(
-        std::string const& name, nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& bias, bool output_fp32);
-
-    SkipLayerNormInt8PluginHFace(std::string const& name, void const* data, size_t length);
-
-    // It doesn't make sense to make SkipLayerNormInterleavedPlugin without
-    // arguments, so we delete default constructor.
- SkipLayerNormInt8PluginHFace() = delete; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; - - // IPluginV2 Methods - int32_t initialize() noexcept override; - void terminate() noexcept override; - void destroy() noexcept override; - int32_t getNbOutputs() const noexcept override; - char const* getPluginVersion() const noexcept override; -}; - -class SkipLayerNormInt8PluginBaseCreator : public nvinfer1::IPluginCreator -{ -public: - SkipLayerNormInt8PluginBaseCreator(); - - char const* getPluginName() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - -private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -class SkipLayerNormInt8PluginHFaceCreator : public SkipLayerNormInt8PluginBaseCreator -{ -public: - SkipLayerNormInt8PluginHFaceCreator(); - - char const* getPluginVersion() const noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - nvinfer1::IPluginV2* deserializePlugin( - char const* name, void const* serialData, size_t serialLength) noexcept override; -}; - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cpp b/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cpp deleted file mode 100644 index 4ca63061..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cpp +++ /dev/null @@ -1,430 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -#include "skipLayerNormPlugin.h" - -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* kSKIP_LAYER_NORM_VERSION{"1"}; -char const* kSKIP_LAYER_NORM_NAME{"CustomSkipLayerNormPluginDynamic_IxRT"}; -char const* kSKIP_LAYER_NORM_VAR_SEQLEN_VERSION{"2"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection SkipLayerNormPluginDynamicCreator::mFC{}; -std::vector SkipLayerNormPluginDynamicCreator::mPluginAttributes; - -// REGISTER_TENSORRT_PLUGIN(SkipLayerNormPluginDynamicCreator); - -static inline DataType getParamWordType(DataType cfgType) noexcept { - if (cfgType == DataType::kINT8) { - return DataType::kHALF; - } - - return cfgType; -} - -SkipLayerNormPluginDynamicCreator::SkipLayerNormPluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("ld")); - mPluginAttributes.emplace_back(PluginField("type_id")); - mPluginAttributes.emplace_back(PluginField("beta")); - mPluginAttributes.emplace_back(PluginField("gamma")); - mPluginAttributes.emplace_back(PluginField("bias")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* SkipLayerNormPluginDynamicCreator::getPluginName() const noexcept { return kSKIP_LAYER_NORM_NAME; } - -char const* SkipLayerNormPluginDynamicCreator::getPluginVersion() const noexcept { return kSKIP_LAYER_NORM_VERSION; } - -PluginFieldCollection const* SkipLayerNormPluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2* SkipLayerNormPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept { - try { - gLogInfo << "SkipLayerNormPluginDynamicCreator createPlugin" << endl; - - int32_t ld = 0; - Weights beta{DataType::kFLOAT, nullptr, 0}; - Weights gamma{DataType::kFLOAT, nullptr, 0}; - Weights bias{DataType::kFLOAT, nullptr, 0}; - int32_t typeId = -1; - - IXRT_PLUGIN_ASSERT(fc != nullptr); - - ixrt_plugin::validateRequiredAttributesExist({"type_id", "beta", "ld", "gamma"}, fc); - - for (int32_t i = 0; i < fc->nbFields; i++) { - std::string field_name(fc->fields[i].name); - if (field_name.compare("ld") == 0) { - ld = *static_cast(fc->fields[i].data); - gLogInfo << "Building ld: " << ld << endl; - } - - if (field_name.compare("type_id") == 0) { - typeId = *static_cast(fc->fields[i].data); - gLogInfo << "Building typeId: " << typeId << endl; - } - - if (field_name.compare("beta") == 0) { - gLogInfo << "Building beta..." << endl; - beta.values = fc->fields[i].data; - beta.count = fc->fields[i].length; - beta.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("gamma") == 0) { - gLogInfo << "Building gamma..." << endl; - gamma.values = fc->fields[i].data; - gamma.count = fc->fields[i].length; - gamma.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bias") == 0) { - gLogInfo << "Building bias..." 
<< endl; - bias.values = fc->fields[i].data; - bias.count = fc->fields[i].length; - bias.type = fieldTypeToDataType(fc->fields[i].type); - } - } - gLogInfo << "Type " << typeId << endl; - - IXRT_PLUGIN_CHECK_VALUE(typeId >= 0 && typeId <= 3, - ("SkipLayerNorm: Invalid type ID: " + std::to_string(typeId)).c_str()); - - IXRT_PLUGIN_CHECK_VALUE(beta.values != nullptr, "SkipLayerNorm: invalid beta"); - IXRT_PLUGIN_CHECK_VALUE(beta.count > 0, "SkipLayerNorm: invalid beta"); - - IXRT_PLUGIN_CHECK_VALUE(gamma.values != nullptr, "SkipLayerNorm: invalid gamma"); - IXRT_PLUGIN_CHECK_VALUE(gamma.count > 0, "SkipLayerNorm: invalid gamma"); - - IXRT_PLUGIN_CHECK_VALUE(typeId == (int)DataType::kHALF, "typeId != DataType::kHALF error"); - - return new SkipLayerNormPluginDynamic(name, static_cast(typeId), ld, beta, gamma, bias); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -nvinfer1::IPluginV2* SkipLayerNormPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - try { - return new SkipLayerNormPluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void SkipLayerNormPluginDynamicCreator::setPluginNamespace(char const* pluginNamespace) noexcept { - try { - mNamespace = pluginNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* SkipLayerNormPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -//#########################################################################// -SkipLayerNormPluginDynamic::SkipLayerNormPluginDynamic(const std::string name, const DataType type, int32_t const ld, - Weights const& beta, Weights const& gamma, Weights const& bias) - : mLayerName(name), mGammaDev(nullptr), mBetaDev(nullptr), mHiddenSize(ld), mType(type), mBiasDev(nullptr) { - IXRT_PLUGIN_ASSERT(mType == nvinfer1::DataType::kFLOAT || mType == nvinfer1::DataType::kHALF || - mType == nvinfer1::DataType::kINT8); - - mCfgType = mType == DataType::kINT8 ? 
DataType::kHALF : mType; - mParamWordsize = getElementSize(mCfgType); - - mBeta.convertAndCopy(beta, mCfgType); - mGamma.convertAndCopy(gamma, mCfgType); - - mHasBias = (bias.values != nullptr); - if (mHasBias) { - mBias.convertAndCopy(bias, mCfgType); - } - - copyToDevice(mGamma, getWeightsSize(mGamma, mCfgType), mGammaDev); - copyToDevice(mBeta, getWeightsSize(mBeta, mCfgType), mBetaDev); - if (mHasBias) { - copyToDevice(mBias, getWeightsSize(mBias, mCfgType), mBiasDev); - } -} - -SkipLayerNormPluginDynamic::SkipLayerNormPluginDynamic(const std::string& name, void const* data, size_t length) - : mLayerName(name), mGammaDev(nullptr), mBetaDev(nullptr), mBiasDev(nullptr) { - gLogInfo << "SkipLayerNormPluginDynamic deserialize" << endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mType); - deserialize_value(&data, &length, &mCfgType); - deserialize_value(&data, &length, &mHiddenSize); - deserialize_value(&data, &length, &mHasBias); - - IXRT_PLUGIN_ASSERT(mCfgType == nvinfer1::DataType::kFLOAT || mCfgType == nvinfer1::DataType::kHALF); - mParamWordsize = getElementSize(mCfgType); - - char const* d = static_cast(data); - mBeta.convertAndCopy(d, mHiddenSize, mCfgType); - mGamma.convertAndCopy(d, mHiddenSize, mCfgType); - if (mHasBias) { - mBias.convertAndCopy(d, mHiddenSize, mCfgType); - } - - copyToDevice(mGamma, getWeightsSize(mGamma, mCfgType), mGammaDev); - copyToDevice(mBeta, getWeightsSize(mBeta, mCfgType), mBetaDev); - if (mHasBias) { - copyToDevice(mBias, getWeightsSize(mBias, mCfgType), mBiasDev); - } -} - -// IPluginV2Ext Methods -DataType SkipLayerNormPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - try { - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - IXRT_PLUGIN_ASSERT(index == 0); - IXRT_PLUGIN_ASSERT(nbInputs == 2); - return inputTypes[0]; - } catch (std::exception const& e) { - caughtError(e); - } - return DataType{}; -} - -// IPluginV2 Methods -char const* SkipLayerNormPluginDynamic::getPluginType() const noexcept { return kSKIP_LAYER_NORM_NAME; } - -char const* SkipLayerNormPluginDynamic::getPluginVersion() const noexcept { return kSKIP_LAYER_NORM_VERSION; } - -int32_t SkipLayerNormPluginDynamic::getNbOutputs() const noexcept { return 1; } -int32_t SkipLayerNormPluginDynamic::initialize() noexcept { - gLogInfo << "SkipLayerNormPluginDynamic initialize" << endl; - return 0; -} - -void SkipLayerNormPluginDynamic::terminate() noexcept { gLogInfo << "SkipLayerNormPluginDynamic terminate" << endl; } - -size_t SkipLayerNormPluginDynamic::getSerializationSize() const noexcept { - const size_t biasSize = mHasBias ? 
(mHiddenSize * mParamWordsize) : 0; - return 2 * mParamWordsize * mHiddenSize + 2 * sizeof(DataType) + sizeof(mHiddenSize) + biasSize + sizeof(mHasBias); -} - -void SkipLayerNormPluginDynamic::serialize(void* buffer) const noexcept { - try { - serialize_value(&buffer, mType); - serialize_value(&buffer, mCfgType); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mHasBias); - - char* d = static_cast(buffer); - serFromDev(d, static_cast(mBetaDev.get()), mHiddenSize * mParamWordsize); - serFromDev(d, static_cast(mGammaDev.get()), mHiddenSize * mParamWordsize); - if (mHasBias) { - serFromDev(d, static_cast(mBiasDev.get()), mHiddenSize * mParamWordsize); - } - } catch (std::exception const& e) { - caughtError(e); - } -} - -void SkipLayerNormPluginDynamic::destroy() noexcept { - try { - gLogInfo << "SkipLayerNormPluginDynamic destroy" << endl; - // This gets called when the network containing plugin is destroyed - mGammaDev.reset(nullptr); - mBetaDev.reset(nullptr); - if (mHasBias) { - mBiasDev.reset(nullptr); - } - delete this; - } catch (std::exception const& e) { - caughtError(e); - } -} - -void SkipLayerNormPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* SkipLayerNormPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* SkipLayerNormPluginDynamic::clone() const noexcept { - try { - gLogInfo << "SkipLayerNormPluginDynamic clone" << endl; - - auto* p = new SkipLayerNormPluginDynamic(mLayerName, mType, mHiddenSize, mBeta, mGamma, mBias); - p->initialize(); - p->setPluginNamespace(mNamespace.c_str()); - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, - int32_t nbInputs, IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 2); - IXRT_PLUGIN_ASSERT(outputIndex == 0); - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims); - return inputs[0]; - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool SkipLayerNormPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept { - try { - IXRT_PLUGIN_ASSERT(inOut != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 2); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(pos >= 0 && pos < (nbInputs + nbOutputs)); - - PluginTensorDesc const& in = inOut[pos]; - if (pos == 0) { - return (in.type == mType) && (in.format == TensorFormat::kLINEAR); - } - PluginTensorDesc const& prev = inOut[pos - 1]; - - return in.type == prev.type && in.format == prev.format && (in.type == DataType::kHALF); - } catch (std::exception const& e) { - caughtError(e); - } - return false; -} - -void SkipLayerNormPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - try { - gLogInfo << "SkipLayerNormPluginDynamic configurePlugin" << endl; - - // Validate input arguments - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(nbInputs == 2); - if (mType == DataType::kFLOAT || mType == DataType::kHALF) { - IXRT_PLUGIN_ASSERT(mType == 
inputs[0].desc.type); - IXRT_PLUGIN_ASSERT(mType == inputs[1].desc.type); - } else { - IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type || DataType::kFLOAT == inputs[0].desc.type); - IXRT_PLUGIN_ASSERT(mType == inputs[1].desc.type || DataType::kFLOAT == inputs[1].desc.type); - } - auto const& inDims0 = inputs[0].desc.dims; - auto const& inDims1 = inputs[1].desc.dims; - IXRT_PLUGIN_ASSERT(inDims0.nbDims == inDims1.nbDims); - - IXRT_PLUGIN_ASSERT(std::equal(inDims0.d, inDims0.d + inDims0.nbDims, inDims1.d)); - - IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5); - mHiddenSize = inDims0.d[HDIM]; // hiddensize - IXRT_PLUGIN_ASSERT(mHiddenSize != 0U); - IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1); - IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1); - IXRT_PLUGIN_ASSERT(outputs[0].desc.type == DataType::kHALF); - - mCfgType = inputs[0].desc.type == DataType::kINT8 ? DataType::kHALF : inputs[0].desc.type; - - auto const paramType = getParamWordType(mCfgType); - mParamWordsize = getElementSize(paramType); - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t SkipLayerNormPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { - return 0; -} - -int32_t SkipLayerNormPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept { - gLogInfo << "in SkipLayerNormPluginDynamic.." << endl; - int32_t status = -1; - try { - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - int32_t const inputVolume = volume(inputDesc[0].dims); - DataType iType = inputDesc->type; - - // Our plugin outputs only one tensor - // Launch CUDA kernel wrapper and save its return value - if (iType == DataType::kFLOAT) { - gLogInfo << "SkipLayerNormPlugin fp32 not supported yet!" << endl; - return STATUS_NOT_SUPPORTED; - } else if (iType == DataType::kHALF) { - auto const* input = static_cast(inputs[0]); - auto skip = (half*)(inputs[1]); - auto* output = static_cast(outputs[0]); - auto const* const bias = static_cast(mBiasDev.get()); - auto const* const beta = static_cast(mBetaDev.get()); - auto const* const gamma = static_cast(mGammaDev.get()); - - if (mHasBias) { - status = computeSkipLayerNorm(stream, static_cast(mHiddenSize), inputVolume, input, - gamma, beta, bias, skip, output); - } else { - status = computeSkipLayerNorm(stream, static_cast(mHiddenSize), inputVolume, - input, gamma, beta, bias, skip, output); - } - } else { - IXRT_PLUGIN_CHECK_VALUE(false, "Unsupported type error, expected [kHALF,kFLOAT], but received " + - std::to_string(static_cast(iType))); - } - if (status != cudaSuccess) { - return STATUS_FAILURE; - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cu b/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cu deleted file mode 100644 index 1b127fc5..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cu +++ /dev/null @@ -1,401 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. 
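The fp16 launchers in the file that follows use the same dispatch idiom as the INT8 ones above: one warp-sized block per token row, each thread making THREAD_DATA_LEN vectorized loads, and num_warp = hidden_size / C10_WARP_SIZE / width selecting a template instantiation out of a switch. A host-side sketch of that arithmetic; the warp size and the helper name are assumptions for illustration, not code from this tree (C10_WARP_SIZE may differ on Iluvatar parts):

    #include <stdexcept>

    constexpr int kWarpSize = 32;  // assumed; the real code uses C10_WARP_SIZE

    // Returns the THREAD_DATA_LEN template argument the switch dispatches to.
    int pickThreadDataLen(int hidden_size, int elems_per_load /* 2 for half2, 4 for char4 */) {
        if (hidden_size % (kWarpSize * elems_per_load) != 0)
            throw std::runtime_error("hidden_size must be a multiple of warp * vector width");
        const int n = hidden_size / kWarpSize / elems_per_load;
        if (n < 1 || n > 16)
            throw std::runtime_error("no kernel instantiation for this THREAD_DATA_LEN");
        return n;  // e.g. hidden_size 768 with half2 loads: 768 / 32 / 2 = 12
    }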
You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -#include -#include -#include - -#include "backend/bert/bert_helper.h" -#include "skipLayerNormPlugin.h" -// #include "backend/transformer/transformer_add_norm.h" - -using namespace nvinfer1::ixrt_plugin::backend; - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -template -__global__ void IxinferResidualBiasLnPad(const half *input, const half *scale, const half *bias, - const half *residual_bias, half *output, half *residual, int hidden_size, - bool is_post_ln) { - float2 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_size / 2; - half2 *p_input = (half2 *)input; - half2 *p_output = (half2 *)output; - half2 *p_residual = (half2 *)residual; - half2 *p_scale = (half2 *)scale; - half2 *p_bias = (half2 *)bias; - half2 *p_residual_bias = (half2 *)residual_bias; - // one line start - p_input += block_start; - p_output += block_start; - p_residual += block_start; - - float thread_m2 = 0; - float thread_mean = 0; - float thread_count = 0; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - if (element_index < hidden_size / 2) { - half2 value1 = p_input[element_index]; - half2 value2 = p_residual[element_index]; - - vals[it].x = __half2float(value1.x) + __half2float(value2.x); - vals[it].y = __half2float(value1.y) + __half2float(value2.y); - - half2 res_bias_val_1; - if (residual_bias == nullptr) { - res_bias_val_1.x = __float2half(0.0f); - res_bias_val_1.y = __float2half(0.0f); - } else { - res_bias_val_1 = p_residual_bias[element_index]; - } - vals[it].x = vals[it].x + __half2float(res_bias_val_1.x); - vals[it].y = vals[it].y + __half2float(res_bias_val_1.y); - - WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count); - } - } - - float mean = 0; - float m2 = 0; - float count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count); - mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE); - m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE); - count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE); - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - if (element_index < hidden_size / 2) { - float2 norm_value; - half2 scale_1 = p_scale[element_index]; - half2 bias_1 = p_bias[element_index]; - norm_value.x = (vals[it].x - 
mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.x) + - __half2float(bias_1.x); - norm_value.y = (vals[it].y - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.y) + - __half2float(bias_1.y); - - half2 res; - res.x = __float2half(norm_value.x); - res.y = __float2half(norm_value.y); - - p_output[element_index] = res; - - half2 r1; - if (is_post_ln) { - r1 = res; - } else { - r1.x = __float2half(vals[it].x); - r1.y = __float2half(vals[it].y); - } - p_residual[element_index] = r1; - } - } -} - -void IxinferResidualBiasLnPad(const half *input, const half *scale, const half *bias, const half *residual_bias, - half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream, - bool is_post_ln) { - if (hidden_size > 2048) { - throw std::runtime_error("hidden_size should <= 1024"); - } - if (hidden_size % 2 != 0) { - throw std::runtime_error("hidden_size % 2 != 0"); - } - - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int neareast_hidden_size = hidden_size; - if (neareast_hidden_size % (C10_WARP_SIZE * 2) != 0) { - neareast_hidden_size = neareast_hidden_size + C10_WARP_SIZE * 2 - neareast_hidden_size % (C10_WARP_SIZE * 2); - } - - int num_warp = neareast_hidden_size / C10_WARP_SIZE / 2; - - switch (num_warp) { - case 1: - IxinferResidualBiasLnPad<1><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 2: - IxinferResidualBiasLnPad<2><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 3: - IxinferResidualBiasLnPad<3><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 4: - IxinferResidualBiasLnPad<4><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 5: - IxinferResidualBiasLnPad<5><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 6: - IxinferResidualBiasLnPad<6><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 7: - IxinferResidualBiasLnPad<7><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 8: - IxinferResidualBiasLnPad<8><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 9: - IxinferResidualBiasLnPad<9><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 10: - IxinferResidualBiasLnPad<10><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 11: - IxinferResidualBiasLnPad<11><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 12: - IxinferResidualBiasLnPad<12><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 13: - IxinferResidualBiasLnPad<13><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 14: - IxinferResidualBiasLnPad<14><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 15: - IxinferResidualBiasLnPad<15><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 16: - IxinferResidualBiasLnPad<16><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - default: - std::cout << "hidden size: " << hidden_size << std::endl; - throw 
std::runtime_error("IxinferResidualBiasLnPad not supported!"); - break; - } -} - -template <int THREAD_DATA_LEN> -__global__ void IxinferResidualBiasLn(const half *input, const half *scale, const half *bias, const half *residual_bias, - half *output, half *residual, int hidden_size, bool is_post_ln) { - float2 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_size / 2; - half2 *p_input = (half2 *)input; - half2 *p_output = (half2 *)output; - half2 *p_residual = (half2 *)residual; - half2 *p_scale = (half2 *)scale; - half2 *p_bias = (half2 *)bias; - half2 *p_residual_bias = (half2 *)residual_bias; - - p_input += block_start; - p_output += block_start; - p_residual += block_start; - - float thread_m2 = 0; - float thread_mean = 0; - float thread_count = 0; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - half2 value1 = p_input[element_index]; - half2 value2 = p_residual[element_index]; - - vals[it].x = __half2float(value1.x) + __half2float(value2.x); - vals[it].y = __half2float(value1.y) + __half2float(value2.y); - - half2 res_bias_val_1; - if (residual_bias == nullptr) { - res_bias_val_1.x = __float2half(0.0f); - res_bias_val_1.y = __float2half(0.0f); - } else { - res_bias_val_1 = p_residual_bias[element_index]; - } - vals[it].x = vals[it].x + __half2float(res_bias_val_1.x); - vals[it].y = vals[it].y + __half2float(res_bias_val_1.y); - - WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count); - } - - float mean = 0; - float m2 = 0; - float count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count); - mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE); - m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE); - count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE); - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - float2 norm_value; - half2 scale_1 = p_scale[element_index]; - half2 bias_1 = p_bias[element_index]; - norm_value.x = - (vals[it].x - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.x) + __half2float(bias_1.x); - norm_value.y = - (vals[it].y - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.y) + __half2float(bias_1.y); - - half2 res; - res.x = __float2half(norm_value.x); - res.y = __float2half(norm_value.y); - - p_output[element_index] = res; - - half2 r1; - if (is_post_ln) { - r1 = res; - } else { - r1.x = __float2half(vals[it].x); - r1.y = __float2half(vals[it].y); - } - p_residual[element_index] = r1; - } -} - -void IxinferResidualBiasLn(const half *input, const half *scale, const half *bias, const half *residual_bias, - half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream, - bool is_post_ln) { - if (hidden_size > 1024) { - throw std::runtime_error("hidden_size should be <= 1024"); - } - if ((hidden_size % 2 == 0) && (hidden_size % (C10_WARP_SIZE * 2) != 0)) { - IxinferResidualBiasLnPad(input, scale, bias, residual_bias, output, residual, batch_tokens, hidden_size, stream, - is_post_ln); - } else { - if (hidden_size % (C10_WARP_SIZE * 2) != 0) { - throw std::runtime_error("hidden_size % (C10_WARP_SIZE * 2) != 0"); - } - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int num_warp = hidden_size / C10_WARP_SIZE / 2; - - switch (num_warp) { - case 1: - IxinferResidualBiasLn<1><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size,
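- // is_post_ln == true writes the normalized output back into the residual stream - // (post-LN); otherwise the raw residual sum is carried forward (pre-LN).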
is_post_ln); - break; - case 2: - IxinferResidualBiasLn<2><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 3: - IxinferResidualBiasLn<3><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 4: - IxinferResidualBiasLn<4><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 5: - IxinferResidualBiasLn<5><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 6: - IxinferResidualBiasLn<6><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 7: - IxinferResidualBiasLn<7><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 8: - IxinferResidualBiasLn<8><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 9: - IxinferResidualBiasLn<9><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 10: - IxinferResidualBiasLn<10><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 11: - IxinferResidualBiasLn<11><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 12: - IxinferResidualBiasLn<12><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 13: - IxinferResidualBiasLn<13><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 14: - IxinferResidualBiasLn<14><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 15: - IxinferResidualBiasLn<15><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 16: - IxinferResidualBiasLn<16><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - default: - throw std::runtime_error("IxinferResidualBiasLn not supported!"); - break; - } - } -} - -template <typename T> -int32_t computeSkipLayerNorm(cudaStream_t stream, int32_t E, int32_t volume, const T* input, const T* gamma, const T* beta, const T* bias, T* skip, T* output) -{ - assert(volume % E == 0); - int32_t batch_tokens = volume / E; - IxinferResidualBiasLn(input, gamma, beta, bias, output, skip, batch_tokens, E, stream, true); - return 0; -} - -template int32_t computeSkipLayerNorm(cudaStream_t, int32_t, int32_t, const half*, const half*, const half*, const half*, half*, half*); -template int32_t computeSkipLayerNorm(cudaStream_t, int32_t, int32_t, const half*, const half*, const half*, const half*, half*, half*); -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h b/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h deleted file mode 100644 index fa37318f..00000000 --- a/models/nlp/plm/bert_base_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License.
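To make the semantics of the kernels deleted above concrete, the fused residual + bias + LayerNorm they implement reduces to the NumPy sketch below. This is an illustrative helper, not part of the patch: the function name and the eps default are assumptions (the kernels read an epsilon constant defined elsewhere), and the fp32 accumulation mirrors the kernels' float math on half inputs.

import numpy as np

def residual_bias_layernorm(x, residual, gamma, beta, res_bias=None,
                            eps=1e-5, is_post_ln=True):
    # x, residual: (batch_tokens, hidden_size) half-precision arrays; accumulate in fp32
    s = x.astype(np.float32) + residual.astype(np.float32)
    if res_bias is not None:
        s = s + res_bias.astype(np.float32)
    mean = s.mean(axis=-1, keepdims=True)
    var = s.var(axis=-1, keepdims=True)  # Welford's m2 / hidden_size
    out = (s - mean) / np.sqrt(var + eps) * gamma + beta
    # post-LN carries the normalized output forward; pre-LN keeps the raw sum
    new_residual = out if is_post_ln else s
    return out.astype(x.dtype), new_residual.astype(x.dtype)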
You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -#pragma once -#include -#include - -#include "NvInferRuntime.h" -#include "bertCommon.h" - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -template -int32_t computeSkipLayerNorm(cudaStream_t stream, int32_t E, int32_t volume, const T* input, const T* gamma, const T* beta, const T* bias, T* skip, T* output); - -void IxinferResidualBiasLn(const half *input, const half *scale, const half *bias, const half *residual_bias, - half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream, - bool is_post_ln); - -void IxinferResidualBiasLnPad(const half *input, const half *scale, const half *bias, const half *residual_bias, - half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream, - bool is_post_ln); -class SkipLayerNormPluginDynamic : public IPluginV2DynamicExt { - public: - SkipLayerNormPluginDynamic(const std::string name, const nvinfer1::DataType type, int32_t const ld, - nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& bias); - SkipLayerNormPluginDynamic(const std::string &name, void const* data, size_t length); - SkipLayerNormPluginDynamic() noexcept = delete; - ~SkipLayerNormPluginDynamic() override = default; - - // IPluginV2 methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* libNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext methods - DataType getOutputDataType(int32_t index, DataType const* inputType, int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt methods - IPluginV2DynamicExt* clone() const noexcept override; - DimsExprs getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, - int32_t nbOutputs) noexcept 
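- // configurePlugin receives the min/max shape ranges when the engine is built; - // enqueue is the per-inference entry point, which presumably dispatches - // computeSkipLayerNorm on the given stream.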
override; - size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, - int32_t nbOutputs) const noexcept override; - int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; - - private: - const std::string mLayerName; - std::string mNamespace; - cuda_unique_ptr mGammaDev; - cuda_unique_ptr mBetaDev; - WeightsWithOwnership mGamma; - WeightsWithOwnership mBeta; - size_t mHiddenSize{}; - size_t mParamWordsize{}; - DataType mType; - DataType mCfgType; - // mCfgType is the dataType for beta, gamma bias weights, always fp16 or fp32 - // mType is the plugin IO datatype, can be int8 - - bool mHasBias{}; - cuda_unique_ptr mBiasDev; - WeightsWithOwnership mBias; -}; - -class SkipLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator -{ -public: - SkipLayerNormPluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin( - char const* name, void const* serialData, size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - -private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file -- Gitee From 0086a62b0479b569e8b0c36f00d42aaff05704da Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Wed, 10 Dec 2025 14:18:37 +0800 Subject: [PATCH 4/7] update bert large squad --- .../{script => scripts}/infer_bert_large_squad_fp16_accuracy.sh | 0 .../infer_bert_large_squad_fp16_performance.sh | 0 .../{script => scripts}/infer_bert_large_squad_int8_accuracy.sh | 0 .../infer_bert_large_squad_int8_performance.sh | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename models/nlp/plm/bert_large_squad/ixrt/{script => scripts}/infer_bert_large_squad_fp16_accuracy.sh (100%) rename models/nlp/plm/bert_large_squad/ixrt/{script => scripts}/infer_bert_large_squad_fp16_performance.sh (100%) rename models/nlp/plm/bert_large_squad/ixrt/{script => scripts}/infer_bert_large_squad_int8_accuracy.sh (100%) rename models/nlp/plm/bert_large_squad/ixrt/{script => scripts}/infer_bert_large_squad_int8_performance.sh (100%) diff --git a/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_fp16_accuracy.sh b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_accuracy.sh similarity index 100% rename from models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_fp16_accuracy.sh rename to models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_accuracy.sh diff --git a/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_fp16_performance.sh b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_performance.sh similarity index 100% rename from models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_fp16_performance.sh rename to models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_performance.sh diff --git 
a/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_int8_accuracy.sh b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_accuracy.sh similarity index 100% rename from models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_int8_accuracy.sh rename to models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_accuracy.sh diff --git a/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_int8_performance.sh b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_performance.sh similarity index 100% rename from models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_int8_performance.sh rename to models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_performance.sh -- Gitee From 96a961f569728acdb02144331ad3283d701927b3 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Wed, 10 Dec 2025 14:18:54 +0800 Subject: [PATCH 5/7] sync transformer --- .../plm/transformer/build_helpers/__init__.py | 3 + .../build_helpers/build_helpers.py | 234 ++++ .../transformer/build_helpers/get_vendored.py | 121 ++ .../transformer/build_helpers/test_helpers.py | 142 ++ .../nlp/plm/transformer/ixrt/build_engine.py | 77 ++ models/nlp/plm/transformer/ixrt/ci/prepare.sh | 42 + models/nlp/plm/transformer/ixrt/common.py | 92 ++ .../inference_wmt14_en_fr_fp16_accuracy.py | 488 +++++++ ...erence_wmt14_en_fr_fp16_accuracy_plugin.py | 517 ++++++++ .../inference_wmt14_en_fr_fp16_performance.py | 149 +++ ...nce_wmt14_en_fr_fp16_performance_plugin.py | 147 +++ .../nlp/plm/transformer/ixrt/requirements.txt | 6 + .../infer_transformer_fp16_accuracy.sh | 44 + .../infer_transformer_fp16_performance.sh | 45 + models/nlp/plm/transformer/omegaconf.py | 1160 +++++++++++++++++ models/nlp/plm/transformer/plugin/__init__.py | 0 .../plm/transformer/plugin/build_engine.py | 401 ++++++ .../plm/transformer/plugin/builder_utils.py | 323 +++++ .../transformer/plugin/load_ixrt_plugin.py | 28 + .../plm/transformer/plugin/plugin_utils.py | 918 +++++++++++++ .../plm/transformer/plugin/transformer_cfg.py | 15 + models/nlp/plm/transformer/plugin/trt.py | 356 +++++ models/nlp/plm/transformer/setup.py | 76 ++ 23 files changed, 5384 insertions(+) create mode 100644 models/nlp/plm/transformer/build_helpers/__init__.py create mode 100644 models/nlp/plm/transformer/build_helpers/build_helpers.py create mode 100644 models/nlp/plm/transformer/build_helpers/get_vendored.py create mode 100644 models/nlp/plm/transformer/build_helpers/test_helpers.py create mode 100644 models/nlp/plm/transformer/ixrt/build_engine.py create mode 100644 models/nlp/plm/transformer/ixrt/ci/prepare.sh create mode 100644 models/nlp/plm/transformer/ixrt/common.py create mode 100644 models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_accuracy.py create mode 100644 models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_accuracy_plugin.py create mode 100644 models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_performance.py create mode 100644 models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_performance_plugin.py create mode 100644 models/nlp/plm/transformer/ixrt/requirements.txt create mode 100644 models/nlp/plm/transformer/ixrt/scripts/infer_transformer_fp16_accuracy.sh create mode 100644 models/nlp/plm/transformer/ixrt/scripts/infer_transformer_fp16_performance.sh create mode 100644 models/nlp/plm/transformer/omegaconf.py create mode 100644 models/nlp/plm/transformer/plugin/__init__.py create mode 100644 
models/nlp/plm/transformer/plugin/build_engine.py create mode 100644 models/nlp/plm/transformer/plugin/builder_utils.py create mode 100644 models/nlp/plm/transformer/plugin/load_ixrt_plugin.py create mode 100644 models/nlp/plm/transformer/plugin/plugin_utils.py create mode 100644 models/nlp/plm/transformer/plugin/transformer_cfg.py create mode 100644 models/nlp/plm/transformer/plugin/trt.py create mode 100644 models/nlp/plm/transformer/setup.py diff --git a/models/nlp/plm/transformer/build_helpers/__init__.py b/models/nlp/plm/transformer/build_helpers/__init__.py new file mode 100644 index 00000000..aa0c875c --- /dev/null +++ b/models/nlp/plm/transformer/build_helpers/__init__.py @@ -0,0 +1,3 @@ +# Order of imports is important (see warning otherwise when running tests) +import setuptools # isort:skip # noqa +import distutils # isort:skip # noqa diff --git a/models/nlp/plm/transformer/build_helpers/build_helpers.py b/models/nlp/plm/transformer/build_helpers/build_helpers.py new file mode 100644 index 00000000..7109c6df --- /dev/null +++ b/models/nlp/plm/transformer/build_helpers/build_helpers.py @@ -0,0 +1,234 @@ +import codecs +import distutils.log +import errno +import os +from os.path import abspath, basename, dirname, exists, isdir, join +import logging +from functools import partial +import re +import shutil +import subprocess +import sys +from pathlib import Path +from typing import List, Optional + +from setuptools import Command +from setuptools.command import build_py, develop, sdist + + +class ANTLRCommand(Command): # type: ignore # pragma: no cover + """Generate parsers using ANTLR.""" + + description = "Run ANTLR" + user_options: List[str] = [] + + def run(self) -> None: + """Run command.""" + build_dir = Path(__file__).parent.absolute() + project_root = build_dir.parent + for grammar in [ + "OmegaConfGrammarLexer.g4", + "OmegaConfGrammarParser.g4", + ]: + command = [ + "java", + "-jar", + str(build_dir / "bin" / "antlr-4.9.3-complete.jar"), + "-Dlanguage=Python3", + "-o", + str(project_root / "omegaconf" / "grammar" / "gen"), + "-Xexact-output-dir", + "-visitor", + str(project_root / "omegaconf" / "grammar" / grammar), + ] + + self.announce( + f"Generating parser for Python3: {command}", + level=distutils.log.INFO, + ) + + subprocess.check_call(command) + + def initialize_options(self) -> None: + pass + + def finalize_options(self) -> None: + pass + + +class BuildPyCommand(build_py.build_py): # pragma: no cover + def run(self) -> None: + if not self.dry_run: + self.run_command("clean") + run_antlr(self) + build_py.build_py.run(self) + + +class CleanCommand(Command): # type: ignore # pragma: no cover + """ + Our custom command to clean out junk files. + """ + + description = "Cleans out generated and junk files we don't want in the repo" + dry_run: bool + user_options: List[str] = [] + + def run(self) -> None: + root = Path(__file__).parent.parent.absolute() + files = find( + root=root, + include_files=["^omegaconf/grammar/gen/.*"], + include_dirs=[ + "^omegaconf\\.egg-info$", + "\\.eggs$", + "^\\.mypy_cache$", + "^\\.pytest_cache$", + ".*/__pycache__$", + "^__pycache__$", + "^build$", + ], + scan_exclude=["^.git$", "^.nox/.*$"], + excludes=[".*\\.gitignore$", ".*/__init__.py"], + ) + + if self.dry_run: + print("Dry run! 
Would clean up the following files and dirs:") + print("\n".join(sorted(map(str, files)))) + else: + for f in files: + if f.exists(): + if f.is_dir(): + shutil.rmtree(f, ignore_errors=True) + else: + f.unlink() + + def initialize_options(self) -> None: + pass + + def finalize_options(self) -> None: + pass + +log = logging.getLogger(__name__) + +class HYDRAANTLRCommand(Command): # type: ignore + """Generate parsers using ANTLR.""" + + description = "Run ANTLR" + user_options: List[str] = [] + + def run(self) -> None: + """Run command.""" + root_dir = abspath(dirname(__file__)) + project_root = abspath(dirname(basename(__file__))) + for grammar in [ + "hydra/grammar/OverrideLexer.g4", + "hydra/grammar/OverrideParser.g4", + ]: + command = [ + "java", + "-jar", + join(root_dir, "bin/antlr-4.9.3-complete.jar"), + "-Dlanguage=Python3", + "-o", + join(project_root, "hydra/grammar/gen/"), + "-Xexact-output-dir", + "-visitor", + join(project_root, grammar), + ] + + log.info(f"Generating parser for Python3: {command}") + + subprocess.check_call(command) + + def initialize_options(self) -> None: + pass + + def finalize_options(self) -> None: + pass + + +class DevelopCommand(develop.develop): # pragma: no cover + def run(self) -> None: # type: ignore + if not self.dry_run: + run_antlr(self) + develop.develop.run(self) + + +class SDistCommand(sdist.sdist): # pragma: no cover + def run(self) -> None: + if not self.dry_run: # type: ignore + self.run_command("clean") + run_antlr(self) + sdist.sdist.run(self) + + +def find( + root: Path, + include_files: List[str], + include_dirs: List[str], + excludes: List[str], + rbase: Optional[Path] = None, + scan_exclude: Optional[List[str]] = None, +) -> List[Path]: + if rbase is None: + rbase = Path() + if scan_exclude is None: + scan_exclude = [] + files = [] + scan_root = root / rbase + for entry in scan_root.iterdir(): + path = rbase / entry.name + if matches(scan_exclude, path): + continue + + if entry.is_dir(): + if matches(include_dirs, path): + if not matches(excludes, path): + files.append(path) + else: + ret = find( + root=root, + include_files=include_files, + include_dirs=include_dirs, + excludes=excludes, + rbase=path, + scan_exclude=scan_exclude, + ) + files.extend(ret) + else: + if matches(include_files, path) and not matches(excludes, path): + files.append(path) + + return files + + +def find_version(*file_paths: str) -> str: + root = Path(__file__).parent.parent.absolute() + with codecs.open(root / Path(*file_paths), "r") as fp: # type: ignore + version_file = fp.read() + version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M) + if version_match: + return version_match.group(1) + raise RuntimeError("Unable to find version string.") # pragma: no cover + + +def matches(patterns: List[str], path: Path) -> bool: + string = str(path).replace(os.sep, "/") # for Windows + for pattern in patterns: + if re.match(pattern, string): + return True + return False + + +def run_antlr(cmd: Command) -> None: # pragma: no cover + try: + cmd.announce("Generating parsers with antlr4", level=distutils.log.INFO) + cmd.run_command("antlr") + except OSError as e: + if e.errno == errno.ENOENT: + msg = f"| Unable to generate parsers: {e} |" + msg = "=" * len(msg) + "\n" + msg + "\n" + "=" * len(msg) + cmd.announce(f"{msg}", level=distutils.log.FATAL) + sys.exit(1) + else: + raise diff --git a/models/nlp/plm/transformer/build_helpers/get_vendored.py b/models/nlp/plm/transformer/build_helpers/get_vendored.py new file mode 100644 index 
00000000..2f5d6591 --- /dev/null +++ b/models/nlp/plm/transformer/build_helpers/get_vendored.py @@ -0,0 +1,121 @@ +import re +import shutil +import subprocess +from functools import partial +from itertools import chain +from pathlib import Path +from typing import Callable, FrozenSet, Generator, List, Set, Tuple, Union + +WHITELIST = {'README.txt', '__init__.py', 'vendor.txt'} + + +def delete_all(*paths: Path, whitelist: Union[Set[str], FrozenSet[str]] = frozenset()) -> None: + """Clear all the items in each of the indicated paths, except for elements listed + in the whitelist""" + for item in paths: + if item.is_dir(): + shutil.rmtree(item, ignore_errors=True) + elif item.is_file() and item.name not in whitelist: + item.unlink() + + +def iter_subtree(path: Path, depth: int = 0) -> Generator[Tuple[Path, int], None, None]: + """Recursively yield all files in a subtree, depth-first""" + if not path.is_dir(): + if path.is_file(): + yield path, depth + return + for item in path.iterdir(): + if item.is_dir(): + yield from iter_subtree(item, depth + 1) + elif item.is_file(): + yield item, depth + 1 + + +def patch_vendor_imports(file: Path, replacements: List[Callable[[str], str]]) -> None: + """Apply a list of replacements/patches to a given file""" + text = file.read_text('utf8') + for replacement in replacements: + text = replacement(text) + file.write_text(text, 'utf8') + + +def find_vendored_libs(vendor_dir: Path, whitelist: Set[str]) -> Tuple[List[str], List[Path]]: + vendored_libs = [] + paths = [] + for item in vendor_dir.iterdir(): + if item.is_dir(): + vendored_libs.append(item.name) + elif item.is_file() and item.name not in whitelist: + vendored_libs.append(item.stem) # without extension + else: # not a dir, or a file in the whitelist + continue + paths.append(item) + return vendored_libs, paths + + +def vendor(vendor_dir: Path, relative_imports: bool = False) -> None: + # target package is <parent>.<vendor_dir>; foo/vendor -> foo.vendor + pkgname = f'{vendor_dir.parent.name}.{vendor_dir.name}' + + # remove everything + delete_all(*vendor_dir.iterdir(), whitelist=WHITELIST) + + # install with pip + subprocess.run([ + 'pip', 'install', '-t', str(vendor_dir), + '-r', str(vendor_dir / 'vendor.txt'), + '--no-compile', '--no-deps' + ]) + + # delete stuff that's not needed + delete_all( + *vendor_dir.glob('*.dist-info'), + *vendor_dir.glob('*.egg-info'), + vendor_dir / 'bin') + + vendored_libs, paths = find_vendored_libs(vendor_dir, WHITELIST) + + if not relative_imports: + replacements: List[Callable[[str], str]] = [] + for lib in vendored_libs: + replacements += ( + partial( # import bar -> import foo.vendor.bar + re.compile(r'(^\s*)import {}\n'.format(lib), flags=re.M).sub, + r'\1from {} import {}\n'.format(pkgname, lib) + ), + partial( # from bar -> from foo.vendor.bar + re.compile(r'(^\s*)from {}(\.|\s+)'.format(lib), flags=re.M).sub, + r'\1from {}.{}\2'.format(pkgname, lib) + ), + ) + + for file, depth in chain.from_iterable(map(iter_subtree, paths)): + if relative_imports: + pkgname = '.'
* (depth - 1) + replacements = [] + for lib in vendored_libs: + replacements += ( + partial( + re.compile(r'(^\s*)import {}\n'.format(lib), flags=re.M).sub, + r'\1from {} import {}\n'.format(pkgname, lib) + ), + partial( + re.compile(r'^from {}(\s+)'.format(lib), flags=re.M).sub, + r'from .{}\1'.format(pkgname) + ), + partial( + re.compile(r'(^\s*)from {}(\.+)'.format(lib), flags=re.M).sub, + r'\1from {}\2'.format(pkgname) + ), + ) + patch_vendor_imports(file, replacements) + + +if __name__ == '__main__': + # this assumes this is a script in `build_helpers` + here = Path(__file__).resolve().parent + vendor_dir = here / 'omegaconf' / 'vendor' + assert (vendor_dir / 'vendor.txt').exists(), 'omegaconf/vendor/vendor.txt file not found' + assert (vendor_dir / '__init__.py').exists(), 'omegaconf/vendor/__init__.py file not found' + vendor(vendor_dir, relative_imports=True) diff --git a/models/nlp/plm/transformer/build_helpers/test_helpers.py b/models/nlp/plm/transformer/build_helpers/test_helpers.py new file mode 100644 index 00000000..7f10a1d8 --- /dev/null +++ b/models/nlp/plm/transformer/build_helpers/test_helpers.py @@ -0,0 +1,142 @@ +from pathlib import Path +from typing import List + +import pytest + +from build_helpers.build_helpers import find, find_version, matches + + +@pytest.mark.parametrize( + "path_rel,include_files,include_dirs,excludes,scan_exclude,expected", + [ + pytest.param("test_files", [], [], [], None, [], id="none"), + pytest.param( + "test_files", + [".*"], + [], + [], + [], + [ + "a/b/bad_dir/.gitkeep", + "a/b/file2.txt", + "a/b/file1.txt", + "a/b/junk.txt", + "c/bad_dir/.gitkeep", + "c/file2.txt", + "c/file1.txt", + "c/junk.txt", + ], + id="all", + ), + pytest.param( + "test_files", + [".*"], + [], + ["^a/.*"], + [], + ["c/bad_dir/.gitkeep", "c/file2.txt", "c/file1.txt", "c/junk.txt"], + id="filter_a", + ), + pytest.param( + "test_files", + [".*"], + [], + [], + ["^a/.*"], + ["c/bad_dir/.gitkeep", "c/file2.txt", "c/file1.txt", "c/junk.txt"], + id="do_not_scan_a", + ), + pytest.param( + "test_files", + ["^a/.*"], + [], + [], + [], + ["a/b/bad_dir/.gitkeep", "a/b/file2.txt", "a/b/file1.txt", "a/b/junk.txt"], + id="include_a", + ), + pytest.param( + "test_files", + ["^a/.*"], + [], + [".*/file1\\.txt"], + [], + ["a/b/bad_dir/.gitkeep", "a/b/file2.txt", "a/b/junk.txt"], + id="include_a,exclude_file1", + ), + pytest.param( + "test_files", + [".*"], + [], + ["^.*/junk.txt$"], + [], + [ + "a/b/bad_dir/.gitkeep", + "a/b/file2.txt", + "a/b/file1.txt", + "c/bad_dir/.gitkeep", + "c/file2.txt", + "c/file1.txt", + ], + id="no_junk", + ), + pytest.param( + "test_files", + ["^.*/junk.txt"], + [], + [], + [], + ["a/b/junk.txt", "c/junk.txt"], + id="junk_only", + ), + pytest.param("test_files", [], ["^a$"], [], [], ["a"], id="exact_a"), + pytest.param( + "test_files", + [], + [".*bad_dir$"], + [], + [], + ["a/b/bad_dir", "c/bad_dir"], + id="bad_dirs", + ), + ], +) +def test_find( + path_rel: str, + include_files: List[str], + include_dirs: List[str], + excludes: List[str], + scan_exclude: List[str], + expected: List[str], +) -> None: + basedir = Path(__file__).parent.absolute() + path = basedir / path_rel + ret = find( + root=path, + excludes=excludes, + include_files=include_files, + include_dirs=include_dirs, + scan_exclude=scan_exclude, + ) + + ret_set = set([str(x) for x in ret]) + expected_set = set([str(Path(x)) for x in expected]) + assert ret_set == expected_set + + +@pytest.mark.parametrize( + "patterns,query,expected", + [ + (["^a/.*"], Path("a") / "b.txt", True),
(["^/foo/bar/.*"], Path("/foo") / "bar" / "blag", True), + ], +) +def test_matches(patterns: List[str], query: Path, expected: bool) -> None: + ret = matches(patterns, query) + assert ret == expected + + +def test_find_version() -> None: + version = find_version("omegaconf", "version.py") + # Ensure `version` is a string starting with a digit. + assert isinstance(version, str) and version and "0" <= version[0] <= "9" diff --git a/models/nlp/plm/transformer/ixrt/build_engine.py b/models/nlp/plm/transformer/ixrt/build_engine.py new file mode 100644 index 00000000..51cff74a --- /dev/null +++ b/models/nlp/plm/transformer/ixrt/build_engine.py @@ -0,0 +1,77 @@ +import os +import argparse +import tensorrt +from tensorrt import Dims + + +def build_engine_trtapi_staticshape(config): + IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) + builder = tensorrt.Builder(IXRT_LOGGER) + EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + network = builder.create_network(EXPLICIT_BATCH) + build_config = builder.create_builder_config() + parser = tensorrt.OnnxParser(network, IXRT_LOGGER) + + precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16 + if precision == tensorrt.BuilderFlag.INT8: + parser.parse_from_files(config.model, config.quant_file) + else: + parser.parse_from_file(config.model) + + build_config.set_flag(precision) + + plan = builder.build_serialized_network(network, build_config) + engine_file_path = config.engine + with open(engine_file_path, "wb") as f: + f.write(plan) + print("Build static shape engine done!") + + +def build_engine_trtapi_dynamicshape(config): + IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) + builder = tensorrt.Builder(IXRT_LOGGER) + EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + network = builder.create_network(EXPLICIT_BATCH) + build_config = builder.create_builder_config() + + profile = builder.create_optimization_profile() + profile.set_shape("src_tokens", Dims([1, 1]), Dims([56, 43]), Dims([128, 102])) + build_config.add_optimization_profile(profile) + + parser = tensorrt.OnnxParser(network, IXRT_LOGGER) + + precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16 + if precision == tensorrt.BuilderFlag.INT8: + parser.parse_from_files(config.model, config.quant_file) + else: + parser.parse_from_file(config.model) + + build_config.set_flag(precision) + + # set dynamic + num_inputs = network.num_inputs + for i in range(num_inputs): + input_tensor = network.get_input(i) + input_tensor.shape = Dims([-1, -1]) + + plan = builder.build_serialized_network(network, build_config) + engine_file_path = config.engine + with open(engine_file_path, "wb") as f: + f.write(plan) + print("Build dynamic shape engine done!") + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str) + parser.add_argument("--quant_file", type=str, default=None) + parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="float16", + help="The precision of datatype") + parser.add_argument("--engine", type=str, default=None) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = parse_args() + build_engine_trtapi_dynamicshape(args) diff --git a/models/nlp/plm/transformer/ixrt/ci/prepare.sh b/models/nlp/plm/transformer/ixrt/ci/prepare.sh new file mode 100644 index 00000000..7408252b --- /dev/null +++ 
b/models/nlp/plm/transformer/ixrt/ci/prepare.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x + +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y numactl +elif [[ ${ID} == "centos" ]]; then + yum install -y numactl +else + echo "Not Support Os" +fi + +pip3 install -r requirements.txt +# reference: https://github.com/facebookresearch/fairseq/commit/3d262bb25690e4eb2e7d3c1309b1e9c406ca4b99 +ln -s /root/data/3rd_party/fairseq ../ +# reference: https://github.com/omry/omegaconf/tree/v2.3.0 +ln -s /root/data/3rd_party/omegaconf ../ +cp ../omegaconf.py ../omegaconf/ +# reference: https://github.com/facebookresearch/hydra/tree/v1.3.2 +ln -s /root/data/3rd_party/hydra ../ +cd ../ +python3 setup.py build_ext --inplace +cd ixrt/ +mkdir -p data/datasets/ +mkdir -p data/checkpoints +ln -s /root/data/datasets/corex-inference-data-4.0.0/checkpoints/transformer/wmt14.en-fr.joined-dict.transformer ./data/checkpoints/ +ln -s /root/data/datasets/corex-inference-data-4.0.0/datasets/wmt14.en-fr.joined-dict.newstest2014 ./data/datasets/ \ No newline at end of file diff --git a/models/nlp/plm/transformer/ixrt/common.py b/models/nlp/plm/transformer/ixrt/common.py new file mode 100644 index 00000000..4759060f --- /dev/null +++ b/models/nlp/plm/transformer/ixrt/common.py @@ -0,0 +1,92 @@ +import os +import cv2 +import glob +import torch +import tensorrt +import numpy as np +import cuda.cudart as cudart + + +def create_engine_context(engine_path, logger): + with open(engine_path, "rb") as f: + runtime = tensorrt.Runtime(logger) + assert runtime + engine = runtime.deserialize_cuda_engine(f.read()) + assert engine + context = engine.create_execution_context() + assert context + + return engine, context + + +def get_io_bindings(engine): + # Setup I/O bindings + inputs = [] + outputs = [] + allocations = [] + + for i in range(engine.num_bindings): + is_input = False + if engine.binding_is_input(i): + is_input = True + name = engine.get_binding_name(i) + dtype = engine.get_binding_dtype(i) + shape = engine.get_binding_shape(i) + if is_input: + batch_size = shape[0] + size = np.dtype(tensorrt.nptype(dtype)).itemsize + for s in shape: + size *= s + err, allocation = cudart.cudaMalloc(size) + assert err == cudart.cudaError_t.cudaSuccess + binding = { + "index": i, + "name": name, + "dtype": np.dtype(tensorrt.nptype(dtype)), + "shape": list(shape), + "allocation": allocation, + "nbytes": size, + } + print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}") + allocations.append(allocation) + if engine.binding_is_input(i): + inputs.append(binding) + else: + outputs.append(binding) + return inputs, outputs, allocations + + +def setup_io_bindings(engine, context): + # Setup I/O bindings + inputs = [] + outputs = [] + allocations = [] + + for i in range(engine.num_bindings): + is_input = False + 
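+ # Unlike get_io_bindings above, shapes here are read from the execution context, + # so dynamic dimensions resolved via set_binding_shape() are reflected in the + # cudaMalloc size computed below (dtype itemsize times the product of the dims).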
if engine.binding_is_input(i): + is_input = True + name = engine.get_binding_name(i) + dtype = engine.get_binding_dtype(i) + shape = context.get_binding_shape(i) + if is_input: + batch_size = shape[0] + size = np.dtype(tensorrt.nptype(dtype)).itemsize + for s in shape: + size *= s + err, allocation = cudart.cudaMalloc(size) + assert err == cudart.cudaError_t.cudaSuccess + binding = { + "index": i, + "name": name, + "dtype": np.dtype(tensorrt.nptype(dtype)), + "shape": list(shape), + "allocation": allocation, + "nbytes": size, + } + allocations.append(allocation) + if engine.binding_is_input(i): + inputs.append(binding) + else: + outputs.append(binding) + return inputs, outputs, allocations diff --git a/models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_accuracy.py b/models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_accuracy.py new file mode 100644 index 00000000..ac09f806 --- /dev/null +++ b/models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_accuracy.py @@ -0,0 +1,488 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" +Translate pre-processed data with a trained model. +""" + +import ast +import logging +import math +import os +import pickle +import sys +from argparse import Namespace +from itertools import chain + +import numpy as np +import torch +from fairseq import checkpoint_utils, options, scoring, tasks, utils +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.logging import progress_bar +from fairseq.logging.meters import StopwatchMeter, TimeMeter +from omegaconf import DictConfig + +import tensorrt +from tensorrt import Dims +from common import create_engine_context, get_io_bindings, setup_io_bindings + +import cuda.cudart as cudart + +def engine_init(engine): + host_mem = tensorrt.IHostMemory + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + engine, context = create_engine_context(engine, logger) + return engine, context + + +def tensorrt_infer(engine, context, features): + input_names=["src_tokens"] + output_names=["output"] + input_idx = engine.get_binding_index(input_names[0]) + input_shape = features.shape + context.set_binding_shape(input_idx, Dims(input_shape)) + + inputs, outputs, allocations = setup_io_bindings(engine, context) + pred_output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) + (err,) = cudart.cudaMemcpy( + inputs[0]["allocation"], + features, + inputs[0]["nbytes"], + cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + assert err == cudart.cudaError_t.cudaSuccess + context.execute_v2(allocations) + (err,) = cudart.cudaMemcpy( + pred_output, + outputs[0]["allocation"], + outputs[0]["nbytes"], + cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, + ) + assert err == cudart.cudaError_t.cudaSuccess + return pred_output + + +def main(cfg: DictConfig): + + if isinstance(cfg, Namespace): + cfg = convert_namespace_to_omegaconf(cfg) + + assert cfg.common_eval.path is not None, "--path required for generation!" 
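+ # tensorrt_infer above follows the standard synchronous TensorRT pattern: + # set_binding_shape -> cudaMemcpy host-to-device -> execute_v2 -> cudaMemcpy + # device-to-host. Note that setup_io_bindings cudaMallocs fresh buffers on every + # call and nothing frees them here, so long runs re-allocate per batch shape.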
+ assert ( + not cfg.generation.sampling or cfg.generation.nbest == cfg.generation.beam + ), "--sampling requires --nbest to be equal to --beam" + assert ( + cfg.generation.replace_unk is None or cfg.dataset.dataset_impl == "raw" + ), "--replace-unk requires a raw text dataset (--dataset-impl=raw)" + + if cfg.common_eval.results_path is not None: + os.makedirs(cfg.common_eval.results_path, exist_ok=True) + output_path = os.path.join( + cfg.common_eval.results_path, + "generate-{}.txt".format(cfg.dataset.gen_subset), + ) + with open(output_path, "w", buffering=1, encoding="utf-8") as h: + return _main(cfg, h) + else: + return _main(cfg, sys.stdout) + + +def get_symbols_to_strip_from_output(generator): + if hasattr(generator, "symbols_to_strip_from_output"): + return generator.symbols_to_strip_from_output + else: + return {generator.eos} + + +def _main(cfg: DictConfig, output_file): + logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=os.environ.get("LOGLEVEL", "INFO").upper(), + stream=output_file, + ) + logger = logging.getLogger("fairseq_cli.generate") + + utils.import_user_module(cfg.common) + + if cfg.dataset.max_tokens is None and cfg.dataset.batch_size is None: + cfg.dataset.max_tokens = 12000 + logger.info(cfg) + + # Fix seed for stochastic decoding + if cfg.common.seed is not None and not cfg.generation.no_seed_provided: + np.random.seed(cfg.common.seed) + utils.set_torch_seed(cfg.common.seed) + + use_cuda = torch.cuda.is_available() and not cfg.common.cpu + + # Load dataset splits + task = tasks.setup_task(cfg.task) + + # Set dictionaries + try: + src_dict = getattr(task, "source_dictionary", None) + except NotImplementedError: + src_dict = None + tgt_dict = task.target_dictionary + + overrides = ast.literal_eval(cfg.common_eval.model_overrides) + + # Load ensemble + logger.info("loading model(s) from {}".format(cfg.common_eval.path)) + models, saved_cfg = checkpoint_utils.load_model_ensemble( + utils.split_paths(cfg.common_eval.path), + arg_overrides=overrides, + task=task, + suffix=cfg.checkpoint.checkpoint_suffix, + strict=(cfg.checkpoint.checkpoint_shard_count == 1), + num_shards=cfg.checkpoint.checkpoint_shard_count, + ) + + # loading the dataset should happen after the checkpoint has been loaded so we can give it the saved task config + task.load_dataset(cfg.dataset.gen_subset, task_cfg=saved_cfg.task) + + if cfg.generation.lm_path is not None: + overrides["data"] = cfg.task.data + + try: + lms, _ = checkpoint_utils.load_model_ensemble( + [cfg.generation.lm_path], arg_overrides=overrides, task=None + ) + except: + logger.warning( + f"Failed to load language model! 
Please make sure that the language model dict is the same " + f"as target dict and is located in the data dir ({cfg.task.data})" + ) + raise + + assert len(lms) == 1 + else: + lms = [None] + + # Optimize ensemble for generation + for model in chain(models, lms): + if model is None: + continue + if cfg.common.fp16: + model.half() + if use_cuda and not cfg.distributed_training.pipeline_model_parallel: + model.cuda() + model.prepare_for_inference_(cfg) + + # Load alignment dictionary for unknown word replacement + # (None if no unknown word replacement, empty if no path to align dictionary) + align_dict = utils.load_align_dict(cfg.generation.replace_unk) + + # Load dataset (possibly sharded) + itr = task.get_batch_iterator( + dataset=task.dataset(cfg.dataset.gen_subset), + max_tokens=cfg.dataset.max_tokens, + max_sentences=cfg.dataset.batch_size, + max_positions=utils.resolve_max_positions( + task.max_positions(), *[m.max_positions() for m in models] + ), + ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test, + required_batch_size_multiple=cfg.dataset.required_batch_size_multiple, + seed=cfg.common.seed, + num_shards=cfg.distributed_training.distributed_world_size, + shard_id=cfg.distributed_training.distributed_rank, + num_workers=cfg.dataset.num_workers, + data_buffer_size=cfg.dataset.data_buffer_size, + ).next_epoch_itr(shuffle=False) + progress = progress_bar.progress_bar( + itr, + log_format=cfg.common.log_format, + log_interval=cfg.common.log_interval, + default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"), + ) + + # Initialize generator + gen_timer = StopwatchMeter() + + extra_gen_cls_kwargs = {"lm_model": lms[0], "lm_weight": cfg.generation.lm_weight} + generator = task.build_generator( + models, cfg.generation, extra_gen_cls_kwargs=extra_gen_cls_kwargs + ) + + # Handle tokenization and BPE + tokenizer = task.build_tokenizer(cfg.tokenizer) + bpe = task.build_bpe(cfg.bpe) + + def decode_fn(x): + if bpe is not None: + x = bpe.decode(x) + if tokenizer is not None: + x = tokenizer.decode(x) + return x + + scorer = scoring.build_scorer(cfg.scoring, tgt_dict) + + max_batch_size = 128 + max_seq_length = 128 + + model_dir = os.path.split(cfg.common_eval["path"])[0] + + print("1. load engine") + engine_path = os.path.join(model_dir,'transformer.engine') + print(f"load engine from {engine_path}") + # runtime = init_ixrt_by_igie(engine_path) + engine, context = engine_init(engine_path) + + print("3. 
inference") + num_sentences = 0 + has_target = True + wps_meter = TimeMeter() + + total_samples = [] + + for i,sample in enumerate(progress): + sample = utils.move_to_cuda(sample) if use_cuda else sample + device = sample["net_input"]["src_tokens"].device + if "net_input" not in sample: + continue + + prefix_tokens = None + if cfg.generation.prefix_size > 0: + prefix_tokens = sample["target"][:, : cfg.generation.prefix_size] + + constraints = None + if "constraints" in sample: + constraints = sample["constraints"] + + src_tokens = ( + sample["net_input"]["src_tokens"].detach().cpu().numpy().astype(np.int32) + ) + gen_timer.start() + new_tokens = tensorrt_infer(engine, context, src_tokens) + num_generated_tokens = new_tokens.shape[0] * new_tokens.shape[1] + gen_timer.stop(num_generated_tokens) + tokens = torch.tensor(new_tokens).cuda() + new_hypos = [] + for i in range(len(tokens)): + new_hypo = { + # "tokens": hypos[i][0]['tokens'], + "tokens": tokens[i], + "alignment": torch.tensor([]).to(device), + } + new_hypos.append([new_hypo]) + # exit() + hypos = new_hypos + + for i, sample_id in enumerate(sample["id"].tolist()): + has_target = sample["target"] is not None + + # Remove padding + if "src_tokens" in sample["net_input"]: + src_tokens = utils.strip_pad( + sample["net_input"]["src_tokens"][i, :], tgt_dict.pad() + ) + else: + src_tokens = None + + target_tokens = None + if has_target: + target_tokens = ( + utils.strip_pad(sample["target"][i, :], tgt_dict.pad()).int().cpu() + ) + + # Either retrieve the original sentences or regenerate them from tokens. + if align_dict is not None: + src_str = task.dataset(cfg.dataset.gen_subset).src.get_original_text( + sample_id + ) + target_str = task.dataset(cfg.dataset.gen_subset).tgt.get_original_text( + sample_id + ) + else: + if src_dict is not None: + src_str = src_dict.string(src_tokens, cfg.common_eval.post_process) + else: + src_str = "" + if has_target: + target_str = tgt_dict.string( + target_tokens, + cfg.common_eval.post_process, + escape_unk=True, + extra_symbols_to_ignore=get_symbols_to_strip_from_output( + generator + ), + ) + + src_str = decode_fn(src_str) + if has_target: + target_str = decode_fn(target_str) + + if not cfg.common_eval.quiet: + if src_dict is not None: + print("S-{}\t{}".format(sample_id, src_str), file=output_file) + if has_target: + print("T-{}\t{}".format(sample_id, target_str), file=output_file) + + # Process top predictions + for j, hypo in enumerate(hypos[i][: cfg.generation.nbest]): + hypo_tokens, hypo_str, alignment = utils.post_process_prediction( + hypo_tokens=hypo["tokens"].int().cpu(), + src_str=src_str, + alignment=hypo["alignment"], + align_dict=align_dict, + tgt_dict=tgt_dict, + remove_bpe=cfg.common_eval.post_process, + extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator), + ) + detok_hypo_str = decode_fn(hypo_str) + if not cfg.common_eval.quiet: + score = hypo["score"] / math.log(2) # convert to base 2 + # original hypothesis (after tokenization and BPE) + print( + "H-{}\t{}\t{}".format(sample_id, score, hypo_str), + file=output_file, + ) + # detokenized hypothesis + print( + "D-{}\t{}\t{}".format(sample_id, score, detok_hypo_str), + file=output_file, + ) + print( + "P-{}\t{}".format( + sample_id, + " ".join( + map( + lambda x: "{:.4f}".format(x), + # convert from base e to base 2 + hypo["positional_scores"] + .div_(math.log(2)) + .tolist(), + ) + ), + ), + file=output_file, + ) + + if cfg.generation.print_alignment == "hard": + print( + "A-{}\t{}".format( + sample_id, + " ".join( + [ + 
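+ # hard alignments are printed as space-separated "srcpos-tgtpos" pairs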
"{}-{}".format(src_idx, tgt_idx) + for src_idx, tgt_idx in alignment + ] + ), + ), + file=output_file, + ) + if cfg.generation.print_alignment == "soft": + print( + "A-{}\t{}".format( + sample_id, + " ".join( + [",".join(src_probs) for src_probs in alignment] + ), + ), + file=output_file, + ) + + if cfg.generation.print_step: + print( + "I-{}\t{}".format(sample_id, hypo["steps"]), + file=output_file, + ) + + if cfg.generation.retain_iter_history: + for step, h in enumerate(hypo["history"]): + _, h_str, _ = utils.post_process_prediction( + hypo_tokens=h["tokens"].int().cpu(), + src_str=src_str, + alignment=None, + align_dict=None, + tgt_dict=tgt_dict, + remove_bpe=None, + ) + print( + "E-{}_{}\t{}".format(sample_id, step, h_str), + file=output_file, + ) + + # Score only the top hypothesis + if has_target and j == 0: + if ( + align_dict is not None + or cfg.common_eval.post_process is not None + ): + # Convert back to tokens for evaluation with unk replacement and/or without BPE + target_tokens = tgt_dict.encode_line( + target_str, add_if_not_exist=True + ) + hypo_tokens = tgt_dict.encode_line( + detok_hypo_str, add_if_not_exist=True + ) + if hasattr(scorer, "add_string"): + scorer.add_string(target_str, detok_hypo_str) + else: + scorer.add(target_tokens, hypo_tokens) + + wps_meter.update(num_generated_tokens) + progress.log({"wps": round(wps_meter.avg)}) + num_sentences += ( + sample["nsentences"] if "nsentences" in sample else sample["id"].numel() + ) + + logger.info("NOTE: hypothesis and token scores are output in base 2") + logger.info( + "Translated {:,} sentences ({:,} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)".format( + num_sentences, + gen_timer.n, + gen_timer.sum, + num_sentences / gen_timer.sum, + 1.0 / gen_timer.avg, + ) + ) + if has_target: + if cfg.bpe and not cfg.generation.sacrebleu: + if cfg.common_eval.post_process: + logger.warning( + "BLEU score is being computed by splitting detokenized string on spaces, this is probably not what you want. Use --sacrebleu for standard 13a BLEU tokenization" + ) + else: + logger.warning( + "If you are using BPE on the target side, the BLEU score is computed on BPE tokens, not on proper words. Use --sacrebleu for standard 13a BLEU tokenization" + ) + # use print to be consistent with other main outputs: S-, H-, T-, D- and so on + print( + "Generate {} with beam={}: {}".format( + cfg.dataset.gen_subset, cfg.generation.beam, scorer.result_string() + ), + file=output_file, + ) + + return scorer + + +def cli_main(): + parser = options.get_generation_parser() + # TODO: replace this workaround with refactoring of `AudioPretraining` + parser.add_argument( + "--arch", + "-a", + metavar="ARCH", + default="wav2vec2", + help="Model architecture. For constructing tasks that rely on " + "model args (e.g. `AudioPretraining`)", + ) + args = options.parse_args_and_arch(parser) + score = main(args).score() + target_score = float(os.environ["Accuracy"]) + print("BLEU4: = ", score, "target BLEU4: ", target_score) + if score >= target_score: + print("pass!") + exit() + else: + print("failed!") + exit(1) + + +if __name__ == "__main__": + cli_main() diff --git a/models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_accuracy_plugin.py b/models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_accuracy_plugin.py new file mode 100644 index 00000000..3a3c648a --- /dev/null +++ b/models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_accuracy_plugin.py @@ -0,0 +1,517 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. 
and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" +Translate pre-processed data with a trained model. +""" + +import ast +import logging +import math +import os +import pickle +import sys + +import sys +sys.path.append("../") +from argparse import Namespace +from itertools import chain + +import numpy as np +import torch +from fairseq import checkpoint_utils, options, scoring, tasks, utils +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.logging import progress_bar +from fairseq.logging.meters import StopwatchMeter, TimeMeter +from omegaconf import DictConfig + +import tensorrt +from tensorrt import Dims +from common import create_engine_context, get_io_bindings, setup_io_bindings + + +from plugin.transformer_cfg import TransformerBaseConfig +from plugin.trt import T5TRTDecoder, T5TRTEncoder,inference + +import cuda.cudart as cudart + +def engine_init(engine): + host_mem = tensorrt.IHostMemory + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + engine, context = create_engine_context(engine, logger) + return engine, context + + +def tensorrt_infer(engine, context, features): + input_names=["src_tokens"] + output_names=["output"] + input_idx = engine.get_binding_index(input_names[0]) + input_shape = features.shape + context.set_binding_shape(input_idx, Dims(input_shape)) + + inputs, outputs, allocations = setup_io_bindings(engine, context) + pred_output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) + (err,) = cudart.cudaMemcpy( + inputs[0]["allocation"], + features, + inputs[0]["nbytes"], + cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + assert err == cudart.cudaError_t.cudaSuccess + context.execute_v2(allocations) + (err,) = cudart.cudaMemcpy( + pred_output, + outputs[0]["allocation"], + outputs[0]["nbytes"], + cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, + ) + assert err == cudart.cudaError_t.cudaSuccess + return pred_output + + +def main(cfg: DictConfig): + + if isinstance(cfg, Namespace): + cfg = convert_namespace_to_omegaconf(cfg) + + assert cfg.common_eval.path is not None, "--path required for generation!" 
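+ # Unlike the single-engine variant above, this plugin path splits the model into + # two IxRT engines: T5TRTEncoder runs src_tokens once, and T5TRTDecoder is then + # driven by plugin.trt.inference() to produce the target tokens (presumably + # step-by-step autoregressive decoding; see the engine setup later in _main).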
+ assert ( + not cfg.generation.sampling or cfg.generation.nbest == cfg.generation.beam + ), "--sampling requires --nbest to be equal to --beam" + assert ( + cfg.generation.replace_unk is None or cfg.dataset.dataset_impl == "raw" + ), "--replace-unk requires a raw text dataset (--dataset-impl=raw)" + + if cfg.common_eval.results_path is not None: + os.makedirs(cfg.common_eval.results_path, exist_ok=True) + output_path = os.path.join( + cfg.common_eval.results_path, + "generate-{}.txt".format(cfg.dataset.gen_subset), + ) + with open(output_path, "w", buffering=1, encoding="utf-8") as h: + return _main(cfg, h) + else: + return _main(cfg, sys.stdout) + + +def get_symbols_to_strip_from_output(generator): + if hasattr(generator, "symbols_to_strip_from_output"): + return generator.symbols_to_strip_from_output + else: + return {generator.eos} + + +def _main(cfg: DictConfig, output_file): + logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=os.environ.get("LOGLEVEL", "INFO").upper(), + stream=output_file, + ) + logger = logging.getLogger("fairseq_cli.generate") + + utils.import_user_module(cfg.common) + + if cfg.dataset.max_tokens is None and cfg.dataset.batch_size is None: + cfg.dataset.max_tokens = 12000 + logger.info(cfg) + + # Fix seed for stochastic decoding + if cfg.common.seed is not None and not cfg.generation.no_seed_provided: + np.random.seed(cfg.common.seed) + utils.set_torch_seed(cfg.common.seed) + + use_cuda = torch.cuda.is_available() and not cfg.common.cpu + + # Load dataset splits + task = tasks.setup_task(cfg.task) + + # Set dictionaries + try: + src_dict = getattr(task, "source_dictionary", None) + except NotImplementedError: + src_dict = None + tgt_dict = task.target_dictionary + + overrides = ast.literal_eval(cfg.common_eval.model_overrides) + + # Load ensemble + logger.info("loading model(s) from {}".format(cfg.common_eval.path)) + models, saved_cfg = checkpoint_utils.load_model_ensemble( + utils.split_paths(cfg.common_eval.path), + arg_overrides=overrides, + task=task, + suffix=cfg.checkpoint.checkpoint_suffix, + strict=(cfg.checkpoint.checkpoint_shard_count == 1), + num_shards=cfg.checkpoint.checkpoint_shard_count, + ) + + # loading the dataset should happen after the checkpoint has been loaded so we can give it the saved task config + task.load_dataset(cfg.dataset.gen_subset, task_cfg=saved_cfg.task) + + if cfg.generation.lm_path is not None: + overrides["data"] = cfg.task.data + + try: + lms, _ = checkpoint_utils.load_model_ensemble( + [cfg.generation.lm_path], arg_overrides=overrides, task=None + ) + except: + logger.warning( + f"Failed to load language model! 
Please make sure that the language model dict is the same "
+                f"as target dict and is located in the data dir ({cfg.task.data})"
+            )
+            raise
+
+        assert len(lms) == 1
+    else:
+        lms = [None]
+
+    # Optimize ensemble for generation
+    for model in chain(models, lms):
+        if model is None:
+            continue
+        if cfg.common.fp16:
+            model.half()
+        if use_cuda and not cfg.distributed_training.pipeline_model_parallel:
+            model.cuda()
+        model.prepare_for_inference_(cfg)
+
+    # Load alignment dictionary for unknown word replacement
+    # (None if no unknown word replacement, empty if no path to align dictionary)
+    align_dict = utils.load_align_dict(cfg.generation.replace_unk)
+
+    # Load dataset (possibly sharded)
+    itr = task.get_batch_iterator(
+        dataset=task.dataset(cfg.dataset.gen_subset),
+        max_tokens=cfg.dataset.max_tokens,
+        max_sentences=cfg.dataset.batch_size,
+        max_positions=utils.resolve_max_positions(
+            task.max_positions(), *[m.max_positions() for m in models]
+        ),
+        ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test,
+        required_batch_size_multiple=cfg.dataset.required_batch_size_multiple,
+        seed=cfg.common.seed,
+        num_shards=cfg.distributed_training.distributed_world_size,
+        shard_id=cfg.distributed_training.distributed_rank,
+        num_workers=cfg.dataset.num_workers,
+        data_buffer_size=cfg.dataset.data_buffer_size,
+    ).next_epoch_itr(shuffle=False)
+    progress = progress_bar.progress_bar(
+        itr,
+        log_format=cfg.common.log_format,
+        log_interval=cfg.common.log_interval,
+        default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"),
+    )
+
+    # Initialize generator
+    gen_timer = StopwatchMeter()
+
+    extra_gen_cls_kwargs = {"lm_model": lms[0], "lm_weight": cfg.generation.lm_weight}
+    generator = task.build_generator(
+        models, cfg.generation, extra_gen_cls_kwargs=extra_gen_cls_kwargs
+    )
+
+    # Handle tokenization and BPE
+    tokenizer = task.build_tokenizer(cfg.tokenizer)
+    bpe = task.build_bpe(cfg.bpe)
+
+    def decode_fn(x):
+        if bpe is not None:
+            x = bpe.decode(x)
+        if tokenizer is not None:
+            x = tokenizer.decode(x)
+        return x
+
+    scorer = scoring.build_scorer(cfg.scoring, tgt_dict)
+
+    # The serialized IxRT engines live next to the fairseq checkpoint in the
+    # same checkpoint directory.
+    model_dir = os.path.split(cfg.common_eval["path"])[0]
+
+    print("1. load engine")
+    batch_size = cfg.dataset["batch_size"]
+    config_path = os.path.join(model_dir, 'transformer_config.json')
+    config = TransformerBaseConfig(config_path)
+
+    encoder_engine = os.path.join(model_dir, 'Encoder.engine')
+    print(f"2. load encoder engine from {encoder_engine}")
+    encoder = T5TRTEncoder(encoder_engine, config, batch_size=batch_size)
+
+    decoder_engine = os.path.join(model_dir, 'Decoder.engine')
+    print(f"3. load decoder engine from {decoder_engine}")
+    decoder = T5TRTDecoder(decoder_engine, config, batch_size=batch_size)
+
+    print("4. inference")
inference") + num_sentences = 0 + has_target = True + wps_meter = TimeMeter() + + total_samples = [] + + num = 0 + + for i,sample in enumerate(progress): + sample = utils.move_to_cuda(sample) if use_cuda else sample + device = sample["net_input"]["src_tokens"].device + if "net_input" not in sample: + continue + + prefix_tokens = None + if cfg.generation.prefix_size > 0: + prefix_tokens = sample["target"][:, : cfg.generation.prefix_size] + + constraints = None + if "constraints" in sample: + constraints = sample["constraints"] + + src_tokens = ( + sample["net_input"]["src_tokens"].int() + ) + current_bs = src_tokens.shape[0] + + src_tokens_pad = torch.torch.full((batch_size,src_tokens.shape[1]), 2,dtype = torch.int32).cuda() + src_tokens_pad[:current_bs,:] = src_tokens + gen_timer.start() + new_tokens = inference(config,encoder,decoder,src_tokens_pad).cpu().numpy()[:current_bs,:] + num_generated_tokens = new_tokens.shape[0] * new_tokens.shape[1] + + gen_timer.stop(num_generated_tokens) + tokens = torch.tensor(new_tokens).cuda() + new_hypos = [] + for i in range(len(tokens)): + new_hypo = { + # "tokens": hypos[i][0]['tokens'], + "tokens": tokens[i], + "alignment": torch.tensor([]).to(device), + } + new_hypos.append([new_hypo]) + # exit() + hypos = new_hypos + + for i, sample_id in enumerate(sample["id"].tolist()): + has_target = sample["target"] is not None + + # Remove padding + if "src_tokens" in sample["net_input"]: + src_tokens = utils.strip_pad( + sample["net_input"]["src_tokens"][i, :], tgt_dict.pad() + ) + else: + src_tokens = None + + target_tokens = None + if has_target: + target_tokens = ( + utils.strip_pad(sample["target"][i, :], tgt_dict.pad()).int().cpu() + ) + + # Either retrieve the original sentences or regenerate them from tokens. + if align_dict is not None: + src_str = task.dataset(cfg.dataset.gen_subset).src.get_original_text( + sample_id + ) + target_str = task.dataset(cfg.dataset.gen_subset).tgt.get_original_text( + sample_id + ) + else: + if src_dict is not None: + try: + src_str = src_dict.string(src_tokens, cfg.common_eval.post_process) + except: + print(src_tokens) + else: + src_str = "" + if has_target: + target_str = tgt_dict.string( + target_tokens, + cfg.common_eval.post_process, + escape_unk=True, + extra_symbols_to_ignore=get_symbols_to_strip_from_output( + generator + ), + ) + + src_str = decode_fn(src_str) + if has_target: + target_str = decode_fn(target_str) + + if not cfg.common_eval.quiet: + if src_dict is not None: + print("S-{}\t{}".format(sample_id, src_str), file=output_file) + if has_target: + print("T-{}\t{}".format(sample_id, target_str), file=output_file) + + # Process top predictions + for j, hypo in enumerate(hypos[i][: cfg.generation.nbest]): + hypo_tokens, hypo_str, alignment = utils.post_process_prediction( + hypo_tokens=hypo["tokens"].int().cpu(), + src_str=src_str, + alignment=hypo["alignment"], + align_dict=align_dict, + tgt_dict=tgt_dict, + remove_bpe=cfg.common_eval.post_process, + extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator), + ) + detok_hypo_str = decode_fn(hypo_str) + if not cfg.common_eval.quiet: + score = hypo["score"] / math.log(2) # convert to base 2 + # original hypothesis (after tokenization and BPE) + print( + "H-{}\t{}\t{}".format(sample_id, score, hypo_str), + file=output_file, + ) + # detokenized hypothesis + print( + "D-{}\t{}\t{}".format(sample_id, score, detok_hypo_str), + file=output_file, + ) + print( + "P-{}\t{}".format( + sample_id, + " ".join( + map( + lambda x: "{:.4f}".format(x), + # convert 
from base e to base 2 + hypo["positional_scores"] + .div_(math.log(2)) + .tolist(), + ) + ), + ), + file=output_file, + ) + + if cfg.generation.print_alignment == "hard": + print( + "A-{}\t{}".format( + sample_id, + " ".join( + [ + "{}-{}".format(src_idx, tgt_idx) + for src_idx, tgt_idx in alignment + ] + ), + ), + file=output_file, + ) + if cfg.generation.print_alignment == "soft": + print( + "A-{}\t{}".format( + sample_id, + " ".join( + [",".join(src_probs) for src_probs in alignment] + ), + ), + file=output_file, + ) + + if cfg.generation.print_step: + print( + "I-{}\t{}".format(sample_id, hypo["steps"]), + file=output_file, + ) + + if cfg.generation.retain_iter_history: + for step, h in enumerate(hypo["history"]): + _, h_str, _ = utils.post_process_prediction( + hypo_tokens=h["tokens"].int().cpu(), + src_str=src_str, + alignment=None, + align_dict=None, + tgt_dict=tgt_dict, + remove_bpe=None, + ) + print( + "E-{}_{}\t{}".format(sample_id, step, h_str), + file=output_file, + ) + + # Score only the top hypothesis + if has_target and j == 0: + if ( + align_dict is not None + or cfg.common_eval.post_process is not None + ): + # Convert back to tokens for evaluation with unk replacement and/or without BPE + target_tokens = tgt_dict.encode_line( + target_str, add_if_not_exist=True + ) + hypo_tokens = tgt_dict.encode_line( + detok_hypo_str, add_if_not_exist=True + ) + if hasattr(scorer, "add_string"): + scorer.add_string(target_str, detok_hypo_str) + else: + scorer.add(target_tokens, hypo_tokens) + + wps_meter.update(num_generated_tokens) + progress.log({"wps": round(wps_meter.avg)}) + num_sentences += ( + sample["nsentences"] if "nsentences" in sample else sample["id"].numel() + ) + decoder.clear() + encoder.clear() + + logger.info("NOTE: hypothesis and token scores are output in base 2") + logger.info( + "Translated {:,} sentences ({:,} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)".format( + num_sentences, + gen_timer.n, + gen_timer.sum, + num_sentences / gen_timer.sum, + 1.0 / gen_timer.avg, + ) + ) + if has_target: + if cfg.bpe and not cfg.generation.sacrebleu: + if cfg.common_eval.post_process: + logger.warning( + "BLEU score is being computed by splitting detokenized string on spaces, this is probably not what you want. Use --sacrebleu for standard 13a BLEU tokenization" + ) + else: + logger.warning( + "If you are using BPE on the target side, the BLEU score is computed on BPE tokens, not on proper words. Use --sacrebleu for standard 13a BLEU tokenization" + ) + # use print to be consistent with other main outputs: S-, H-, T-, D- and so on + print( + "Generate {} with beam={}: {}".format( + cfg.dataset.gen_subset, cfg.generation.beam, scorer.result_string() + ), + file=output_file, + ) + + return scorer + + +def cli_main(): + parser = options.get_generation_parser() + # TODO: replace this workaround with refactoring of `AudioPretraining` + parser.add_argument( + "--arch", + "-a", + metavar="ARCH", + default="wav2vec2", + help="Model architecture. For constructing tasks that rely on " + "model args (e.g. 
`AudioPretraining`)", + ) + args = options.parse_args_and_arch(parser) + score = main(args).score() + target_score = float(os.environ["Accuracy"]) + print("BLEU4: = ", score, "target BLEU4: ", target_score) + metricResult = {"metricResult": {}} + metricResult["metricResult"]["BLEU4"] = round(score, 3) + print(metricResult) + if score >= target_score: + print("pass!") + exit() + else: + print("failed!") + exit(1) + + +if __name__ == "__main__": + cli_main() diff --git a/models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_performance.py b/models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_performance.py new file mode 100644 index 00000000..a7ca02ad --- /dev/null +++ b/models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_performance.py @@ -0,0 +1,149 @@ +import json +import os +import numpy as np +import argparse +import time + +import tensorrt +from tensorrt import Dims +from common import create_engine_context, get_io_bindings, setup_io_bindings + +import cuda.cudart as cudart + + +def engine_init(engine): + host_mem = tensorrt.IHostMemory + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + engine, context = create_engine_context(engine, logger) + + return engine, context + + +def tensorrt_infer(engine, context, features): + input_names=["src_tokens"] + output_names=["output"] + input_idx = engine.get_binding_index(input_names[0]) + input_shape = features.shape + context.set_binding_shape(input_idx, Dims(input_shape)) + + inputs, outputs, allocations = setup_io_bindings(engine, context) + pred_output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) + (err,) = cudart.cudaMemcpy( + inputs[0]["allocation"], + features, + inputs[0]["nbytes"], + cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + assert err == cudart.cudaError_t.cudaSuccess + context.execute_v2(allocations) + (err,) = cudart.cudaMemcpy( + pred_output, + outputs[0]["allocation"], + outputs[0]["nbytes"], + cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, + ) + assert err == cudart.cudaError_t.cudaSuccess + return pred_output + + +def generate_batch(features): + all_inputs = [] + tmp = [] + for data in features: + if len(tmp) == args.max_batch_size: + batch_max_len = max([len(i) for i in tmp]) + new_tmp = [] + for i in tmp: + i = i[:args.max_seq_len] + i = [pad_id]*(batch_max_len-len(i)) + i + new_tmp.append(i) + all_inputs.append(np.array(new_tmp).astype(np.int32)) + tmp = [] + tmp.append(data) + if len(tmp): + batch_max_len = max([len(i) for i in tmp]) + new_tmp = [] + for i in tmp: + i = i[:args.max_seq_len] + i = [pad_id]*(batch_max_len-len(i)) + i + new_tmp.append(i) + all_inputs.append(np.array(new_tmp).astype(np.int32)) + return all_inputs + + +def parse_args(): + parser = argparse.ArgumentParser( + description="build ixrt graph and convert weights", usage="" + ) + parser.add_argument( + "--max_batch_size", + type=int, + required=True, + help="max batch size for inference", + ) + parser.add_argument( + "--max_seq_len", + type=int, + default=102, + help="max sequence length for inference", + ) + parser.add_argument( + "--data_dir", + type=str + ) + parser.add_argument( + "--model_dir", + type=str + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = parse_args() + assert args.max_seq_len <= 102 + pad_id = 1 + feature_file = os.path.join(args.data_dir,'features.json') + + with open(feature_file,'r') as f: + features = json.loads(f.read()) + + all_inputs = generate_batch(features) + print(f"max_batch_size: {args.max_batch_size}, max_seq_len: {args.max_seq_len}") + + 
print("1. build engine") + engine_path = os.path.join(args.model_dir,'transformer.engine') + print(f"load engine from {engine_path}") + + engine, context = engine_init(engine_path) + + print("2. warmup") + for i in range(5): + batch = np.random.randint(10, 20, [args.max_batch_size, args.max_seq_len]).astype( + np.int32 + ) + tensorrt_infer(engine, context, batch) + + print("3. inference") + start_time = time.time() + num_sentences = 0 + for i,batch in enumerate(all_inputs): + num_sentences += batch.shape[0] + res = tensorrt_infer(engine, context, batch) + + end_time = time.time() + QPS = num_sentences/(end_time-start_time) + print(f"Translated {num_sentences} sentences, {QPS} sentences/s") + target_qps = float(os.environ['Accuracy']) + + # Release the resouce of context and engine + del context + del engine + + print("QPS: = ", QPS, "target QPS: ", target_qps) + if QPS >= target_qps: + print("pass!") + exit() + else: + print("failed!") + exit(10) diff --git a/models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_performance_plugin.py b/models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_performance_plugin.py new file mode 100644 index 00000000..e6984b8e --- /dev/null +++ b/models/nlp/plm/transformer/ixrt/inference_wmt14_en_fr_fp16_performance_plugin.py @@ -0,0 +1,147 @@ +import json +import os +import numpy as np +import argparse +import time + +import tensorrt +from tensorrt import Dims +from common import create_engine_context, get_io_bindings, setup_io_bindings + +import sys +sys.path.append("../") +from plugin.transformer_cfg import TransformerBaseConfig +from plugin.trt import T5TRTDecoder, T5TRTEncoder,inference,benchmark + +import torch +from torch.utils.data import DataLoader + + +class CustomDataset(torch.utils.data.Dataset): + def __init__(self, inputs): + self.inputs = inputs + + def __getitem__(self, index): + input = self.inputs[index] + return input + + def __len__(self): + return len(self.inputs) + + + + + +def generate_batch(features): + all_inputs = [] + tmp = [] + for data in features: + if len(tmp) == args.max_batch_size: + batch_max_len = max([len(i) for i in tmp]) + new_tmp = [] + for i in tmp: + i = i[:args.max_seq_len] + i = [pad_id]*(batch_max_len-len(i)) + i + new_tmp.append(i) + all_inputs.append(np.array(new_tmp).astype(np.int32)) + tmp = [] + tmp.append(data) + + return all_inputs + + +def parse_args(): + parser = argparse.ArgumentParser( + description="build ixrt graph and convert weights", usage="" + ) + parser.add_argument( + "--max_batch_size", + type=int, + required=True, + help="max batch size for inference", + ) + parser.add_argument( + "--max_seq_len", + type=int, + default=102, + help="max sequence length for inference", + ) + parser.add_argument( + "--data_dir", + type=str + ) + parser.add_argument( + "--model_dir", + type=str + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = parse_args() + assert args.max_seq_len <= 102 + pad_id = 1 + feature_file = os.path.join(args.data_dir,'features.json') + + with open(feature_file,'r') as f: + features = json.loads(f.read()) + + all_inputs = generate_batch(features) + print(f"max_batch_size: {args.max_batch_size}, max_seq_len: {args.max_seq_len}") + + print("1. 
build engine") + + + batch_size = args.max_batch_size + config_path = os.path.join(args.model_dir,'transformer_config.json') + config = TransformerBaseConfig(config_path) + + encoder_engine = os.path.join(args.model_dir,'Encoder.engine') + print(f"2 load encoder engine from {encoder_engine}") + encoder = T5TRTEncoder(encoder_engine,config, batch_size=batch_size) + + + decoder_engine = os.path.join(args.model_dir,'Decoder.engine') + print(f"3 load decoder_engine engine from {decoder_engine}") + decoder = T5TRTDecoder(decoder_engine,config,batch_size=batch_size) + + + device = torch.device("cuda:0") + dataset = CustomDataset(all_inputs) + dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=False, num_workers=1,drop_last=True) + + prev_tokens = torch.full((batch_size,1), int(config.sos_token_id),dtype = torch.int32).cuda() + for i, data in enumerate(dataloader): + data = torch.squeeze(data,0).to(device) + benchmark(config,encoder,decoder,data,prev_tokens) + + print("3. inference") + + total_time = 0 + + num_sentences = 0 + for i, data in enumerate(dataloader): + data = torch.squeeze(data,0).to(device) + num_sentences += data.shape[0] + start_time = time.time() + benchmark(config,encoder,decoder,data,prev_tokens) + end_time = time.time() + total_time +=(end_time-start_time) + + QPS = num_sentences/total_time + print(f"Translated {num_sentences} sentences, {QPS} sentences/s") + target_qps = float(os.environ['Accuracy']) + decoder.clear() + encoder.clear() + + print("QPS: = ", QPS, "target QPS: ", target_qps) + metricResult = {"metricResult": {}} + metricResult["metricResult"]["QPS"] = round(QPS, 3) + print(metricResult) + if QPS >= target_qps: + print("pass!") + exit() + else: + print("failed!") + exit(10) diff --git a/models/nlp/plm/transformer/ixrt/requirements.txt b/models/nlp/plm/transformer/ixrt/requirements.txt new file mode 100644 index 00000000..483936bb --- /dev/null +++ b/models/nlp/plm/transformer/ixrt/requirements.txt @@ -0,0 +1,6 @@ +numpy==1.26.4 +cython +antlr4-python3-runtime==4.9.3 +sacrebleu==2.5.1 +bitarray +scikit-learn \ No newline at end of file diff --git a/models/nlp/plm/transformer/ixrt/scripts/infer_transformer_fp16_accuracy.sh b/models/nlp/plm/transformer/ixrt/scripts/infer_transformer_fp16_accuracy.sh new file mode 100644 index 00000000..a846446c --- /dev/null +++ b/models/nlp/plm/transformer/ixrt/scripts/infer_transformer_fp16_accuracy.sh @@ -0,0 +1,44 @@ +set -euo pipefail + +EXIT_STATUS=0 +check_status() +{ + ret_code=${PIPESTATUS[0]} + if [ ${ret_code} != 0 ]; then + echo "fails" + [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1 + fi +} + +BATCH_SIZE=${BATCH_SIZE:=128} +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BATCH_SIZE=${arguments[index]};; + --tgt) Accuracy=${arguments[index]};; + esac +done + +current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) +DATA_DIR=${current_path}/../data/datasets/wmt14.en-fr.joined-dict.newstest2014 +MODEL_DIR=${current_path}/../data/checkpoints/wmt14.en-fr.joined-dict.transformer +CPU_AFFINITY=$(ixsmi topo -m|grep "^GPU0" |awk '{print $(NF-1)}') + +if [[ ! -f "${MODEL_DIR}/Encoder.engine" || ! -f "${MODEL_DIR}/Decoder.engine" ]]; then + echo "Build Engine." 
+ python3 ../plugin/build_engine.py \ + --model_dir ${MODEL_DIR} +fi + +echo "Inference(Test Accuracy)" +export Accuracy=${Accuracy:=42} +numactl --physcpubind=${CPU_AFFINITY} python3 inference_wmt14_en_fr_fp16_accuracy_plugin.py ${DATA_DIR} \ + --path ${MODEL_DIR}/model.pt \ + --beam 1 --batch-size ${BATCH_SIZE} \ + --remove-bpe --quiet --fp16; check_status; +exit ${EXIT_STATUS} diff --git a/models/nlp/plm/transformer/ixrt/scripts/infer_transformer_fp16_performance.sh b/models/nlp/plm/transformer/ixrt/scripts/infer_transformer_fp16_performance.sh new file mode 100644 index 00000000..d5969000 --- /dev/null +++ b/models/nlp/plm/transformer/ixrt/scripts/infer_transformer_fp16_performance.sh @@ -0,0 +1,45 @@ +set -euo pipefail + + +EXIT_STATUS=0 +check_status() +{ + ret_code=${PIPESTATUS[0]} + if [ ${ret_code} != 0 ]; then + echo "fails" + [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1 + fi +} + +BATCH_SIZE=${BATCH_SIZE:=128} +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BATCH_SIZE=${arguments[index]};; + --tgt) Accuracy=${arguments[index]};; + esac +done + +current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) +DATA_DIR=${current_path}/../data/datasets/wmt14.en-fr.joined-dict.newstest2014 +MODEL_DIR=${current_path}/../data/checkpoints/wmt14.en-fr.joined-dict.transformer +CPU_AFFINITY=$(ixsmi topo -m|grep "^GPU0" |awk '{print $(NF-1)}') + +if [[ ! -f "${MODEL_DIR}/Encoder.engine" || ! -f "${MODEL_DIR}/Decoder.engine" ]]; then + echo "Build Engine." + python3 ../plugin/build_engine.py \ + --model_dir ${MODEL_DIR} +fi + +echo "Inference(Test QPS)" +export Accuracy=${Accuracy:=270} +numactl --physcpubind=${CPU_AFFINITY} python3 inference_wmt14_en_fr_fp16_performance_plugin.py \ + --max_batch_size ${BATCH_SIZE} \ + --model_dir ${MODEL_DIR} \ + --data_dir ${DATA_DIR}; check_status; +exit ${EXIT_STATUS} diff --git a/models/nlp/plm/transformer/omegaconf.py b/models/nlp/plm/transformer/omegaconf.py new file mode 100644 index 00000000..84d642f5 --- /dev/null +++ b/models/nlp/plm/transformer/omegaconf.py @@ -0,0 +1,1160 @@ +"""OmegaConf module""" +from dataclasses import _MISSING_TYPE +import copy +import inspect +import io +import os +import pathlib +import sys +import warnings +from collections import defaultdict +from contextlib import contextmanager +from enum import Enum +from textwrap import dedent +from typing import ( + IO, + Any, + Callable, + Dict, + Generator, + Iterable, + List, + Optional, + Set, + Tuple, + Type, + Union, + overload, +) + +import yaml + +from . import DictConfig, DictKeyType, ListConfig +from ._utils import ( + _DEFAULT_MARKER_, + _ensure_container, + _get_value, + format_and_raise, + get_dict_key_value_types, + get_list_element_type, + get_omega_conf_dumper, + get_type_of, + is_attr_class, + is_dataclass, + is_dict_annotation, + is_int, + is_list_annotation, + is_primitive_container, + is_primitive_dict, + is_primitive_list, + is_structured_config, + is_tuple_annotation, + is_union_annotation, + nullcontext, + split_key, + type_str, +) +from .base import Box, Container, Node, SCMode, UnionNode +from .basecontainer import BaseContainer +from .errors import ( + MissingMandatoryValue, + OmegaConfBaseException, + UnsupportedInterpolationType, + ValidationError, +) +from .nodes import ( + AnyNode, + BooleanNode, + BytesNode, + EnumNode, + FloatNode, + IntegerNode, + PathNode, + StringNode, + ValueNode, +) + +MISSING: Any = "???" 
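+# Sentinel for mandatory values: a node that still holds "???" raises
+# MissingMandatoryValue when accessed before a real value is assigned.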
+ +Resolver = Callable[..., Any] + + +def II(interpolation: str) -> Any: + """ + Equivalent to ``${interpolation}`` + + :param interpolation: + :return: input ``${node}`` with type Any + """ + return "${" + interpolation + "}" + + +def SI(interpolation: str) -> Any: + """ + Use this for String interpolation, for example ``"http://${host}:${port}"`` + + :param interpolation: interpolation string + :return: input interpolation with type ``Any`` + """ + return interpolation + + +def register_default_resolvers() -> None: + from omegaconf.resolvers import oc + + OmegaConf.register_new_resolver("oc.create", oc.create) + OmegaConf.register_new_resolver("oc.decode", oc.decode) + OmegaConf.register_new_resolver("oc.deprecated", oc.deprecated) + OmegaConf.register_new_resolver("oc.env", oc.env) + OmegaConf.register_new_resolver("oc.select", oc.select) + OmegaConf.register_new_resolver("oc.dict.keys", oc.dict.keys) + OmegaConf.register_new_resolver("oc.dict.values", oc.dict.values) + + +class OmegaConf: + """OmegaConf primary class""" + + def __init__(self) -> None: + raise NotImplementedError("Use one of the static construction functions") + + @staticmethod + def structured( + obj: Any, + parent: Optional[BaseContainer] = None, + flags: Optional[Dict[str, bool]] = None, + ) -> Any: + return OmegaConf.create(obj, parent, flags) + + @staticmethod + @overload + def create( + obj: str, + parent: Optional[BaseContainer] = None, + flags: Optional[Dict[str, bool]] = None, + ) -> Union[DictConfig, ListConfig]: + ... + + @staticmethod + @overload + def create( + obj: Union[List[Any], Tuple[Any, ...]], + parent: Optional[BaseContainer] = None, + flags: Optional[Dict[str, bool]] = None, + ) -> ListConfig: + ... + + @staticmethod + @overload + def create( + obj: DictConfig, + parent: Optional[BaseContainer] = None, + flags: Optional[Dict[str, bool]] = None, + ) -> DictConfig: + ... + + @staticmethod + @overload + def create( + obj: ListConfig, + parent: Optional[BaseContainer] = None, + flags: Optional[Dict[str, bool]] = None, + ) -> ListConfig: + ... + + @staticmethod + @overload + def create( + obj: Optional[Dict[Any, Any]] = None, + parent: Optional[BaseContainer] = None, + flags: Optional[Dict[str, bool]] = None, + ) -> DictConfig: + ... + + @staticmethod + def create( # noqa F811 + obj: Any = _DEFAULT_MARKER_, + parent: Optional[BaseContainer] = None, + flags: Optional[Dict[str, bool]] = None, + ) -> Union[DictConfig, ListConfig]: + return OmegaConf._create_impl( + obj=obj, + parent=parent, + flags=flags, + ) + + @staticmethod + def load(file_: Union[str, pathlib.Path, IO[Any]]) -> Union[DictConfig, ListConfig]: + from ._utils import get_yaml_loader + + if isinstance(file_, (str, pathlib.Path)): + with io.open(os.path.abspath(file_), "r", encoding="utf-8") as f: + obj = yaml.load(f, Loader=get_yaml_loader()) + elif getattr(file_, "read", None): + obj = yaml.load(file_, Loader=get_yaml_loader()) + else: + raise TypeError("Unexpected file type") + + if obj is not None and not isinstance(obj, (list, dict, str)): + raise IOError( # pragma: no cover + f"Invalid loaded object type: {type(obj).__name__}" + ) + + ret: Union[DictConfig, ListConfig] + if obj is None: + ret = OmegaConf.create() + else: + ret = OmegaConf.create(obj) + return ret + + @staticmethod + def save( + config: Any, f: Union[str, pathlib.Path, IO[Any]], resolve: bool = False + ) -> None: + """ + Save as configuration object to a file + + :param config: omegaconf.Config object (DictConfig or ListConfig). 
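+            Structured config classes and instances are converted with ``OmegaConf.create`` first.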
+ :param f: filename or file object + :param resolve: True to save a resolved config (defaults to False) + """ + if is_dataclass(config) or is_attr_class(config): + config = OmegaConf.create(config) + data = OmegaConf.to_yaml(config, resolve=resolve) + if isinstance(f, (str, pathlib.Path)): + with io.open(os.path.abspath(f), "w", encoding="utf-8") as file: + file.write(data) + elif hasattr(f, "write"): + f.write(data) + f.flush() + else: + raise TypeError("Unexpected file type") + + @staticmethod + def from_cli(args_list: Optional[List[str]] = None) -> DictConfig: + if args_list is None: + # Skip program name + args_list = sys.argv[1:] + return OmegaConf.from_dotlist(args_list) + + @staticmethod + def from_dotlist(dotlist: List[str]) -> DictConfig: + """ + Creates config from the content sys.argv or from the specified args list of not None + + :param dotlist: A list of dotlist-style strings, e.g. ``["foo.bar=1", "baz=qux"]``. + :return: A ``DictConfig`` object created from the dotlist. + """ + conf = OmegaConf.create() + conf.merge_with_dotlist(dotlist) + return conf + + @staticmethod + def merge( + *configs: Union[ + DictConfig, + ListConfig, + Dict[DictKeyType, Any], + List[Any], + Tuple[Any, ...], + Any, + ], + ) -> Union[ListConfig, DictConfig]: + """ + Merge a list of previously created configs into a single one + + :param configs: Input configs + :return: the merged config object. + """ + assert len(configs) > 0 + target = copy.deepcopy(configs[0]) + target = _ensure_container(target) + assert isinstance(target, (DictConfig, ListConfig)) + + with flag_override(target, "readonly", False): + target.merge_with(*configs[1:]) + turned_readonly = target._get_flag("readonly") is True + + if turned_readonly: + OmegaConf.set_readonly(target, True) + + return target + + @staticmethod + def unsafe_merge( + *configs: Union[ + DictConfig, + ListConfig, + Dict[DictKeyType, Any], + List[Any], + Tuple[Any, ...], + Any, + ], + ) -> Union[ListConfig, DictConfig]: + """ + Merge a list of previously created configs into a single one + This is much faster than OmegaConf.merge() as the input configs are not copied. + However, the input configs must not be used after this operation as will become inconsistent. + + :param configs: Input configs + :return: the merged config object. + """ + assert len(configs) > 0 + target = configs[0] + target = _ensure_container(target) + assert isinstance(target, (DictConfig, ListConfig)) + + with flag_override( + target, ["readonly", "no_deepcopy_set_nodes"], [False, True] + ): + target.merge_with(*configs[1:]) + turned_readonly = target._get_flag("readonly") is True + + if turned_readonly: + OmegaConf.set_readonly(target, True) + + return target + + @staticmethod + def register_resolver(name: str, resolver: Resolver) -> None: + warnings.warn( + dedent( + """\ + register_resolver() is deprecated. + See https://github.com/omry/omegaconf/issues/426 for migration instructions. + """ + ), + stacklevel=2, + ) + return OmegaConf.legacy_register_resolver(name, resolver) + + # This function will eventually be deprecated and removed. 
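+    # Kept for backward compatibility; new code should use register_new_resolver().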
+ @staticmethod + def legacy_register_resolver(name: str, resolver: Resolver) -> None: + assert callable(resolver), "resolver must be callable" + # noinspection PyProtectedMember + assert ( + name not in BaseContainer._resolvers + ), f"resolver '{name}' is already registered" + + def resolver_wrapper( + config: BaseContainer, + parent: BaseContainer, + node: Node, + args: Tuple[Any, ...], + args_str: Tuple[str, ...], + ) -> Any: + cache = OmegaConf.get_cache(config)[name] + # "Un-escape " spaces and commas. + args_unesc = [x.replace(r"\ ", " ").replace(r"\,", ",") for x in args_str] + + # Nested interpolations behave in a potentially surprising way with + # legacy resolvers (they remain as strings, e.g., "${foo}"). If any + # input looks like an interpolation we thus raise an exception. + try: + bad_arg = next(i for i in args_unesc if "${" in i) + except StopIteration: + pass + else: + raise ValueError( + f"Resolver '{name}' was called with argument '{bad_arg}' that appears " + f"to be an interpolation. Nested interpolations are not supported for " + f"resolvers registered with `[legacy_]register_resolver()`, please use " + f"`register_new_resolver()` instead (see " + f"https://github.com/omry/omegaconf/issues/426 for migration instructions)." + ) + key = args_str + val = cache[key] if key in cache else resolver(*args_unesc) + cache[key] = val + return val + + # noinspection PyProtectedMember + BaseContainer._resolvers[name] = resolver_wrapper + + @staticmethod + def register_new_resolver( + name: str, + resolver: Resolver, + *, + replace: bool = False, + use_cache: bool = False, + ) -> None: + """ + Register a resolver. + + :param name: Name of the resolver. + :param resolver: Callable whose arguments are provided in the interpolation, + e.g., with ${foo:x,0,${y.z}} these arguments are respectively "x" (str), + 0 (int) and the value of ``y.z``. + :param replace: If set to ``False`` (default), then a ``ValueError`` is raised if + an existing resolver has already been registered with the same name. + If set to ``True``, then the new resolver replaces the previous one. + NOTE: The cache on existing config objects is not affected, use + ``OmegaConf.clear_cache(cfg)`` to clear it. + :param use_cache: Whether the resolver's outputs should be cached. The cache is + based only on the string literals representing the resolver arguments, e.g., + ${foo:${bar}} will always return the same value regardless of the value of + ``bar`` if the cache is enabled for ``foo``. + """ + if not callable(resolver): + raise TypeError("resolver must be callable") + if not name: + raise ValueError("cannot use an empty resolver name") + + if not replace and OmegaConf.has_resolver(name): + raise ValueError(f"resolver '{name}' is already registered") + + try: + sig: Optional[inspect.Signature] = inspect.signature(resolver) + except ValueError: + sig = None + + def _should_pass(special: str) -> bool: + ret = sig is not None and special in sig.parameters + if ret and use_cache: + raise ValueError( + f"use_cache=True is incompatible with functions that receive the {special}" + ) + return ret + + pass_parent = _should_pass("_parent_") + pass_node = _should_pass("_node_") + pass_root = _should_pass("_root_") + + def resolver_wrapper( + config: BaseContainer, + parent: Container, + node: Node, + args: Tuple[Any, ...], + args_str: Tuple[str, ...], + ) -> Any: + if use_cache: + cache = OmegaConf.get_cache(config)[name] + try: + return cache[args_str] + except KeyError: + pass + + # Call resolver. 
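+            # Forward only the special parameters detected in the resolver's
+            # signature above (_parent_ / _node_ / _root_) as keyword arguments.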
+            kwargs: Dict[str, Node] = {}
+            if pass_parent:
+                kwargs["_parent_"] = parent
+            if pass_node:
+                kwargs["_node_"] = node
+            if pass_root:
+                kwargs["_root_"] = config
+
+            ret = resolver(*args, **kwargs)
+
+            if use_cache:
+                cache[args_str] = ret
+            return ret
+
+        # noinspection PyProtectedMember
+        BaseContainer._resolvers[name] = resolver_wrapper
+
+    @classmethod
+    def has_resolver(cls, name: str) -> bool:
+        return cls._get_resolver(name) is not None
+
+    # noinspection PyProtectedMember
+    @staticmethod
+    def clear_resolvers() -> None:
+        """
+        Clear (remove) all OmegaConf resolvers, then re-register OmegaConf's default resolvers.
+        """
+        BaseContainer._resolvers = {}
+        register_default_resolvers()
+
+    @classmethod
+    def clear_resolver(cls, name: str) -> bool:
+        """
+        Clear (remove) a resolver, but only if it exists.
+
+        Returns a bool: True if the resolver was removed and False otherwise.
+
+        .. warning:
+            This method can remove default resolvers as well.
+
+        :param name: Name of the resolver.
+        :return: A bool (``True`` if the resolver was removed, ``False`` if it was not found).
+        """
+        if cls.has_resolver(name):
+            BaseContainer._resolvers.pop(name)
+            return True
+        else:
+            # return False if resolver does not exist
+            return False
+
+    @staticmethod
+    def get_cache(conf: BaseContainer) -> Dict[str, Any]:
+        return conf._metadata.resolver_cache
+
+    @staticmethod
+    def set_cache(conf: BaseContainer, cache: Dict[str, Any]) -> None:
+        conf._metadata.resolver_cache = copy.deepcopy(cache)
+
+    @staticmethod
+    def clear_cache(conf: BaseContainer) -> None:
+        OmegaConf.set_cache(conf, defaultdict(dict, {}))
+
+    @staticmethod
+    def copy_cache(from_config: BaseContainer, to_config: BaseContainer) -> None:
+        OmegaConf.set_cache(to_config, OmegaConf.get_cache(from_config))
+
+    @staticmethod
+    def set_readonly(conf: Node, value: Optional[bool]) -> None:
+        # noinspection PyProtectedMember
+        conf._set_flag("readonly", value)
+
+    @staticmethod
+    def is_readonly(conf: Node) -> Optional[bool]:
+        # noinspection PyProtectedMember
+        return conf._get_flag("readonly")
+
+    @staticmethod
+    def set_struct(conf: Container, value: Optional[bool]) -> None:
+        # noinspection PyProtectedMember
+        conf._set_flag("struct", value)
+
+    @staticmethod
+    def is_struct(conf: Container) -> Optional[bool]:
+        # noinspection PyProtectedMember
+        return conf._get_flag("struct")
+
+    @staticmethod
+    def masked_copy(conf: DictConfig, keys: Union[str, List[str]]) -> DictConfig:
+        """
+        Create a masked copy of this config that contains a subset of the keys
+
+        :param conf: DictConfig object
+        :param keys: keys to preserve in the copy
+        :return: The masked ``DictConfig`` object.
+        """
+        from .dictconfig import DictConfig
+
+        if not isinstance(conf, DictConfig):
+            raise ValueError("masked_copy is only supported for DictConfig")
+
+        if isinstance(keys, str):
+            keys = [keys]
+        content = {key: value for key, value in conf.items_ex(resolve=False, keys=keys)}
+        return DictConfig(content=content)
+
+    @staticmethod
+    def to_container(
+        cfg: Any,
+        *,
+        resolve: bool = False,
+        throw_on_missing: bool = False,
+        enum_to_str: bool = False,
+        structured_config_mode: SCMode = SCMode.DICT,
+    ) -> Union[Dict[DictKeyType, Any], List[Any], None, str, Any]:
+        """
+        Recursively converts an OmegaConf config to a primitive container (dict or list).
+
+        :param cfg: the config to convert
+        :param resolve: True to resolve all values
+        :param throw_on_missing: When True, raise MissingMandatoryValue if any missing values are present.
When False (the default), replace missing values with the string "???" in the output container.
+        :param enum_to_str: True to convert Enum keys and values to strings
+        :param structured_config_mode: Specify how Structured Configs (DictConfigs backed by a dataclass) are handled.
+            - By default (``structured_config_mode=SCMode.DICT``) structured configs are converted to plain dicts.
+            - If ``structured_config_mode=SCMode.DICT_CONFIG``, structured config nodes will remain as DictConfig.
+            - If ``structured_config_mode=SCMode.INSTANTIATE``, this function will instantiate structured configs
+              (DictConfigs backed by a dataclass), by creating an instance of the underlying dataclass.
+
+            See also OmegaConf.to_object.
+        :return: A dict or a list representing this config as a primitive container.
+        """
+        if not OmegaConf.is_config(cfg):
+            raise ValueError(
+                f"Input cfg is not an OmegaConf config object ({type_str(type(cfg))})"
+            )
+
+        return BaseContainer._to_content(
+            cfg,
+            resolve=resolve,
+            throw_on_missing=throw_on_missing,
+            enum_to_str=enum_to_str,
+            structured_config_mode=structured_config_mode,
+        )
+
+    @staticmethod
+    def to_object(cfg: Any) -> Union[Dict[DictKeyType, Any], List[Any], None, str, Any]:
+        """
+        Recursively converts an OmegaConf config to a primitive container (dict or list).
+        Any DictConfig objects backed by dataclasses or attrs classes are instantiated
+        as instances of those backing classes.
+
+        This is an alias for OmegaConf.to_container(..., resolve=True, throw_on_missing=True,
+        structured_config_mode=SCMode.INSTANTIATE)
+
+        :param cfg: the config to convert
+        :return: A dict or a list or dataclass representing this config.
+        """
+        return OmegaConf.to_container(
+            cfg=cfg,
+            resolve=True,
+            throw_on_missing=True,
+            enum_to_str=False,
+            structured_config_mode=SCMode.INSTANTIATE,
+        )
+
+    @staticmethod
+    def is_missing(cfg: Any, key: DictKeyType) -> bool:
+        assert isinstance(cfg, Container)
+        try:
+            node = cfg._get_child(key)
+            if node is None:
+                return False
+            assert isinstance(node, Node)
+            return node._is_missing()
+        except (UnsupportedInterpolationType, KeyError, AttributeError):
+            return False
+
+    @staticmethod
+    def is_interpolation(node: Any, key: Optional[Union[int, str]] = None) -> bool:
+        if key is not None:
+            assert isinstance(node, Container)
+            target = node._get_child(key)
+        else:
+            target = node
+        if target is not None:
+            assert isinstance(target, Node)
+            return target._is_interpolation()
+        return False
+
+    @staticmethod
+    def is_list(obj: Any) -> bool:
+        from . import ListConfig
+
+        return isinstance(obj, ListConfig)
+
+    @staticmethod
+    def is_dict(obj: Any) -> bool:
+        from . import DictConfig
+
+        return isinstance(obj, DictConfig)
+
+    @staticmethod
+    def is_config(obj: Any) -> bool:
+        from . 
import Container + + return isinstance(obj, Container) + + @staticmethod + def get_type(obj: Any, key: Optional[str] = None) -> Optional[Type[Any]]: + if key is not None: + c = obj._get_child(key) + else: + c = obj + return OmegaConf._get_obj_type(c) + + @staticmethod + def select( + cfg: Container, + key: str, + *, + default: Any = _DEFAULT_MARKER_, + throw_on_resolution_failure: bool = True, + throw_on_missing: bool = False, + ) -> Any: + """ + :param cfg: Config node to select from + :param key: Key to select + :param default: Default value to return if key is not found + :param throw_on_resolution_failure: Raise an exception if an interpolation + resolution error occurs, otherwise return None + :param throw_on_missing: Raise an exception if an attempt to select a missing key (with the value '???') + is made, otherwise return None + :return: selected value or None if not found. + """ + from ._impl import select_value + + try: + return select_value( + cfg=cfg, + key=key, + default=default, + throw_on_resolution_failure=throw_on_resolution_failure, + throw_on_missing=throw_on_missing, + ) + except Exception as e: + format_and_raise(node=cfg, key=key, value=None, cause=e, msg=str(e)) + + @staticmethod + def update( + cfg: Container, + key: str, + value: Any = None, + *, + merge: bool = True, + force_add: bool = False, + ) -> None: + """ + Updates a dot separated key sequence to a value + + :param cfg: input config to update + :param key: key to update (can be a dot separated path) + :param value: value to set, if value if a list or a dict it will be merged or set + depending on merge_config_values + :param merge: If value is a dict or a list, True (default) to merge + into the destination, False to replace the destination. + :param force_add: insert the entire path regardless of Struct flag or Structured Config nodes. + """ + + split = split_key(key) + root = cfg + for i in range(len(split) - 1): + k = split[i] + # if next_root is a primitive (string, int etc) replace it with an empty map + next_root, key_ = _select_one(root, k, throw_on_missing=False) + if not isinstance(next_root, Container): + if force_add: + with flag_override(root, "struct", False): + root[key_] = {} + else: + root[key_] = {} + root = root[key_] + + last = split[-1] + + assert isinstance( + root, Container + ), f"Unexpected type for root: {type(root).__name__}" + + last_key: Union[str, int] = last + if isinstance(root, ListConfig): + last_key = int(last) + + ctx = flag_override(root, "struct", False) if force_add else nullcontext() + with ctx: + if merge and (OmegaConf.is_config(value) or is_primitive_container(value)): + assert isinstance(root, BaseContainer) + node = root._get_child(last_key) + if OmegaConf.is_config(node): + assert isinstance(node, BaseContainer) + node.merge_with(value) + return + + if OmegaConf.is_dict(root): + assert isinstance(last_key, str) + root.__setattr__(last_key, value) + elif OmegaConf.is_list(root): + assert isinstance(last_key, int) + root.__setitem__(last_key, value) + else: + assert False + + @staticmethod + def to_yaml(cfg: Any, *, resolve: bool = False, sort_keys: bool = False) -> str: + """ + returns a yaml dump of this config object. + + :param cfg: Config object, Structured Config type or instance + :param resolve: if True, will return a string with the interpolations resolved, otherwise + interpolations are preserved + :param sort_keys: If True, will print dict keys in sorted order. default False. + :return: A string containing the yaml representation. 
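+        Example: ``OmegaConf.to_yaml(OmegaConf.create({"a": 1}))`` returns ``"a: 1\n"``.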
+ """ + cfg = _ensure_container(cfg) + container = OmegaConf.to_container(cfg, resolve=resolve, enum_to_str=True) + return yaml.dump( # type: ignore + container, + default_flow_style=False, + allow_unicode=True, + sort_keys=sort_keys, + Dumper=get_omega_conf_dumper(), + ) + + @staticmethod + def resolve(cfg: Container) -> None: + """ + Resolves all interpolations in the given config object in-place. + + :param cfg: An OmegaConf container (DictConfig, ListConfig) + Raises a ValueError if the input object is not an OmegaConf container. + """ + import omegaconf._impl + + if not OmegaConf.is_config(cfg): + # Since this function is mutating the input object in-place, it doesn't make sense to + # auto-convert the input object to an OmegaConf container + raise ValueError( + f"Invalid config type ({type(cfg).__name__}), expected an OmegaConf Container" + ) + omegaconf._impl._resolve(cfg) + + @staticmethod + def missing_keys(cfg: Any) -> Set[str]: + """ + Returns a set of missing keys in a dotlist style. + + :param cfg: An ``OmegaConf.Container``, + or a convertible object via ``OmegaConf.create`` (dict, list, ...). + :return: set of strings of the missing keys. + :raises ValueError: On input not representing a config. + """ + cfg = _ensure_container(cfg) + missings: Set[str] = set() + + def gather(_cfg: Container) -> None: + itr: Iterable[Any] + if isinstance(_cfg, ListConfig): + itr = range(len(_cfg)) + else: + itr = _cfg + + for key in itr: + if OmegaConf.is_missing(_cfg, key): + missings.add(_cfg._get_full_key(key)) + elif OmegaConf.is_config(_cfg[key]): + gather(_cfg[key]) + + gather(cfg) + return missings + + # === private === # + + @staticmethod + def _create_impl( # noqa F811 + obj: Any = _DEFAULT_MARKER_, + parent: Optional[BaseContainer] = None, + flags: Optional[Dict[str, bool]] = None, + ) -> Union[DictConfig, ListConfig]: + try: + from ._utils import get_yaml_loader + from .dictconfig import DictConfig + from .listconfig import ListConfig + + if obj is _DEFAULT_MARKER_: + obj = {} + if isinstance(obj, _MISSING_TYPE): + return OmegaConf.create({}, parent=parent, flags=flags) + if isinstance(obj, str): + obj = yaml.load(obj, Loader=get_yaml_loader()) + if obj is None: + return OmegaConf.create({}, parent=parent, flags=flags) + elif isinstance(obj, str): + return OmegaConf.create({obj: None}, parent=parent, flags=flags) + else: + assert isinstance(obj, (list, dict)) + return OmegaConf.create(obj, parent=parent, flags=flags) + + else: + if ( + is_primitive_dict(obj) + or OmegaConf.is_dict(obj) + or is_structured_config(obj) + or obj is None + ): + if isinstance(obj, DictConfig): + return DictConfig( + content=obj, + parent=parent, + ref_type=obj._metadata.ref_type, + is_optional=obj._metadata.optional, + key_type=obj._metadata.key_type, + element_type=obj._metadata.element_type, + flags=flags, + ) + else: + obj_type = OmegaConf.get_type(obj) + key_type, element_type = get_dict_key_value_types(obj_type) + return DictConfig( + content=obj, + parent=parent, + key_type=key_type, + element_type=element_type, + flags=flags, + ) + elif is_primitive_list(obj) or OmegaConf.is_list(obj): + if isinstance(obj, ListConfig): + return ListConfig( + content=obj, + parent=parent, + element_type=obj._metadata.element_type, + ref_type=obj._metadata.ref_type, + is_optional=obj._metadata.optional, + flags=flags, + ) + else: + obj_type = OmegaConf.get_type(obj) + element_type = get_list_element_type(obj_type) + return ListConfig( + content=obj, + parent=parent, + element_type=element_type, + ref_type=Any, + 
is_optional=True, + flags=flags, + ) + else: + if isinstance(obj, type): + raise ValidationError( + f"Input class '{obj.__name__}' is not a structured config. " + "did you forget to decorate it as a dataclass?" + ) + else: + raise ValidationError( + f"Object of unsupported type: '{type(obj).__name__}'" + ) + except OmegaConfBaseException as e: + format_and_raise(node=None, key=None, value=None, msg=str(e), cause=e) + assert False + + @staticmethod + def _get_obj_type(c: Any) -> Optional[Type[Any]]: + if is_structured_config(c): + return get_type_of(c) + elif c is None: + return None + elif isinstance(c, DictConfig): + if c._is_none(): + return None + elif c._is_missing(): + return None + else: + if is_structured_config(c._metadata.object_type): + return c._metadata.object_type + else: + return dict + elif isinstance(c, ListConfig): + return list + elif isinstance(c, ValueNode): + return type(c._value()) + elif isinstance(c, UnionNode): + return type(_get_value(c)) + elif isinstance(c, dict): + return dict + elif isinstance(c, (list, tuple)): + return list + else: + return get_type_of(c) + + @staticmethod + def _get_resolver( + name: str, + ) -> Optional[ + Callable[ + [Container, Container, Node, Tuple[Any, ...], Tuple[str, ...]], + Any, + ] + ]: + # noinspection PyProtectedMember + return ( + BaseContainer._resolvers[name] if name in BaseContainer._resolvers else None + ) + + +# register all default resolvers +register_default_resolvers() + + +@contextmanager +def flag_override( + config: Node, + names: Union[List[str], str], + values: Union[List[Optional[bool]], Optional[bool]], +) -> Generator[Node, None, None]: + if isinstance(names, str): + names = [names] + if values is None or isinstance(values, bool): + values = [values] + + prev_states = [config._get_node_flag(name) for name in names] + + try: + config._set_flag(names, values) + yield config + finally: + config._set_flag(names, prev_states) + + +@contextmanager +def read_write(config: Node) -> Generator[Node, None, None]: + prev_state = config._get_node_flag("readonly") + try: + OmegaConf.set_readonly(config, False) + yield config + finally: + OmegaConf.set_readonly(config, prev_state) + + +@contextmanager +def open_dict(config: Container) -> Generator[Container, None, None]: + prev_state = config._get_node_flag("struct") + try: + OmegaConf.set_struct(config, False) + yield config + finally: + OmegaConf.set_struct(config, prev_state) + + +# === private === # + + +def _node_wrap( + parent: Optional[Box], + is_optional: bool, + value: Any, + key: Any, + ref_type: Any = Any, +) -> Node: + node: Node + if is_dict_annotation(ref_type) or (is_primitive_dict(value) and ref_type is Any): + key_type, element_type = get_dict_key_value_types(ref_type) + node = DictConfig( + content=value, + key=key, + parent=parent, + ref_type=ref_type, + is_optional=is_optional, + key_type=key_type, + element_type=element_type, + ) + elif (is_list_annotation(ref_type) or is_tuple_annotation(ref_type)) or ( + type(value) in (list, tuple) and ref_type is Any + ): + element_type = get_list_element_type(ref_type) + node = ListConfig( + content=value, + key=key, + parent=parent, + is_optional=is_optional, + element_type=element_type, + ref_type=ref_type, + ) + elif is_structured_config(ref_type) or is_structured_config(value): + key_type, element_type = get_dict_key_value_types(value) + node = DictConfig( + ref_type=ref_type, + is_optional=is_optional, + content=value, + key=key, + parent=parent, + key_type=key_type, + element_type=element_type, + ) + elif 
is_union_annotation(ref_type):
+        node = UnionNode(
+            content=value,
+            ref_type=ref_type,
+            is_optional=is_optional,
+            key=key,
+            parent=parent,
+        )
+    elif ref_type == Any or ref_type is None:
+        node = AnyNode(value=value, key=key, parent=parent)
+    elif isinstance(ref_type, type) and issubclass(ref_type, Enum):
+        node = EnumNode(
+            enum_type=ref_type,
+            value=value,
+            key=key,
+            parent=parent,
+            is_optional=is_optional,
+        )
+    elif ref_type == int:
+        node = IntegerNode(value=value, key=key, parent=parent, is_optional=is_optional)
+    elif ref_type == float:
+        node = FloatNode(value=value, key=key, parent=parent, is_optional=is_optional)
+    elif ref_type == bool:
+        node = BooleanNode(value=value, key=key, parent=parent, is_optional=is_optional)
+    elif ref_type == str:
+        node = StringNode(value=value, key=key, parent=parent, is_optional=is_optional)
+    elif ref_type == bytes:
+        node = BytesNode(value=value, key=key, parent=parent, is_optional=is_optional)
+    elif ref_type == pathlib.Path:
+        node = PathNode(value=value, key=key, parent=parent, is_optional=is_optional)
+    else:
+        if parent is not None and parent._get_flag("allow_objects") is True:
+            if type(value) in (list, tuple):
+                node = ListConfig(
+                    content=value,
+                    key=key,
+                    parent=parent,
+                    ref_type=ref_type,
+                    is_optional=is_optional,
+                )
+            elif is_primitive_dict(value):
+                node = DictConfig(
+                    content=value,
+                    key=key,
+                    parent=parent,
+                    ref_type=ref_type,
+                    is_optional=is_optional,
+                )
+            else:
+                node = AnyNode(value=value, key=key, parent=parent)
+        else:
+            raise ValidationError(f"Unexpected type annotation: {type_str(ref_type)}")
+    return node
+
+
+def _maybe_wrap(
+    ref_type: Any,
+    key: Any,
+    value: Any,
+    is_optional: bool,
+    parent: Optional[BaseContainer],
+) -> Node:
+    # If the value is already a node, update its key and parent and return it as is.
+    # NOTE: this mutates the input node!
+ if isinstance(value, Node): + value._set_key(key) + value._set_parent(parent) + return value + else: + return _node_wrap( + ref_type=ref_type, + parent=parent, + is_optional=is_optional, + value=value, + key=key, + ) + + +def _select_one( + c: Container, key: str, throw_on_missing: bool, throw_on_type_error: bool = True +) -> Tuple[Optional[Node], Union[str, int]]: + from .dictconfig import DictConfig + from .listconfig import ListConfig + + ret_key: Union[str, int] = key + assert isinstance(c, Container), f"Unexpected type: {c}" + if c._is_none(): + return None, ret_key + + if isinstance(c, DictConfig): + assert isinstance(ret_key, str) + val = c._get_child(ret_key, validate_access=False) + elif isinstance(c, ListConfig): + assert isinstance(ret_key, str) + if not is_int(ret_key): + if throw_on_type_error: + raise TypeError( + f"Index '{ret_key}' ({type(ret_key).__name__}) is not an int" + ) + else: + val = None + else: + ret_key = int(ret_key) + if ret_key < 0 or ret_key + 1 > len(c): + val = None + else: + val = c._get_child(ret_key) + else: + assert False + + if val is not None: + assert isinstance(val, Node) + if val._is_missing(): + if throw_on_missing: + raise MissingMandatoryValue( + f"Missing mandatory value: {c._get_full_key(ret_key)}" + ) + else: + return val, ret_key + + assert val is None or isinstance(val, Node) + return val, ret_key diff --git a/models/nlp/plm/transformer/plugin/__init__.py b/models/nlp/plm/transformer/plugin/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/models/nlp/plm/transformer/plugin/build_engine.py b/models/nlp/plm/transformer/plugin/build_engine.py new file mode 100644 index 00000000..a520fe19 --- /dev/null +++ b/models/nlp/plm/transformer/plugin/build_engine.py @@ -0,0 +1,401 @@ +#!/usr/bin/env python3 +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
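+#
+# Assembles and serializes the IxRT/TensorRT engines for the WMT14 Transformer:
+# an encoder engine that also emits the cross-attention KV cache, and a
+# step-wise decoder engine driven by past_key_values inputs (see below).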
+
+import argparse
+import ctypes
+import json
+import os
+import sys
+import time
+import numpy as np
+
+import tensorrt as trt
+from builder_utils import load_onnx_weights_and_quant
+from plugin_utils import (
+    TRT_LOGGER,
+    create_decoder_emb_plugin,
+    create_encoder_emb_plugin,
+    transformer_decoder_layer,
+    transformer_encoder_layer,
+    cross_attention_kv_cache,
+    create_top1_plugin,
+    custom_fc
+)
+
+from transformer_cfg import TransformerBaseConfig
+
+
+def get_mha_dtype(config):
+    dtype = trt.float32
+    if config.use_fp16:
+        dtype = trt.float16
+    return int(dtype)
+
+
+def transformer_encoder(config, init_dict, network, input_tensor, input_mask):
+    """
+    Stack the Transformer encoder layers on top of the embedding output.
+    """
+
+    block = "encoder"
+    prev_input = input_tensor
+    for ss in range(config.num_hidden_layers):
+        out_layer = transformer_encoder_layer(
+            block, ss, config, init_dict, network, prev_input, input_mask
+        )
+        prev_input = out_layer.get_output(0)
+    return prev_input
+
+
+def transformer_decoder(
+    config,
+    init_dict,
+    network,
+    encoder_emb_out,
+    input_mask,
+    encoder_out,
+    steps,
+    kv_cache_inputs,
+    kv_cache_outputs,
+    encoder_kv_cache_inputs
+):
+    """
+    Stack the Transformer decoder layers and project to the target vocabulary.
+    """
+    prev_input = encoder_emb_out
+    block = "decoder"
+    for ss in range(config.num_hidden_layers):
+        out_layer = transformer_decoder_layer(
+            block,
+            ss,
+            config,
+            init_dict,
+            network,
+            prev_input,
+            input_mask,
+            encoder_out,
+            steps,
+            kv_cache_inputs,
+            kv_cache_outputs,
+            encoder_kv_cache_inputs
+        )
+        prev_input = out_layer.get_output(0)
+
+    decoder_output_projection_weight = init_dict[f"{block}.output_projection.weight"]
+    # The output projection uses the custom_fc plugin rather than
+    # network.add_fully_connected.
+    out_proj_layer = custom_fc(network, prev_input, config.tgt_vocab_size, decoder_output_projection_weight, None)
+
+    reshape_layer = network.add_shuffle(out_proj_layer.get_output(0))
+
+    reshape_layer.reshape_dims = trt.Dims([0, -1])  # reshape to [bsz, vocab_size]
+    decoder_blk_out = reshape_layer.get_output(0)
+    return decoder_blk_out
+
+
+def build_encoder_engine(batch_sizes, sequence_lengths, config, weights_dict):
+    explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+
+    encoder_emb_plugin = create_encoder_emb_plugin(weights_dict, config)
+
+    builder = trt.Builder(TRT_LOGGER)
+    with builder.create_network(
+        explicit_batch_flag
+    ) as network, builder.create_builder_config() as builder_config:
+
+        builder_config.set_flag(trt.BuilderFlag.FP16)
+        input_ids = network.add_input(
+            name="src_tokens", dtype=trt.int32, shape=[-1, -1]
+        )
+        MIN_SHAPE = (batch_sizes[0], sequence_lengths[0])
+        OPT_SHAPE = (batch_sizes[1], sequence_lengths[1])
+        MAX_SHAPE = (batch_sizes[2], sequence_lengths[2])
+
+        profile = builder.create_optimization_profile()
+        profile.set_shape("src_tokens", MIN_SHAPE, OPT_SHAPE, MAX_SHAPE)
+        builder_config.add_optimization_profile(profile)
+
+        #######################{transformer Encoder emb layer}#####################
+        emb_layer = network.add_plugin_v2([input_ids], encoder_emb_plugin)
+        ###########################################################################
+        embeddings = emb_layer.get_output(0)
+        mask_idx = emb_layer.get_output(1)
+
+        #######################{transformer Encoder block}#####################
+
+        encoder_out = transformer_encoder(
+            config, weights_dict, network, embeddings, mask_idx
+        )
+        #######################################################################
+
+        # Precompute the cross-attention K/V for every decoder layer from the
+        # encoder output and expose them as engine outputs.
+        for layer_index in range(config.num_hidden_layers):
range(config.num_hidden_layers): + block = "decoder" + k_cache,v_cache = cross_attention_kv_cache(block, layer_index, config, weights_dict, network, encoder_out) + + k_cache.name = f"past_key_values.{layer_index}.encoder.key" + network.mark_output(k_cache) + k_cache.dtype = trt.float16 + + v_cache.name = f"past_key_values.{layer_index}.encoder.value" + network.mark_output(v_cache) + v_cache.dtype = trt.float16 + + mask_idx.name = "mask" + network.mark_output(mask_idx) + mask_idx.dtype = trt.int32 + + plan = builder.build_serialized_network(network, builder_config) + + return plan + + +def build_engine_decoder(batch_sizes, sequence_lengths, config, weights_dict): + explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + + builder = trt.Builder(TRT_LOGGER) + decoder_emb_plugin = create_decoder_emb_plugin(weights_dict) + + MIN_BSZ = batch_sizes[0] + OPT_BSZ = batch_sizes[1] + MAX_BSZ = batch_sizes[2] + + MIN_LEN = sequence_lengths[0] + OPT_LEN = sequence_lengths[1] + MAX_LEN = sequence_lengths[2] + + with builder.create_network( + explicit_batch_flag + ) as network, builder.create_builder_config() as builder_config: + builder_config.set_flag(trt.BuilderFlag.FP16) + + ###################IxinferDecFormatEncOutput + + token_id = network.add_input( + "token_id", dtype=trt.int32, shape=(-1, 1) + ) # [bsz,1] + steps = network.add_input("steps", dtype=trt.int32, shape=(1,)) # [1,1] + mask = network.add_input( + "mask", dtype=trt.int32, shape=(-1, -1) + ) # [bsz,seq_len] + +############################################################################################ + kv_cache_inputs = {} # past_key_values + kv_cache_outputs = {} # present_key_values + + for i in range(config.num_hidden_layers): + k_cache_name = f"past_key_values.{i}.decoder.key" + v_cache_name = f"past_key_values.{i}.decoder.value" + k_cache_input = network.add_input( + k_cache_name, + dtype=trt.float16, + shape=( + -1, + config.num_attention_heads, + -1, + config.head_size, + ), # (bsz, config.num_attention_heads, steps, config.head_size) + ) + v_cache_input = network.add_input( + v_cache_name, + dtype=trt.float16, + shape=(-1, config.num_attention_heads, -1, config.head_size), + ) + kv_cache_inputs[k_cache_name] = k_cache_input + kv_cache_inputs[v_cache_name] = v_cache_input + + profile = builder.create_optimization_profile() + for i in range(config.num_hidden_layers): + k_cache_name = f"past_key_values.{i}.decoder.key" + v_cache_name = f"past_key_values.{i}.decoder.value" + profile.set_shape( + k_cache_name, + (MIN_BSZ, config.num_attention_heads, 0, config.head_size), #0 fist step kv cache don't concat + (OPT_BSZ, config.num_attention_heads, OPT_LEN, config.head_size), + (MAX_BSZ, config.num_attention_heads, MAX_LEN, config.head_size), + ) + profile.set_shape( + v_cache_name, + (MIN_BSZ, config.num_attention_heads, 0, config.head_size), #0 fist step kv cache don't concat + (OPT_BSZ, config.num_attention_heads, OPT_LEN, config.head_size), + (MAX_BSZ, config.num_attention_heads, MAX_LEN, config.head_size), + ) + +############################################################################################ + + encoder_kv_cache_inputs = {} + #cross attention kv cache + for i in range(config.num_hidden_layers): + k_cache_name = f"past_key_values.{i}.encoder.key" + v_cache_name = f"past_key_values.{i}.encoder.value" + k_cache_input = network.add_input( + k_cache_name, + dtype=trt.float16, + shape=( + -1, + config.num_attention_heads, + -1, + config.head_size, + ), # (bsz, config.num_attention_heads, steps, 
config.head_size) + ) + v_cache_input = network.add_input( + v_cache_name, + dtype=trt.float16, + shape=(-1, config.num_attention_heads, -1, config.head_size), + ) + encoder_kv_cache_inputs[k_cache_name] = k_cache_input + encoder_kv_cache_inputs[v_cache_name] = v_cache_input + + + for i in range(config.num_hidden_layers): + k_cache_name = f"past_key_values.{i}.encoder.key" + v_cache_name = f"past_key_values.{i}.encoder.value" + profile.set_shape( + k_cache_name, + (MIN_BSZ, config.num_attention_heads, 1, config.head_size), + (OPT_BSZ, config.num_attention_heads, OPT_LEN, config.head_size), + (MAX_BSZ, config.num_attention_heads, MAX_LEN, config.head_size), + ) + profile.set_shape( + v_cache_name, + (MIN_BSZ, config.num_attention_heads, 1, config.head_size), + (OPT_BSZ, config.num_attention_heads, OPT_LEN, config.head_size), + (MAX_BSZ, config.num_attention_heads, MAX_LEN, config.head_size), + ) + + + +########################################################################################3### + profile.set_shape("token_id", (MIN_BSZ, 1), (OPT_BSZ, 1), (MAX_BSZ, 1)) + profile.set_shape( + "mask", (MIN_BSZ, MIN_LEN), (OPT_BSZ, OPT_LEN), (MAX_BSZ, MAX_LEN) + ) + builder_config.add_optimization_profile(profile) + + encoder_reshape_out = None + + ############################## decodr + encoder_emb_layer = network.add_plugin_v2([token_id, steps], decoder_emb_plugin) + encoder_emb_out = encoder_emb_layer.get_output(0) + + ############################## + + decoder_out = transformer_decoder( + config, + weights_dict, + network, + encoder_emb_out, + mask, + encoder_reshape_out, + steps, + kv_cache_inputs, + kv_cache_outputs, + encoder_kv_cache_inputs + ) + + # top1_layer = network.add_topk( + # decoder_out, op=trt.TopKOperation.MAX, k=1, axes=2 + # ) + + top1_plg = create_top1_plugin() + top1_layer = network.add_plugin_v2([decoder_out], top1_plg) + token_out = top1_layer.get_output(0) + token_out.dtype = trt.int32 + token_out.name = "decoder_id" + network.mark_output(token_out) + + for i in range(config.num_hidden_layers): + k_cache_name = f"present_key_values.{i}.decoder.key" + v_cache_name = f"present_key_values.{i}.decoder.value" + key_out = kv_cache_outputs[k_cache_name] + key_out.name = k_cache_name + network.mark_output(key_out) + key_out.dtype = trt.float16 + + value_out = kv_cache_outputs[v_cache_name] + value_out.name = v_cache_name + network.mark_output(value_out) + value_out.dtype = trt.float16 + plan = builder.build_serialized_network(network, builder_config) + + return plan + + + +def main(): + parser = argparse.ArgumentParser( + description="TensorRT Transformer Base Sample", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + + parser.add_argument( + "--model_dir", + default="/inferencesamples/data/checkpoints/transformer/wmt14.en-fr.joined-dict.transformer/", + help="The ONNX model file path.", + ) + + parser.add_argument( + "--batch_size", + default=[1, 64, 128], # min,opt,max + action="append", + help="Batch size(s) to optimize for", + type=int, + ) + parser.add_argument( + "--sequence_length", + default=[1, 64, 257], # min,opt,max + action="append", + help="Sequence length of the transformer model", + type=int, + ) + + + args = parser.parse_args() + config_path = os.path.join(args.model_dir, "transformer_config.json") + config = TransformerBaseConfig(config_path) + onnx_path = os.path.join(args.model_dir, "transformer.onnx") + weights_dict = load_onnx_weights_and_quant(onnx_path, config) + + + encoder_path = os.path.join(args.model_dir, "Encoder.engine") + with 
build_encoder_engine( + args.batch_size, args.sequence_length, config, weights_dict + ) as serialized_engine: + print("Saving Engine to {:}".format(encoder_path)) + with open(encoder_path, "wb") as fout: + fout.write(serialized_engine) + print("Serializing Encoder Done.") + + decoder_path = os.path.join(args.model_dir, "Decoder.engine") + + + with build_engine_decoder( + args.batch_size, args.sequence_length, config, weights_dict + ) as serialized_engine: + print("Saving Engine to {:}".format(decoder_path)) + + with open(decoder_path, "wb") as fout: + fout.write(serialized_engine) + print("Serializing Decoder Done.") + + +if __name__ == "__main__": + main() diff --git a/models/nlp/plm/transformer/plugin/builder_utils.py b/models/nlp/plm/transformer/plugin/builder_utils.py new file mode 100644 index 00000000..38a3efd4 --- /dev/null +++ b/models/nlp/plm/transformer/plugin/builder_utils.py @@ -0,0 +1,323 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# + + +import onnx +import numpy as np +import tensorrt as trt +import json +import torch + +TRT_LOGGER = trt.Logger(trt.Logger.INFO) + + +def reformat_weight_name(name): + + i = name[1] + #emb + if name.find("pos_emb_weight") !=-1: + return name + + if name.find("token_emb_weight") !=-1: + return name + + if name.find("enc_token_emb_weight") !=-1: + return name + + if name.find("enc_pos_emb_weight") !=-1: + return name +################################################################# +#enccoder layer weights + #self atten to_q、to_q、to_v compute together + if name.find("enc_self_attn_qkv_weight") !=-1: + return f"encoder.layers.{i}.self_attn.qkv_proj.weight" + if name.find("enc_self_attn_qkv_bias") !=-1: + return f"encoder.layers.{i}.self_attn.qkv_proj.bias" + + + + if name.find("enc_self_attn_out_proj_weight") !=-1: + return f"encoder.layers.{i}.self_attn.out_proj.weight" + if name.find("enc_self_attn_out_proj_bias") !=-1: + return f"encoder.layers.{i}.self_attn.out_proj.bias" + + + if name.find("enc_self_attn_ln_weight") !=-1: + return f"encoder.layers.{i}.self_attn_layer_norm.weight" + if name.find("enc_self_attn_ln_bias") !=-1: + return f"encoder.layers.{i}.self_attn_layer_norm.bias" + + #ffn + if name.find("enc_ff1_weight") !=-1: + return f'encoder.layers.{i}.fc1.weight' + if name.find("enc_ff1_bias") !=-1: + return f'encoder.layers.{i}.fc1.bias' + + if name.find("enc_ff2_weight") !=-1: + return f'encoder.layers.{i}.fc2.weight' + if name.find("enc_ff2_bias") !=-1: + return f'encoder.layers.{i}.fc2.bias' + + + #layernorm + if name.find("enc_final_ln_weight") !=-1: + return f"encoder.layers.{i}.final_layer_norm.weight" + if name.find("enc_final_ln_bias") !=-1: + return f"encoder.layers.{i}.final_layer_norm.bias" + + + +#################################################################### +#Decoder layer self attention weights + + #self attention + + #self atten to_q、to_q、to_v compute together + if name.find("self_attn_qkv_proj_weight") !=-1: + 
return f"decoder.layers.{i}.self_attn.qkv_proj.weight" + if name.find("self_attn_qkv_proj_bias") !=-1: + return f"decoder.layers.{i}.self_attn.qkv_proj.bias" + + + #self attention proj out + if name.find("self_attn_out_proj_weight") !=-1: + return f"decoder.layers.{i}.self_attn.out_proj.weight" + if name.find("self_attn_out_proj_bias") !=-1: + return f"decoder.layers.{i}.self_attn.out_proj.bias" + + #layernorm + if name.find("self_attn_ln_weight") !=-1: + return f"decoder.layers.{i}.self_attn_layer_norm.weight" + if name.find("self_attn_ln_bias") !=-1: + return f"decoder.layers.{i}.self_attn_layer_norm.bias" + +######################################################################## +######################################################################## +#Decoder layer cross attention weights + + #self atten to_q、to_q、to_v compute split + #to q + if name.find("enc_attn_q_proj_weight") !=-1: + return f'decoder.layers.{i}.encoder_attn.q_proj.weight' + if name.find("enc_attn_q_proj_bias") !=-1: + return f'decoder.layers.{i}.encoder_attn.q_proj.bias' + + #to_kv split affter + if name.find("enc_attn_kv_proj_weight") !=-1: + return f'decoder.layers.{i}.encoder_attn.kv_proj.weight' + if name.find("enc_attn_kv_proj_bias") !=-1: + return f'decoder.layers.{i}.encoder_attn.kv_proj.bias' + + if name.find("enc_attn_out_proj_weight") !=-1: + return f'decoder.layers.{i}.encoder_attn.out_proj.weight' + if name.find("enc_attn_out_proj_bias") !=-1: + return f'decoder.layers.{i}.encoder_attn.out_proj.bias' + + #layernorm + if name.find("enc_attn_ln_weight") !=-1: + return f'decoder.layers.{i}.encoder_attn_layer_norm.weight' + if name.find("enc_attn_ln_bias") !=-1: + return f'decoder.layers.{i}.encoder_attn_layer_norm.bias' +######################################################################## + #ffn + if name.find("ff1_weight") !=-1: + return f'decoder.layers.{i}.fc1.weight' + if name.find("ff1_bias") !=-1: + return f'decoder.layers.{i}.fc1.bias' + + if name.find("ff2_weight") !=-1: + return f'decoder.layers.{i}.fc2.weight' + if name.find("ff2_bias") !=-1: + return f'decoder.layers.{i}.fc2.bias' + + #layernorm + if name.find("final_ln_weight") !=-1: + return f"decoder.layers.{i}.final_layer_norm.weight" + if name.find("final_ln_bias") !=-1: + return f"decoder.layers.{i}.final_layer_norm.bias" + +############################################################# + if name.find("linear_weight") !=-1: + return f"decoder.output_projection.weight" + + + + else: + return None + +def get_onnx_weight_dict(tensor_dict, config): + N = config.num_attention_heads + H = config.head_size + hidden_size = config.hidden_size + + weights_dict = dict() + + for name , tensor in tensor_dict.items(): + + update_name = reformat_weight_name(name) + if update_name is None: + continue + if update_name.find("encoder_attn.kv_proj.bias") !=-1: + k_bias = tensor[:1024] + v_bias = tensor[1024:] + temp_bias_name = update_name.replace("encoder_attn.kv_proj.bias","") + k_bias_name = temp_bias_name+ "encoder_attn.k_proj.bias" + v_bias_name = temp_bias_name+ "encoder_attn.v_proj.bias" + weights_dict[k_bias_name] = np.ascontiguousarray(k_bias).flatten().astype(np.float32) + weights_dict[v_bias_name] = np.ascontiguousarray(v_bias).flatten().astype(np.float32) + + + elif update_name.find("encoder_attn.kv_proj.weight")!=-1: + k_weight = tensor[:1024] + v_weight = tensor[1024:] + temp_weight_name = update_name.replace("encoder_attn.kv_proj.weight","") + k_weight_name = temp_weight_name+"encoder_attn.k_proj.weight" + v_weight_name = 
temp_weight_name+"encoder_attn.v_proj.weight" + weights_dict[k_weight_name] = np.ascontiguousarray(k_weight).flatten().astype(np.float32) + weights_dict[v_weight_name] = np.ascontiguousarray(v_weight).flatten().astype(np.float32) + + if update_name.find("self_attn.qkv_proj.bias") !=-1 and update_name.find("decoder.layers") !=-1: + temp_bias_name = update_name.replace("self_attn.qkv_proj.bias","") + qkv_bias_name = temp_bias_name+ "self_attn.qkv_proj.bias" + weights_dict[qkv_bias_name] = np.ascontiguousarray(tensor).flatten().astype(np.float32) + + elif update_name.find("self_attn.qkv_proj.weight") !=-1 and update_name.find("decoder.layers") !=-1: + temp_weight_name = update_name.replace("self_attn.qkv_proj.weight","") + qkv_weight_name = temp_weight_name+"self_attn.qkv_proj.weight" + weights_dict[qkv_weight_name] = np.ascontiguousarray(tensor).flatten().astype(np.float32) + + else: + flat_tensor = np.ascontiguousarray(tensor).flatten().astype(np.float32) + weights_dict[update_name] = flat_tensor + + return weights_dict + +def onnx_to_trt_name(onnx_name): + """ + Converting variables in the onnx checkpoint to names corresponding to the naming convention used in the TF version, expected by the builder + """ + qkv_strings = {'key', 'value', 'query', 'query_key_value'} + onnx_name = onnx_name.lower() + toks = [t.strip('_') for t in onnx_name.split('.')] + if toks[0] == 'bert': #embeddings or encoder + if toks[1] == 'encoder': #transformer + # Token conversions for sparse checkpoints + if toks[-2] == 'dense_act': + toks[-2] = 'dense' + elif toks[-3] == 'dense_act': + if toks[-2] == 'input_quantizer': + toks[-2] = 'input' + elif toks[-2] == 'weight_quantizer': + toks[-2] = 'kernel' + toks[-3] = 'dense' + elif toks[-2].startswith('matmul'): + toks[-2] = { + 'matmul_q_quantizer': 'qv_a_input_quantizer', + 'matmul_k_quantizer': 'qv_b_input_quantizer', + 'matmul_v_quantizer': 'av_b_input_quantizer', + 'matmul_a_quantizer': 'av_a_input_quantizer', + }[toks[-2].replace('input_', '')] + + # Token conversions for all checkpoints + if toks[-2] == 'layernorm': #bias->beta, weight->gamma + toks[-1] = 'beta' if toks[-1] == 'bias' else 'gamma' + elif (toks[-2] == 'dense' or toks[-2] in qkv_strings) and toks[-1] == 'weight': + toks[-1] = 'kernel' + elif (toks[-3] == 'dense' or toks[-3] in qkv_strings) and toks[-1] == 'amax': + if toks[-2] == 'weight_quantizer': + toks[-2] = 'kernel' + elif toks[-2] == 'input_quantizer': + toks[-2] = 'input' + + if 'final_input_quantizer' not in toks[2]: + ind = toks.index('layers')+1 if 'layers' in toks else 3 + toks = toks[ind:] + toks[0] = 'l{}'.format(int(toks[0])) + else: + if toks[-2] == 'layernorm': #bias->beta, weight->gamma + toks[-1] = 'beta' if toks[-1] == 'bias' else 'gamma' + else: #embeddings: drop "_weight" suffix + if toks[-1] == 'amax': + toks[-2] = 'amax' + toks = toks[:-1] + elif 'qa' in onnx_name: + name = 'cls_squad_output_bias' if toks[-1] == 'bias' else 'cls_squad_output_weights' + return name + else: + print("Encountered unknown case:", onnx_name) + assert(False) + parsed = '_'.join(toks) + return parsed + +def load_onnx_weights_and_quant(path, config): + """ + Load the weights from the onnx checkpoint + """ + model = onnx.load(path) + weights = model.graph.initializer + tensor_dict = dict((w.name, np.frombuffer(w.raw_data, np.float16).reshape(w.dims)) + for w in weights) + return get_onnx_weight_dict(tensor_dict, config) + +def load_pytorch_weights_and_quant(path, config): + """ + Load the weights from the pytorch checkpoint + """ + state_dict = 
torch.load(path, map_location='cpu')["model"] + tensor_dict = {onnx_to_trt_name(name):val.numpy() for name, val in state_dict.items()} + return get_onnx_weight_dict(tensor_dict, config) + +class transformerBaseConfig: + def __init__(self, bert_config_path, use_fp16, use_int8=False): + with open(bert_config_path, "r") as f: + data = json.load(f) + self.num_attention_heads = data["num_attention_heads"] + self.hidden_size = data["hidden_size"] + self.intermediate_size = data["intermediate_size"] + self.num_hidden_layers = data["num_hidden_layers"] + self.head_size = self.hidden_size // self.num_attention_heads + self.use_fp16 = use_fp16 + self.use_int8 = use_int8 + +if __name__ == '__main__': + config_path = './wmt14_en_de/transformer_config.json' + onnx_model_path = './wmt14_en_de/transformer.onnx' + weight_save_path = "./wmt14_en_de/transformer.wts" + config = config = transformerBaseConfig(config_path, True) + weights_dict = load_onnx_weights_and_quant(onnx_model_path, config) + + for tensor_name, tensor in weights_dict.items(): + print(tensor_name,":",tensor.shape) + + + + # f = open(weight_save_path, "w") + # num = 0 + # for key, value in weights_dict.items(): + # if key.find('_amax') == -1: + # num += 1 + + # f.write('{}\n'.format(num)) + # for key, value in weights_dict.items(): + # print('key: ', key) + # if key.find('_amax') != -1: + # continue + # f.write("{} {}".format(key, len(value))) + # print(len(value)) + # for v in value: + # f.write(" ") + # f.write(struct.pack('>f', float(v)).hex()) + # f.write("\n") diff --git a/models/nlp/plm/transformer/plugin/load_ixrt_plugin.py b/models/nlp/plm/transformer/plugin/load_ixrt_plugin.py new file mode 100644 index 00000000..22d0a9ad --- /dev/null +++ b/models/nlp/plm/transformer/plugin/load_ixrt_plugin.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from os.path import join, dirname, exists +import tensorrt as trt +import ctypes + +def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""): + if not dynamic_path: + dynamic_path = join(dirname(trt.__file__), "lib", "libixrt_plugin.so") + if not exists(dynamic_path): + raise FileNotFoundError( + f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!") + ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL) + trt.init_libnvinfer_plugins(logger, namespace) + print(f"Loaded plugin from {dynamic_path}") \ No newline at end of file diff --git a/models/nlp/plm/transformer/plugin/plugin_utils.py b/models/nlp/plm/transformer/plugin/plugin_utils.py new file mode 100644 index 00000000..8ec2e372 --- /dev/null +++ b/models/nlp/plm/transformer/plugin/plugin_utils.py @@ -0,0 +1,918 @@ +#!/usr/bin/env python3 +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# + +import argparse +import ctypes +import json +import os +import sys +import time + +import numpy as np +import tensorrt +import tensorrt as trt + +trt_version = [int(n) for n in trt.__version__.split(".")[:3]] + +TRT_LOGGER = trt.Logger(trt.Logger.ERROR) +from load_ixrt_plugin import load_ixrt_plugin + +load_ixrt_plugin( + TRT_LOGGER +) + +plg_registry = trt.get_plugin_registry() + +qkv2ctx_plg_creator = plg_registry.get_plugin_creator( + "CustomQKVToContextPluginDynamic_IxRT", "1", "" +) +skln_plg_creator = plg_registry.get_plugin_creator( + "CustomSkipLayerNormPluginDynamic_IxRT", "1", "" +) + +encoder_emb_plg_creator = plg_registry.get_plugin_creator( + "TransformerEncoderEmb_IxRT", "1" + ) +attention_plugin_creator = plg_registry.get_plugin_creator( + "CustomQkvCrossToContext_IxRT", "1" + ) + +decoder_emb_plg_creator = plg_registry.get_plugin_creator( + "TransformerDecoderEmb_IxRT", "1" + ) + +top1_plg_creator = plg_registry.get_plugin_creator( + "CustomArgmax_IxRT", "1" + ) + +ffn_plg_creator = plg_registry.get_plugin_creator("CustomFFNPluginDynamic_IxRT", "1", "") + +fc_plg_creator = plg_registry.get_plugin_creator("CustomFCPluginDynamic_IxRT", "1", "") + +def get_mha_dtype(config): + dtype = trt.float32 + if config.use_fp16: + dtype = trt.float16 + return int(dtype) + + + + +def create_split_qkv_plugin(num_head,num_dim,index): + + plugin_registry = tensorrt.get_plugin_registry() + assert plugin_registry + + plugin_creator = plugin_registry.get_plugin_creator("SplitQKVUpdateKVCache_IxRT", "1") + assert plugin_creator + + head_num_field = tensorrt.PluginField( + "num_head", + np.array([num_head], dtype=np.int32), + tensorrt.PluginFieldType.INT32) + + head_dim_field = tensorrt.PluginField( + "head_dim", + np.array([num_dim], dtype=np.int32), + tensorrt.PluginFieldType.INT32) + + field_collection = tensorrt.PluginFieldCollection([head_num_field,head_dim_field ]) + plugin = plugin_creator.create_plugin(f"SplitQKVUpdateKVCache_IxRT_{index}", field_collection) + + return plugin + + +def create_encoder_emb_plugin( + weights_dict, + config +): + + embed_scale_field = trt.PluginField( + "embed_scale", + np.array([32], dtype=np.float32), + trt.PluginFieldType.FLOAT32, + ) + hidden_size_field = trt.PluginField( + "hidden_size", + np.array([config.hidden_size], dtype=np.int32), + trt.PluginFieldType.INT32, + ) + max_pos_field = trt.PluginField( + "max_pos", + np.array([1024], dtype=np.int32), + trt.PluginFieldType.INT32, + ) + + pad_idx_field = trt.PluginField( + "pad_idx", + np.array([1], dtype=np.int32), + trt.PluginFieldType.INT32, + ) + + token_w_field = trt.PluginField( + "enc_token_emb_weight", + weights_dict["enc_token_emb_weight"], + trt.PluginFieldType.FLOAT32, + ) + + pos_w_field = trt.PluginField( + "enc_pos_emb_weight", + weights_dict["enc_pos_emb_weight"], + trt.PluginFieldType.FLOAT32, + ) + + field_collection = trt.PluginFieldCollection( + [ + embed_scale_field, + hidden_size_field, + max_pos_field, + pad_idx_field, + 
token_w_field, + pos_w_field, + ] + ) + + emb_plugin = encoder_emb_plg_creator.create_plugin( + "py_TransformerEncoderEmb_ixrt", field_collection + ) + + return emb_plugin + + + +def custom_fc(network, input_tensor, out_dims, W, B): + pf_out_dims = trt.PluginField("out_dims", np.array(out_dims, dtype=np.int32), trt.PluginFieldType.INT32) + pf_type = trt.PluginField("type_id", np.array(int(trt.float16), dtype=np.int32), trt.PluginFieldType.INT32) + pf_W = trt.PluginField("W", W, trt.PluginFieldType.FLOAT32) + fields = [pf_out_dims, pf_type, pf_W] + if B is not None: + pf_B = trt.PluginField("B", B, trt.PluginFieldType.FLOAT32) + fields.append(pf_B) + + pfc = trt.PluginFieldCollection(fields) + fc_plugin = fc_plg_creator.create_plugin("fcplugin", pfc) + plug_inputs = [input_tensor] + out_dense = network.add_plugin_v2(plug_inputs, fc_plugin) + return out_dense + + + +def create_encoder_attention_plugin(): + plugin_registry = tensorrt.get_plugin_registry() + assert plugin_registry + plugin_creator = plugin_registry.get_plugin_creator( + "CustomQkvCrossToContext_IxRT", "1" + ) + assert plugin_creator + type_id_field = tensorrt.PluginField( + "type_id", + np.array([1], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + has_mask_field = tensorrt.PluginField( + "has_mask", + np.array([1], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + mask_type_field = tensorrt.PluginField( + "type_mask", + np.array([3], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + scale_field = tensorrt.PluginField( + "scale", + np.array([1.0 / 8], dtype=np.float32), # 1 / sqrt(head_num) + tensorrt.PluginFieldType.FLOAT32, + ) + field_collection = tensorrt.PluginFieldCollection([type_id_field, has_mask_field,mask_type_field,scale_field]) + plugin = plugin_creator.create_plugin("py_QkvCrossToContext_ixrt", field_collection) + return plugin + + + +def encoder_self_attention_layer( + block, layer_index, config, init_dict, network, input_tensor, imask=None +): + """ + Add the attention layer + """ + + B, S, hidden_size, _, _ = input_tensor.shape + num_heads = config.num_attention_heads + head_size = int(hidden_size / num_heads) + + self_attn_qkv_proj_weight = init_dict[ + f"{block}.layers.{layer_index}.self_attn.qkv_proj.weight" + ] + self_attn_qkv_proj_bias = init_dict[ + f"{block}.layers.{layer_index}.self_attn.qkv_proj.bias" + ] + + # q_proj,k_proj,v_proj + # to_qkv = network.add_fully_connected( + # input_tensor, + # 3 * hidden_size, + # self_attn_qkv_proj_weight, + # self_attn_qkv_proj_bias, + # ) + + to_qkv = custom_fc(network, input_tensor, 3 * hidden_size, self_attn_qkv_proj_weight, self_attn_qkv_proj_bias) + + has_mask = imask is not None + # QKV2CTX + pf_type = trt.PluginField( + "type_id", + np.array([get_mha_dtype(config)], np.int32), + trt.PluginFieldType.INT32, + ) + pf_hidden_size = trt.PluginField( + "hidden_size", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32 + ) + pf_num_heads = trt.PluginField( + "num_heads", np.array([num_heads], np.int32), trt.PluginFieldType.INT32 + ) + pf_has_mask = trt.PluginField( + "has_mask", np.array([has_mask], np.int32), trt.PluginFieldType.INT32 + ) + pfc = trt.PluginFieldCollection( + [pf_hidden_size, pf_num_heads, pf_has_mask, pf_type] + ) + qkv2ctx_plug = qkv2ctx_plg_creator.create_plugin("qkv2ctx", pfc) + + qkv_in = [to_qkv.get_output(0)] + if has_mask: + qkv_in.append(imask) + qkv2ctx = network.add_plugin_v2(qkv_in, qkv2ctx_plug) + return qkv2ctx + +def skipln( + block, layer_index, name, config, init_dict, network, input_tensor, skip, 
bias=None +): + """ + Add the skip layer + """ + idims = input_tensor.shape + + # assert len(idims) == 5 + hidden_size = idims[2] + + dtype = trt.float32 + if config.use_fp16: + dtype = trt.float16 + + pf_ld = trt.PluginField( + "ld", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32 + ) + + ln_weight = init_dict[f"{block}.layers.{layer_index}.{name}.weight"] + pf_gamma = trt.PluginField("gamma", ln_weight, trt.PluginFieldType.FLOAT32) + + ln_bias = init_dict[f"{block}.layers.{layer_index}.{name}.bias"] + pf_beta = trt.PluginField("beta", ln_bias, trt.PluginFieldType.FLOAT32) + + pf_type = trt.PluginField( + "type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32 + ) + fields = [pf_ld, pf_beta, pf_gamma, pf_type] + + if bias is not None: + pf_bias = trt.PluginField("bias", bias, trt.PluginFieldType.FLOAT32) + fields.append(pf_bias) + + pfc = trt.PluginFieldCollection(fields) + skipln_plug = skln_plg_creator.create_plugin("skipln", pfc) + + skipln_inputs = [input_tensor, skip] + layer = network.add_plugin_v2(skipln_inputs, skipln_plug) + return layer + +def ffn(block, layer_index, config, init_dict, network, input_tensor): + + fc1_weight = init_dict[f"{block}.layers.{layer_index}.fc1.weight"] + fc1_bias = init_dict[f"{block}.layers.{layer_index}.fc1.bias"] + + # mid_dense = network.add_fully_connected( + # input_tensor, config.intermediate_size, fc1_weight, fc1_bias + # ) + # mid_dense = custom_fc(network, input_tensor, config.intermediate_size, fc1_weight, fc1_bias) + + + # relu_inputs = mid_dense.get_output(0) + # relu_layer = network.add_activation(relu_inputs, tensorrt.ActivationType.RELU) + + # intermediate_act = relu_layer.get_output(0) + + fc2_weight = init_dict[f"{block}.layers.{layer_index}.fc2.weight"] + fc2_bias = init_dict[f"{block}.layers.{layer_index}.fc2.bias"] + # out_dense = network.add_fully_connected( + # intermediate_act, config.hidden_size, fc2_weight, fc2_bias + # ) + # out_dense = custom_fc(network, intermediate_act, config.hidden_size, fc2_weight, fc2_bias) + + + pf_out_dim = trt.PluginField("out_dims", np.array(config.hidden_size, np.int32), trt.PluginFieldType.INT32) + pf_type = trt.PluginField("type_id", np.array(int(trt.float16), np.int32), trt.PluginFieldType.INT32) + pf_W1 = trt.PluginField("W1", fc1_weight, trt.PluginFieldType.FLOAT32) + pf_B1 = trt.PluginField("B1", fc1_bias, trt.PluginFieldType.FLOAT32) + pf_W2 = trt.PluginField("W2", fc2_weight, trt.PluginFieldType.FLOAT32) + pf_act_type = trt.PluginField("act_type", np.array(int(4), np.int32), trt.PluginFieldType.INT32) #RELU=4 + pfc = trt.PluginFieldCollection([pf_out_dim, pf_type, pf_W1, pf_W2, pf_B1, pf_act_type]) + ffn_plug = ffn_plg_creator.create_plugin("ffn", pfc) + + ffn_inputs = [input_tensor] + out_dense = network.add_plugin_v2(ffn_inputs, ffn_plug) + + out_layer = skipln( + block, + layer_index, + "final_layer_norm", + config, + init_dict, + network, + out_dense.get_output(0), + input_tensor, + fc2_bias + ) + return out_layer + +def transformer_encoder_layer( + block, layer_index, config, init_dict, network, input_tensor, imask +): + """ + Add the transformer layer + """ + idims = input_tensor.shape + assert len(idims) == 5 + hidden_size = idims[2] + + self_attention = encoder_self_attention_layer( + block, layer_index, config, init_dict, network, input_tensor,imask + ) # l0_enc_self_attn_qkv_weight l0_enc_self_attn_qkv_bias + + # self_attention = encoder_self_attention_layer2( + # block, layer_index, config, init_dict, network, input_tensor,imask + # ) + + + 
self_attn_out_proj_weight = init_dict[ + f"{block}.layers.{layer_index}.self_attn.out_proj.weight" + ] + self_attn_out_proj_bias = init_dict[ + f"{block}.layers.{layer_index}.self_attn.out_proj.bias" + ] + + # out_proj = network.add_fully_connected( + # self_attention.get_output(0), + # hidden_size, + # self_attn_out_proj_weight, + # self_attn_out_proj_bias, + # ) + out_proj = custom_fc(network, self_attention.get_output(0), hidden_size, self_attn_out_proj_weight, self_attn_out_proj_bias) + + + self_attention_skipln = skipln( + block, + layer_index, + "self_attn_layer_norm", + config, + init_dict, + network, + out_proj.get_output(0), + input_tensor, + ) + attention_ln = self_attention_skipln.get_output(0) + + ffn_layer = ffn(block, layer_index, config, init_dict, network, attention_ln) + + return ffn_layer + + +def create_decoder_emb_plugin(weights_dict): + + plugin_registry = tensorrt.get_plugin_registry() + assert plugin_registry + plugin_creator = plugin_registry.get_plugin_creator( + "TransformerDecoderEmb_IxRT", "1" + ) + assert plugin_creator + + embed_scale_field = tensorrt.PluginField( + "embed_scale", + np.array([32], dtype=np.float32), + tensorrt.PluginFieldType.FLOAT32, + ) + embed_dim_field = tensorrt.PluginField( + "embed_dim", + np.array([1024], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + pad_idx_field = tensorrt.PluginField( + "pad_idx", + np.array([1], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + token_w = weights_dict["token_emb_weight"] + token_w_field = tensorrt.PluginField( + "token_emb_weight", + token_w.astype(np.float16), + tensorrt.PluginFieldType.FLOAT16, + ) + + pos_w = weights_dict["pos_emb_weight"] + + pos_w_field = tensorrt.PluginField( + "pos_emb_weight", + pos_w.astype(np.float16), + tensorrt.PluginFieldType.FLOAT16, + ) + + field_collection = tensorrt.PluginFieldCollection( + [ + embed_scale_field, + embed_dim_field, + pad_idx_field, + token_w_field, + pos_w_field, + ] + ) + + plugin = plugin_creator.create_plugin( + "py_TransformerDecoderEmb_ixrt", field_collection + ) + + return plugin + + +def create_decoder_self_attention_plugin(): + + plugin_registry = tensorrt.get_plugin_registry() + assert plugin_registry + + plugin_creator = plugin_registry.get_plugin_creator( + "CustomQkvCrossToContext_IxRT", "1" + ) + assert plugin_creator + + type_id_field = tensorrt.PluginField( + "type_id", + np.array([1], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + has_mask_field = tensorrt.PluginField( + "has_mask", + np.array([0], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + mask_type_field = tensorrt.PluginField( + "type_mask", + np.array([3], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + scale_field = tensorrt.PluginField( + "scale", + np.array([1.0 / 8], dtype=np.float32), # 1 / sqrt(head_num) + tensorrt.PluginFieldType.FLOAT32, + ) + + field_collection = tensorrt.PluginFieldCollection([type_id_field, has_mask_field,mask_type_field,scale_field]) + + plugin = plugin_creator.create_plugin("py_QkvCrossToContext_ixrt", field_collection) + + return plugin + + + +def create_cross_attention_plugin(): + + plugin_registry = tensorrt.get_plugin_registry() + assert plugin_registry + + plugin_creator = plugin_registry.get_plugin_creator( + "CustomQkvCrossToContext_IxRT", "1" + ) + assert plugin_creator + + type_id_field = tensorrt.PluginField( + "type_id", + np.array([1], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + has_mask_field = tensorrt.PluginField( + "has_mask", + np.array([1], dtype=np.int32), + 
tensorrt.PluginFieldType.INT32, + ) + + mask_type_field = tensorrt.PluginField( + "type_mask", + np.array([3], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + scale_field = tensorrt.PluginField( + "scale", + np.array([1.0 / 8], dtype=np.float32), # 1 / sqrt(head_num) + tensorrt.PluginFieldType.FLOAT32, + ) + + field_collection = tensorrt.PluginFieldCollection([type_id_field, has_mask_field,mask_type_field,scale_field]) + + plugin = plugin_creator.create_plugin("py_QkvCrossToContext_ixrt", field_collection) + + return plugin + + + +def cross_attention_kv_cache( + block, layer_index, config, init_dict, network, encoder_out +): + + """ + Add the cross attention layer + """ + + to_k_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.k_proj.weight" + ] + to_k_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.k_proj.bias" + ] + # to_k_layer = network.add_fully_connected( + # encoder_out, config.hidden_size, to_k_layer_weight, to_k_layer_bias + # ) + to_k_layer = custom_fc(network, encoder_out, config.hidden_size, to_k_layer_weight, to_k_layer_bias) + + k_output = to_k_layer.get_output(0) + k_t_layer = network.add_shuffle(k_output) + k_t_layer.reshape_dims = trt.Dims( + [0, -1, config.num_attention_heads, config.head_size] + ) + k_t_layer.second_transpose = trt.Permutation([0, 2, 1, 3]) + input_k = k_t_layer.get_output(0) + + to_v_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.v_proj.weight" + ] + to_v_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.v_proj.bias" + ] + # to_v_layer = network.add_fully_connected( + # encoder_out, config.hidden_size, to_v_layer_weight, to_v_layer_bias + # ) + to_v_layer = custom_fc(network, encoder_out, config.hidden_size, to_v_layer_weight, to_v_layer_bias) + + v_output = to_v_layer.get_output(0) + v_t_layer = network.add_shuffle(v_output) + v_t_layer.reshape_dims = trt.Dims( + [0, -1, config.num_attention_heads, config.head_size] + ) + v_t_layer.second_transpose = trt.Permutation([0, 2, 1, 3]) + input_v = v_t_layer.get_output(0) + + return input_k,input_v + + +def decoder_cross_attention_layer( + block, layer_index, config, init_dict, network, input_tensor, imask, encoder_out +): + + """ + Add the cross attention layer + """ + to_q_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.q_proj.weight" + ] + to_q_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.q_proj.bias" + ] + # to_q_layer = network.add_fully_connected( + # input_tensor, config.hidden_size, to_q_layer_weight, to_q_layer_bias + # ) + + print("input_tensor:",input_tensor.shape) + + to_q_layer = custom_fc(network, input_tensor, config.hidden_size, to_q_layer_weight, to_q_layer_bias) + + q_output = to_q_layer.get_output(0) + + q_t_layer = network.add_shuffle(q_output) + q_t_layer.reshape_dims = trt.Dims( + [0, -1, config.num_attention_heads, config.head_size] + ) # reshape [bs,sequence_len, hidden_size] -->[bs,sequence_len,num_attention_heads ,head_dim] + q_t_layer.second_transpose = trt.Permutation([0, 2, 1, 3]) + input_q = q_t_layer.get_output(0) + + to_k_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.k_proj.weight" + ] + to_k_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.k_proj.bias" + ] + # to_k_layer = network.add_fully_connected( + # encoder_out, config.hidden_size, to_k_layer_weight, to_k_layer_bias + # ) + + to_k_layer = custom_fc(network, encoder_out, config.hidden_size, to_k_layer_weight, to_k_layer_bias) + 
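+    # Re-view the projected K (and, below, V) as (bsz, seq_len, num_heads, head_size)
+    # and transpose to (bsz, num_heads, seq_len, head_size), the (B, H, S, D)
+    # layout the CustomQkvCrossToContext_IxRT plugin consumes.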
+ + k_output = to_k_layer.get_output(0) + k_t_layer = network.add_shuffle(k_output) + k_t_layer.reshape_dims = trt.Dims( + [0, -1, config.num_attention_heads, config.head_size] + ) + k_t_layer.second_transpose = trt.Permutation([0, 2, 1, 3]) + input_k = k_t_layer.get_output(0) + + to_v_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.v_proj.weight" + ] + to_v_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.v_proj.bias" + ] + # to_v_layer = network.add_fully_connected( + # encoder_out, config.hidden_size, to_v_layer_weight, to_v_layer_bias + # ) + + to_v_layer = custom_fc(network, encoder_out, config.hidden_size, to_v_layer_weight, to_v_layer_bias) + + + v_output = to_v_layer.get_output(0) + v_t_layer = network.add_shuffle(v_output) + v_t_layer.reshape_dims = trt.Dims( + [0, -1, config.num_attention_heads, config.head_size] + ) + v_t_layer.second_transpose = trt.Permutation([0, 2, 1, 3]) + input_v = v_t_layer.get_output(0) + + attention_plug = create_cross_attention_plugin() + atten = network.add_plugin_v2([input_q, input_k, input_v,imask], attention_plug) + + scores = atten.get_output(0) + scores_t_layer = network.add_shuffle(scores) + scores_t_layer.first_transpose = trt.Permutation([0, 2, 1, 3]) + scores_t_layer.reshape_dims = trt.Dims([0, 0, config.num_attention_heads*config.head_size, 1, 1]) + + scores_out = scores_t_layer.get_output(0) + to_out_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.out_proj.weight" + ] + to_out_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.out_proj.bias" + ] + # to_out_layer = network.add_fully_connected( + # scores_out, config.hidden_size, to_out_layer_weight, to_out_layer_bias + # ) + to_out_layer = custom_fc(network, scores_out, config.hidden_size, to_out_layer_weight, to_out_layer_bias) + + + return to_out_layer + + + + + + +def decoder_cross_attention_kvcache_layer( + block, layer_index, config, init_dict, network, input_tensor, imask, encoder_out, encoder_kv_cache_inputs +): + + """ + Add the cross attention layer + """ + to_q_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.q_proj.weight" + ] + to_q_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.q_proj.bias" + ] + # to_q_layer = network.add_fully_connected( + # input_tensor, config.hidden_size, to_q_layer_weight, to_q_layer_bias + # ) + + to_q_layer = custom_fc(network, input_tensor, config.hidden_size, to_q_layer_weight, to_q_layer_bias) + + + q_output = to_q_layer.get_output(0) + + q_t_layer = network.add_shuffle(q_output) + q_t_layer.reshape_dims = trt.Dims( + [0, -1, config.num_attention_heads, config.head_size] + ) # reshape [bs,sequence_len, hidden_size] -->[bs,sequence_len,num_attention_heads ,head_dim] + q_t_layer.second_transpose = trt.Permutation([0, 2, 1, 3]) + input_q = q_t_layer.get_output(0) + + + input_k = encoder_kv_cache_inputs[f"past_key_values.{layer_index}.encoder.key"] + input_v = encoder_kv_cache_inputs[f"past_key_values.{layer_index}.encoder.value"] + + + attention_plug = create_cross_attention_plugin() + atten = network.add_plugin_v2([input_q, input_k, input_v,imask], attention_plug) + + # atten = attention2(network,input_q, input_k, input_v) + + scores = atten.get_output(0) + scores_t_layer = network.add_shuffle(scores) + scores_t_layer.first_transpose = trt.Permutation([0, 2, 1, 3]) + scores_t_layer.reshape_dims = trt.Dims([0, 0, config.num_attention_heads*config.head_size, 1, 1]) + + scores_out = scores_t_layer.get_output(0) + 
to_out_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.out_proj.weight" + ] + to_out_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.out_proj.bias" + ] + # to_out_layer = network.add_fully_connected( + # scores_out, config.hidden_size, to_out_layer_weight, to_out_layer_bias + # ) + + to_out_layer = custom_fc(network, scores_out, config.hidden_size, to_out_layer_weight, to_out_layer_bias) + + + return to_out_layer + + +def decoder_self_attention_layer( + block, + layer_index, + config, + init_dict, + network, + input_tensor, + imask, + encoder_out, + steps, + kv_cache_inputs, + kv_cache_outputs +): + + """ + Add the cross attention layer + """ + to_qkv_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.self_attn.qkv_proj.weight" + ] + to_qkv_layer_bias = init_dict[f"{block}.layers.{layer_index}.self_attn.qkv_proj.bias"] + + to_qkv_layer = custom_fc(network, input_tensor, 3*config.hidden_size, to_qkv_layer_weight, to_qkv_layer_bias) + + linear_qkv_output = to_qkv_layer.get_output(0) + reshape_qkv_layer = network.add_shuffle(linear_qkv_output) + reshape_qkv_layer.reshape_dims = trt.Dims( + [0, 0, 0] + ) + + split_qkv_plugin = create_split_qkv_plugin(config.num_attention_heads,config.head_size,layer_index) + split_qkv_layers = network.add_plugin_v2([reshape_qkv_layer.get_output(0), kv_cache_inputs[f"past_key_values.{layer_index}.decoder.key"], + kv_cache_inputs[f"past_key_values.{layer_index}.decoder.value"]], split_qkv_plugin) + + input_q = split_qkv_layers.get_output(0) + present_key = split_qkv_layers.get_output(1) + present_value = split_qkv_layers.get_output(2) + + attention_plug = create_decoder_self_attention_plugin() + atten = network.add_plugin_v2([input_q, present_key, present_value], attention_plug) + + scores = atten.get_output(0) + + scores_t_layer = network.add_shuffle(scores) + scores_t_layer.first_transpose = trt.Permutation([0, 2, 1, 3]) + scores_t_layer.reshape_dims = trt.Dims([0, 0, config.num_attention_heads*config.head_size, 1, 1]) + + + kv_cache_outputs[f"present_key_values.{layer_index}.decoder.key"] = present_key + kv_cache_outputs[f"present_key_values.{layer_index}.decoder.value"] = present_value + + + return scores_t_layer + + +def transformer_decoder_layer( + block, + layer_index, + config, + init_dict, + network, + input_tensor, + imask, + encoder_out, + steps, + kv_cache_inputs, + kv_cache_outputs, + encoder_kv_cache_inputs +): + + + """ + Add the transformer layer + """ + idims = input_tensor.shape + assert len(idims) == 5 + hidden_size = idims[2] + self_attention = decoder_self_attention_layer( + block, + layer_index, + config, + init_dict, + network, + input_tensor, + imask, + encoder_out, + steps, + kv_cache_inputs, + kv_cache_outputs + ) + self_attn_out_proj_weight = init_dict[ + f"{block}.layers.{layer_index}.self_attn.out_proj.weight" + ] + self_attn_out_proj_bias = init_dict[ + f"{block}.layers.{layer_index}.self_attn.out_proj.bias" + ] + + # out_proj = network.add_fully_connected( + # self_attention.get_output(0), + # hidden_size, + # self_attn_out_proj_weight, + # self_attn_out_proj_bias, + # ) + + out_proj = custom_fc(network, self_attention.get_output(0), hidden_size, self_attn_out_proj_weight, self_attn_out_proj_bias) + + self_attention_skipln = skipln( + block, + layer_index, + "self_attn_layer_norm", + config, + init_dict, + network, + out_proj.get_output(0), + input_tensor, + ) + + query = self_attention_skipln.get_output(0) + # cross_attention = decoder_cross_attention_layer( + # block, 
layer_index, config, init_dict, network, query, imask, encoder_out + # ) + + cross_attention = decoder_cross_attention_kvcache_layer( + block, layer_index, config, init_dict, network, query, imask, encoder_out,encoder_kv_cache_inputs + ) + crosss_attention_skipln = skipln( + block, + layer_index, + "encoder_attn_layer_norm", + config, + init_dict, + network, + cross_attention.get_output(0), + query, + ) + attention_ln = crosss_attention_skipln.get_output(0) + + ffn_layer = ffn(block, layer_index, config, init_dict, network, attention_ln) + + return ffn_layer + + + + +def create_top1_plugin(): + pad_idx_field = trt.PluginField( + "pad_idx", + np.array([1], dtype=np.int32), + trt.PluginFieldType.INT32, + ) + + field_collection = trt.PluginFieldCollection( + [pad_idx_field] + ) + + plugin = top1_plg_creator.create_plugin( + "argmax", field_collection + ) + + return plugin + diff --git a/models/nlp/plm/transformer/plugin/transformer_cfg.py b/models/nlp/plm/transformer/plugin/transformer_cfg.py new file mode 100644 index 00000000..be47c4d2 --- /dev/null +++ b/models/nlp/plm/transformer/plugin/transformer_cfg.py @@ -0,0 +1,15 @@ +import json +class TransformerBaseConfig: + def __init__(self, config_path, use_fp16=True): + with open(config_path, "r") as f: + data = json.load(f) + self.num_attention_heads = data["num_attention_heads"] + self.hidden_size = data["hidden_size"] + self.intermediate_size = data["intermediate_size"] + self.num_hidden_layers = data["num_hidden_layers"] + self.head_size = self.hidden_size // self.num_attention_heads + self.tgt_vocab_size = data["tgt_vocab_size"] + self.max_sequence_length = data["max_sequence_length"] + self.sos_token_id = data["sos_token_id"] + self.eos_token_id = data["eos_token_id"] + self.use_fp16 = use_fp16 \ No newline at end of file diff --git a/models/nlp/plm/transformer/plugin/trt.py b/models/nlp/plm/transformer/plugin/trt.py new file mode 100644 index 00000000..30510024 --- /dev/null +++ b/models/nlp/plm/transformer/plugin/trt.py @@ -0,0 +1,356 @@ +import tensorrt +import os +import numpy as np +from typing import Dict, List +from functools import reduce +import pprint + + +import argparse + +from .load_ixrt_plugin import load_ixrt_plugin + +from .transformer_cfg import TransformerBaseConfig + +TRT_LOGGER = tensorrt.Logger(tensorrt.Logger.ERROR) +load_ixrt_plugin(TRT_LOGGER) + + +import torch + +def create_engine_context(engine_path, logger): + with open(engine_path, "rb") as f: + runtime = tensorrt.Runtime(logger) + assert runtime + engine = runtime.deserialize_cuda_engine(f.read()) + assert engine + context = engine.create_execution_context() + assert context + + return engine, context + + +def create_context(engine_file): + host_mem = tensorrt.IHostMemory + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + engine, context = create_engine_context(engine_file, logger) + + return engine, context + + +def allocate_binding_buffer(types_dict, shapes_dict): + ''' + Allocate binding buffers for trt based on provided types and shapes dict + ''' + return { + k: torch.zeros(reduce(lambda v, a: v*a, shape), dtype=types_dict[k]).cuda() + for k, shape in shapes_dict.items() + } + +class T5TRTEncoder(): + + + def __init__( + self, + trt_engine_file: str, + config, + batch_size: int = 1, + + ): + + self.data_type = torch.float16 + + self.max_sequence_length = config.max_sequence_length + self.hidden_size = config.hidden_size + self.main_input_name = "src_tokens" + self.batch_size = batch_size + self.num_hidden_layers = config.num_hidden_layers + 
self.num_attention_heads = config.num_attention_heads + self.head_size = config.head_size + + # We only have one profile to select so we can just grab the profile at the start of the class + # self.profile_idx = self.get_optimization_profile(batch_size=self.batch_size, sequence_length=1) + + print("Start Deserializing Encoder Engine,it will cost a little time...") + self.trt_engine, self.trt_context = create_context(trt_engine_file) + print("Deserializing Encoder Engine DONE !") + + self.input_shapes = { + "src_tokens": (self.batch_size, self.max_sequence_length) + } + self.input_types = { + "src_tokens": torch.int32 + } + + self.output_shapes = {} + self.output_types = {} + + for layer_index in range(self.num_hidden_layers): + self.output_shapes[f"past_key_values.{layer_index}.encoder.key"] = (self.batch_size, self.num_attention_heads, self.max_sequence_length, self.head_size) + self.output_shapes[f"past_key_values.{layer_index}.encoder.value"] = (self.batch_size, self.num_attention_heads, self.max_sequence_length, self.head_size) + + self.output_types[f"past_key_values.{layer_index}.encoder.key"] = torch.float16 + self.output_types[f"past_key_values.{layer_index}.encoder.value"] = torch.float16 + + self.output_shapes["mask"] = (self.batch_size, self.max_sequence_length) + self.output_types["mask"] = torch.int32 + + self.bindings = self._allocate_memory(self.input_shapes, self.input_types, self.output_shapes, self.output_types) + + def _allocate_memory(self, + input_shapes: Dict[str, tuple], + input_types: Dict[str, torch.dtype], + output_shapes: Dict[str, tuple], + output_types: Dict[str, torch.dtype]): + """Helper function for binding several inputs at once and pre-allocating the results.""" + # Allocate memories as 1D linear buffers for simpler handling of dynamic shapes. 
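+        # allocate_binding_buffer() sizes each buffer as the product of its max
+        # shape and returns flat CUDA tensors, e.g. (assuming batch_size=1 and
+        # max_sequence_length=257):
+        #   "src_tokens" -> torch.zeros(1 * 257, dtype=torch.int32).cuda()
+        # The concrete shapes are then declared to the execution context via
+        # set_binding_shape, so the same flat buffers serve any dynamic shape.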
+ + + self.inputs = allocate_binding_buffer(input_types, input_shapes) + self.outputs = allocate_binding_buffer(output_types, output_shapes) + bindings = [0] * self.trt_engine.num_bindings + for input_name, input_array in self.inputs.items(): + # Allocate memory for inputs + input_idx = self.trt_engine.get_binding_index(input_name) + self.trt_context.set_binding_shape(input_idx, input_shapes[input_name]) + bindings[input_idx] = input_array.data_ptr() + + assert self.trt_context.all_binding_shapes_specified + + for output_name, output_array in self.outputs.items(): + # Output shape should be allocated from context size + output_idx = self.trt_engine.get_binding_index(output_name) + bindings[output_idx] = output_array.data_ptr() + + + return bindings + + def forward(self, input_ids, *args, **kwargs): + + self.bindings[0] = input_ids.data_ptr() + self.trt_context.set_binding_shape(0, input_ids.shape) + self.trt_context.execute_v2(self.bindings) + + return self.outputs + + def clear(self): + del self.trt_context + del self.trt_engine + + + +class T5TRTDecoder(): + + def __init__( + self, + trt_engine_file, + hf_config, + batch_size: int = 1, + num_beams: int = 1 + ): + self.data_type = torch.float16 + self.batch_size = batch_size + self.num_beams = num_beams + self.use_cache = True + self.max_input_length = hf_config.max_sequence_length + self.max_output_length = hf_config.max_sequence_length + + self.device = torch.device('cuda') + self.main_input_name = "token_id" #shape:[bsz,1] + self.second_input_name = "steps" #shape:[1] + self.third_input_name = "mask" #shape:[bsz,input_length] + + self.main_out_name ="decoder_id" + + + self.encoder_hidden_size = hf_config.hidden_size + self.num_heads = hf_config.num_attention_heads + self.embedding_size_per_head = hf_config.head_size + self.num_decoder_layers = hf_config.num_hidden_layers + + print("Start Deserializing Decoder Engine,it will cost a little time...") + self.trt_engine, self.trt_context = create_context(trt_engine_file) + + print("Deserializing Decoder Engine DONE !") + self.bindings = [0] * self.trt_engine.num_bindings + + + + self.output = torch.ones((batch_size,1), dtype=torch.int32).cuda() + out_index_1 = self.trt_engine.get_binding_index(self.main_out_name) + self.bindings[out_index_1] = self.output.data_ptr() + + if self.use_cache: + + self.self_attention_cache = {} + self_attention_kv_shape = (self.batch_size * num_beams, self.num_heads,self.max_output_length - 1,self.embedding_size_per_head) + + # Set self attention kv cache shape and type + for i in range(self.num_decoder_layers): + for code in ["key", "value"]: + + self_attention_name = f"key_values.{i}.decoder.{code}" + input_buffer = torch.zeros(self_attention_kv_shape, dtype = self.data_type).cuda() + + input_idx = self.trt_engine.get_binding_index("past_" + self_attention_name) + self.self_attention_cache[self_attention_name] = input_buffer + self.bindings[input_idx] = input_buffer.data_ptr() + + output_idx = self.trt_engine.get_binding_index("present_" + self_attention_name) + #TODO Allocate self attention buffer. 
The buffer would ideally serve as both input and output, but IxRT
+                    # currently errors when the two bindings alias, so a separate
+                    # output buffer is allocated and the pointers are swapped after each step.
+                    #self.bindings[output_idx] = input_buffer.data_ptr()
+
+                    self_attention_name_out = f"key_values.{i}.decoder.{code}.output"
+                    output_buffer = torch.zeros(self_attention_kv_shape, dtype=self.data_type).cuda()
+                    self.self_attention_cache[self_attention_name_out] = output_buffer
+                    self.bindings[output_idx] = output_buffer.data_ptr()
+
+
+        self.kv_cache_binding_offset = 3  # 0: token_id, 1: steps, 2: mask; kv cache inputs start at binding 3
+        self.cross_kv_cache_binding_offset = self.kv_cache_binding_offset + 2 * self.num_decoder_layers
+
+
+    def _switch_input_output_binding(self):
+        '''
+        For kv cache mode, switch input and output pointers to avoid data concurrency issues
+        '''
+        for i in range(self.num_decoder_layers):
+            for code in ["key", "value"]:
+                self_attention_name = f"key_values.{i}.decoder.{code}"
+                input_idx = self.trt_engine.get_binding_index("past_" + self_attention_name)
+                output_idx = self.trt_engine.get_binding_index("present_" + self_attention_name)
+                # Switch generation mode kv cache bindings
+                temp = self.bindings[output_idx]
+                self.bindings[output_idx] = self.bindings[input_idx]
+                self.bindings[input_idx] = temp
+
+
+    def forward(self, input_ids, encoder_out, step, sequence_len, *args, **kwargs):
+
+        # Get the batch size.
+        bs = input_ids.shape[0]  # in beam search mode, bs is batch_size * num_beams
+        ##############################################################################################
+        # input bindings
+        input_ids = input_ids.cuda()
+
+        index_1 = self.trt_engine.get_binding_index(self.main_input_name)
+        self.bindings[index_1] = input_ids.data_ptr()
+        self.trt_context.set_binding_shape(index_1, input_ids.shape)
+
+        # shape does not change
+        index_2 = self.trt_engine.get_binding_index(self.second_input_name)
+        step_tensor = torch.tensor([step + 1], dtype=torch.int32).cuda()
+        self.bindings[index_2] = step_tensor.data_ptr()
+
+        mask_shape = (bs, sequence_len)
+        index_3 = self.trt_engine.get_binding_index(self.third_input_name)
+        self.bindings[index_3] = encoder_out["mask"].data_ptr()
+        self.trt_context.set_binding_shape(index_3, mask_shape)
+
+
+        if self.use_cache:
+            self_atten_kv_shape = (bs, self.num_heads, step, self.embedding_size_per_head)
+            for i in range(self.num_decoder_layers):
+
+                self_atten_past_key = f"past_key_values.{i}.decoder.key"
+                key_idx = self.trt_engine.get_binding_index(self_atten_past_key)
+                self.trt_context.set_binding_shape(self.kv_cache_binding_offset + 2 * i, self_atten_kv_shape)
+
+                self_atten_past_value = f"past_key_values.{i}.decoder.value"
+                value_idx = self.trt_engine.get_binding_index(self_atten_past_value)
+                self.trt_context.set_binding_shape(self.kv_cache_binding_offset + 2 * i + 1, self_atten_kv_shape)
+
+
+            cross_atten_kv_shape = (bs, self.num_heads, sequence_len, self.embedding_size_per_head)
+            for i in range(self.num_decoder_layers):
+                cross_atten_past_key = f"past_key_values.{i}.encoder.key"
+                key_idx = self.trt_engine.get_binding_index(cross_atten_past_key)
+                self.bindings[key_idx] = encoder_out[cross_atten_past_key].data_ptr()
+                self.trt_context.set_binding_shape(key_idx, cross_atten_kv_shape)
+
+                cross_atten_past_value = f"past_key_values.{i}.encoder.value"
+                value_idx = self.trt_engine.get_binding_index(cross_atten_past_value)
+                self.bindings[value_idx] = encoder_out[cross_atten_past_value].data_ptr()
+                self.trt_context.set_binding_shape(value_idx, cross_atten_kv_shape)
+
+        ##############################################################################################
+
+        
#output bindings + assert self.trt_context.all_binding_shapes_specified + self.trt_context.execute_v2(self.bindings) + self._switch_input_output_binding() + + return self.output + + def clear(self): + del self.trt_context + del self.trt_engine + + + +def inference(config,encoder,decoder,input_ids): + + prev_tokens = torch.full((input_ids.shape[0],1), int(config.sos_token_id),dtype = torch.int32).cuda() + encoder_out = encoder.forward(input_ids) + result_tokens = torch.full((input_ids.shape[0],config.max_sequence_length), int(config.sos_token_id),dtype = torch.int32).cuda() + sequence_len = input_ids.shape[1] + for step in range(config.max_sequence_length-1): + current_tokens = decoder.forward(prev_tokens,encoder_out,step,sequence_len) + if step > 1: + update_tokens = torch.where(prev_tokens == int(config.eos_token_id), int(config.eos_token_id), current_tokens) + result_tokens[:,step:step+1] = update_tokens + prev_tokens = update_tokens + if torch.all(update_tokens == int(config.eos_token_id)): + break + else: + result_tokens[:,step:step+1] = current_tokens + prev_tokens = current_tokens + + return result_tokens + + + + +def benchmark(config,encoder,decoder,input_ids,prev_tokens): + encoder_out = encoder.forward(input_ids) + sequence_len = input_ids.shape[1] + test_step =0 + for step in range(config.max_sequence_length-1): + test_step +=1 + current_tokens = decoder.forward(prev_tokens,encoder_out, step,sequence_len) + if step > 1: + update_tokens = torch.where(prev_tokens == int(config.eos_token_id), int(config.eos_token_id), current_tokens) + prev_tokens = update_tokens + if torch.all(update_tokens == int(config.eos_token_id)): + break + else: + prev_tokens = current_tokens + + + + + +def main(): + parser = argparse.ArgumentParser(description="TensorRT Transformer Sample") + parser.add_argument("--decoder_engine", required=False, default="/inferencesamples/data/checkpoints/transformer/wmt14.en-fr.joined-dict.transformer/Decoder.engine", help="The transformer engine file, ex transformer.engine") + parser.add_argument("--encoder_engine", required=False, default="/inferencesamples/data/checkpoints/transformer/wmt14.en-fr.joined-dict.transformer/Encoder.engine", help="The transformer engine file, ex transformer.engine") + parser.add_argument("--config_file", required=False, default="./data/wmt14_en_de/transformer_config.json", help="The transformer config file") + + args = parser.parse_args() + + config_path =args.config_file + config = TransformerBaseConfig(config_path) + + input_ids = torch.from_numpy(np.load("/inferencesamples/benchmarks/nlp/translation/transformer/plugin/data/tensorrt_input/2.npy")).cuda() + + batch_size = input_ids.shape[0] + decoder = T5TRTDecoder(args.decoder_engine,config,batch_size=batch_size) + encoder = T5TRTEncoder(args.encoder_engine,config, batch_size=batch_size) + + + result_tokens= inference(config,encoder,decoder,input_ids) + print(result_tokens) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/nlp/plm/transformer/setup.py b/models/nlp/plm/transformer/setup.py new file mode 100644 index 00000000..245399b9 --- /dev/null +++ b/models/nlp/plm/transformer/setup.py @@ -0,0 +1,76 @@ +import os +import subprocess +import sys + +from setuptools import Extension, find_packages, setup +import numpy as np + +from build_helpers.build_helpers import ( + ANTLRCommand, + HYDRAANTLRCommand, + BuildPyCommand, + CleanCommand, + DevelopCommand, + SDistCommand, + find_version, +) + + +if sys.platform == "darwin": + extra_compile_args = 
["-stdlib=libc++", "-O3"] +else: + extra_compile_args = ["-std=c++11", "-O3"] + + +class NumpyExtension(Extension): + """Source: https://stackoverflow.com/a/54128391""" + + def __init__(self, *args, **kwargs): + self.__include_dirs = [] + super().__init__(*args, **kwargs) + + @property + def include_dirs(self): + import numpy + + return self.__include_dirs + [numpy.get_include()] + + @include_dirs.setter + def include_dirs(self, dirs): + self.__include_dirs = dirs + +extensions = [ + Extension( + "fairseq.libbleu", + sources=[ + "fairseq/clib/libbleu/libbleu.cpp", + "fairseq/clib/libbleu/module.cpp", + ], + extra_compile_args=extra_compile_args, + ), + NumpyExtension( + "fairseq.data.data_utils_fast", + sources=["fairseq/data/data_utils_fast.pyx"], + language="c++", + extra_compile_args=extra_compile_args, + ), + NumpyExtension( + "fairseq.data.token_block_utils_fast", + sources=["fairseq/data/token_block_utils_fast.pyx"], + language="c++", + extra_compile_args=extra_compile_args, + ), +] + +setup( + cmdclass={ + "antlr": ANTLRCommand, + "hydra_antlr":HYDRAANTLRCommand, + "clean": CleanCommand, + "sdist": SDistCommand, + "build_py": BuildPyCommand, + "develop": DevelopCommand, + }, + name="fairseq_extension", + ext_modules=extensions, +) \ No newline at end of file -- Gitee From a403c5af9f27dd59248b9597a26858ffb9f5c5a6 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Wed, 10 Dec 2025 17:31:23 +0800 Subject: [PATCH 6/7] sync igie yolov5s transformer --- .../object_detection/yolov5s/igie/README.md | 93 ++ .../yolov5s/igie/build_engine.py | 38 + .../yolov5s/igie/build_nms_engine.py | 82 ++ .../yolov5s/igie/calibration_dataset.py | 31 + .../yolov5s/igie/ci/prepare.sh | 35 + .../yolov5s/igie/coco_labels.py | 89 ++ .../object_detection/yolov5s/igie/common.py | 86 ++ .../yolov5s/igie/config/YOLOV5S_CONFIG | 49 + .../yolov5s/igie/cut_model.py | 16 + .../yolov5s/igie/datasets/__init__.py | 0 .../yolov5s/igie/datasets/coco.py | 116 +++ .../yolov5s/igie/datasets/common.py | 66 ++ .../yolov5s/igie/datasets/post_process.py | 115 +++ .../yolov5s/igie/datasets/pre_process.py | 56 ++ .../yolov5s/igie/datasets/vision.py | 136 +++ .../object_detection/yolov5s/igie/deploy.py | 134 +++ .../yolov5s/igie/inference.py | 265 +++++ .../yolov5s/igie/load_ixrt_plugin.py | 12 + .../yolov5s/igie/modify_batchsize.py | 37 + .../cv/object_detection/yolov5s/igie/quant.py | 52 + .../scripts/infer_yolov5s_fp16_accuracy.sh | 209 ++++ .../scripts/infer_yolov5s_fp16_performance.sh | 209 ++++ .../yolov5s/igie/simplify_model.py | 21 + models/nlp/plm/transformer/igie/__init__.py | 0 .../nlp/plm/transformer/igie/build_engine.py | 398 ++++++++ .../nlp/plm/transformer/igie/builder_utils.py | 323 ++++++ models/nlp/plm/transformer/igie/ci/prepare.sh | 42 + models/nlp/plm/transformer/igie/common.py | 92 ++ .../inference_wmt14_en_fr_fp16_accuracy.py | 517 ++++++++++ .../inference_wmt14_en_fr_fp16_performance.py | 147 +++ .../plm/transformer/igie/load_ixrt_plugin.py | 28 + .../nlp/plm/transformer/igie/plugin_utils.py | 918 ++++++++++++++++++ .../nlp/plm/transformer/igie/requirements.txt | 6 + .../infer_transformer_fp16_accuracy.sh | 45 + .../infer_transformer_fp16_performance.sh | 45 + .../plm/transformer/igie/transformer_cfg.py | 15 + 36 files changed, 4523 insertions(+) create mode 100644 models/cv/object_detection/yolov5s/igie/README.md create mode 100644 models/cv/object_detection/yolov5s/igie/build_engine.py create mode 100644 models/cv/object_detection/yolov5s/igie/build_nms_engine.py create mode 100644 
models/cv/object_detection/yolov5s/igie/calibration_dataset.py create mode 100644 models/cv/object_detection/yolov5s/igie/ci/prepare.sh create mode 100644 models/cv/object_detection/yolov5s/igie/coco_labels.py create mode 100644 models/cv/object_detection/yolov5s/igie/common.py create mode 100644 models/cv/object_detection/yolov5s/igie/config/YOLOV5S_CONFIG create mode 100644 models/cv/object_detection/yolov5s/igie/cut_model.py create mode 100644 models/cv/object_detection/yolov5s/igie/datasets/__init__.py create mode 100644 models/cv/object_detection/yolov5s/igie/datasets/coco.py create mode 100644 models/cv/object_detection/yolov5s/igie/datasets/common.py create mode 100644 models/cv/object_detection/yolov5s/igie/datasets/post_process.py create mode 100644 models/cv/object_detection/yolov5s/igie/datasets/pre_process.py create mode 100644 models/cv/object_detection/yolov5s/igie/datasets/vision.py create mode 100644 models/cv/object_detection/yolov5s/igie/deploy.py create mode 100644 models/cv/object_detection/yolov5s/igie/inference.py create mode 100644 models/cv/object_detection/yolov5s/igie/load_ixrt_plugin.py create mode 100644 models/cv/object_detection/yolov5s/igie/modify_batchsize.py create mode 100644 models/cv/object_detection/yolov5s/igie/quant.py create mode 100644 models/cv/object_detection/yolov5s/igie/scripts/infer_yolov5s_fp16_accuracy.sh create mode 100644 models/cv/object_detection/yolov5s/igie/scripts/infer_yolov5s_fp16_performance.sh create mode 100644 models/cv/object_detection/yolov5s/igie/simplify_model.py create mode 100644 models/nlp/plm/transformer/igie/__init__.py create mode 100644 models/nlp/plm/transformer/igie/build_engine.py create mode 100644 models/nlp/plm/transformer/igie/builder_utils.py create mode 100644 models/nlp/plm/transformer/igie/ci/prepare.sh create mode 100644 models/nlp/plm/transformer/igie/common.py create mode 100644 models/nlp/plm/transformer/igie/inference_wmt14_en_fr_fp16_accuracy.py create mode 100644 models/nlp/plm/transformer/igie/inference_wmt14_en_fr_fp16_performance.py create mode 100644 models/nlp/plm/transformer/igie/load_ixrt_plugin.py create mode 100644 models/nlp/plm/transformer/igie/plugin_utils.py create mode 100644 models/nlp/plm/transformer/igie/requirements.txt create mode 100644 models/nlp/plm/transformer/igie/scripts/infer_transformer_fp16_accuracy.sh create mode 100644 models/nlp/plm/transformer/igie/scripts/infer_transformer_fp16_performance.sh create mode 100644 models/nlp/plm/transformer/igie/transformer_cfg.py diff --git a/models/cv/object_detection/yolov5s/igie/README.md b/models/cv/object_detection/yolov5s/igie/README.md new file mode 100644 index 00000000..82e0a387 --- /dev/null +++ b/models/cv/object_detection/yolov5s/igie/README.md @@ -0,0 +1,93 @@ +# YOLOv5s (IGIE) + +## Model Description + +The YOLOv5 architecture is designed for efficient and accurate object detection tasks in real-time scenarios. It employs a single convolutional neural network to simultaneously predict bounding boxes and class probabilities for multiple objects within an image. The YOLOV5s is a tiny model. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.3.0 | 25.09 | + +## Model Preparation + +### Prepare Resources + +Pretrained model: + +Dataset: + +- to download the labels dataset. +- to download the validation dataset. +- to download the train dataset. 
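+Before unpacking, it can be worth sanity-checking the downloaded archives; a minimal sketch (the archive names are the same ones used by the unzip commands below):
+
+```bash
+# Verify each archive is present and not corrupt before extracting.
+for f in coco2017labels.zip train2017.zip val2017.zip; do
+    unzip -t "$f" > /dev/null && echo "$f OK" || echo "$f is missing or corrupt"
+done
+```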
+
+```bash
+unzip -q -d ./ coco2017labels.zip
+unzip -q -d ./coco/images/ train2017.zip
+unzip -q -d ./coco/images/ val2017.zip
+
+coco
+├── annotations
+│   └── instances_val2017.json
+├── images
+│   ├── train2017
+│   └── val2017
+├── labels
+│   ├── train2017
+│   └── val2017
+├── LICENSE
+├── README.txt
+├── test-dev2017.txt
+├── train2017.cache
+├── train2017.txt
+├── val2017.cache
+└── val2017.txt
+```
+
+### Install Dependencies
+
+```bash
+pip3 install -r ../../ixrt_common/requirements.txt
+```
+
+### Model Conversion
+
+```bash
+mkdir checkpoints
+git clone -b v6.1 --depth 1 https://github.com/ultralytics/yolov5
+
+# Some environments also need the Arial font installed
+wget https://ultralytics.com/assets/Arial.ttf
+cp Arial.ttf /root/.config/Ultralytics/Arial.ttf
+
+# Convert to ONNX (see the export_onnx function in export.py for the implementation)
+pushd ./yolov5
+# set weights_only=False to be compatible with pytorch 2.7
+sed -i '96 s/map_location)/map_location, weights_only=False)/' ./models/experimental.py
+
+python3 export.py --weights yolov5s.pt --include onnx --opset 11 --batch-size 32
+mv yolov5s.onnx ../checkpoints
+popd
+```
+
+## Model Inference
+
+```bash
+export DATASETS_DIR=./coco/
+```
+
+### FP16
+
+```bash
+# Accuracy
+bash scripts/infer_yolov5s_fp16_accuracy.sh
+# Performance
+bash scripts/infer_yolov5s_fp16_performance.sh
+```
+
+## Model Results
+
+| Model | BatchSize | Precision | FPS | MAP@0.5 | MAP@0.5:0.95 |
+| :----: | :----: | :----: | :----: | :----: | :----: |
+| YOLOv5s | 32 | FP16 | 1112.66 | 0.565 | 0.370 |
diff --git a/models/cv/object_detection/yolov5s/igie/build_engine.py b/models/cv/object_detection/yolov5s/igie/build_engine.py
new file mode 100644
index 00000000..938f095a
--- /dev/null
+++ b/models/cv/object_detection/yolov5s/igie/build_engine.py
@@ -0,0 +1,38 @@
+import os
+import cv2
+import argparse
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm.relay.import_model import import_model_to_igie
+
+
+def main(config):
+    target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer")
+    device = tvm.device(target.kind.name, 0)
+    precision = config.precision
+    if config.precision == "float16":
+        precision = "fp16"
+
+    inputs_info = {"images": ([config.bsz, 3, 640, 640], "float32")}
+    mod, params = import_model_to_igie(config.model, inputs_info, outputs_info=None, precision=precision, backend="tensorrt")
+    lib = relay.build(mod, target=target, params=params, precision=precision, device=device)
+    lib.export_library(config.engine)
+    print("Build engine done!")
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str)
+    parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+                        help="The precision of datatype")
+    parser.add_argument("--bsz", type=int)
+    # engine args
+    parser.add_argument("--engine", type=str, default=None)
+
+    args = parser.parse_args()
+    return args
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/igie/build_nms_engine.py b/models/cv/object_detection/yolov5s/igie/build_nms_engine.py
new file mode 100644
index 00000000..51d70747
--- /dev/null
+++ b/models/cv/object_detection/yolov5s/igie/build_nms_engine.py
@@ -0,0 +1,82 @@
+import os
+import argparse
+import torch
+import onnx
+from onnx import helper
+from onnx import TensorProto, numpy_helper
+import tensorrt
+
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
+
+def create_onnx(args):
+    nms = helper.make_node(
+        
"DetectionNMS_IxRT", + name="NMS", + inputs=["nms_input"], + outputs=["nms_output0", "nms_output1"], + nMaxKeep=args.max_box_pre_img, + fIoUThresh=args.iou_thresh, + fScoreThresh=args.score_thresh + ) + graph = helper.make_graph( + nodes=[nms], + name="gpu_nms", + inputs=[ + helper.make_tensor_value_info( + "nms_input", onnx.TensorProto.FLOAT, (args.bsz, args.all_box_num, 6) + ) + ], + outputs=[ + helper.make_tensor_value_info( + "nms_output0", onnx.TensorProto.FLOAT, (args.bsz, args.max_box_pre_img, 6) + ), + helper.make_tensor_value_info( + "nms_output1", onnx.TensorProto.INT32, (args.bsz,) + ) + ], + initializer=[] + ) + + op = onnx.OperatorSetIdProto() + op.version = 13 + model = onnx.helper.make_model(graph) + + model = onnx.helper.make_model(graph, opset_imports=[op]) + onnx_path = args.path + "/nms.onnx" + onnx.save(model, onnx_path) + +def build_engine(args): + onnx_path = args.path + "/nms.onnx" + IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) + builder = tensorrt.Builder(IXRT_LOGGER) + EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + network = builder.create_network(EXPLICIT_BATCH) + build_config = builder.create_builder_config() + parser = tensorrt.OnnxParser(network, IXRT_LOGGER) + parser.parse_from_file(onnx_path) + plan = builder.build_serialized_network(network, build_config) + + engine_path = args.path + "/nms.engine" + with open(engine_path, "wb") as f: + f.write(plan) + +def main(args): + create_onnx(args) + build_engine(args) + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--bsz", type=int, default=1, help="batch size") + parser.add_argument("--path", type=str) + parser.add_argument("--all_box_num", type=int, default=25200) + parser.add_argument("--max_box_pre_img", type=int, default=1000) + parser.add_argument("--iou_thresh", type=float, default=0.6) + parser.add_argument("--score_thresh", type=float, default=0.001) + + args = parser.parse_args() + return args + +if __name__ == "__main__": + args = parse_args() + main(args) \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/igie/calibration_dataset.py b/models/cv/object_detection/yolov5s/igie/calibration_dataset.py new file mode 100644 index 00000000..578e013d --- /dev/null +++ b/models/cv/object_detection/yolov5s/igie/calibration_dataset.py @@ -0,0 +1,31 @@ +import os +import torch +import torchvision.datasets +from torch.utils.data import DataLoader + + + +from datasets.coco import CocoDetection + +def create_dataloaders(data_path, annFile, img_sz=640, batch_size=32, step=32, workers=2, data_process_type="yolov5"): + dataset = CocoDetection( + root=data_path, + annFile=annFile, + img_size=img_sz, + data_process_type=data_process_type + ) + calibration_dataset = dataset + num_samples = min(5000, batch_size * step) + if num_samples > 0: + calibration_dataset = torch.utils.data.Subset( + dataset, indices=range(num_samples) + ) + + calibration_dataloader = DataLoader( + calibration_dataset, + shuffle=False, + batch_size=batch_size, + drop_last=False, + num_workers=workers, + ) + return calibration_dataloader \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/igie/ci/prepare.sh b/models/cv/object_detection/yolov5s/igie/ci/prepare.sh new file mode 100644 index 00000000..b53ca6d1 --- /dev/null +++ b/models/cv/object_detection/yolov5s/igie/ci/prepare.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x + +pip3 install -r ../../ixrt_common/requirements.txt + +mkdir -p checkpoints +cp -r /mnt/deepspark/data/3rd_party/yolov5 ./ + +cd yolov5/ + +# 有一些环境需要安装 +# wget https://ultralytics.com/assets/Arial.ttf +mkdir -p /root/.config/Ultralytics +cp /mnt/deepspark/data/3rd_party/Arial.ttf /root/.config/Ultralytics/Arial.ttf + +ln -s /mnt/deepspark/data/checkpoints/yolov5s.pt ./ +# 转换为onnx (具体实现可以参考 export.py 中的 export_onnx 函数) +python3 export.py --weights yolov5s.pt --include onnx --opset 11 --batch-size 32 +mv yolov5s.onnx ../checkpoints +cd .. diff --git a/models/cv/object_detection/yolov5s/igie/coco_labels.py b/models/cv/object_detection/yolov5s/igie/coco_labels.py new file mode 100644 index 00000000..69d38878 --- /dev/null +++ b/models/cv/object_detection/yolov5s/igie/coco_labels.py @@ -0,0 +1,89 @@ +labels = [ + "person", + "bicycle", + "car", + "motorcycle", + "airplane", + "bus", + "train", + "truck", + "boat", + "traffic light", + "fire hydrant", + "stop sign", + "parking meter", + "bench", + "bird", + "cat", + "dog", + "horse", + "sheep", + "cow", + "elephant", + "bear", + "zebra", + "giraffe", + "backpack", + "umbrella", + "handbag", + "tie", + "suitcase", + "frisbee", + "skis", + "snowboard", + "sports ball", + "kite", + "baseball bat", + "baseball glove", + "skateboard", + "surfboard", + "tennis racket", + "bottle", + "wine glass", + "cup", + "fork", + "knife", + "spoon", + "bowl", + "banana", + "apple", + "sandwich", + "orange", + "broccoli", + "carrot", + "hot dog", + "pizza", + "donut", + "cake", + "chair", + "couch", + "potted plant", + "bed", + "dining table", + "toilet", + "tv", + "laptop", + "mouse", + "remote", + "keyboard", + "cell phone", + "microwave", + "oven", + "toaster", + "sink", + "refrigerator", + "book", + "clock", + "vase", + "scissors", + "teddy bear", + "hair drier", + "toothbrush", +] +def coco80_to_coco91_class(): # converts 80-index (val2014) to 91-index (paper) + return [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90] + +__all__ = ["labels"] diff --git a/models/cv/object_detection/yolov5s/igie/common.py b/models/cv/object_detection/yolov5s/igie/common.py new file mode 100644 index 00000000..5f543555 --- /dev/null +++ b/models/cv/object_detection/yolov5s/igie/common.py @@ -0,0 +1,86 @@ +import numpy as np +from tqdm import tqdm + +import tensorrt +import cuda.cuda as cuda +import cuda.cudart as cudart + +# input : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)] +# output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)] +def box_class85to6(input): + center_x_y = input[:, :2] + side = input[:, 2:4] + conf = input[:, 4:5] + class_id = np.argmax(input[:, 5:], axis = -1) 
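+    # The +1 below makes the class id 1-based; save2json later maps it back to
+    # the 91-class COCO category ids via class_trans[c - 1].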
+    class_id = class_id.astype(np.float32).reshape(-1, 1) + 1
+    max_prob = np.max(input[:, 5:], axis = -1).reshape(-1, 1)
+    x1_y1 = center_x_y - 0.5 * side
+    x2_y2 = center_x_y + 0.5 * side
+    nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1)
+    return nms_input
+
+def save2json(batch_img_id, pred_boxes, json_result, class_trans):
+    for i, boxes in enumerate(pred_boxes):
+        if boxes is not None:
+            image_id = int(batch_img_id[i])
+            # image has no target
+            if image_id == -1:
+                continue
+            for x, y, w, h, c, p in boxes:
+                x, y, w, h, p = float(x), float(y), float(w), float(h), float(p)
+                c = int(c)
+                json_result.append(
+                    {
+                        "image_id": image_id,
+                        "category_id": class_trans[c - 1],
+                        "bbox": [x, y, w, h],
+                        "score": p,
+                    }
+                )
+
+def create_engine_context(engine_path, logger):
+    with open(engine_path, "rb") as f:
+        runtime = tensorrt.Runtime(logger)
+        assert runtime
+        engine = runtime.deserialize_cuda_engine(f.read())
+        assert engine
+        context = engine.create_execution_context()
+        assert context
+
+    return engine, context
+
+def get_io_bindings(engine):
+    # Setup I/O bindings
+    inputs = []
+    outputs = []
+    allocations = []
+
+    for i in range(engine.num_bindings):
+        is_input = False
+        if engine.binding_is_input(i):
+            is_input = True
+        name = engine.get_binding_name(i)
+        dtype = engine.get_binding_dtype(i)
+        shape = engine.get_binding_shape(i)
+        if is_input:
+            batch_size = shape[0]
+        size = np.dtype(tensorrt.nptype(dtype)).itemsize
+        for s in shape:
+            size *= s
+        err, allocation = cudart.cudaMalloc(size)
+        # cudaMalloc returns a cudart error code, so compare against cudaSuccess
+        assert err == cudart.cudaError_t.cudaSuccess
+        binding = {
+            "index": i,
+            "name": name,
+            "dtype": np.dtype(tensorrt.nptype(dtype)),
+            "shape": list(shape),
+            "allocation": allocation,
+            "nbytes": size,
+        }
+        print(f"binding {i}, name : {name}  dtype : {np.dtype(tensorrt.nptype(dtype))}  shape : {list(shape)}")
+        allocations.append(allocation)
+        if engine.binding_is_input(i):
+            inputs.append(binding)
+        else:
+            outputs.append(binding)
+    return inputs, outputs, allocations
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/igie/config/YOLOV5S_CONFIG b/models/cv/object_detection/yolov5s/igie/config/YOLOV5S_CONFIG
new file mode 100644
index 00000000..c3f46cf8
--- /dev/null
+++ b/models/cv/object_detection/yolov5s/igie/config/YOLOV5S_CONFIG
@@ -0,0 +1,49 @@
+# BSZ             : batch size used when building the engine and at inference time
+# IMGSIZE         : model input height/width
+# RUN_MODE        : [FPS, MAP]
+# PRECISION       : [float16, int8]
+# MODEL_NAME      : basename of the generated onnx/engine files
+# ORIGINE_MODEL   : original onnx file
+# COCO_GT         : COCO evaluation annotation file
+# DATASET_DIR     : dataset path for quantization/inference
+# CHECKPOINTS_DIR : output directory for the generated onnx/engine files
+# LAYER_FUSION    : run the decoder part with fused operators; 0 = unfused, 1 = fused
+# DECODER_FASTER  : two fused implementations exist; the faster one can feed the GPU NMS directly, while the other keeps its output identical to the onnx. 1 selects the faster one
+IMGSIZE=640
+MODEL_NAME=yolov5s
+ORIGINE_MODEL=yolov5s.onnx
+DATA_PROCESS_TYPE=yolov5
+MODEL_INPUT_NAMES=(images)
+
+LAYER_FUSION=1
+DECODER_FASTER=1
+DECODER_NUM_CLASS=80
+DECODER_INPUT_NAMES=(/model.24/m.0/Conv_output_0 /model.24/m.1/Conv_output_0 /model.24/m.2/Conv_output_0)
+DECODER_8_ANCHOR=(10 13 16 30 33 23)
+DECODER_16_ANCHOR=(30 61 62 45 59 119)
+DECODER_32_ANCHOR=(116 90 156 198 373 326)
+
+# NMS CONFIG
+    # IOU_THRESH      : IoU threshold
+    # SCORE_THRESH    : bbox confidence threshold
+    # MAX_BOX_PRE_IMG : upper bound on predicted bboxes per image
+    # ALL_BOX_NUM     : number of boxes NMS receives per image
+    # NMS_TYPE        : GPU/CPU(TODO)
+IOU_THRESH=0.6
+SCORE_THRESH=0.001
+MAX_BOX_PRE_IMG=1000
+ALL_BOX_NUM=25200
+NMS_TYPE=GPU
+
+# QUANT CONFIG (only takes effect when PRECISION is int8)
+    # QUANT_OBSERVER   : quantization strategy, one of [hist_percentile, percentile, minmax, entropy, ema]
+    # QUANT_BATCHSIZE  : dataloader batch size for quantization; best kept equal to the onnx batch size, otherwise some ops (e.g. Reshape) may infer wrong shapes
+    # QUANT_STEP       : number of quantization steps
+    # QUANT_SEED       : random seed, to keep the quantization result reproducible
+    # QUANT_EXIST_ONNX : set this when a quantized model from another source is provided
+QUANT_OBSERVER=hist_percentile
+QUANT_BATCHSIZE=1
+QUANT_STEP=32
+QUANT_SEED=42
+DISABLE_QUANT_LIST=()
+QUANT_EXIST_ONNX=
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/igie/cut_model.py b/models/cv/object_detection/yolov5s/igie/cut_model.py
new file mode 100644
index 00000000..af0a3a4f
--- /dev/null
+++ b/models/cv/object_detection/yolov5s/igie/cut_model.py
@@ -0,0 +1,16 @@
+import onnx
+import argparse
+from onnxsim import simplify
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_model", type=str)
+    parser.add_argument("--output_model", type=str)
+    parser.add_argument("--input_names", nargs='+', type=str)
+    parser.add_argument("--output_names", nargs='+', type=str)
+    args = parser.parse_args()
+    return args
+
+args = parse_args()
+onnx.utils.extract_model(args.input_model, args.output_model, args.input_names, args.output_names)
+print("Cut Model Done.")
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/igie/datasets/__init__.py b/models/cv/object_detection/yolov5s/igie/datasets/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/models/cv/object_detection/yolov5s/igie/datasets/coco.py b/models/cv/object_detection/yolov5s/igie/datasets/coco.py
new file mode 100644
index 00000000..7f355b84
--- /dev/null
+++ b/models/cv/object_detection/yolov5s/igie/datasets/coco.py
@@ -0,0 +1,116 @@
+import os.path
+from typing import Any, Callable, List, Optional, Tuple
+
+import cv2
+
+from .vision import VisionDataset
+from .pre_process import get_post_process
+
+class CocoDetection(VisionDataset):
+    """`MS Coco Detection <https://cocodataset.org/#detection-2016>`_ Dataset.
+
+    It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.
+
+    Args:
+        root (string): Root directory where images are downloaded to.
+        annFile (string): Path to json annotation file.
+        transform (callable, optional): A function/transform that takes in an PIL image
+            and returns a transformed version. E.g, ``transforms.PILToTensor``
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        transforms (callable, optional): A function/transform that takes input sample and its target as entry
+            and returns a transformed version.
+ """ + + def __init__( + self, + root: str, + annFile: str, + img_size: int, + data_process_type: str, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + transforms: Optional[Callable] = None, + + ) -> None: + super().__init__(root, transforms, transform, target_transform) + from pycocotools.coco import COCO + + self.coco = COCO(annFile) + self.ids = list(sorted(self.coco.imgs.keys())) + self.img_size = img_size + + self.transforms = get_post_process(data_process_type) + + def _load_image(self, id: int): + path = self.coco.loadImgs(id)[0]["file_name"] + data = cv2.imread(os.path.join(self.root, path)) + return data + + def _load_target(self, id: int) -> List[Any]: + return self.coco.loadAnns(self.coco.getAnnIds(id)) + + def __getitem__(self, index: int) -> Tuple[Any, Any]: + id = self.ids[index] + image = self._load_image(id) + target = self._load_target(id) + origin_shape = image.shape[:2] + + if self.transforms is not None: + image = self.transforms(image, self.img_size) + + if len(target) > 0: + image_id = target[0]["image_id"] + else: + # have no target + image_id = -1 + return image, origin_shape, image_id + + def __len__(self) -> int: + return len(self.ids) + + +class CocoCaptions(CocoDetection): + """`MS Coco Captions `_ Dataset. + + It requires the `COCO API to be installed `_. + + Args: + root (string): Root directory where images are downloaded to. + annFile (string): Path to json annotation file. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.PILToTensor`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + transforms (callable, optional): A function/transform that takes input sample and its target as entry + and returns a transformed version. + + Example: + + .. 
code:: python + + import torchvision.datasets as dset + import torchvision.transforms as transforms + cap = dset.CocoCaptions(root = 'dir where images are', + annFile = 'json annotation file', + transform=transforms.PILToTensor()) + + print('Number of samples: ', len(cap)) + img, target = cap[3] # load 4th sample + + print("Image Size: ", img.size()) + print(target) + + Output: :: + + Number of samples: 82783 + Image Size: (3L, 427L, 640L) + [u'A plane emitting smoke stream flying over a mountain.', + u'A plane darts across a bright blue sky behind a mountain covered in snow', + u'A plane leaves a contrail above the snowy mountain top.', + u'A mountain that has a plane flying overheard in the distance.', + u'A mountain view with a plume of smoke in the background'] + + """ + + def _load_target(self, id: int) -> List[str]: + return [ann["caption"] for ann in super()._load_target(id)] diff --git a/models/cv/object_detection/yolov5s/igie/datasets/common.py b/models/cv/object_detection/yolov5s/igie/datasets/common.py new file mode 100644 index 00000000..e120e00f --- /dev/null +++ b/models/cv/object_detection/yolov5s/igie/datasets/common.py @@ -0,0 +1,66 @@ +import cv2 +import math +import numpy as np + +def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32): + # Resize and pad image while meeting stride-multiple constraints + shape = im.shape[:2] # current shape [height, width] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not scaleup: # only scale down, do not scale up (for better val mAP) + r = min(r, 1.0) + + # Compute padding + ratio = r, r # width, height ratios + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding + if auto: # minimum rectangle + dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding + elif scaleFill: # stretch + dw, dh = 0.0, 0.0 + new_unpad = (new_shape[1], new_shape[0]) + ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios + + dw /= 2 # divide padding into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # resize + im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border + return im, ratio, (dw, dh) + +def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False): + # Rescale boxes (xyxy) from net_shape to ori_shape + + if use_letterbox: + + gain = min( + net_shape[0] / ori_shape[0], net_shape[1] / ori_shape[1] + ) # gain = new / old + pad = (net_shape[1] - ori_shape[1] * gain) / 2, ( + net_shape[0] - ori_shape[0] * gain + ) / 2.0 + + boxes[:, [0, 2]] -= pad[0] # x padding + boxes[:, [1, 3]] -= pad[1] # y padding + boxes[:, :4] /= gain + else: + x_scale, y_scale = net_shape[1] / ori_shape[1], net_shape[0] / ori_shape[0] + + boxes[:, 0] /= x_scale + boxes[:, 1] /= y_scale + boxes[:, 2] /= x_scale + boxes[:, 3] /= y_scale + + clip_boxes(boxes, ori_shape) + return boxes + +def clip_boxes(boxes, shape): + + boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2 + boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2 \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/igie/datasets/post_process.py 
b/models/cv/object_detection/yolov5s/igie/datasets/post_process.py new file mode 100644 index 00000000..a58c02f8 --- /dev/null +++ b/models/cv/object_detection/yolov5s/igie/datasets/post_process.py @@ -0,0 +1,115 @@ +import cv2 +import math +import numpy as np + +from .common import letterbox, scale_boxes, clip_boxes + +def get_post_process(data_process_type): + if data_process_type == "yolov5": + return Yolov5Postprocess + elif data_process_type == "yolov3": + return Yolov3Postprocess + elif data_process_type == "yolox": + return YoloxPostprocess + return None + +def Yolov3Postprocess( + ori_img_shape, + imgsz, + box_datas, + box_nums, + sample_num, + max_det=1000, +): + all_box = [] + data_offset = 0 + + box_datas = box_datas.flatten() + box_nums = box_nums.flatten() + + for i in range(sample_num): + box_num = box_nums[i] + if box_num == 0: + boxes = None + else: + cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6) + boxes = scale_boxes( + (imgsz[0], imgsz[1]), + cur_box, + (ori_img_shape[0][i], ori_img_shape[1][i]), + use_letterbox=False + ) + # xyxy2xywh + boxes[:, 2] -= boxes[:, 0] + boxes[:, 3] -= boxes[:, 1] + + all_box.append(boxes) + data_offset += max_det * 6 + + return all_box + +def Yolov5Postprocess( + ori_img_shape, + imgsz, + box_datas, + box_nums, + sample_num, + max_det=1000, +): + all_box = [] + data_offset = 0 + + box_datas = box_datas.flatten() + box_nums = box_nums.flatten() + + for i in range(sample_num): + box_num = box_nums[i] + if box_num == 0: + boxes = None + else: + cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6) + boxes = scale_boxes( + (imgsz[0], imgsz[1]), + cur_box, + (ori_img_shape[0][i], ori_img_shape[1][i]), + use_letterbox=True + ) + # xyxy2xywh + boxes[:, 2] -= boxes[:, 0] + boxes[:, 3] -= boxes[:, 1] + + all_box.append(boxes) + data_offset += max_det * 6 + + return all_box + +def YoloxPostprocess( + ori_img_shape, + imgsz, + box_datas, + box_nums, + sample_num, + max_det=1000, +): + all_box = [] + data_offset = 0 + box_datas = box_datas.flatten() + box_nums = box_nums.flatten() + + for i in range(sample_num): + box_num = box_nums[i] + if box_num == 0: + boxes = None + else: + boxes = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6) + r = min(imgsz[0]/ori_img_shape[0][i], imgsz[1]/ori_img_shape[1][i]) + boxes[:, :4] /= r + # xyxy2xywh + boxes[:, 2] -= boxes[:, 0] + boxes[:, 3] -= boxes[:, 1] + clip_boxes(boxes, (ori_img_shape[0][i], ori_img_shape[1][i])) + + all_box.append(boxes) + data_offset += max_det * 6 + + return all_box \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/igie/datasets/pre_process.py b/models/cv/object_detection/yolov5s/igie/datasets/pre_process.py new file mode 100644 index 00000000..8cc643a8 --- /dev/null +++ b/models/cv/object_detection/yolov5s/igie/datasets/pre_process.py @@ -0,0 +1,56 @@ +import cv2 +import math +import numpy as np + +from .common import letterbox + +def get_post_process(data_process_type): + if data_process_type == "yolov5": + return Yolov5Preprocess + elif data_process_type == "yolov3": + return Yolov3Preprocess + elif data_process_type == "yolox": + return YoloxPreprocess + return None + +def Yolov3Preprocess(image, img_size): + + h0, w0 = image.shape[:2] # orig hw + r = img_size / max(h0, w0) # ratio + + image = cv2.resize(image, (img_size, img_size)) + image = image.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB + image = np.ascontiguousarray(image).astype(np.float32) / 255.0 # 0~1 np array + return image + 
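+# Note: unlike Yolov3Preprocess above, Yolov5Preprocess below preserves the
+# aspect ratio: it scales the long side to img_size and letterboxes the rest
+# with gray (114) padding instead of stretching to img_size x img_size.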
+def Yolov5Preprocess(image, img_size, augment=False): + + h0, w0 = image.shape[:2] # orig hw + r = img_size / max(h0, w0) # ratio + + if r != 1: # if sizes are not equal + interp = cv2.INTER_LINEAR if (augment or r > 1) else cv2.INTER_AREA + image = cv2.resize(image, (math.ceil(w0 * r), math.ceil(h0 * r)), interpolation=interp) + + # shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size rect == True + + image, ratio, dwdh = letterbox(image, new_shape=img_size, auto=False, scaleup=False) + image = image.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB + image = np.ascontiguousarray(image).astype(np.float32) / 255.0 # 0~1 np array + return image + +def YoloxPreprocess(img, img_size, swap=(2,0,1)): + + padded_img = np.ones((img_size, img_size, 3), dtype=np.uint8) * 114 + r = min(img_size / img.shape[0], img_size / img.shape[1]) + resized_img = cv2.resize( + img, + (int(img.shape[1] * r), int(img.shape[0] * r)), + interpolation=cv2.INTER_LINEAR, + ).astype(np.uint8) + + padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img + padded_img = padded_img.transpose(swap) + padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) + + return padded_img \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/igie/datasets/vision.py b/models/cv/object_detection/yolov5s/igie/datasets/vision.py new file mode 100644 index 00000000..32da4a78 --- /dev/null +++ b/models/cv/object_detection/yolov5s/igie/datasets/vision.py @@ -0,0 +1,136 @@ +import os +from typing import Any, Callable, List, Optional, Tuple + +import torch +import torch.utils.data as data + +from types import FunctionType + +def _log_api_usage_once(obj: Any) -> None: + + """ + Logs API usage(module and name) within an organization. + In a large ecosystem, it's often useful to track the PyTorch and + TorchVision APIs usage. This API provides the similar functionality to the + logging module in the Python stdlib. It can be used for debugging purpose + to log which methods are used and by default it is inactive, unless the user + manually subscribes a logger via the `SetAPIUsageLogger method `_. + Please note it is triggered only once for the same API call within a process. + It does not collect any data from open-source users since it is no-op by default. + For more information, please refer to + * PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging; + * Logging policy: https://github.com/pytorch/vision/issues/5052; + + Args: + obj (class instance or method): an object to extract info from. + """ + module = obj.__module__ + if not module.startswith("torchvision"): + module = f"torchvision.internal.{module}" + name = obj.__class__.__name__ + if isinstance(obj, FunctionType): + name = obj.__name__ + torch._C._log_api_usage_once(f"{module}.{name}") + +class VisionDataset(data.Dataset): + """ + Base Class For making datasets which are compatible with torchvision. + It is necessary to override the ``__getitem__`` and ``__len__`` method. + + Args: + root (string): Root directory of dataset. + transforms (callable, optional): A function/transforms that takes in + an image and a label and returns the transformed versions of both. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.RandomCrop`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + + .. 
note:: + + :attr:`transforms` and the combination of :attr:`transform` and :attr:`target_transform` are mutually exclusive. + """ + + _repr_indent = 4 + + def __init__( + self, + root: str, + transforms: Optional[Callable] = None, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + ) -> None: + _log_api_usage_once(self) + if isinstance(root, str): + root = os.path.expanduser(root) + self.root = root + + has_transforms = transforms is not None + has_separate_transform = transform is not None or target_transform is not None + if has_transforms and has_separate_transform: + raise ValueError("Only transforms or transform/target_transform can be passed as argument") + + # for backwards-compatibility + self.transform = transform + self.target_transform = target_transform + + if has_separate_transform: + transforms = StandardTransform(transform, target_transform) + self.transforms = transforms + + def __getitem__(self, index: int) -> Any: + """ + Args: + index (int): Index + + Returns: + (Any): Sample and meta data, optionally transformed by the respective transforms. + """ + raise NotImplementedError + + def __len__(self) -> int: + raise NotImplementedError + + def __repr__(self) -> str: + head = "Dataset " + self.__class__.__name__ + body = [f"Number of datapoints: {self.__len__()}"] + if self.root is not None: + body.append(f"Root location: {self.root}") + body += self.extra_repr().splitlines() + if hasattr(self, "transforms") and self.transforms is not None: + body += [repr(self.transforms)] + lines = [head] + [" " * self._repr_indent + line for line in body] + return "\n".join(lines) + + def _format_transform_repr(self, transform: Callable, head: str) -> List[str]: + lines = transform.__repr__().splitlines() + return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]] + + def extra_repr(self) -> str: + return "" + + +class StandardTransform: + def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None) -> None: + self.transform = transform + self.target_transform = target_transform + + def __call__(self, input: Any, target: Any) -> Tuple[Any, Any]: + if self.transform is not None: + input = self.transform(input) + if self.target_transform is not None: + target = self.target_transform(target) + return input, target + + def _format_transform_repr(self, transform: Callable, head: str) -> List[str]: + lines = transform.__repr__().splitlines() + return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]] + + def __repr__(self) -> str: + body = [self.__class__.__name__] + if self.transform is not None: + body += self._format_transform_repr(self.transform, "Transform: ") + if self.target_transform is not None: + body += self._format_transform_repr(self.target_transform, "Target transform: ") + + return "\n".join(body) diff --git a/models/cv/object_detection/yolov5s/igie/deploy.py b/models/cv/object_detection/yolov5s/igie/deploy.py new file mode 100644 index 00000000..ec56b7ab --- /dev/null +++ b/models/cv/object_detection/yolov5s/igie/deploy.py @@ -0,0 +1,134 @@ +# !/usr/bin/env python +# -*- coding: utf-8 -*- +import argparse +from tensorrt.deploy.api import GraphTransform, create_source, create_target + +class Transform: + def __init__(self, graph): + self.t = GraphTransform(graph) + self.graph = graph + + def ReplaceFocus(self, input_edge, outputs, to_op): + input_var = self.graph.get_variable(input_edge) + op = self.graph.get_operator(to_op) + 
self.t.delete_operators_between_var_op( + from_var=input_var, to_op=op + ) + self.t.make_operator( + "Focus", inputs=input_edge, outputs=outputs + ) + return self.graph + + def AddYoloDecoderOp(self, inputs: list, outputs: list, op_type, **attributes): + if attributes["anchor"] is None: + del attributes["anchor"] + self.t.make_operator( + op_type, inputs=inputs, outputs=outputs, **attributes + ) + return self.graph + + def AddConcatOp(self, inputs: list, outputs, **attributes): + self.t.make_operator( + "Concat", inputs=inputs, outputs=outputs, **attributes + ) + return self.graph + +def customize_ops(graph, args): + t = Transform(graph) + fuse_focus = args.focus_input is not None and args.focus_output is not None and args.focus_last_node is not None + if fuse_focus: + graph = t.ReplaceFocus( + input_edge=args.focus_input, + outputs=args.focus_output, + to_op=args.focus_last_node + ) + decoder_input = args.decoder_input_names + num = len(decoder_input) // 3 + graph = t.AddYoloDecoderOp( + inputs=decoder_input[:num], + outputs=["decoder_8"], + op_type=args.decoder_type, + anchor=args.decoder8_anchor, + num_class=args.num_class, + stride=8, + faster_impl=args.faster + ) + graph = t.AddYoloDecoderOp( + inputs=decoder_input[num:num*2], + outputs=["decoder_16"], + op_type=args.decoder_type, + anchor=args.decoder16_anchor, + num_class=args.num_class, + stride=16, + faster_impl=args.faster + ) + graph = t.AddYoloDecoderOp( + inputs=decoder_input[num*2:num*2+1], + outputs=["decoder_32"], + op_type=args.decoder_type, + anchor=args.decoder32_anchor, + num_class=args.num_class, + stride=32, + faster_impl=args.faster + ) + if args.decoder64_anchor is not None: + graph = t.AddYoloDecoderOp( + inputs=decoder_input[num*2+1:], + outputs=["decoder_64"], + op_type=args.decoder_type, + anchor=args.decoder64_anchor, + num_class=args.num_class, + stride=64, + faster_impl=args.faster + ) + graph = t.AddConcatOp( + inputs=["decoder_8", "decoder_16", "decoder_32", "decoder_64"], + outputs=["output"], + axis=1 + ) + elif args.with_nms: + graph = t.AddConcatOp( + inputs=["decoder_32", "decoder_16", "decoder_8"], + outputs=["output"], + axis=1 + ) + + graph.outputs.clear() + graph.add_output("output") + graph.outputs["output"].dtype = "FLOAT" + else: + graph.outputs.clear() + graph.add_output("decoder_8") + graph.outputs["decoder_8"].dtype = "FLOAT" + graph.add_output("decoder_16") + graph.outputs["decoder_16"].dtype = "FLOAT" + graph.add_output("decoder_32") + graph.outputs["decoder_32"].dtype = "FLOAT" + return graph + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--src", type=str) + parser.add_argument("--dst", type=str) + parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"]) + parser.add_argument("--with_nms", type=bool, default=False, help="engine with nms") + parser.add_argument("--decoder_input_names", nargs='+', type=str) + parser.add_argument("--decoder8_anchor", nargs='*', type=int) + parser.add_argument("--decoder16_anchor", nargs='*', type=int) + parser.add_argument("--decoder32_anchor", nargs='*', type=int) + parser.add_argument("--decoder64_anchor", nargs='*', type=int, default=None) + parser.add_argument("--num_class", type=int, default=80) + parser.add_argument("--faster", type=int, default=1) + parser.add_argument("--focus_input", type=str, default=None) + parser.add_argument("--focus_output", type=str, default=None) + parser.add_argument("--focus_last_node", type=str, default=None) + args = 
parser.parse_args() + return args + +if __name__ == "__main__": + + args = parse_args() + graph = create_source(args.src)() + graph = customize_ops(graph, args) + create_target(saved_path=args.dst).export(graph) + print("Surged onnx lies on", args.dst) \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/igie/inference.py b/models/cv/object_detection/yolov5s/igie/inference.py new file mode 100644 index 00000000..a514ddb9 --- /dev/null +++ b/models/cv/object_detection/yolov5s/igie/inference.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import argparse +import glob +import json +import os +import time +import sys + +import torch +import numpy as np +import cuda.cuda as cuda +import cuda.cudart as cudart + +from coco_labels import coco80_to_coco91_class, labels +from common import save2json, box_class85to6 +from common import create_engine_context, get_io_bindings +from calibration_dataset import create_dataloaders +from datasets.post_process import get_post_process + +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval +from tqdm import tqdm +from tqdm.contrib import tzip + +import tensorrt + +from load_ixrt_plugin import load_ixrt_plugin +load_ixrt_plugin() + + +import tvm +from tvm.contrib import graph_executor + +def init_by_igie(engine_path): + device = tvm.device("iluvatar", 0) + lib = tvm.runtime.load_module(engine_path) + module = graph_executor.GraphModule(lib["default"](device)) + # engine, context = module.engine, module.context + # inputs, outputs, allocations = module.inputs, module.outputs, module.allocations + return module + +def igie_infer(module, batch_data): + # set input + module.set_input(module.inputs[0]["name"], batch_data) + ### infer model + module.run() + # get output data + output = module.get_output(0) + return output + + +def main(config): + + # Load dataloader + dataloader = create_dataloaders( + data_path=config.eval_dir, + annFile=config.coco_gt, + img_sz=config.imgsz, + batch_size=config.bsz, + step=config.loop_count, + data_process_type=config.data_process_type + ) + + # Load post process func + if config.test_mode == "MAP": + post_process_func = get_post_process(config.data_process_type) + + bsz = config.bsz + num_samples = 5000 + if config.loop_count > 0: + num_samples = bsz * config.loop_count + num_batch = len(dataloader) + print("=" * 30) + print(f"Test Mode : {'Asynchronous' if config.use_async else 'Synchronous'}") + print(f"Total sample : {num_samples}\nBatch_size : {bsz}\nRun Batch : {num_batch}") + print("=" * 30) + + json_result = [] + forward_time = 0.0 + class_map = coco80_to_coco91_class() + + + # Load Engine + module = init_by_igie(config.model_engine) + + + # Load nms_engine + if config.test_mode == "MAP" and config.nms_type == "GPU": + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + nms_engine, nms_context = create_engine_context(config.nms_engine, logger) + nms_inputs, nms_outputs, nms_allocations = get_io_bindings(nms_engine) + nms_output0 = np.zeros(nms_outputs[0]["shape"], nms_outputs[0]["dtype"]) + nms_output1 = np.zeros(nms_outputs[1]["shape"], nms_outputs[1]["dtype"]) + print(f"nms_output0 shape : {nms_output0.shape} nms_output0 type : {nms_output0.dtype}") + print(f"nms_output1 shape : {nms_output1.shape} nms_output1 type : {nms_output1.dtype}") + + # Warm up + if config.warm_up > 0: + print("\nWarm Start.") + for i in range(config.warm_up): + module.run() + print("Warm Done.") + + for batch_data, batch_img_shape, batch_img_id in tqdm(dataloader): + batch_data 
= batch_data.numpy() + batch_img_shape = [batch_img_shape[0].numpy(), batch_img_shape[1].numpy()] + # batch_img_id = batch_img_id.numpy() + cur_bsz_sample = batch_data.shape[0] + + if config.test_mode == "MAP": + # Fetch output + output = igie_infer(module, batch_data) + + # Step 1 : prepare data to nms + _, box_num, box_unit = output.shape + if config.debug: + print(f"[Debug] box_num(25200) : {box_num}, box_unit(6) : {box_unit}") + + if config.decoder_faster == 0: + nms_input = box_class85to6(output.reshape(-1, box_unit)) + else: + nms_input = output + + # Step 2 : nms + # cpu nms(TODO) + + # gpu nms + if config.nms_type == "GPU": + + # Set nms input + err, = cuda.cuMemcpyHtoD(nms_inputs[0]["allocation"], nms_input, nms_input.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + nms_context.execute_v2(nms_allocations) + err, = cuda.cuMemcpyDtoH(nms_output0, nms_outputs[0]["allocation"], nms_outputs[0]["nbytes"]) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyDtoH(nms_output1, nms_outputs[1]["allocation"], nms_outputs[1]["nbytes"]) + assert(err == cuda.CUresult.CUDA_SUCCESS) + + # Step 3 : post process + save + pred_boxes = post_process_func( + ori_img_shape=batch_img_shape, + imgsz=(config.imgsz, config.imgsz), + box_datas=nms_output0, + box_nums=nms_output1, + sample_num=cur_bsz_sample, + max_det=config.max_det + ) + save2json(batch_img_id, pred_boxes, json_result, class_map) + + # fps = num_samples / forward_time + + if config.test_mode == "FPS": + start_time = time.time() + for i in range(config.loop_count): + module.run() + end_time = time.time() + forward_time = end_time - start_time + fps = (config.loop_count*config.bsz) / forward_time + print("FPS : ", fps) + print(f"Performance Check : Test {fps} >= target {config.fps_target}") + if fps >= config.fps_target: + print("pass!") + exit() + else: + print("failed!") + exit(10) + + if config.test_mode == "MAP": + if len(json_result) == 0: + print("Predict zero box!") + exit(10) + + if not os.path.exists(config.pred_dir): + os.makedirs(config.pred_dir) + + pred_json = os.path.join( + config.pred_dir, f"{config.model_name}_{config.precision}_preds.json" + ) + with open(pred_json, "w") as f: + json.dump(json_result, f) + + anno_json = config.coco_gt + anno = COCO(anno_json) # init annotations api + pred = anno.loadRes(pred_json) # init predictions api + eval = COCOeval(anno, pred, "bbox") + + eval.evaluate() + eval.accumulate() + print( + f"==============================eval {config.model_name} {config.precision} coco map ==============================" + ) + eval.summarize() + + map, map50 = eval.stats[:2] + print("MAP@0.5 : ", map50) + print(f"Accuracy Check : Test {map50} >= target {config.map_target}") + if map50 >= config.map_target: + print("pass!") + exit() + else: + print("failed!") + exit(10) + +def parse_config(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", type=str, default="YOLOV5s", help="YOLOV3 YOLOV5 YOLOV7 YOLOX" + ) + parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8", + help="The precision of datatype") + parser.add_argument("--test_mode", type=str, default="FPS", help="FPS MAP") + parser.add_argument( + "--model_engine", + type=str, + default="", + help="model engine path", + ) + parser.add_argument( + "--nms_engine", + type=str, + default="", + help="nms engine path", + ) + parser.add_argument( + "--coco_gt", + type=str, + default="data/datasets/cv/coco2017/annotations/instances_val2017.json", + help="coco 
instances_val2017.json", + ) + parser.add_argument("--warm_up", type=int, default=3, help="warm_up count") + parser.add_argument("--loop_count", type=int, default=-1, help="loop count") + parser.add_argument( + "--eval_dir", + type=str, + default="data/datasets/cv/coco2017/val2017", + help="coco image dir", + ) + parser.add_argument("--bsz", type=int, default=32, help="test batch size") + parser.add_argument( + "--imgsz", + "--img", + "--img-size", + type=int, + default=640, + help="inference size h,w", + ) + parser.add_argument("--max_det", type=int, default=1000, help="maximum detections per image") + parser.add_argument("--data_process_type", type=str, default="none") + parser.add_argument("--use_async", action="store_true") + parser.add_argument("--debug", action="store_true") + parser.add_argument("--pred_dir", type=str, default=".", help="pred save json dirs") + parser.add_argument("--map_target", type=float, default=0.56, help="target mAP") + parser.add_argument("--fps_target", type=float, default=-1.0, help="target fps") + parser.add_argument("--decoder_faster", type=int, default=0, help="decoder faster can use gpu nms directly") + parser.add_argument("--nms_type", type=str, default="GPU", help="GPU/CPU") + + config = parser.parse_args() + print("config:", config) + return config + +if __name__ == "__main__": + config = parse_config() + main(config) \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/igie/load_ixrt_plugin.py b/models/cv/object_detection/yolov5s/igie/load_ixrt_plugin.py new file mode 100644 index 00000000..932efbdf --- /dev/null +++ b/models/cv/object_detection/yolov5s/igie/load_ixrt_plugin.py @@ -0,0 +1,12 @@ +import ctypes +import tensorrt +from os.path import join, dirname, exists +def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""): + if not dynamic_path: + dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so") + if not exists(dynamic_path): + raise FileNotFoundError( + f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!") + ctypes.CDLL(dynamic_path) + tensorrt.init_libnvinfer_plugins(logger, namespace) + print(f"Loaded plugin from {dynamic_path}") \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/igie/modify_batchsize.py b/models/cv/object_detection/yolov5s/igie/modify_batchsize.py new file mode 100644 index 00000000..00ed65dd --- /dev/null +++ b/models/cv/object_detection/yolov5s/igie/modify_batchsize.py @@ -0,0 +1,37 @@ +import onnx +import argparse + +def change_input_dim(model, bsz): + batch_size = bsz + + # The following code changes the first dimension of every input to be batch_size + # Modify as appropriate ... note that this requires all inputs to + # have the same batch_size + inputs = model.graph.input + for input in inputs: + # Checks omitted.This assumes that all inputs are tensors and have a shape with first dim. + # Add checks as needed. 
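+        # dim_param assigns a symbolic (dynamic) dimension, while dim_value
+        # pins the dimension to a concrete integer batch size.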
+        dim1 = input.type.tensor_type.shape.dim[0]
+        # update dim to be a symbolic value
+        if isinstance(batch_size, str):
+            # set dynamic batch size
+            dim1.dim_param = batch_size
+        elif isinstance(batch_size, int):
+            # set the given fixed batch size
+            dim1.dim_value = batch_size
+        else:
+            # fall back to a batch size of 1
+            dim1.dim_value = 1
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--batch_size", type=int)
+    parser.add_argument("--origin_model", type=str)
+    parser.add_argument("--output_model", type=str)
+    args = parser.parse_args()
+    return args
+
+args = parse_args()
+model = onnx.load(args.origin_model)
+change_input_dim(model, args.batch_size)
+onnx.save(model, args.output_model)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/igie/quant.py b/models/cv/object_detection/yolov5s/igie/quant.py
new file mode 100644
index 00000000..bcf5d9b6
--- /dev/null
+++ b/models/cv/object_detection/yolov5s/igie/quant.py
@@ -0,0 +1,52 @@
+import os
+import random
+import argparse
+import numpy as np
+from tensorrt.deploy import static_quantize
+
+import torch
+from calibration_dataset import create_dataloaders
+
+def setseed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_name", type=str)
+    parser.add_argument("--model", type=str, default="yolov5s_with_decoder.onnx")
+    parser.add_argument("--data_process_type", type=str, default="none")
+    parser.add_argument("--dataset_dir", type=str, default="./coco2017/val2017")
+    parser.add_argument("--ann_file", type=str, default="./coco2017/annotations/instances_val2017.json")
+    parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile")
+    parser.add_argument("--disable_quant_names", nargs='*', type=str)
+    parser.add_argument("--save_dir", type=str, help="save path", default=None)
+    parser.add_argument("--bsz", type=int, default=32)
+    parser.add_argument("--step", type=int, default=20)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--imgsz", type=int, default=640)
+    args = parser.parse_args()
+    return args
+
+args = parse_args()
+setseed(args.seed)
+model_name = args.model_name
+
+out_dir = args.save_dir
+dataloader = create_dataloaders(
+    data_path=args.dataset_dir,
+    annFile=args.ann_file,
+    img_sz=args.imgsz,
+    batch_size=args.bsz,
+    step=args.step,
+    data_process_type=args.data_process_type
+)
+# print("disable_quant_names : ", args.disable_quant_names)
+static_quantize(args.model,
+                calibration_dataloader=dataloader,
+                save_quant_onnx_path=os.path.join(out_dir, f"quantized_{model_name}.onnx"),
+                observer=args.observer,
+                data_preprocess=lambda x: x[0].to("cuda"),
+                quant_format="qdq",
+                disable_quant_names=args.disable_quant_names)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/igie/scripts/infer_yolov5s_fp16_accuracy.sh b/models/cv/object_detection/yolov5s/igie/scripts/infer_yolov5s_fp16_accuracy.sh
new file mode 100644
index 00000000..7090320b
--- /dev/null
+++ b/models/cv/object_detection/yolov5s/igie/scripts/infer_yolov5s_fp16_accuracy.sh
@@ -0,0 +1,209 @@
+#!/bin/bash
+
+EXIT_STATUS=0
+check_status()
+{
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+        [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    fi
+}
+
+# Run parameters
+BSZ=32
+WARM_UP=-1
+TGT=0.56
+LOOP_COUNT=-1
+RUN_MODE=MAP
+PRECISION=float16
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+      --bs) BSZ=${arguments[index]};;
+      --tgt) TGT=${arguments[index]};;
+    esac
+done
+
+PROJ_DIR=./
+DATASETS_DIR="${PROJ_DIR}/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/checkpoints"
+RUN_DIR="${PROJ_DIR}"
+CONFIG_DIR="${RUN_DIR}/config/YOLOV5S_CONFIG"
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
+step=0
+faster=0
+CURRENT_MODEL=${ORIGINE_MODEL}
+if [[ ${LAYER_FUSION} == 1 && ${DECODER_FASTER} == 1 ]];then
+    faster=1
+fi
+
+# Simplify Model
+let step++
+echo [STEP ${step}] : Simplify Model
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+if [ -f ${SIM_MODEL} ];then
+    echo " "Simplify Model skip, ${SIM_MODEL} already exists
+else
+    python3 ${RUN_DIR}/simplify_model.py \
+    --origin_model ${CURRENT_MODEL} \
+    --output_model ${SIM_MODEL}
+    echo " "Generate ${SIM_MODEL}
+fi
+CURRENT_MODEL=${SIM_MODEL}
+
+# Cut Decoder
+let step++
+echo [STEP ${step}] : Cut Decoder
+NO_DECODER_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_without_decoder.onnx
+if [ -f ${NO_DECODER_MODEL} ];then
+    echo " "Cut Decoder skip, ${NO_DECODER_MODEL} already exists
+else
+    python3 ${RUN_DIR}/cut_model.py \
+    --input_model ${CURRENT_MODEL} \
+    --output_model ${NO_DECODER_MODEL} \
+    --input_names ${MODEL_INPUT_NAMES[@]} \
+    --output_names ${DECODER_INPUT_NAMES[@]}
+fi
+CURRENT_MODEL=${NO_DECODER_MODEL}
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+    let step++
+    echo;
+    echo [STEP ${step}] : Quant Model
+    if [[ -z ${QUANT_EXIST_ONNX} ]];then
+        QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
+    fi
+    if [[ -f ${QUANT_EXIST_ONNX} ]];then
+        CURRENT_MODEL=${QUANT_EXIST_ONNX}
+        echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+    else
+        python3 ${RUN_DIR}/quant.py \
+            --model ${CURRENT_MODEL} \
+            --model_name ${MODEL_NAME} \
+            --dataset_dir ${EVAL_DIR} \
+            --ann_file ${COCO_GT} \
+            --data_process_type ${DATA_PROCESS_TYPE} \
+            --observer ${QUANT_OBSERVER} \
+            --disable_quant_names ${DISABLE_QUANT_LIST[@]} \
+            --save_dir $CHECKPOINTS_DIR \
+            --bsz ${QUANT_BATCHSIZE} \
+            --step ${QUANT_STEP} \
+            --seed ${QUANT_SEED} \
+            --imgsz ${IMGSIZE}
+        echo " "Generate ${QUANT_EXIST_ONNX}
+    fi
+    CURRENT_MODEL=${QUANT_EXIST_ONNX}
+fi
+
+# Add Decoder
+if [ $LAYER_FUSION == "1" ]; then
+    let step++
+    echo;
+    echo [STEP ${step}] : Add Decoder
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion_concat.onnx
+    if [ -f $FUSION_ONNX ];then
+        echo " "Add Decoder Skip, $FUSION_ONNX already exists
+    else
+        python3 ${RUN_DIR}/deploy.py \
+            --src ${CURRENT_MODEL} \
+            --dst ${FUSION_ONNX} \
+            --decoder_type YoloV5Decoder \
+            --with_nms True \
+            --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
+            --decoder8_anchor ${DECODER_8_ANCHOR[@]} \
+            --decoder16_anchor ${DECODER_16_ANCHOR[@]} \
+            --decoder32_anchor ${DECODER_32_ANCHOR[@]} \
+            --num_class ${DECODER_NUM_CLASS} \
+            --faster ${faster}
+    fi
+    CURRENT_MODEL=${FUSION_ONNX}
+fi
+
+# Change Batchsize
+let step++
+echo;
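+# Note: the batch size is baked into the ONNX graph in this step so the
+# engine built below sees one fixed input shape; using a different --bs
+# therefore regenerates the bs-specific model and engine files.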
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}_with_nms.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Change Batchsize Skip, $FINAL_MODEL already exists
+else
+    python3 ${RUN_DIR}/modify_batchsize.py \
+        --batch_size ${BSZ} \
+        --origin_model ${CURRENT_MODEL} \
+        --output_model ${FINAL_MODEL}
+    echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+    python3 ${RUN_DIR}/build_engine.py \
+        --precision ${PRECISION} \
+        --bsz ${BSZ} \
+        --model ${CURRENT_MODEL} \
+        --engine ${ENGINE_FILE}
+    echo " "Generate Engine ${ENGINE_FILE}
+fi
+if [[ ${RUN_MODE} == "MAP" && ${NMS_TYPE} == "GPU" ]];then
+    NMS_ENGINE=${CHECKPOINTS_DIR}/nms.engine
+    # Build NMS Engine
+    python3 ${RUN_DIR}/build_nms_engine.py \
+        --bsz ${BSZ} \
+        --path ${CHECKPOINTS_DIR} \
+        --all_box_num ${ALL_BOX_NUM} \
+        --max_box_pre_img ${MAX_BOX_PRE_IMG} \
+        --iou_thresh ${IOU_THRESH} \
+        --score_thresh ${SCORE_THRESH}
+fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+    --model_engine=${ENGINE_FILE} \
+    --nms_engine=${NMS_ENGINE} \
+    --coco_gt=${COCO_GT} \
+    --eval_dir=${EVAL_DIR} \
+    --data_process_type ${DATA_PROCESS_TYPE} \
+    --decoder_faster=${faster} \
+    --imgsz=${IMGSIZE} \
+    --warm_up=${WARM_UP} \
+    --loop_count ${LOOP_COUNT} \
+    --test_mode ${RUN_MODE} \
+    --model_name ${MODEL_NAME} \
+    --precision ${PRECISION} \
+    --pred_dir ${CHECKPOINTS_DIR} \
+    --map_target ${TGT} \
+    --max_det ${MAX_BOX_PRE_IMG} \
+    --nms_type ${NMS_TYPE} \
+    --bsz ${BSZ}; check_status
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/igie/scripts/infer_yolov5s_fp16_performance.sh b/models/cv/object_detection/yolov5s/igie/scripts/infer_yolov5s_fp16_performance.sh
new file mode 100644
index 00000000..35cc5785
--- /dev/null
+++ b/models/cv/object_detection/yolov5s/igie/scripts/infer_yolov5s_fp16_performance.sh
@@ -0,0 +1,209 @@
+#!/bin/bash
+
+EXIT_STATUS=0
+check_status()
+{
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+        [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    fi
+}
+
+# Run parameters
+BSZ=32
+WARM_UP=3
+TGT=840
+LOOP_COUNT=100
+RUN_MODE=FPS
+PRECISION=float16
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+      --bs) BSZ=${arguments[index]};;
+      --tgt) TGT=${arguments[index]};;
+    esac
+done
+
+PROJ_DIR=./
+DATASETS_DIR="${PROJ_DIR}/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/checkpoints"
+RUN_DIR="${PROJ_DIR}"
+CONFIG_DIR="${RUN_DIR}/config/YOLOV5S_CONFIG"
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
+step=0
+faster=0
+CURRENT_MODEL=${ORIGINE_MODEL}
+if [[ ${LAYER_FUSION} == 1 && ${DECODER_FASTER} == 1 ]];then
+    faster=1
+fi
+
+# Simplify Model
+let step++
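+# simplify_model.py runs onnx-simplifier (onnxsim.simplify) followed by
+# shape inference, so the later cut/fusion steps operate on a cleaner graph.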
+echo [STEP ${step}] : Simplify Model
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+if [ -f ${SIM_MODEL} ];then
+    echo " "Simplify Model skip, ${SIM_MODEL} already exists
+else
+    python3 ${RUN_DIR}/simplify_model.py \
+    --origin_model ${CURRENT_MODEL} \
+    --output_model ${SIM_MODEL}
+    echo " "Generate ${SIM_MODEL}
+fi
+CURRENT_MODEL=${SIM_MODEL}
+
+# Cut Decoder
+let step++
+echo [STEP ${step}] : Cut Decoder
+NO_DECODER_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_without_decoder.onnx
+if [ -f ${NO_DECODER_MODEL} ];then
+    echo " "Cut Decoder skip, ${NO_DECODER_MODEL} already exists
+else
+    python3 ${RUN_DIR}/cut_model.py \
+    --input_model ${CURRENT_MODEL} \
+    --output_model ${NO_DECODER_MODEL} \
+    --input_names ${MODEL_INPUT_NAMES[@]} \
+    --output_names ${DECODER_INPUT_NAMES[@]}
+fi
+CURRENT_MODEL=${NO_DECODER_MODEL}
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+    let step++
+    echo;
+    echo [STEP ${step}] : Quant Model
+    if [[ -z ${QUANT_EXIST_ONNX} ]];then
+        QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
+    fi
+    if [[ -f ${QUANT_EXIST_ONNX} ]];then
+        CURRENT_MODEL=${QUANT_EXIST_ONNX}
+        echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+    else
+        python3 ${RUN_DIR}/quant.py \
+            --model ${CURRENT_MODEL} \
+            --model_name ${MODEL_NAME} \
+            --dataset_dir ${EVAL_DIR} \
+            --ann_file ${COCO_GT} \
+            --data_process_type ${DATA_PROCESS_TYPE} \
+            --observer ${QUANT_OBSERVER} \
+            --disable_quant_names ${DISABLE_QUANT_LIST[@]} \
+            --save_dir $CHECKPOINTS_DIR \
+            --bsz ${QUANT_BATCHSIZE} \
+            --step ${QUANT_STEP} \
+            --seed ${QUANT_SEED} \
+            --imgsz ${IMGSIZE}
+        echo " "Generate ${QUANT_EXIST_ONNX}
+    fi
+    CURRENT_MODEL=${QUANT_EXIST_ONNX}
+fi
+
+# Add Decoder
+if [ $LAYER_FUSION == "1" ]; then
+    let step++
+    echo;
+    echo [STEP ${step}] : Add Decoder
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion_no_concat.onnx
+    if [ -f $FUSION_ONNX ];then
+        echo " "Add Decoder Skip, $FUSION_ONNX already exists
+    else
+        python3 ${RUN_DIR}/deploy.py \
+            --src ${CURRENT_MODEL} \
+            --dst ${FUSION_ONNX} \
+            --decoder_type YoloV5Decoder \
+            --with_nms False \
+            --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
+            --decoder8_anchor ${DECODER_8_ANCHOR[@]} \
+            --decoder16_anchor ${DECODER_16_ANCHOR[@]} \
+            --decoder32_anchor ${DECODER_32_ANCHOR[@]} \
+            --num_class ${DECODER_NUM_CLASS} \
+            --faster ${faster}
+    fi
+    CURRENT_MODEL=${FUSION_ONNX}
+fi
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}_without_nms.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Change Batchsize Skip, $FINAL_MODEL already exists
+else
+    python3 ${RUN_DIR}/modify_batchsize.py \
+        --batch_size ${BSZ} \
+        --origin_model ${CURRENT_MODEL} \
+        --output_model ${FINAL_MODEL}
+    echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+    python3 ${RUN_DIR}/build_engine.py \
+        --precision ${PRECISION} \
+        --bsz ${BSZ} \
+        --model ${CURRENT_MODEL} \
+        --engine ${ENGINE_FILE}
+    echo " "Generate Engine ${ENGINE_FILE}
+fi
+if [[ ${RUN_MODE} == "MAP" && ${NMS_TYPE} == "GPU" ]];then
+    NMS_ENGINE=${CHECKPOINTS_DIR}/nms.engine
+    # Build NMS Engine
+    python3 ${RUN_DIR}/build_nms_engine.py \
+        --bsz ${BSZ} \
+        --path ${CHECKPOINTS_DIR} \
+        --all_box_num ${ALL_BOX_NUM} \
+
--max_box_pre_img ${MAX_BOX_PRE_IMG} \ + --iou_thresh ${IOU_THRESH} \ + --score_thresh ${SCORE_THRESH} +fi + +# Inference +let step++ +echo; +echo [STEP ${step}] : Inference +python3 ${RUN_DIR}/inference.py \ + --model_engine=${ENGINE_FILE} \ + --nms_engine=${NMS_ENGINE} \ + --coco_gt=${COCO_GT} \ + --eval_dir=${EVAL_DIR} \ + --data_process_type ${DATA_PROCESS_TYPE} \ + --decoder_faster=${faster} \ + --imgsz=${IMGSIZE} \ + --warm_up=${WARM_UP} \ + --loop_count ${LOOP_COUNT} \ + --test_mode ${RUN_MODE} \ + --model_name ${MODEL_NAME} \ + --precision ${PRECISION} \ + --pred_dir ${CHECKPOINTS_DIR} \ + --fps_target ${TGT} \ + --max_det ${MAX_BOX_PRE_IMG} \ + --nms_type ${NMS_TYPE} \ + --bsz ${BSZ}; check_status +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/igie/simplify_model.py b/models/cv/object_detection/yolov5s/igie/simplify_model.py new file mode 100644 index 00000000..b4254b6f --- /dev/null +++ b/models/cv/object_detection/yolov5s/igie/simplify_model.py @@ -0,0 +1,21 @@ +import onnx +import argparse +from onnxsim import simplify + +# Simplify +def simplify_model(args): + onnx_model = onnx.load(args.origin_model) + model_simp, check = simplify(onnx_model) + model_simp = onnx.shape_inference.infer_shapes(model_simp) + onnx.save(model_simp, args.output_model) + print(" Simplify onnx Done.") + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--origin_model", type=str) + parser.add_argument("--output_model", type=str) + args = parser.parse_args() + return args + +args = parse_args() +simplify_model(args) \ No newline at end of file diff --git a/models/nlp/plm/transformer/igie/__init__.py b/models/nlp/plm/transformer/igie/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/models/nlp/plm/transformer/igie/build_engine.py b/models/nlp/plm/transformer/igie/build_engine.py new file mode 100644 index 00000000..5ecaae7e --- /dev/null +++ b/models/nlp/plm/transformer/igie/build_engine.py @@ -0,0 +1,398 @@ +#!/usr/bin/env python3 +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
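+#
+# This script builds two IxRT engines for the fairseq transformer: an
+# encoder engine whose extra outputs are the per-layer cross-attention
+# K/V caches, and an autoregressive decoder engine that consumes and
+# produces per-step self-attention K/V caches (see build_encoder_engine
+# and build_engine_decoder below).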
+ +# + +import argparse +import ctypes +import json +import os +import sys +import time +import numpy as np + +import tensorrt as trt +from builder_utils import load_onnx_weights_and_quant +from plugin_utils import ( + TRT_LOGGER, + create_decoder_emb_plugin, + create_encoder_emb_plugin, + transformer_decoder_layer, + transformer_encoder_layer, + cross_attention_kv_cache, + create_top1_plugin +) + +from transformer_cfg import TransformerBaseConfig + + +def get_mha_dtype(config): + dtype = trt.float32 + if config.use_fp16: + dtype = trt.float16 + return int(dtype) + + +def transformer_encoder(config, init_dict, network, input_tensor, input_mask): + """ + Create the bert model + """ + + block = "encoder" + prev_input = input_tensor + for ss in range(config.num_hidden_layers): + out_layer = transformer_encoder_layer( + block, ss, config, init_dict, network, prev_input, input_mask + ) + prev_input = out_layer.get_output(0) + return prev_input + + +def transformer_decoder( + config, + init_dict, + network, + encoder_emb_out, + input_mask, + encoder_out, + steps, + kv_cache_inputs, + kv_cache_outputs, + encoder_kv_cache_inputs +): + """ + Create the bert model + """ + prev_input = encoder_emb_out + block = "decoder" + for ss in range(config.num_hidden_layers): + out_layer = transformer_decoder_layer( + block, + ss, + config, + init_dict, + network, + prev_input, + input_mask, + encoder_out, + steps, + kv_cache_inputs, + kv_cache_outputs, + encoder_kv_cache_inputs + ) + prev_input = out_layer.get_output(0) + + decoder_output_projection_weight = init_dict[f"{block}.output_projection.weight"] + out_proj_layer = network.add_fully_connected( + prev_input, config.tgt_vocab_size, decoder_output_projection_weight + ) # + + reshape_layer = network.add_shuffle(out_proj_layer.get_output(0)) + + reshape_layer.reshape_dims = trt.Dims([0, -1]) # reshape [bsz,vocab_size] + decoder_blk_out = reshape_layer.get_output(0) + return decoder_blk_out + + +def build_encoder_engine(batch_sizes, sequence_lengths, config, weights_dict): + explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + + encoder_emb_plugin = create_encoder_emb_plugin(weights_dict, config) + + builder = trt.Builder(TRT_LOGGER) + with builder.create_network( + explicit_batch_flag + ) as network, builder.create_builder_config() as builder_config: + + builder_config.set_flag(trt.BuilderFlag.FP16) + input_ids = network.add_input( + name="src_tokens", dtype=trt.int32, shape=[-1, -1] + ) + MIN_SHAPE = (batch_sizes[0], sequence_lengths[0]) + OPT_SHAPE = (batch_sizes[1], sequence_lengths[1]) + MAX_SHAPE = (batch_sizes[2], sequence_lengths[2]) + + profile = builder.create_optimization_profile() + profile.set_shape("src_tokens", MIN_SHAPE, OPT_SHAPE, MAX_SHAPE) + builder_config.add_optimization_profile(profile) + + #######################{transformer Encoder emb layer}##################### + emb_layer = network.add_plugin_v2([input_ids], encoder_emb_plugin) + ########################################################################### + embeddings = emb_layer.get_output(0) + mask_idx = emb_layer.get_output(1) + + #######################{transformer Encoder block}##################### + + encoder_out = transformer_encoder( + config, weights_dict, network, embeddings, mask_idx + ) + ####################################################################### + + + for layer_index in range(config.num_hidden_layers): + block = "decoder" + k_cache,v_cache = cross_attention_kv_cache(block, layer_index, config, weights_dict, network, 
encoder_out) + + k_cache.dtype = trt.float16 + k_cache.name = f"past_key_values.{layer_index}.encoder.key" + network.mark_output(k_cache) + + v_cache.dtype = trt.float16 + v_cache.name = f"past_key_values.{layer_index}.encoder.value" + network.mark_output(v_cache) + + mask_idx.dtype = trt.int32 + mask_idx.name = "mask" + network.mark_output(mask_idx) + + plan = builder.build_serialized_network(network, builder_config) + + return plan + + +def build_engine_decoder(batch_sizes, sequence_lengths, config, weights_dict): + explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + + builder = trt.Builder(TRT_LOGGER) + decoder_emb_plugin = create_decoder_emb_plugin(weights_dict) + + MIN_BSZ = batch_sizes[0] + OPT_BSZ = batch_sizes[1] + MAX_BSZ = batch_sizes[2] + + MIN_LEN = sequence_lengths[0] + OPT_LEN = sequence_lengths[1] + MAX_LEN = sequence_lengths[2] + + with builder.create_network( + explicit_batch_flag + ) as network, builder.create_builder_config() as builder_config: + builder_config.set_flag(trt.BuilderFlag.FP16) + + ###################IxinferDecFormatEncOutput + + token_id = network.add_input( + "token_id", dtype=trt.int32, shape=(-1, 1) + ) # [bsz,1] + steps = network.add_input("steps", dtype=trt.int32, shape=(1,)) # [1,1] + mask = network.add_input( + "mask", dtype=trt.int32, shape=(-1, -1) + ) # [bsz,seq_len] + +############################################################################################ + kv_cache_inputs = {} # past_key_values + kv_cache_outputs = {} # present_key_values + + for i in range(config.num_hidden_layers): + k_cache_name = f"past_key_values.{i}.decoder.key" + v_cache_name = f"past_key_values.{i}.decoder.value" + k_cache_input = network.add_input( + k_cache_name, + dtype=trt.float16, + shape=( + -1, + config.num_attention_heads, + -1, + config.head_size, + ), # (bsz, config.num_attention_heads, steps, config.head_size) + ) + v_cache_input = network.add_input( + v_cache_name, + dtype=trt.float16, + shape=(-1, config.num_attention_heads, -1, config.head_size), + ) + kv_cache_inputs[k_cache_name] = k_cache_input + kv_cache_inputs[v_cache_name] = v_cache_input + + profile = builder.create_optimization_profile() + for i in range(config.num_hidden_layers): + k_cache_name = f"past_key_values.{i}.decoder.key" + v_cache_name = f"past_key_values.{i}.decoder.value" + profile.set_shape( + k_cache_name, + (MIN_BSZ, config.num_attention_heads, 0, config.head_size), #0 fist step kv cache don't concat + (OPT_BSZ, config.num_attention_heads, OPT_LEN, config.head_size), + (MAX_BSZ, config.num_attention_heads, MAX_LEN, config.head_size), + ) + profile.set_shape( + v_cache_name, + (MIN_BSZ, config.num_attention_heads, 0, config.head_size), #0 fist step kv cache don't concat + (OPT_BSZ, config.num_attention_heads, OPT_LEN, config.head_size), + (MAX_BSZ, config.num_attention_heads, MAX_LEN, config.head_size), + ) + +############################################################################################ + + encoder_kv_cache_inputs = {} + #cross attention kv cache + for i in range(config.num_hidden_layers): + k_cache_name = f"past_key_values.{i}.encoder.key" + v_cache_name = f"past_key_values.{i}.encoder.value" + k_cache_input = network.add_input( + k_cache_name, + dtype=trt.float16, + shape=( + -1, + config.num_attention_heads, + -1, + config.head_size, + ), # (bsz, config.num_attention_heads, steps, config.head_size) + ) + v_cache_input = network.add_input( + v_cache_name, + dtype=trt.float16, + shape=(-1, config.num_attention_heads, -1, 
config.head_size), + ) + encoder_kv_cache_inputs[k_cache_name] = k_cache_input + encoder_kv_cache_inputs[v_cache_name] = v_cache_input + + + for i in range(config.num_hidden_layers): + k_cache_name = f"past_key_values.{i}.encoder.key" + v_cache_name = f"past_key_values.{i}.encoder.value" + profile.set_shape( + k_cache_name, + (MIN_BSZ, config.num_attention_heads, 1, config.head_size), + (OPT_BSZ, config.num_attention_heads, OPT_LEN, config.head_size), + (MAX_BSZ, config.num_attention_heads, MAX_LEN, config.head_size), + ) + profile.set_shape( + v_cache_name, + (MIN_BSZ, config.num_attention_heads, 1, config.head_size), + (OPT_BSZ, config.num_attention_heads, OPT_LEN, config.head_size), + (MAX_BSZ, config.num_attention_heads, MAX_LEN, config.head_size), + ) + + + +########################################################################################3### + profile.set_shape("token_id", (MIN_BSZ, 1), (OPT_BSZ, 1), (MAX_BSZ, 1)) + profile.set_shape( + "mask", (MIN_BSZ, MIN_LEN), (OPT_BSZ, OPT_LEN), (MAX_BSZ, MAX_LEN) + ) + builder_config.add_optimization_profile(profile) + + encoder_reshape_out = None + + ############################## decodr + encoder_emb_layer = network.add_plugin_v2([token_id, steps], decoder_emb_plugin) + encoder_emb_out = encoder_emb_layer.get_output(0) + + ############################## + + decoder_out = transformer_decoder( + config, + weights_dict, + network, + encoder_emb_out, + mask, + encoder_reshape_out, + steps, + kv_cache_inputs, + kv_cache_outputs, + encoder_kv_cache_inputs + ) + + # top1_layer = network.add_topk( + # decoder_out, op=trt.TopKOperation.MAX, k=1, axes=2 + # ) + + top1_plg = create_top1_plugin() + top1_layer = network.add_plugin_v2([decoder_out], top1_plg) + token_out = top1_layer.get_output(0) + token_out.dtype = trt.int32 + token_out.name = "decoder_id" + network.mark_output(token_out) + + for i in range(config.num_hidden_layers): + k_cache_name = f"present_key_values.{i}.decoder.key" + v_cache_name = f"present_key_values.{i}.decoder.value" + key_out = kv_cache_outputs[k_cache_name] + key_out.name = k_cache_name + key_out.dtype = trt.float16 + network.mark_output(key_out) + + value_out = kv_cache_outputs[v_cache_name] + value_out.name = v_cache_name + value_out.dtype = trt.float16 + network.mark_output(value_out) + plan = builder.build_serialized_network(network, builder_config) + + return plan + + + +def main(): + parser = argparse.ArgumentParser( + description="TensorRT Transformer Base Sample", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + + parser.add_argument( + "--model_dir", + default="/inferencesamples/data/checkpoints/transformer/wmt14.en-fr.joined-dict.transformer/", + help="The ONNX model file path.", + ) + + parser.add_argument( + "--batch_size", + default=[1, 64, 128], # min,opt,max + action="append", + help="Batch size(s) to optimize for", + type=int, + ) + parser.add_argument( + "--sequence_length", + default=[1, 64, 257], # min,opt,max + action="append", + help="Sequence length of the transformer model", + type=int, + ) + + + args = parser.parse_args() + config_path = os.path.join(args.model_dir, "transformer_config.json") + config = TransformerBaseConfig(config_path) + onnx_path = os.path.join(args.model_dir, "transformer.onnx") + weights_dict = load_onnx_weights_and_quant(onnx_path, config) + + + encoder_path = os.path.join(args.model_dir, "Encoder.engine") + with build_encoder_engine( + args.batch_size, args.sequence_length, config, weights_dict + ) as serialized_engine: + print("Saving Engine to 
{:}".format(encoder_path)) + with open(encoder_path, "wb") as fout: + fout.write(serialized_engine) + print("Serializing Encoder Done.") + + decoder_path = os.path.join(args.model_dir, "Decoder.engine") + + + with build_engine_decoder( + args.batch_size, args.sequence_length, config, weights_dict + ) as serialized_engine: + print("Saving Engine to {:}".format(decoder_path)) + + with open(decoder_path, "wb") as fout: + fout.write(serialized_engine) + print("Serializing Decoder Done.") + + +if __name__ == "__main__": + main() diff --git a/models/nlp/plm/transformer/igie/builder_utils.py b/models/nlp/plm/transformer/igie/builder_utils.py new file mode 100644 index 00000000..38a3efd4 --- /dev/null +++ b/models/nlp/plm/transformer/igie/builder_utils.py @@ -0,0 +1,323 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# + + +import onnx +import numpy as np +import tensorrt as trt +import json +import torch + +TRT_LOGGER = trt.Logger(trt.Logger.INFO) + + +def reformat_weight_name(name): + + i = name[1] + #emb + if name.find("pos_emb_weight") !=-1: + return name + + if name.find("token_emb_weight") !=-1: + return name + + if name.find("enc_token_emb_weight") !=-1: + return name + + if name.find("enc_pos_emb_weight") !=-1: + return name +################################################################# +#enccoder layer weights + #self atten to_q、to_q、to_v compute together + if name.find("enc_self_attn_qkv_weight") !=-1: + return f"encoder.layers.{i}.self_attn.qkv_proj.weight" + if name.find("enc_self_attn_qkv_bias") !=-1: + return f"encoder.layers.{i}.self_attn.qkv_proj.bias" + + + + if name.find("enc_self_attn_out_proj_weight") !=-1: + return f"encoder.layers.{i}.self_attn.out_proj.weight" + if name.find("enc_self_attn_out_proj_bias") !=-1: + return f"encoder.layers.{i}.self_attn.out_proj.bias" + + + if name.find("enc_self_attn_ln_weight") !=-1: + return f"encoder.layers.{i}.self_attn_layer_norm.weight" + if name.find("enc_self_attn_ln_bias") !=-1: + return f"encoder.layers.{i}.self_attn_layer_norm.bias" + + #ffn + if name.find("enc_ff1_weight") !=-1: + return f'encoder.layers.{i}.fc1.weight' + if name.find("enc_ff1_bias") !=-1: + return f'encoder.layers.{i}.fc1.bias' + + if name.find("enc_ff2_weight") !=-1: + return f'encoder.layers.{i}.fc2.weight' + if name.find("enc_ff2_bias") !=-1: + return f'encoder.layers.{i}.fc2.bias' + + + #layernorm + if name.find("enc_final_ln_weight") !=-1: + return f"encoder.layers.{i}.final_layer_norm.weight" + if name.find("enc_final_ln_bias") !=-1: + return f"encoder.layers.{i}.final_layer_norm.bias" + + + +#################################################################### +#Decoder layer self attention weights + + #self attention + + #self atten to_q、to_q、to_v compute together + if name.find("self_attn_qkv_proj_weight") !=-1: + return f"decoder.layers.{i}.self_attn.qkv_proj.weight" + if name.find("self_attn_qkv_proj_bias") !=-1: + return 
f"decoder.layers.{i}.self_attn.qkv_proj.bias" + + + #self attention proj out + if name.find("self_attn_out_proj_weight") !=-1: + return f"decoder.layers.{i}.self_attn.out_proj.weight" + if name.find("self_attn_out_proj_bias") !=-1: + return f"decoder.layers.{i}.self_attn.out_proj.bias" + + #layernorm + if name.find("self_attn_ln_weight") !=-1: + return f"decoder.layers.{i}.self_attn_layer_norm.weight" + if name.find("self_attn_ln_bias") !=-1: + return f"decoder.layers.{i}.self_attn_layer_norm.bias" + +######################################################################## +######################################################################## +#Decoder layer cross attention weights + + #self atten to_q、to_q、to_v compute split + #to q + if name.find("enc_attn_q_proj_weight") !=-1: + return f'decoder.layers.{i}.encoder_attn.q_proj.weight' + if name.find("enc_attn_q_proj_bias") !=-1: + return f'decoder.layers.{i}.encoder_attn.q_proj.bias' + + #to_kv split affter + if name.find("enc_attn_kv_proj_weight") !=-1: + return f'decoder.layers.{i}.encoder_attn.kv_proj.weight' + if name.find("enc_attn_kv_proj_bias") !=-1: + return f'decoder.layers.{i}.encoder_attn.kv_proj.bias' + + if name.find("enc_attn_out_proj_weight") !=-1: + return f'decoder.layers.{i}.encoder_attn.out_proj.weight' + if name.find("enc_attn_out_proj_bias") !=-1: + return f'decoder.layers.{i}.encoder_attn.out_proj.bias' + + #layernorm + if name.find("enc_attn_ln_weight") !=-1: + return f'decoder.layers.{i}.encoder_attn_layer_norm.weight' + if name.find("enc_attn_ln_bias") !=-1: + return f'decoder.layers.{i}.encoder_attn_layer_norm.bias' +######################################################################## + #ffn + if name.find("ff1_weight") !=-1: + return f'decoder.layers.{i}.fc1.weight' + if name.find("ff1_bias") !=-1: + return f'decoder.layers.{i}.fc1.bias' + + if name.find("ff2_weight") !=-1: + return f'decoder.layers.{i}.fc2.weight' + if name.find("ff2_bias") !=-1: + return f'decoder.layers.{i}.fc2.bias' + + #layernorm + if name.find("final_ln_weight") !=-1: + return f"decoder.layers.{i}.final_layer_norm.weight" + if name.find("final_ln_bias") !=-1: + return f"decoder.layers.{i}.final_layer_norm.bias" + +############################################################# + if name.find("linear_weight") !=-1: + return f"decoder.output_projection.weight" + + + + else: + return None + +def get_onnx_weight_dict(tensor_dict, config): + N = config.num_attention_heads + H = config.head_size + hidden_size = config.hidden_size + + weights_dict = dict() + + for name , tensor in tensor_dict.items(): + + update_name = reformat_weight_name(name) + if update_name is None: + continue + if update_name.find("encoder_attn.kv_proj.bias") !=-1: + k_bias = tensor[:1024] + v_bias = tensor[1024:] + temp_bias_name = update_name.replace("encoder_attn.kv_proj.bias","") + k_bias_name = temp_bias_name+ "encoder_attn.k_proj.bias" + v_bias_name = temp_bias_name+ "encoder_attn.v_proj.bias" + weights_dict[k_bias_name] = np.ascontiguousarray(k_bias).flatten().astype(np.float32) + weights_dict[v_bias_name] = np.ascontiguousarray(v_bias).flatten().astype(np.float32) + + + elif update_name.find("encoder_attn.kv_proj.weight")!=-1: + k_weight = tensor[:1024] + v_weight = tensor[1024:] + temp_weight_name = update_name.replace("encoder_attn.kv_proj.weight","") + k_weight_name = temp_weight_name+"encoder_attn.k_proj.weight" + v_weight_name = temp_weight_name+"encoder_attn.v_proj.weight" + weights_dict[k_weight_name] = 
np.ascontiguousarray(k_weight).flatten().astype(np.float32) + weights_dict[v_weight_name] = np.ascontiguousarray(v_weight).flatten().astype(np.float32) + + if update_name.find("self_attn.qkv_proj.bias") !=-1 and update_name.find("decoder.layers") !=-1: + temp_bias_name = update_name.replace("self_attn.qkv_proj.bias","") + qkv_bias_name = temp_bias_name+ "self_attn.qkv_proj.bias" + weights_dict[qkv_bias_name] = np.ascontiguousarray(tensor).flatten().astype(np.float32) + + elif update_name.find("self_attn.qkv_proj.weight") !=-1 and update_name.find("decoder.layers") !=-1: + temp_weight_name = update_name.replace("self_attn.qkv_proj.weight","") + qkv_weight_name = temp_weight_name+"self_attn.qkv_proj.weight" + weights_dict[qkv_weight_name] = np.ascontiguousarray(tensor).flatten().astype(np.float32) + + else: + flat_tensor = np.ascontiguousarray(tensor).flatten().astype(np.float32) + weights_dict[update_name] = flat_tensor + + return weights_dict + +def onnx_to_trt_name(onnx_name): + """ + Converting variables in the onnx checkpoint to names corresponding to the naming convention used in the TF version, expected by the builder + """ + qkv_strings = {'key', 'value', 'query', 'query_key_value'} + onnx_name = onnx_name.lower() + toks = [t.strip('_') for t in onnx_name.split('.')] + if toks[0] == 'bert': #embeddings or encoder + if toks[1] == 'encoder': #transformer + # Token conversions for sparse checkpoints + if toks[-2] == 'dense_act': + toks[-2] = 'dense' + elif toks[-3] == 'dense_act': + if toks[-2] == 'input_quantizer': + toks[-2] = 'input' + elif toks[-2] == 'weight_quantizer': + toks[-2] = 'kernel' + toks[-3] = 'dense' + elif toks[-2].startswith('matmul'): + toks[-2] = { + 'matmul_q_quantizer': 'qv_a_input_quantizer', + 'matmul_k_quantizer': 'qv_b_input_quantizer', + 'matmul_v_quantizer': 'av_b_input_quantizer', + 'matmul_a_quantizer': 'av_a_input_quantizer', + }[toks[-2].replace('input_', '')] + + # Token conversions for all checkpoints + if toks[-2] == 'layernorm': #bias->beta, weight->gamma + toks[-1] = 'beta' if toks[-1] == 'bias' else 'gamma' + elif (toks[-2] == 'dense' or toks[-2] in qkv_strings) and toks[-1] == 'weight': + toks[-1] = 'kernel' + elif (toks[-3] == 'dense' or toks[-3] in qkv_strings) and toks[-1] == 'amax': + if toks[-2] == 'weight_quantizer': + toks[-2] = 'kernel' + elif toks[-2] == 'input_quantizer': + toks[-2] = 'input' + + if 'final_input_quantizer' not in toks[2]: + ind = toks.index('layers')+1 if 'layers' in toks else 3 + toks = toks[ind:] + toks[0] = 'l{}'.format(int(toks[0])) + else: + if toks[-2] == 'layernorm': #bias->beta, weight->gamma + toks[-1] = 'beta' if toks[-1] == 'bias' else 'gamma' + else: #embeddings: drop "_weight" suffix + if toks[-1] == 'amax': + toks[-2] = 'amax' + toks = toks[:-1] + elif 'qa' in onnx_name: + name = 'cls_squad_output_bias' if toks[-1] == 'bias' else 'cls_squad_output_weights' + return name + else: + print("Encountered unknown case:", onnx_name) + assert(False) + parsed = '_'.join(toks) + return parsed + +def load_onnx_weights_and_quant(path, config): + """ + Load the weights from the onnx checkpoint + """ + model = onnx.load(path) + weights = model.graph.initializer + tensor_dict = dict((w.name, np.frombuffer(w.raw_data, np.float16).reshape(w.dims)) + for w in weights) + return get_onnx_weight_dict(tensor_dict, config) + +def load_pytorch_weights_and_quant(path, config): + """ + Load the weights from the pytorch checkpoint + """ + state_dict = torch.load(path, map_location='cpu')["model"] + tensor_dict = 
{onnx_to_trt_name(name):val.numpy() for name, val in state_dict.items()}
+    return get_onnx_weight_dict(tensor_dict, config)
+
+class TransformerBaseConfig:
+    def __init__(self, bert_config_path, use_fp16, use_int8=False):
+        with open(bert_config_path, "r") as f:
+            data = json.load(f)
+        self.num_attention_heads = data["num_attention_heads"]
+        self.hidden_size = data["hidden_size"]
+        self.intermediate_size = data["intermediate_size"]
+        self.num_hidden_layers = data["num_hidden_layers"]
+        self.head_size = self.hidden_size // self.num_attention_heads
+        self.use_fp16 = use_fp16
+        self.use_int8 = use_int8
+
+if __name__ == '__main__':
+    config_path = './wmt14_en_de/transformer_config.json'
+    onnx_model_path = './wmt14_en_de/transformer.onnx'
+    config = TransformerBaseConfig(config_path, True)
+    weights_dict = load_onnx_weights_and_quant(onnx_model_path, config)
+
+    for tensor_name, tensor in weights_dict.items():
+        print(tensor_name, ":", tensor.shape)
diff --git a/models/nlp/plm/transformer/igie/ci/prepare.sh b/models/nlp/plm/transformer/igie/ci/prepare.sh
new file mode 100644
index 00000000..ca686639
--- /dev/null
+++ b/models/nlp/plm/transformer/igie/ci/prepare.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
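+#
+# Prepares the CI environment for the transformer sample: installs numactl,
+# links pre-fetched fairseq/omegaconf/hydra sources, builds the fairseq
+# extensions in place, and links the WMT14 en-fr checkpoint and test set.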
+ +set -x + +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y numactl +elif [[ ${ID} == "centos" ]]; then + yum install -y numactl +else + echo "Not Support Os" +fi + +pip3 install -r requirements.txt +# reference: https://github.com/facebookresearch/fairseq/commit/3d262bb25690e4eb2e7d3c1309b1e9c406ca4b99 +ln -s /mnt/deepspark/data/3rd_party/fairseq ../ +# reference: https://github.com/omry/omegaconf/tree/v2.3.0 +ln -s /mnt/deepspark/data/3rd_party/omegaconf ../ +cp ../omegaconf.py ../omegaconf/ +# reference: https://github.com/facebookresearch/hydra/tree/v1.3.2 +ln -s /mnt/deepspark/data/3rd_party/hydra ../ +cd ../ +python3 setup.py build_ext --inplace +cd igie/ +mkdir -p data/datasets/ +mkdir -p data/checkpoints +ln -s /mnt/deepspark/data/datasets/corex-inference-data-4.0.0/checkpoints/transformer/wmt14.en-fr.joined-dict.transformer ./data/checkpoints/ +ln -s /mnt/deepspark/data/datasets/corex-inference-data-4.0.0/datasets/wmt14.en-fr.joined-dict.newstest2014 ./data/datasets/ \ No newline at end of file diff --git a/models/nlp/plm/transformer/igie/common.py b/models/nlp/plm/transformer/igie/common.py new file mode 100644 index 00000000..4759060f --- /dev/null +++ b/models/nlp/plm/transformer/igie/common.py @@ -0,0 +1,92 @@ +import os +import cv2 +import glob +import torch +import tensorrt +import numpy as np +import cuda.cudart as cudart + + +def create_engine_context(engine_path, logger): + with open(engine_path, "rb") as f: + runtime = tensorrt.Runtime(logger) + assert runtime + engine = runtime.deserialize_cuda_engine(f.read()) + assert engine + context = engine.create_execution_context() + assert context + + return engine, context + + +def get_io_bindings(engine): + # Setup I/O bindings + inputs = [] + outputs = [] + allocations = [] + + for i in range(engine.num_bindings): + is_input = False + if engine.binding_is_input(i): + is_input = True + name = engine.get_binding_name(i) + dtype = engine.get_binding_dtype(i) + shape = engine.get_binding_shape(i) + if is_input: + batch_size = shape[0] + size = np.dtype(tensorrt.nptype(dtype)).itemsize + for s in shape: + size *= s + err, allocation = cudart.cudaMalloc(size) + assert err == cudart.cudaError_t.cudaSuccess + binding = { + "index": i, + "name": name, + "dtype": np.dtype(tensorrt.nptype(dtype)), + "shape": list(shape), + "allocation": allocation, + "nbytes": size, + } + print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}") + allocations.append(allocation) + if engine.binding_is_input(i): + inputs.append(binding) + else: + outputs.append(binding) + return inputs, outputs, allocations + + +def setup_io_bindings(engine, context): + # Setup I/O bindings + inputs = [] + outputs = [] + allocations = [] + + for i in range(engine.num_bindings): + is_input = False + if engine.binding_is_input(i): + is_input = True + name = engine.get_binding_name(i) + dtype = engine.get_binding_dtype(i) + shape = context.get_binding_shape(i) + if is_input: + batch_size = shape[0] + size = np.dtype(tensorrt.nptype(dtype)).itemsize + for s in shape: + size *= s + err, allocation = cudart.cudaMalloc(size) + assert err == cudart.cudaError_t.cudaSuccess + binding = { + "index": i, + "name": name, + "dtype": np.dtype(tensorrt.nptype(dtype)), + "shape": list(shape), + "allocation": allocation, + "nbytes": size, + } + allocations.append(allocation) + if engine.binding_is_input(i): + inputs.append(binding) + else: + outputs.append(binding) + return inputs, 
outputs, allocations diff --git a/models/nlp/plm/transformer/igie/inference_wmt14_en_fr_fp16_accuracy.py b/models/nlp/plm/transformer/igie/inference_wmt14_en_fr_fp16_accuracy.py new file mode 100644 index 00000000..3a3c648a --- /dev/null +++ b/models/nlp/plm/transformer/igie/inference_wmt14_en_fr_fp16_accuracy.py @@ -0,0 +1,517 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" +Translate pre-processed data with a trained model. +""" + +import ast +import logging +import math +import os +import pickle +import sys + +import sys +sys.path.append("../") +from argparse import Namespace +from itertools import chain + +import numpy as np +import torch +from fairseq import checkpoint_utils, options, scoring, tasks, utils +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.logging import progress_bar +from fairseq.logging.meters import StopwatchMeter, TimeMeter +from omegaconf import DictConfig + +import tensorrt +from tensorrt import Dims +from common import create_engine_context, get_io_bindings, setup_io_bindings + + +from plugin.transformer_cfg import TransformerBaseConfig +from plugin.trt import T5TRTDecoder, T5TRTEncoder,inference + +import cuda.cudart as cudart + +def engine_init(engine): + host_mem = tensorrt.IHostMemory + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + engine, context = create_engine_context(engine, logger) + return engine, context + + +def tensorrt_infer(engine, context, features): + input_names=["src_tokens"] + output_names=["output"] + input_idx = engine.get_binding_index(input_names[0]) + input_shape = features.shape + context.set_binding_shape(input_idx, Dims(input_shape)) + + inputs, outputs, allocations = setup_io_bindings(engine, context) + pred_output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) + (err,) = cudart.cudaMemcpy( + inputs[0]["allocation"], + features, + inputs[0]["nbytes"], + cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + assert err == cudart.cudaError_t.cudaSuccess + context.execute_v2(allocations) + (err,) = cudart.cudaMemcpy( + pred_output, + outputs[0]["allocation"], + outputs[0]["nbytes"], + cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, + ) + assert err == cudart.cudaError_t.cudaSuccess + return pred_output + + +def main(cfg: DictConfig): + + if isinstance(cfg, Namespace): + cfg = convert_namespace_to_omegaconf(cfg) + + assert cfg.common_eval.path is not None, "--path required for generation!" 
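+    # Sanity checks inherited from fairseq's generate.py: sampling requires
+    # --nbest to equal --beam, and --replace-unk only works with raw text
+    # datasets (--dataset-impl=raw).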
+ assert ( + not cfg.generation.sampling or cfg.generation.nbest == cfg.generation.beam + ), "--sampling requires --nbest to be equal to --beam" + assert ( + cfg.generation.replace_unk is None or cfg.dataset.dataset_impl == "raw" + ), "--replace-unk requires a raw text dataset (--dataset-impl=raw)" + + if cfg.common_eval.results_path is not None: + os.makedirs(cfg.common_eval.results_path, exist_ok=True) + output_path = os.path.join( + cfg.common_eval.results_path, + "generate-{}.txt".format(cfg.dataset.gen_subset), + ) + with open(output_path, "w", buffering=1, encoding="utf-8") as h: + return _main(cfg, h) + else: + return _main(cfg, sys.stdout) + + +def get_symbols_to_strip_from_output(generator): + if hasattr(generator, "symbols_to_strip_from_output"): + return generator.symbols_to_strip_from_output + else: + return {generator.eos} + + +def _main(cfg: DictConfig, output_file): + logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=os.environ.get("LOGLEVEL", "INFO").upper(), + stream=output_file, + ) + logger = logging.getLogger("fairseq_cli.generate") + + utils.import_user_module(cfg.common) + + if cfg.dataset.max_tokens is None and cfg.dataset.batch_size is None: + cfg.dataset.max_tokens = 12000 + logger.info(cfg) + + # Fix seed for stochastic decoding + if cfg.common.seed is not None and not cfg.generation.no_seed_provided: + np.random.seed(cfg.common.seed) + utils.set_torch_seed(cfg.common.seed) + + use_cuda = torch.cuda.is_available() and not cfg.common.cpu + + # Load dataset splits + task = tasks.setup_task(cfg.task) + + # Set dictionaries + try: + src_dict = getattr(task, "source_dictionary", None) + except NotImplementedError: + src_dict = None + tgt_dict = task.target_dictionary + + overrides = ast.literal_eval(cfg.common_eval.model_overrides) + + # Load ensemble + logger.info("loading model(s) from {}".format(cfg.common_eval.path)) + models, saved_cfg = checkpoint_utils.load_model_ensemble( + utils.split_paths(cfg.common_eval.path), + arg_overrides=overrides, + task=task, + suffix=cfg.checkpoint.checkpoint_suffix, + strict=(cfg.checkpoint.checkpoint_shard_count == 1), + num_shards=cfg.checkpoint.checkpoint_shard_count, + ) + + # loading the dataset should happen after the checkpoint has been loaded so we can give it the saved task config + task.load_dataset(cfg.dataset.gen_subset, task_cfg=saved_cfg.task) + + if cfg.generation.lm_path is not None: + overrides["data"] = cfg.task.data + + try: + lms, _ = checkpoint_utils.load_model_ensemble( + [cfg.generation.lm_path], arg_overrides=overrides, task=None + ) + except: + logger.warning( + f"Failed to load language model! 
Please make sure that the language model dict is the same " + f"as target dict and is located in the data dir ({cfg.task.data})" + ) + raise + + assert len(lms) == 1 + else: + lms = [None] + + # Optimize ensemble for generation + for model in chain(models, lms): + if model is None: + continue + if cfg.common.fp16: + model.half() + if use_cuda and not cfg.distributed_training.pipeline_model_parallel: + model.cuda() + model.prepare_for_inference_(cfg) + + # Load alignment dictionary for unknown word replacement + # (None if no unknown word replacement, empty if no path to align dictionary) + align_dict = utils.load_align_dict(cfg.generation.replace_unk) + + # Load dataset (possibly sharded) + itr = task.get_batch_iterator( + dataset=task.dataset(cfg.dataset.gen_subset), + max_tokens=cfg.dataset.max_tokens, + max_sentences=cfg.dataset.batch_size, + max_positions=utils.resolve_max_positions( + task.max_positions(), *[m.max_positions() for m in models] + ), + ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test, + required_batch_size_multiple=cfg.dataset.required_batch_size_multiple, + seed=cfg.common.seed, + num_shards=cfg.distributed_training.distributed_world_size, + shard_id=cfg.distributed_training.distributed_rank, + num_workers=cfg.dataset.num_workers, + data_buffer_size=cfg.dataset.data_buffer_size, + ).next_epoch_itr(shuffle=False) + progress = progress_bar.progress_bar( + itr, + log_format=cfg.common.log_format, + log_interval=cfg.common.log_interval, + default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"), + ) + + # Initialize generator + gen_timer = StopwatchMeter() + + extra_gen_cls_kwargs = {"lm_model": lms[0], "lm_weight": cfg.generation.lm_weight} + generator = task.build_generator( + models, cfg.generation, extra_gen_cls_kwargs=extra_gen_cls_kwargs + ) + + # Handle tokenization and BPE + tokenizer = task.build_tokenizer(cfg.tokenizer) + bpe = task.build_bpe(cfg.bpe) + + def decode_fn(x): + if bpe is not None: + x = bpe.decode(x) + if tokenizer is not None: + x = tokenizer.decode(x) + return x + + scorer = scoring.build_scorer(cfg.scoring, tgt_dict) + + + model_dir = os.path.split(cfg.common_eval["path"])[0] + + print("1. load engine") + batch_size = cfg.dataset["batch_size"] + config_path = os.path.join(model_dir,'transformer_config.json') + config = TransformerBaseConfig(config_path) + + encoder_engine = os.path.join(model_dir,'Encoder.engine') + print(f"2 load encoder engine from {encoder_engine}") + encoder = T5TRTEncoder(encoder_engine,config, batch_size=batch_size) + + + decoder_engine = os.path.join(model_dir,'Decoder.engine') + print(f"3 load decoder_engine engine from {decoder_engine}") + decoder = T5TRTDecoder(decoder_engine,config,batch_size=batch_size) + + + print("4. 
inference") + num_sentences = 0 + has_target = True + wps_meter = TimeMeter() + + total_samples = [] + + num = 0 + + for i,sample in enumerate(progress): + sample = utils.move_to_cuda(sample) if use_cuda else sample + device = sample["net_input"]["src_tokens"].device + if "net_input" not in sample: + continue + + prefix_tokens = None + if cfg.generation.prefix_size > 0: + prefix_tokens = sample["target"][:, : cfg.generation.prefix_size] + + constraints = None + if "constraints" in sample: + constraints = sample["constraints"] + + src_tokens = ( + sample["net_input"]["src_tokens"].int() + ) + current_bs = src_tokens.shape[0] + + src_tokens_pad = torch.torch.full((batch_size,src_tokens.shape[1]), 2,dtype = torch.int32).cuda() + src_tokens_pad[:current_bs,:] = src_tokens + gen_timer.start() + new_tokens = inference(config,encoder,decoder,src_tokens_pad).cpu().numpy()[:current_bs,:] + num_generated_tokens = new_tokens.shape[0] * new_tokens.shape[1] + + gen_timer.stop(num_generated_tokens) + tokens = torch.tensor(new_tokens).cuda() + new_hypos = [] + for i in range(len(tokens)): + new_hypo = { + # "tokens": hypos[i][0]['tokens'], + "tokens": tokens[i], + "alignment": torch.tensor([]).to(device), + } + new_hypos.append([new_hypo]) + # exit() + hypos = new_hypos + + for i, sample_id in enumerate(sample["id"].tolist()): + has_target = sample["target"] is not None + + # Remove padding + if "src_tokens" in sample["net_input"]: + src_tokens = utils.strip_pad( + sample["net_input"]["src_tokens"][i, :], tgt_dict.pad() + ) + else: + src_tokens = None + + target_tokens = None + if has_target: + target_tokens = ( + utils.strip_pad(sample["target"][i, :], tgt_dict.pad()).int().cpu() + ) + + # Either retrieve the original sentences or regenerate them from tokens. + if align_dict is not None: + src_str = task.dataset(cfg.dataset.gen_subset).src.get_original_text( + sample_id + ) + target_str = task.dataset(cfg.dataset.gen_subset).tgt.get_original_text( + sample_id + ) + else: + if src_dict is not None: + try: + src_str = src_dict.string(src_tokens, cfg.common_eval.post_process) + except: + print(src_tokens) + else: + src_str = "" + if has_target: + target_str = tgt_dict.string( + target_tokens, + cfg.common_eval.post_process, + escape_unk=True, + extra_symbols_to_ignore=get_symbols_to_strip_from_output( + generator + ), + ) + + src_str = decode_fn(src_str) + if has_target: + target_str = decode_fn(target_str) + + if not cfg.common_eval.quiet: + if src_dict is not None: + print("S-{}\t{}".format(sample_id, src_str), file=output_file) + if has_target: + print("T-{}\t{}".format(sample_id, target_str), file=output_file) + + # Process top predictions + for j, hypo in enumerate(hypos[i][: cfg.generation.nbest]): + hypo_tokens, hypo_str, alignment = utils.post_process_prediction( + hypo_tokens=hypo["tokens"].int().cpu(), + src_str=src_str, + alignment=hypo["alignment"], + align_dict=align_dict, + tgt_dict=tgt_dict, + remove_bpe=cfg.common_eval.post_process, + extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator), + ) + detok_hypo_str = decode_fn(hypo_str) + if not cfg.common_eval.quiet: + score = hypo["score"] / math.log(2) # convert to base 2 + # original hypothesis (after tokenization and BPE) + print( + "H-{}\t{}\t{}".format(sample_id, score, hypo_str), + file=output_file, + ) + # detokenized hypothesis + print( + "D-{}\t{}\t{}".format(sample_id, score, detok_hypo_str), + file=output_file, + ) + print( + "P-{}\t{}".format( + sample_id, + " ".join( + map( + lambda x: "{:.4f}".format(x), + # convert 
from base e to base 2 + hypo["positional_scores"] + .div_(math.log(2)) + .tolist(), + ) + ), + ), + file=output_file, + ) + + if cfg.generation.print_alignment == "hard": + print( + "A-{}\t{}".format( + sample_id, + " ".join( + [ + "{}-{}".format(src_idx, tgt_idx) + for src_idx, tgt_idx in alignment + ] + ), + ), + file=output_file, + ) + if cfg.generation.print_alignment == "soft": + print( + "A-{}\t{}".format( + sample_id, + " ".join( + [",".join(src_probs) for src_probs in alignment] + ), + ), + file=output_file, + ) + + if cfg.generation.print_step: + print( + "I-{}\t{}".format(sample_id, hypo["steps"]), + file=output_file, + ) + + if cfg.generation.retain_iter_history: + for step, h in enumerate(hypo["history"]): + _, h_str, _ = utils.post_process_prediction( + hypo_tokens=h["tokens"].int().cpu(), + src_str=src_str, + alignment=None, + align_dict=None, + tgt_dict=tgt_dict, + remove_bpe=None, + ) + print( + "E-{}_{}\t{}".format(sample_id, step, h_str), + file=output_file, + ) + + # Score only the top hypothesis + if has_target and j == 0: + if ( + align_dict is not None + or cfg.common_eval.post_process is not None + ): + # Convert back to tokens for evaluation with unk replacement and/or without BPE + target_tokens = tgt_dict.encode_line( + target_str, add_if_not_exist=True + ) + hypo_tokens = tgt_dict.encode_line( + detok_hypo_str, add_if_not_exist=True + ) + if hasattr(scorer, "add_string"): + scorer.add_string(target_str, detok_hypo_str) + else: + scorer.add(target_tokens, hypo_tokens) + + wps_meter.update(num_generated_tokens) + progress.log({"wps": round(wps_meter.avg)}) + num_sentences += ( + sample["nsentences"] if "nsentences" in sample else sample["id"].numel() + ) + decoder.clear() + encoder.clear() + + logger.info("NOTE: hypothesis and token scores are output in base 2") + logger.info( + "Translated {:,} sentences ({:,} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)".format( + num_sentences, + gen_timer.n, + gen_timer.sum, + num_sentences / gen_timer.sum, + 1.0 / gen_timer.avg, + ) + ) + if has_target: + if cfg.bpe and not cfg.generation.sacrebleu: + if cfg.common_eval.post_process: + logger.warning( + "BLEU score is being computed by splitting detokenized string on spaces, this is probably not what you want. Use --sacrebleu for standard 13a BLEU tokenization" + ) + else: + logger.warning( + "If you are using BPE on the target side, the BLEU score is computed on BPE tokens, not on proper words. Use --sacrebleu for standard 13a BLEU tokenization" + ) + # use print to be consistent with other main outputs: S-, H-, T-, D- and so on + print( + "Generate {} with beam={}: {}".format( + cfg.dataset.gen_subset, cfg.generation.beam, scorer.result_string() + ), + file=output_file, + ) + + return scorer + + +def cli_main(): + parser = options.get_generation_parser() + # TODO: replace this workaround with refactoring of `AudioPretraining` + parser.add_argument( + "--arch", + "-a", + metavar="ARCH", + default="wav2vec2", + help="Model architecture. For constructing tasks that rely on " + "model args (e.g. 
`AudioPretraining`)", + ) + args = options.parse_args_and_arch(parser) + score = main(args).score() + target_score = float(os.environ["Accuracy"]) + print("BLEU4: = ", score, "target BLEU4: ", target_score) + metricResult = {"metricResult": {}} + metricResult["metricResult"]["BLEU4"] = round(score, 3) + print(metricResult) + if score >= target_score: + print("pass!") + exit() + else: + print("failed!") + exit(1) + + +if __name__ == "__main__": + cli_main() diff --git a/models/nlp/plm/transformer/igie/inference_wmt14_en_fr_fp16_performance.py b/models/nlp/plm/transformer/igie/inference_wmt14_en_fr_fp16_performance.py new file mode 100644 index 00000000..e6984b8e --- /dev/null +++ b/models/nlp/plm/transformer/igie/inference_wmt14_en_fr_fp16_performance.py @@ -0,0 +1,147 @@ +import json +import os +import numpy as np +import argparse +import time + +import tensorrt +from tensorrt import Dims +from common import create_engine_context, get_io_bindings, setup_io_bindings + +import sys +sys.path.append("../") +from plugin.transformer_cfg import TransformerBaseConfig +from plugin.trt import T5TRTDecoder, T5TRTEncoder,inference,benchmark + +import torch +from torch.utils.data import DataLoader + + +class CustomDataset(torch.utils.data.Dataset): + def __init__(self, inputs): + self.inputs = inputs + + def __getitem__(self, index): + input = self.inputs[index] + return input + + def __len__(self): + return len(self.inputs) + + + + + +def generate_batch(features): + all_inputs = [] + tmp = [] + for data in features: + if len(tmp) == args.max_batch_size: + batch_max_len = max([len(i) for i in tmp]) + new_tmp = [] + for i in tmp: + i = i[:args.max_seq_len] + i = [pad_id]*(batch_max_len-len(i)) + i + new_tmp.append(i) + all_inputs.append(np.array(new_tmp).astype(np.int32)) + tmp = [] + tmp.append(data) + + return all_inputs + + +def parse_args(): + parser = argparse.ArgumentParser( + description="build ixrt graph and convert weights", usage="" + ) + parser.add_argument( + "--max_batch_size", + type=int, + required=True, + help="max batch size for inference", + ) + parser.add_argument( + "--max_seq_len", + type=int, + default=102, + help="max sequence length for inference", + ) + parser.add_argument( + "--data_dir", + type=str + ) + parser.add_argument( + "--model_dir", + type=str + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = parse_args() + assert args.max_seq_len <= 102 + pad_id = 1 + feature_file = os.path.join(args.data_dir,'features.json') + + with open(feature_file,'r') as f: + features = json.loads(f.read()) + + all_inputs = generate_batch(features) + print(f"max_batch_size: {args.max_batch_size}, max_seq_len: {args.max_seq_len}") + + print("1. 
build engine") + + + batch_size = args.max_batch_size + config_path = os.path.join(args.model_dir,'transformer_config.json') + config = TransformerBaseConfig(config_path) + + encoder_engine = os.path.join(args.model_dir,'Encoder.engine') + print(f"2 load encoder engine from {encoder_engine}") + encoder = T5TRTEncoder(encoder_engine,config, batch_size=batch_size) + + + decoder_engine = os.path.join(args.model_dir,'Decoder.engine') + print(f"3 load decoder_engine engine from {decoder_engine}") + decoder = T5TRTDecoder(decoder_engine,config,batch_size=batch_size) + + + device = torch.device("cuda:0") + dataset = CustomDataset(all_inputs) + dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=False, num_workers=1,drop_last=True) + + prev_tokens = torch.full((batch_size,1), int(config.sos_token_id),dtype = torch.int32).cuda() + for i, data in enumerate(dataloader): + data = torch.squeeze(data,0).to(device) + benchmark(config,encoder,decoder,data,prev_tokens) + + print("3. inference") + + total_time = 0 + + num_sentences = 0 + for i, data in enumerate(dataloader): + data = torch.squeeze(data,0).to(device) + num_sentences += data.shape[0] + start_time = time.time() + benchmark(config,encoder,decoder,data,prev_tokens) + end_time = time.time() + total_time +=(end_time-start_time) + + QPS = num_sentences/total_time + print(f"Translated {num_sentences} sentences, {QPS} sentences/s") + target_qps = float(os.environ['Accuracy']) + decoder.clear() + encoder.clear() + + print("QPS: = ", QPS, "target QPS: ", target_qps) + metricResult = {"metricResult": {}} + metricResult["metricResult"]["QPS"] = round(QPS, 3) + print(metricResult) + if QPS >= target_qps: + print("pass!") + exit() + else: + print("failed!") + exit(10) diff --git a/models/nlp/plm/transformer/igie/load_ixrt_plugin.py b/models/nlp/plm/transformer/igie/load_ixrt_plugin.py new file mode 100644 index 00000000..22d0a9ad --- /dev/null +++ b/models/nlp/plm/transformer/igie/load_ixrt_plugin.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+
+from os.path import join, dirname, exists
+import tensorrt as trt
+import ctypes
+
+def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""):
+    if not dynamic_path:
+        dynamic_path = join(dirname(trt.__file__), "lib", "libixrt_plugin.so")
+    if not exists(dynamic_path):
+        raise FileNotFoundError(
+            f"The ixrt_plugin lib {dynamic_path} does not exist; please provide a valid plugin path!")
+    ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
+    trt.init_libnvinfer_plugins(logger, namespace)
+    print(f"Loaded plugin from {dynamic_path}")
\ No newline at end of file
diff --git a/models/nlp/plm/transformer/igie/plugin_utils.py b/models/nlp/plm/transformer/igie/plugin_utils.py
new file mode 100644
index 00000000..bf79099d
--- /dev/null
+++ b/models/nlp/plm/transformer/igie/plugin_utils.py
@@ -0,0 +1,918 @@
+#!/usr/bin/env python3
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import ctypes
+import json
+import os
+import sys
+import time
+
+import numpy as np
+import tensorrt
+import tensorrt as trt
+
+trt_version = [int(n) for n in trt.__version__.split(".")]
+
+TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
+from load_ixrt_plugin import load_ixrt_plugin
+
+load_ixrt_plugin(TRT_LOGGER)
+
+plg_registry = trt.get_plugin_registry()
+
+qkv2ctx_plg_creator = plg_registry.get_plugin_creator(
+    "CustomQKVToContextPluginDynamic_IxRT", "1", ""
+)
+skln_plg_creator = plg_registry.get_plugin_creator(
+    "CustomSkipLayerNormPluginDynamic_IxRT", "1", ""
+)
+encoder_emb_plg_creator = plg_registry.get_plugin_creator(
+    "TransformerEncoderEmb_IxRT", "1"
+)
+attention_plugin_creator = plg_registry.get_plugin_creator(
+    "CustomQkvCrossToContext_IxRT", "1"
+)
+decoder_emb_plg_creator = plg_registry.get_plugin_creator(
+    "TransformerDecoderEmb_IxRT", "1"
+)
+top1_plg_creator = plg_registry.get_plugin_creator(
+    "CustomArgmax_IxRT", "1"
+)
+ffn_plg_creator = plg_registry.get_plugin_creator("CustomFFNPluginDynamic_IxRT", "1", "")
+fc_plg_creator = plg_registry.get_plugin_creator("CustomFCPluginDynamic_IxRT", "1", "")
+
+def get_mha_dtype(config):
+    dtype = trt.float32
+    if config.use_fp16:
+        dtype = trt.float16
+    return int(dtype)
+
+
+def create_split_qkv_plugin(num_head, num_dim, index):
+    plugin_registry = tensorrt.get_plugin_registry()
+    assert plugin_registry
+
+    plugin_creator = plugin_registry.get_plugin_creator("SplitQKVUpdateKVCache_IxRT", "1")
+    assert plugin_creator
+
+    head_num_field = tensorrt.PluginField(
+        "num_head",
+        np.array([num_head], dtype=np.int32),
+        tensorrt.PluginFieldType.INT32)
+
+    head_dim_field = tensorrt.PluginField(
+        "head_dim",
+        np.array([num_dim], dtype=np.int32),
+        tensorrt.PluginFieldType.INT32)
+
+    field_collection = tensorrt.PluginFieldCollection([head_num_field, head_dim_field])
+    plugin = plugin_creator.create_plugin(f"SplitQKVUpdateKVCache_IxRT_{index}", field_collection)
+
+    return plugin
+
+
+def 
create_encoder_emb_plugin( + weights_dict, + config +): + + embed_scale_field = trt.PluginField( + "embed_scale", + np.array([32], dtype=np.float32), + trt.PluginFieldType.FLOAT32, + ) + hidden_size_field = trt.PluginField( + "hidden_size", + np.array([config.hidden_size], dtype=np.int32), + trt.PluginFieldType.INT32, + ) + max_pos_field = trt.PluginField( + "max_pos", + np.array([1024], dtype=np.int32), + trt.PluginFieldType.INT32, + ) + + pad_idx_field = trt.PluginField( + "pad_idx", + np.array([1], dtype=np.int32), + trt.PluginFieldType.INT32, + ) + + token_w_field = trt.PluginField( + "enc_token_emb_weight", + weights_dict["enc_token_emb_weight"], + trt.PluginFieldType.FLOAT32, + ) + + pos_w_field = trt.PluginField( + "enc_pos_emb_weight", + weights_dict["enc_pos_emb_weight"], + trt.PluginFieldType.FLOAT32, + ) + + field_collection = trt.PluginFieldCollection( + [ + embed_scale_field, + hidden_size_field, + max_pos_field, + pad_idx_field, + token_w_field, + pos_w_field, + ] + ) + + emb_plugin = encoder_emb_plg_creator.create_plugin( + "py_TransformerEncoderEmb_ixrt", field_collection + ) + + return emb_plugin + + + +def custom_fc(network, input_tensor, out_dims, W, B): + pf_out_dims = trt.PluginField("out_dims", np.array(out_dims, dtype=np.int32), trt.PluginFieldType.INT32) + pf_type = trt.PluginField("type_id", np.array(int(trt.float16), dtype=np.int32), trt.PluginFieldType.INT32) + pf_W = trt.PluginField("W", W, trt.PluginFieldType.FLOAT32) + fields = [pf_out_dims, pf_type, pf_W] + if B is not None: + pf_B = trt.PluginField("B", B, trt.PluginFieldType.FLOAT32) + fields.append(pf_B) + + pfc = trt.PluginFieldCollection(fields) + fc_plugin = fc_plg_creator.create_plugin("fcplugin", pfc) + plug_inputs = [input_tensor] + out_dense = network.add_plugin_v2(plug_inputs, fc_plugin) + return out_dense + + + +def create_encoder_attention_plugin(): + plugin_registry = tensorrt.get_plugin_registry() + assert plugin_registry + plugin_creator = plugin_registry.get_plugin_creator( + "CustomQkvCrossToContext_IxRT", "1" + ) + assert plugin_creator + type_id_field = tensorrt.PluginField( + "type_id", + np.array([2], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + has_mask_field = tensorrt.PluginField( + "has_mask", + np.array([1], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + mask_type_field = tensorrt.PluginField( + "type_mask", + np.array([4], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + scale_field = tensorrt.PluginField( + "scale", + np.array([1.0 / 8], dtype=np.float32), # 1 / sqrt(head_num) + tensorrt.PluginFieldType.FLOAT32, + ) + field_collection = tensorrt.PluginFieldCollection([type_id_field, has_mask_field,mask_type_field,scale_field]) + plugin = plugin_creator.create_plugin("py_QkvCrossToContext_ixrt", field_collection) + return plugin + + + +def encoder_self_attention_layer( + block, layer_index, config, init_dict, network, input_tensor, imask=None +): + """ + Add the attention layer + """ + + B, S, hidden_size, _, _ = input_tensor.shape + num_heads = config.num_attention_heads + head_size = int(hidden_size / num_heads) + + self_attn_qkv_proj_weight = init_dict[ + f"{block}.layers.{layer_index}.self_attn.qkv_proj.weight" + ] + self_attn_qkv_proj_bias = init_dict[ + f"{block}.layers.{layer_index}.self_attn.qkv_proj.bias" + ] + + # q_proj,k_proj,v_proj + # to_qkv = network.add_fully_connected( + # input_tensor, + # 3 * hidden_size, + # self_attn_qkv_proj_weight, + # self_attn_qkv_proj_bias, + # ) + + to_qkv = custom_fc(network, input_tensor, 3 * 
hidden_size, self_attn_qkv_proj_weight, self_attn_qkv_proj_bias) + + has_mask = imask is not None + # QKV2CTX + pf_type = trt.PluginField( + "type_id", + np.array([get_mha_dtype(config)], np.int32), + trt.PluginFieldType.INT32, + ) + pf_hidden_size = trt.PluginField( + "hidden_size", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32 + ) + pf_num_heads = trt.PluginField( + "num_heads", np.array([num_heads], np.int32), trt.PluginFieldType.INT32 + ) + pf_has_mask = trt.PluginField( + "has_mask", np.array([has_mask], np.int32), trt.PluginFieldType.INT32 + ) + pfc = trt.PluginFieldCollection( + [pf_hidden_size, pf_num_heads, pf_has_mask, pf_type] + ) + qkv2ctx_plug = qkv2ctx_plg_creator.create_plugin("qkv2ctx", pfc) + + qkv_in = [to_qkv.get_output(0)] + if has_mask: + qkv_in.append(imask) + qkv2ctx = network.add_plugin_v2(qkv_in, qkv2ctx_plug) + return qkv2ctx + +def skipln( + block, layer_index, name, config, init_dict, network, input_tensor, skip, bias=None +): + """ + Add the skip layer + """ + idims = input_tensor.shape + + # assert len(idims) == 5 + hidden_size = idims[2] + + dtype = trt.float32 + if config.use_fp16: + dtype = trt.float16 + + pf_ld = trt.PluginField( + "ld", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32 + ) + + ln_weight = init_dict[f"{block}.layers.{layer_index}.{name}.weight"] + pf_gamma = trt.PluginField("gamma", ln_weight, trt.PluginFieldType.FLOAT32) + + ln_bias = init_dict[f"{block}.layers.{layer_index}.{name}.bias"] + pf_beta = trt.PluginField("beta", ln_bias, trt.PluginFieldType.FLOAT32) + + pf_type = trt.PluginField( + "type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32 + ) + fields = [pf_ld, pf_beta, pf_gamma, pf_type] + + if bias is not None: + pf_bias = trt.PluginField("bias", bias, trt.PluginFieldType.FLOAT32) + fields.append(pf_bias) + + pfc = trt.PluginFieldCollection(fields) + skipln_plug = skln_plg_creator.create_plugin("skipln", pfc) + + skipln_inputs = [input_tensor, skip] + layer = network.add_plugin_v2(skipln_inputs, skipln_plug) + return layer + +def ffn(block, layer_index, config, init_dict, network, input_tensor): + + fc1_weight = init_dict[f"{block}.layers.{layer_index}.fc1.weight"] + fc1_bias = init_dict[f"{block}.layers.{layer_index}.fc1.bias"] + + # mid_dense = network.add_fully_connected( + # input_tensor, config.intermediate_size, fc1_weight, fc1_bias + # ) + # mid_dense = custom_fc(network, input_tensor, config.intermediate_size, fc1_weight, fc1_bias) + + + # relu_inputs = mid_dense.get_output(0) + # relu_layer = network.add_activation(relu_inputs, tensorrt.ActivationType.RELU) + + # intermediate_act = relu_layer.get_output(0) + + fc2_weight = init_dict[f"{block}.layers.{layer_index}.fc2.weight"] + fc2_bias = init_dict[f"{block}.layers.{layer_index}.fc2.bias"] + # out_dense = network.add_fully_connected( + # intermediate_act, config.hidden_size, fc2_weight, fc2_bias + # ) + # out_dense = custom_fc(network, intermediate_act, config.hidden_size, fc2_weight, fc2_bias) + + + pf_out_dim = trt.PluginField("out_dims", np.array(config.hidden_size, np.int32), trt.PluginFieldType.INT32) + pf_type = trt.PluginField("type_id", np.array(int(trt.float16), np.int32), trt.PluginFieldType.INT32) + pf_W1 = trt.PluginField("W1", fc1_weight, trt.PluginFieldType.FLOAT32) + pf_B1 = trt.PluginField("B1", fc1_bias, trt.PluginFieldType.FLOAT32) + pf_W2 = trt.PluginField("W2", fc2_weight, trt.PluginFieldType.FLOAT32) + pf_act_type = trt.PluginField("act_type", np.array(int(4), np.int32), trt.PluginFieldType.INT32) #RELU=4 + 
pfc = trt.PluginFieldCollection([pf_out_dim, pf_type, pf_W1, pf_W2, pf_B1, pf_act_type]) + ffn_plug = ffn_plg_creator.create_plugin("ffn", pfc) + + ffn_inputs = [input_tensor] + out_dense = network.add_plugin_v2(ffn_inputs, ffn_plug) + + out_layer = skipln( + block, + layer_index, + "final_layer_norm", + config, + init_dict, + network, + out_dense.get_output(0), + input_tensor, + fc2_bias + ) + return out_layer + +def transformer_encoder_layer( + block, layer_index, config, init_dict, network, input_tensor, imask +): + """ + Add the transformer layer + """ + idims = input_tensor.shape + assert len(idims) == 5 + hidden_size = idims[2] + + self_attention = encoder_self_attention_layer( + block, layer_index, config, init_dict, network, input_tensor,imask + ) # l0_enc_self_attn_qkv_weight l0_enc_self_attn_qkv_bias + + # self_attention = encoder_self_attention_layer2( + # block, layer_index, config, init_dict, network, input_tensor,imask + # ) + + + self_attn_out_proj_weight = init_dict[ + f"{block}.layers.{layer_index}.self_attn.out_proj.weight" + ] + self_attn_out_proj_bias = init_dict[ + f"{block}.layers.{layer_index}.self_attn.out_proj.bias" + ] + + # out_proj = network.add_fully_connected( + # self_attention.get_output(0), + # hidden_size, + # self_attn_out_proj_weight, + # self_attn_out_proj_bias, + # ) + out_proj = custom_fc(network, self_attention.get_output(0), hidden_size, self_attn_out_proj_weight, self_attn_out_proj_bias) + + + self_attention_skipln = skipln( + block, + layer_index, + "self_attn_layer_norm", + config, + init_dict, + network, + out_proj.get_output(0), + input_tensor, + ) + attention_ln = self_attention_skipln.get_output(0) + + ffn_layer = ffn(block, layer_index, config, init_dict, network, attention_ln) + + return ffn_layer + + +def create_decoder_emb_plugin(weights_dict): + + plugin_registry = tensorrt.get_plugin_registry() + assert plugin_registry + plugin_creator = plugin_registry.get_plugin_creator( + "TransformerDecoderEmb_IxRT", "1" + ) + assert plugin_creator + + embed_scale_field = tensorrt.PluginField( + "embed_scale", + np.array([32], dtype=np.float32), + tensorrt.PluginFieldType.FLOAT32, + ) + embed_dim_field = tensorrt.PluginField( + "embed_dim", + np.array([1024], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + pad_idx_field = tensorrt.PluginField( + "pad_idx", + np.array([1], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + token_w = weights_dict["token_emb_weight"] + token_w_field = tensorrt.PluginField( + "token_emb_weight", + token_w.astype(np.float16), + tensorrt.PluginFieldType.FLOAT16, + ) + + pos_w = weights_dict["pos_emb_weight"] + + pos_w_field = tensorrt.PluginField( + "pos_emb_weight", + pos_w.astype(np.float16), + tensorrt.PluginFieldType.FLOAT16, + ) + + field_collection = tensorrt.PluginFieldCollection( + [ + embed_scale_field, + embed_dim_field, + pad_idx_field, + token_w_field, + pos_w_field, + ] + ) + + plugin = plugin_creator.create_plugin( + "py_TransformerDecoderEmb_ixrt", field_collection + ) + + return plugin + + +def create_decoder_self_attention_plugin(): + + plugin_registry = tensorrt.get_plugin_registry() + assert plugin_registry + + plugin_creator = plugin_registry.get_plugin_creator( + "CustomQkvCrossToContext_IxRT", "1" + ) + assert plugin_creator + + type_id_field = tensorrt.PluginField( + "type_id", + np.array([2], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + has_mask_field = tensorrt.PluginField( + "has_mask", + np.array([0], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + 
mask_type_field = tensorrt.PluginField( + "type_mask", + np.array([4], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + scale_field = tensorrt.PluginField( + "scale", + np.array([1.0 / 8], dtype=np.float32), # 1 / sqrt(head_num) + tensorrt.PluginFieldType.FLOAT32, + ) + + field_collection = tensorrt.PluginFieldCollection([type_id_field, has_mask_field,mask_type_field,scale_field]) + + plugin = plugin_creator.create_plugin("py_QkvCrossToContext_ixrt", field_collection) + + return plugin + + + +def create_cross_attention_plugin(): + + plugin_registry = tensorrt.get_plugin_registry() + assert plugin_registry + + plugin_creator = plugin_registry.get_plugin_creator( + "CustomQkvCrossToContext_IxRT", "1" + ) + assert plugin_creator + + type_id_field = tensorrt.PluginField( + "type_id", + np.array([2], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + has_mask_field = tensorrt.PluginField( + "has_mask", + np.array([1], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + mask_type_field = tensorrt.PluginField( + "type_mask", + np.array([4], dtype=np.int32), + tensorrt.PluginFieldType.INT32, + ) + + scale_field = tensorrt.PluginField( + "scale", + np.array([1.0 / 8], dtype=np.float32), # 1 / sqrt(head_num) + tensorrt.PluginFieldType.FLOAT32, + ) + + field_collection = tensorrt.PluginFieldCollection([type_id_field, has_mask_field,mask_type_field,scale_field]) + + plugin = plugin_creator.create_plugin("py_QkvCrossToContext_ixrt", field_collection) + + return plugin + + + +def cross_attention_kv_cache( + block, layer_index, config, init_dict, network, encoder_out +): + + """ + Add the cross attention layer + """ + + to_k_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.k_proj.weight" + ] + to_k_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.k_proj.bias" + ] + # to_k_layer = network.add_fully_connected( + # encoder_out, config.hidden_size, to_k_layer_weight, to_k_layer_bias + # ) + to_k_layer = custom_fc(network, encoder_out, config.hidden_size, to_k_layer_weight, to_k_layer_bias) + + k_output = to_k_layer.get_output(0) + k_t_layer = network.add_shuffle(k_output) + k_t_layer.reshape_dims = trt.Dims( + [0, -1, config.num_attention_heads, config.head_size] + ) + k_t_layer.second_transpose = trt.Permutation([0, 2, 1, 3]) + input_k = k_t_layer.get_output(0) + + to_v_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.v_proj.weight" + ] + to_v_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.v_proj.bias" + ] + # to_v_layer = network.add_fully_connected( + # encoder_out, config.hidden_size, to_v_layer_weight, to_v_layer_bias + # ) + to_v_layer = custom_fc(network, encoder_out, config.hidden_size, to_v_layer_weight, to_v_layer_bias) + + v_output = to_v_layer.get_output(0) + v_t_layer = network.add_shuffle(v_output) + v_t_layer.reshape_dims = trt.Dims( + [0, -1, config.num_attention_heads, config.head_size] + ) + v_t_layer.second_transpose = trt.Permutation([0, 2, 1, 3]) + input_v = v_t_layer.get_output(0) + + return input_k,input_v + + +def decoder_cross_attention_layer( + block, layer_index, config, init_dict, network, input_tensor, imask, encoder_out +): + + """ + Add the cross attention layer + """ + to_q_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.q_proj.weight" + ] + to_q_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.q_proj.bias" + ] + # to_q_layer = network.add_fully_connected( + # input_tensor, config.hidden_size, to_q_layer_weight, 
to_q_layer_bias + # ) + + print("input_tensor:",input_tensor.shape) + + to_q_layer = custom_fc(network, input_tensor, config.hidden_size, to_q_layer_weight, to_q_layer_bias) + + q_output = to_q_layer.get_output(0) + + q_t_layer = network.add_shuffle(q_output) + q_t_layer.reshape_dims = trt.Dims( + [0, -1, config.num_attention_heads, config.head_size] + ) # reshape [bs,sequence_len, hidden_size] -->[bs,sequence_len,num_attention_heads ,head_dim] + q_t_layer.second_transpose = trt.Permutation([0, 2, 1, 3]) + input_q = q_t_layer.get_output(0) + + to_k_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.k_proj.weight" + ] + to_k_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.k_proj.bias" + ] + # to_k_layer = network.add_fully_connected( + # encoder_out, config.hidden_size, to_k_layer_weight, to_k_layer_bias + # ) + + to_k_layer = custom_fc(network, encoder_out, config.hidden_size, to_k_layer_weight, to_k_layer_bias) + + + k_output = to_k_layer.get_output(0) + k_t_layer = network.add_shuffle(k_output) + k_t_layer.reshape_dims = trt.Dims( + [0, -1, config.num_attention_heads, config.head_size] + ) + k_t_layer.second_transpose = trt.Permutation([0, 2, 1, 3]) + input_k = k_t_layer.get_output(0) + + to_v_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.v_proj.weight" + ] + to_v_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.v_proj.bias" + ] + # to_v_layer = network.add_fully_connected( + # encoder_out, config.hidden_size, to_v_layer_weight, to_v_layer_bias + # ) + + to_v_layer = custom_fc(network, encoder_out, config.hidden_size, to_v_layer_weight, to_v_layer_bias) + + + v_output = to_v_layer.get_output(0) + v_t_layer = network.add_shuffle(v_output) + v_t_layer.reshape_dims = trt.Dims( + [0, -1, config.num_attention_heads, config.head_size] + ) + v_t_layer.second_transpose = trt.Permutation([0, 2, 1, 3]) + input_v = v_t_layer.get_output(0) + + attention_plug = create_cross_attention_plugin() + atten = network.add_plugin_v2([input_q, input_k, input_v,imask], attention_plug) + + scores = atten.get_output(0) + scores_t_layer = network.add_shuffle(scores) + scores_t_layer.first_transpose = trt.Permutation([0, 2, 1, 3]) + scores_t_layer.reshape_dims = trt.Dims([0, 0, config.num_attention_heads*config.head_size, 1, 1]) + + scores_out = scores_t_layer.get_output(0) + to_out_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.out_proj.weight" + ] + to_out_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.out_proj.bias" + ] + # to_out_layer = network.add_fully_connected( + # scores_out, config.hidden_size, to_out_layer_weight, to_out_layer_bias + # ) + to_out_layer = custom_fc(network, scores_out, config.hidden_size, to_out_layer_weight, to_out_layer_bias) + + + return to_out_layer + + + + + + +def decoder_cross_attention_kvcache_layer( + block, layer_index, config, init_dict, network, input_tensor, imask, encoder_out, encoder_kv_cache_inputs +): + + """ + Add the cross attention layer + """ + to_q_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.q_proj.weight" + ] + to_q_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.q_proj.bias" + ] + # to_q_layer = network.add_fully_connected( + # input_tensor, config.hidden_size, to_q_layer_weight, to_q_layer_bias + # ) + + to_q_layer = custom_fc(network, input_tensor, config.hidden_size, to_q_layer_weight, to_q_layer_bias) + + + q_output = to_q_layer.get_output(0) + + q_t_layer = 
network.add_shuffle(q_output) + q_t_layer.reshape_dims = trt.Dims( + [0, -1, config.num_attention_heads, config.head_size] + ) # reshape [bs,sequence_len, hidden_size] -->[bs,sequence_len,num_attention_heads ,head_dim] + q_t_layer.second_transpose = trt.Permutation([0, 2, 1, 3]) + input_q = q_t_layer.get_output(0) + + + input_k = encoder_kv_cache_inputs[f"past_key_values.{layer_index}.encoder.key"] + input_v = encoder_kv_cache_inputs[f"past_key_values.{layer_index}.encoder.value"] + + + attention_plug = create_cross_attention_plugin() + atten = network.add_plugin_v2([input_q, input_k, input_v,imask], attention_plug) + + # atten = attention2(network,input_q, input_k, input_v) + + scores = atten.get_output(0) + scores_t_layer = network.add_shuffle(scores) + scores_t_layer.first_transpose = trt.Permutation([0, 2, 1, 3]) + scores_t_layer.reshape_dims = trt.Dims([0, 0, config.num_attention_heads*config.head_size, 1, 1]) + + scores_out = scores_t_layer.get_output(0) + to_out_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.out_proj.weight" + ] + to_out_layer_bias = init_dict[ + f"{block}.layers.{layer_index}.encoder_attn.out_proj.bias" + ] + # to_out_layer = network.add_fully_connected( + # scores_out, config.hidden_size, to_out_layer_weight, to_out_layer_bias + # ) + + to_out_layer = custom_fc(network, scores_out, config.hidden_size, to_out_layer_weight, to_out_layer_bias) + + + return to_out_layer + + +def decoder_self_attention_layer( + block, + layer_index, + config, + init_dict, + network, + input_tensor, + imask, + encoder_out, + steps, + kv_cache_inputs, + kv_cache_outputs +): + + """ + Add the cross attention layer + """ + to_qkv_layer_weight = init_dict[ + f"{block}.layers.{layer_index}.self_attn.qkv_proj.weight" + ] + to_qkv_layer_bias = init_dict[f"{block}.layers.{layer_index}.self_attn.qkv_proj.bias"] + + to_qkv_layer = custom_fc(network, input_tensor, 3*config.hidden_size, to_qkv_layer_weight, to_qkv_layer_bias) + + linear_qkv_output = to_qkv_layer.get_output(0) + reshape_qkv_layer = network.add_shuffle(linear_qkv_output) + reshape_qkv_layer.reshape_dims = trt.Dims( + [0, 0, 0] + ) + + split_qkv_plugin = create_split_qkv_plugin(config.num_attention_heads,config.head_size,layer_index) + split_qkv_layers = network.add_plugin_v2([reshape_qkv_layer.get_output(0), kv_cache_inputs[f"past_key_values.{layer_index}.decoder.key"], + kv_cache_inputs[f"past_key_values.{layer_index}.decoder.value"]], split_qkv_plugin) + + input_q = split_qkv_layers.get_output(0) + present_key = split_qkv_layers.get_output(1) + present_value = split_qkv_layers.get_output(2) + + attention_plug = create_decoder_self_attention_plugin() + atten = network.add_plugin_v2([input_q, present_key, present_value], attention_plug) + + scores = atten.get_output(0) + + scores_t_layer = network.add_shuffle(scores) + scores_t_layer.first_transpose = trt.Permutation([0, 2, 1, 3]) + scores_t_layer.reshape_dims = trt.Dims([0, 0, config.num_attention_heads*config.head_size, 1, 1]) + + + kv_cache_outputs[f"present_key_values.{layer_index}.decoder.key"] = present_key + kv_cache_outputs[f"present_key_values.{layer_index}.decoder.value"] = present_value + + + return scores_t_layer + + +def transformer_decoder_layer( + block, + layer_index, + config, + init_dict, + network, + input_tensor, + imask, + encoder_out, + steps, + kv_cache_inputs, + kv_cache_outputs, + encoder_kv_cache_inputs +): + + + """ + Add the transformer layer + """ + idims = input_tensor.shape + assert len(idims) == 5 + hidden_size = idims[2] + 
self_attention = decoder_self_attention_layer( + block, + layer_index, + config, + init_dict, + network, + input_tensor, + imask, + encoder_out, + steps, + kv_cache_inputs, + kv_cache_outputs + ) + self_attn_out_proj_weight = init_dict[ + f"{block}.layers.{layer_index}.self_attn.out_proj.weight" + ] + self_attn_out_proj_bias = init_dict[ + f"{block}.layers.{layer_index}.self_attn.out_proj.bias" + ] + + # out_proj = network.add_fully_connected( + # self_attention.get_output(0), + # hidden_size, + # self_attn_out_proj_weight, + # self_attn_out_proj_bias, + # ) + + out_proj = custom_fc(network, self_attention.get_output(0), hidden_size, self_attn_out_proj_weight, self_attn_out_proj_bias) + + self_attention_skipln = skipln( + block, + layer_index, + "self_attn_layer_norm", + config, + init_dict, + network, + out_proj.get_output(0), + input_tensor, + ) + + query = self_attention_skipln.get_output(0) + # cross_attention = decoder_cross_attention_layer( + # block, layer_index, config, init_dict, network, query, imask, encoder_out + # ) + + cross_attention = decoder_cross_attention_kvcache_layer( + block, layer_index, config, init_dict, network, query, imask, encoder_out,encoder_kv_cache_inputs + ) + crosss_attention_skipln = skipln( + block, + layer_index, + "encoder_attn_layer_norm", + config, + init_dict, + network, + cross_attention.get_output(0), + query, + ) + attention_ln = crosss_attention_skipln.get_output(0) + + ffn_layer = ffn(block, layer_index, config, init_dict, network, attention_ln) + + return ffn_layer + + + + +def create_top1_plugin(): + pad_idx_field = trt.PluginField( + "pad_idx", + np.array([1], dtype=np.int32), + trt.PluginFieldType.INT32, + ) + + field_collection = trt.PluginFieldCollection( + [pad_idx_field] + ) + + plugin = top1_plg_creator.create_plugin( + "argmax", field_collection + ) + + return plugin + diff --git a/models/nlp/plm/transformer/igie/requirements.txt b/models/nlp/plm/transformer/igie/requirements.txt new file mode 100644 index 00000000..483936bb --- /dev/null +++ b/models/nlp/plm/transformer/igie/requirements.txt @@ -0,0 +1,6 @@ +numpy==1.26.4 +cython +antlr4-python3-runtime==4.9.3 +sacrebleu==2.5.1 +bitarray +scikit-learn \ No newline at end of file diff --git a/models/nlp/plm/transformer/igie/scripts/infer_transformer_fp16_accuracy.sh b/models/nlp/plm/transformer/igie/scripts/infer_transformer_fp16_accuracy.sh new file mode 100644 index 00000000..6bfff839 --- /dev/null +++ b/models/nlp/plm/transformer/igie/scripts/infer_transformer_fp16_accuracy.sh @@ -0,0 +1,45 @@ +set -euo pipefail + +EXIT_STATUS=0 +check_status() +{ + ret_code=${PIPESTATUS[0]} + if [ ${ret_code} != 0 ]; then + echo "fails" + [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1 + fi +} + +BATCH_SIZE=${BATCH_SIZE:=128} +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BATCH_SIZE=${arguments[index]};; + --tgt) Accuracy=${arguments[index]};; + esac +done + +current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) +DATA_DIR=${current_path}/../data/datasets/wmt14.en-fr.joined-dict.newstest2014 +MODEL_DIR=${current_path}/../data/checkpoints/wmt14.en-fr.joined-dict.transformer +CPU_AFFINITY=$(ixsmi topo -m|grep "^GPU0" |awk '{print $(NF-1)}') + +if [[ ! -f "${MODEL_DIR}/Encoder.engine" || ! -f "${MODEL_DIR}/Decoder.engine" ]]; then + echo "Build Engine." 
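+    # Encoder.engine and Decoder.engine are cached in ${MODEL_DIR}; this
+    # branch runs only when either file is missing. Delete both files to
+    # force a rebuild.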
+ python3 ../igie/build_engine.py \ + --model_dir ${MODEL_DIR} +fi + + +echo "Inference(Test Accuracy)" +export Accuracy=${Accuracy:=42} +numactl --physcpubind=${CPU_AFFINITY} python3 inference_wmt14_en_fr_fp16_accuracy.py ${DATA_DIR} \ + --path ${MODEL_DIR}/model.pt \ + --beam 1 --batch-size ${BATCH_SIZE} \ + --remove-bpe --quiet --fp16; check_status; +exit ${EXIT_STATUS} diff --git a/models/nlp/plm/transformer/igie/scripts/infer_transformer_fp16_performance.sh b/models/nlp/plm/transformer/igie/scripts/infer_transformer_fp16_performance.sh new file mode 100644 index 00000000..a93100bb --- /dev/null +++ b/models/nlp/plm/transformer/igie/scripts/infer_transformer_fp16_performance.sh @@ -0,0 +1,45 @@ +set -euo pipefail + + +EXIT_STATUS=0 +check_status() +{ + ret_code=${PIPESTATUS[0]} + if [ ${ret_code} != 0 ]; then + echo "fails" + [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1 + fi +} + +BATCH_SIZE=${BATCH_SIZE:=128} +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BATCH_SIZE=${arguments[index]};; + --tgt) Accuracy=${arguments[index]};; + esac +done + +current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) +DATA_DIR=${current_path}/../data/datasets/wmt14.en-fr.joined-dict.newstest2014 +MODEL_DIR=${current_path}/../data/checkpoints/wmt14.en-fr.joined-dict.transformer +CPU_AFFINITY=$(ixsmi topo -m|grep "^GPU0" |awk '{print $(NF-1)}') + +if [[ ! -f "${MODEL_DIR}/Encoder.engine" || ! -f "${MODEL_DIR}/Decoder.engine" ]]; then + echo "Build Engine." + python3 ../plugin/build_engine.py \ + --model_dir ${MODEL_DIR} +fi + +echo "Inference(Test QPS)" +export Accuracy=${Accuracy:=270} +numactl --physcpubind=${CPU_AFFINITY} python3 inference_wmt14_en_fr_fp16_performance.py \ + --max_batch_size ${BATCH_SIZE} \ + --model_dir ${MODEL_DIR} \ + --data_dir ${DATA_DIR}; check_status; +exit ${EXIT_STATUS} diff --git a/models/nlp/plm/transformer/igie/transformer_cfg.py b/models/nlp/plm/transformer/igie/transformer_cfg.py new file mode 100644 index 00000000..be47c4d2 --- /dev/null +++ b/models/nlp/plm/transformer/igie/transformer_cfg.py @@ -0,0 +1,15 @@ +import json +class TransformerBaseConfig: + def __init__(self, config_path, use_fp16=True): + with open(config_path, "r") as f: + data = json.load(f) + self.num_attention_heads = data["num_attention_heads"] + self.hidden_size = data["hidden_size"] + self.intermediate_size = data["intermediate_size"] + self.num_hidden_layers = data["num_hidden_layers"] + self.head_size = self.hidden_size // self.num_attention_heads + self.tgt_vocab_size = data["tgt_vocab_size"] + self.max_sequence_length = data["max_sequence_length"] + self.sos_token_id = data["sos_token_id"] + self.eos_token_id = data["eos_token_id"] + self.use_fp16 = use_fp16 \ No newline at end of file -- Gitee From c6c84ad15742c4c025bec16209e451bf0ce63d60 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Wed, 10 Dec 2025 17:34:20 +0800 Subject: [PATCH 7/7] update tests ci --- .../scripts/infer_conformer_fp16_accuracy.sh | 2 +- .../infer_conformer_fp16_performance.sh | 2 +- .../cv/object_detection/yolov5/ixrt/README.md | 4 +- .../scripts/infer_yolov5s_fp16_accuracy.sh | 3 + .../scripts/infer_yolov5s_fp16_performance.sh | 3 + .../scripts/infer_yolov5s_int8_accuracy.sh | 3 + .../scripts/infer_yolov5s_int8_performance.sh | 3 + tests/model_info.json | 99 +++++++++++++++ tests/run_igie.py | 73 ++++++++--- tests/run_ixrt.py | 113 ++++++++++++------ 10 files changed, 247 insertions(+), 58 
deletions(-) diff --git a/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_accuracy.sh b/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_accuracy.sh index 1f9b7fb2..e73b551c 100644 --- a/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_accuracy.sh +++ b/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_accuracy.sh @@ -59,5 +59,5 @@ python3 ixrt_inference_accuracy.py \ --infer_type fp16 \ --batch_size ${batchsize} \ --data_dir ${DATA_DIR} \ - --model_dir ${MODEL_DIR} "$@"; check_status + --model_dir ${MODEL_DIR}; check_status exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_performance.sh b/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_performance.sh index 49902122..653185fc 100644 --- a/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_performance.sh +++ b/models/audio/speech_recognition/conformer/igie/scripts/infer_conformer_fp16_performance.sh @@ -60,5 +60,5 @@ python3 ixrt_inference_performance.py \ --infer_type fp16 \ --batch_size ${batchsize} \ --data_dir ${DATA_DIR} \ - --model_dir ${MODEL_DIR} "$@"; check_status + --model_dir ${MODEL_DIR}; check_status exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/object_detection/yolov5/ixrt/README.md b/models/cv/object_detection/yolov5/ixrt/README.md index a1812061..41bec0dc 100644 --- a/models/cv/object_detection/yolov5/ixrt/README.md +++ b/models/cv/object_detection/yolov5/ixrt/README.md @@ -86,11 +86,11 @@ popd ```bash export PROJ_DIR=./ -export DATASETS_DIR=./coco/ export DATASETS_DIR=/Path/to/coco/ +export CHECKPOINTS_DIR=./checkpoints export COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json export EVAL_DIR=${DATASETS_DIR}/images/val2017 -export RUN_DIR=../../ixrt_common/ +export RUN_DIR=../../ixrt_common export CONFIG_DIR=../../ixrt_common/config/YOLOV5M_CONFIG ``` diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh index 52ec959f..aaba3015 100644 --- a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh +++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh @@ -40,6 +40,9 @@ echo ====================== Model Info ====================== echo Model Name : ${MODEL_NAME} echo Onnx Path : ${ORIGINE_MODEL} +CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp +mkdir -p ${CHECKPOINTS_DIR} + step=0 faster=0 CURRENT_MODEL=${ORIGINE_MODEL} diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh index 5e2f97fb..5d5da24d 100644 --- a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh +++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh @@ -40,6 +40,9 @@ echo ====================== Model Info ====================== echo Model Name : ${MODEL_NAME} echo Onnx Path : ${ORIGINE_MODEL} +CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp +mkdir -p ${CHECKPOINTS_DIR} + step=0 faster=0 CURRENT_MODEL=${ORIGINE_MODEL} diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh index 606fc94c..3bf0bf7d 100644 --- 
a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh +++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh @@ -40,6 +40,9 @@ echo ====================== Model Info ====================== echo Model Name : ${MODEL_NAME} echo Onnx Path : ${ORIGINE_MODEL} +CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp +mkdir -p ${CHECKPOINTS_DIR} + step=0 faster=0 CURRENT_MODEL=${ORIGINE_MODEL} diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh index b2983669..8be04a7e 100644 --- a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh +++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh @@ -40,6 +40,9 @@ echo ====================== Model Info ====================== echo Model Name : ${MODEL_NAME} echo Onnx Path : ${ORIGINE_MODEL} +CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp +mkdir -p ${CHECKPOINTS_DIR} + step=0 faster=0 CURRENT_MODEL=${ORIGINE_MODEL} diff --git a/tests/model_info.json b/tests/model_info.json index 771afcc9..7c28d1e9 100644 --- a/tests/model_info.json +++ b/tests/model_info.json @@ -8766,6 +8766,105 @@ "type": "inference", "hasDemo": false, "demoType": "" + }, + { + "display_name": "Transformer", + "model_name": "transformer", + "framework": "ixrt", + "release_version": "25.12", + "release_sdk": "CoreX 4.3.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.3.0", + "latest_gpgpu": "", + "category": "nlp/plm", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/nlp/plm/transformer/ixrt", + "readme_file": "models/nlp/plm/transformer/ixrt/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "", + "need_third_part": "", + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "Transformer", + "model_name": "transformer", + "framework": "igie", + "release_version": "25.12", + "release_sdk": "CoreX 4.3.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.3.0", + "latest_gpgpu": "", + "category": "nlp/plm", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/nlp/plm/transformer/igie", + "readme_file": "models/nlp/plm/transformer/igie/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "", + "need_third_part": "", + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "YOLOv5s", + "model_name": "yolov5s", + "framework": "igie", + "release_version": "25.12", + "release_sdk": "4.3.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.3.0", + "latest_gpgpu": "", + "category": "cv/object_detection", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/cv/object_detection/yolov5s/igie/", + "readme_file": "models/cv/object_detection/yolov5s/igie/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "local/coco", + "download_url": 
"https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5s.pt", + "need_third_part": true, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" } ] } \ No newline at end of file diff --git a/tests/run_igie.py b/tests/run_igie.py index 63841081..b3a918ef 100644 --- a/tests/run_igie.py +++ b/tests/run_igie.py @@ -194,12 +194,19 @@ def run_clf_testcase(model, batch_size): for prec in model["precisions"]: result["result"].setdefault(prec, {"status": "FAIL"}) for bs in batch_size_list: + if bs == "None": + bs = "Default" + script = base_script + f""" + bash scripts/infer_{model_name}_{prec}_accuracy.sh + bash scripts/infer_{model_name}_{prec}_performance.sh + """ + else: + script = base_script + f""" + bash scripts/infer_{model_name}_{prec}_accuracy.sh --bs {bs} + bash scripts/infer_{model_name}_{prec}_performance.sh --bs {bs} + """ result["result"][prec].setdefault(bs, {}) logging.info(f"Start running {model_name} {prec} bs={bs} test case") - script = base_script + f""" - bash scripts/infer_{model_name}_{prec}_accuracy.sh --bs {bs} - bash scripts/infer_{model_name}_{prec}_performance.sh --bs {bs} - """ r, t = run_script(script) sout = r.stdout @@ -270,12 +277,28 @@ def run_detec_testcase(model, batch_size): for prec in model["precisions"]: result["result"].setdefault(prec, {"status": "FAIL"}) for bs in batch_size_list: + if bs == "None": + bs = "Default" + script = base_script + f""" + bash scripts/infer_{model_name}_{prec}_accuracy.sh + bash scripts/infer_{model_name}_{prec}_performance.sh + """ + else: + export_onnx_script = "" + if model_name == "yolov5s": + export_onnx_script = f""" + cd ../{model['model_path']}/yolov5 + python3 export.py --weights yolov5s.pt --include onnx --opset 11 --batch-size {bs} + mv yolov5s.onnx ../checkpoints + rm -rf ../checkpoints/tmp + cd - + """ + script = export_onnx_script + base_script + f""" + bash scripts/infer_{model_name}_{prec}_accuracy.sh --bs {bs} + bash scripts/infer_{model_name}_{prec}_performance.sh --bs {bs} + """ result["result"][prec].setdefault(bs, {}) logging.info(f"Start running {model_name} {prec} bs={bs} test case") - script = base_script + f""" - bash scripts/infer_{model_name}_{prec}_accuracy.sh --bs {bs} - bash scripts/infer_{model_name}_{prec}_performance.sh --bs {bs} - """ r, t = run_script(script) sout = r.stdout @@ -494,10 +517,10 @@ def run_nlp_testcase(model, batch_size): d_url = model["download_url"] checkpoint_n = d_url.split("/")[-1] dataset_n = model["datasets"].split("/")[-1] - target_dirs = {"bert_base_squad": "csarron/bert-base-uncased-squad-v1", "bert_base_ner":"test", "bert_large_squad": "neuralmagic/bert-large-uncased-finetuned-squadv1"} + target_dirs = {"bert_base_squad": "csarron/bert-base-uncased-squad-v1", "bert_base_ner":"test", "bert_large_squad": "neuralmagic/bert-large-uncased-finetuned-squadv1", "transformer": ""} target_dir = target_dirs[model_name] dirname = os.path.dirname(target_dir) - mkdir_script = f"mkdir -p {dirname}" if dirname else "" + mkdir_script = f"mkdir -p {dirname}" if dirname != "" else "" prepare_script = f""" set -x @@ -527,12 +550,19 @@ def run_nlp_testcase(model, batch_size): for prec in model["precisions"]: result["result"].setdefault(prec, {"status": "FAIL"}) for bs in batch_size_list: + if bs == "None": + bs = "Default" + script = base_script + f""" + bash scripts/infer_{model_name}_{prec}_accuracy.sh + bash scripts/infer_{model_name}_{prec}_performance.sh + """ + else: + script = base_script + f""" + bash 
scripts/infer_{model_name}_{prec}_accuracy.sh --bs {bs} + bash scripts/infer_{model_name}_{prec}_performance.sh --bs {bs} + """ result["result"][prec].setdefault(bs, {}) logging.info(f"Start running {model_name} {prec} bs={bs} test case") - script = base_script + f""" - bash scripts/infer_{model_name}_{prec}_accuracy.sh - bash scripts/infer_{model_name}_{prec}_performance.sh - """ r, t = run_script(script) sout = r.stdout @@ -587,12 +617,19 @@ def run_speech_testcase(model, batch_size): for prec in model["precisions"]: result["result"].setdefault(prec, {"status": "FAIL"}) for bs in batch_size_list: + if bs == "None": + bs = "Default" + script = base_script + f""" + bash scripts/infer_{model_name}_{prec}_accuracy.sh + bash scripts/infer_{model_name}_{prec}_performance.sh + """ + else: + script = base_script + f""" + bash scripts/infer_{model_name}_{prec}_accuracy.sh --bs {bs} + bash scripts/infer_{model_name}_{prec}_performance.sh --bs {bs} + """ result["result"][prec].setdefault(bs, {}) logging.info(f"Start running {model_name} {prec} bs={bs} test case") - script = base_script + f""" - bash scripts/infer_{model_name}_{prec}_accuracy.sh - bash scripts/infer_{model_name}_{prec}_performance.sh - """ r, t = run_script(script) sout = r.stdout diff --git a/tests/run_ixrt.py b/tests/run_ixrt.py index c924124b..e8148093 100644 --- a/tests/run_ixrt.py +++ b/tests/run_ixrt.py @@ -219,14 +219,20 @@ def run_clf_testcase(model, batch_size): for prec in model["precisions"]: result["result"].setdefault(prec, {"status": "FAIL"}) for bs in batch_size_list: + if bs == "None": + bs = "Default" + script = base_script + f""" + bash scripts/infer_{model_name}_{prec}_accuracy.sh + bash scripts/infer_{model_name}_{prec}_performance.sh + """ + else: + script = base_script + f""" + bash scripts/infer_{model_name}_{prec}_accuracy.sh --bs {bs} + bash scripts/infer_{model_name}_{prec}_performance.sh --bs {bs} + """ result["result"][prec].setdefault(bs, {}) logging.info(f"Start running {model_name} {prec} bs={bs} test case") - script = base_script + f""" - bash scripts/infer_{model_name}_{prec}_accuracy.sh --bs {bs} - bash scripts/infer_{model_name}_{prec}_performance.sh --bs {bs} - """ - if model_name == "swin_transformer_large": script = base_script @@ -325,13 +331,44 @@ def run_detec_testcase(model, batch_size): for prec in model["precisions"]: result["result"].setdefault(prec, {"status": "FAIL"}) for bs in batch_size_list: + if bs == "None": + bs = "Default" + script = base_script + f""" + bash scripts/infer_{model_name}_{prec}_accuracy.sh + bash scripts/infer_{model_name}_{prec}_performance.sh + """ + else: + export_onnx_script = "" + if model_name == "yolov5": + export_onnx_script = f""" + cd ../{model['model_path']}/yolov5 + python3 export.py --weights yolov5m.pt --include onnx --opset 11 --batch-size {bs} + mv yolov5m.onnx ../checkpoints + rm -rf ../checkpoints/tmp + cd - + """ + elif model_name == "yolox": + export_onnx_script = f""" + cd ../{model['model_path']}/YOLOX + python3 tools/export_onnx.py --output-name ../yolox.onnx -n yolox-m -c yolox_m.pth --batch-size {bs} + rm -rf ../checkpoints/tmp + cd - + """ + elif model_name == "yolov5s": + export_onnx_script = f""" + cd ../{model['model_path']}/yolov5 + python3 export.py --weights yolov5s.pt --include onnx --opset 11 --batch-size {bs} + mv yolov5s.onnx ../checkpoints + rm -rf ../checkpoints/tmp + cd - + """ + script = export_onnx_script + base_script + f""" + bash scripts/infer_{model_name}_{prec}_accuracy.sh --bs {bs} + bash 
scripts/infer_{model_name}_{prec}_performance.sh --bs {bs} + """ result["result"][prec].setdefault(bs, {}) logging.info(f"Start running {model_name} {prec} bs={bs} test case") result["result"].setdefault(prec, {"status": "FAIL"}) - script = base_script + f""" - bash scripts/infer_{model_name}_{prec}_accuracy.sh --bs {bs} - bash scripts/infer_{model_name}_{prec}_performance.sh --bs {bs} - """ if model_name == "rtmpose": script = f""" @@ -568,30 +605,26 @@ def run_nlp_testcase(model, batch_size): for prec in model["precisions"]: result["result"].setdefault(prec, {"status": "FAIL"}) for bs in batch_size_list: - result["result"][prec].setdefault(bs, {}) - logging.info(f"Start running {model_name} {prec} bs: {bs} test case") script = base_script - - if model_name == "bert_base_squad": - script = f""" - set -x - cd ../{model['model_path']}/python - bash script/infer_bert_base_squad_{prec}_ixrt.sh --bs {bs} - """ - elif model_name == "bert_large_squad": - script = f""" - set -x - cd ../{model['model_path']}/ - bash script/infer_bert_large_squad_fp16_accuracy.sh --bs {bs} - bash script/infer_bert_large_squad_fp16_performance.sh --bs {bs} - """ - if prec == "int8": + if bs == "None": + bs = "Default" + if model_name in ["bert_base_squad", "bert_large_squad", "transformer"]: + script = f""" + set -x + cd ../{model['model_path']}/ + bash scripts/infer_{model_name}_{prec}_accuracy.sh + bash scripts/infer_{model_name}_{prec}_performance.sh + """ + else: + if model_name in ["bert_base_squad", "bert_large_squad", "transformer"]: script = f""" - set -x - cd ../{model['model_path']}/ - bash script/infer_bert_large_squad_int8_accuracy.sh --bs {bs} - bash script/infer_bert_large_squad_int8_performance.sh --bs {bs} + set -x + cd ../{model['model_path']}/ + bash scripts/infer_{model_name}_{prec}_accuracy.sh --bs {bs} + bash scripts/infer_{model_name}_{prec}_performance.sh --bs {bs} """ + result["result"][prec].setdefault(bs, {}) + logging.info(f"Start running {model_name} {prec} bs: {bs} test case") r, t = run_script(script) sout = r.stdout @@ -611,7 +644,7 @@ def run_nlp_testcase(model, batch_size): result["result"][prec][bs].update(get_metric_result(m)) result["result"][prec]["status"] = "PASS" - if model_name == "bert_large_squad": + if model_name == "bert_large_squad" or model_name == "bert_base_squad": patterns = { "LatencyQPS": r"Latency QPS\s*:\s*(\d+\.?\d*)", "exact_match": r"\"exact_match\"\s*:\s*(\d+\.?\d*)", @@ -658,13 +691,21 @@ def run_speech_testcase(model, batch_size): for prec in model["precisions"]: result["result"].setdefault(prec, {"status": "FAIL"}) for bs in batch_size_list: + if bs == "None": + bs = "Default" + script = f""" + cd ../{model['model_path']} + bash scripts/infer_{model_name}_{prec}_accuracy.sh + bash scripts/infer_{model_name}_{prec}_performance.sh + """ + else: + script = f""" + cd ../{model['model_path']} + bash scripts/infer_{model_name}_{prec}_accuracy.sh --bs {bs} + bash scripts/infer_{model_name}_{prec}_performance.sh --bs {bs} + """ result["result"][prec].setdefault(bs, {}) logging.info(f"Start running {model_name} {prec} bs:{bs} test case") - script = f""" - cd ../{model['model_path']} - bash scripts/infer_{model_name}_{prec}_accuracy.sh --bs {bs} - bash scripts/infer_{model_name}_{prec}_performance.sh --bs {bs} - """ if model_name == "transformer_asr": script = f""" -- Gitee
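
The batch-size dispatch that this patch threads through `run_igie.py` and `run_ixrt.py` reduces to one pattern: the sentinel string `"None"` selects the plain accuracy/performance scripts and keys the result under `"Default"`, while any other value appends `--bs`. Below is a minimal, self-contained sketch of that pattern; the helper name `make_test_script` is illustrative and not part of the repo.

```python
def make_test_script(model_name, prec, bs, base_script=""):
    """Build the shell snippet for one (precision, batch size) test case.

    The sentinel "None" means the model's scripts take no --bs flag; the
    result is then keyed under "Default", mirroring run_igie.py/run_ixrt.py.
    """
    if bs == "None":
        bs = "Default"
        flags = ""
    else:
        flags = f" --bs {bs}"
    script = base_script + f"""
bash scripts/infer_{model_name}_{prec}_accuracy.sh{flags}
bash scripts/infer_{model_name}_{prec}_performance.sh{flags}
"""
    return bs, script

# Example: "Default" key with no --bs flag vs. key "32" with --bs 32.
print(make_test_script("transformer", "fp16", "None")[1])
print(make_test_script("yolov5s", "fp16", "32")[1])
```

Centralizing the branch this way keeps the per-category test runners (classification, detection, NLP, speech) from duplicating the `if bs == "None"` block, which is the main source of drift between `run_igie.py` and `run_ixrt.py` in the current patch.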