From b991b701806889807f49ab6201f37eb6bbb4ea20 Mon Sep 17 00:00:00 2001 From: yanpeng Date: Wed, 22 Oct 2025 08:56:25 +0800 Subject: [PATCH] add kml kulitho kudnn tests --- kml/build.sh | 13 + kml/run.sh | 5 + kml/test/Makefile | 36 ++ kml/test/kml_blas_test.c | 24 + kml/test/kml_blas_test.h | 14 + kml/test/kml_conv_test.c | 40 ++ kml/test/kml_conv_test.h | 14 + kml/test/kml_fft_test.c | 38 ++ kml/test/kml_fft_test.h | 14 + kml/test/kml_ipl_test.c | 33 + kml/test/kml_ipl_test.h | 14 + kml/test/kml_lapack_test.c | 23 + kml/test/kml_lapack_test.h | 14 + kml/test/kml_math_test.c | 18 + kml/test/kml_math_test.h | 14 + kml/test/kml_scalapack_test.c | 77 +++ kml/test/kml_scalapack_test.h | 14 + kml/test/kml_solver_test.c | 154 +++++ kml/test/kml_solver_test.h | 14 + kml/test/kml_spblas_test.c | 29 + kml/test/kml_spblas_test.h | 14 + kml/test/kml_svml_test.c | 20 + kml/test/kml_svml_test.h | 14 + kml/test/kml_test_main.cpp | 75 +++ kml/test/kml_vml_test.c | 13 + kml/test/kml_vml_test.h | 14 + kml/test/kml_vsl_test.c | 41 ++ kml/test/kml_vsl_test.h | 14 + kudnn/build.sh | 20 + kudnn/run.sh | 8 + kudnn/src/function.cpp | 1112 +++++++++++++++++++++++++++++++++ kudnn/src/function.hpp | 73 +++ kudnn/src/main.cpp | 45 ++ kulitho/build.sh | 18 + kulitho/run.sh | 7 + kulitho/test/Makefile | 21 + kulitho/test/kulitho_test.cpp | 92 +++ 37 files changed, 2203 insertions(+) create mode 100644 kml/build.sh create mode 100644 kml/run.sh create mode 100644 kml/test/Makefile create mode 100644 kml/test/kml_blas_test.c create mode 100644 kml/test/kml_blas_test.h create mode 100644 kml/test/kml_conv_test.c create mode 100644 kml/test/kml_conv_test.h create mode 100644 kml/test/kml_fft_test.c create mode 100644 kml/test/kml_fft_test.h create mode 100644 kml/test/kml_ipl_test.c create mode 100644 kml/test/kml_ipl_test.h create mode 100644 kml/test/kml_lapack_test.c create mode 100644 kml/test/kml_lapack_test.h create mode 100644 kml/test/kml_math_test.c create mode 100644 kml/test/kml_math_test.h create mode 100644 kml/test/kml_scalapack_test.c create mode 100644 kml/test/kml_scalapack_test.h create mode 100644 kml/test/kml_solver_test.c create mode 100644 kml/test/kml_solver_test.h create mode 100644 kml/test/kml_spblas_test.c create mode 100644 kml/test/kml_spblas_test.h create mode 100644 kml/test/kml_svml_test.c create mode 100644 kml/test/kml_svml_test.h create mode 100644 kml/test/kml_test_main.cpp create mode 100644 kml/test/kml_vml_test.c create mode 100644 kml/test/kml_vml_test.h create mode 100644 kml/test/kml_vsl_test.c create mode 100644 kml/test/kml_vsl_test.h create mode 100644 kudnn/build.sh create mode 100644 kudnn/run.sh create mode 100644 kudnn/src/function.cpp create mode 100644 kudnn/src/function.hpp create mode 100644 kudnn/src/main.cpp create mode 100644 kulitho/build.sh create mode 100644 kulitho/run.sh create mode 100644 kulitho/test/Makefile create mode 100644 kulitho/test/kulitho_test.cpp diff --git a/kml/build.sh b/kml/build.sh new file mode 100644 index 0000000..871ece4 --- /dev/null +++ b/kml/build.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# export HPCKIT_PATH=/opt/HPCKit/ +if [ -z "${HPCKIT_PATH}" ]; then + echo "Error: Environment variable 'HPCKIT_PATH' is not set!" + echo "Please configure HPCKIT_PATH to point to the HPCKit installation directory, for example:" + echo " export HPCKIT_PATH=/opt/HPCKit" + echo "Then re-run the build command." 
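+    # Without HPCKIT_PATH the kml/test Makefile cannot locate the hmpi headers and libraries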
+    exit 1
+fi
+echo "HPCKIT_PATH=${HPCKIT_PATH}"
+cd ./test
+make clean
+make
diff --git a/kml/run.sh b/kml/run.sh
new file mode 100644
index 0000000..81ca259
--- /dev/null
+++ b/kml/run.sh
@@ -0,0 +1,5 @@
+export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
+export OMPI_ALLOW_RUN_AS_ROOT=1
+
+cd ./test
+mpirun -np 1 ./test_kml
\ No newline at end of file
diff --git a/kml/test/Makefile b/kml/test/Makefile
new file mode 100644
index 0000000..76329ff
--- /dev/null
+++ b/kml/test/Makefile
@@ -0,0 +1,36 @@
+# Makefile
+CXX = g++
+CC = gcc
+CXXFLAGS = -I. -I $(HPCKIT_PATH)/latest/hmpi/gcc/release/hmpi/include -L$(HPCKIT_PATH)/latest/hmpi/gcc/release/hmpi/lib/
+CFLAGS = -I. -I $(HPCKIT_PATH)/latest/hmpi/gcc/release/hmpi/include -L$(HPCKIT_PATH)/latest/hmpi/gcc/release/hmpi/lib/
+LDFLAGS = -lgtest -lgtest_main -lpthread -L. -lkfft -lkfftf -lkffth -lkblas -lklapack_full -lm -lksolver -lmpi -lkscalapack_full -lkipl -lkconv -lkm -lkvml -lkvsl -lksvml -lkspblas
+
+TARGET = test_kml
+SOURCES_C = kml_fft_test.c kml_blas_test.c kml_lapack_test.c kml_solver_test.c kml_scalapack_test.c kml_ipl_test.c kml_conv_test.c kml_vml_test.c kml_vsl_test.c kml_svml_test.c kml_math_test.c kml_spblas_test.c
+SOURCES_CPP = kml_test_main.cpp
+OBJECTS_C = $(SOURCES_C:.c=.o)
+OBJECTS_CPP = $(SOURCES_CPP:.cpp=.o)
+OBJECTS = $(OBJECTS_C) $(OBJECTS_CPP)
+
+# Default target
+all: $(TARGET)
+
+# Link the test executable
+$(TARGET): $(OBJECTS)
+	$(CXX) $(OBJECTS) -o $@ $(LDFLAGS) $(CXXFLAGS)
+
+# Compile C sources
+%.o: %.c
+	$(CC) $(CFLAGS) -c $< -o $@
+
+# Compile C++ sources
+%.o: %.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+# Clean build artifacts
+clean:
+	rm -f *.o $(TARGET)
+
+rebuild: clean all
+
+.PHONY: all clean rebuild
\ No newline at end of file
diff --git a/kml/test/kml_blas_test.c b/kml/test/kml_blas_test.c
new file mode 100644
index 0000000..e5302e2
--- /dev/null
+++ b/kml/test/kml_blas_test.c
@@ -0,0 +1,24 @@
+#include "kml_blas_test.h"
+#include "kblas.h"
+#include <stdio.h>
+
+int kml_blas_sgemm_01()
+{
+    int m = 4, k = 3, n = 4, lda = 4, ldb = 3, ldc = 4;
+    float alpha = 1.0, beta = 2.0;
+    float a[12] = {0.340188, -0.105617, 0.283099,
+                   0.298440, 0.411647, -0.302449,
+                   -0.164777, 0.268230, -0.222225,
+                   0.053970, -0.022603, 0.128871};
+    float b[12] = {-0.135216, 0.013401, 0.452230, 0.416195,
+                   0.135712, 0.217297, -0.358397, 0.106969,
+                   -0.483699, -0.257113, -0.362768, 0.304177};
+    float c[16] = {-0.343321, -0.099056, -0.370210, -0.391191,
+                   0.498924, -0.281743, 0.012932, 0.339112,
+                   0.112640, -0.203968, 0.137552, 0.024287,
+                   -0.006417, 0.472775, -0.207483, 0.271358};
+
+    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha,
+                a, lda, b, ldb, beta, c, ldc);
+    return 1; // success
+}
\ No newline at end of file
diff --git a/kml/test/kml_blas_test.h b/kml/test/kml_blas_test.h
new file mode 100644
index 0000000..f36c029
--- /dev/null
+++ b/kml/test/kml_blas_test.h
@@ -0,0 +1,14 @@
+#ifndef KML_BLAS_TEST_H
+#define KML_BLAS_TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int kml_blas_sgemm_01();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // KML_BLAS_TEST_H
\ No newline at end of file
diff --git a/kml/test/kml_conv_test.c b/kml/test/kml_conv_test.c
new file mode 100644
index 0000000..99a59b8
--- /dev/null
+++ b/kml/test/kml_conv_test.c
@@ -0,0 +1,40 @@
+#include "kml_conv_test.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "conv.h"
+
+
+int kml_conv_conv2d_01()
+{
+    int batch = 1;
+    int inputChannels = 1;
+    int inputHeight = 6;
+    int inputWidth = 6;
+    int kernelHeight = 3;
+    int kernelWidth = 3;
+    int strideY = 1;
+    int strideX = 1;
+    int padHeight = 0;
+    int padWidth = 0;
+    int dilationY = 1;
+    int
dilationX = 1; + int outputChannels = 1; + float input[36] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, + 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, + 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, + 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, 33.0, 34.0, 35.0, 36.0}; + float kernel[9] = {1.0, 2.0, 3.0, + 4.0, 5.0, 6.0, + 7.0, 8.0, 9.0}; + + float *bias = NULL; + int outputHeight = (inputHeight + 2 * padHeight - dilationY * (kernelHeight - 1) - 1) / strideY + 1; + int outputWidth = (inputWidth + 2 * padWidth - dilationX * (kernelWidth - 1) - 1) / strideX + 1; + + float output[16] = {0.0}; + conv2d_fp32(input, batch, inputChannels, inputHeight, inputWidth, kernel, kernelHeight, kernelWidth, strideY, strideX, padHeight, padWidth, dilationY, dilationX, bias, output, outputChannels); + + return 1; // 成功 +} \ No newline at end of file diff --git a/kml/test/kml_conv_test.h b/kml/test/kml_conv_test.h new file mode 100644 index 0000000..6e386cf --- /dev/null +++ b/kml/test/kml_conv_test.h @@ -0,0 +1,14 @@ +#ifndef KML_CONV_TEST_H +#define KML_CONV_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_conv_conv2d_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_CONV_TEST_H \ No newline at end of file diff --git a/kml/test/kml_fft_test.c b/kml/test/kml_fft_test.c new file mode 100644 index 0000000..6c523fc --- /dev/null +++ b/kml/test/kml_fft_test.c @@ -0,0 +1,38 @@ +#include "kml_fft_test.h" +#include + +#include "kfft.h" + +int kml_fft_c2c_01() +{ + int rank = 2; + int *n = (int*)kml_fft_malloc(sizeof(int) * rank); + n[0] = 2; + n[1] = 3; + + double init[6][2] = {{120, 0}, {8, 8}, {0, 0}, {0, 16}, {0, 16}, {-8, 8}}; + kml_fft_complex *in; + in = (kml_fft_complex*)kml_fft_malloc(sizeof(kml_fft_complex) * n[0] * n[1]); + for (int i = 0; i < n[0] * n[1]; i++) { + in[i][0] = init[i][0]; + in[i][1] = init[i][1]; + } + + kml_fft_complex *out; + out = (kml_fft_complex*)kml_fft_malloc(sizeof(kml_fft_complex) * n[0] * n[1]); + + kml_fft_plan plan; + plan = kml_fft_plan_dft(rank, n, in, out, KML_FFT_FORWARD, KML_FFT_ESTIMATE); + if (!plan) { + return 0; + } + + kml_fft_execute_dft(plan, in, out); + + kml_fft_destroy_plan(plan); + kml_fft_free(n); + kml_fft_free(in); + kml_fft_free(out); + + return 1; // 成功 +} \ No newline at end of file diff --git a/kml/test/kml_fft_test.h b/kml/test/kml_fft_test.h new file mode 100644 index 0000000..7cf542e --- /dev/null +++ b/kml/test/kml_fft_test.h @@ -0,0 +1,14 @@ +#ifndef KML_FFT_TEST_H +#define KML_FFT_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_fft_c2c_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_FFT_TEST_H \ No newline at end of file diff --git a/kml/test/kml_ipl_test.c b/kml/test/kml_ipl_test.c new file mode 100644 index 0000000..94c302e --- /dev/null +++ b/kml/test/kml_ipl_test.c @@ -0,0 +1,33 @@ +#include "kml_ipl_test.h" +#include +#include +#include "kipl.h" + + +int kml_ipl_interp2d_01() +{ + const size_t xMin = 4; + const size_t xMax = 2048; + const size_t xSize = 4; + const size_t yMin = 4; + const size_t yMax = 2048; + const size_t ySize = 4; + const float xValLo = 1; + const float xValHi = 2; + const float yValLo = 1; + const float yValHi = 2; + float xArr[4] = {1,1.33,1.67,2}; + float yArr[4] = {1,1.33,1.67,2}; + float* zArr = (float*)malloc(xSize * ySize * sizeof(float)); + float x = 1; + float y = 1; + float z = 0; + kml_float_interp2d *interp = kml_float_interp2d_alloc(kml_float_interp2d_bicubic, xSize, ySize); + kml_interp_accel *xacc = kml_interp_accel_alloc(); + kml_interp_accel *yacc = 
kml_interp_accel_alloc(); + kml_float_interp2d_init(interp, xArr, yArr, zArr, xSize, ySize); + z = kml_float_interp2d_eval(interp, xArr, yArr, zArr, x, y, xacc, yacc); + + + return 1; // 成功 +} \ No newline at end of file diff --git a/kml/test/kml_ipl_test.h b/kml/test/kml_ipl_test.h new file mode 100644 index 0000000..400efd3 --- /dev/null +++ b/kml/test/kml_ipl_test.h @@ -0,0 +1,14 @@ +#ifndef KML_IPL_TEST_H +#define KML_IPL_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_ipl_interp2d_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_IPL_TEST_H \ No newline at end of file diff --git a/kml/test/kml_lapack_test.c b/kml/test/kml_lapack_test.c new file mode 100644 index 0000000..f59a0c4 --- /dev/null +++ b/kml/test/kml_lapack_test.c @@ -0,0 +1,23 @@ +#include "kml_lapack_test.h" +#include "klapack.h" +#include + +int kml_lapack_dgetrf_01() +{ + char trans = 'N'; + int n = 5; + int nrhs = 1; + int lda = 5; + int ldb = 5; + int ipiv[5]; + int info = 0; + double a[]={72.1673 , 66.1857 , 64.7644 , 28.0199 , 91.4151, + 6.5180 , 62.8483 , 72.4323 , 46.5760 , 8.6928, + 28.9821 , 42.1828 , 18.6437 , 99.8612 , 35.6972, + 67.9812 , 5.0880 , 85.5035 ,79.2945 , 54.5920, + 28.6869 , 49.7512 , 7.5186 ,28.6929 , 84.6041}; + double b[]={1.0, 2.0, 3.0, 4.0, 5.0}; + dgetrf_(&n, &n, a, &lda, ipiv, &info); + dgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, &info); + return info; +} \ No newline at end of file diff --git a/kml/test/kml_lapack_test.h b/kml/test/kml_lapack_test.h new file mode 100644 index 0000000..3749aca --- /dev/null +++ b/kml/test/kml_lapack_test.h @@ -0,0 +1,14 @@ +#ifndef KML_LAPACK_TEST_H +#define KML_LAPACK_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_lapack_dgetrf_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_LAPACK_TEST_H \ No newline at end of file diff --git a/kml/test/kml_math_test.c b/kml/test/kml_math_test.c new file mode 100644 index 0000000..903514b --- /dev/null +++ b/kml/test/kml_math_test.c @@ -0,0 +1,18 @@ +#include "kml_math_test.h" +#include +#include +#include +#include "km.h" + + +int kml_math_sin_01() +{ + double pi = acos(-1); + // typical usage + double a = pi/6, b = 1.0, c = -3*pi/4, d = pi/3; + // special handling + double e = INFINITY, f = -INFINITY, g = NAN; + sin(a); + + return 1; // 成功 +} \ No newline at end of file diff --git a/kml/test/kml_math_test.h b/kml/test/kml_math_test.h new file mode 100644 index 0000000..e03f769 --- /dev/null +++ b/kml/test/kml_math_test.h @@ -0,0 +1,14 @@ +#ifndef KML_MATH_TEST_H +#define KML_MATH_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_math_sin_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_MATH_TEST_H \ No newline at end of file diff --git a/kml/test/kml_scalapack_test.c b/kml/test/kml_scalapack_test.c new file mode 100644 index 0000000..44dcbf0 --- /dev/null +++ b/kml/test/kml_scalapack_test.c @@ -0,0 +1,77 @@ +#include "kml_scalapack_test.h" + +#include +#include +#include +#include "kscalapack.h" +void blacs_get_(int*, int*, int*); +void blacs_gridinit_(int*, char*, int*, int*); +void blacs_gridinfo_(int*, int*, int*, int*, int*); +void descinit_(int*, int*, int*, int*, int*, int*, int*, int*, int*, int*); +void blacs_gridexit_(int*); +int numroc_(int*, int*, int*, int*, int*); + +int kml_scalapack_pdgetrf_01() { + // 默认值 + int izero=0; + int ione=1; + int N = 1000; // 矩阵维度 + int MB = 64, NB = 64; // 块大小 + int P = 1, Q = 1; // 进程网格尺寸 + + // 初始化MPI环境 + MPI_Init(NULL, NULL); + int iam, np; + MPI_Comm_rank(MPI_COMM_WORLD, &iam); + 
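+    // np must equal P * Q (the process grid size) or the test aborts below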
+    MPI_Comm_size(MPI_COMM_WORLD, &np);
+
+    if (P * Q != np) {
+        if (iam == 0) {
+            fprintf(stderr, "Error: The product of P and Q must equal the number of processes.\n");
+        }
+        MPI_Finalize();
+        return -1;
+    }
+
+    // Create the BLACS context
+    int ictxt, myrow, mycol;
+    blacs_get_(&izero, &izero, &ictxt); // -> Create context
+    blacs_gridinit_(&ictxt, "Row", &P, &Q);
+    blacs_gridinfo_(&ictxt, &P, &Q, &myrow, &mycol);
+
+    // Matrix distribution descriptors
+    int desca[9], descb[9];
+    int info;
+
+    // Compute the local matrix dimensions
+    int M_loc = numroc_(&N, &MB, &myrow, &izero, &P);
+    int N_loc = numroc_(&N, &NB, &mycol, &izero, &Q);
+
+    // Allocate storage for the local matrix
+    double *A = (double *)malloc(M_loc * N_loc * sizeof(double));
+    if (!A) {
+        printf("Process %d could not allocate memory for A\n", iam);
+        return -1;
+    }
+
+    // Initialize the local matrix data
+    for (int i = 0; i < M_loc * N_loc; ++i) {
+        A[i] = drand48(); // fill with random values
+    }
+
+    // Set up the matrix descriptor
+    descinit_(desca, &N, &N, &MB, &NB, &izero, &izero, &ictxt, &M_loc, &info);
+
+
+    // Call ScaLAPACK pdgetrf to perform the LU factorization
+    int *ipiv = (int *)malloc((M_loc + MB) * sizeof(int));
+    pdgetrf_(&N, &N, A, &ione, &ione, desca, ipiv, &info);
+
+    // Clean up
+    free(A);
+    free(ipiv);
+    blacs_gridexit_(&ictxt);
+    MPI_Finalize();
+
+    return info;
+}
\ No newline at end of file
diff --git a/kml/test/kml_scalapack_test.h b/kml/test/kml_scalapack_test.h
new file mode 100644
index 0000000..2ce9a1e
--- /dev/null
+++ b/kml/test/kml_scalapack_test.h
@@ -0,0 +1,14 @@
+#ifndef KML_SCALAPACK_TEST_H
+#define KML_SCALAPACK_TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int kml_scalapack_pdgetrf_01();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // KML_SCALAPACK_TEST_H
\ No newline at end of file
diff --git a/kml/test/kml_solver_test.c b/kml/test/kml_solver_test.c
new file mode 100644
index 0000000..cf2df0d
--- /dev/null
+++ b/kml/test/kml_solver_test.c
@@ -0,0 +1,154 @@
+#include "kml_solver_test.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "kml_solver.h"
+void PrepareMatrix(KmlSolverMatrixStore *store,
+                   KmlSolverMatrixOption *opt, int nRow, int nCol)
+{
+    store->indexType = KMLSS_INDEX_INT32;
+    store->valueType = KMLSS_VALUE_FP64;
+    store->nRow = nRow;
+    store->nCol = nCol;
+    opt->fieldMask = KMLSS_MATRIX_OPTION_TYPE;
+    opt->type = KMLSS_MATRIX_GEN;
+}
+
+int kml_solver_tydss_01()
+{
+    int ierr;
+    int n = 8;
+    int nrhs = 1;
+
+    // Create matrix A
+    int ia[9] = {0, 2, 4, 6, 7, 8, 10, 12, 14};
+    int ja[14] = {0, 7, 1, 6, 2, 5, 3, 4, 2, 5, 1, 6, 0, 7};
+    double a[14] = {1.0, 2.0, -2.0, 3.0, 3.0, 4.0, -4.0, 5.0, 4.0, -6.0, 3.0, 7.0, 2.0, 8.0};
+    KmlSolverMatrixStore storeA;
+    KmlSolverMatrixOption optA;
+    PrepareMatrix(&storeA, &optA, n, n);
+    storeA.format = KMLSS_MATRIX_STORE_CSR;
+    storeA.csr.rowOffset = ia;
+    storeA.csr.colIndex = ja;
+    storeA.csr.value = a;
+
+    KmlSolverMatrix *A;
+    ierr = KmlSolverMatrixCreate(&A, &storeA, &optA);
+    if (ierr != KMLSS_NO_ERROR) {
+        printf("ERROR when create A: %d\n", ierr);
+        return 1;
+    }
+
+    // Create vector b
+    double b[8] = {3.0, 1.0, 7.0, -4.0, 5.0, -2.0, 10.0, 10.0};
+    KmlSolverMatrixStore storeB;
+    KmlSolverMatrixOption optB;
+    PrepareMatrix(&storeB, &optB, n, nrhs);
+    storeB.format = KMLSS_MATRIX_STORE_DENSE_COL_MAJOR;
+    storeB.dense.value = b;
+    storeB.dense.ld = n;
+    KmlSolverMatrix *B;
+    ierr = KmlSolverMatrixCreate(&B, &storeB, &optB);
+
+    if (ierr != KMLSS_NO_ERROR) {
+        printf("ERROR when create b: %d\n", ierr);
+        return 1;
+    }
+
+    // Create vector x
+    double x[8] = {0};
+    KmlSolverMatrixStore storeX;
+    KmlSolverMatrixOption optX;
+    PrepareMatrix(&storeX, &optX, n, nrhs);
+    storeX.format = KMLSS_MATRIX_STORE_DENSE_COL_MAJOR;
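+    // X uses the same dense column-major layout as B; the leading dimension is set to n below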
storeX.dense.value = x; + storeX.dense.ld = n; + KmlSolverMatrix *X; + ierr = KmlSolverMatrixCreate(&X, &storeX, &optX); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR when create x: %d\n", ierr); + return 1; + } + + // Init solver + KmlDssInitOption opt; + opt.fieldMask = KMLDSS_INIT_OPTION_NTHREADS; + opt.nThreads = 32; + KmlDssSolver *solver; + ierr = KmlDssInit(&solver, &opt); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR in KmlDssInit: %d\n", ierr); + return ierr; + } + + // Analyze + KmlDssAnalyzeOption optAnalyze; + optAnalyze.fieldMask = KMLDSS_ANALYZE_OPTION_NTHREADS_RDR; + optAnalyze.nThreadsRdr = 8; + ierr = KmlDssAnalyze(solver, A, &optAnalyze); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR in KmlDssAnalyze: %d\n", ierr); + return ierr; + } + + // Factorize + KmlDssFactorizeOption optFact; + optFact.fieldMask = KMLDSS_FACTORIZE_OPTION_PERTURBATION_THRESHOLD; + optFact.perturbationThreshold = 1e-8; + ierr = KmlDssFactorize(solver, A, &optFact); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR in KmlDssFactorize: %d\n", ierr); + return ierr; + } + + // Solve + KmlDssSolveOption optSolve; + optSolve.fieldMask = KMLDSS_SOLVE_OPTION_SOLVE_STAGE | + KMLDSS_SOLVE_OPTION_REFINE_METHOD; + optSolve.stage = KMLDSS_SOLVE_ALL; + optSolve.refineMethod = KMLDSS_REFINE_OFF; + ierr = KmlDssSolve(solver, B, X, &optSolve); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR in KmlDssSolve: %d\n", ierr); + return ierr; + } + + // Set new values of A + double a1[14] = {2.0, 3.0, -3.0, 4.0, 4.0, 5.0, -5.0, + 6.0, 5.0, -7.0, 4.0, 8.0, 3.0, 9.0}; + KmlSolverMatrixSetValue(A, a1); + // Factorize with new values + ierr = KmlDssFactorize(solver, A, &optFact); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR in KmlDssFactorize: %d\n", ierr); + return ierr; + } + + + + // Set new values of B + double b1[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + KmlSolverMatrixSetValue(B, b1); + // Solve with new values + ierr = KmlDssSolve(solver, B, X, &optSolve); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR in KmlDssSolve: %d\n", ierr); + return ierr; + } + + // Query + KmlDssInfo info; + info.fieldMask = KMLDSS_INFO_PEAK_MEM; + + ierr = KmlDssQuery(solver, &info); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR in KmlDssQuery: %d\n", ierr); + return ierr; + } + + // Destroy + KmlDssClean(&solver); + KmlSolverMatrixDestroy(&A); + KmlSolverMatrixDestroy(&B); + KmlSolverMatrixDestroy(&X); + return 0; +} \ No newline at end of file diff --git a/kml/test/kml_solver_test.h b/kml/test/kml_solver_test.h new file mode 100644 index 0000000..48f7bbe --- /dev/null +++ b/kml/test/kml_solver_test.h @@ -0,0 +1,14 @@ +#ifndef KML_SOLVER_TEST_H +#define KML_SOLVER_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_solver_tydss_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_SOLVER_TEST_H \ No newline at end of file diff --git a/kml/test/kml_spblas_test.c b/kml/test/kml_spblas_test.c new file mode 100644 index 0000000..75739ed --- /dev/null +++ b/kml/test/kml_spblas_test.c @@ -0,0 +1,29 @@ +#include "kml_spblas_test.h" +#include +#include +#include +#include "kspblas.h" + + +int kml_spblas_spgemm_01() +{ + kml_sparse_operation_t opt = KML_SPARSE_OPERATION_NON_TRANSPOSE; + KML_INT m = 3; + KML_INT n = 3; + KML_INT k = 3; + float alpha = 1.0; + float beta = 1.0; + char *matdescra = "G00C"; //一般矩阵,基0索引 + float val[4] = {9, 8, 5, 2}; + KML_INT indx[4] = {2, 1, 2, 2}; + KML_INT pntrb[3] = {0, 2, 3}; + KML_INT pntre[3] = {2, 3, 4}; + float b[9] = {1, 5, 7, 4, 7, 7, 3, 3, 7}; + float c[9] = {0, 8, 8, 0, 2, 8, 3, 5, 
6}; + KML_INT ldb = 3; + KML_INT ldc = 3; + kml_sparse_status_t status = kml_sparse_scsrmm(opt, m, n, k, alpha, matdescra, val, indx, pntrb, pntre, b, ldb, beta, c, ldc); + + + return 1; // 成功 +} \ No newline at end of file diff --git a/kml/test/kml_spblas_test.h b/kml/test/kml_spblas_test.h new file mode 100644 index 0000000..3b1cbfc --- /dev/null +++ b/kml/test/kml_spblas_test.h @@ -0,0 +1,14 @@ +#ifndef KML_SPBLAS_TEST_H +#define KML_SPBLAS_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_spblas_spgemm_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_SPBLAS_TEST_H \ No newline at end of file diff --git a/kml/test/kml_svml_test.c b/kml/test/kml_svml_test.c new file mode 100644 index 0000000..c6d6eb4 --- /dev/null +++ b/kml/test/kml_svml_test.c @@ -0,0 +1,20 @@ +#include "kml_vml_test.h" +#include +#include +#include +#include "kvml.h" + + +int kml_vml_sin_01() +{ + int i, len = 4; + float src[] = {0.0f, 2.0f, INFINITY, NAN}; + float* dst = (float*)malloc(sizeof(float) * len); + if (dst == NULL) { + printf("Malloc Failed!\n"); + return 0; + } + vssin(len, src, dst); + + return 1; // 成功 +} \ No newline at end of file diff --git a/kml/test/kml_svml_test.h b/kml/test/kml_svml_test.h new file mode 100644 index 0000000..8a2e1af --- /dev/null +++ b/kml/test/kml_svml_test.h @@ -0,0 +1,14 @@ +#ifndef KML_SVML_TEST_H +#define KML_SVML_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_svml_sin_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_SVML_TEST_H \ No newline at end of file diff --git a/kml/test/kml_test_main.cpp b/kml/test/kml_test_main.cpp new file mode 100644 index 0000000..f5ac12f --- /dev/null +++ b/kml/test/kml_test_main.cpp @@ -0,0 +1,75 @@ +#include +extern "C" { +#include "kml_fft_test.h" +#include "kml_blas_test.h" +#include "kml_lapack_test.h" +#include "kml_solver_test.h" +#include "kml_scalapack_test.h" +#include "kml_ipl_test.h" +#include "kml_conv_test.h" +#include "kml_vml_test.h" +#include "kml_vsl_test.h" +#include "kml_svml_test.h" +#include "kml_math_test.h" +#include "kml_spblas_test.h" +} + +TEST(kml_test, kml_fft) { + int result = kml_fft_c2c_01(); + EXPECT_EQ(result, 1) << "kml_fft_c2c_01 should succeed"; +} + +TEST(kml_test, kml_blas) { + int result = kml_blas_sgemm_01(); + EXPECT_EQ(result, 1) << "kml_blas_sgemm_01 should succeed"; +} + +TEST(kml_test, kml_lapack) { + int result = kml_lapack_dgetrf_01(); + EXPECT_EQ(result, 0) << "kml_lapack_dgetrf_01 should succeed"; +} + +TEST(kml_test, kml_solver) { + int result = kml_solver_tydss_01(); + EXPECT_EQ(result, 0) << "kml_solver_tydss_01 should succeed"; +} + +TEST(kml_test, kml_scalapack) { + int result = kml_scalapack_pdgetrf_01(); + EXPECT_EQ(result, 0) << "kml_scalapack_pdgetrf_01 should succeed"; +} + +TEST(kml_test, kml_ipl) { + int result = kml_ipl_interp2d_01(); + EXPECT_EQ(result, 1) << "kml_ipl_interp2d_01 should succeed"; +} + +TEST(kml_test, kml_conv) { + int result = kml_conv_conv2d_01(); + EXPECT_EQ(result, 1) << "kml_conv_conv2d_01 should succeed"; +} + +TEST(kml_test, kml_vml) { + int result = kml_vml_sin_01(); + EXPECT_EQ(result, 1) << "kml_vml_sin_01 should succeed"; +} + +TEST(kml_test, kml_vsl) { + int result = kml_vsl_uniform_01(); + EXPECT_EQ(result, 1) << "kml_vsl_uniform_01 should succeed"; +} + +TEST(kml_test, kml_svml) { + int result = kml_svml_sin_01(); + EXPECT_EQ(result, 1) << "kml_svml_sin_01 should succeed"; +} + +TEST(kml_test, kml_math) { + int result = kml_math_sin_01(); + EXPECT_EQ(result, 1) << "kml_math_sin_01 should succeed"; +} + 
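+// kml_spblas_spgemm_01 calls kml_sparse_scsrmm on a small CSR matrix and returns 1 unconditionally,
+// so this case only verifies that the call runs to completion.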
+TEST(kml_test, kml_spblas) {
+    int result = kml_spblas_spgemm_01();
+    EXPECT_EQ(result, 1) << "kml_spblas_spgemm_01 should succeed";
+}
\ No newline at end of file
diff --git a/kml/test/kml_vml_test.c b/kml/test/kml_vml_test.c
new file mode 100644
index 0000000..2166553
--- /dev/null
+++ b/kml/test/kml_vml_test.c
@@ -0,0 +1,13 @@
+#include "kml_svml_test.h"
+#include <stdio.h>
+#include <math.h>
+#include <arm_neon.h>
+#include "ksvml.h"
+
+
+int kml_svml_sin_01()
+{
+    float32x4_t src = {0.0f, 2.0f, INFINITY, NAN};
+    float32x4_t dst = svml128_sin_f32(src);
+    return 1; // success
+}
\ No newline at end of file
diff --git a/kml/test/kml_vml_test.h b/kml/test/kml_vml_test.h
new file mode 100644
index 0000000..e332b54
--- /dev/null
+++ b/kml/test/kml_vml_test.h
@@ -0,0 +1,14 @@
+#ifndef KML_VML_TEST_H
+#define KML_VML_TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int kml_vml_sin_01();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // KML_VML_TEST_H
\ No newline at end of file
diff --git a/kml/test/kml_vsl_test.c b/kml/test/kml_vsl_test.c
new file mode 100644
index 0000000..a5105ed
--- /dev/null
+++ b/kml/test/kml_vsl_test.c
@@ -0,0 +1,41 @@
+#include "kml_vsl_test.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "krng.h"
+
+
+int kml_vsl_uniform_01()
+{
+    VSLStreamStatePtr stream;
+    unsigned seed = 42;
+    int errcode = vslNewStream(&stream, VSL_BRNG_MCG59, seed);
+    if (errcode != VSL_STATUS_OK) {
+        fprintf(stderr, "Failure in newstream\n");
+        return 0;
+    }
+
+    SizeType n = 10;
+    float *r = (float *)malloc(sizeof(float) * n);
+    if (r == NULL) {
+        fprintf(stderr, "Failure in malloc\n");
+        return 0;
+    }
+
+    if ((errcode = vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, n, r, 0.0, 1.0)) != VSL_STATUS_OK) {
+        fprintf(stderr, "Failure in vsRngUniform\n");
+        goto out;
+    }
+
+    /* deinitialize the stream */
+    errcode = vslDeleteStream(&stream);
+    if (errcode != VSL_STATUS_OK) {
+        fprintf(stderr, "Failure in deleting stream\n");
+        goto out;
+    }
+
+out:
+    free(r);
+
+    return (errcode == VSL_STATUS_OK) ? 1 : 0; // success only if every VSL call succeeded
+}
\ No newline at end of file
diff --git a/kml/test/kml_vsl_test.h b/kml/test/kml_vsl_test.h
new file mode 100644
index 0000000..7679a90
--- /dev/null
+++ b/kml/test/kml_vsl_test.h
@@ -0,0 +1,14 @@
+#ifndef KML_VSL_TEST_H
+#define KML_VSL_TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int kml_vsl_uniform_01();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // KML_VSL_TEST_H
\ No newline at end of file
diff --git a/kudnn/build.sh b/kudnn/build.sh
new file mode 100644
index 0000000..b13c175
--- /dev/null
+++ b/kudnn/build.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# export DNN_INC=~/local/HPCKit/latest/kudnn/bisheng/include
+# export DNN_LIB=~/local/HPCKit/latest/kudnn/bisheng/lib
+# export BLAS_LIB=~/local/HPCKit/latest/kml/bisheng/lib/sme/kblas/multi
+cputype=$(cat /proc/cpuinfo | grep 'CPU part' | awk 'NR==1{print $4}')
+if [[ "$cputype" == "0xd22" ]]; then
+    echo "CXX=${CXX}"
+    echo "DNN_INC=${DNN_INC}"
+    echo "DNN_LIB=${DNN_LIB}"
+    echo "BLAS_LIB=${BLAS_LIB}"
+    ${CXX} src/function.cpp src/main.cpp -o function_test \
+        -I ${DNN_INC} \
+        -L ${DNN_LIB} \
+        -L ${BLAS_LIB} \
+        -lkblas \
+        -lkdnn
+
+else
+    echo "kudnn: Not supported on this platform"
+fi
diff --git a/kudnn/run.sh b/kudnn/run.sh
new file mode 100644
index 0000000..847a2f0
--- /dev/null
+++ b/kudnn/run.sh
@@ -0,0 +1,8 @@
+./function_test kudnn_linearActivation_01
+./function_test kudnn_linearRes_01
+./function_test kudnn_rnorm_01
+./function_test kudnn_lnorm_01
+./function_test kudnn_softmax_01
+./function_test kudnn_gnorm_01
+./function_test kudnn_conv_01
+./function_test kudnn_conv_02
\ No newline at end of file
diff --git
a/kudnn/src/function.cpp b/kudnn/src/function.cpp new file mode 100644 index 0000000..0d929ce --- /dev/null +++ b/kudnn/src/function.cpp @@ -0,0 +1,1112 @@ +#include "function.hpp" + +using SizeType = KDNN::SizeType; +using Shape = KDNN::Shape; +using TensorInfo = const KDNN::TensorInfo; +KDNN::Element::TypeT TypeF16 = KDNN::Element::TypeT::F16; +KDNN::Element::TypeT TypeF32 = KDNN::Element::TypeT::F32; +const int MAX_THS = 100; +const float FACTOR_THS = 0.02; +float errBound = 1e-3; +float maxError = 1e-3; + +template +struct typeMap; + +template <> +struct typeMap<__fp16> { + static constexpr KDNN::Element::TypeT val = KDNN::Element::TypeT::F16; +}; + +template <> +struct typeMap { + static constexpr KDNN::Element::TypeT val = KDNN::Element::TypeT::F32; +}; + +template +static void conv2dRef(Shape &srcShape, Shape &weiShape, Shape &dstShape, Shape &strides, Shape &dilates, + Shape &paddingL, Shape &paddingR, const T *src, const T *wei, T *dst, const T *bia) +{ + SizeType N = srcShape[0], IC = srcShape[1], IH = srcShape[2], IW = srcShape[3]; + SizeType KH = weiShape[2], KW = weiShape[3]; + SizeType OC = dstShape[1], OH = dstShape[2], OW = dstShape[3]; + SizeType SH = strides[0], SW = strides[1]; + SizeType DH = dilates[0], DW = dilates[1]; + SizeType PH_L = paddingL[0], PW_L = paddingL[1]; + SizeType PH_R = paddingR[0], PW_R = paddingR[1]; + OH = (IH + PH_L + PH_R - 1 - (KH - 1) * (DH + 1)) / SH + 1; + OW = (IW + PW_L + PW_R - 1 - (KW - 1) * (DW + 1)) / SW + 1; + int threadsUsed = (int)ceil((float)(N * OC * OH * OW) * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? MAX_THS : threadsUsed; +#pragma omp parallel for collapse(4) num_threads(threadsUsed) + for (SizeType n = 0; n < N; ++n) { + for (SizeType oc = 0; oc < OC; ++oc) { + T biaVal = *(bia + oc); + for (SizeType oh = 0; oh < OH; ++oh) { + for (SizeType ow = 0; ow < OW; ++ow) { + T sum = (T)0.0f; + for (SizeType ic = 0; ic < IC; ++ic) { + for (SizeType kh = 0; kh < KH; ++kh) { + for (SizeType kw = 0; kw < KW; ++kw) { + SizeType srcX = oh * SH + kh * (DH + 1) - PH_L; + SizeType srcY = ow * SW + kw * (DW + 1) - PW_L; + T srcVal = (T)0.0f; + if (srcX >= 0 && srcX < IH && srcY >= 0 && srcY < IW) { + srcVal = *(src + n * IC * IH * IW + ic * IH * IW + srcX * IW + srcY); + } + T weiVal = *(wei + oc * IC * KH * KW + ic * KH * KW + kh * KW + kw); + sum += srcVal * weiVal; + } + } + } + *(dst + n * OC * OH * OW + oc * OH * OW + oh * OW + ow) = sum + biaVal; + } + } + } + } +} + +template +static bool Conv2dFWDFunc1(Shape &srcShape, Shape &weiShape, Shape &dstShape, Shape &strides, Shape &dilates, + Shape &paddingL, Shape &paddingR, KDNN::ConvolutionAlgorithm &alg) +{ + SizeType N = srcShape[0], IC = srcShape[1], IH = srcShape[2], IW = srcShape[3]; + SizeType OC = dstShape[1], OH = dstShape[2], OW = dstShape[3]; + SizeType KH = weiShape[2], KW = weiShape[3]; + const KDNN::TensorInfo srcTensor = {srcShape, typeMap::val, KDNN::Layout::ABCD}; + const KDNN::TensorInfo weightsTensor = {weiShape, typeMap::val, KDNN::Layout::ABCD}; + const KDNN::TensorInfo dstTensor = {dstShape, typeMap::val, KDNN::Layout::ABCD}; + const KDNN::TensorInfo biasTensor = {{OC}, typeMap::val, KDNN::Layout::A}; + + KDNN::ConvolutionLayerFWD convFwdLayer1(srcTensor, weightsTensor, dstTensor, biasTensor, strides, dilates, paddingL, + paddingR, alg); + + SizeType srcSize = N * IC * IH * IW; + SizeType dstSize = N * OC * OH * OW; + SizeType weiSize = OC * IC * KH * KW; + SizeType biaSize = OC; + T *src = (T *)malloc(srcSize * sizeof(T)); + T *dst = (T *)malloc(dstSize * 
sizeof(T)); + T *dstRef = (T *)malloc(dstSize * sizeof(T)); + T *wei = (T *)malloc(weiSize * sizeof(T)); + T *bia = (T *)malloc(biaSize * sizeof(T)); + if (src == nullptr || dst == nullptr || dstRef == nullptr || wei == nullptr || bia == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return false; + } + // generate random test data + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + int threadsUsed = (int)ceil((float)srcSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? MAX_THS : threadsUsed; +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < srcSize; ++i) { + *(src + i) = (T)u(e); + } + for (SizeType i = 0; i < weiSize; ++i) { + *(wei + i) = (T)u(e); + } + for (SizeType i = 0; i < biaSize; ++i) { + *(bia + i) = (T)u(e); + } + convFwdLayer1.Run(src, wei, dst, bia); + conv2dRef(srcShape, weiShape, dstShape, strides, dilates, paddingL, paddingR, src, wei, dstRef, bia); + float error = 0.0; +#pragma omp parallel for reduction(+ : error) num_threads(threadsUsed) + for (SizeType i = 0; i < dstSize; ++i) { + error += *(dst + i) - *(dstRef + i); + } + free(src); + free(dst); + free(dstRef); + free(wei); + free(bia); + error = std::abs(error) / dstSize; + + return error < errBound; +} + +bool kudnn_conv_01() +{ + SizeType N = 4, IC = 4, IH = 128, IW = 100; + SizeType OC = 5, OH = 0, OW = 0; + SizeType KH = 3, KW = 3; + Shape strides(1, 1), dilates(1, 1), paddingL(1, 1), paddingR(1, 1); + OH = (IH + paddingL[0] + paddingR[0] - 1 - (KH - 1) * (dilates[0] + 1)) / strides[0] + 1; + OW = (IW + paddingL[1] + paddingR[1] - 1 - (KW - 1) * (dilates[1] + 1)) / strides[1] + 1; + Shape srcShape(N, IC, IH, IW), weiShape(OC, IC, KH, KW), dstShape(N, OC, OH, OW); + KDNN::ConvolutionAlgorithm alg(KDNN::ConvolutionAlgorithm::AUTO); + return Conv2dFWDFunc1(srcShape, weiShape, dstShape, strides, dilates, paddingL, paddingR, alg); +} + +template +static void conv3dRef(Shape &srcShape, Shape &weiShape, Shape &dstShape, Shape &strides, Shape &dilates, + Shape &paddingL, Shape &paddingR, const T *src, const T *wei, T *dst, const T *bia) +{ + SizeType N = srcShape[0], IC = srcShape[1], ID = srcShape[2], IH = srcShape[3], IW = srcShape[4]; + SizeType KD = weiShape[2], KH = weiShape[3], KW = weiShape[4]; + SizeType OC = dstShape[1], OD = dstShape[2], OH = dstShape[2], OW = dstShape[3]; + SizeType SD = strides[0], SH = strides[1], SW = strides[2]; + SizeType DD = dilates[0], DH = dilates[1], DW = dilates[2]; + SizeType PD_L = paddingL[0], PH_L = paddingL[1], PW_L = paddingL[2]; + SizeType PD_R = paddingR[0], PH_R = paddingR[1], PW_R = paddingR[2]; + OD = (ID + PD_L + PD_R - 1 - (KD - 1) * (DD + 1)) / SD + 1; + OH = (IH + PH_L + PH_R - 1 - (KH - 1) * (DH + 1)) / SH + 1; + OW = (IW + PW_L + PW_R - 1 - (KW - 1) * (DW + 1)) / SW + 1; + int threadsUsed = (int)ceil((float)(N * OC * OD * OH * OW) * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for collapse(5) num_threads(threadsUsed) + for (SizeType n = 0; n < N; ++n) { + for (SizeType oc = 0; oc < OC; ++oc) { + T biaVal = *(bia + oc); + for (SizeType od = 0; od < OD; ++od) { + for (SizeType oh = 0; oh < OH; ++oh) { + for (SizeType ow = 0; ow < OW; ++ow) { + T sum = (T)0.0f; + for (SizeType ic = 0; ic < IC; ++ic) { + for (SizeType kd = 0; kd < KD; ++kd) { + for (SizeType kh = 0; kh < KH; ++kh) { + for (SizeType kw = 0; kw < KW; ++kw) { + SizeType srcZ = od * SD + kd * (DD + 1) - PD_L; + SizeType srcX = oh * SH + kh * (DH + 1) - PH_L; + SizeType srcY = ow * SW + kw * (DW + 1) - PW_L; + T srcVal = (T)0.0f; + if (srcZ >= 0 && srcZ < ID && srcX >= 0 && srcX < IH && srcY >= 0 && + srcY < IW) { + srcVal = *(src + n * IC * ID * IH * IW + ic * ID * IH * IW + + srcZ * IH * IW + srcX * IW + srcY); + } + T weiVal = *(wei + oc * IC * KD * KH * KW + ic * KD * KH * KW + kd * KH * KW + + kh * KW + kw); + sum += srcVal * weiVal; + } + } + } + } + *(dst + n * OC * OD * OH * OW + oc * OD * OH * OW + od * OH * OW + oh * OW + ow) = sum + biaVal; + } + } + } + } + } +} + +template +static bool Conv3dFWDFunc1(Shape &srcShape, Shape &weiShape, Shape &dstShape, Shape &strides, Shape &dilates, + Shape &paddingL, Shape &paddingR, KDNN::ConvolutionAlgorithm &alg) +{ + SizeType N = srcShape[0], IC = srcShape[1], ID = srcShape[2], IH = srcShape[3], IW = srcShape[4]; + SizeType OC = dstShape[1], OD = dstShape[2], OH = dstShape[3], OW = dstShape[4]; + SizeType KD = weiShape[2], KH = weiShape[3], KW = weiShape[4]; + const KDNN::TensorInfo srcTensor = {srcShape, typeMap::val, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo weightsTensor = {weiShape, typeMap::val, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo dstTensor = {dstShape, typeMap::val, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo biasTensor = {{OC}, typeMap::val, KDNN::Layout::A}; + + KDNN::ConvolutionLayerFWD convFwdLayer1(srcTensor, weightsTensor, dstTensor, biasTensor, strides, dilates, paddingL, + paddingR, alg); + + SizeType srcSize = N * IC * ID * IH * IW; + SizeType dstSize = N * OC * OD * OH * OW; + SizeType weiSize = OC * IC * KD * KH * KW; + SizeType biaSize = OC; + T *src = (T *)malloc(srcSize * sizeof(T)); + T *dst = (T *)malloc(dstSize * sizeof(T)); + T *dstRef = (T *)malloc(dstSize * sizeof(T)); + T *wei = (T *)malloc(weiSize * sizeof(T)); + T *bia = (T *)malloc(biaSize * sizeof(T)); + if (src == nullptr || dst == nullptr || dstRef == nullptr || wei == nullptr || bia == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return false; + } + // generate random test data + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + int threadsUsed = (int)ceil((float)srcSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < srcSize; ++i) { + *(src + i) = (T)u(e); + } + for (SizeType i = 0; i < weiSize; ++i) { + *(wei + i) = (T)u(e); + } + for (SizeType i = 0; i < biaSize; ++i) { + *(bia + i) = (T)u(e); + } + conv3dRef(srcShape, weiShape, dstShape, strides, dilates, paddingL, paddingR, src, wei, dstRef, bia); + convFwdLayer1.Run(src, wei, dst, bia); + float error = 0.0; +#pragma omp parallel for reduction(+ : error) num_threads(threadsUsed) + for (SizeType i = 0; i < dstSize; ++i) { + error += *(dst + i) - *(dstRef + i); + } + free(src); + free(dst); + free(dstRef); + free(wei); + free(bia); + error = std::abs(error) / dstSize; + + return error < errBound; +} + +bool kudnn_conv_02() +{ + SizeType N = 1, IC = 5, ID = 20, IH = 90, IW = 160; + SizeType OC = 10, OD = 0, OH = 0, OW = 0; + SizeType KD = 1, KH = 1, KW = 1; + Shape strides(1, 1, 1), dilates(0, 0, 0), paddingL(0, 0, 0), paddingR(0, 0, 0); + OD = (ID + paddingL[0] + paddingR[0] - 1 - (KD - 1) * (dilates[0] + 1)) / strides[0] + 1; + OH = (IH + paddingL[1] + paddingR[1] - 1 - (KH - 1) * (dilates[1] + 1)) / strides[1] + 1; + OW = (IW + paddingL[2] + paddingR[2] - 1 - (KW - 1) * (dilates[2] + 1)) / strides[2] + 1; + Shape srcShape(N, IC, ID, IH, IW), weiShape(OC, IC, KD, KH, KW), dstShape(N, OC, OD, OH, OW); + KDNN::ConvolutionAlgorithm alg(KDNN::ConvolutionAlgorithm::AUTO); + return Conv3dFWDFunc1<__fp16>(srcShape, weiShape, dstShape, strides, dilates, paddingL, paddingR, alg); +} + +template +static void gnormRef(Shape shape, SizeType groupInfo, const T *src, T *dst, const T *scale, const T *shift, + bool scaleApply, bool shiftApply, float *mean, float *variance, const float eps, + bool global_stats = false) +{ + SizeType innerSize = 1; + SizeType numDims = shape.GetNumDims(); + if (numDims == 4) { + innerSize = shape[2] * shape[3]; + } else if (numDims == 5) { + innerSize = shape[2] * shape[3] * shape[4]; + } else if (numDims == 3) { + innerSize = shape[2]; + } else { + return; + } + SizeType A = shape[0], B = shape[1]; + SizeType chPerGroup = B / groupInfo; + innerSize *= chPerGroup; + int threadsUsed = (int)ceil((float)(A * groupInfo) * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; + if (!global_stats) { + // mean compute +#pragma omp parallel for collapse(2) num_threads(threadsUsed) + for (SizeType a = 0; a < A; ++a) { + for (SizeType g = 0; g < groupInfo; ++g) { + float sum = 0.0f; + for (SizeType i = 0; i < innerSize; ++i) { + sum += (float)*(src + a * groupInfo * innerSize + g * innerSize + i); + } + *(mean + a * groupInfo + g) = sum / innerSize; + } + } + // variance compute +#pragma omp parallel for collapse(2) num_threads(threadsUsed) + for (SizeType a = 0; a < A; ++a) { + for (SizeType g = 0; g < groupInfo; ++g) { + float sum = 0.0f; + float meanVal = *(mean + a * groupInfo + g); + for (SizeType i = 0; i < innerSize; ++i) { + float diff = (float)*(src + a * groupInfo * innerSize + g * innerSize + i) - meanVal; + sum += diff * diff; + } + *(variance + a * groupInfo + g) = sum / innerSize; + } + } + } + // norm + innerSize /= chPerGroup; +#pragma omp parallel for collapse(2) num_threads(threadsUsed) + for (SizeType a = 0; a < A; ++a) { + for (SizeType g = 0; g < groupInfo; ++g) { + float meanVal = *(mean + a * groupInfo + g); + float invStd = 1.0f / std::sqrt(*(variance + a * groupInfo + g) + eps); + for (SizeType gi = 0; gi < chPerGroup; ++gi) { + int b = g * chPerGroup + gi; + for (SizeType i = 0; i < innerSize; ++i) { + float srcVal = (float)*(src + a * B * innerSize + b * innerSize + i); + float normalized = (srcVal - meanVal) * invStd; + if (scaleApply) { + normalized *= scale[b]; + } + if (shiftApply) { + normalized += shift[b]; + } + *(dst + a * B * innerSize + b * innerSize + i) = static_cast(normalized); + } + } + } + } +} +// forward +template +static bool GnormForwardFunc1(const TensorInfo &srcInfo, const TensorInfo &scaleShiftInfo, SizeType groupInfo, + const TensorInfo &dstInfo, KDNN::NormalizationFlags flags) +{ + KDNN::GroupNormalizationLayerFWD gnormLayer1(srcInfo, scaleShiftInfo, groupInfo, dstInfo, flags); + + SizeType srcSize = srcInfo.GetTotalTensorSize(); + SizeType dstSize = dstInfo.GetTotalTensorSize(); + SizeType statSize = srcInfo.GetDims()[0] * groupInfo; + SizeType scaleSize = scaleShiftInfo.GetTotalTensorSize(); + + T *src = (T *)malloc(srcSize * sizeof(T)); + T *dst = (T *)malloc(srcSize * sizeof(T)); + T *dstRef = (T *)malloc(srcSize * sizeof(T)); + float *mean = (float *)malloc(statSize * sizeof(float)); + float *variance = (float *)malloc(statSize * sizeof(float)); + T *scale = (T *)malloc(scaleSize * sizeof(T)); + T *shift = (T *)malloc(scaleSize * sizeof(T)); + float eps = 1e-5; + if (src == nullptr || dst == nullptr || dstRef == nullptr || mean == nullptr || variance == nullptr || + scale == nullptr || shift == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return false; + } + + bool global_stats = static_cast(flags & KDNN::NormalizationFlags::USE_GLOBAL_STATS); + + // generate random test data + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + int threadsUsed = (int)ceil((float)srcSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < srcSize; ++i) { + *(src + i) = (T)u(e); + } + for (SizeType i = 0; i < scaleSize; ++i) { + *(scale + i) = (T)u(e); + *(shift + i) = (T)u(e); + } + if (global_stats) { + for (SizeType i = 0; i < statSize; ++i) { + *(mean + i) = 1.0f; + *(variance + i) = 0.1f; + } + } + + gnormLayer1.Run(src, dst, scale, shift, mean, variance, true, eps); + bool scaleApply = static_cast(flags & KDNN::NormalizationFlags::USE_SCALE); + bool shiftApply = static_cast(flags & KDNN::NormalizationFlags::USE_SHIFT); + + gnormRef(srcInfo.GetDims(), groupInfo, src, dstRef, scale, shift, scaleApply, shiftApply, mean, variance, eps, + global_stats); + float error = 0.0; +#pragma omp parallel for reduction(+ : error) num_threads(threadsUsed) + for (SizeType i = 0; i < dstSize; ++i) { + error += *(dst + i) - *(dstRef + i); + } + free(src); + free(dst); + free(dstRef); + free(mean); + free(variance); + free(scale); + free(shift); + error = std::abs(error) / dstSize; + + return error < errBound; +} + +bool kudnn_gnorm_01() +{ + Shape shape(4, 128, 18, 320); + SizeType groupInfo = 2; + TensorInfo srcInfo = {shape, TypeF16, KDNN::Layout::ABCD}; + TensorInfo scaleShiftInfo = {{shape[1]}, TypeF16, KDNN::Layout::A}; + TensorInfo dstInfo = {shape, TypeF16, KDNN::Layout::ABCD}; + KDNN::NormalizationFlags flags = KDNN::NormalizationFlags::NONE; + return GnormForwardFunc1<__fp16>(srcInfo, scaleShiftInfo, groupInfo, dstInfo, flags); +} + +template +static void lnormRef(SizeType outerSize, SizeType innerSize, const T *src, T *dst, const T *scale, const T *shift, + bool scaleApply, bool shiftApply, float *mean, float *variance, const float eps, + bool global_stats = false) +{ + int threadsUsed = (int)ceil((float)outerSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; + if (!global_stats) { + // mean compute +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < outerSize; ++i) { + float sum = 0.0f; + for (SizeType j = 0; j < innerSize; ++j) { + sum += (float)*(src + i * innerSize + j); + } + *(mean + i) = sum / innerSize; + } + // variance compute +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < outerSize; ++i) { + float sum = 0.0f; + float meanVal = *(mean + i); + for (SizeType j = 0; j < innerSize; ++j) { + float diff = (float)*(src + i * innerSize + j) - meanVal; + sum += diff * diff; + } + *(variance + i) = sum / innerSize; + } + } + // norm +#pragma omp parallel for collapse(2) num_threads(threadsUsed) + for (SizeType i = 0; i < outerSize; ++i) { + float meanVal = *(mean + i); + float varVal = *(variance + i); + float invStd = 1.0f / std::sqrt(varVal + eps); + for (SizeType j = 0; j < innerSize; ++j) { + float srcVal = (float)*(src + i * innerSize + j); + float normalized = (srcVal - meanVal) * invStd; + if (scaleApply) { + normalized *= scale[j]; + } + if (shiftApply) { + normalized += shift[j]; + } + *(dst + i * innerSize + j) = static_cast(normalized); + } + } +} +// forward +template +static bool LnormForwardFunc1(const TensorInfo &srcInfo, const TensorInfo &statInfo, const TensorInfo &scaleShiftInfo, + const TensorInfo &dstInfo, KDNN::NormalizationFlags flags) +{ + KDNN::NormalizationLayerFWD lnormLayer1(srcInfo, statInfo, scaleShiftInfo, dstInfo, flags); + SizeType srcSize = srcInfo.GetTotalTensorSize(); + SizeType dstSize = dstInfo.GetTotalTensorSize(); + SizeType statSize = statInfo.GetTotalTensorSize(); + SizeType innerSize = scaleShiftInfo.GetTotalTensorSize(); + T *src = (T *)malloc(srcSize * sizeof(T)); + T *dst = (T *)malloc(dstSize * sizeof(T)); + T *dstRef = (T *)malloc(dstSize * sizeof(T)); + float *mean = (float *)malloc(statSize * sizeof(float)); + float *variance = (float *)malloc(statSize * sizeof(float)); + T *scale = (T *)malloc(innerSize * sizeof(T)); + T *shift = (T *)malloc(innerSize * sizeof(T)); + float eps = 1e-5; + if (src == nullptr || dst == nullptr || dstRef == nullptr || mean == nullptr || variance == nullptr || + scale == nullptr || shift == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return false; + } + + bool global_stats = static_cast(flags & KDNN::NormalizationFlags::USE_GLOBAL_STATS); + + // generate random test data + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + int threadsUsed = (int)ceil((float)srcSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < srcSize; ++i) { + *(src + i) = (T)u(e); + } + for (SizeType j = 0; j < innerSize; ++j) { + *(scale + j) = (T)u(e); + *(shift + j) = (T)u(e); + } + if (global_stats) { + for (SizeType i = 0; i < statSize; i++) { + *(mean + i) = 1.0f; + *(variance + i) = 0.1f; + } + } + + float error = 0.0f; + lnormLayer1.Run(src, dst, scale, shift, mean, variance, true, eps); + bool scaleApply = static_cast(flags & KDNN::NormalizationFlags::USE_SCALE); + bool shiftApply = static_cast(flags & KDNN::NormalizationFlags::USE_SHIFT); + lnormRef(statSize, innerSize, src, dstRef, scale, shift, scaleApply, shiftApply, mean, variance, eps, + global_stats); +#pragma omp parallel for reduction(+ : error) num_threads(threadsUsed) + for (SizeType i = 0; i < dstSize; ++i) { + error += *(dst + i) - *(dstRef + i); + } + free(src); + free(dst); + free(dstRef); + free(mean); + free(variance); + free(scale); + free(shift); + error = std::abs(error) / dstSize; + + return error < errBound; +} + +bool kudnn_lnorm_01() +{ + Shape shape(2, 3240, 1152); + TensorInfo srcInfo = {shape, TypeF32, KDNN::Layout::ABC}; + TensorInfo statInfo = {{shape[0], shape[1]}, TypeF32, KDNN::Layout::AB}; + TensorInfo scaleShiftInfo = {{shape[2]}, TypeF32, KDNN::Layout::A}; + TensorInfo dstInfo = {shape, TypeF32, KDNN::Layout::ABC}; + KDNN::NormalizationFlags flags = KDNN::NormalizationFlags::NONE; + return LnormForwardFunc1(srcInfo, statInfo, scaleShiftInfo, dstInfo, flags); +} + +template +static void siluSimple(T *dst, SizeType size) +{ + if (!dst || size == 0) + return; + for (SizeType i = 0; i < size; ++i) { + T sigmoid_x = static_cast(1) / (static_cast(1) + std::exp(static_cast(-dst[i]))); + dst[i] = dst[i] * sigmoid_x; + } +} + +template +static void linearActivationSimple(SizeType batch, SizeType m, SizeType n, SizeType k, const T *src, T *weight, T *dst, + T *bias, KDNN::ActivationFunction kind) +{ + for (SizeType bh = 0; bh < batch; ++bh) { + for (SizeType i = 0; i < m; ++i) { + for (SizeType j = 0; j < n; ++j) { + T sum = 0; + for (SizeType l = 0; l < k; ++l) { + sum += src[bh * m * k + i * k + l] * weight[bh * n * k + n * l + j]; + } + dst[bh * m * n + i * n + j] = sum + bias[bh * m * n + i * n + j]; + } + } + } + if (kind == KDNN::ActivationFunction::SWISH) { + siluSimple(dst, batch * m * n); + } +} + +template +static void floatDataInit(T **src, T **weight, T **dst, T **dstRef, T **bias, SizeType srcTotalSize, + SizeType weightTotalSize, SizeType dstTotalSize) +{ + *src = (T *)malloc(srcTotalSize * sizeof(T)); + *weight = (T *)malloc(weightTotalSize * sizeof(T)); + *dst = (T *)malloc(dstTotalSize * sizeof(T)); + *dstRef = (T *)malloc(dstTotalSize * sizeof(T)); + *bias = (T *)malloc(dstTotalSize * sizeof(T)); + + if (*src == nullptr || *weight == nullptr || *dst == nullptr || *dstRef == nullptr || *bias == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return; + } + std::uniform_real_distribution u(-1, 1); + static std::default_random_engine e(time(NULL)); + + for (SizeType i = 0; i < srcTotalSize; ++i) { + (*src)[i] = (T)u(e); + } + for (SizeType i = 0; i < weightTotalSize; ++i) { + (*weight)[i] = (T)u(e); + } + for (SizeType i = 0; i < dstTotalSize; ++i) { + (*bias)[i] = (T)u(e); + } +} + +static std::pair compareEle(float ref, float diff, KDNN::Element::TypeT type, int K) +{ + float e = 0.0f; + float eps = 0.0f; + std::pair pairEps = {0.0f, 0.0f}; + switch (type) { + case KDNN::Element::TypeT::F16: 
+ eps = 1e-3 * K; + e = (std::fabs(ref) > eps) ? diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + case KDNN::Element::TypeT::BF16: + eps = 1e-2 * K; + e = (std::fabs(ref) > eps) ? diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + case KDNN::Element::TypeT::F32: + eps = 1e-4; + e = (std::fabs(ref) > 1e-4) ? diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + default: + eps = 1; + e = diff; + pairEps = {e, eps}; + break; + } + return pairEps; +} + +template +static float findMaxValue(T *value1, SizeType valueSize1, T *value2, SizeType valueSize2) +{ + T maxVal = value1[0]; + + for (SizeType i = 0; i < valueSize1; ++i) { + maxVal = std::max(maxVal, value1[i]); + } + + for (SizeType i = 0; i < valueSize2; ++i) { + maxVal = std::max(maxVal, value2[i]); + } + return static_cast(maxVal); +} + +template +static bool Test_Template_Function(const KDNN::TensorInfo srcTensor, const KDNN::TensorInfo weightTensor, + const KDNN::TensorInfo dstTensor, const KDNN::TensorInfo biasTensor, float alpha, + float beta, KDNN::ActivationFunction algKind, int numThreads) +{ + KDNN::LinearActivationLayerFWD linearActivationLayerFwd(srcTensor, weightTensor, dstTensor, biasTensor, alpha, beta, + algKind, numThreads); + + T *src = nullptr, *weight = nullptr, *dst = nullptr, *dstRef = nullptr, *bias = nullptr; + floatDataInit(&src, &weight, &dst, &dstRef, &bias, srcTensor.GetTotalTensorSize(), + weightTensor.GetTotalTensorSize(), dstTensor.GetTotalTensorSize()); + + Shape srcShape = srcTensor.GetDims(); + Shape weightShape = weightTensor.GetDims(); + SizeType srcDimNums = srcShape.GetNumDims(); + SizeType weightDimNums = weightShape.GetNumDims(); + SizeType m = srcShape[srcDimNums - 2]; + SizeType n = weightShape[weightDimNums - 1]; + SizeType k = srcShape[srcDimNums - 1]; + SizeType batch = dstTensor.GetTotalTensorSize() / (m * n); + + linearActivationSimple(batch, m, n, k, src, weight, dstRef, bias, algKind); + + linearActivationLayerFwd.Run(src, weight, dst, bias, numThreads); + + float maxValue = findMaxValue(src, batch * m * k, weight, batch * n * k); + + bool flag = true; + for (SizeType i = 0; i < batch * m * n; ++i) { + float diff = std::abs(static_cast(*(dst + i) - *(dstRef + i))); + auto pairEps = compareEle(maxValue, diff, KDNN::Element::MatchType(), k); + + if (std::abs(pairEps.first) > pairEps.second) { + flag = false; + break; + } + } + if (src != nullptr) { + free(src); + } + if (weight != nullptr) { + free(weight); + } + if (dst != nullptr) { + free(dst); + } + if (dstRef != nullptr) { + free(dstRef); + } + if (bias != nullptr) { + free(bias); + } + return flag; +} +bool kudnn_linearActivation_01() +{ + const KDNN::TensorInfo srcTensor = {{3, 2, 110, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCD}; + const KDNN::TensorInfo weightTensor = {{3, 2, 20, 200}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCD}; + const KDNN::TensorInfo dstTensor = {{3, 2, 110, 200}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCD}; + const KDNN::TensorInfo biasTensor = {{3, 2, 110, 200}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCD}; + float alpha = 1.0f; + float beta = 0.0f; + KDNN::ActivationFunction algKind = KDNN::ActivationFunction::SWISH; + int numThreads = 0; + return Test_Template_Function(srcTensor, weightTensor, dstTensor, biasTensor, alpha, beta, algKind, numThreads); +} + +static void linearSimpleFloat(SizeType batch, SizeType m, SizeType n, SizeType k, const float *src, float *weight, + float *dst, float *bias, int transSrc = 0, int transDst = 0) +{ + for (SizeType bh = 
0; bh < batch; ++bh) { + for (SizeType i = 0; i < m; ++i) { + for (SizeType j = 0; j < n; ++j) { + float sum = 0; + for (SizeType l = 0; l < k; ++l) { + if (transSrc == 0 && transDst == 0) { + sum += src[bh * m * k + i * k + l] * weight[bh * n * k + n * l + j]; + } else if (transSrc == 1 && transDst == 0) { + sum += src[bh * m * k + i * k + l] * weight[bh * n * k + j * k + l]; + } else if (transSrc == 0 && transDst == 1) { + sum += src[bh * m * k + l * m + i] * weight[bh * n * k + n * l + j]; + } else { + sum += src[bh * m * k + l * m + i] * weight[bh * n * k + j * k + l]; + } + } + if (bias) { + dst[bh * m * n + i * n + j] = sum + bias[bh * m * n + i * n + j]; + } else { + dst[bh * m * n + i * n + j] = sum; + } + } + } + } +} +template +static void DataInit(T **src, T **weight, T **dst, T **dstRef, T **bias, T **res, T **linearRes, T **linearResRef, + SizeType srcTotalSize, SizeType weightTotalSize, SizeType dstTotalSize, SizeType resTotalSize, + SizeType linearResTotalSize) +{ + *src = (T *)malloc(srcTotalSize * sizeof(T)); + *weight = (T *)malloc(weightTotalSize * sizeof(T)); + *dst = (T *)malloc(dstTotalSize * sizeof(T)); + *dstRef = (T *)malloc(dstTotalSize * sizeof(T)); + *bias = (T *)malloc(dstTotalSize * sizeof(T)); + *res = (T *)malloc(resTotalSize * sizeof(T)); + *linearRes = (T *)malloc(linearResTotalSize * sizeof(T)); + *linearResRef = (T *)malloc(linearResTotalSize * sizeof(T)); + + if (*src == nullptr || *weight == nullptr || *dst == nullptr || *dstRef == nullptr || *bias == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return; + } + std::uniform_real_distribution u(-1, 1); + static std::default_random_engine e(time(NULL)); + + for (SizeType i = 0; i < srcTotalSize; ++i) { + (*src)[i] = (T)u(e); + } + for (SizeType i = 0; i < weightTotalSize; ++i) { + (*weight)[i] = (T)u(e); + } + for (SizeType i = 0; i < dstTotalSize; ++i) { + (*bias)[i] = (T)u(e); + } + + for (SizeType i = 0; i < resTotalSize; ++i) { + (*res)[i] = (T)u(e); + } +} + +static void addFloat(const float *dst, float *res, float *linearRes, float gamma, SizeType size) +{ + for (SizeType i = 0; i < size; i++) { + linearRes[i] = dst[i] + res[i] * gamma; + } +} + +static std::pair LresCompareEle(float ref, float diff, KDNN::Element::TypeT type, SizeType K, SizeType L) +{ + float e = 0.0f; + float eps = 0.0f; + std::pair pairEps = {0.0f, 0.0f}; + switch (type) { + case KDNN::Element::TypeT::F16: + eps = 1e-3 * K * L; + e = (std::fabs(ref) > eps) ? diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + case KDNN::Element::TypeT::BF16: + eps = 1e-2 * K * L; + e = (std::fabs(ref) > eps) ? diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + case KDNN::Element::TypeT::F32: + eps = 1e-4; + e = (std::fabs(ref) > 1e-4) ? 
diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + default: + eps = 1; + e = diff; + pairEps = {e, eps}; + break; + } + return pairEps; +} + +static bool Test_Template_Function_FP32(const KDNN::TensorInfo srcTensor, const KDNN::TensorInfo weightTensor, + const KDNN::TensorInfo dstTensor, const KDNN::TensorInfo biasTensor, + const KDNN::TensorInfo resTensor, const KDNN::TensorInfo linearResTensor, + float alpha, float beta, float gamma, KDNN::ResOpsFunction algKind, + int numThreads, int transSrc = 0, int transDst = 0) +{ + KDNN::LinearResFWD LinearResFWD(srcTensor, weightTensor, dstTensor, biasTensor, resTensor, linearResTensor, alpha, + beta, gamma, algKind, numThreads); + + float *src = nullptr, *weight = nullptr, *dst = nullptr, *dstRef = nullptr, *bias = nullptr, *res = nullptr, + *linearRes = nullptr, *linearResRef = nullptr; + DataInit(&src, &weight, &dst, &dstRef, &bias, &res, &linearRes, &linearResRef, + srcTensor.GetTotalTensorSize(), weightTensor.GetTotalTensorSize(), dstTensor.GetTotalTensorSize(), + resTensor.GetTotalTensorSize(), linearResTensor.GetTotalTensorSize()); + + Shape srcShape = srcTensor.GetDims(); + Shape weightShape = weightTensor.GetDims(); + Shape resShape = resTensor.GetDims(); + SizeType srcDimNums = srcShape.GetNumDims(); + SizeType weightDimNums = weightShape.GetNumDims(); + SizeType resDimNums = resShape.GetNumDims(); + SizeType dst_row = srcShape[srcDimNums - 2]; // m + SizeType dst_col = weightShape[weightDimNums - 1]; // n + SizeType src_col = srcShape[srcDimNums - 1]; // k + SizeType res_col = resShape[resDimNums - 1]; // l + SizeType LinearBatch = dstTensor.GetTotalTensorSize() / (dst_row * dst_col); + SizeType PostOpsBatch = linearResTensor.GetTotalTensorSize() / (dst_row * res_col); + + SizeType L = 1; // compareEle: K *L + + linearSimpleFloat(LinearBatch, dst_row, dst_col, src_col, src, weight, dstRef, bias, transSrc, transDst); + if (algKind == KDNN::ResOpsFunction::RES_MUL) { + L = dst_col; + linearSimpleFloat(PostOpsBatch, dst_row, res_col, dst_col, dstRef, res, linearResRef, nullptr, transSrc, + transDst); + } else if (algKind == KDNN::ResOpsFunction::RES_IDENTIAL) { + addFloat(dstRef, res, linearResRef, 1.0f, linearResTensor.GetTotalTensorSize()); + } else { + addFloat(dstRef, res, linearResRef, gamma, linearResTensor.GetTotalTensorSize()); + } + + LinearResFWD.Run(src, weight, dst, bias, res, linearRes, gamma); + + float maxValue = + findMaxValue(src, srcTensor.GetTotalTensorSize(), weight, weightTensor.GetTotalTensorSize()); + + bool flag = true; + for (SizeType i = 0; i < linearResTensor.GetTotalTensorSize(); ++i) { + float diff = std::abs(static_cast(*(linearRes + i) - *(linearResRef + i))); + auto pairEps = LresCompareEle(maxValue, diff, KDNN::Element::MatchType(), src_col, L); + if (std::abs(pairEps.first) > pairEps.second) { + flag = false; + break; + } + } + + if (src != nullptr) { + free(src); + } + if (weight != nullptr) { + free(weight); + } + if (dst != nullptr) { + free(dst); + } + if (dstRef != nullptr) { + free(dstRef); + } + if (bias != nullptr) { + free(bias); + } + if (res != nullptr) { + free(res); + } + if (linearRes != nullptr) { + free(linearRes); + } + if (linearResRef != nullptr) { + free(linearResRef); + } + return flag; +} + +bool kudnn_linearRes_01() +{ + const KDNN::TensorInfo srcTensor = {{4, 3, 2, 30, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo weightTensor = {{4, 3, 2, 20, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo dstTensor = 
{{4, 3, 2, 30, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo biasTensor = {{4, 3, 2, 30, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo resTensor = {{4, 3, 2, 30, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo linearResTensor = {{4, 3, 2, 30, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + float alpha = 1.0f; + float beta = 0.0f; + float gamma = 2.0f; + KDNN::ResOpsFunction algKind = KDNN::ResOpsFunction::RES_IDENTIAL; + int numThreads = 0; + return Test_Template_Function_FP32(srcTensor, weightTensor, dstTensor, biasTensor, resTensor, linearResTensor, alpha, beta, + gamma, algKind, numThreads); +} + +template +static void rmsnormRef(SizeType outerSize, SizeType innerSize, const T *src, T *dst, const T *scale, bool scaleApply, + float *variance, const float eps, bool global_stats = false) +{ + int threadsUsed = (int)ceil((float)outerSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? MAX_THS : threadsUsed; + // variance compute + if (!global_stats) { +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < outerSize; ++i) { + float sum = 0.0f; + for (SizeType j = 0; j < innerSize; ++j) { + float srcVal = (float)*(src + i * innerSize + j); + sum += srcVal * srcVal; + } + *(variance + i) = sum / innerSize; + } + } + // norm +#pragma omp parallel for collapse(2) num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < outerSize; ++i) { + float varVal = *(variance + i); + float invStd = 1.0f / std::sqrt(varVal + eps); + for (SizeType j = 0; j < innerSize; ++j) { + float srcVal = (float)*(src + i * innerSize + j); + float normalized = srcVal * invStd; + if (scaleApply) { + normalized *= scale[j]; + } + *(dst + i * innerSize + j) = static_cast(normalized); + } + } +} +// forward +template +static bool RnormForwardFunc1(const TensorInfo &srcInfo, const TensorInfo &statInfo, const TensorInfo &scaleInfo, + const TensorInfo &dstInfo, KDNN::NormalizationFlags flags) +{ + KDNN::RMSNormalizationLayerFWD rmsLayer1(srcInfo, statInfo, scaleInfo, dstInfo, flags); + + SizeType srcSize = srcInfo.GetTotalTensorSize(); + SizeType dstSize = dstInfo.GetTotalTensorSize(); + SizeType statSize = statInfo.GetTotalTensorSize(); + SizeType innerSize = scaleInfo.GetTotalTensorSize(); + + T *src = (T *)malloc(srcSize * sizeof(T)); + T *dst = (T *)malloc(dstSize * sizeof(T)); + T *dstRef = (T *)malloc(dstSize * sizeof(T)); + float *variance = (float *)malloc(statSize * sizeof(float)); + T *scale = (T *)malloc(innerSize * sizeof(T)); + float eps = 1e-5; + if (src == nullptr || dst == nullptr || dstRef == nullptr || variance == nullptr || scale == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return false; + } + + bool global_stats = static_cast(flags & KDNN::NormalizationFlags::USE_GLOBAL_STATS); + + // generate random test data + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + int threadsUsed = (int)ceil((float)srcSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < srcSize; ++i) { + *(src + i) = (T)u(e); + } + for (SizeType j = 0; j < innerSize; ++j) { + *(scale + j) = (T)u(e); + } + if (global_stats) { + for (SizeType i = 0; i < statSize; i++) { + *(variance + i) = 0.1f; + } + } + + float error = 0.0; + rmsLayer1.Run(src, dst, scale, variance, true, eps); + bool scaleApply = static_cast(flags & KDNN::NormalizationFlags::USE_SCALE); + rmsnormRef(statSize, innerSize, src, dstRef, scale, scaleApply, variance, eps, global_stats); +#pragma omp parallel for reduction(+ : error) num_threads(threadsUsed) + for (SizeType i = 0; i < dstSize; ++i) { + error += *(dst + i) - *(dstRef + i); + } + free(src); + free(dst); + free(dstRef); + free(variance); + free(scale); + error = std::abs(error) / dstSize; + + return error < errBound; +} + +bool kudnn_rnorm_01() +{ + Shape shape(100, 100); + TensorInfo srcInfo = {shape, TypeF16, KDNN::Layout::AB}; + TensorInfo statInfo = {{shape[0]}, TypeF16, KDNN::Layout::A}; + TensorInfo scaleInfo = {{shape[1]}, TypeF16, KDNN::Layout::A}; + TensorInfo dstInfo = {shape, TypeF16, KDNN::Layout::AB}; + KDNN::NormalizationFlags flags = KDNN::NormalizationFlags::USE_SHIFT; + return RnormForwardFunc1<__fp16>(srcInfo, statInfo, scaleInfo, dstInfo, flags); +} + +void softmaxHelp(float *src, int start, int end) +{ + if (start > end || !src) + return; + + float max_val = src[start]; + for (int i = start + 1; i <= end; ++i) { + if (src[i] > max_val) { + max_val = src[i]; + } + } + float sum_exp = 0.0f; + for (int i = start; i <= end; ++i) { + sum_exp += std::exp(src[i] - max_val); + } + for (int i = start; i <= end; ++i) { + src[i] = std::exp(src[i] - max_val) / sum_exp; + } +} + +static void softmaxSimple(Shape shape, float *src) +{ + SizeType numDims = shape.GetNumDims(); + SizeType outSize = 1; + for (SizeType i = 0; i < numDims - 1; ++i) { + outSize *= shape[i]; + } + SizeType innerSize = shape[numDims - 1]; + + for (SizeType i = 0; i < outSize; ++i) { + softmaxHelp(src, i * innerSize, (i + 1) * innerSize - 1); + } +} + +static void dataInit(float **src, float **dst, float **dstRef, SizeType totalSize) +{ + *src = (float *)malloc(totalSize * sizeof(float)); + *dst = (float *)malloc(totalSize * sizeof(float)); + *dstRef = (float *)malloc(totalSize * sizeof(float)); + + if (*src == nullptr || *dst == nullptr || *dstRef == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return; + } + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + + for (SizeType i = 0; i < totalSize; ++i) { + (*src)[i] = u(e); + } + memcpy(*dstRef, *src, totalSize * sizeof(float)); +} + +static void dataFree(float **src, float **dst, float **dstRef) +{ + free(*src); + free(*dst); + free(*dstRef); + + *src = nullptr; + *dst = nullptr; + *dstRef = nullptr; +} +static bool FWD_3D_FP32(const Shape &shape) +{ + const KDNN::TensorInfo srcTensor = {shape, KDNN::Element::TypeT::F32, KDNN::Layout::ABC}; + const KDNN::TensorInfo dstTensor = {shape, KDNN::Element::TypeT::F32, KDNN::Layout::ABC}; + SizeType axis = 2; + KDNN::SoftmaxLayerFWD softmaxLayerFwd(srcTensor, dstTensor, axis, KDNN::SoftmaxAlgorithmKind::SOFTMAX); + + float *src = nullptr, *dst = nullptr, *dstRef = nullptr; + SizeType totalSize = srcTensor.GetTotalTensorSize(); + dataInit(&src, &dst, &dstRef, totalSize); + softmaxSimple(shape, dstRef); + softmaxLayerFwd.Run(src, dst); + for (SizeType i = 0; i < totalSize; ++i) { + float error = 
std::abs(*(dst + i) - *(dstRef + i));
+        if (error > maxError) {
+            dataFree(&src, &dst, &dstRef);
+            return false;
+        }
+    }
+    dataFree(&src, &dst, &dstRef);
+    return true;
+}
+bool kudnn_softmax_01()
+{
+    Shape shape(20, 40, 50);
+    return FWD_3D_FP32(shape);
+}
\ No newline at end of file
diff --git a/kudnn/src/function.hpp b/kudnn/src/function.hpp
new file mode 100644
index 0000000..c823def
--- /dev/null
+++ b/kudnn/src/function.hpp
@@ -0,0 +1,73 @@
+#include "kdnn.hpp"
+#include <cmath>
+#include <functional>
+#include <iostream>
+#include <random>
+#include <string>
+#include <unordered_map>
+
+static std::unordered_map<std::string, KDNN::Layout> layoutMap = {
+    {"KDNN::Layout::UNDEFINED", KDNN::Layout::UNDEFINED},
+    {"KDNN::Layout::A", KDNN::Layout::A},
+    {"KDNN::Layout::AB", KDNN::Layout::AB},
+    {"KDNN::Layout::BA", KDNN::Layout::BA},
+    {"KDNN::Layout::ABC", KDNN::Layout::ABC},
+    {"KDNN::Layout::ACB", KDNN::Layout::ACB},
+    {"KDNN::Layout::BAC", KDNN::Layout::BAC},
+    {"KDNN::Layout::BCA", KDNN::Layout::BCA},
+    {"KDNN::Layout::CAB", KDNN::Layout::CAB},
+    {"KDNN::Layout::CBA", KDNN::Layout::CBA},
+    {"KDNN::Layout::ABCD", KDNN::Layout::ABCD},
+    {"KDNN::Layout::ABDC", KDNN::Layout::ABDC},
+    {"KDNN::Layout::ACBD", KDNN::Layout::ACBD},
+    {"KDNN::Layout::ACDB", KDNN::Layout::ACDB},
+    {"KDNN::Layout::ADBC", KDNN::Layout::ADBC},
+    {"KDNN::Layout::ADCB", KDNN::Layout::ADCB},
+    {"KDNN::Layout::BACD", KDNN::Layout::BACD},
+    {"KDNN::Layout::BCDA", KDNN::Layout::BCDA},
+    {"KDNN::Layout::CDAB", KDNN::Layout::CDAB},
+    {"KDNN::Layout::CDBA", KDNN::Layout::CDBA},
+    {"KDNN::Layout::DCAB", KDNN::Layout::DCAB},
+    {"KDNN::Layout::ABCDE", KDNN::Layout::ABCDE},
+    {"KDNN::Layout::ABCED", KDNN::Layout::ABCED},
+    {"KDNN::Layout::ABDEC", KDNN::Layout::ABDEC},
+    {"KDNN::Layout::ACBDE", KDNN::Layout::ACBDE},
+    {"KDNN::Layout::ACDEB", KDNN::Layout::ACDEB},
+    {"KDNN::Layout::ADECB", KDNN::Layout::ADECB},
+    {"KDNN::Layout::BACDE", KDNN::Layout::BACDE},
+    {"KDNN::Layout::BCDEA", KDNN::Layout::BCDEA},
+    {"KDNN::Layout::CDEAB", KDNN::Layout::CDEAB},
+    {"KDNN::Layout::CDEBA", KDNN::Layout::CDEBA},
+    {"KDNN::Layout::DECAB", KDNN::Layout::DECAB},
+    {"KDNN::Layout::ROW_MAJOR", KDNN::Layout::ROW_MAJOR},
+    {"KDNN::Layout::COL_MAJOR", KDNN::Layout::COL_MAJOR},
+    {"KDNN::Layout::NCHW", KDNN::Layout::NCHW},
+    {"KDNN::Layout::NHWC", KDNN::Layout::NHWC},
+    {"KDNN::Layout::NCDHW", KDNN::Layout::NCDHW},
+    {"KDNN::Layout::NDHWC", KDNN::Layout::NDHWC},
+    {"KDNN::Layout::OIHW", KDNN::Layout::OIHW},
+    {"KDNN::Layout::HWIO", KDNN::Layout::HWIO},
+    {"KDNN::Layout::HWOI", KDNN::Layout::HWOI},
+    {"KDNN::Layout::OHWI", KDNN::Layout::OHWI},
+    {"KDNN::Layout::OHWO", KDNN::Layout::OHWO},
+    {"KDNN::Layout::IOHW", KDNN::Layout::IOHW},
+};
+
+static std::unordered_map<std::string, KDNN::Element::TypeT> typeTMap = {
+    {"UNDEFINED", KDNN::Element::TypeT::UNDEFINED},
+    {"KDNN::Element::TypeT::F32", KDNN::Element::TypeT::F32},
+    {"KDNN::Element::TypeT::F16", KDNN::Element::TypeT::F16},
+    {"KDNN::Element::TypeT::BF16", KDNN::Element::TypeT::BF16},
+    {"KDNN::Element::TypeT::S32", KDNN::Element::TypeT::S32},
+    {"KDNN::Element::TypeT::S8", KDNN::Element::TypeT::S8},
+    {"KDNN::Element::TypeT::U8", KDNN::Element::TypeT::U8},
+};
+
+bool kudnn_conv_01();
+bool kudnn_conv_02();
+bool kudnn_gnorm_01();
+bool kudnn_lnorm_01();
+bool kudnn_linearActivation_01();
+bool kudnn_linearRes_01();
+bool kudnn_rnorm_01();
+bool kudnn_softmax_01();
\ No newline at end of file
diff --git a/kudnn/src/main.cpp b/kudnn/src/main.cpp
new file mode 100644
index 0000000..226f49e
--- /dev/null
+++ b/kudnn/src/main.cpp
@@ -0,0 +1,45 @@
+#include <iostream>
+#include "function.hpp"
+
+// Maps each test name to its entry point; looked up by the command-line argument below.
+std::unordered_map<std::string, std::function<bool()>> funcMap;
+int main(int argc, char **argv)
+{
+    funcMap["kudnn_conv_01"] = kudnn_conv_01;
+    funcMap["kudnn_conv_02"] = kudnn_conv_02;
+    funcMap["kudnn_gnorm_01"] = kudnn_gnorm_01;
+    funcMap["kudnn_lnorm_01"] = kudnn_lnorm_01;
+    funcMap["kudnn_linearActivation_01"] = kudnn_linearActivation_01;
+    funcMap["kudnn_linearRes_01"] = kudnn_linearRes_01;
+    funcMap["kudnn_rnorm_01"] = kudnn_rnorm_01;
+    funcMap["kudnn_softmax_01"] = kudnn_softmax_01;
+    bool allPass = true;
+
+    std::cout << "start test" << std::endl;
+    if (argc == 2) {
+        if (funcMap.count(argv[1]) > 0) {
+            bool ret = funcMap[argv[1]]();
+            if (!ret) {
+                std::cout << "test " << argv[1] << " FAILED" << std::endl;
+                allPass = false;
+            } else {
+                std::cout << "test " << argv[1] << " PASS" << std::endl;
+            }
+        } else {
+            std::cout << "No test named " << argv[1] << std::endl;
+            allPass = false;
+        }
+    } else if (argc == 1) {
+        for (auto &func : funcMap) {
+            std::cout << "run test " << func.first << " ......" << std::endl;
+            bool ret = func.second();
+            if (!ret) {
+                std::cout << "test " << func.first << " FAILED" << std::endl;
+                allPass = false;
+            } else {
+                std::cout << "test " << func.first << " PASS" << std::endl;
+            }
+        }
+    }
+
+    if (allPass) {
+        std::cout << "All tests passed" << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/kulitho/build.sh b/kulitho/build.sh
new file mode 100644
index 0000000..ec6e69f
--- /dev/null
+++ b/kulitho/build.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+cputype=$(cat /proc/cpuinfo | grep 'CPU part' | awk 'NR==1{print $4}')
+if [[ "$cputype" == "0xD01" ]]; then
+    echo "kulitho: Not supported on this platform"
+else
+    if [ -z "${HPCKIT_PATH}" ]; then
+        echo "Error: Environment variable 'HPCKIT_PATH' is not set!"
+        echo "Please configure HPCKIT_PATH to point to the HPCKit installation directory, for example:"
+        echo "    export HPCKIT_PATH=/opt/HPCKit"
+        echo "Then re-run the build command."
+ exit 1 + fi + echo "HPCKIT_PATH=${HPCKIT_PATH}" + cd ./test + make clean + make +fi diff --git a/kulitho/run.sh b/kulitho/run.sh new file mode 100644 index 0000000..73e5628 --- /dev/null +++ b/kulitho/run.sh @@ -0,0 +1,7 @@ +cputype=$(cat /proc/cpuinfo | grep 'CPU part' | awk 'NR==1{print $4}') +if [[ "$cputype" == "0xD01" ]]; then + echo "kulitho: Not suppot on this platform" +else + cd ./test + ./test_kulitho +fi diff --git a/kulitho/test/Makefile b/kulitho/test/Makefile new file mode 100644 index 0000000..0124f4c --- /dev/null +++ b/kulitho/test/Makefile @@ -0,0 +1,21 @@ +# Makefile +CXX = g++ +CXXFLAGS = -O3 -I$(HPCKIT_PATH)/latest/kulitho/gcc/include +LDFLAGS = -L$(HPCKIT_PATH)/latest/kulitho/gcc/lib -lkulitho -lpthread -lm + +TARGET = test_kulitho +SOURCES = kulitho_test.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +# ========== 主目标 ========== +$(TARGET): $(OBJECTS) + $(CXX) $(OBJECTS) -o $@ $(LDFLAGS) + +# ========== 编译规则 ========== +%.o: %.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + +# ========== 清理 ========== +.PHONY: clean run +clean: + rm -f *.o $(TARGET) \ No newline at end of file diff --git a/kulitho/test/kulitho_test.cpp b/kulitho/test/kulitho_test.cpp new file mode 100644 index 0000000..2604c41 --- /dev/null +++ b/kulitho/test/kulitho_test.cpp @@ -0,0 +1,92 @@ +#include +#include +#include "kulitho_image.h" + +double real_out_upsampling[256] = { + -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, + 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, + 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, + 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, + 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, + 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, + 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, + 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, + 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, + 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, + 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, + 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, + 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, + 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 
0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750,
+    0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828
+};
+double real_out_laplacian[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+int verify_upsampling(double *out, size_t rows, size_t cols)
+{
+    for (size_t i = 0; i < rows; i++) {
+        for (size_t j = 0; j < cols; j++) {
+            if (std::fabs(out[i * cols + j] - real_out_upsampling[i * cols + j]) > 1e-5) {
+                // printf("%f, %f\n", out[i * cols + j], real_out_upsampling[i * cols + j]);
+                return 0;
+            }
+        }
+    }
+    return 1;
+}
+int verify_laplacian(double *out, size_t rows, size_t cols)
+{
+    for (size_t i = 0; i < rows; i++) {
+        for (size_t j = 0; j < cols; j++) {
+            if (std::fabs(out[i * cols + j] - real_out_laplacian[i * cols + j]) > 1e-5) {
+                // printf("%f, %f\n", out[i * cols + j], real_out_laplacian[i * cols + j]);
+                return 0;
+            }
+        }
+    }
+    return 1;
+}
+
+int kulitho_upsampling_01()
+{
+    size_t src_rows = 4, src_cols = 4;
+    size_t dst_rows = src_rows * 4;
+    size_t dst_cols = src_cols * 4;
+
+    double *grid_in = new double[src_rows * src_cols];
+    for (size_t i = 0; i < src_rows; ++i) {
+        for (size_t j = 0; j < src_cols; ++j) {
+            grid_in[i * src_cols + j] = j;
+        }
+    }
+    double *grid_out = new double[dst_rows * dst_cols];
+    kulitho::kulitho_FFTupsampling(grid_in, grid_out, src_rows, src_cols, dst_rows, dst_cols, 0, 0);
+
+    if (verify_upsampling(grid_out, dst_rows, dst_cols)) {
+        std::cout << "kulitho upsampling case: PASSED" << std::endl;
+    } else {
+        std::cout << "kulitho upsampling case: FAILED" << std::endl;
+    }
+
+    delete[] grid_in;
+    delete[] grid_out;
+    return 0;
+}
+
+int kulitho_laplacian_01()
+{
+    const size_t rows = 3, cols = 3;
+    double src[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0};
+    double dst[rows * cols];
+
+    kulitho::kulitho_laplacian(src, rows, cols, dst);
+    if (verify_laplacian(dst, rows, cols)) {
+        std::cout << "kulitho laplacian case: PASSED" << std::endl;
+    } else {
+        std::cout << "kulitho laplacian case: FAILED" << std::endl;
+    }
+    return 0;
+}
+
+int main()
+{
+    kulitho_upsampling_01();
+    kulitho_laplacian_01();
+}
\ No newline at end of file
-- Gitee
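
The kudnn test driver added in kudnn/src/main.cpp runs a single case named on the command line, or every registered case when invoked with no arguments. As a usage sketch only (the contents of kudnn/build.sh and kudnn/run.sh are not shown in this hunk, so the binary name test_kudnn and its location are assumptions):

    ./test_kudnn                      # no argument: run every registered kudnn case
    ./test_kudnn kudnn_softmax_01     # run one case by name; unknown names are reported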