From b991b701806889807f49ab6201f37eb6bbb4ea20 Mon Sep 17 00:00:00 2001 From: yanpeng Date: Wed, 22 Oct 2025 08:56:25 +0800 Subject: [PATCH] add kml kulitho kudnn tests --- kml/build.sh | 13 + kml/run.sh | 5 + kml/test/Makefile | 36 ++ kml/test/kml_blas_test.c | 24 + kml/test/kml_blas_test.h | 14 + kml/test/kml_conv_test.c | 40 ++ kml/test/kml_conv_test.h | 14 + kml/test/kml_fft_test.c | 38 ++ kml/test/kml_fft_test.h | 14 + kml/test/kml_ipl_test.c | 33 + kml/test/kml_ipl_test.h | 14 + kml/test/kml_lapack_test.c | 23 + kml/test/kml_lapack_test.h | 14 + kml/test/kml_math_test.c | 18 + kml/test/kml_math_test.h | 14 + kml/test/kml_scalapack_test.c | 77 +++ kml/test/kml_scalapack_test.h | 14 + kml/test/kml_solver_test.c | 154 +++++ kml/test/kml_solver_test.h | 14 + kml/test/kml_spblas_test.c | 29 + kml/test/kml_spblas_test.h | 14 + kml/test/kml_svml_test.c | 20 + kml/test/kml_svml_test.h | 14 + kml/test/kml_test_main.cpp | 75 +++ kml/test/kml_vml_test.c | 13 + kml/test/kml_vml_test.h | 14 + kml/test/kml_vsl_test.c | 41 ++ kml/test/kml_vsl_test.h | 14 + kudnn/build.sh | 20 + kudnn/run.sh | 8 + kudnn/src/function.cpp | 1112 +++++++++++++++++++++++++++++++++ kudnn/src/function.hpp | 73 +++ kudnn/src/main.cpp | 45 ++ kulitho/build.sh | 18 + kulitho/run.sh | 7 + kulitho/test/Makefile | 21 + kulitho/test/kulitho_test.cpp | 92 +++ 37 files changed, 2203 insertions(+) create mode 100644 kml/build.sh create mode 100644 kml/run.sh create mode 100644 kml/test/Makefile create mode 100644 kml/test/kml_blas_test.c create mode 100644 kml/test/kml_blas_test.h create mode 100644 kml/test/kml_conv_test.c create mode 100644 kml/test/kml_conv_test.h create mode 100644 kml/test/kml_fft_test.c create mode 100644 kml/test/kml_fft_test.h create mode 100644 kml/test/kml_ipl_test.c create mode 100644 kml/test/kml_ipl_test.h create mode 100644 kml/test/kml_lapack_test.c create mode 100644 kml/test/kml_lapack_test.h create mode 100644 kml/test/kml_math_test.c create mode 100644 kml/test/kml_math_test.h create mode 100644 kml/test/kml_scalapack_test.c create mode 100644 kml/test/kml_scalapack_test.h create mode 100644 kml/test/kml_solver_test.c create mode 100644 kml/test/kml_solver_test.h create mode 100644 kml/test/kml_spblas_test.c create mode 100644 kml/test/kml_spblas_test.h create mode 100644 kml/test/kml_svml_test.c create mode 100644 kml/test/kml_svml_test.h create mode 100644 kml/test/kml_test_main.cpp create mode 100644 kml/test/kml_vml_test.c create mode 100644 kml/test/kml_vml_test.h create mode 100644 kml/test/kml_vsl_test.c create mode 100644 kml/test/kml_vsl_test.h create mode 100644 kudnn/build.sh create mode 100644 kudnn/run.sh create mode 100644 kudnn/src/function.cpp create mode 100644 kudnn/src/function.hpp create mode 100644 kudnn/src/main.cpp create mode 100644 kulitho/build.sh create mode 100644 kulitho/run.sh create mode 100644 kulitho/test/Makefile create mode 100644 kulitho/test/kulitho_test.cpp diff --git a/kml/build.sh b/kml/build.sh new file mode 100644 index 0000000..871ece4 --- /dev/null +++ b/kml/build.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# export HPCKIT_PATH=/opt/HPCKit/ +if [ -z "${HPCKIT_PATH}" ]; then + echo "Error: Environment variable 'HPCKIT_PATH' is not set!" + echo "Please configure HPCKIT_PATH to point to the HPCKit installation directory, for example:" + echo " export HPCKIT_PATH=/opt/HPCKit" + echo "Then re-run the build command." 
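+    # Without HPCKIT_PATH the kml/test Makefile cannot locate the hmpi headers and libraries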
+    exit 1
+fi
+echo "HPCKIT_PATH=${HPCKIT_PATH}"
+cd ./test
+make clean
+make
diff --git a/kml/run.sh b/kml/run.sh
new file mode 100644
index 0000000..81ca259
--- /dev/null
+++ b/kml/run.sh
@@ -0,0 +1,5 @@
+export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
+export OMPI_ALLOW_RUN_AS_ROOT=1
+
+cd ./test
+mpirun -np 1 ./test_kml
\ No newline at end of file
diff --git a/kml/test/Makefile b/kml/test/Makefile
new file mode 100644
index 0000000..76329ff
--- /dev/null
+++ b/kml/test/Makefile
@@ -0,0 +1,36 @@
+# Makefile
+CXX = g++
+CC = gcc
+CXXFLAGS = -I. -I $(HPCKIT_PATH)/latest/hmpi/gcc/release/hmpi/include -L$(HPCKIT_PATH)/latest/hmpi/gcc/release/hmpi/lib/
+CFLAGS = -I. -I $(HPCKIT_PATH)/latest/hmpi/gcc/release/hmpi/include -L$(HPCKIT_PATH)/latest/hmpi/gcc/release/hmpi/lib/
+LDFLAGS = -lgtest -lgtest_main -lpthread -L. -lkfft -lkfftf -lkffth -lkblas -lklapack_full -lm -lksolver -lmpi -lkscalapack_full -lkipl -lkconv -lkm -lkvml -lkvsl -lksvml -lkspblas
+
+TARGET = test_kml
+SOURCES_C = kml_fft_test.c kml_blas_test.c kml_lapack_test.c kml_solver_test.c kml_scalapack_test.c kml_ipl_test.c kml_conv_test.c kml_vml_test.c kml_vsl_test.c kml_svml_test.c kml_math_test.c kml_spblas_test.c
+SOURCES_CPP = kml_test_main.cpp
+OBJECTS_C = $(SOURCES_C:.c=.o)
+OBJECTS_CPP = $(SOURCES_CPP:.cpp=.o)
+OBJECTS = $(OBJECTS_C) $(OBJECTS_CPP)
+
+# Default target
+all: $(TARGET)
+
+# Link the test executable
+$(TARGET): $(OBJECTS)
+	$(CXX) $(OBJECTS) -o $@ $(LDFLAGS) $(CXXFLAGS)
+
+# Compile C sources
+%.o: %.c
+	$(CC) $(CFLAGS) -c $< -o $@
+
+# Compile C++ sources
+%.o: %.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+# Clean build artifacts
+clean:
+	rm -f *.o $(TARGET)
+
+rebuild: clean all
+
+.PHONY: all clean rebuild
\ No newline at end of file
diff --git a/kml/test/kml_blas_test.c b/kml/test/kml_blas_test.c
new file mode 100644
index 0000000..e5302e2
--- /dev/null
+++ b/kml/test/kml_blas_test.c
@@ -0,0 +1,24 @@
+#include "kml_blas_test.h"
+#include "kblas.h"
+#include <stdio.h>
+
+int kml_blas_sgemm_01()
+{
+    int m = 4, k = 3, n = 4, lda = 4, ldb = 3, ldc = 4;
+    float alpha = 1.0, beta = 2.0;
+    float a[12] = {0.340188, -0.105617, 0.283099,
+                   0.298440, 0.411647, -0.302449,
+                   -0.164777, 0.268230, -0.222225,
+                   0.053970, -0.022603, 0.128871};
+    float b[12] = {-0.135216, 0.013401, 0.452230, 0.416195,
+                   0.135712, 0.217297, -0.358397, 0.106969,
+                   -0.483699, -0.257113, -0.362768, 0.304177};
+    float c[16] = {-0.343321, -0.099056, -0.370210, -0.391191,
+                   0.498924, -0.281743, 0.012932, 0.339112,
+                   0.112640, -0.203968, 0.137552, 0.024287,
+                   -0.006417, 0.472775, -0.207483, 0.271358};
+
+    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha,
+                a, lda, b, ldb, beta, c, ldc);
+    return 1; // success
+}
\ No newline at end of file
diff --git a/kml/test/kml_blas_test.h b/kml/test/kml_blas_test.h
new file mode 100644
index 0000000..f36c029
--- /dev/null
+++ b/kml/test/kml_blas_test.h
@@ -0,0 +1,14 @@
+#ifndef KML_BLAS_TEST_H
+#define KML_BLAS_TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int kml_blas_sgemm_01();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // KML_BLAS_TEST_H
\ No newline at end of file
diff --git a/kml/test/kml_conv_test.c b/kml/test/kml_conv_test.c
new file mode 100644
index 0000000..99a59b8
--- /dev/null
+++ b/kml/test/kml_conv_test.c
@@ -0,0 +1,40 @@
+#include "kml_conv_test.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "conv.h"
+
+
+int kml_conv_conv2d_01()
+{
+    int batch = 1;
+    int inputChannels = 1;
+    int inputHeight = 6;
+    int inputWidth = 6;
+    int kernelHeight = 3;
+    int kernelWidth = 3;
+    int strideY = 1;
+    int strideX = 1;
+    int padHeight = 0;
+    int padWidth = 0;
+    int dilationY = 1;
+    int
dilationX = 1; + int outputChannels = 1; + float input[36] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, + 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, + 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, + 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, 33.0, 34.0, 35.0, 36.0}; + float kernel[9] = {1.0, 2.0, 3.0, + 4.0, 5.0, 6.0, + 7.0, 8.0, 9.0}; + + float *bias = NULL; + int outputHeight = (inputHeight + 2 * padHeight - dilationY * (kernelHeight - 1) - 1) / strideY + 1; + int outputWidth = (inputWidth + 2 * padWidth - dilationX * (kernelWidth - 1) - 1) / strideX + 1; + + float output[16] = {0.0}; + conv2d_fp32(input, batch, inputChannels, inputHeight, inputWidth, kernel, kernelHeight, kernelWidth, strideY, strideX, padHeight, padWidth, dilationY, dilationX, bias, output, outputChannels); + + return 1; // 成功 +} \ No newline at end of file diff --git a/kml/test/kml_conv_test.h b/kml/test/kml_conv_test.h new file mode 100644 index 0000000..6e386cf --- /dev/null +++ b/kml/test/kml_conv_test.h @@ -0,0 +1,14 @@ +#ifndef KML_CONV_TEST_H +#define KML_CONV_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_conv_conv2d_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_CONV_TEST_H \ No newline at end of file diff --git a/kml/test/kml_fft_test.c b/kml/test/kml_fft_test.c new file mode 100644 index 0000000..6c523fc --- /dev/null +++ b/kml/test/kml_fft_test.c @@ -0,0 +1,38 @@ +#include "kml_fft_test.h" +#include + +#include "kfft.h" + +int kml_fft_c2c_01() +{ + int rank = 2; + int *n = (int*)kml_fft_malloc(sizeof(int) * rank); + n[0] = 2; + n[1] = 3; + + double init[6][2] = {{120, 0}, {8, 8}, {0, 0}, {0, 16}, {0, 16}, {-8, 8}}; + kml_fft_complex *in; + in = (kml_fft_complex*)kml_fft_malloc(sizeof(kml_fft_complex) * n[0] * n[1]); + for (int i = 0; i < n[0] * n[1]; i++) { + in[i][0] = init[i][0]; + in[i][1] = init[i][1]; + } + + kml_fft_complex *out; + out = (kml_fft_complex*)kml_fft_malloc(sizeof(kml_fft_complex) * n[0] * n[1]); + + kml_fft_plan plan; + plan = kml_fft_plan_dft(rank, n, in, out, KML_FFT_FORWARD, KML_FFT_ESTIMATE); + if (!plan) { + return 0; + } + + kml_fft_execute_dft(plan, in, out); + + kml_fft_destroy_plan(plan); + kml_fft_free(n); + kml_fft_free(in); + kml_fft_free(out); + + return 1; // 成功 +} \ No newline at end of file diff --git a/kml/test/kml_fft_test.h b/kml/test/kml_fft_test.h new file mode 100644 index 0000000..7cf542e --- /dev/null +++ b/kml/test/kml_fft_test.h @@ -0,0 +1,14 @@ +#ifndef KML_FFT_TEST_H +#define KML_FFT_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_fft_c2c_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_FFT_TEST_H \ No newline at end of file diff --git a/kml/test/kml_ipl_test.c b/kml/test/kml_ipl_test.c new file mode 100644 index 0000000..94c302e --- /dev/null +++ b/kml/test/kml_ipl_test.c @@ -0,0 +1,33 @@ +#include "kml_ipl_test.h" +#include +#include +#include "kipl.h" + + +int kml_ipl_interp2d_01() +{ + const size_t xMin = 4; + const size_t xMax = 2048; + const size_t xSize = 4; + const size_t yMin = 4; + const size_t yMax = 2048; + const size_t ySize = 4; + const float xValLo = 1; + const float xValHi = 2; + const float yValLo = 1; + const float yValHi = 2; + float xArr[4] = {1,1.33,1.67,2}; + float yArr[4] = {1,1.33,1.67,2}; + float* zArr = (float*)malloc(xSize * ySize * sizeof(float)); + float x = 1; + float y = 1; + float z = 0; + kml_float_interp2d *interp = kml_float_interp2d_alloc(kml_float_interp2d_bicubic, xSize, ySize); + kml_interp_accel *xacc = kml_interp_accel_alloc(); + kml_interp_accel *yacc = 
kml_interp_accel_alloc(); + kml_float_interp2d_init(interp, xArr, yArr, zArr, xSize, ySize); + z = kml_float_interp2d_eval(interp, xArr, yArr, zArr, x, y, xacc, yacc); + + + return 1; // 成功 +} \ No newline at end of file diff --git a/kml/test/kml_ipl_test.h b/kml/test/kml_ipl_test.h new file mode 100644 index 0000000..400efd3 --- /dev/null +++ b/kml/test/kml_ipl_test.h @@ -0,0 +1,14 @@ +#ifndef KML_IPL_TEST_H +#define KML_IPL_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_ipl_interp2d_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_IPL_TEST_H \ No newline at end of file diff --git a/kml/test/kml_lapack_test.c b/kml/test/kml_lapack_test.c new file mode 100644 index 0000000..f59a0c4 --- /dev/null +++ b/kml/test/kml_lapack_test.c @@ -0,0 +1,23 @@ +#include "kml_lapack_test.h" +#include "klapack.h" +#include + +int kml_lapack_dgetrf_01() +{ + char trans = 'N'; + int n = 5; + int nrhs = 1; + int lda = 5; + int ldb = 5; + int ipiv[5]; + int info = 0; + double a[]={72.1673 , 66.1857 , 64.7644 , 28.0199 , 91.4151, + 6.5180 , 62.8483 , 72.4323 , 46.5760 , 8.6928, + 28.9821 , 42.1828 , 18.6437 , 99.8612 , 35.6972, + 67.9812 , 5.0880 , 85.5035 ,79.2945 , 54.5920, + 28.6869 , 49.7512 , 7.5186 ,28.6929 , 84.6041}; + double b[]={1.0, 2.0, 3.0, 4.0, 5.0}; + dgetrf_(&n, &n, a, &lda, ipiv, &info); + dgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, &info); + return info; +} \ No newline at end of file diff --git a/kml/test/kml_lapack_test.h b/kml/test/kml_lapack_test.h new file mode 100644 index 0000000..3749aca --- /dev/null +++ b/kml/test/kml_lapack_test.h @@ -0,0 +1,14 @@ +#ifndef KML_LAPACK_TEST_H +#define KML_LAPACK_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_lapack_dgetrf_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_LAPACK_TEST_H \ No newline at end of file diff --git a/kml/test/kml_math_test.c b/kml/test/kml_math_test.c new file mode 100644 index 0000000..903514b --- /dev/null +++ b/kml/test/kml_math_test.c @@ -0,0 +1,18 @@ +#include "kml_math_test.h" +#include +#include +#include +#include "km.h" + + +int kml_math_sin_01() +{ + double pi = acos(-1); + // typical usage + double a = pi/6, b = 1.0, c = -3*pi/4, d = pi/3; + // special handling + double e = INFINITY, f = -INFINITY, g = NAN; + sin(a); + + return 1; // 成功 +} \ No newline at end of file diff --git a/kml/test/kml_math_test.h b/kml/test/kml_math_test.h new file mode 100644 index 0000000..e03f769 --- /dev/null +++ b/kml/test/kml_math_test.h @@ -0,0 +1,14 @@ +#ifndef KML_MATH_TEST_H +#define KML_MATH_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_math_sin_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_MATH_TEST_H \ No newline at end of file diff --git a/kml/test/kml_scalapack_test.c b/kml/test/kml_scalapack_test.c new file mode 100644 index 0000000..44dcbf0 --- /dev/null +++ b/kml/test/kml_scalapack_test.c @@ -0,0 +1,77 @@ +#include "kml_scalapack_test.h" + +#include +#include +#include +#include "kscalapack.h" +void blacs_get_(int*, int*, int*); +void blacs_gridinit_(int*, char*, int*, int*); +void blacs_gridinfo_(int*, int*, int*, int*, int*); +void descinit_(int*, int*, int*, int*, int*, int*, int*, int*, int*, int*); +void blacs_gridexit_(int*); +int numroc_(int*, int*, int*, int*, int*); + +int kml_scalapack_pdgetrf_01() { + // 默认值 + int izero=0; + int ione=1; + int N = 1000; // 矩阵维度 + int MB = 64, NB = 64; // 块大小 + int P = 1, Q = 1; // 进程网格尺寸 + + // 初始化MPI环境 + MPI_Init(NULL, NULL); + int iam, np; + MPI_Comm_rank(MPI_COMM_WORLD, &iam); + 
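+    // np must equal P * Q (the process grid size) or the test aborts below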
+    MPI_Comm_size(MPI_COMM_WORLD, &np);
+
+    if (P * Q != np) {
+        if (iam == 0) {
+            fprintf(stderr, "Error: The product of P and Q must equal the number of processes.\n");
+        }
+        MPI_Finalize();
+        return -1;
+    }
+
+    // Create the BLACS context
+    int ictxt, myrow, mycol;
+    blacs_get_(&izero, &izero, &ictxt); // -> Create context
+    blacs_gridinit_(&ictxt, "Row", &P, &Q);
+    blacs_gridinfo_(&ictxt, &P, &Q, &myrow, &mycol);
+
+    // Matrix distribution descriptors
+    int desca[9], descb[9];
+    int info;
+
+    // Compute the local matrix dimensions
+    int M_loc = numroc_(&N, &MB, &myrow, &izero, &P);
+    int N_loc = numroc_(&N, &NB, &mycol, &izero, &Q);
+
+    // Allocate storage for the local matrix
+    double *A = (double *)malloc(M_loc * N_loc * sizeof(double));
+    if (!A) {
+        printf("Process %d could not allocate memory for A\n", iam);
+        return -1;
+    }
+
+    // Initialize the local matrix data
+    for (int i = 0; i < M_loc * N_loc; ++i) {
+        A[i] = drand48(); // fill with random values
+    }
+
+    // Set up the matrix descriptor
+    descinit_(desca, &N, &N, &MB, &NB, &izero, &izero, &ictxt, &M_loc, &info);
+
+
+    // Call ScaLAPACK pdgetrf to perform the LU factorization
+    int *ipiv = (int *)malloc((M_loc + MB) * sizeof(int));
+    pdgetrf_(&N, &N, A, &ione, &ione, desca, ipiv, &info);
+
+    // Clean up
+    free(A);
+    free(ipiv);
+    blacs_gridexit_(&ictxt);
+    MPI_Finalize();
+
+    return info;
+}
\ No newline at end of file
diff --git a/kml/test/kml_scalapack_test.h b/kml/test/kml_scalapack_test.h
new file mode 100644
index 0000000..2ce9a1e
--- /dev/null
+++ b/kml/test/kml_scalapack_test.h
@@ -0,0 +1,14 @@
+#ifndef KML_SCALAPACK_TEST_H
+#define KML_SCALAPACK_TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int kml_scalapack_pdgetrf_01();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // KML_SCALAPACK_TEST_H
\ No newline at end of file
diff --git a/kml/test/kml_solver_test.c b/kml/test/kml_solver_test.c
new file mode 100644
index 0000000..cf2df0d
--- /dev/null
+++ b/kml/test/kml_solver_test.c
@@ -0,0 +1,154 @@
+#include "kml_solver_test.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "kml_solver.h"
+void PrepareMatrix(KmlSolverMatrixStore *store,
+                   KmlSolverMatrixOption *opt, int nRow, int nCol)
+{
+    store->indexType = KMLSS_INDEX_INT32;
+    store->valueType = KMLSS_VALUE_FP64;
+    store->nRow = nRow;
+    store->nCol = nCol;
+    opt->fieldMask = KMLSS_MATRIX_OPTION_TYPE;
+    opt->type = KMLSS_MATRIX_GEN;
+}
+
+int kml_solver_tydss_01()
+{
+    int ierr;
+    int n = 8;
+    int nrhs = 1;
+
+    // Create matrix A
+    int ia[9] = {0, 2, 4, 6, 7, 8, 10, 12, 14};
+    int ja[14] = {0, 7, 1, 6, 2, 5, 3, 4, 2, 5, 1, 6, 0, 7};
+    double a[14] = {1.0, 2.0, -2.0, 3.0, 3.0, 4.0, -4.0, 5.0, 4.0, -6.0, 3.0, 7.0, 2.0, 8.0};
+    KmlSolverMatrixStore storeA;
+    KmlSolverMatrixOption optA;
+    PrepareMatrix(&storeA, &optA, n, n);
+    storeA.format = KMLSS_MATRIX_STORE_CSR;
+    storeA.csr.rowOffset = ia;
+    storeA.csr.colIndex = ja;
+    storeA.csr.value = a;
+
+    KmlSolverMatrix *A;
+    ierr = KmlSolverMatrixCreate(&A, &storeA, &optA);
+    if (ierr != KMLSS_NO_ERROR) {
+        printf("ERROR when create A: %d\n", ierr);
+        return 1;
+    }
+
+    // Create vector b
+    double b[8] = {3.0, 1.0, 7.0, -4.0, 5.0, -2.0, 10.0, 10.0};
+    KmlSolverMatrixStore storeB;
+    KmlSolverMatrixOption optB;
+    PrepareMatrix(&storeB, &optB, n, nrhs);
+    storeB.format = KMLSS_MATRIX_STORE_DENSE_COL_MAJOR;
+    storeB.dense.value = b;
+    storeB.dense.ld = n;
+    KmlSolverMatrix *B;
+    ierr = KmlSolverMatrixCreate(&B, &storeB, &optB);
+
+    if (ierr != KMLSS_NO_ERROR) {
+        printf("ERROR when create b: %d\n", ierr);
+        return 1;
+    }
+
+    // Create vector x
+    double x[8] = {0};
+    KmlSolverMatrixStore storeX;
+    KmlSolverMatrixOption optX;
+    PrepareMatrix(&storeX, &optX, n, nrhs);
+    storeX.format = KMLSS_MATRIX_STORE_DENSE_COL_MAJOR;
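+    // X uses the same dense column-major layout as B; the leading dimension is set to n below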
storeX.dense.value = x; + storeX.dense.ld = n; + KmlSolverMatrix *X; + ierr = KmlSolverMatrixCreate(&X, &storeX, &optX); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR when create x: %d\n", ierr); + return 1; + } + + // Init solver + KmlDssInitOption opt; + opt.fieldMask = KMLDSS_INIT_OPTION_NTHREADS; + opt.nThreads = 32; + KmlDssSolver *solver; + ierr = KmlDssInit(&solver, &opt); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR in KmlDssInit: %d\n", ierr); + return ierr; + } + + // Analyze + KmlDssAnalyzeOption optAnalyze; + optAnalyze.fieldMask = KMLDSS_ANALYZE_OPTION_NTHREADS_RDR; + optAnalyze.nThreadsRdr = 8; + ierr = KmlDssAnalyze(solver, A, &optAnalyze); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR in KmlDssAnalyze: %d\n", ierr); + return ierr; + } + + // Factorize + KmlDssFactorizeOption optFact; + optFact.fieldMask = KMLDSS_FACTORIZE_OPTION_PERTURBATION_THRESHOLD; + optFact.perturbationThreshold = 1e-8; + ierr = KmlDssFactorize(solver, A, &optFact); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR in KmlDssFactorize: %d\n", ierr); + return ierr; + } + + // Solve + KmlDssSolveOption optSolve; + optSolve.fieldMask = KMLDSS_SOLVE_OPTION_SOLVE_STAGE | + KMLDSS_SOLVE_OPTION_REFINE_METHOD; + optSolve.stage = KMLDSS_SOLVE_ALL; + optSolve.refineMethod = KMLDSS_REFINE_OFF; + ierr = KmlDssSolve(solver, B, X, &optSolve); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR in KmlDssSolve: %d\n", ierr); + return ierr; + } + + // Set new values of A + double a1[14] = {2.0, 3.0, -3.0, 4.0, 4.0, 5.0, -5.0, + 6.0, 5.0, -7.0, 4.0, 8.0, 3.0, 9.0}; + KmlSolverMatrixSetValue(A, a1); + // Factorize with new values + ierr = KmlDssFactorize(solver, A, &optFact); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR in KmlDssFactorize: %d\n", ierr); + return ierr; + } + + + + // Set new values of B + double b1[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + KmlSolverMatrixSetValue(B, b1); + // Solve with new values + ierr = KmlDssSolve(solver, B, X, &optSolve); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR in KmlDssSolve: %d\n", ierr); + return ierr; + } + + // Query + KmlDssInfo info; + info.fieldMask = KMLDSS_INFO_PEAK_MEM; + + ierr = KmlDssQuery(solver, &info); + if (ierr != KMLSS_NO_ERROR) { + printf("ERROR in KmlDssQuery: %d\n", ierr); + return ierr; + } + + // Destroy + KmlDssClean(&solver); + KmlSolverMatrixDestroy(&A); + KmlSolverMatrixDestroy(&B); + KmlSolverMatrixDestroy(&X); + return 0; +} \ No newline at end of file diff --git a/kml/test/kml_solver_test.h b/kml/test/kml_solver_test.h new file mode 100644 index 0000000..48f7bbe --- /dev/null +++ b/kml/test/kml_solver_test.h @@ -0,0 +1,14 @@ +#ifndef KML_SOLVER_TEST_H +#define KML_SOLVER_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_solver_tydss_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_SOLVER_TEST_H \ No newline at end of file diff --git a/kml/test/kml_spblas_test.c b/kml/test/kml_spblas_test.c new file mode 100644 index 0000000..75739ed --- /dev/null +++ b/kml/test/kml_spblas_test.c @@ -0,0 +1,29 @@ +#include "kml_spblas_test.h" +#include +#include +#include +#include "kspblas.h" + + +int kml_spblas_spgemm_01() +{ + kml_sparse_operation_t opt = KML_SPARSE_OPERATION_NON_TRANSPOSE; + KML_INT m = 3; + KML_INT n = 3; + KML_INT k = 3; + float alpha = 1.0; + float beta = 1.0; + char *matdescra = "G00C"; //一般矩阵,基0索引 + float val[4] = {9, 8, 5, 2}; + KML_INT indx[4] = {2, 1, 2, 2}; + KML_INT pntrb[3] = {0, 2, 3}; + KML_INT pntre[3] = {2, 3, 4}; + float b[9] = {1, 5, 7, 4, 7, 7, 3, 3, 7}; + float c[9] = {0, 8, 8, 0, 2, 8, 3, 5, 
6}; + KML_INT ldb = 3; + KML_INT ldc = 3; + kml_sparse_status_t status = kml_sparse_scsrmm(opt, m, n, k, alpha, matdescra, val, indx, pntrb, pntre, b, ldb, beta, c, ldc); + + + return 1; // 成功 +} \ No newline at end of file diff --git a/kml/test/kml_spblas_test.h b/kml/test/kml_spblas_test.h new file mode 100644 index 0000000..3b1cbfc --- /dev/null +++ b/kml/test/kml_spblas_test.h @@ -0,0 +1,14 @@ +#ifndef KML_SPBLAS_TEST_H +#define KML_SPBLAS_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_spblas_spgemm_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_SPBLAS_TEST_H \ No newline at end of file diff --git a/kml/test/kml_svml_test.c b/kml/test/kml_svml_test.c new file mode 100644 index 0000000..c6d6eb4 --- /dev/null +++ b/kml/test/kml_svml_test.c @@ -0,0 +1,20 @@ +#include "kml_vml_test.h" +#include +#include +#include +#include "kvml.h" + + +int kml_vml_sin_01() +{ + int i, len = 4; + float src[] = {0.0f, 2.0f, INFINITY, NAN}; + float* dst = (float*)malloc(sizeof(float) * len); + if (dst == NULL) { + printf("Malloc Failed!\n"); + return 0; + } + vssin(len, src, dst); + + return 1; // 成功 +} \ No newline at end of file diff --git a/kml/test/kml_svml_test.h b/kml/test/kml_svml_test.h new file mode 100644 index 0000000..8a2e1af --- /dev/null +++ b/kml/test/kml_svml_test.h @@ -0,0 +1,14 @@ +#ifndef KML_SVML_TEST_H +#define KML_SVML_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int kml_svml_sin_01(); + +#ifdef __cplusplus +} +#endif + +#endif // KML_SVML_TEST_H \ No newline at end of file diff --git a/kml/test/kml_test_main.cpp b/kml/test/kml_test_main.cpp new file mode 100644 index 0000000..f5ac12f --- /dev/null +++ b/kml/test/kml_test_main.cpp @@ -0,0 +1,75 @@ +#include +extern "C" { +#include "kml_fft_test.h" +#include "kml_blas_test.h" +#include "kml_lapack_test.h" +#include "kml_solver_test.h" +#include "kml_scalapack_test.h" +#include "kml_ipl_test.h" +#include "kml_conv_test.h" +#include "kml_vml_test.h" +#include "kml_vsl_test.h" +#include "kml_svml_test.h" +#include "kml_math_test.h" +#include "kml_spblas_test.h" +} + +TEST(kml_test, kml_fft) { + int result = kml_fft_c2c_01(); + EXPECT_EQ(result, 1) << "kml_fft_c2c_01 should succeed"; +} + +TEST(kml_test, kml_blas) { + int result = kml_blas_sgemm_01(); + EXPECT_EQ(result, 1) << "kml_blas_sgemm_01 should succeed"; +} + +TEST(kml_test, kml_lapack) { + int result = kml_lapack_dgetrf_01(); + EXPECT_EQ(result, 0) << "kml_lapack_dgetrf_01 should succeed"; +} + +TEST(kml_test, kml_solver) { + int result = kml_solver_tydss_01(); + EXPECT_EQ(result, 0) << "kml_solver_tydss_01 should succeed"; +} + +TEST(kml_test, kml_scalapack) { + int result = kml_scalapack_pdgetrf_01(); + EXPECT_EQ(result, 0) << "kml_scalapack_pdgetrf_01 should succeed"; +} + +TEST(kml_test, kml_ipl) { + int result = kml_ipl_interp2d_01(); + EXPECT_EQ(result, 1) << "kml_ipl_interp2d_01 should succeed"; +} + +TEST(kml_test, kml_conv) { + int result = kml_conv_conv2d_01(); + EXPECT_EQ(result, 1) << "kml_conv_conv2d_01 should succeed"; +} + +TEST(kml_test, kml_vml) { + int result = kml_vml_sin_01(); + EXPECT_EQ(result, 1) << "kml_vml_sin_01 should succeed"; +} + +TEST(kml_test, kml_vsl) { + int result = kml_vsl_uniform_01(); + EXPECT_EQ(result, 1) << "kml_vsl_uniform_01 should succeed"; +} + +TEST(kml_test, kml_svml) { + int result = kml_svml_sin_01(); + EXPECT_EQ(result, 1) << "kml_svml_sin_01 should succeed"; +} + +TEST(kml_test, kml_math) { + int result = kml_math_sin_01(); + EXPECT_EQ(result, 1) << "kml_math_sin_01 should succeed"; +} + 
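+// kml_spblas_spgemm_01 calls kml_sparse_scsrmm on a small CSR matrix and returns 1 unconditionally,
+// so this case only verifies that the call runs to completion.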
+TEST(kml_test, kml_spblas) {
+    int result = kml_spblas_spgemm_01();
+    EXPECT_EQ(result, 1) << "kml_spblas_spgemm_01 should succeed";
+}
\ No newline at end of file
diff --git a/kml/test/kml_vml_test.c b/kml/test/kml_vml_test.c
new file mode 100644
index 0000000..2166553
--- /dev/null
+++ b/kml/test/kml_vml_test.c
@@ -0,0 +1,13 @@
+#include "kml_svml_test.h"
+#include <stdio.h>
+#include <math.h>
+#include <arm_neon.h>
+#include "ksvml.h"
+
+
+int kml_svml_sin_01()
+{
+    float32x4_t src = {0.0f, 2.0f, INFINITY, NAN};
+    float32x4_t dst = svml128_sin_f32(src);
+    return 1; // success
+}
\ No newline at end of file
diff --git a/kml/test/kml_vml_test.h b/kml/test/kml_vml_test.h
new file mode 100644
index 0000000..e332b54
--- /dev/null
+++ b/kml/test/kml_vml_test.h
@@ -0,0 +1,14 @@
+#ifndef KML_VML_TEST_H
+#define KML_VML_TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int kml_vml_sin_01();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // KML_VML_TEST_H
\ No newline at end of file
diff --git a/kml/test/kml_vsl_test.c b/kml/test/kml_vsl_test.c
new file mode 100644
index 0000000..a5105ed
--- /dev/null
+++ b/kml/test/kml_vsl_test.c
@@ -0,0 +1,41 @@
+#include "kml_vsl_test.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "krng.h"
+
+
+int kml_vsl_uniform_01()
+{
+    VSLStreamStatePtr stream;
+    unsigned seed = 42;
+    int errcode = vslNewStream(&stream, VSL_BRNG_MCG59, seed);
+    if (errcode != VSL_STATUS_OK) {
+        fprintf(stderr, "Failure in newstream\n");
+        return 0;
+    }
+
+    SizeType n = 10;
+    float *r = (float *)malloc(sizeof(float) * n);
+    if (r == NULL) {
+        fprintf(stderr, "Failure in malloc\n");
+        return 0;
+    }
+
+    if ((errcode = vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, n, r, 0.0, 1.0)) != VSL_STATUS_OK) {
+        fprintf(stderr, "Failure in vsRngUniform\n");
+        goto out;
+    }
+
+    /* deinitialize the stream */
+    errcode = vslDeleteStream(&stream);
+    if (errcode != VSL_STATUS_OK) {
+        fprintf(stderr, "Failure in deleting stream\n");
+        goto out;
+    }
+
+out:
+    free(r);
+
+    return (errcode == VSL_STATUS_OK) ? 1 : 0; // success only if every VSL call succeeded
+}
\ No newline at end of file
diff --git a/kml/test/kml_vsl_test.h b/kml/test/kml_vsl_test.h
new file mode 100644
index 0000000..7679a90
--- /dev/null
+++ b/kml/test/kml_vsl_test.h
@@ -0,0 +1,14 @@
+#ifndef KML_VSL_TEST_H
+#define KML_VSL_TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int kml_vsl_uniform_01();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // KML_VSL_TEST_H
\ No newline at end of file
diff --git a/kudnn/build.sh b/kudnn/build.sh
new file mode 100644
index 0000000..b13c175
--- /dev/null
+++ b/kudnn/build.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# export DNN_INC=~/local/HPCKit/latest/kudnn/bisheng/include
+# export DNN_LIB=~/local/HPCKit/latest/kudnn/bisheng/lib
+# export BLAS_LIB=~/local/HPCKit/latest/kml/bisheng/lib/sme/kblas/multi
+cputype=$(cat /proc/cpuinfo | grep 'CPU part' | awk 'NR==1{print $4}')
+if [[ "$cputype" == "0xd22" ]]; then
+    echo "CXX=${CXX}"
+    echo "DNN_INC=${DNN_INC}"
+    echo "DNN_LIB=${DNN_LIB}"
+    echo "BLAS_LIB=${BLAS_LIB}"
+    ${CXX} src/function.cpp src/main.cpp -o function_test \
+        -I ${DNN_INC} \
+        -L ${DNN_LIB} \
+        -L ${BLAS_LIB} \
+        -lkblas \
+        -lkdnn
+
+else
+    echo "kudnn: Not supported on this platform"
+fi
diff --git a/kudnn/run.sh b/kudnn/run.sh
new file mode 100644
index 0000000..847a2f0
--- /dev/null
+++ b/kudnn/run.sh
@@ -0,0 +1,8 @@
+./function_test kudnn_linearActivation_01
+./function_test kudnn_linearRes_01
+./function_test kudnn_rnorm_01
+./function_test kudnn_lnorm_01
+./function_test kudnn_softmax_01
+./function_test kudnn_gnorm_01
+./function_test kudnn_conv_01
+./function_test kudnn_conv_02
\ No newline at end of file
diff --git
a/kudnn/src/function.cpp b/kudnn/src/function.cpp new file mode 100644 index 0000000..0d929ce --- /dev/null +++ b/kudnn/src/function.cpp @@ -0,0 +1,1112 @@ +#include "function.hpp" + +using SizeType = KDNN::SizeType; +using Shape = KDNN::Shape; +using TensorInfo = const KDNN::TensorInfo; +KDNN::Element::TypeT TypeF16 = KDNN::Element::TypeT::F16; +KDNN::Element::TypeT TypeF32 = KDNN::Element::TypeT::F32; +const int MAX_THS = 100; +const float FACTOR_THS = 0.02; +float errBound = 1e-3; +float maxError = 1e-3; + +template +struct typeMap; + +template <> +struct typeMap<__fp16> { + static constexpr KDNN::Element::TypeT val = KDNN::Element::TypeT::F16; +}; + +template <> +struct typeMap { + static constexpr KDNN::Element::TypeT val = KDNN::Element::TypeT::F32; +}; + +template +static void conv2dRef(Shape &srcShape, Shape &weiShape, Shape &dstShape, Shape &strides, Shape &dilates, + Shape &paddingL, Shape &paddingR, const T *src, const T *wei, T *dst, const T *bia) +{ + SizeType N = srcShape[0], IC = srcShape[1], IH = srcShape[2], IW = srcShape[3]; + SizeType KH = weiShape[2], KW = weiShape[3]; + SizeType OC = dstShape[1], OH = dstShape[2], OW = dstShape[3]; + SizeType SH = strides[0], SW = strides[1]; + SizeType DH = dilates[0], DW = dilates[1]; + SizeType PH_L = paddingL[0], PW_L = paddingL[1]; + SizeType PH_R = paddingR[0], PW_R = paddingR[1]; + OH = (IH + PH_L + PH_R - 1 - (KH - 1) * (DH + 1)) / SH + 1; + OW = (IW + PW_L + PW_R - 1 - (KW - 1) * (DW + 1)) / SW + 1; + int threadsUsed = (int)ceil((float)(N * OC * OH * OW) * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? MAX_THS : threadsUsed; +#pragma omp parallel for collapse(4) num_threads(threadsUsed) + for (SizeType n = 0; n < N; ++n) { + for (SizeType oc = 0; oc < OC; ++oc) { + T biaVal = *(bia + oc); + for (SizeType oh = 0; oh < OH; ++oh) { + for (SizeType ow = 0; ow < OW; ++ow) { + T sum = (T)0.0f; + for (SizeType ic = 0; ic < IC; ++ic) { + for (SizeType kh = 0; kh < KH; ++kh) { + for (SizeType kw = 0; kw < KW; ++kw) { + SizeType srcX = oh * SH + kh * (DH + 1) - PH_L; + SizeType srcY = ow * SW + kw * (DW + 1) - PW_L; + T srcVal = (T)0.0f; + if (srcX >= 0 && srcX < IH && srcY >= 0 && srcY < IW) { + srcVal = *(src + n * IC * IH * IW + ic * IH * IW + srcX * IW + srcY); + } + T weiVal = *(wei + oc * IC * KH * KW + ic * KH * KW + kh * KW + kw); + sum += srcVal * weiVal; + } + } + } + *(dst + n * OC * OH * OW + oc * OH * OW + oh * OW + ow) = sum + biaVal; + } + } + } + } +} + +template +static bool Conv2dFWDFunc1(Shape &srcShape, Shape &weiShape, Shape &dstShape, Shape &strides, Shape &dilates, + Shape &paddingL, Shape &paddingR, KDNN::ConvolutionAlgorithm &alg) +{ + SizeType N = srcShape[0], IC = srcShape[1], IH = srcShape[2], IW = srcShape[3]; + SizeType OC = dstShape[1], OH = dstShape[2], OW = dstShape[3]; + SizeType KH = weiShape[2], KW = weiShape[3]; + const KDNN::TensorInfo srcTensor = {srcShape, typeMap::val, KDNN::Layout::ABCD}; + const KDNN::TensorInfo weightsTensor = {weiShape, typeMap::val, KDNN::Layout::ABCD}; + const KDNN::TensorInfo dstTensor = {dstShape, typeMap::val, KDNN::Layout::ABCD}; + const KDNN::TensorInfo biasTensor = {{OC}, typeMap::val, KDNN::Layout::A}; + + KDNN::ConvolutionLayerFWD convFwdLayer1(srcTensor, weightsTensor, dstTensor, biasTensor, strides, dilates, paddingL, + paddingR, alg); + + SizeType srcSize = N * IC * IH * IW; + SizeType dstSize = N * OC * OH * OW; + SizeType weiSize = OC * IC * KH * KW; + SizeType biaSize = OC; + T *src = (T *)malloc(srcSize * sizeof(T)); + T *dst = (T *)malloc(dstSize * 
sizeof(T)); + T *dstRef = (T *)malloc(dstSize * sizeof(T)); + T *wei = (T *)malloc(weiSize * sizeof(T)); + T *bia = (T *)malloc(biaSize * sizeof(T)); + if (src == nullptr || dst == nullptr || dstRef == nullptr || wei == nullptr || bia == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return false; + } + // generate random test data + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + int threadsUsed = (int)ceil((float)srcSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? MAX_THS : threadsUsed; +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < srcSize; ++i) { + *(src + i) = (T)u(e); + } + for (SizeType i = 0; i < weiSize; ++i) { + *(wei + i) = (T)u(e); + } + for (SizeType i = 0; i < biaSize; ++i) { + *(bia + i) = (T)u(e); + } + convFwdLayer1.Run(src, wei, dst, bia); + conv2dRef(srcShape, weiShape, dstShape, strides, dilates, paddingL, paddingR, src, wei, dstRef, bia); + float error = 0.0; +#pragma omp parallel for reduction(+ : error) num_threads(threadsUsed) + for (SizeType i = 0; i < dstSize; ++i) { + error += *(dst + i) - *(dstRef + i); + } + free(src); + free(dst); + free(dstRef); + free(wei); + free(bia); + error = std::abs(error) / dstSize; + + return error < errBound; +} + +bool kudnn_conv_01() +{ + SizeType N = 4, IC = 4, IH = 128, IW = 100; + SizeType OC = 5, OH = 0, OW = 0; + SizeType KH = 3, KW = 3; + Shape strides(1, 1), dilates(1, 1), paddingL(1, 1), paddingR(1, 1); + OH = (IH + paddingL[0] + paddingR[0] - 1 - (KH - 1) * (dilates[0] + 1)) / strides[0] + 1; + OW = (IW + paddingL[1] + paddingR[1] - 1 - (KW - 1) * (dilates[1] + 1)) / strides[1] + 1; + Shape srcShape(N, IC, IH, IW), weiShape(OC, IC, KH, KW), dstShape(N, OC, OH, OW); + KDNN::ConvolutionAlgorithm alg(KDNN::ConvolutionAlgorithm::AUTO); + return Conv2dFWDFunc1(srcShape, weiShape, dstShape, strides, dilates, paddingL, paddingR, alg); +} + +template +static void conv3dRef(Shape &srcShape, Shape &weiShape, Shape &dstShape, Shape &strides, Shape &dilates, + Shape &paddingL, Shape &paddingR, const T *src, const T *wei, T *dst, const T *bia) +{ + SizeType N = srcShape[0], IC = srcShape[1], ID = srcShape[2], IH = srcShape[3], IW = srcShape[4]; + SizeType KD = weiShape[2], KH = weiShape[3], KW = weiShape[4]; + SizeType OC = dstShape[1], OD = dstShape[2], OH = dstShape[2], OW = dstShape[3]; + SizeType SD = strides[0], SH = strides[1], SW = strides[2]; + SizeType DD = dilates[0], DH = dilates[1], DW = dilates[2]; + SizeType PD_L = paddingL[0], PH_L = paddingL[1], PW_L = paddingL[2]; + SizeType PD_R = paddingR[0], PH_R = paddingR[1], PW_R = paddingR[2]; + OD = (ID + PD_L + PD_R - 1 - (KD - 1) * (DD + 1)) / SD + 1; + OH = (IH + PH_L + PH_R - 1 - (KH - 1) * (DH + 1)) / SH + 1; + OW = (IW + PW_L + PW_R - 1 - (KW - 1) * (DW + 1)) / SW + 1; + int threadsUsed = (int)ceil((float)(N * OC * OD * OH * OW) * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for collapse(5) num_threads(threadsUsed) + for (SizeType n = 0; n < N; ++n) { + for (SizeType oc = 0; oc < OC; ++oc) { + T biaVal = *(bia + oc); + for (SizeType od = 0; od < OD; ++od) { + for (SizeType oh = 0; oh < OH; ++oh) { + for (SizeType ow = 0; ow < OW; ++ow) { + T sum = (T)0.0f; + for (SizeType ic = 0; ic < IC; ++ic) { + for (SizeType kd = 0; kd < KD; ++kd) { + for (SizeType kh = 0; kh < KH; ++kh) { + for (SizeType kw = 0; kw < KW; ++kw) { + SizeType srcZ = od * SD + kd * (DD + 1) - PD_L; + SizeType srcX = oh * SH + kh * (DH + 1) - PH_L; + SizeType srcY = ow * SW + kw * (DW + 1) - PW_L; + T srcVal = (T)0.0f; + if (srcZ >= 0 && srcZ < ID && srcX >= 0 && srcX < IH && srcY >= 0 && + srcY < IW) { + srcVal = *(src + n * IC * ID * IH * IW + ic * ID * IH * IW + + srcZ * IH * IW + srcX * IW + srcY); + } + T weiVal = *(wei + oc * IC * KD * KH * KW + ic * KD * KH * KW + kd * KH * KW + + kh * KW + kw); + sum += srcVal * weiVal; + } + } + } + } + *(dst + n * OC * OD * OH * OW + oc * OD * OH * OW + od * OH * OW + oh * OW + ow) = sum + biaVal; + } + } + } + } + } +} + +template +static bool Conv3dFWDFunc1(Shape &srcShape, Shape &weiShape, Shape &dstShape, Shape &strides, Shape &dilates, + Shape &paddingL, Shape &paddingR, KDNN::ConvolutionAlgorithm &alg) +{ + SizeType N = srcShape[0], IC = srcShape[1], ID = srcShape[2], IH = srcShape[3], IW = srcShape[4]; + SizeType OC = dstShape[1], OD = dstShape[2], OH = dstShape[3], OW = dstShape[4]; + SizeType KD = weiShape[2], KH = weiShape[3], KW = weiShape[4]; + const KDNN::TensorInfo srcTensor = {srcShape, typeMap::val, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo weightsTensor = {weiShape, typeMap::val, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo dstTensor = {dstShape, typeMap::val, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo biasTensor = {{OC}, typeMap::val, KDNN::Layout::A}; + + KDNN::ConvolutionLayerFWD convFwdLayer1(srcTensor, weightsTensor, dstTensor, biasTensor, strides, dilates, paddingL, + paddingR, alg); + + SizeType srcSize = N * IC * ID * IH * IW; + SizeType dstSize = N * OC * OD * OH * OW; + SizeType weiSize = OC * IC * KD * KH * KW; + SizeType biaSize = OC; + T *src = (T *)malloc(srcSize * sizeof(T)); + T *dst = (T *)malloc(dstSize * sizeof(T)); + T *dstRef = (T *)malloc(dstSize * sizeof(T)); + T *wei = (T *)malloc(weiSize * sizeof(T)); + T *bia = (T *)malloc(biaSize * sizeof(T)); + if (src == nullptr || dst == nullptr || dstRef == nullptr || wei == nullptr || bia == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return false; + } + // generate random test data + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + int threadsUsed = (int)ceil((float)srcSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < srcSize; ++i) { + *(src + i) = (T)u(e); + } + for (SizeType i = 0; i < weiSize; ++i) { + *(wei + i) = (T)u(e); + } + for (SizeType i = 0; i < biaSize; ++i) { + *(bia + i) = (T)u(e); + } + conv3dRef(srcShape, weiShape, dstShape, strides, dilates, paddingL, paddingR, src, wei, dstRef, bia); + convFwdLayer1.Run(src, wei, dst, bia); + float error = 0.0; +#pragma omp parallel for reduction(+ : error) num_threads(threadsUsed) + for (SizeType i = 0; i < dstSize; ++i) { + error += *(dst + i) - *(dstRef + i); + } + free(src); + free(dst); + free(dstRef); + free(wei); + free(bia); + error = std::abs(error) / dstSize; + + return error < errBound; +} + +bool kudnn_conv_02() +{ + SizeType N = 1, IC = 5, ID = 20, IH = 90, IW = 160; + SizeType OC = 10, OD = 0, OH = 0, OW = 0; + SizeType KD = 1, KH = 1, KW = 1; + Shape strides(1, 1, 1), dilates(0, 0, 0), paddingL(0, 0, 0), paddingR(0, 0, 0); + OD = (ID + paddingL[0] + paddingR[0] - 1 - (KD - 1) * (dilates[0] + 1)) / strides[0] + 1; + OH = (IH + paddingL[1] + paddingR[1] - 1 - (KH - 1) * (dilates[1] + 1)) / strides[1] + 1; + OW = (IW + paddingL[2] + paddingR[2] - 1 - (KW - 1) * (dilates[2] + 1)) / strides[2] + 1; + Shape srcShape(N, IC, ID, IH, IW), weiShape(OC, IC, KD, KH, KW), dstShape(N, OC, OD, OH, OW); + KDNN::ConvolutionAlgorithm alg(KDNN::ConvolutionAlgorithm::AUTO); + return Conv3dFWDFunc1<__fp16>(srcShape, weiShape, dstShape, strides, dilates, paddingL, paddingR, alg); +} + +template +static void gnormRef(Shape shape, SizeType groupInfo, const T *src, T *dst, const T *scale, const T *shift, + bool scaleApply, bool shiftApply, float *mean, float *variance, const float eps, + bool global_stats = false) +{ + SizeType innerSize = 1; + SizeType numDims = shape.GetNumDims(); + if (numDims == 4) { + innerSize = shape[2] * shape[3]; + } else if (numDims == 5) { + innerSize = shape[2] * shape[3] * shape[4]; + } else if (numDims == 3) { + innerSize = shape[2]; + } else { + return; + } + SizeType A = shape[0], B = shape[1]; + SizeType chPerGroup = B / groupInfo; + innerSize *= chPerGroup; + int threadsUsed = (int)ceil((float)(A * groupInfo) * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; + if (!global_stats) { + // mean compute +#pragma omp parallel for collapse(2) num_threads(threadsUsed) + for (SizeType a = 0; a < A; ++a) { + for (SizeType g = 0; g < groupInfo; ++g) { + float sum = 0.0f; + for (SizeType i = 0; i < innerSize; ++i) { + sum += (float)*(src + a * groupInfo * innerSize + g * innerSize + i); + } + *(mean + a * groupInfo + g) = sum / innerSize; + } + } + // variance compute +#pragma omp parallel for collapse(2) num_threads(threadsUsed) + for (SizeType a = 0; a < A; ++a) { + for (SizeType g = 0; g < groupInfo; ++g) { + float sum = 0.0f; + float meanVal = *(mean + a * groupInfo + g); + for (SizeType i = 0; i < innerSize; ++i) { + float diff = (float)*(src + a * groupInfo * innerSize + g * innerSize + i) - meanVal; + sum += diff * diff; + } + *(variance + a * groupInfo + g) = sum / innerSize; + } + } + } + // norm + innerSize /= chPerGroup; +#pragma omp parallel for collapse(2) num_threads(threadsUsed) + for (SizeType a = 0; a < A; ++a) { + for (SizeType g = 0; g < groupInfo; ++g) { + float meanVal = *(mean + a * groupInfo + g); + float invStd = 1.0f / std::sqrt(*(variance + a * groupInfo + g) + eps); + for (SizeType gi = 0; gi < chPerGroup; ++gi) { + int b = g * chPerGroup + gi; + for (SizeType i = 0; i < innerSize; ++i) { + float srcVal = (float)*(src + a * B * innerSize + b * innerSize + i); + float normalized = (srcVal - meanVal) * invStd; + if (scaleApply) { + normalized *= scale[b]; + } + if (shiftApply) { + normalized += shift[b]; + } + *(dst + a * B * innerSize + b * innerSize + i) = static_cast(normalized); + } + } + } + } +} +// forward +template +static bool GnormForwardFunc1(const TensorInfo &srcInfo, const TensorInfo &scaleShiftInfo, SizeType groupInfo, + const TensorInfo &dstInfo, KDNN::NormalizationFlags flags) +{ + KDNN::GroupNormalizationLayerFWD gnormLayer1(srcInfo, scaleShiftInfo, groupInfo, dstInfo, flags); + + SizeType srcSize = srcInfo.GetTotalTensorSize(); + SizeType dstSize = dstInfo.GetTotalTensorSize(); + SizeType statSize = srcInfo.GetDims()[0] * groupInfo; + SizeType scaleSize = scaleShiftInfo.GetTotalTensorSize(); + + T *src = (T *)malloc(srcSize * sizeof(T)); + T *dst = (T *)malloc(srcSize * sizeof(T)); + T *dstRef = (T *)malloc(srcSize * sizeof(T)); + float *mean = (float *)malloc(statSize * sizeof(float)); + float *variance = (float *)malloc(statSize * sizeof(float)); + T *scale = (T *)malloc(scaleSize * sizeof(T)); + T *shift = (T *)malloc(scaleSize * sizeof(T)); + float eps = 1e-5; + if (src == nullptr || dst == nullptr || dstRef == nullptr || mean == nullptr || variance == nullptr || + scale == nullptr || shift == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return false; + } + + bool global_stats = static_cast(flags & KDNN::NormalizationFlags::USE_GLOBAL_STATS); + + // generate random test data + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + int threadsUsed = (int)ceil((float)srcSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < srcSize; ++i) { + *(src + i) = (T)u(e); + } + for (SizeType i = 0; i < scaleSize; ++i) { + *(scale + i) = (T)u(e); + *(shift + i) = (T)u(e); + } + if (global_stats) { + for (SizeType i = 0; i < statSize; ++i) { + *(mean + i) = 1.0f; + *(variance + i) = 0.1f; + } + } + + gnormLayer1.Run(src, dst, scale, shift, mean, variance, true, eps); + bool scaleApply = static_cast(flags & KDNN::NormalizationFlags::USE_SCALE); + bool shiftApply = static_cast(flags & KDNN::NormalizationFlags::USE_SHIFT); + + gnormRef(srcInfo.GetDims(), groupInfo, src, dstRef, scale, shift, scaleApply, shiftApply, mean, variance, eps, + global_stats); + float error = 0.0; +#pragma omp parallel for reduction(+ : error) num_threads(threadsUsed) + for (SizeType i = 0; i < dstSize; ++i) { + error += *(dst + i) - *(dstRef + i); + } + free(src); + free(dst); + free(dstRef); + free(mean); + free(variance); + free(scale); + free(shift); + error = std::abs(error) / dstSize; + + return error < errBound; +} + +bool kudnn_gnorm_01() +{ + Shape shape(4, 128, 18, 320); + SizeType groupInfo = 2; + TensorInfo srcInfo = {shape, TypeF16, KDNN::Layout::ABCD}; + TensorInfo scaleShiftInfo = {{shape[1]}, TypeF16, KDNN::Layout::A}; + TensorInfo dstInfo = {shape, TypeF16, KDNN::Layout::ABCD}; + KDNN::NormalizationFlags flags = KDNN::NormalizationFlags::NONE; + return GnormForwardFunc1<__fp16>(srcInfo, scaleShiftInfo, groupInfo, dstInfo, flags); +} + +template +static void lnormRef(SizeType outerSize, SizeType innerSize, const T *src, T *dst, const T *scale, const T *shift, + bool scaleApply, bool shiftApply, float *mean, float *variance, const float eps, + bool global_stats = false) +{ + int threadsUsed = (int)ceil((float)outerSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; + if (!global_stats) { + // mean compute +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < outerSize; ++i) { + float sum = 0.0f; + for (SizeType j = 0; j < innerSize; ++j) { + sum += (float)*(src + i * innerSize + j); + } + *(mean + i) = sum / innerSize; + } + // variance compute +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < outerSize; ++i) { + float sum = 0.0f; + float meanVal = *(mean + i); + for (SizeType j = 0; j < innerSize; ++j) { + float diff = (float)*(src + i * innerSize + j) - meanVal; + sum += diff * diff; + } + *(variance + i) = sum / innerSize; + } + } + // norm +#pragma omp parallel for collapse(2) num_threads(threadsUsed) + for (SizeType i = 0; i < outerSize; ++i) { + float meanVal = *(mean + i); + float varVal = *(variance + i); + float invStd = 1.0f / std::sqrt(varVal + eps); + for (SizeType j = 0; j < innerSize; ++j) { + float srcVal = (float)*(src + i * innerSize + j); + float normalized = (srcVal - meanVal) * invStd; + if (scaleApply) { + normalized *= scale[j]; + } + if (shiftApply) { + normalized += shift[j]; + } + *(dst + i * innerSize + j) = static_cast(normalized); + } + } +} +// forward +template +static bool LnormForwardFunc1(const TensorInfo &srcInfo, const TensorInfo &statInfo, const TensorInfo &scaleShiftInfo, + const TensorInfo &dstInfo, KDNN::NormalizationFlags flags) +{ + KDNN::NormalizationLayerFWD lnormLayer1(srcInfo, statInfo, scaleShiftInfo, dstInfo, flags); + SizeType srcSize = srcInfo.GetTotalTensorSize(); + SizeType dstSize = dstInfo.GetTotalTensorSize(); + SizeType statSize = statInfo.GetTotalTensorSize(); + SizeType innerSize = scaleShiftInfo.GetTotalTensorSize(); + T *src = (T *)malloc(srcSize * sizeof(T)); + T *dst = (T *)malloc(dstSize * sizeof(T)); + T *dstRef = (T *)malloc(dstSize * sizeof(T)); + float *mean = (float *)malloc(statSize * sizeof(float)); + float *variance = (float *)malloc(statSize * sizeof(float)); + T *scale = (T *)malloc(innerSize * sizeof(T)); + T *shift = (T *)malloc(innerSize * sizeof(T)); + float eps = 1e-5; + if (src == nullptr || dst == nullptr || dstRef == nullptr || mean == nullptr || variance == nullptr || + scale == nullptr || shift == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return false; + } + + bool global_stats = static_cast(flags & KDNN::NormalizationFlags::USE_GLOBAL_STATS); + + // generate random test data + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + int threadsUsed = (int)ceil((float)srcSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < srcSize; ++i) { + *(src + i) = (T)u(e); + } + for (SizeType j = 0; j < innerSize; ++j) { + *(scale + j) = (T)u(e); + *(shift + j) = (T)u(e); + } + if (global_stats) { + for (SizeType i = 0; i < statSize; i++) { + *(mean + i) = 1.0f; + *(variance + i) = 0.1f; + } + } + + float error = 0.0f; + lnormLayer1.Run(src, dst, scale, shift, mean, variance, true, eps); + bool scaleApply = static_cast(flags & KDNN::NormalizationFlags::USE_SCALE); + bool shiftApply = static_cast(flags & KDNN::NormalizationFlags::USE_SHIFT); + lnormRef(statSize, innerSize, src, dstRef, scale, shift, scaleApply, shiftApply, mean, variance, eps, + global_stats); +#pragma omp parallel for reduction(+ : error) num_threads(threadsUsed) + for (SizeType i = 0; i < dstSize; ++i) { + error += *(dst + i) - *(dstRef + i); + } + free(src); + free(dst); + free(dstRef); + free(mean); + free(variance); + free(scale); + free(shift); + error = std::abs(error) / dstSize; + + return error < errBound; +} + +bool kudnn_lnorm_01() +{ + Shape shape(2, 3240, 1152); + TensorInfo srcInfo = {shape, TypeF32, KDNN::Layout::ABC}; + TensorInfo statInfo = {{shape[0], shape[1]}, TypeF32, KDNN::Layout::AB}; + TensorInfo scaleShiftInfo = {{shape[2]}, TypeF32, KDNN::Layout::A}; + TensorInfo dstInfo = {shape, TypeF32, KDNN::Layout::ABC}; + KDNN::NormalizationFlags flags = KDNN::NormalizationFlags::NONE; + return LnormForwardFunc1(srcInfo, statInfo, scaleShiftInfo, dstInfo, flags); +} + +template +static void siluSimple(T *dst, SizeType size) +{ + if (!dst || size == 0) + return; + for (SizeType i = 0; i < size; ++i) { + T sigmoid_x = static_cast(1) / (static_cast(1) + std::exp(static_cast(-dst[i]))); + dst[i] = dst[i] * sigmoid_x; + } +} + +template +static void linearActivationSimple(SizeType batch, SizeType m, SizeType n, SizeType k, const T *src, T *weight, T *dst, + T *bias, KDNN::ActivationFunction kind) +{ + for (SizeType bh = 0; bh < batch; ++bh) { + for (SizeType i = 0; i < m; ++i) { + for (SizeType j = 0; j < n; ++j) { + T sum = 0; + for (SizeType l = 0; l < k; ++l) { + sum += src[bh * m * k + i * k + l] * weight[bh * n * k + n * l + j]; + } + dst[bh * m * n + i * n + j] = sum + bias[bh * m * n + i * n + j]; + } + } + } + if (kind == KDNN::ActivationFunction::SWISH) { + siluSimple(dst, batch * m * n); + } +} + +template +static void floatDataInit(T **src, T **weight, T **dst, T **dstRef, T **bias, SizeType srcTotalSize, + SizeType weightTotalSize, SizeType dstTotalSize) +{ + *src = (T *)malloc(srcTotalSize * sizeof(T)); + *weight = (T *)malloc(weightTotalSize * sizeof(T)); + *dst = (T *)malloc(dstTotalSize * sizeof(T)); + *dstRef = (T *)malloc(dstTotalSize * sizeof(T)); + *bias = (T *)malloc(dstTotalSize * sizeof(T)); + + if (*src == nullptr || *weight == nullptr || *dst == nullptr || *dstRef == nullptr || *bias == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return; + } + std::uniform_real_distribution u(-1, 1); + static std::default_random_engine e(time(NULL)); + + for (SizeType i = 0; i < srcTotalSize; ++i) { + (*src)[i] = (T)u(e); + } + for (SizeType i = 0; i < weightTotalSize; ++i) { + (*weight)[i] = (T)u(e); + } + for (SizeType i = 0; i < dstTotalSize; ++i) { + (*bias)[i] = (T)u(e); + } +} + +static std::pair compareEle(float ref, float diff, KDNN::Element::TypeT type, int K) +{ + float e = 0.0f; + float eps = 0.0f; + std::pair pairEps = {0.0f, 0.0f}; + switch (type) { + case KDNN::Element::TypeT::F16: 
+ eps = 1e-3 * K; + e = (std::fabs(ref) > eps) ? diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + case KDNN::Element::TypeT::BF16: + eps = 1e-2 * K; + e = (std::fabs(ref) > eps) ? diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + case KDNN::Element::TypeT::F32: + eps = 1e-4; + e = (std::fabs(ref) > 1e-4) ? diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + default: + eps = 1; + e = diff; + pairEps = {e, eps}; + break; + } + return pairEps; +} + +template +static float findMaxValue(T *value1, SizeType valueSize1, T *value2, SizeType valueSize2) +{ + T maxVal = value1[0]; + + for (SizeType i = 0; i < valueSize1; ++i) { + maxVal = std::max(maxVal, value1[i]); + } + + for (SizeType i = 0; i < valueSize2; ++i) { + maxVal = std::max(maxVal, value2[i]); + } + return static_cast(maxVal); +} + +template +static bool Test_Template_Function(const KDNN::TensorInfo srcTensor, const KDNN::TensorInfo weightTensor, + const KDNN::TensorInfo dstTensor, const KDNN::TensorInfo biasTensor, float alpha, + float beta, KDNN::ActivationFunction algKind, int numThreads) +{ + KDNN::LinearActivationLayerFWD linearActivationLayerFwd(srcTensor, weightTensor, dstTensor, biasTensor, alpha, beta, + algKind, numThreads); + + T *src = nullptr, *weight = nullptr, *dst = nullptr, *dstRef = nullptr, *bias = nullptr; + floatDataInit(&src, &weight, &dst, &dstRef, &bias, srcTensor.GetTotalTensorSize(), + weightTensor.GetTotalTensorSize(), dstTensor.GetTotalTensorSize()); + + Shape srcShape = srcTensor.GetDims(); + Shape weightShape = weightTensor.GetDims(); + SizeType srcDimNums = srcShape.GetNumDims(); + SizeType weightDimNums = weightShape.GetNumDims(); + SizeType m = srcShape[srcDimNums - 2]; + SizeType n = weightShape[weightDimNums - 1]; + SizeType k = srcShape[srcDimNums - 1]; + SizeType batch = dstTensor.GetTotalTensorSize() / (m * n); + + linearActivationSimple(batch, m, n, k, src, weight, dstRef, bias, algKind); + + linearActivationLayerFwd.Run(src, weight, dst, bias, numThreads); + + float maxValue = findMaxValue(src, batch * m * k, weight, batch * n * k); + + bool flag = true; + for (SizeType i = 0; i < batch * m * n; ++i) { + float diff = std::abs(static_cast(*(dst + i) - *(dstRef + i))); + auto pairEps = compareEle(maxValue, diff, KDNN::Element::MatchType(), k); + + if (std::abs(pairEps.first) > pairEps.second) { + flag = false; + break; + } + } + if (src != nullptr) { + free(src); + } + if (weight != nullptr) { + free(weight); + } + if (dst != nullptr) { + free(dst); + } + if (dstRef != nullptr) { + free(dstRef); + } + if (bias != nullptr) { + free(bias); + } + return flag; +} +bool kudnn_linearActivation_01() +{ + const KDNN::TensorInfo srcTensor = {{3, 2, 110, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCD}; + const KDNN::TensorInfo weightTensor = {{3, 2, 20, 200}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCD}; + const KDNN::TensorInfo dstTensor = {{3, 2, 110, 200}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCD}; + const KDNN::TensorInfo biasTensor = {{3, 2, 110, 200}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCD}; + float alpha = 1.0f; + float beta = 0.0f; + KDNN::ActivationFunction algKind = KDNN::ActivationFunction::SWISH; + int numThreads = 0; + return Test_Template_Function(srcTensor, weightTensor, dstTensor, biasTensor, alpha, beta, algKind, numThreads); +} + +static void linearSimpleFloat(SizeType batch, SizeType m, SizeType n, SizeType k, const float *src, float *weight, + float *dst, float *bias, int transSrc = 0, int transDst = 0) +{ + for (SizeType bh = 
0; bh < batch; ++bh) { + for (SizeType i = 0; i < m; ++i) { + for (SizeType j = 0; j < n; ++j) { + float sum = 0; + for (SizeType l = 0; l < k; ++l) { + if (transSrc == 0 && transDst == 0) { + sum += src[bh * m * k + i * k + l] * weight[bh * n * k + n * l + j]; + } else if (transSrc == 1 && transDst == 0) { + sum += src[bh * m * k + i * k + l] * weight[bh * n * k + j * k + l]; + } else if (transSrc == 0 && transDst == 1) { + sum += src[bh * m * k + l * m + i] * weight[bh * n * k + n * l + j]; + } else { + sum += src[bh * m * k + l * m + i] * weight[bh * n * k + j * k + l]; + } + } + if (bias) { + dst[bh * m * n + i * n + j] = sum + bias[bh * m * n + i * n + j]; + } else { + dst[bh * m * n + i * n + j] = sum; + } + } + } + } +} +template +static void DataInit(T **src, T **weight, T **dst, T **dstRef, T **bias, T **res, T **linearRes, T **linearResRef, + SizeType srcTotalSize, SizeType weightTotalSize, SizeType dstTotalSize, SizeType resTotalSize, + SizeType linearResTotalSize) +{ + *src = (T *)malloc(srcTotalSize * sizeof(T)); + *weight = (T *)malloc(weightTotalSize * sizeof(T)); + *dst = (T *)malloc(dstTotalSize * sizeof(T)); + *dstRef = (T *)malloc(dstTotalSize * sizeof(T)); + *bias = (T *)malloc(dstTotalSize * sizeof(T)); + *res = (T *)malloc(resTotalSize * sizeof(T)); + *linearRes = (T *)malloc(linearResTotalSize * sizeof(T)); + *linearResRef = (T *)malloc(linearResTotalSize * sizeof(T)); + + if (*src == nullptr || *weight == nullptr || *dst == nullptr || *dstRef == nullptr || *bias == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return; + } + std::uniform_real_distribution u(-1, 1); + static std::default_random_engine e(time(NULL)); + + for (SizeType i = 0; i < srcTotalSize; ++i) { + (*src)[i] = (T)u(e); + } + for (SizeType i = 0; i < weightTotalSize; ++i) { + (*weight)[i] = (T)u(e); + } + for (SizeType i = 0; i < dstTotalSize; ++i) { + (*bias)[i] = (T)u(e); + } + + for (SizeType i = 0; i < resTotalSize; ++i) { + (*res)[i] = (T)u(e); + } +} + +static void addFloat(const float *dst, float *res, float *linearRes, float gamma, SizeType size) +{ + for (SizeType i = 0; i < size; i++) { + linearRes[i] = dst[i] + res[i] * gamma; + } +} + +static std::pair LresCompareEle(float ref, float diff, KDNN::Element::TypeT type, SizeType K, SizeType L) +{ + float e = 0.0f; + float eps = 0.0f; + std::pair pairEps = {0.0f, 0.0f}; + switch (type) { + case KDNN::Element::TypeT::F16: + eps = 1e-3 * K * L; + e = (std::fabs(ref) > eps) ? diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + case KDNN::Element::TypeT::BF16: + eps = 1e-2 * K * L; + e = (std::fabs(ref) > eps) ? diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + case KDNN::Element::TypeT::F32: + eps = 1e-4; + e = (std::fabs(ref) > 1e-4) ? 
diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + default: + eps = 1; + e = diff; + pairEps = {e, eps}; + break; + } + return pairEps; +} + +static bool Test_Template_Function_FP32(const KDNN::TensorInfo srcTensor, const KDNN::TensorInfo weightTensor, + const KDNN::TensorInfo dstTensor, const KDNN::TensorInfo biasTensor, + const KDNN::TensorInfo resTensor, const KDNN::TensorInfo linearResTensor, + float alpha, float beta, float gamma, KDNN::ResOpsFunction algKind, + int numThreads, int transSrc = 0, int transDst = 0) +{ + KDNN::LinearResFWD LinearResFWD(srcTensor, weightTensor, dstTensor, biasTensor, resTensor, linearResTensor, alpha, + beta, gamma, algKind, numThreads); + + float *src = nullptr, *weight = nullptr, *dst = nullptr, *dstRef = nullptr, *bias = nullptr, *res = nullptr, + *linearRes = nullptr, *linearResRef = nullptr; + DataInit(&src, &weight, &dst, &dstRef, &bias, &res, &linearRes, &linearResRef, + srcTensor.GetTotalTensorSize(), weightTensor.GetTotalTensorSize(), dstTensor.GetTotalTensorSize(), + resTensor.GetTotalTensorSize(), linearResTensor.GetTotalTensorSize()); + + Shape srcShape = srcTensor.GetDims(); + Shape weightShape = weightTensor.GetDims(); + Shape resShape = resTensor.GetDims(); + SizeType srcDimNums = srcShape.GetNumDims(); + SizeType weightDimNums = weightShape.GetNumDims(); + SizeType resDimNums = resShape.GetNumDims(); + SizeType dst_row = srcShape[srcDimNums - 2]; // m + SizeType dst_col = weightShape[weightDimNums - 1]; // n + SizeType src_col = srcShape[srcDimNums - 1]; // k + SizeType res_col = resShape[resDimNums - 1]; // l + SizeType LinearBatch = dstTensor.GetTotalTensorSize() / (dst_row * dst_col); + SizeType PostOpsBatch = linearResTensor.GetTotalTensorSize() / (dst_row * res_col); + + SizeType L = 1; // compareEle: K *L + + linearSimpleFloat(LinearBatch, dst_row, dst_col, src_col, src, weight, dstRef, bias, transSrc, transDst); + if (algKind == KDNN::ResOpsFunction::RES_MUL) { + L = dst_col; + linearSimpleFloat(PostOpsBatch, dst_row, res_col, dst_col, dstRef, res, linearResRef, nullptr, transSrc, + transDst); + } else if (algKind == KDNN::ResOpsFunction::RES_IDENTIAL) { + addFloat(dstRef, res, linearResRef, 1.0f, linearResTensor.GetTotalTensorSize()); + } else { + addFloat(dstRef, res, linearResRef, gamma, linearResTensor.GetTotalTensorSize()); + } + + LinearResFWD.Run(src, weight, dst, bias, res, linearRes, gamma); + + float maxValue = + findMaxValue(src, srcTensor.GetTotalTensorSize(), weight, weightTensor.GetTotalTensorSize()); + + bool flag = true; + for (SizeType i = 0; i < linearResTensor.GetTotalTensorSize(); ++i) { + float diff = std::abs(static_cast(*(linearRes + i) - *(linearResRef + i))); + auto pairEps = LresCompareEle(maxValue, diff, KDNN::Element::MatchType(), src_col, L); + if (std::abs(pairEps.first) > pairEps.second) { + flag = false; + break; + } + } + + if (src != nullptr) { + free(src); + } + if (weight != nullptr) { + free(weight); + } + if (dst != nullptr) { + free(dst); + } + if (dstRef != nullptr) { + free(dstRef); + } + if (bias != nullptr) { + free(bias); + } + if (res != nullptr) { + free(res); + } + if (linearRes != nullptr) { + free(linearRes); + } + if (linearResRef != nullptr) { + free(linearResRef); + } + return flag; +} + +bool kudnn_linearRes_01() +{ + const KDNN::TensorInfo srcTensor = {{4, 3, 2, 30, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo weightTensor = {{4, 3, 2, 20, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo dstTensor = 
{{4, 3, 2, 30, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo biasTensor = {{4, 3, 2, 30, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo resTensor = {{4, 3, 2, 30, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo linearResTensor = {{4, 3, 2, 30, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + float alpha = 1.0f; + float beta = 0.0f; + float gamma = 2.0f; + KDNN::ResOpsFunction algKind = KDNN::ResOpsFunction::RES_IDENTIAL; + int numThreads = 0; + return Test_Template_Function_FP32(srcTensor, weightTensor, dstTensor, biasTensor, resTensor, linearResTensor, alpha, beta, + gamma, algKind, numThreads); +} + +template +static void rmsnormRef(SizeType outerSize, SizeType innerSize, const T *src, T *dst, const T *scale, bool scaleApply, + float *variance, const float eps, bool global_stats = false) +{ + int threadsUsed = (int)ceil((float)outerSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? MAX_THS : threadsUsed; + // variance compute + if (!global_stats) { +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < outerSize; ++i) { + float sum = 0.0f; + for (SizeType j = 0; j < innerSize; ++j) { + float srcVal = (float)*(src + i * innerSize + j); + sum += srcVal * srcVal; + } + *(variance + i) = sum / innerSize; + } + } + // norm +#pragma omp parallel for collapse(2) num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < outerSize; ++i) { + float varVal = *(variance + i); + float invStd = 1.0f / std::sqrt(varVal + eps); + for (SizeType j = 0; j < innerSize; ++j) { + float srcVal = (float)*(src + i * innerSize + j); + float normalized = srcVal * invStd; + if (scaleApply) { + normalized *= scale[j]; + } + *(dst + i * innerSize + j) = static_cast(normalized); + } + } +} +// forward +template +static bool RnormForwardFunc1(const TensorInfo &srcInfo, const TensorInfo &statInfo, const TensorInfo &scaleInfo, + const TensorInfo &dstInfo, KDNN::NormalizationFlags flags) +{ + KDNN::RMSNormalizationLayerFWD rmsLayer1(srcInfo, statInfo, scaleInfo, dstInfo, flags); + + SizeType srcSize = srcInfo.GetTotalTensorSize(); + SizeType dstSize = dstInfo.GetTotalTensorSize(); + SizeType statSize = statInfo.GetTotalTensorSize(); + SizeType innerSize = scaleInfo.GetTotalTensorSize(); + + T *src = (T *)malloc(srcSize * sizeof(T)); + T *dst = (T *)malloc(dstSize * sizeof(T)); + T *dstRef = (T *)malloc(dstSize * sizeof(T)); + float *variance = (float *)malloc(statSize * sizeof(float)); + T *scale = (T *)malloc(innerSize * sizeof(T)); + float eps = 1e-5; + if (src == nullptr || dst == nullptr || dstRef == nullptr || variance == nullptr || scale == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return false; + } + + bool global_stats = static_cast(flags & KDNN::NormalizationFlags::USE_GLOBAL_STATS); + + // generate random test data + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + int threadsUsed = (int)ceil((float)srcSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < srcSize; ++i) { + *(src + i) = (T)u(e); + } + for (SizeType j = 0; j < innerSize; ++j) { + *(scale + j) = (T)u(e); + } + if (global_stats) { + for (SizeType i = 0; i < statSize; i++) { + *(variance + i) = 0.1f; + } + } + + float error = 0.0; + rmsLayer1.Run(src, dst, scale, variance, true, eps); + bool scaleApply = static_cast(flags & KDNN::NormalizationFlags::USE_SCALE); + rmsnormRef(statSize, innerSize, src, dstRef, scale, scaleApply, variance, eps, global_stats); +#pragma omp parallel for reduction(+ : error) num_threads(threadsUsed) + for (SizeType i = 0; i < dstSize; ++i) { + error += *(dst + i) - *(dstRef + i); + } + free(src); + free(dst); + free(dstRef); + free(variance); + free(scale); + error = std::abs(error) / dstSize; + + return error < errBound; +} + +bool kudnn_rnorm_01() +{ + Shape shape(100, 100); + TensorInfo srcInfo = {shape, TypeF16, KDNN::Layout::AB}; + TensorInfo statInfo = {{shape[0]}, TypeF16, KDNN::Layout::A}; + TensorInfo scaleInfo = {{shape[1]}, TypeF16, KDNN::Layout::A}; + TensorInfo dstInfo = {shape, TypeF16, KDNN::Layout::AB}; + KDNN::NormalizationFlags flags = KDNN::NormalizationFlags::USE_SHIFT; + return RnormForwardFunc1<__fp16>(srcInfo, statInfo, scaleInfo, dstInfo, flags); +} + +void softmaxHelp(float *src, int start, int end) +{ + if (start > end || !src) + return; + + float max_val = src[start]; + for (int i = start + 1; i <= end; ++i) { + if (src[i] > max_val) { + max_val = src[i]; + } + } + float sum_exp = 0.0f; + for (int i = start; i <= end; ++i) { + sum_exp += std::exp(src[i] - max_val); + } + for (int i = start; i <= end; ++i) { + src[i] = std::exp(src[i] - max_val) / sum_exp; + } +} + +static void softmaxSimple(Shape shape, float *src) +{ + SizeType numDims = shape.GetNumDims(); + SizeType outSize = 1; + for (SizeType i = 0; i < numDims - 1; ++i) { + outSize *= shape[i]; + } + SizeType innerSize = shape[numDims - 1]; + + for (SizeType i = 0; i < outSize; ++i) { + softmaxHelp(src, i * innerSize, (i + 1) * innerSize - 1); + } +} + +static void dataInit(float **src, float **dst, float **dstRef, SizeType totalSize) +{ + *src = (float *)malloc(totalSize * sizeof(float)); + *dst = (float *)malloc(totalSize * sizeof(float)); + *dstRef = (float *)malloc(totalSize * sizeof(float)); + + if (*src == nullptr || *dst == nullptr || *dstRef == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return; + } + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + + for (SizeType i = 0; i < totalSize; ++i) { + (*src)[i] = u(e); + } + memcpy(*dstRef, *src, totalSize * sizeof(float)); +} + +static void dataFree(float **src, float **dst, float **dstRef) +{ + free(*src); + free(*dst); + free(*dstRef); + + *src = nullptr; + *dst = nullptr; + *dstRef = nullptr; +} +static bool FWD_3D_FP32(const Shape &shape) +{ + const KDNN::TensorInfo srcTensor = {shape, KDNN::Element::TypeT::F32, KDNN::Layout::ABC}; + const KDNN::TensorInfo dstTensor = {shape, KDNN::Element::TypeT::F32, KDNN::Layout::ABC}; + SizeType axis = 2; + KDNN::SoftmaxLayerFWD softmaxLayerFwd(srcTensor, dstTensor, axis, KDNN::SoftmaxAlgorithmKind::SOFTMAX); + + float *src = nullptr, *dst = nullptr, *dstRef = nullptr; + SizeType totalSize = srcTensor.GetTotalTensorSize(); + dataInit(&src, &dst, &dstRef, totalSize); + softmaxSimple(shape, dstRef); + softmaxLayerFwd.Run(src, dst); + for (SizeType i = 0; i < totalSize; ++i) { + float error = 
std::abs(*(dst + i) - *(dstRef + i));
+        if (error > maxError) {
+            dataFree(&src, &dst, &dstRef);
+            return false;
+        }
+    }
+    dataFree(&src, &dst, &dstRef);
+    return true;
+}
+bool kudnn_softmax_01()
+{
+    Shape shape(20, 40, 50);
+    return FWD_3D_FP32(shape);
+}
\ No newline at end of file
diff --git a/kudnn/src/function.hpp b/kudnn/src/function.hpp
new file mode 100644
index 0000000..c823def
--- /dev/null
+++ b/kudnn/src/function.hpp
@@ -0,0 +1,73 @@
+#include "kdnn.hpp"
+#include <cmath>
+#include <functional>
+#include <iostream>
+#include <random>
+#include <string>
+#include <unordered_map>
+
+static std::unordered_map<std::string, KDNN::Layout> layoutMap = {
+    {"KDNN::Layout::UNDEFINED", KDNN::Layout::UNDEFINED},
+    {"KDNN::Layout::A", KDNN::Layout::A},
+    {"KDNN::Layout::AB", KDNN::Layout::AB},
+    {"KDNN::Layout::BA", KDNN::Layout::BA},
+    {"KDNN::Layout::ABC", KDNN::Layout::ABC},
+    {"KDNN::Layout::ACB", KDNN::Layout::ACB},
+    {"KDNN::Layout::BAC", KDNN::Layout::BAC},
+    {"KDNN::Layout::BCA", KDNN::Layout::BCA},
+    {"KDNN::Layout::CAB", KDNN::Layout::CAB},
+    {"KDNN::Layout::CBA", KDNN::Layout::CBA},
+    {"KDNN::Layout::ABCD", KDNN::Layout::ABCD},
+    {"KDNN::Layout::ABDC", KDNN::Layout::ABDC},
+    {"KDNN::Layout::ACBD", KDNN::Layout::ACBD},
+    {"KDNN::Layout::ACDB", KDNN::Layout::ACDB},
+    {"KDNN::Layout::ADBC", KDNN::Layout::ADBC},
+    {"KDNN::Layout::ADCB", KDNN::Layout::ADCB},
+    {"KDNN::Layout::BACD", KDNN::Layout::BACD},
+    {"KDNN::Layout::BCDA", KDNN::Layout::BCDA},
+    {"KDNN::Layout::CDAB", KDNN::Layout::CDAB},
+    {"KDNN::Layout::CDBA", KDNN::Layout::CDBA},
+    {"KDNN::Layout::DCAB", KDNN::Layout::DCAB},
+    {"KDNN::Layout::ABCDE", KDNN::Layout::ABCDE},
+    {"KDNN::Layout::ABCED", KDNN::Layout::ABCED},
+    {"KDNN::Layout::ABDEC", KDNN::Layout::ABDEC},
+    {"KDNN::Layout::ACBDE", KDNN::Layout::ACBDE},
+    {"KDNN::Layout::ACDEB", KDNN::Layout::ACDEB},
+    {"KDNN::Layout::ADECB", KDNN::Layout::ADECB},
+    {"KDNN::Layout::BACDE", KDNN::Layout::BACDE},
+    {"KDNN::Layout::BCDEA", KDNN::Layout::BCDEA},
+    {"KDNN::Layout::CDEAB", KDNN::Layout::CDEAB},
+    {"KDNN::Layout::CDEBA", KDNN::Layout::CDEBA},
+    {"KDNN::Layout::DECAB", KDNN::Layout::DECAB},
+    {"KDNN::Layout::ROW_MAJOR", KDNN::Layout::ROW_MAJOR},
+    {"KDNN::Layout::COL_MAJOR", KDNN::Layout::COL_MAJOR},
+    {"KDNN::Layout::NCHW", KDNN::Layout::NCHW},
+    {"KDNN::Layout::NHWC", KDNN::Layout::NHWC},
+    {"KDNN::Layout::NCDHW", KDNN::Layout::NCDHW},
+    {"KDNN::Layout::NDHWC", KDNN::Layout::NDHWC},
+    {"KDNN::Layout::OIHW", KDNN::Layout::OIHW},
+    {"KDNN::Layout::HWIO", KDNN::Layout::HWIO},
+    {"KDNN::Layout::HWOI", KDNN::Layout::HWOI},
+    {"KDNN::Layout::OHWI", KDNN::Layout::OHWI},
+    {"KDNN::Layout::OHWO", KDNN::Layout::OHWO},
+    {"KDNN::Layout::IOHW", KDNN::Layout::IOHW},
+};
+
+static std::unordered_map<std::string, KDNN::Element::TypeT> typeTMap = {
+    {"UNDEFINED", KDNN::Element::TypeT::UNDEFINED},
+    {"KDNN::Element::TypeT::F32", KDNN::Element::TypeT::F32},
+    {"KDNN::Element::TypeT::F16", KDNN::Element::TypeT::F16},
+    {"KDNN::Element::TypeT::BF16", KDNN::Element::TypeT::BF16},
+    {"KDNN::Element::TypeT::S32", KDNN::Element::TypeT::S32},
+    {"KDNN::Element::TypeT::S8", KDNN::Element::TypeT::S8},
+    {"KDNN::Element::TypeT::U8", KDNN::Element::TypeT::U8},
+};
+
+bool kudnn_conv_01();
+bool kudnn_conv_02();
+bool kudnn_gnorm_01();
+bool kudnn_lnorm_01();
+bool kudnn_linearActivation_01();
+bool kudnn_linearRes_01();
+bool kudnn_rnorm_01();
+bool kudnn_softmax_01();
\ No newline at end of file
diff --git a/kudnn/src/main.cpp b/kudnn/src/main.cpp
new file mode 100644
index 0000000..226f49e
--- /dev/null
+++ b/kudnn/src/main.cpp
@@ -0,0 +1,45 @@
+#include <iostream>
+#include "function.hpp"
+
+// Maps each test name to its entry point; looked up by the command-line argument below.
+std::unordered_map<std::string, std::function<bool()>> funcMap;
+int main(int argc, char **argv)
+{
+    funcMap["kudnn_conv_01"] = kudnn_conv_01;
+    funcMap["kudnn_conv_02"] = kudnn_conv_02;
+    funcMap["kudnn_gnorm_01"] = kudnn_gnorm_01;
+    funcMap["kudnn_lnorm_01"] = kudnn_lnorm_01;
+    funcMap["kudnn_linearActivation_01"] = kudnn_linearActivation_01;
+    funcMap["kudnn_linearRes_01"] = kudnn_linearRes_01;
+    funcMap["kudnn_rnorm_01"] = kudnn_rnorm_01;
+    funcMap["kudnn_softmax_01"] = kudnn_softmax_01;
+    bool allPass = true;
+
+    std::cout << "start test" << std::endl;
+    if (argc == 2) {
+        if (funcMap.count(argv[1]) > 0) {
+            bool ret = funcMap[argv[1]]();
+            if (!ret) {
+                std::cout << "test " << argv[1] << " FAILED" << std::endl;
+                allPass = false;
+            } else {
+                std::cout << "test " << argv[1] << " PASS" << std::endl;
+            }
+        } else {
+            std::cout << "No test named " << argv[1] << std::endl;
+            allPass = false;
+        }
+    } else if (argc == 1) {
+        for (auto &func : funcMap) {
+            std::cout << "run test " << func.first << " ......" << std::endl;
+            bool ret = func.second();
+            if (!ret) {
+                std::cout << "test " << func.first << " FAILED" << std::endl;
+                allPass = false;
+            } else {
+                std::cout << "test " << func.first << " PASS" << std::endl;
+            }
+        }
+    }
+
+    if (allPass) {
+        std::cout << "All tests passed" << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/kulitho/build.sh b/kulitho/build.sh
new file mode 100644
index 0000000..ec6e69f
--- /dev/null
+++ b/kulitho/build.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+cputype=$(cat /proc/cpuinfo | grep 'CPU part' | awk 'NR==1{print $4}')
+if [[ "$cputype" == "0xD01" ]]; then
+    echo "kulitho: Not supported on this platform"
+else
+    if [ -z "${HPCKIT_PATH}" ]; then
+        echo "Error: Environment variable 'HPCKIT_PATH' is not set!"
+        echo "Please configure HPCKIT_PATH to point to the HPCKit installation directory, for example:"
+        echo "    export HPCKIT_PATH=/opt/HPCKit"
+        echo "Then re-run the build command."
+ exit 1 + fi + echo "HPCKIT_PATH=${HPCKIT_PATH}" + cd ./test + make clean + make +fi diff --git a/kulitho/run.sh b/kulitho/run.sh new file mode 100644 index 0000000..73e5628 --- /dev/null +++ b/kulitho/run.sh @@ -0,0 +1,7 @@ +cputype=$(cat /proc/cpuinfo | grep 'CPU part' | awk 'NR==1{print $4}') +if [[ "$cputype" == "0xD01" ]]; then + echo "kulitho: Not suppot on this platform" +else + cd ./test + ./test_kulitho +fi diff --git a/kulitho/test/Makefile b/kulitho/test/Makefile new file mode 100644 index 0000000..0124f4c --- /dev/null +++ b/kulitho/test/Makefile @@ -0,0 +1,21 @@ +# Makefile +CXX = g++ +CXXFLAGS = -O3 -I$(HPCKIT_PATH)/latest/kulitho/gcc/include +LDFLAGS = -L$(HPCKIT_PATH)/latest/kulitho/gcc/lib -lkulitho -lpthread -lm + +TARGET = test_kulitho +SOURCES = kulitho_test.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +# ========== 主目标 ========== +$(TARGET): $(OBJECTS) + $(CXX) $(OBJECTS) -o $@ $(LDFLAGS) + +# ========== 编译规则 ========== +%.o: %.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + +# ========== 清理 ========== +.PHONY: clean run +clean: + rm -f *.o $(TARGET) \ No newline at end of file diff --git a/kulitho/test/kulitho_test.cpp b/kulitho/test/kulitho_test.cpp new file mode 100644 index 0000000..2604c41 --- /dev/null +++ b/kulitho/test/kulitho_test.cpp @@ -0,0 +1,92 @@ +#include +#include +#include "kulitho_image.h" + +double real_out_upsampling[256] = { + -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, + 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, + 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, + 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, + 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, + 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, + 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, + 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, + 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, + 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, + 0.197507, 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, + 0.187500, 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, + 0.149672, 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, + 0.093750, 0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 
0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750,
+    0.037828, -0.010007, 0.005362, 0.034187, 0.062500, 0.082022, 0.093750, 0.105478, 0.125000, 0.153313, 0.182138, 0.197507, 0.187500, 0.149672, 0.093750, 0.037828
+};
+double real_out_laplacian[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+int verify_upsampling(double *out, size_t rows, size_t cols)
+{
+    for (size_t i = 0; i < rows; i++) {
+        for (size_t j = 0; j < cols; j++) {
+            if (std::fabs(out[i * cols + j] - real_out_upsampling[i * cols + j]) > 1e-5) {
+                // printf("%f, %f\n", out[i * cols + j], real_out_upsampling[i * cols + j]);
+                return 0;
+            }
+        }
+    }
+    return 1;
+}
+int verify_laplacian(double *out, size_t rows, size_t cols)
+{
+    for (size_t i = 0; i < rows; i++) {
+        for (size_t j = 0; j < cols; j++) {
+            if (std::fabs(out[i * cols + j] - real_out_laplacian[i * cols + j]) > 1e-5) {
+                // printf("%f, %f\n", out[i * cols + j], real_out_laplacian[i * cols + j]);
+                return 0;
+            }
+        }
+    }
+    return 1;
+}
+
+int kulitho_upsampling_01()
+{
+    size_t src_rows = 4, src_cols = 4;
+    size_t dst_rows = src_rows * 4;
+    size_t dst_cols = src_cols * 4;
+
+    double *grid_in = new double[src_rows * src_cols];
+    for (size_t i = 0; i < src_rows; ++i) {
+        for (size_t j = 0; j < src_cols; ++j) {
+            grid_in[i * src_cols + j] = j;
+        }
+    }
+    double *grid_out = new double[dst_rows * dst_cols];
+    kulitho::kulitho_FFTupsampling(grid_in, grid_out, src_rows, src_cols, dst_rows, dst_cols, 0, 0);
+
+    if (verify_upsampling(grid_out, dst_rows, dst_cols)) {
+        std::cout << "kulitho upsampling case: PASSED" << std::endl;
+    } else {
+        std::cout << "kulitho upsampling case: FAILED" << std::endl;
+    }
+
+    delete[] grid_in;
+    delete[] grid_out;
+    return 0;
+}
+
+int kulitho_laplacian_01()
+{
+    const size_t rows = 3, cols = 3;
+    double src[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0};
+    double dst[rows * cols];
+
+    kulitho::kulitho_laplacian(src, rows, cols, dst);
+    if (verify_laplacian(dst, rows, cols)) {
+        std::cout << "kulitho laplacian case: PASSED" << std::endl;
+    } else {
+        std::cout << "kulitho laplacian case: FAILED" << std::endl;
+    }
+    return 0;
+}
+
+int main()
+{
+    kulitho_upsampling_01();
+    kulitho_laplacian_01();
+}
\ No newline at end of file
-- Gitee
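
The kudnn test driver added in kudnn/src/main.cpp runs a single case named on the command line, or every registered case when invoked with no arguments. As a usage sketch only (the contents of kudnn/build.sh and kudnn/run.sh are not shown in this hunk, so the binary name test_kudnn and its location are assumptions):

    ./test_kudnn                      # no argument: run every registered kudnn case
    ./test_kudnn kudnn_softmax_01     # run one case by name; unknown names are reported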