diff --git a/KuDNN/build.sh b/KuDNN/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..6e13f2846ea8f585e6057e41b287dc723b942db9 --- /dev/null +++ b/KuDNN/build.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# export DNN_INC=~/local/HPCKit/latest/kudnn/bisheng/include +# export DNN_LIB=~/local/HPCKit/latest/kudnn/bisheng/lib +# export BLAS_LIB=~/local/HPCKit/latest/kml/bisheng/lib/sme/kblas/multi + +echo "CXX=${CXX}" +echo "DNN_INC=${DNN_INC}" +echo "DNN_LIB=${DNN_LIB}" +echo "BLAS_LIB=${BLAS_LIB}" + + +${CXX} src/function.cpp src/main.cpp -o function_test \ + -I ${DNN_INC} \ + -L ${DNN_LIB} \ + -L ${BLAS_LIB} \ + -lkblas \ + -lkdnn diff --git a/KuDNN/run.sh b/KuDNN/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..847a2f0243fe4026d7d1568c45ccb769c3e7ac6b --- /dev/null +++ b/KuDNN/run.sh @@ -0,0 +1,8 @@ +./function_test kudnn_linearActivation_01 +./function_test kudnn_linearRes_01 +./function_test kudnn_rnorm_01 +./function_test kudnn_lnorm_01 +./function_test kudnn_softmax_01 +./function_test kudnn_gnorm_01 +./function_test kudnn_conv_01 +./function_test kudnn_conv_02 \ No newline at end of file diff --git a/KuDNN/src/function.cpp b/KuDNN/src/function.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0d929ce8f67b320295559c3991ce4952d8c15ca5 --- /dev/null +++ b/KuDNN/src/function.cpp @@ -0,0 +1,1112 @@ +#include "function.hpp" + +using SizeType = KDNN::SizeType; +using Shape = KDNN::Shape; +using TensorInfo = const KDNN::TensorInfo; +KDNN::Element::TypeT TypeF16 = KDNN::Element::TypeT::F16; +KDNN::Element::TypeT TypeF32 = KDNN::Element::TypeT::F32; +const int MAX_THS = 100; +const float FACTOR_THS = 0.02; +float errBound = 1e-3; +float maxError = 1e-3; + +template +struct typeMap; + +template <> +struct typeMap<__fp16> { + static constexpr KDNN::Element::TypeT val = KDNN::Element::TypeT::F16; +}; + +template <> +struct typeMap { + static constexpr KDNN::Element::TypeT val = KDNN::Element::TypeT::F32; +}; + +template +static void conv2dRef(Shape &srcShape, Shape &weiShape, Shape &dstShape, Shape &strides, Shape &dilates, + Shape &paddingL, Shape &paddingR, const T *src, const T *wei, T *dst, const T *bia) +{ + SizeType N = srcShape[0], IC = srcShape[1], IH = srcShape[2], IW = srcShape[3]; + SizeType KH = weiShape[2], KW = weiShape[3]; + SizeType OC = dstShape[1], OH = dstShape[2], OW = dstShape[3]; + SizeType SH = strides[0], SW = strides[1]; + SizeType DH = dilates[0], DW = dilates[1]; + SizeType PH_L = paddingL[0], PW_L = paddingL[1]; + SizeType PH_R = paddingR[0], PW_R = paddingR[1]; + OH = (IH + PH_L + PH_R - 1 - (KH - 1) * (DH + 1)) / SH + 1; + OW = (IW + PW_L + PW_R - 1 - (KW - 1) * (DW + 1)) / SW + 1; + int threadsUsed = (int)ceil((float)(N * OC * OH * OW) * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
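+    // Heuristic: scale the worker-thread count with the output volume (FACTOR_THS per element),
+    // capped at MAX_THS below. The (DH + 1)/(DW + 1) sampling step means dilates[] holds
+    // dilation minus one in this reference kernel, matching the OH/OW formulas above.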
MAX_THS : threadsUsed; +#pragma omp parallel for collapse(4) num_threads(threadsUsed) + for (SizeType n = 0; n < N; ++n) { + for (SizeType oc = 0; oc < OC; ++oc) { + T biaVal = *(bia + oc); + for (SizeType oh = 0; oh < OH; ++oh) { + for (SizeType ow = 0; ow < OW; ++ow) { + T sum = (T)0.0f; + for (SizeType ic = 0; ic < IC; ++ic) { + for (SizeType kh = 0; kh < KH; ++kh) { + for (SizeType kw = 0; kw < KW; ++kw) { + SizeType srcX = oh * SH + kh * (DH + 1) - PH_L; + SizeType srcY = ow * SW + kw * (DW + 1) - PW_L; + T srcVal = (T)0.0f; + if (srcX >= 0 && srcX < IH && srcY >= 0 && srcY < IW) { + srcVal = *(src + n * IC * IH * IW + ic * IH * IW + srcX * IW + srcY); + } + T weiVal = *(wei + oc * IC * KH * KW + ic * KH * KW + kh * KW + kw); + sum += srcVal * weiVal; + } + } + } + *(dst + n * OC * OH * OW + oc * OH * OW + oh * OW + ow) = sum + biaVal; + } + } + } + } +} + +template +static bool Conv2dFWDFunc1(Shape &srcShape, Shape &weiShape, Shape &dstShape, Shape &strides, Shape &dilates, + Shape &paddingL, Shape &paddingR, KDNN::ConvolutionAlgorithm &alg) +{ + SizeType N = srcShape[0], IC = srcShape[1], IH = srcShape[2], IW = srcShape[3]; + SizeType OC = dstShape[1], OH = dstShape[2], OW = dstShape[3]; + SizeType KH = weiShape[2], KW = weiShape[3]; + const KDNN::TensorInfo srcTensor = {srcShape, typeMap::val, KDNN::Layout::ABCD}; + const KDNN::TensorInfo weightsTensor = {weiShape, typeMap::val, KDNN::Layout::ABCD}; + const KDNN::TensorInfo dstTensor = {dstShape, typeMap::val, KDNN::Layout::ABCD}; + const KDNN::TensorInfo biasTensor = {{OC}, typeMap::val, KDNN::Layout::A}; + + KDNN::ConvolutionLayerFWD convFwdLayer1(srcTensor, weightsTensor, dstTensor, biasTensor, strides, dilates, paddingL, + paddingR, alg); + + SizeType srcSize = N * IC * IH * IW; + SizeType dstSize = N * OC * OH * OW; + SizeType weiSize = OC * IC * KH * KW; + SizeType biaSize = OC; + T *src = (T *)malloc(srcSize * sizeof(T)); + T *dst = (T *)malloc(dstSize * sizeof(T)); + T *dstRef = (T *)malloc(dstSize * sizeof(T)); + T *wei = (T *)malloc(weiSize * sizeof(T)); + T *bia = (T *)malloc(biaSize * sizeof(T)); + if (src == nullptr || dst == nullptr || dstRef == nullptr || wei == nullptr || bia == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return false; + } + // generate random test data + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + int threadsUsed = (int)ceil((float)srcSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < srcSize; ++i) { + *(src + i) = (T)u(e); + } + for (SizeType i = 0; i < weiSize; ++i) { + *(wei + i) = (T)u(e); + } + for (SizeType i = 0; i < biaSize; ++i) { + *(bia + i) = (T)u(e); + } + convFwdLayer1.Run(src, wei, dst, bia); + conv2dRef(srcShape, weiShape, dstShape, strides, dilates, paddingL, paddingR, src, wei, dstRef, bia); + float error = 0.0; +#pragma omp parallel for reduction(+ : error) num_threads(threadsUsed) + for (SizeType i = 0; i < dstSize; ++i) { + error += *(dst + i) - *(dstRef + i); + } + free(src); + free(dst); + free(dstRef); + free(wei); + free(bia); + error = std::abs(error) / dstSize; + + return error < errBound; +} + +bool kudnn_conv_01() +{ + SizeType N = 4, IC = 4, IH = 128, IW = 100; + SizeType OC = 5, OH = 0, OW = 0; + SizeType KH = 3, KW = 3; + Shape strides(1, 1), dilates(1, 1), paddingL(1, 1), paddingR(1, 1); + OH = (IH + paddingL[0] + paddingR[0] - 1 - (KH - 1) * (dilates[0] + 1)) / strides[0] + 1; + OW = (IW + paddingL[1] + paddingR[1] - 1 - (KW - 1) * (dilates[1] + 1)) / strides[1] + 1; + Shape srcShape(N, IC, IH, IW), weiShape(OC, IC, KH, KW), dstShape(N, OC, OH, OW); + KDNN::ConvolutionAlgorithm alg(KDNN::ConvolutionAlgorithm::AUTO); + return Conv2dFWDFunc1(srcShape, weiShape, dstShape, strides, dilates, paddingL, paddingR, alg); +} + +template +static void conv3dRef(Shape &srcShape, Shape &weiShape, Shape &dstShape, Shape &strides, Shape &dilates, + Shape &paddingL, Shape &paddingR, const T *src, const T *wei, T *dst, const T *bia) +{ + SizeType N = srcShape[0], IC = srcShape[1], ID = srcShape[2], IH = srcShape[3], IW = srcShape[4]; + SizeType KD = weiShape[2], KH = weiShape[3], KW = weiShape[4]; + SizeType OC = dstShape[1], OD = dstShape[2], OH = dstShape[2], OW = dstShape[3]; + SizeType SD = strides[0], SH = strides[1], SW = strides[2]; + SizeType DD = dilates[0], DH = dilates[1], DW = dilates[2]; + SizeType PD_L = paddingL[0], PH_L = paddingL[1], PW_L = paddingL[2]; + SizeType PD_R = paddingR[0], PH_R = paddingR[1], PW_R = paddingR[2]; + OD = (ID + PD_L + PD_R - 1 - (KD - 1) * (DD + 1)) / SD + 1; + OH = (IH + PH_L + PH_R - 1 - (KH - 1) * (DH + 1)) / SH + 1; + OW = (IW + PW_L + PW_R - 1 - (KW - 1) * (DW + 1)) / SW + 1; + int threadsUsed = (int)ceil((float)(N * OC * OD * OH * OW) * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for collapse(5) num_threads(threadsUsed) + for (SizeType n = 0; n < N; ++n) { + for (SizeType oc = 0; oc < OC; ++oc) { + T biaVal = *(bia + oc); + for (SizeType od = 0; od < OD; ++od) { + for (SizeType oh = 0; oh < OH; ++oh) { + for (SizeType ow = 0; ow < OW; ++ow) { + T sum = (T)0.0f; + for (SizeType ic = 0; ic < IC; ++ic) { + for (SizeType kd = 0; kd < KD; ++kd) { + for (SizeType kh = 0; kh < KH; ++kh) { + for (SizeType kw = 0; kw < KW; ++kw) { + SizeType srcZ = od * SD + kd * (DD + 1) - PD_L; + SizeType srcX = oh * SH + kh * (DH + 1) - PH_L; + SizeType srcY = ow * SW + kw * (DW + 1) - PW_L; + T srcVal = (T)0.0f; + if (srcZ >= 0 && srcZ < ID && srcX >= 0 && srcX < IH && srcY >= 0 && + srcY < IW) { + srcVal = *(src + n * IC * ID * IH * IW + ic * ID * IH * IW + + srcZ * IH * IW + srcX * IW + srcY); + } + T weiVal = *(wei + oc * IC * KD * KH * KW + ic * KD * KH * KW + kd * KH * KW + + kh * KW + kw); + sum += srcVal * weiVal; + } + } + } + } + *(dst + n * OC * OD * OH * OW + oc * OD * OH * OW + od * OH * OW + oh * OW + ow) = sum + biaVal; + } + } + } + } + } +} + +template +static bool Conv3dFWDFunc1(Shape &srcShape, Shape &weiShape, Shape &dstShape, Shape &strides, Shape &dilates, + Shape &paddingL, Shape &paddingR, KDNN::ConvolutionAlgorithm &alg) +{ + SizeType N = srcShape[0], IC = srcShape[1], ID = srcShape[2], IH = srcShape[3], IW = srcShape[4]; + SizeType OC = dstShape[1], OD = dstShape[2], OH = dstShape[3], OW = dstShape[4]; + SizeType KD = weiShape[2], KH = weiShape[3], KW = weiShape[4]; + const KDNN::TensorInfo srcTensor = {srcShape, typeMap::val, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo weightsTensor = {weiShape, typeMap::val, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo dstTensor = {dstShape, typeMap::val, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo biasTensor = {{OC}, typeMap::val, KDNN::Layout::A}; + + KDNN::ConvolutionLayerFWD convFwdLayer1(srcTensor, weightsTensor, dstTensor, biasTensor, strides, dilates, paddingL, + paddingR, alg); + + SizeType srcSize = N * IC * ID * IH * IW; + SizeType dstSize = N * OC * OD * OH * OW; + SizeType weiSize = OC * IC * KD * KH * KW; + SizeType biaSize = OC; + T *src = (T *)malloc(srcSize * sizeof(T)); + T *dst = (T *)malloc(dstSize * sizeof(T)); + T *dstRef = (T *)malloc(dstSize * sizeof(T)); + T *wei = (T *)malloc(weiSize * sizeof(T)); + T *bia = (T *)malloc(biaSize * sizeof(T)); + if (src == nullptr || dst == nullptr || dstRef == nullptr || wei == nullptr || bia == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return false; + } + // generate random test data + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + int threadsUsed = (int)ceil((float)srcSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < srcSize; ++i) { + *(src + i) = (T)u(e); + } + for (SizeType i = 0; i < weiSize; ++i) { + *(wei + i) = (T)u(e); + } + for (SizeType i = 0; i < biaSize; ++i) { + *(bia + i) = (T)u(e); + } + conv3dRef(srcShape, weiShape, dstShape, strides, dilates, paddingL, paddingR, src, wei, dstRef, bia); + convFwdLayer1.Run(src, wei, dst, bia); + float error = 0.0; +#pragma omp parallel for reduction(+ : error) num_threads(threadsUsed) + for (SizeType i = 0; i < dstSize; ++i) { + error += *(dst + i) - *(dstRef + i); + } + free(src); + free(dst); + free(dstRef); + free(wei); + free(bia); + error = std::abs(error) / dstSize; + + return error < errBound; +} + +bool kudnn_conv_02() +{ + SizeType N = 1, IC = 5, ID = 20, IH = 90, IW = 160; + SizeType OC = 10, OD = 0, OH = 0, OW = 0; + SizeType KD = 1, KH = 1, KW = 1; + Shape strides(1, 1, 1), dilates(0, 0, 0), paddingL(0, 0, 0), paddingR(0, 0, 0); + OD = (ID + paddingL[0] + paddingR[0] - 1 - (KD - 1) * (dilates[0] + 1)) / strides[0] + 1; + OH = (IH + paddingL[1] + paddingR[1] - 1 - (KH - 1) * (dilates[1] + 1)) / strides[1] + 1; + OW = (IW + paddingL[2] + paddingR[2] - 1 - (KW - 1) * (dilates[2] + 1)) / strides[2] + 1; + Shape srcShape(N, IC, ID, IH, IW), weiShape(OC, IC, KD, KH, KW), dstShape(N, OC, OD, OH, OW); + KDNN::ConvolutionAlgorithm alg(KDNN::ConvolutionAlgorithm::AUTO); + return Conv3dFWDFunc1<__fp16>(srcShape, weiShape, dstShape, strides, dilates, paddingL, paddingR, alg); +} + +template +static void gnormRef(Shape shape, SizeType groupInfo, const T *src, T *dst, const T *scale, const T *shift, + bool scaleApply, bool shiftApply, float *mean, float *variance, const float eps, + bool global_stats = false) +{ + SizeType innerSize = 1; + SizeType numDims = shape.GetNumDims(); + if (numDims == 4) { + innerSize = shape[2] * shape[3]; + } else if (numDims == 5) { + innerSize = shape[2] * shape[3] * shape[4]; + } else if (numDims == 3) { + innerSize = shape[2]; + } else { + return; + } + SizeType A = shape[0], B = shape[1]; + SizeType chPerGroup = B / groupInfo; + innerSize *= chPerGroup; + int threadsUsed = (int)ceil((float)(A * groupInfo) * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
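+    // Group-norm reference: mean and variance are computed per (batch, group) over
+    // chPerGroup * spatial elements, unless precomputed statistics are supplied (global_stats).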
MAX_THS : threadsUsed; + if (!global_stats) { + // mean compute +#pragma omp parallel for collapse(2) num_threads(threadsUsed) + for (SizeType a = 0; a < A; ++a) { + for (SizeType g = 0; g < groupInfo; ++g) { + float sum = 0.0f; + for (SizeType i = 0; i < innerSize; ++i) { + sum += (float)*(src + a * groupInfo * innerSize + g * innerSize + i); + } + *(mean + a * groupInfo + g) = sum / innerSize; + } + } + // variance compute +#pragma omp parallel for collapse(2) num_threads(threadsUsed) + for (SizeType a = 0; a < A; ++a) { + for (SizeType g = 0; g < groupInfo; ++g) { + float sum = 0.0f; + float meanVal = *(mean + a * groupInfo + g); + for (SizeType i = 0; i < innerSize; ++i) { + float diff = (float)*(src + a * groupInfo * innerSize + g * innerSize + i) - meanVal; + sum += diff * diff; + } + *(variance + a * groupInfo + g) = sum / innerSize; + } + } + } + // norm + innerSize /= chPerGroup; +#pragma omp parallel for collapse(2) num_threads(threadsUsed) + for (SizeType a = 0; a < A; ++a) { + for (SizeType g = 0; g < groupInfo; ++g) { + float meanVal = *(mean + a * groupInfo + g); + float invStd = 1.0f / std::sqrt(*(variance + a * groupInfo + g) + eps); + for (SizeType gi = 0; gi < chPerGroup; ++gi) { + int b = g * chPerGroup + gi; + for (SizeType i = 0; i < innerSize; ++i) { + float srcVal = (float)*(src + a * B * innerSize + b * innerSize + i); + float normalized = (srcVal - meanVal) * invStd; + if (scaleApply) { + normalized *= scale[b]; + } + if (shiftApply) { + normalized += shift[b]; + } + *(dst + a * B * innerSize + b * innerSize + i) = static_cast(normalized); + } + } + } + } +} +// forward +template +static bool GnormForwardFunc1(const TensorInfo &srcInfo, const TensorInfo &scaleShiftInfo, SizeType groupInfo, + const TensorInfo &dstInfo, KDNN::NormalizationFlags flags) +{ + KDNN::GroupNormalizationLayerFWD gnormLayer1(srcInfo, scaleShiftInfo, groupInfo, dstInfo, flags); + + SizeType srcSize = srcInfo.GetTotalTensorSize(); + SizeType dstSize = dstInfo.GetTotalTensorSize(); + SizeType statSize = srcInfo.GetDims()[0] * groupInfo; + SizeType scaleSize = scaleShiftInfo.GetTotalTensorSize(); + + T *src = (T *)malloc(srcSize * sizeof(T)); + T *dst = (T *)malloc(srcSize * sizeof(T)); + T *dstRef = (T *)malloc(srcSize * sizeof(T)); + float *mean = (float *)malloc(statSize * sizeof(float)); + float *variance = (float *)malloc(statSize * sizeof(float)); + T *scale = (T *)malloc(scaleSize * sizeof(T)); + T *shift = (T *)malloc(scaleSize * sizeof(T)); + float eps = 1e-5; + if (src == nullptr || dst == nullptr || dstRef == nullptr || mean == nullptr || variance == nullptr || + scale == nullptr || shift == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return false; + } + + bool global_stats = static_cast(flags & KDNN::NormalizationFlags::USE_GLOBAL_STATS); + + // generate random test data + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + int threadsUsed = (int)ceil((float)srcSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < srcSize; ++i) { + *(src + i) = (T)u(e); + } + for (SizeType i = 0; i < scaleSize; ++i) { + *(scale + i) = (T)u(e); + *(shift + i) = (T)u(e); + } + if (global_stats) { + for (SizeType i = 0; i < statSize; ++i) { + *(mean + i) = 1.0f; + *(variance + i) = 0.1f; + } + } + + gnormLayer1.Run(src, dst, scale, shift, mean, variance, true, eps); + bool scaleApply = static_cast(flags & KDNN::NormalizationFlags::USE_SCALE); + bool shiftApply = static_cast(flags & KDNN::NormalizationFlags::USE_SHIFT); + + gnormRef(srcInfo.GetDims(), groupInfo, src, dstRef, scale, shift, scaleApply, shiftApply, mean, variance, eps, + global_stats); + float error = 0.0; +#pragma omp parallel for reduction(+ : error) num_threads(threadsUsed) + for (SizeType i = 0; i < dstSize; ++i) { + error += *(dst + i) - *(dstRef + i); + } + free(src); + free(dst); + free(dstRef); + free(mean); + free(variance); + free(scale); + free(shift); + error = std::abs(error) / dstSize; + + return error < errBound; +} + +bool kudnn_gnorm_01() +{ + Shape shape(4, 128, 18, 320); + SizeType groupInfo = 2; + TensorInfo srcInfo = {shape, TypeF16, KDNN::Layout::ABCD}; + TensorInfo scaleShiftInfo = {{shape[1]}, TypeF16, KDNN::Layout::A}; + TensorInfo dstInfo = {shape, TypeF16, KDNN::Layout::ABCD}; + KDNN::NormalizationFlags flags = KDNN::NormalizationFlags::NONE; + return GnormForwardFunc1<__fp16>(srcInfo, scaleShiftInfo, groupInfo, dstInfo, flags); +} + +template +static void lnormRef(SizeType outerSize, SizeType innerSize, const T *src, T *dst, const T *scale, const T *shift, + bool scaleApply, bool shiftApply, float *mean, float *variance, const float eps, + bool global_stats = false) +{ + int threadsUsed = (int)ceil((float)outerSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
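+    // Layer-norm reference: mean and variance per outer row over innerSize elements,
+    // skipped entirely when global_stats is set.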
MAX_THS : threadsUsed; + if (!global_stats) { + // mean compute +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < outerSize; ++i) { + float sum = 0.0f; + for (SizeType j = 0; j < innerSize; ++j) { + sum += (float)*(src + i * innerSize + j); + } + *(mean + i) = sum / innerSize; + } + // variance compute +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < outerSize; ++i) { + float sum = 0.0f; + float meanVal = *(mean + i); + for (SizeType j = 0; j < innerSize; ++j) { + float diff = (float)*(src + i * innerSize + j) - meanVal; + sum += diff * diff; + } + *(variance + i) = sum / innerSize; + } + } + // norm +#pragma omp parallel for collapse(2) num_threads(threadsUsed) + for (SizeType i = 0; i < outerSize; ++i) { + float meanVal = *(mean + i); + float varVal = *(variance + i); + float invStd = 1.0f / std::sqrt(varVal + eps); + for (SizeType j = 0; j < innerSize; ++j) { + float srcVal = (float)*(src + i * innerSize + j); + float normalized = (srcVal - meanVal) * invStd; + if (scaleApply) { + normalized *= scale[j]; + } + if (shiftApply) { + normalized += shift[j]; + } + *(dst + i * innerSize + j) = static_cast(normalized); + } + } +} +// forward +template +static bool LnormForwardFunc1(const TensorInfo &srcInfo, const TensorInfo &statInfo, const TensorInfo &scaleShiftInfo, + const TensorInfo &dstInfo, KDNN::NormalizationFlags flags) +{ + KDNN::NormalizationLayerFWD lnormLayer1(srcInfo, statInfo, scaleShiftInfo, dstInfo, flags); + SizeType srcSize = srcInfo.GetTotalTensorSize(); + SizeType dstSize = dstInfo.GetTotalTensorSize(); + SizeType statSize = statInfo.GetTotalTensorSize(); + SizeType innerSize = scaleShiftInfo.GetTotalTensorSize(); + T *src = (T *)malloc(srcSize * sizeof(T)); + T *dst = (T *)malloc(dstSize * sizeof(T)); + T *dstRef = (T *)malloc(dstSize * sizeof(T)); + float *mean = (float *)malloc(statSize * sizeof(float)); + float *variance = (float *)malloc(statSize * sizeof(float)); + T *scale = (T *)malloc(innerSize * sizeof(T)); + T *shift = (T *)malloc(innerSize * sizeof(T)); + float eps = 1e-5; + if (src == nullptr || dst == nullptr || dstRef == nullptr || mean == nullptr || variance == nullptr || + scale == nullptr || shift == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return false; + } + + bool global_stats = static_cast(flags & KDNN::NormalizationFlags::USE_GLOBAL_STATS); + + // generate random test data + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + int threadsUsed = (int)ceil((float)srcSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < srcSize; ++i) { + *(src + i) = (T)u(e); + } + for (SizeType j = 0; j < innerSize; ++j) { + *(scale + j) = (T)u(e); + *(shift + j) = (T)u(e); + } + if (global_stats) { + for (SizeType i = 0; i < statSize; i++) { + *(mean + i) = 1.0f; + *(variance + i) = 0.1f; + } + } + + float error = 0.0f; + lnormLayer1.Run(src, dst, scale, shift, mean, variance, true, eps); + bool scaleApply = static_cast(flags & KDNN::NormalizationFlags::USE_SCALE); + bool shiftApply = static_cast(flags & KDNN::NormalizationFlags::USE_SHIFT); + lnormRef(statSize, innerSize, src, dstRef, scale, shift, scaleApply, shiftApply, mean, variance, eps, + global_stats); +#pragma omp parallel for reduction(+ : error) num_threads(threadsUsed) + for (SizeType i = 0; i < dstSize; ++i) { + error += *(dst + i) - *(dstRef + i); + } + free(src); + free(dst); + free(dstRef); + free(mean); + free(variance); + free(scale); + free(shift); + error = std::abs(error) / dstSize; + + return error < errBound; +} + +bool kudnn_lnorm_01() +{ + Shape shape(2, 3240, 1152); + TensorInfo srcInfo = {shape, TypeF32, KDNN::Layout::ABC}; + TensorInfo statInfo = {{shape[0], shape[1]}, TypeF32, KDNN::Layout::AB}; + TensorInfo scaleShiftInfo = {{shape[2]}, TypeF32, KDNN::Layout::A}; + TensorInfo dstInfo = {shape, TypeF32, KDNN::Layout::ABC}; + KDNN::NormalizationFlags flags = KDNN::NormalizationFlags::NONE; + return LnormForwardFunc1(srcInfo, statInfo, scaleShiftInfo, dstInfo, flags); +} + +template +static void siluSimple(T *dst, SizeType size) +{ + if (!dst || size == 0) + return; + for (SizeType i = 0; i < size; ++i) { + T sigmoid_x = static_cast(1) / (static_cast(1) + std::exp(static_cast(-dst[i]))); + dst[i] = dst[i] * sigmoid_x; + } +} + +template +static void linearActivationSimple(SizeType batch, SizeType m, SizeType n, SizeType k, const T *src, T *weight, T *dst, + T *bias, KDNN::ActivationFunction kind) +{ + for (SizeType bh = 0; bh < batch; ++bh) { + for (SizeType i = 0; i < m; ++i) { + for (SizeType j = 0; j < n; ++j) { + T sum = 0; + for (SizeType l = 0; l < k; ++l) { + sum += src[bh * m * k + i * k + l] * weight[bh * n * k + n * l + j]; + } + dst[bh * m * n + i * n + j] = sum + bias[bh * m * n + i * n + j]; + } + } + } + if (kind == KDNN::ActivationFunction::SWISH) { + siluSimple(dst, batch * m * n); + } +} + +template +static void floatDataInit(T **src, T **weight, T **dst, T **dstRef, T **bias, SizeType srcTotalSize, + SizeType weightTotalSize, SizeType dstTotalSize) +{ + *src = (T *)malloc(srcTotalSize * sizeof(T)); + *weight = (T *)malloc(weightTotalSize * sizeof(T)); + *dst = (T *)malloc(dstTotalSize * sizeof(T)); + *dstRef = (T *)malloc(dstTotalSize * sizeof(T)); + *bias = (T *)malloc(dstTotalSize * sizeof(T)); + + if (*src == nullptr || *weight == nullptr || *dst == nullptr || *dstRef == nullptr || *bias == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return; + } + std::uniform_real_distribution u(-1, 1); + static std::default_random_engine e(time(NULL)); + + for (SizeType i = 0; i < srcTotalSize; ++i) { + (*src)[i] = (T)u(e); + } + for (SizeType i = 0; i < weightTotalSize; ++i) { + (*weight)[i] = (T)u(e); + } + for (SizeType i = 0; i < dstTotalSize; ++i) { + (*bias)[i] = (T)u(e); + } +} + +static std::pair compareEle(float ref, float diff, KDNN::Element::TypeT type, int K) +{ + float e = 0.0f; + float eps = 0.0f; + std::pair pairEps = {0.0f, 0.0f}; + switch (type) { + case KDNN::Element::TypeT::F16: 
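+            // F16: the tolerance grows with the accumulation length K; relative error is used
+            // when |ref| exceeds eps, otherwise the raw difference is taken.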
+ eps = 1e-3 * K; + e = (std::fabs(ref) > eps) ? diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + case KDNN::Element::TypeT::BF16: + eps = 1e-2 * K; + e = (std::fabs(ref) > eps) ? diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + case KDNN::Element::TypeT::F32: + eps = 1e-4; + e = (std::fabs(ref) > 1e-4) ? diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + default: + eps = 1; + e = diff; + pairEps = {e, eps}; + break; + } + return pairEps; +} + +template +static float findMaxValue(T *value1, SizeType valueSize1, T *value2, SizeType valueSize2) +{ + T maxVal = value1[0]; + + for (SizeType i = 0; i < valueSize1; ++i) { + maxVal = std::max(maxVal, value1[i]); + } + + for (SizeType i = 0; i < valueSize2; ++i) { + maxVal = std::max(maxVal, value2[i]); + } + return static_cast(maxVal); +} + +template +static bool Test_Template_Function(const KDNN::TensorInfo srcTensor, const KDNN::TensorInfo weightTensor, + const KDNN::TensorInfo dstTensor, const KDNN::TensorInfo biasTensor, float alpha, + float beta, KDNN::ActivationFunction algKind, int numThreads) +{ + KDNN::LinearActivationLayerFWD linearActivationLayerFwd(srcTensor, weightTensor, dstTensor, biasTensor, alpha, beta, + algKind, numThreads); + + T *src = nullptr, *weight = nullptr, *dst = nullptr, *dstRef = nullptr, *bias = nullptr; + floatDataInit(&src, &weight, &dst, &dstRef, &bias, srcTensor.GetTotalTensorSize(), + weightTensor.GetTotalTensorSize(), dstTensor.GetTotalTensorSize()); + + Shape srcShape = srcTensor.GetDims(); + Shape weightShape = weightTensor.GetDims(); + SizeType srcDimNums = srcShape.GetNumDims(); + SizeType weightDimNums = weightShape.GetNumDims(); + SizeType m = srcShape[srcDimNums - 2]; + SizeType n = weightShape[weightDimNums - 1]; + SizeType k = srcShape[srcDimNums - 1]; + SizeType batch = dstTensor.GetTotalTensorSize() / (m * n); + + linearActivationSimple(batch, m, n, k, src, weight, dstRef, bias, algKind); + + linearActivationLayerFwd.Run(src, weight, dst, bias, numThreads); + + float maxValue = findMaxValue(src, batch * m * k, weight, batch * n * k); + + bool flag = true; + for (SizeType i = 0; i < batch * m * n; ++i) { + float diff = std::abs(static_cast(*(dst + i) - *(dstRef + i))); + auto pairEps = compareEle(maxValue, diff, KDNN::Element::MatchType(), k); + + if (std::abs(pairEps.first) > pairEps.second) { + flag = false; + break; + } + } + if (src != nullptr) { + free(src); + } + if (weight != nullptr) { + free(weight); + } + if (dst != nullptr) { + free(dst); + } + if (dstRef != nullptr) { + free(dstRef); + } + if (bias != nullptr) { + free(bias); + } + return flag; +} +bool kudnn_linearActivation_01() +{ + const KDNN::TensorInfo srcTensor = {{3, 2, 110, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCD}; + const KDNN::TensorInfo weightTensor = {{3, 2, 20, 200}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCD}; + const KDNN::TensorInfo dstTensor = {{3, 2, 110, 200}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCD}; + const KDNN::TensorInfo biasTensor = {{3, 2, 110, 200}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCD}; + float alpha = 1.0f; + float beta = 0.0f; + KDNN::ActivationFunction algKind = KDNN::ActivationFunction::SWISH; + int numThreads = 0; + return Test_Template_Function(srcTensor, weightTensor, dstTensor, biasTensor, alpha, beta, algKind, numThreads); +} + +static void linearSimpleFloat(SizeType batch, SizeType m, SizeType n, SizeType k, const float *src, float *weight, + float *dst, float *bias, int transSrc = 0, int transDst = 0) +{ + for (SizeType bh = 
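+    // Reference batched matmul used by the LinearRes tests; transSrc/transDst select one of
+    // four src/weight indexing variants.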
0; bh < batch; ++bh) { + for (SizeType i = 0; i < m; ++i) { + for (SizeType j = 0; j < n; ++j) { + float sum = 0; + for (SizeType l = 0; l < k; ++l) { + if (transSrc == 0 && transDst == 0) { + sum += src[bh * m * k + i * k + l] * weight[bh * n * k + n * l + j]; + } else if (transSrc == 1 && transDst == 0) { + sum += src[bh * m * k + i * k + l] * weight[bh * n * k + j * k + l]; + } else if (transSrc == 0 && transDst == 1) { + sum += src[bh * m * k + l * m + i] * weight[bh * n * k + n * l + j]; + } else { + sum += src[bh * m * k + l * m + i] * weight[bh * n * k + j * k + l]; + } + } + if (bias) { + dst[bh * m * n + i * n + j] = sum + bias[bh * m * n + i * n + j]; + } else { + dst[bh * m * n + i * n + j] = sum; + } + } + } + } +} +template +static void DataInit(T **src, T **weight, T **dst, T **dstRef, T **bias, T **res, T **linearRes, T **linearResRef, + SizeType srcTotalSize, SizeType weightTotalSize, SizeType dstTotalSize, SizeType resTotalSize, + SizeType linearResTotalSize) +{ + *src = (T *)malloc(srcTotalSize * sizeof(T)); + *weight = (T *)malloc(weightTotalSize * sizeof(T)); + *dst = (T *)malloc(dstTotalSize * sizeof(T)); + *dstRef = (T *)malloc(dstTotalSize * sizeof(T)); + *bias = (T *)malloc(dstTotalSize * sizeof(T)); + *res = (T *)malloc(resTotalSize * sizeof(T)); + *linearRes = (T *)malloc(linearResTotalSize * sizeof(T)); + *linearResRef = (T *)malloc(linearResTotalSize * sizeof(T)); + + if (*src == nullptr || *weight == nullptr || *dst == nullptr || *dstRef == nullptr || *bias == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return; + } + std::uniform_real_distribution u(-1, 1); + static std::default_random_engine e(time(NULL)); + + for (SizeType i = 0; i < srcTotalSize; ++i) { + (*src)[i] = (T)u(e); + } + for (SizeType i = 0; i < weightTotalSize; ++i) { + (*weight)[i] = (T)u(e); + } + for (SizeType i = 0; i < dstTotalSize; ++i) { + (*bias)[i] = (T)u(e); + } + + for (SizeType i = 0; i < resTotalSize; ++i) { + (*res)[i] = (T)u(e); + } +} + +static void addFloat(const float *dst, float *res, float *linearRes, float gamma, SizeType size) +{ + for (SizeType i = 0; i < size; i++) { + linearRes[i] = dst[i] + res[i] * gamma; + } +} + +static std::pair LresCompareEle(float ref, float diff, KDNN::Element::TypeT type, SizeType K, SizeType L) +{ + float e = 0.0f; + float eps = 0.0f; + std::pair pairEps = {0.0f, 0.0f}; + switch (type) { + case KDNN::Element::TypeT::F16: + eps = 1e-3 * K * L; + e = (std::fabs(ref) > eps) ? diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + case KDNN::Element::TypeT::BF16: + eps = 1e-2 * K * L; + e = (std::fabs(ref) > eps) ? diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + case KDNN::Element::TypeT::F32: + eps = 1e-4; + e = (std::fabs(ref) > 1e-4) ? 
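+            // F32: fixed 1e-4 threshold; relative error for sufficiently large refs,
+            // raw difference otherwise.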
diff / ref : static_cast(diff); + pairEps = {e, eps}; + break; + default: + eps = 1; + e = diff; + pairEps = {e, eps}; + break; + } + return pairEps; +} + +static bool Test_Template_Function_FP32(const KDNN::TensorInfo srcTensor, const KDNN::TensorInfo weightTensor, + const KDNN::TensorInfo dstTensor, const KDNN::TensorInfo biasTensor, + const KDNN::TensorInfo resTensor, const KDNN::TensorInfo linearResTensor, + float alpha, float beta, float gamma, KDNN::ResOpsFunction algKind, + int numThreads, int transSrc = 0, int transDst = 0) +{ + KDNN::LinearResFWD LinearResFWD(srcTensor, weightTensor, dstTensor, biasTensor, resTensor, linearResTensor, alpha, + beta, gamma, algKind, numThreads); + + float *src = nullptr, *weight = nullptr, *dst = nullptr, *dstRef = nullptr, *bias = nullptr, *res = nullptr, + *linearRes = nullptr, *linearResRef = nullptr; + DataInit(&src, &weight, &dst, &dstRef, &bias, &res, &linearRes, &linearResRef, + srcTensor.GetTotalTensorSize(), weightTensor.GetTotalTensorSize(), dstTensor.GetTotalTensorSize(), + resTensor.GetTotalTensorSize(), linearResTensor.GetTotalTensorSize()); + + Shape srcShape = srcTensor.GetDims(); + Shape weightShape = weightTensor.GetDims(); + Shape resShape = resTensor.GetDims(); + SizeType srcDimNums = srcShape.GetNumDims(); + SizeType weightDimNums = weightShape.GetNumDims(); + SizeType resDimNums = resShape.GetNumDims(); + SizeType dst_row = srcShape[srcDimNums - 2]; // m + SizeType dst_col = weightShape[weightDimNums - 1]; // n + SizeType src_col = srcShape[srcDimNums - 1]; // k + SizeType res_col = resShape[resDimNums - 1]; // l + SizeType LinearBatch = dstTensor.GetTotalTensorSize() / (dst_row * dst_col); + SizeType PostOpsBatch = linearResTensor.GetTotalTensorSize() / (dst_row * res_col); + + SizeType L = 1; // compareEle: K *L + + linearSimpleFloat(LinearBatch, dst_row, dst_col, src_col, src, weight, dstRef, bias, transSrc, transDst); + if (algKind == KDNN::ResOpsFunction::RES_MUL) { + L = dst_col; + linearSimpleFloat(PostOpsBatch, dst_row, res_col, dst_col, dstRef, res, linearResRef, nullptr, transSrc, + transDst); + } else if (algKind == KDNN::ResOpsFunction::RES_IDENTIAL) { + addFloat(dstRef, res, linearResRef, 1.0f, linearResTensor.GetTotalTensorSize()); + } else { + addFloat(dstRef, res, linearResRef, gamma, linearResTensor.GetTotalTensorSize()); + } + + LinearResFWD.Run(src, weight, dst, bias, res, linearRes, gamma); + + float maxValue = + findMaxValue(src, srcTensor.GetTotalTensorSize(), weight, weightTensor.GetTotalTensorSize()); + + bool flag = true; + for (SizeType i = 0; i < linearResTensor.GetTotalTensorSize(); ++i) { + float diff = std::abs(static_cast(*(linearRes + i) - *(linearResRef + i))); + auto pairEps = LresCompareEle(maxValue, diff, KDNN::Element::MatchType(), src_col, L); + if (std::abs(pairEps.first) > pairEps.second) { + flag = false; + break; + } + } + + if (src != nullptr) { + free(src); + } + if (weight != nullptr) { + free(weight); + } + if (dst != nullptr) { + free(dst); + } + if (dstRef != nullptr) { + free(dstRef); + } + if (bias != nullptr) { + free(bias); + } + if (res != nullptr) { + free(res); + } + if (linearRes != nullptr) { + free(linearRes); + } + if (linearResRef != nullptr) { + free(linearResRef); + } + return flag; +} + +bool kudnn_linearRes_01() +{ + const KDNN::TensorInfo srcTensor = {{4, 3, 2, 30, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo weightTensor = {{4, 3, 2, 20, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo dstTensor = 
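+    // dst/bias/res/linearRes all share the GEMM output shape: batch {4, 3, 2}, m = 30 (src rows), n = 20 (weight cols).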
{{4, 3, 2, 30, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo biasTensor = {{4, 3, 2, 30, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo resTensor = {{4, 3, 2, 30, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + const KDNN::TensorInfo linearResTensor = {{4, 3, 2, 30, 20}, KDNN::Element::TypeT::F32, KDNN::Layout::ABCDE}; + float alpha = 1.0f; + float beta = 0.0f; + float gamma = 2.0f; + KDNN::ResOpsFunction algKind = KDNN::ResOpsFunction::RES_IDENTIAL; + int numThreads = 0; + return Test_Template_Function_FP32(srcTensor, weightTensor, dstTensor, biasTensor, resTensor, linearResTensor, alpha, beta, + gamma, algKind, numThreads); +} + +template +static void rmsnormRef(SizeType outerSize, SizeType innerSize, const T *src, T *dst, const T *scale, bool scaleApply, + float *variance, const float eps, bool global_stats = false) +{ + int threadsUsed = (int)ceil((float)outerSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? MAX_THS : threadsUsed; + // variance compute + if (!global_stats) { +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < outerSize; ++i) { + float sum = 0.0f; + for (SizeType j = 0; j < innerSize; ++j) { + float srcVal = (float)*(src + i * innerSize + j); + sum += srcVal * srcVal; + } + *(variance + i) = sum / innerSize; + } + } + // norm +#pragma omp parallel for collapse(2) num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < outerSize; ++i) { + float varVal = *(variance + i); + float invStd = 1.0f / std::sqrt(varVal + eps); + for (SizeType j = 0; j < innerSize; ++j) { + float srcVal = (float)*(src + i * innerSize + j); + float normalized = srcVal * invStd; + if (scaleApply) { + normalized *= scale[j]; + } + *(dst + i * innerSize + j) = static_cast(normalized); + } + } +} +// forward +template +static bool RnormForwardFunc1(const TensorInfo &srcInfo, const TensorInfo &statInfo, const TensorInfo &scaleInfo, + const TensorInfo &dstInfo, KDNN::NormalizationFlags flags) +{ + KDNN::RMSNormalizationLayerFWD rmsLayer1(srcInfo, statInfo, scaleInfo, dstInfo, flags); + + SizeType srcSize = srcInfo.GetTotalTensorSize(); + SizeType dstSize = dstInfo.GetTotalTensorSize(); + SizeType statSize = statInfo.GetTotalTensorSize(); + SizeType innerSize = scaleInfo.GetTotalTensorSize(); + + T *src = (T *)malloc(srcSize * sizeof(T)); + T *dst = (T *)malloc(dstSize * sizeof(T)); + T *dstRef = (T *)malloc(dstSize * sizeof(T)); + float *variance = (float *)malloc(statSize * sizeof(float)); + T *scale = (T *)malloc(innerSize * sizeof(T)); + float eps = 1e-5; + if (src == nullptr || dst == nullptr || dstRef == nullptr || variance == nullptr || scale == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return false; + } + + bool global_stats = static_cast(flags & KDNN::NormalizationFlags::USE_GLOBAL_STATS); + + // generate random test data + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + int threadsUsed = (int)ceil((float)srcSize * FACTOR_THS); + threadsUsed = threadsUsed > MAX_THS ? 
MAX_THS : threadsUsed; +#pragma omp parallel for num_threads(threadsUsed) schedule(static) + for (SizeType i = 0; i < srcSize; ++i) { + *(src + i) = (T)u(e); + } + for (SizeType j = 0; j < innerSize; ++j) { + *(scale + j) = (T)u(e); + } + if (global_stats) { + for (SizeType i = 0; i < statSize; i++) { + *(variance + i) = 0.1f; + } + } + + float error = 0.0; + rmsLayer1.Run(src, dst, scale, variance, true, eps); + bool scaleApply = static_cast(flags & KDNN::NormalizationFlags::USE_SCALE); + rmsnormRef(statSize, innerSize, src, dstRef, scale, scaleApply, variance, eps, global_stats); +#pragma omp parallel for reduction(+ : error) num_threads(threadsUsed) + for (SizeType i = 0; i < dstSize; ++i) { + error += *(dst + i) - *(dstRef + i); + } + free(src); + free(dst); + free(dstRef); + free(variance); + free(scale); + error = std::abs(error) / dstSize; + + return error < errBound; +} + +bool kudnn_rnorm_01() +{ + Shape shape(100, 100); + TensorInfo srcInfo = {shape, TypeF16, KDNN::Layout::AB}; + TensorInfo statInfo = {{shape[0]}, TypeF16, KDNN::Layout::A}; + TensorInfo scaleInfo = {{shape[1]}, TypeF16, KDNN::Layout::A}; + TensorInfo dstInfo = {shape, TypeF16, KDNN::Layout::AB}; + KDNN::NormalizationFlags flags = KDNN::NormalizationFlags::USE_SHIFT; + return RnormForwardFunc1<__fp16>(srcInfo, statInfo, scaleInfo, dstInfo, flags); +} + +void softmaxHelp(float *src, int start, int end) +{ + if (start > end || !src) + return; + + float max_val = src[start]; + for (int i = start + 1; i <= end; ++i) { + if (src[i] > max_val) { + max_val = src[i]; + } + } + float sum_exp = 0.0f; + for (int i = start; i <= end; ++i) { + sum_exp += std::exp(src[i] - max_val); + } + for (int i = start; i <= end; ++i) { + src[i] = std::exp(src[i] - max_val) / sum_exp; + } +} + +static void softmaxSimple(Shape shape, float *src) +{ + SizeType numDims = shape.GetNumDims(); + SizeType outSize = 1; + for (SizeType i = 0; i < numDims - 1; ++i) { + outSize *= shape[i]; + } + SizeType innerSize = shape[numDims - 1]; + + for (SizeType i = 0; i < outSize; ++i) { + softmaxHelp(src, i * innerSize, (i + 1) * innerSize - 1); + } +} + +static void dataInit(float **src, float **dst, float **dstRef, SizeType totalSize) +{ + *src = (float *)malloc(totalSize * sizeof(float)); + *dst = (float *)malloc(totalSize * sizeof(float)); + *dstRef = (float *)malloc(totalSize * sizeof(float)); + + if (*src == nullptr || *dst == nullptr || *dstRef == nullptr) { + std::cerr << "Memory allocation failed" << std::endl; + return; + } + std::uniform_real_distribution u(-1, 1); + std::default_random_engine e(time(NULL)); + + for (SizeType i = 0; i < totalSize; ++i) { + (*src)[i] = u(e); + } + memcpy(*dstRef, *src, totalSize * sizeof(float)); +} + +static void dataFree(float **src, float **dst, float **dstRef) +{ + free(*src); + free(*dst); + free(*dstRef); + + *src = nullptr; + *dst = nullptr; + *dstRef = nullptr; +} +static bool FWD_3D_FP32(const Shape &shape) +{ + const KDNN::TensorInfo srcTensor = {shape, KDNN::Element::TypeT::F32, KDNN::Layout::ABC}; + const KDNN::TensorInfo dstTensor = {shape, KDNN::Element::TypeT::F32, KDNN::Layout::ABC}; + SizeType axis = 2; + KDNN::SoftmaxLayerFWD softmaxLayerFwd(srcTensor, dstTensor, axis, KDNN::SoftmaxAlgorithmKind::SOFTMAX); + + float *src = nullptr, *dst = nullptr, *dstRef = nullptr; + SizeType totalSize = srcTensor.GetTotalTensorSize(); + dataInit(&src, &dst, &dstRef, totalSize); + softmaxSimple(shape, dstRef); + softmaxLayerFwd.Run(src, dst); + for (SizeType i = 0; i < totalSize; ++i) { + float error = 
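+        // Per-element absolute error against the softmax reference; return false as soon as it exceeds maxError.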
std::abs(*(dst + i) - *(dstRef + i)); + if (error > maxError) { + return false; + } + } + dataFree(&src, &dst, &dstRef); + return true; +} +bool kudnn_softmax_01() +{ + Shape shape(20, 40, 50); + return FWD_3D_FP32(shape); +} \ No newline at end of file diff --git a/KuDNN/src/function.hpp b/KuDNN/src/function.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c823defc40ebdbcdbddcd9050b1f0b272f8a1ce0 --- /dev/null +++ b/KuDNN/src/function.hpp @@ -0,0 +1,73 @@ +#include "kdnn.hpp" +#include +#include +#include +#include +#include +#include + +static std::unordered_map layoutMap = { + {"KDNN::Layout::UNDEFINED", KDNN::Layout::UNDEFINED}, + {"KDNN::Layout::A", KDNN::Layout::A}, + {"KDNN::Layout::AB", KDNN::Layout::AB}, + {"KDNN::Layout::BA", KDNN::Layout::BA}, + {"KDNN::Layout::ABC", KDNN::Layout::ABC}, + {"KDNN::Layout::ACB", KDNN::Layout::ACB}, + {"KDNN::Layout::BAC", KDNN::Layout::BAC}, + {"KDNN::Layout::BCA", KDNN::Layout::BCA}, + {"KDNN::Layout::CAB", KDNN::Layout::CAB}, + {"KDNN::Layout::CBA", KDNN::Layout::CBA}, + {"KDNN::Layout::ABCD", KDNN::Layout::ABCD}, + {"KDNN::Layout::ABDC", KDNN::Layout::ABDC}, + {"KDNN::Layout::ACBD", KDNN::Layout::ACBD}, + {"KDNN::Layout::ACDB", KDNN::Layout::ACDB}, + {"KDNN::Layout::ADBC", KDNN::Layout::ADBC}, + {"KDNN::Layout::ADCB", KDNN::Layout::ADCB}, + {"KDNN::Layout::BACD", KDNN::Layout::BACD}, + {"KDNN::Layout::BCDA", KDNN::Layout::BCDA}, + {"KDNN::Layout::CDAB", KDNN::Layout::CDAB}, + {"KDNN::Layout::CDBA", KDNN::Layout::CDBA}, + {"KDNN::Layout::DCAB", KDNN::Layout::DCAB}, + {"KDNN::Layout::ABCDE", KDNN::Layout::ABCDE}, + {"KDNN::Layout::ABCED", KDNN::Layout::ABCED}, + {"KDNN::Layout::ABDEC", KDNN::Layout::ABDEC}, + {"KDNN::Layout::ACBDE", KDNN::Layout::ACBDE}, + {"KDNN::Layout::ACDEB", KDNN::Layout::ACDEB}, + {"KDNN::Layout::ADECB", KDNN::Layout::ADECB}, + {"KDNN::Layout::BACDE", KDNN::Layout::BACDE}, + {"KDNN::Layout::BCDEA", KDNN::Layout::BCDEA}, + {"KDNN::Layout::CDEAB", KDNN::Layout::CDEAB}, + {"KDNN::Layout::CDEBA", KDNN::Layout::CDEBA}, + {"KDNN::Layout::DECAB", KDNN::Layout::DECAB}, + {"KDNN::Layout::ROW_MAJOR", KDNN::Layout::ROW_MAJOR}, + {"KDNN::Layout::COL_MAJOR", KDNN::Layout::COL_MAJOR}, + {"KDNN::Layout::NCHW", KDNN::Layout::NCHW}, + {"KDNN::Layout::NHWC", KDNN::Layout::NHWC}, + {"KDNN::Layout::NCDHW", KDNN::Layout::NCDHW}, + {"KDNN::Layout::NDHWC", KDNN::Layout::NDHWC}, + {"KDNN::Layout::OIHW", KDNN::Layout::OIHW}, + {"KDNN::Layout::HWIO", KDNN::Layout::HWIO}, + {"KDNN::Layout::HWOI", KDNN::Layout::HWOI}, + {"KDNN::Layout::OHWI", KDNN::Layout::OHWI}, + {"KDNN::Layout::OHWO", KDNN::Layout::OHWO}, + {"KDNN::Layout::IOHW", KDNN::Layout::IOHW}, +}; + +static std::unordered_map typeTMap = { + {"UNDEFINED", KDNN::Element::TypeT::UNDEFINED}, + {"KDNN::Element::TypeT::F32", KDNN::Element::TypeT::F32}, + {"KDNN::Element::TypeT::F16", KDNN::Element::TypeT::F16}, + {"KDNN::Element::TypeT::BF16", KDNN::Element::TypeT::BF16}, + {"KDNN::Element::TypeT::S32", KDNN::Element::TypeT::S32}, + {"KDNN::Element::TypeT::S8", KDNN::Element::TypeT::S8}, + {"KDNN::Element::TypeT::U8", KDNN::Element::TypeT::U8}, +}; + +bool kudnn_conv_01(); +bool kudnn_conv_02(); +bool kudnn_gnorm_01(); +bool kudnn_lnorm_01(); +bool kudnn_linearActivation_01(); +bool kudnn_linearRes_01(); +bool kudnn_rnorm_01(); +bool kudnn_softmax_01(); \ No newline at end of file diff --git a/KuDNN/src/main.cpp b/KuDNN/src/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..226f49e75bd3a52b6375b80ec756da44f9ffa284 --- /dev/null +++ 
b/KuDNN/src/main.cpp
@@ -0,0 +1,45 @@
+#include <iostream>
+#include "function.hpp"
+
+std::unordered_map<std::string, std::function<bool()>> funcMap;
+int main(int argc, char **argv)
+{
+    funcMap["kudnn_conv_01"] = kudnn_conv_01;
+    funcMap["kudnn_conv_02"] = kudnn_conv_02;
+    funcMap["kudnn_gnorm_01"] = kudnn_gnorm_01;
+    funcMap["kudnn_lnorm_01"] = kudnn_lnorm_01;
+    funcMap["kudnn_linearActivation_01"] = kudnn_linearActivation_01;
+    funcMap["kudnn_linearRes_01"] = kudnn_linearRes_01;
+    funcMap["kudnn_rnorm_01"] = kudnn_rnorm_01;
+    funcMap["kudnn_softmax_01"] = kudnn_softmax_01;
+    bool allPass = true;
+
+    std::cout << "start test" << std::endl;
+    if (argc == 2) {
+        if (funcMap.count(argv[1]) > 0) {
+            bool ret = funcMap[argv[1]]();
+            if (!ret) {
+                std::cout << "test " << argv[1] << " FAILED" << std::endl;
+            } else {
+                std::cout << "test " << argv[1] << " PASS" << std::endl;
+            }
+        } else {
+            std::cout << "No test named " << argv[1] << std::endl;
+        }
+    } else if (argc == 1) {
+        for (auto &func : funcMap) {
+            std::cout << "run test " << func.first << " ......" << std::endl;
+            bool ret = func.second();
+            if (!ret) {
+                std::cout << "test " << func.first << " FAILED" << std::endl;
+                allPass = false;
+            } else {
+                std::cout << "test " << func.first << " PASS" << std::endl;
+            }
+        }
+    }
+
+    if (allPass)
+        std::cout << "All test passed" << std::endl;
+    return 0;
+}
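Note on the build line: function.cpp relies on "#pragma omp parallel for" in the reference kernels and data initializers, but the ${CXX} invocation in build.sh passes no OpenMP flag, so those pragmas are ignored (and the loops run serially) unless the compiler enables OpenMP by default. A minimal sketch of an adjusted compile command, assuming a clang-style driver (as the bisheng paths in the commented exports suggest) where -fopenmp is the accepted spelling:

    ${CXX} src/function.cpp src/main.cpp -o function_test \
        -fopenmp \
        -I ${DNN_INC} \
        -L ${DNN_LIB} \
        -L ${BLAS_LIB} \
        -lkblas \
        -lkdnn

If the HPCKit toolchain ships its own OpenMP runtime, the flag or an extra -L/-l pair may differ; treat this as a starting point rather than a verified build line.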