diff --git a/kurtm/build.sh b/kurtm/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a837c5c043ce1af06a06ba1a6bd3c334fece6af8
--- /dev/null
+++ b/kurtm/build.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+#
+# Configure and build the KuRTM benchmark in ./test with CMake and make.
+# On any other CPU the script only prints a notice and exits successfully.
+#
+set -e
+
+# Fourth field of the first "CPU part" line in /proc/cpuinfo.
+# NOTE(review): 0xd22 is presumably the part ID of the only supported core - confirm.
+cputype=$(grep 'CPU part' /proc/cpuinfo | awk 'NR==1{print $4}')
+if [[ "$cputype" == "0xd22" ]]; then
+    echo "build KuRTM testsuite.."
+    cmake ./test
+    make
+    echo "build KuRTM testsuite done"
+else
+    echo "Not suppot on this platform"
+fi
diff --git a/kurtm/run.sh b/kurtm/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5a11fe60cade8b2821fec8ad316a85cbd0119354
--- /dev/null
+++ b/kurtm/run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#
+# Copyright (c) Huawei Technologies Co., Ltd. 2024-2025. All rights reserved.
+#
+# Run the KuRTM benchmark built by build.sh on a 128 x 128 x 128 grid.
+# On any other CPU the script only prints a notice and exits successfully.
+#
+set -e
+
+# Fourth field of the first "CPU part" line in /proc/cpuinfo.
+# NOTE(review): 0xd22 is presumably the part ID of the only supported core - confirm.
+cputype=$(grep 'CPU part' /proc/cpuinfo | awk 'NR==1{print $4}')
+if [[ "$cputype" == "0xd22" ]]; then
+    echo "run KuRTM testsuite.."
+    ./benchmark 128 128 128
+    echo "run KuRTM testsuite done"
+    # Reached only when ./benchmark exited with status 0 (set -e aborts otherwise).
+    echo "[ PASSED ] 1 tests"
+else
+    echo "Not suppot on this platform"
+fi
diff --git a/kurtm/test/CMakeLists.txt b/kurtm/test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bb3c046a5bfb9dfa51d95f6f98824788f9c70e66
--- /dev/null
+++ b/kurtm/test/CMakeLists.txt
@@ -0,0 +1,26 @@
+#
+# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+#
+
+cmake_minimum_required(VERSION 3.14)
+project(kurtm_benchmark VERSION 1.0.0)
+
+# benchmark.cpp is C++: test the C++ compiler and extend the C++ flags in both branches.
+if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=linxicore6100 --rtlib=compiler-rt")
+else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8.4-a+sve+sme-f64f64 -ffast-math -fomit-frame-pointer")
+endif()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -pipe -fopenmp")
+
+include_directories("${CMAKE_INSTALL_PREFIX}/include")
+link_directories(${CMAKE_INSTALL_PREFIX}/lib)
+
+add_executable(benchmark benchmark.cpp)
+
+target_link_libraries(benchmark
+    -Wl,--start-group
+    kurtm
+    -Wl,--end-group
+)
diff --git a/kurtm/test/benchmark.cpp b/kurtm/test/benchmark.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..518d9bea0da96d97811788186d9fcfadb2e14985
--- /dev/null
+++ b/kurtm/test/benchmark.cpp
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ */
+
+// NOTE(review): the header names below were reconstructed from the identifiers
+// this file uses - confirm them against the original change.
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <memory>
+#include <omp.h>
+#if defined(__AVX512F__) || defined(__AVX512__)
+#include <immintrin.h>
+#elif defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+#else
+#include <arm_neon.h>
+#endif
+#if defined(__ARM_FEATURE_SME)
+#include <arm_sme.h>
+#endif
+#include "kurtm.h"
+
+/* Minimum wall-clock time, in seconds, that time_func() keeps measuring. */
+static const double kMinMeasureSeconds = 10.0;
+
+/* Number of doubles in the alpha array passed to the kernel constructor (729 = 9 * 9 * 9). */
+static const size_t kAlphaCount = 729;
+
+/* Owning handle for a buffer obtained from malloc() or aligned_alloc(). */
+typedef std::unique_ptr<double, decltype(&free)> DoubleBuffer;
+
+/* Rotate the 64-bit value x left by k bits; k must be in the range 1..63. */
+static inline uint64_t rotl(const uint64_t x, int k) {
+    return (x << k) | (x >> (64 - k));
+}
+
+/*
+ * Return the next value of a deterministic pseudo-random sequence, scaled by
+ * 2^-64 and converted to T. The generator state is a function-local static,
+ * so the sequence is identical on every run and calls are not thread-safe.
+ */
+template <typename T>
+T uniform() {
+    static uint64_t s[2] = { 0x41, 0x29837592 };
+    const uint64_t s0 = s[0];
+    uint64_t s1 = s[1];
+    const uint64_t result = s0 + s1;
+
+    s1 ^= s0;
+    s[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14); // a, b
+    s[1] = rotl(s1, 36); // c
+
+    return result*((1.0/18446744073709551616.0));
+}
+
+/*
+ * Call func once as a warm-up, then time it in batches of 1, 2, 4, ... calls
+ * until at least kMinMeasureSeconds have elapsed.
+ * Returns the average wall-clock time of one timed call, in seconds.
+ */
+template <typename T>
+double time_func(T func) {
+    long long batch = 1; // 64-bit so that doubling cannot overflow for a very fast func
+    func(); // Warm up
+    const double start = omp_get_wtime();
+    double now = start;
+    while (now < start + kMinMeasureSeconds) {
+        for (long long i = 0; i < batch; ++i)
+            func();
+        batch <<= 1;
+        now = omp_get_wtime();
+    }
+    // 1 + 2 + ... + batch/2 calls were timed, which is batch - 1 calls in total.
+    return (now - start) / (batch - 1);
+}
+
+/* Divide x by y and round the quotient up; both operands must be positive. */
+static inline int div_round_up(int x, int y) {
+    return (x + y - 1) / y;
+}
+
+/*
+ * Usage: benchmark <nx> <ny> <nz>
+ *
+ * Fills a padded nx * ny * nz grid of doubles with pseudo-random values, runs
+ * the KuRTM stencil kernel on it repeatedly and prints the average time per
+ * run together with the derived bandwidth and FLOPS figures.
+ * Returns 0 on success and 1 on invalid arguments or failed setup.
+ */
+#if defined(__ARM_FEATURE_SME)
+int main(int argc, char** argv)__arm_streaming {
+#else
+int main(int argc, char** argv) {
+#endif
+    // Checked at run time: an assert() would vanish in NDEBUG builds.
+    if (argc != 4) {
+        fprintf(stderr, "usage: %s <nx> <ny> <nz>\n", argc > 0 ? argv[0] : "benchmark");
+        return 1;
+    }
+    const int nx = atoi(argv[1]);
+    const int ny = atoi(argv[2]);
+    const int nz = atoi(argv[3]);
+    if (nx <= 0 || ny <= 0 || nz <= 0) {
+        fprintf(stderr, "grid sizes must be positive integers\n");
+        return 1;
+    }
+    const int dimension = 2;
+    const int radius = 4;
+    // Padding kept around the nx * ny * nz region on each side of every axis.
+    const int GZ_x = 8;
+    const int GZ_y = 4;
+    const int GZ_z = 4;
+    // x is rounded up to a multiple of 8, which also keeps every buffer size a
+    // multiple of the 64-byte alignment requested from aligned_alloc().
+    const int grid_dim_x = div_round_up(nx, 8)*8 + 2*GZ_x;
+    const int grid_dim_y = ny + 2*GZ_y;
+    const int grid_dim_z = nz + 2*GZ_z;
+    // size_t keeps the element count from overflowing int on large grids.
+    const size_t grid_elems = static_cast<size_t>(grid_dim_x)*grid_dim_y*grid_dim_z;
+    const size_t grid_bytes = sizeof(double)*grid_elems;
+    const int n_threads = omp_get_max_threads();
+    printf("KuRTM test configuration:\n");
+    printf(" datatype: double\n");
+    printf(" dimension: %d\n", dimension);
+    printf(" radius: %d\n", radius);
+    printf(" input grid layout: grid\n");
+    printf(" output grid layout: grid\n");
+    printf(" grid size: %d, %d, %d\n", nx, ny, nz);
+    printf(" tile size: [512, 8, 128]\n");
+    printf(" number of threads: %d\n", n_threads);
+    printf(" brick size: [8, 8, 4]\n");
+    printf(" use alignment: True\n");
+    printf("\nbegin tests:\n");
+    printf("malloc grid...\n");
+
+    DoubleBuffer alpha(static_cast<double*>(malloc(sizeof(double)*kAlphaCount)), &free);
+    DoubleBuffer p0(static_cast<double*>(aligned_alloc(64, grid_bytes)), &free);
+    DoubleBuffer p1(static_cast<double*>(aligned_alloc(64, grid_bytes)), &free);
+    if (!alpha || !p0 || !p1) {
+        fprintf(stderr, "failed to allocate grid memory\n");
+        return 1;
+    }
+
+    printf("initalizing input grid...\n");
+    for (size_t i = 0; i < kAlphaCount; ++i) {
+        alpha.get()[i] = uniform<double>();
+    }
+    for (size_t i = 0; i < grid_elems; ++i) {
+        p0.get()[i] = uniform<double>();
+    }
+    printf("zeroing output grid...\n");
+    memset(p1.get(), 0, grid_bytes);
+
+    kurtm_stencil_factor_t kernel_factor[3] = {KURTM_STENCIL_FACTOR_IMMU, KURTM_STENCIL_FACTOR_IMMU, KURTM_STENCIL_FACTOR_IMMU};
+    kurtm_stencil_direct_t kernel_direct[3] = {KURTM_STENCIL_DIRECT_ON, KURTM_STENCIL_DIRECT_ON, KURTM_STENCIL_DIRECT_OFF};
+    kurtm_stencil_kernel_t *kernel = kurtm_stencil_2dkernel_create(radius, KURTM_STENCIL_TYPE_BOX,
+                                                                   kernel_direct, kernel_factor,
+                                                                   alpha.get());
+
+    int dimSize[3] = {nx, ny, nz};
+    int dimStep[3] = {1, grid_dim_x, grid_dim_x * grid_dim_y};
+    // First element inside the padding. The offset uses the same strides as
+    // dimStep; the z stride is a whole xy plane, not grid_dim_y.
+    const size_t interior = static_cast<size_t>(GZ_x)*dimStep[0]
+                          + static_cast<size_t>(GZ_y)*dimStep[1]
+                          + static_cast<size_t>(GZ_z)*dimStep[2];
+
+    kurtm_tensor_t *in = kurtm_tensor_create(p0.get() + interior, dimSize, dimStep, KURTM_TENSOR_DOUBLE);
+    kurtm_tensor_t *ou = kurtm_tensor_create(p1.get() + interior, dimSize, dimStep, KURTM_TENSOR_DOUBLE);
+    // NOTE(review): assumes the create functions return NULL on failure - confirm in kurtm.h.
+    if (!kernel || !in || !ou) {
+        fprintf(stderr, "failed to create the KuRTM kernel or tensors\n");
+        return 1;
+    }
+
+    printf("running benchmark...\n");
+
+    const double elapsed_time = time_func([&](){
+        kurtm_stencil_run(in, ou, kernel);
+    });
+
+    printf("run complete\n");
+    printf(" elapsed time: %lfs\n", elapsed_time);
+    // NOTE(review): both rates divide by 2^30 rather than 10^9 - confirm that is intended.
+    printf(" bandwidth: %lfGB/s\n", sizeof(double)*nx*ny*nz*2 / (elapsed_time * 1024*1024*1024));
+    printf(" FLOPS: %lfGFLOPS\n", 161.0*nx*ny*nz / (elapsed_time * 1024*1024*1024));
+
+    // NOTE(review): kernel, in and ou are never released; no release function is
+    // visible in this file - confirm whether kurtm.h provides one.
+    return 0;
+}