diff --git a/LMCache-0.1.4.alpha.tar.gz b/LMCache-0.1.4.alpha.tar.gz deleted file mode 100644 index c242010c5335191bfa5298c757768528425427ec..0000000000000000000000000000000000000000 Binary files a/LMCache-0.1.4.alpha.tar.gz and /dev/null differ diff --git a/LMCache-0.3.1.post1.tar.gz b/LMCache-0.3.1.post1.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..e0afd2f6a44fabe6c591b1acf8893f2d866ac754 Binary files /dev/null and b/LMCache-0.3.1.post1.tar.gz differ diff --git a/adapt-mindspore.patch b/adapt-mindspore.patch new file mode 100644 index 0000000000000000000000000000000000000000..a3c06e308f813dae4dc31784645c32782d592209 --- /dev/null +++ b/adapt-mindspore.patch @@ -0,0 +1,4688 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +new file mode 100644 +index 0000000..a77b665 +--- /dev/null ++++ b/CMakeLists.txt +@@ -0,0 +1,252 @@ ++cmake_minimum_required(VERSION 3.16) ++project(lmcache_C) ++ ++# include(CheckCXXcompilerFlag) ++# check_cxx_compiler_flag("-std=c++17", COMPILER_SUPPORTS_CXX17) ++set(CMAKE_CXX_STANDARD 17) ++ ++include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) ++ ++# Suppress potential warnings about unused manually-specified variables ++set(ignoreMe "${VLLM_PYTHON_PATH}") ++ ++# TODO: Add 3.12 back when torch-npu support 3.12 ++set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11") ++ ++find_package(pybind11 REQUIRED) ++ ++# append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") ++set(LMCACHE_ASCEND_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}") ++ ++# find_package(Torch REQUIRED) ++ ++set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") ++message(STATUS "Detected SOC version: ${SOC_VERSION}") ++ ++if (NOT CMAKE_BUILD_TYPE) ++ set(CMAKE_BUILD_TYPE "Release" CACHE STRINGS "Build type Release/Debug (default Release)" FORCE) ++endif() ++ ++if (CMAKE_INSTALL_PREFIX STREQUAL /usr/local) ++ set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRINGS "path to install()") ++endif() ++ ++message(STATUS "ASCEND_CANN_PACKAGE_PATH: ${ASCEND_CANN_PACKAGE_PATH}") ++ ++set(ASCEND_HOME_PATH ${ASCEND_CANN_PACKAGE_PATH}) ++if(EXISTS ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake) ++ set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake) ++elseif(EXISTS ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake) ++ set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake) ++elseif(EXISTS ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake) ++ set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake) ++elseif(EXISTS ${ASCEND_HOME_PATH}/aarch64-linux/tikcpp/ascendc_kernel_cmake) ++ set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/aarch64-linux/tikcpp/ascendc_kernel_cmake) ++else() ++ message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the cann package is installed.") ++endif() ++ ++include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) ++file(GLOB KERNEL_FILES ++${CMAKE_CURRENT_SOURCE_DIR}/csrc/ascend/kernels/*.cpp) ++ ++ascendc_library(lmcache_ascend_kernels SHARED ++ ${KERNEL_FILES} ++) ++ ++ ++set(ASCENDC_TARGET_NAME lmcache_ascend_kernels PARENT_SCOPE) ++set(ASCENDC_TARGET_DIR "${CMAKE_BINARY_DIR}/lib" PARENT_SCOPE) ++ ++if(NOT PYTHON_EXECUTABLE AND CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") ++ find_package(PythonInterp QUIET) # For older CMake or Windows specific cases ++elseif(NOT PYTHON_EXECUTABLE) ++ find_package(Python COMPONENTS Interpreter QUIET) # Modern CMake ++endif() ++ ++if (NOT PYTHON_EXECUTABLE) ++ set(PYTHON_EXECUTABLE "python3") # Fallback to system python3 if not found 
by CMake ++ message(WARNING "PYTHON_EXECUTABLE not found by CMake, falling back to 'python3'. Ensure this is the correct interpreter.") ++endif() ++ ++ ++if (USE_TORCH) ++ If (DEFINED TORCH_NPU_DIR AND IS_DIRECTORY "${TORCH_NPU_DIR}") ++ set(TORCH_NPU_PATH ${TORCH_NPU_DIR}) # This is the base path to the torch_npu package ++ message(STATUS "Using TORCH_NPU_PATH (from TORCH_NPU_DIR via setup.py): ${TORCH_NPU_PATH}") ++ ++ # Check for lib subdirectory ++ if(IS_DIRECTORY "${TORCH_NPU_PATH}/lib") ++ message(STATUS "Found Torch NPU 'lib' directory: ${TORCH_NPU_PATH}/lib") ++ else() ++ message(WARNING "Torch NPU 'lib' directory ('${TORCH_NPU_PATH}/lib') NOT found. Linker may fail.") ++ endif() ++ ++ # Check for include subdirectory ++ if(IS_DIRECTORY "${TORCH_NPU_PATH}/include") ++ message(STATUS "Found Torch NPU 'include' directory: ${TORCH_NPU_PATH}/include") ++ else() ++ message(WARNING "Torch NPU 'include' directory ('${TORCH_NPU_PATH}/include') NOT found.") ++ endif() ++ else() ++ execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import os; import torch_npu; print(os.path.dirname(torch_npu.__file__))" ++ OUTPUT_STRIP_TRAILING_WHITESPACE ++ OUTPUT_VARIABLE TORCH_NPU_PATH ++ ) ++ endif() ++ # make sure TORCH_NPU_PATH is set ++ if(NOT TORCH_NPU_PATH) ++ message(FATAL_ERROR "TORCH_NPU_PATH is not set") ++ endif() ++ message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}") ++else() ++ If (DEFINED MS_NPU_DIR AND IS_DIRECTORY "${MS_NPU_DIR}") ++ set(MS_NPU_PATH ${MS_NPU_DIR}) # This is the base path to the torch_npu package ++ message(STATUS "Using MS_NPU_DIR (from MS_NPU_DIR via setup.py): ${MS_NPU_PATH}") ++ else() ++ execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import os; import mindspore as ms; print(os.path.dirname(ms.__file__))" ++ OUTPUT_STRIP_TRAILING_WHITESPACE ++ OUTPUT_VARIABLE MS_NPU_PATH ++ ) ++ endif() ++ if(NOT MS_NPU_PATH) ++ message(FATAL_ERROR "MS_NPU_PATH is not set") ++ endif() ++ message("MS_NPU_PATH is ${MS_NPU_PATH}") ++endif() ++ ++file(GLOB LMCACHE_ASCEND_SRC ++${CMAKE_CURRENT_SOURCE_DIR}/csrc/ascend/*.cpp) ++pybind11_add_module(lmcache_C ${LMCACHE_ASCEND_SRC}) ++ ++get_filename_component(ASCEND_TOOLKIT_PATH ${ASCEND_HOME_PATH} DIRECTORY) ++get_filename_component(ASCEND_DIR_PATH ${ASCEND_TOOLKIT_PATH} DIRECTORY) ++ ++target_compile_options(lmcache_C PRIVATE -Wno-deprecated-declarations) ++ ++if (USE_TORCH) # This 'if' checks the *CMake variable* USE_TORCH (TRUE/FALSE) ++ # Only add the preprocessor definition if the CMake variable USE_TORCH is TRUE ++ target_compile_definitions(lmcache_C PRIVATE USE_TORCH) ++ message(STATUS "CMake: USE_TORCH variable is TRUE. Adding -DUSE_TORCH compile definition.") ++else() ++ # No need to explicitly undefine it, just don't add the -D flag. ++ find_package(Threads REQUIRED) ++ ++ target_compile_definitions(lmcache_C PRIVATE ++ -D_GLIBCXX_USE_CXX11_ABI=0 ++ ENABLE_FAST_HASH_TABLE=1 ++ CUSTOM_ASCEND_OP ++ ) ++ message(STATUS "CMake: USE_TORCH variable is FALSE. 
NOT adding -DUSE_TORCH compile definition.") ++endif() ++ ++set( ++ INCLUDES ++ ${pybind11_INCLUDE_DIRS} ++ ${PYTHON_INCLUDE_PATH} ++ ${ASCEND_HOME_PATH}/include ++ ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform ++ ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/ascend_hal ++ ${ASCEND_DIR_PATH}/driver/include ++) ++ ++if (USE_TORCH) ++ list(APPEND INCLUDES ${TORCH_INCLUDE_DIRS}) ++ list(APPEND INCLUDES ${TORCH_NPU_PATH}/include) ++else() ++ list(APPEND INCLUDES ${MS_NPU_PATH}) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/third_party) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/third_party/robin_hood_hashing) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/third_party/securec/include) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/core/include) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/core/mindrt/include) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/core/mindrt) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/ops) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/ops/kernel/include) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/ccsrc) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/ccsrc/include) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/ccsrc/minddata/mindrecord/include) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/lib) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/lib/plugin) ++endif() ++ ++ ++target_include_directories(lmcache_C PRIVATE ${INCLUDES}) ++ ++set( ++ PLINK_DIRS ++ ${ASCEND_HOME_PATH}/lib64 ++ ${ASCEND_HOME_PATH}/aarch64-linux/devlib ++ ${ASCEND_HOME_PATH}/aarch64-linux/devlib/linux/x86_64 ++ ${ASCEND_HOME_PATH}/aarch64-linux/devlib/linux/aarch64 ++ /usr/local/Ascend/driver/lib64/driver ++) ++ ++if (USE_TORCH) ++ list(APPEND PLINK_DIRS ${TORCH_NPU_PATH}/lib) ++ set_target_properties(lmcache_C PROPERTIES ++ BUILD_RPATH "${TORCH_NPU_PATH}/lib;$$ORIGIN;$$ORIGIN/lib" ++ INSTALL_RPATH "$${TORCH_NPU_PATH}/lib;$$ORIGIN;$$ORIGIN/lib" ++ INSTALL_RPATH_USE_LINK_PATH TRUE # Important for ensuring RPATH is set correctly on install ++ ) ++else() ++ list(APPEND PLINK_DIRS ${MS_NPU_PATH}/lib) ++ list(APPEND PLINK_DIRS ${MS_NPU_PATH}/lib/plugin) ++ set_target_properties(lmcache_C PROPERTIES ++ BUILD_RPATH "${MS_NPU_PATH}/lib;${MS_NPU_PATH}/lib/plugin;$$ORIGIN;$$ORIGIN/lib" ++ INSTALL_RPATH "${MS_NPU_PATH}/lib;${MS_NPU_PATH}/lib/plugin;$$ORIGIN;$$ORIGIN/lib" ++ INSTALL_RPATH_USE_LINK_PATH TRUE ++ ) ++ ++endif() ++ ++target_link_directories( ++ lmcache_C ++ PRIVATE ++ ${PLINK_DIRS} ++) ++ ++ ++set( ++ PUBLINK_LIBS ++ lmcache_ascend_kernels ++ tiling_api ++ ascendcl ++ platform ++ ascend_hal ++ dcmi ++ drvdsmi_host ++) ++ ++if (USE_TORCH) ++ list(APPEND PUBLINK_LIBS ${TORCH_LIBRARIES}) ++ list(APPEND PUBLINK_LIBS libtorch_npu.so) ++else() ++ list(APPEND PUBLINK_LIBS mindspore_common) ++ list(APPEND PUBLINK_LIBS mindspore_ops) ++ list(APPEND PUBLINK_LIBS mindspore_pyboost) ++ list(APPEND PUBLINK_LIBS mindspore_frontend) ++ list(APPEND PUBLINK_LIBS mindspore_res_manager) ++ list(APPEND PUBLINK_LIBS mindspore_backend_manager) ++ list(APPEND PUBLINK_LIBS mindspore_ops_kernel_common) ++ list(APPEND PUBLINK_LIBS mindspore_runtime_pipeline) ++ list(APPEND PUBLINK_LIBS pthread) ++ # Define the absolute path to libmindspore_ascend.so.2 ++# This path was confirmed by your 'ls -l' output. 
++ set(MINDSPORE_ASCEND_LIB_FULL_PATH "${MS_NPU_PATH}/lib/plugin/libmindspore_ascend.so.2") ++ list(APPEND PUBLINK_LIBS ${MINDSPORE_ASCEND_LIB_FULL_PATH}) ++endif() ++ ++message(STATUS "DEBUG: PUBLINK_LIBS content before linking: ${PUBLINK_LIBS}") ++ ++target_link_libraries( ++ lmcache_C ++ PUBLIC ++ ${PUBLINK_LIBS} ++) ++ ++ ++install(TARGETS lmcache_C lmcache_ascend_kernels DESTINATION ${LMCACHE_ASCEND_INSTALL_PATH}) ++ ++ +diff --git a/cmake/utils.cmake b/cmake/utils.cmake +new file mode 100644 +index 0000000..62078fd +--- /dev/null ++++ b/cmake/utils.cmake +@@ -0,0 +1,133 @@ ++# ++# Attempt to find the python package that uses the same python executable as ++# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. ++# ++macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) ++ file(REAL_PATH ${EXECUTABLE} EXECUTABLE) ++ set(Python_EXECUTABLE ${EXECUTABLE}) ++ find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule) ++ if (NOT Python_FOUND) ++ message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") ++ endif() ++ set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") ++ set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) ++ if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST) ++ message(FATAL_ERROR ++ "Python version (${_VER}) is not one of the supported versions: " ++ "${_SUPPORTED_VERSIONS_LIST}.") ++ endif() ++ message(STATUS "Found python matching: ${EXECUTABLE}.") ++endmacro() ++ ++# ++# Run `EXPR` in python. The standard output of python is stored in `OUT` and ++# has trailing whitespace stripped. If an error is encountered when running ++# python, a fatal message `ERR_MSG` is issued. ++# ++function (run_python OUT EXPR ERR_MSG) ++ execute_process( ++ COMMAND ++ "${PYTHON_EXECUTABLE}" "-c" "${EXPR}" ++ OUTPUT_VARIABLE PYTHON_OUT ++ RESULT_VARIABLE PYTHON_ERROR_CODE ++ ERROR_VARIABLE PYTHON_STDERR ++ OUTPUT_STRIP_TRAILING_WHITESPACE) ++ ++ if(NOT PYTHON_ERROR_CODE EQUAL 0) ++ message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") ++ endif() ++ set(${OUT} ${PYTHON_OUT} PARENT_SCOPE) ++endfunction() ++ ++# Run `EXPR` in python after importing `PKG`. Use the result of this to extend ++# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. ++macro (append_cmake_prefix_path PKG EXPR) ++ run_python(_PREFIX_PATH ++ "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") ++ list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH}) ++endmacro() ++ ++ ++# This cmake function is adapted from vllm /Users/ganyi/workspace/vllm-ascend/cmake/utils.cmake ++# Define a target named `GPU_MOD_NAME` for a single extension. The ++# arguments are: ++# ++# DESTINATION - Module destination directory. ++# LANGUAGE - The GPU language for this module, e.g CUDA, HIP, ++# etc. ++# SOURCES - List of source files relative to CMakeLists.txt ++# directory. ++# ++# Optional arguments: ++# ++# ARCHITECTURES - A list of target GPU architectures in cmake ++# format. ++# Refer `CMAKE_CUDA_ARCHITECTURES` documentation ++# and `CMAKE_HIP_ARCHITECTURES` for more info. ++# ARCHITECTURES will use cmake's defaults if ++# not provided. ++# COMPILE_FLAGS - Extra compiler flags passed to NVCC/hip. ++# INCLUDE_DIRECTORIES - Extra include directories. ++# LIBRARIES - Extra link libraries. ++# WITH_SOABI - Generate library with python SOABI suffix name. ++# USE_SABI - Use python stable api ++# ++# Note: optimization level/debug info is set via cmake build type. 
++# ++function (define_gpu_extension_target GPU_MOD_NAME) ++ cmake_parse_arguments(PARSE_ARGV 1 ++ GPU ++ "WITH_SOABI" ++ "DESTINATION;LANGUAGE;USE_SABI" ++ "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES") ++ ++ # Add hipify preprocessing step when building with HIP/ROCm. ++ if (GPU_LANGUAGE STREQUAL "HIP") ++ hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}") ++ endif() ++ ++ if (GPU_WITH_SOABI) ++ set(GPU_WITH_SOABI WITH_SOABI) ++ else() ++ set(GPU_WITH_SOABI) ++ endif() ++ ++ if (GPU_USE_SABI) ++ Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}") ++ else() ++ Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}") ++ endif() ++ ++ if (GPU_LANGUAGE STREQUAL "HIP") ++ # Make this target dependent on the hipify preprocessor step. ++ add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME}) ++ endif() ++ ++ if (GPU_ARCHITECTURES) ++ set_target_properties(${GPU_MOD_NAME} PROPERTIES ++ ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}") ++ endif() ++ ++ set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17) ++ ++ target_compile_options(${GPU_MOD_NAME} PRIVATE ++ $<$:${GPU_COMPILE_FLAGS}>) ++ ++ target_compile_definitions(${GPU_MOD_NAME} PRIVATE ++ "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}") ++ ++ target_include_directories(${GPU_MOD_NAME} PRIVATE csrc ++ ${GPU_INCLUDE_DIRECTORIES}) ++ ++ target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES}) ++ ++ # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of ++ # dependencies that are not necessary and may not be installed. ++ if (GPU_LANGUAGE STREQUAL "CUDA") ++ target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver) ++ else() ++ target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) ++ endif() ++ ++ install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME}) ++endfunction() +diff --git a/csrc/ascend/dcmi_management.cpp b/csrc/ascend/dcmi_management.cpp +new file mode 100644 +index 0000000..64f3d7b +--- /dev/null ++++ b/csrc/ascend/dcmi_management.cpp +@@ -0,0 +1,35 @@ ++#include "dcmi_management.h" ++#include ++#include ++ ++namespace lmc { ++DCMIManager::DCMIManager() { ++ auto ret = dcmi_init(); ++ this->initialized = false; ++ if (ret == 0) { ++ this->initialized = true; ++ } else { ++ std::cerr << "Unable to initialize DCMI: " << std::to_string(ret) << std::endl; ++ } ++}; ++ ++DCMIManager::~DCMIManager() { ++}; ++ ++std::string DCMIManager::getCPUAffinityFromDeviceId(int8_t cardId, int8_t devId) { ++ if (!this->initialized) { ++ throw std::runtime_error("DCMI not initialized\n"); ++ } ++ ++ char affinityCpu[TOPO_INFO_MAX_LENTH] ={0}; ++ int length = 0; ++ ++ auto dcmiErr = dcmi_get_affinity_cpu_info_by_device_id(static_cast(cardId), ++ static_cast(devId), affinityCpu, &length); ++ if (dcmiErr != 0 ){ ++ throw std::runtime_error("Error in getting affinity CPU info: " + std::to_string(dcmiErr) + "\n"); ++ } ++ ++ return std::string(affinityCpu, length); ++} ++} +\ No newline at end of file +diff --git a/csrc/ascend/dcmi_management.h b/csrc/ascend/dcmi_management.h +new file mode 100644 +index 0000000..83dcf05 +--- /dev/null ++++ b/csrc/ascend/dcmi_management.h +@@ -0,0 +1,32 @@ ++#pragma once ++#include "dcmi_interface_api.h" ++#include ++ ++namespace lmc { ++class DCMIManager { ++private: ++ DCMIManager(); ++ ++ // Delete Copy constructor and assignment operator ++ DCMIManager(const DCMIManager&) = delete; ++ DCMIManager& 
operator=(const DCMIManager&) = delete; ++ DCMIManager(DCMIManager&&) = delete; ++ DCMIManager& operator=(DCMIManager&&) = delete; ++ ++ bool initialized; ++ std::string cpuAffinity; ++ ++public: ++ static DCMIManager& GetInstance() ++ { ++ static DCMIManager instance; ++ return instance; ++ } ++ ~DCMIManager(); ++ ++ // NOTE: at the moment we assume card and devId are the same. ++ // there might be scenario you won't have the same card and devId. ++ // we should indeed do this properly ? ++ std::string getCPUAffinityFromDeviceId(int8_t cardId, int8_t devId); ++}; ++} +diff --git a/csrc/ascend/framework_hal.h b/csrc/ascend/framework_hal.h +new file mode 100644 +index 0000000..5d22d91 +--- /dev/null ++++ b/csrc/ascend/framework_hal.h +@@ -0,0 +1,78 @@ ++#pragma once ++ ++#ifdef USE_TORCH ++ #include ++ #include ++#else ++ #include ++ #include ++ #include ++ #include ++ #include ++#endif ++ ++ ++namespace py = pybind11; ++namespace framework_hal { ++#ifdef USE_TORCH ++#else ++// NOTE (Gingfung): This function encapsulates the logic of calling MindSpore's Python API ++// The reason we had to do this were because when I directly included the mindspore runtime apis for building, ++// it kept erroring with linking issues. I was in touch with a couple of MindSpore guys but never really got it working. ++// Anyway, we will revisit using Mindspore runtime API once we are on 2.7. ++int32_t _get_device_id_from_mindspore_hal_python_api_impl() { ++ int device_id = -1; ++ ++ try { ++ py::module_ ms = py::module_::import("mindspore"); ++ py::module_ hal = ms.attr("hal"); ++ py::object current_stream_obj = hal.attr("current_stream")(); ++ py::object device_id_py_obj = current_stream_obj.attr("device_id"); ++ device_id = device_id_py_obj.cast(); ++ ++ // NOTE: Mindspore current_device_id starts from 0 no matter what, so we should shift the device id ++ // iff users specify the ASCEND_RT_VISIBLE_DEVICES ++ const char* env_visible_devices_p = std::getenv("ASCEND_RT_VISIBLE_DEVICES"); ++ if (env_visible_devices_p != nullptr) { ++ std::string env_visible_devices = env_visible_devices_p; ++ std::vector list_visible_devices; ++ std::stringstream ss(env_visible_devices); ++ std::string item; ++ while (std::getline(ss, item, ',')) { ++ list_visible_devices.push_back(std::stoi(item)); ++ } ++ std::sort(list_visible_devices.begin(), list_visible_devices.end()); ++ // from what I have seen, there are two cases: ++ // 1. no hccl, we just use current_device, even though we have specify the ASCEND_RT_VISIBLE_DEVICES ++ // 2. hccl, and we use current_device that seems to be correct ++ // for case 2, since the current_device would have been correct anyway, obtaining from the list would be fine. ++ // for case 1, we have shifted the device to the RT_VISIBLE_DEVICES, so it should be corrected. ++ device_id = list_visible_devices[device_id]; ++ } ++ ++ } catch (const py::error_already_set &e) { ++ std::cerr << "C++ Internal: Python error in _get_device_id_from_mindspore_hal_python_api_impl: " << e.what() << std::endl; ++ PyErr_Print(); // Print Python traceback ++ device_id = -1; ++ } catch (const std::exception &e) { ++ std::cerr << "C++ Internal: Standard C++ exception in _get_device_id_from_mindspore_hal_python_api_impl: " << e.what() << std::endl; ++ device_id = -1; ++ } ++ return device_id; ++} ++#endif ++ ++ ++int8_t GetDeviceIdx() { ++#ifdef USE_TORCH ++ return c10_npu::getCurrentNPUStream().device_index(); ++#else ++ // FIXME: should be always within 127 ? 
++ auto devId = _get_device_id_from_mindspore_hal_python_api_impl(); ++ if (devId == -1) { ++ throw std::runtime_error("Failed to get device ID from MindSpore's Python API."); ++ } ++ return devId; ++#endif ++}; ++} // namespace framework_hal +\ No newline at end of file +diff --git a/csrc/ascend/kernels/paged_kv_copy.cpp b/csrc/ascend/kernels/paged_kv_copy.cpp +new file mode 100644 +index 0000000..d36b415 +--- /dev/null ++++ b/csrc/ascend/kernels/paged_kv_copy.cpp +@@ -0,0 +1,170 @@ ++/* NOTE: tested fp16, but not the rest yet. */ ++#include "kernel_operator.h" ++#include ++#include "types.h" ++#include "utils.h" ++ ++template class PagedKVCopy { ++ using local_scalar_t = AscendC::LocalTensor; ++ ++public: ++ __aicore__ inline PagedKVCopy() ++ { ++ } ++ ++ __aicore__ inline void init(GM_ADDR pagedKVCaches, GM_ADDR cacheTensor, GM_ADDR slotmappings, ++ const int64_t numPages, const int64_t hiddenDims, const int32_t pagedSize, ++ const int32_t numLayers, const int32_t numTokensChunk, const bool page2L, ++ AscendC::TPipe *pipe) ++ { ++ this->pipe_ = pipe; ++ this->numLayers_ = numLayers; ++ this->numPages_ = numPages; ++ this->hiddenDims_ = hiddenDims; ++ this->numTokensChunk_ = numTokensChunk; ++ this->pagedSize_ = pagedSize; ++ this->page2L_ = page2L; ++ this->invalid_ = false; ++ // TODO: Not sure whether we need double buffering since we are mainly copying data ++ // might only make sense if we have compute ++ this->pipe_->InitBuffer(pagedTokenQue_, 2, this->hiddenDims_*sizeof(scalar_t)); ++ } ++ ++ __aicore__ inline void updateMemOffset(__gm__ uint8_t *pagedKVCaches, __gm__ uint8_t* cacheTensor, ++ __gm__ uint8_t *slotmappings, const int tokenIdx, ++ const int kvIdx, const int layerIdx) ++ { ++ __gm__ int32_t *slotmappingPtr = reinterpret_cast<__gm__ int32_t*>(slotmappings); ++ int64_t slot = slotmappingPtr[tokenIdx]; ++ if (slot == -1) { ++ this->invalid_ = true; ++ return; ++ } ++ // its a pointer within the GM addr space, that point to another GM addr space ++ __gm__ uint8_t * __gm__ *pagedKVCachesPtr = reinterpret_cast<__gm__ uint8_t* __gm__ *>(pagedKVCaches); ++ ++ // getting the right ptr to the paged kvcache layer ++ __gm__ uint8_t *pagedLayerKVCaches = pagedKVCachesPtr[layerIdx]; ++ ++ int64_t pagedIdx = slot / this->pagedSize_; ++ int64_t withinPagedIdx = slot % this->pagedSize_; ++ ++ int64_t pagedIdxOffset = kvIdx * this->numPages_ * this->pagedSize_ * this->hiddenDims_ + ++ pagedIdx * this->pagedSize_ * this->hiddenDims_ + ++ withinPagedIdx * this->hiddenDims_; ++ ++ int64_t dstTensorIdxOffset = kvIdx * this->numLayers_ * this->numTokensChunk_ * this->hiddenDims_ + ++ layerIdx * this->numTokensChunk_ * this->hiddenDims_ + ++ tokenIdx * this->hiddenDims_; ++ ++ this->pagedTokenGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(pagedLayerKVCaches) + pagedIdxOffset, ++ this->hiddenDims_); ++ this->lmcBufferGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(cacheTensor) + dstTensorIdxOffset, ++ this->hiddenDims_); ++ } ++ ++ __aicore__ inline void processFunc() { ++ if (this->invalid_) { ++ return; ++ } ++ // 1. Alloc Tensor for local page ++ local_scalar_t hiddenDimTensor = pagedTokenQue_.AllocTensor(); ++ ++ // 2. copy from global tensor into local ++ if (this->page2L_) { ++ AscendC::DataCopy(hiddenDimTensor, this->pagedTokenGlobal_, this->hiddenDims_); ++ } else { ++ AscendC::DataCopy(hiddenDimTensor, this->lmcBufferGlobal_, this->hiddenDims_); ++ } ++ ++ // 3. enque vecin ++ pagedTokenQue_.EnQue(hiddenDimTensor); ++ // 4. 
deque vecin, possible to reuse due to QueBind ++ hiddenDimTensor = pagedTokenQue_.DeQue(); ++ ++ // 5. datacopy into GM ++ if (this->page2L_) { ++ AscendC::DataCopy(this->lmcBufferGlobal_, hiddenDimTensor, this->hiddenDims_); ++ } else { ++ AscendC::DataCopy(this->pagedTokenGlobal_, hiddenDimTensor, this->hiddenDims_); ++ } ++ ++ // 6. free alloced Tensor ++ pagedTokenQue_.FreeTensor(hiddenDimTensor); ++ } ++ ++ __aicore__ inline void reset() { ++ this->invalid_ = false; ++ } ++ ++private: ++ AscendC::TPipe *pipe_; ++ // copying from NPUBuffer to the Chunked GM Tensor ++ AscendC::TQueBind pagedTokenQue_; ++ ++ // [layers * [kvs, numPages * pagedSize, heads*headsize]] ++ AscendC::GlobalTensor pagedTokenGlobal_; ++ // [kvs, layers, numTokensChunk, heads*headsize] ++ AscendC::GlobalTensor lmcBufferGlobal_; ++ int64_t numPages_; // num vllm npu blocks ++ int32_t pagedSize_; // per npu block tokens ++ int32_t numLayers_; // num layers ++ int64_t hiddenDims_; // heads * headsize ++ int64_t hiddenDimsAligned_; // heads *headsize aligned to datablock byte ++ int32_t numTokensChunk_; // num tokens in the cache tensor chunk ++ bool page2L_; // whether the direction of copy is from page to lmc ++ bool invalid_; // keep track of whether current iter is invalid ++}; ++ ++#define PAGED_KV_COPY_TYPE_DECLARE(TYPE) \ ++ extern "C" __global__ __aicore__ void paged_kv_copy_##TYPE( \ ++ __gm__ uint8_t* pagedKVCaches, __gm__ uint8_t* dstCacheTensor, __gm__ uint8_t* slotmappings, \ ++ const int64_t numPages, const int64_t hiddenDims, const int32_t pagedSize, \ ++ const int32_t kvs, const int32_t numLayers, const int32_t numTokensChunk, const int coreNum, \ ++ const bool page2L) \ ++ { \ ++ AscendC::TPipe pipe; \ ++ PagedKVCopy op{}; \ ++ op.init(pagedKVCaches, dstCacheTensor, slotmappings, numPages, hiddenDims, pagedSize, \ ++ numLayers, numTokensChunk, page2L, &pipe); \ ++ int64_t bIdx = AscendC::GetBlockIdx(); \ ++ for (int64_t i = bIdx; i < numTokensChunk; i+=coreNum) { \ ++ for (int32_t kvIdx = 0; kvIdx < kvs; kvIdx ++) { \ ++ for (int32_t layerIdx = 0; layerIdx < numLayers; layerIdx++) { \ ++ op.reset(); \ ++ op.updateMemOffset(pagedKVCaches, dstCacheTensor, slotmappings, i, kvIdx, layerIdx); \ ++ op.processFunc(); \ ++ } \ ++ } \ ++ } \ ++ } ++ ++// Declare support kernel entry ++PAGED_KV_COPY_TYPE_DECLARE(half); ++PAGED_KV_COPY_TYPE_DECLARE(bfloat16_t); ++PAGED_KV_COPY_TYPE_DECLARE(int8_t); ++ ++namespace lmc_ops { ++ ++#define PAGED_KV_COPY_KERNEL_CALL(TYPE) \ ++ paged_kv_copy_##TYPE<<>>(pagedKVCaches, dstCacheTensor, slotmappings, numPages, \ ++ hiddenDims, pagedSize, kvs, numLayers, numTokensChunk, \ ++ aivNum, page2L); ++ ++extern void paged_kv_copy_impl(vllm_ascend::AscendType type, uint32_t blockDim, void *stream, uint8_t *pagedKVCaches, uint8_t *dstCacheTensor, ++ uint8_t *slotmappings, const int64_t numPages, const int64_t hiddenDims, ++ const int32_t pagedSize, const int32_t kvs, const int32_t numLayers, const int32_t numTokensChunk, ++ const int aivNum, const bool page2L) ++{ ++ if (type == vllm_ascend::AscendType::FP16) { ++ PAGED_KV_COPY_KERNEL_CALL(half); ++ } else if (type == vllm_ascend::AscendType::BF16) { ++ PAGED_KV_COPY_KERNEL_CALL(bfloat16_t); ++ } else if (type == vllm_ascend::AscendType::INT8) { ++ PAGED_KV_COPY_KERNEL_CALL(int8_t); ++ } else { ++ return; ++ } ++} ++ ++} // namespace lmc_ops +diff --git a/csrc/ascend/kernels/paged_kv_tuple_copy.cpp b/csrc/ascend/kernels/paged_kv_tuple_copy.cpp +new file mode 100644 +index 0000000..5769c4d +--- /dev/null ++++ 
b/csrc/ascend/kernels/paged_kv_tuple_copy.cpp +@@ -0,0 +1,176 @@ ++/* NOTE: tested fp16, but not the rest yet. */ ++#include "kernel_operator.h" ++#include ++#include "types.h" ++#include "utils.h" ++ ++template class PagedKVTupleCopy { ++ using local_scalar_t = AscendC::LocalTensor; ++ ++public: ++ __aicore__ inline PagedKVTupleCopy() ++ { ++ } ++ ++ __aicore__ inline void init(GM_ADDR pagedKVCaches, GM_ADDR cacheTensor, GM_ADDR slotmappings, ++ const int64_t numPages, const int64_t hiddenDims, const int32_t pagedSize, ++ const int32_t numLayers, const int32_t numTokensChunk, const bool page2L, ++ AscendC::TPipe *pipe) ++ { ++ this->pipe_ = pipe; ++ this->numLayers_ = numLayers; ++ this->numPages_ = numPages; ++ this->hiddenDims_ = hiddenDims; ++ this->numTokensChunk_ = numTokensChunk; ++ this->pagedSize_ = pagedSize; ++ this->page2L_ = page2L; ++ this->invalid_ = false; ++ // TODO: Not sure whether we need double buffering since we are mainly copying data ++ // might only make sense if we have compute ++ this->pipe_->InitBuffer(pagedTokenQue_, 2, this->hiddenDims_*sizeof(scalar_t)); ++ } ++ ++ __aicore__ inline void updateMemOffset(__gm__ uint8_t *pagedKVCaches, __gm__ uint8_t* cacheTensor, ++ __gm__ uint8_t *slotmappings, const int tokenIdx, ++ const int kvIdx, const int layerIdx, const int kvs) ++ { ++ __gm__ int32_t *slotmappingPtr = reinterpret_cast<__gm__ int32_t*>(slotmappings); ++ int32_t slot = slotmappingPtr[tokenIdx]; ++ if (slot == -1) { ++ this->invalid_ = true; ++ return; ++ } ++ // its a pointer within the GM addr space, that point to another tuple of GM addr space ++ __gm__ uint8_t * __gm__ *pagedKVCachesPtr = reinterpret_cast<__gm__ uint8_t* __gm__ *>(pagedKVCaches); ++ ++ // getting the right ptr to the paged kvcache layer ++ // the pagedKVCachesPtr is a list of list of ptrs ++ int layerKVIdx = layerIdx * kvs + kvIdx; ++ ++ __gm__ uint8_t *pagedLayerKVCaches = pagedKVCachesPtr[layerKVIdx]; ++ ++ int64_t pagedIdx = slot / this->pagedSize_; ++ int64_t withinPagedIdx = slot % this->pagedSize_; ++ ++ int64_t pagedIdxOffset = pagedIdx * this->pagedSize_ * this->hiddenDims_ + ++ withinPagedIdx * this->hiddenDims_; ++ ++ int64_t dstTensorIdxOffset = kvIdx * this->numLayers_ * this->numTokensChunk_ * this->hiddenDims_ + ++ layerIdx * this->numTokensChunk_ * this->hiddenDims_ + ++ tokenIdx * this->hiddenDims_; ++ ++ this->pagedTokenGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(pagedLayerKVCaches) + pagedIdxOffset, ++ this->hiddenDims_); ++ this->lmcBufferGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(cacheTensor) + dstTensorIdxOffset, ++ this->hiddenDims_); ++ } ++ ++ __aicore__ inline void processFunc() { ++ if (this->invalid_) { ++ return; ++ } ++ // 1. Alloc Tensor for local page ++ local_scalar_t hiddenDimTensor = pagedTokenQue_.AllocTensor(); ++ ++ // 2. copy from global tensor into local ++ if (this->page2L_) { ++ // pagetoken -> local ++ AscendC::DataCopy(hiddenDimTensor, this->pagedTokenGlobal_, this->hiddenDims_); ++ } else { ++ // lmc -> local ++ AscendC::DataCopy(hiddenDimTensor, this->lmcBufferGlobal_, this->hiddenDims_); ++ } ++ ++ // 3. enque vecin ++ pagedTokenQue_.EnQue(hiddenDimTensor); ++ // 4. deque vecin, possible to reuse due to QueBind ++ hiddenDimTensor = pagedTokenQue_.DeQue(); ++ ++ // 5. 
datacopy into GM ++ if (this->page2L_) { ++ // local -> lmc ++ AscendC::DataCopy(this->lmcBufferGlobal_, hiddenDimTensor, this->hiddenDims_); ++ } else { ++ // local -> pagetoken ++ AscendC::DataCopy(this->pagedTokenGlobal_, hiddenDimTensor, this->hiddenDims_); ++ } ++ ++ // 6. free alloced Tensor ++ pagedTokenQue_.FreeTensor(hiddenDimTensor); ++ } ++ ++ __aicore__ inline void reset() { ++ this->invalid_ = false; ++ } ++ ++private: ++ AscendC::TPipe *pipe_; ++ // copying from NPUBuffer to the Chunked GM Tensor ++ AscendC::TQueBind pagedTokenQue_; ++ ++ // [layers * [kvs, numPages * pagedSize, heads*headsize]] ++ AscendC::GlobalTensor pagedTokenGlobal_; ++ // [kvs, layers, numTokensChunk, heads*headsize] ++ AscendC::GlobalTensor lmcBufferGlobal_; ++ int64_t numPages_; // num vllm npu blocks ++ int32_t pagedSize_; // per npu block tokens ++ int32_t numLayers_; // num layers ++ int64_t hiddenDims_; // heads * headsize ++ int64_t hiddenDimsAligned_; // heads *headsize aligned to datablock byte ++ int32_t numTokensChunk_; // num tokens in the cache tensor chunk ++ bool page2L_; // whether the direction of copy is from page tensor to lmc ++ bool invalid_; // keep track of whether current iter is invalid ++}; ++ ++#define PAGED_KV_TUPLE_COPY_TYPE_DECLARE(TYPE) \ ++ extern "C" __global__ __aicore__ void paged_kv_tuple_copy_##TYPE( \ ++ __gm__ uint8_t* pagedKVCaches, __gm__ uint8_t* dstCacheTensor, __gm__ uint8_t* slotmappings, \ ++ const int64_t numPages, const int64_t hiddenDims, const int32_t pagedSize, \ ++ const int32_t kvs, const int32_t numLayers, const int32_t numTokensChunk, const int coreNum, \ ++ const bool page2L) \ ++ { \ ++ AscendC::TPipe pipe; \ ++ PagedKVTupleCopy op{}; \ ++ op.init(pagedKVCaches, dstCacheTensor, slotmappings, numPages, hiddenDims, pagedSize, \ ++ numLayers, numTokensChunk, page2L, &pipe); \ ++ int64_t bIdx = AscendC::GetBlockIdx(); \ ++ for (int64_t i = bIdx; i < numTokensChunk; i+=coreNum) { \ ++ for (int32_t kvIdx = 0; kvIdx < kvs; kvIdx ++) { \ ++ for (int32_t layerIdx = 0; layerIdx < numLayers; layerIdx++) { \ ++ op.reset(); \ ++ op.updateMemOffset(pagedKVCaches, dstCacheTensor, slotmappings, i, kvIdx, layerIdx, kvs); \ ++ op.processFunc(); \ ++ } \ ++ } \ ++ } \ ++ } ++ ++// Declare support kernel entry ++PAGED_KV_TUPLE_COPY_TYPE_DECLARE(half); ++PAGED_KV_TUPLE_COPY_TYPE_DECLARE(bfloat16_t); ++PAGED_KV_TUPLE_COPY_TYPE_DECLARE(int8_t); ++ ++namespace lmc_ops { ++ ++#define PAGED_KV_TUPLE_COPY_KERNEL_CALL(TYPE) \ ++ paged_kv_tuple_copy_##TYPE<<>>(pagedKVCaches, dstCacheTensor, slotmappings, numPages, \ ++ hiddenDims, pagedSize, kvs, numLayers, numTokensChunk, \ ++ aivNum, page2L); ++ ++extern void paged_kv_tuple_copy_impl(vllm_ascend::AscendType type, uint32_t blockDim, void *stream, uint8_t *pagedKVCaches, uint8_t *dstCacheTensor, ++ uint8_t *slotmappings, const int64_t numPages, const int64_t hiddenDims, ++ const int32_t pagedSize, const int32_t kvs, const int32_t numLayers, const int32_t numTokensChunk, ++ const int aivNum, const bool page2L) ++{ ++ if (type == vllm_ascend::AscendType::FP16) { ++ PAGED_KV_TUPLE_COPY_KERNEL_CALL(half); ++ } else if (type == vllm_ascend::AscendType::BF16) { ++ PAGED_KV_TUPLE_COPY_KERNEL_CALL(bfloat16_t); ++ } else if (type == vllm_ascend::AscendType::INT8) { ++ PAGED_KV_TUPLE_COPY_KERNEL_CALL(int8_t); ++ } else { ++ return; ++ } ++} ++ ++} // namespace lmc_ops +diff --git a/csrc/ascend/kernels/types.h b/csrc/ascend/kernels/types.h +new file mode 100644 +index 0000000..d272781 +--- /dev/null ++++ b/csrc/ascend/kernels/types.h 
+@@ -0,0 +1,26 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++namespace vllm_ascend { ++enum struct AscendType { ++ FP16 = 0, ++ BF16 = 1, ++ FP32 = 2, ++ INT8 = 3, ++}; ++} +\ No newline at end of file +diff --git a/csrc/ascend/kernels/utils.h b/csrc/ascend/kernels/utils.h +new file mode 100644 +index 0000000..a6dd3f3 +--- /dev/null ++++ b/csrc/ascend/kernels/utils.h +@@ -0,0 +1,38 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++#include "kernel_type.h" ++namespace vllm_ascend { ++ ++template struct AccType; ++ ++template <> struct AccType { ++ using type = float; ++}; ++ ++template <> struct AccType { ++ using type = half; ++}; ++ ++template <> struct AccType { ++ using type = float; ++}; ++ ++template <> struct AccType { ++ using type = int; ++}; ++}// namespace vllm_ascend +\ No newline at end of file +diff --git a/csrc/ascend/np_tensor.h b/csrc/ascend/np_tensor.h +new file mode 100644 +index 0000000..41c23f7 +--- /dev/null ++++ b/csrc/ascend/np_tensor.h +@@ -0,0 +1,219 @@ ++#pragma once ++#include ++#include ++#include ++#include "pinned_mem.h" ++#include "ms_extension.h" ++#include "utils.h" ++#include "aclnn/opdev/platform.h" ++#include "tiling/platform/platform_ascendc.h" ++#include "tiling/tiling_api.h" ++#include "ops.h" ++ ++using BaseTensor = mindspore::tensor::BaseTensor; ++using BaseTensorPtr = mindspore::tensor::BaseTensorPtr; ++using PyBoostUtils = mindspore::kernel::pyboost::PyBoostUtils; ++ ++namespace py = pybind11; ++ ++struct DtypeCaster { ++ BaseTensorPtr CheckAndCast(const BaseTensorPtr &t, const std::string &name = "") { ++ mindspore::Int64ImmPtr dst_type = std::make_shared(mindspore::TypeId::kNumberTypeInt32); ++ if (t->data_type() != mindspore::TypeId::kNumberTypeInt32) { ++ if (!name.empty()) { ++ tensor_map_[name] = t; ++ } ++ return mindspore::kernel::pyboost::cast(t, dst_type); ++ } ++ return t; ++ } ++ BaseTensorPtr RecoveryTensorDtype(const BaseTensorPtr &t, const std::string &name) { ++ auto iter = tensor_map_.find(name); ++ if (iter == tensor_map_.end()) { ++ return t; ++ } ++ auto ori_tensor = iter->second; ++ auto ori_dtype = std::make_shared(ori_tensor->data_type()); ++ auto ret = mindspore::kernel::pyboost::cast(t, ori_dtype); ++ ori_tensor->AssignValue(*ret); ++ return ori_tensor; ++ } ++ 
std::map tensor_map_; ++}; ++ ++namespace lmcache_ascend { ++ ++ uint32_t get_static_aiv_core_num() { ++ static uint32_t aiv_num = []() -> uint32_t { ++ ++ const char* socName = aclrtGetSocName(); ++ if (socName == nullptr) { ++ throw std::runtime_error("Failed to get SoC name from ACL runtime."); ++ } ++ ++ platform_ascendc::PlatformAscendC* ascendcPlatform = nullptr; ++ try { ++ ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socName); ++ } catch (const std::exception& e) { ++ std::cerr << "Internal Error getting PlatformAscendCManager instance: " << e.what() << std::endl; ++ throw; ++ } ++ ++ if (ascendcPlatform == nullptr) { ++ throw std::runtime_error("Failed to get PlatformAscendCManager instance."); ++ } ++ ++ uint32_t calculated_aiv_num = ascendcPlatform->GetCoreNumAiv(); ++ return calculated_aiv_num; ++ }(); ++ return aiv_num; ++ } ++ ++ uint8_t *GetMSDataPtr(const BaseTensorPtr &t) { ++ return static_cast(t->device_address()->GetMutablePtr()) + t->data().itemsize() * t->storage_offset(); ++ } ++ ++ py::array create_mapped_numpy_with_dtype(const std::vector& shape, ++ py::object dtype_obj) { ++ if (shape.empty()) { ++ throw std::runtime_error("Shape must not be empty"); ++ } ++ ++ // 2. Calculate total number of elements and required bytes ++ size_t num_elements = 1; ++ std::vector py_shape; // Use py::ssize_t for numpy shape ++ for (int64_t dim : shape) { ++ if (dim <= 0) { ++ throw std::runtime_error("Shape dimensions must be positive."); ++ } ++ num_elements *= dim; ++ py_shape.push_back(static_cast(dim)); ++ } ++ ++ py::dtype np_dtype = py::dtype::from_args(dtype_obj); ++ size_t item_size = np_dtype.itemsize(); ++ ++ size_t size_bytes = num_elements * item_size; ++ if (size_bytes == 0) { ++ throw std::runtime_error("Calculated memory size is zero. Check shape and dtype."); ++ } ++ ++ uintptr_t ptr = lmc::alloc_pinned_mem(size_bytes); ++ if (ptr == 0) { ++ throw std::runtime_error("Failed to allocate pinned memory."); ++ } ++ ++ // 4. Calculate strides for a row-major (C-style) array ++ std::vector strides; ++ strides.reserve(py_shape.size()); ++ py::ssize_t current_stride = item_size; ++ for (int i = py_shape.size() - 1; i >= 0; --i) { ++ strides.insert(strides.begin(), current_stride); // Insert at beginning to reverse order ++ current_stride *= py_shape[i]; ++ } ++ ++ // 5. Create a py::capsule that will call MMapManager::release_mmap when destroyed ++ py::capsule free_mmap_capsule( ++ reinterpret_cast(ptr), // Payload: the ID of the mmap'd region ++ [](void* _ptr) { ++ lmc::free_pinned_mem(reinterpret_cast(_ptr)); ++ } ++ ); ++ ++ // 6. 
Create the NumPy array, passing the capsule as the base ++ // Use the generic py::array constructor (not py::array_t) since dtype is dynamic ++ return py::array(np_dtype, py_shape, strides, reinterpret_cast(ptr), free_mmap_capsule); ++ } ++ ++ py::array create_mmapped_numpy(size_t buffer_size) { ++ if (buffer_size <= 0 ) { ++ throw std::runtime_error("Buffer size must be positive."); ++ } ++ ++ int64_t numel = static_cast(buffer_size); ++ ++ uintptr_t ptr = lmc::alloc_pinned_mem(buffer_size); ++ if (ptr == 0) { ++ throw std::runtime_error("Failed to allocate pinned memory."); ++ } ++ ++ py::capsule free_mmap_capsule( ++ reinterpret_cast(ptr), // Payload: the ID of the mmap'd region ++ [](void* _ptr) { ++ lmc::free_pinned_mem(reinterpret_cast(_ptr)); ++ } ++ ); ++ ++ return py::array(py::dtype::from_args(py::str("uint8")), {numel}, {1}, ++ reinterpret_cast(ptr), free_mmap_capsule); ++ } ++ ++ // NOTE: Does not currently support in graph execution ++ void paged_layers_kv_transfer_ms( ++ py::array& lmc_buffer, ++ BaseTensorPtr& paged_kv_ptrs, ++ BaseTensorPtr& slot_mappings, ++ const uintptr_t lmc_buffer_hostptr, ++ const int num_pages, ++ const int page_size, ++ const int kvs, ++ const bool page2L, ++ const bool is_reg_mem) { ++ auto stream_id = PyBoostUtils::cur_stream_id(); ++ auto device_context = mindspore::runtime::OpRunner::GetDeviceContext("Ascend"); ++ // reset ++ if (page2L) { ++ memset(static_cast(lmc_buffer.mutable_data()), 0, lmc_buffer.nbytes()); ++ } ++ ++ uintptr_t lmc_offset_dptr; ++ if (is_reg_mem) { ++ // 1. we need to get the dev ptr from the host ptr and the paged_kv_ptrs list. ++ uintptr_t lmc_base_dptr = lmc::get_device_ptr(lmc_buffer_hostptr); ++ // 2. calculate offset from the lmcbaseptr to current dataptr ++ uintptr_t current_ptr = reinterpret_cast(lmc_buffer.mutable_data()); ++ uintptr_t offset_ptr = current_ptr - lmc_buffer_hostptr; ++ if (offset_ptr < 0) { ++ throw std::runtime_error( ++ "Offset calculation failed."); ++ } ++ lmc_offset_dptr = lmc_base_dptr + offset_ptr; ++ } else { ++ // we just get the ptr from the lmc buf ++ lmc_offset_dptr = reinterpret_cast(lmc_buffer.mutable_data()); ++ } ++ ++ // only support int32 slotmapping for now. 
++ DtypeCaster caster; ++ slot_mappings = caster.CheckAndCast(slot_mappings, "slot_mapping"); ++ ++ ++ int num_tokens = slot_mappings->shape()[0]; ++ int num_layers = paged_kv_ptrs->shape()[0]; ++ int ndim = lmc_buffer.ndim(); ++ int hidden_dims = static_cast(lmc_buffer.shape(ndim - 1)); ++ auto ascend_type = vllm_ascend::get_dtype_from_np(lmc_buffer); ++ ++ uint32_t aivNum = get_static_aiv_core_num(); ++ ++ PyBoostUtils::PrepareOpInputs(device_context, stream_id, paged_kv_ptrs, slot_mappings); ++ PyBoostUtils::DispatchRun(std::make_shared([=]() { ++ PyBoostUtils::MallocOpInputs(device_context, paged_kv_ptrs, slot_mappings); ++ ++ uint8_t* paged_kv_dev_ptr = GetMSDataPtr(paged_kv_ptrs); ++ uint8_t* slot_mappings_dptr = GetMSDataPtr(slot_mappings); ++ auto acl_stream = device_context->device_res_manager_->GetStream(stream_id); ++ ++ mindspore::runtime::OpExecutor::DispatchLaunchTask([=]() { ++ lmc_ops::paged_kv_tuple_copy_impl(ascend_type, aivNum, acl_stream, paged_kv_dev_ptr, ++ reinterpret_cast(lmc_offset_dptr), slot_mappings_dptr, ++ static_cast(num_pages), static_cast(hidden_dims), ++ static_cast(page_size), static_cast(kvs), ++ static_cast(num_layers), static_cast(num_tokens), ++ static_cast(aivNum), page2L); ++ }); ++ })); ++ slot_mappings = caster.RecoveryTensorDtype(slot_mappings, "slot_mapping"); ++ } ++ ++} +diff --git a/csrc/ascend/ops.h b/csrc/ascend/ops.h +new file mode 100644 +index 0000000..e027107 +--- /dev/null ++++ b/csrc/ascend/ops.h +@@ -0,0 +1,33 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++#include ++#include ++#include "kernels/types.h" ++ ++namespace lmc_ops { ++ extern void paged_kv_copy_impl(vllm_ascend::AscendType type, uint32_t blockDim, void *stream, uint8_t *pagedKVCaches, uint8_t *dstCacheTensor, ++ uint8_t *slotmappings, const int64_t numPages, const int64_t hiddenDims, ++ const int32_t pagedSize, const int32_t kvs, const int32_t numLayers, const int32_t numTokensChunk, ++ const int aivNum, const bool page2L); ++ ++ extern void paged_kv_tuple_copy_impl(vllm_ascend::AscendType type, uint32_t blockDim, void *stream, uint8_t *pagedKVCaches, uint8_t *dstCacheTensor, ++ uint8_t *slotmappings, const int64_t numPages, const int64_t hiddenDims, ++ const int32_t pagedSize, const int32_t kvs, const int32_t numLayers, const int32_t numTokensChunk, ++ const int aivNum, const bool page2L); ++} +\ No newline at end of file +diff --git a/csrc/ascend/pinned_mem.cpp b/csrc/ascend/pinned_mem.cpp +new file mode 100644 +index 0000000..eba74b4 +--- /dev/null ++++ b/csrc/ascend/pinned_mem.cpp +@@ -0,0 +1,225 @@ ++#include "pinned_mem.h" ++#include ++#include ++// HACK: undefine the version from ascend_hal.h. 
++#ifdef PROF_ERROR ++ // You can add a pragma message to see this in your build log if you want: ++ // #pragma message("Undefining PROF_ERROR from ascend_hal.h before NPU headers") ++ #undef PROF_ERROR ++#endif ++ ++#include ++#include "acl/acl.h" ++#include "framework_hal.h" ++#include "dcmi_management.h" ++#include ++#include ++#include "threading.h" ++#include ++ ++namespace lmc { ++ ++ ++// A more robust check for your specific environment. ++// numa_available() sometimes does not work in openEuler? ++bool is_numa_system_present() { ++ struct stat st; ++ // Check if the directory that the kernel uses to expose NUMA nodes exists. ++ if (stat("/sys/devices/system/node", &st) != 0) { ++ return false; // The path doesn't exist. ++ } ++ // Check if it's actually a directory. ++ return S_ISDIR(st.st_mode); ++} ++ ++// Function to parse a CPU range string (e.g., "144-167") ++// and return the first CPU in the range. ++int parse_first_cpu(const std::string& cpu_str) { ++ try { ++ size_t dash_pos = cpu_str.find('-'); ++ if (dash_pos != std::string::npos) { ++ return std::stoi(cpu_str.substr(0, dash_pos)); ++ } ++ // If it's a single CPU, not a range ++ return std::stoi(cpu_str); ++ } catch (const std::invalid_argument& e) { ++ throw std::runtime_error("Invalid CPU string format. Could not parse number."); ++ } catch (const std::out_of_range& e) { ++ throw std::runtime_error("CPU number is out of range."); ++ } ++} ++ ++PinnedMemoryManager::PinnedMemoryManager() { ++}; ++ ++PinnedMemoryManager::~PinnedMemoryManager() { ++ this->freeAll(); ++}; ++ ++PinnedMemoryRecord PinnedMemoryManager::getRecord(uintptr_t hostptr) { ++ const std::shared_lock lock(this->mux); ++ if (this->allocatedMap.find(hostptr) == this->allocatedMap.end()) { ++ std::cerr << "Host ptr: "<< hostptr << " does not exists. This should not happened." 
<< std::endl; ++ return PinnedMemoryRecord{0, 0, 0, 0}; ++ } ++ return this->allocatedMap.at(hostptr); ++} ++ ++bool PinnedMemoryManager::innerFree(uintptr_t hostPtr, size_t bufferSize, int8_t device) { ++ // since this is a pinned and dev accessible mem ++ // we need to halhostunregister first ++ auto ret = halHostUnregisterEx(reinterpret_cast(hostPtr), ++ static_cast(device), HOST_MEM_MAP_DEV_PCIE_TH); ++ ++ if (ret != 0) { ++ std::cout << "Unable to hal host unregister: "<< ret << std::endl; ++ return false; ++ } ++ ++ auto mret = munmap(reinterpret_cast(hostPtr), bufferSize); ++ if (mret != 0) { ++ std::cout << "Unable to unmap memory: "<< ret << std::endl; ++ return false; ++ } ++ ++ return true; ++}; ++ ++bool PinnedMemoryManager::freePinned(uintptr_t hostPtr) { ++ const std::unique_lock lock(this->mux); ++ ++ // make sure this hostptr is in our allocated map ++ if (this->allocatedMap.find(hostPtr) == this->allocatedMap.end()) { ++ std::cerr << "HostPtr "<allocatedMap.at(hostPtr); ++ auto freed = this->innerFree(hostPtr, record.buffSize, record.device); ++ if (freed) { ++ this->allocatedMap.erase(hostPtr); ++ return true; ++ } ++ return false; ++}; ++ ++uintptr_t PinnedMemoryManager::allocPinned(size_t bufferSize) { ++ auto device = framework_hal::GetDeviceIdx(); ++ ++ int cpu = -1; ++ if (is_numa_system_present()) { ++ long numaErr; ++ int numa_node; ++ auto& dcmiManger = lmc::DCMIManager::GetInstance(); ++ auto cpuAffinityStr = dcmiManger.getCPUAffinityFromDeviceId(static_cast(device), 0); ++ cpu = lmc::parse_first_cpu(cpuAffinityStr); ++ } ++ ++ std::unique_ptr guard; ++ if (cpu >= 0) { ++ guard = std::make_unique(cpu); ++ } ++ ++ uintptr_t hostPtr; ++ int adviseErr; ++ ++ hostPtr = reinterpret_cast(mmap(nullptr, bufferSize, PROT_FLAGS, MAP_FLAGS, -1, 0)); ++ if ((void*) hostPtr == MAP_FAILED) { ++ throw std::runtime_error("Unable to alloc memory with mmap."); ++ } ++ ++ adviseErr = madvise(reinterpret_cast(hostPtr), bufferSize, MADV_HUGEPAGE); ++ ++ if (adviseErr != 0) { ++ // should be okay to continue ++ std::cerr << "Unable to get madvise with HugePages: "<< adviseErr << std::endl; ++ } ++ ++ // set to all zeros ++ memset(reinterpret_cast(hostPtr), 0, bufferSize); ++ ++ void* devPtr; ++ drvError_t drvRet; ++ drvRet = halHostRegister(reinterpret_cast(hostPtr), static_cast(bufferSize), ++ HOST_MEM_MAP_DEV_PCIE_TH, static_cast(device), (void**)&devPtr); ++ ++ if (drvRet != 0) { ++ throw std::runtime_error(std::string("Unable to register host memory with hal: ") + std::to_string(drvRet) + \ ++ " on device: " + std::to_string(device)); ++ } ++ ++ auto lockErr = mlock(reinterpret_cast(hostPtr), bufferSize); ++ if (lockErr == -1) { ++ std::cerr << "Unable to pin host memory with error code: "<< std::to_string(lockErr) << std::endl; ++ // this can happen in non-privileged mode or not enough rlimit, ++ // let's not proceed since we wanted to guarantee pinned ++ // because we already alloced, let's free ++ this->innerFree(hostPtr, bufferSize, static_cast(device)); ++ return 0; ++ } ++ ++ { ++ const std::unique_lock lock(this->mux); ++ this->allocatedMap.emplace(hostPtr, PinnedMemoryRecord{hostPtr, reinterpret_cast(devPtr), bufferSize, static_cast(device)}); ++ } ++ return hostPtr; ++}; ++ ++void PinnedMemoryManager::freeAll() { ++ const std::shared_lock lock(this->mux); ++ if (!this->allocatedMap.empty()) { ++ std::cerr << "PinnedMemoryManager::freeAll() called. " ++ << this->allocatedMap.size() ++ << " block(s) were still allocated. 
This might indicate " ++ << "that not all PyTorch tensor deleters were invoked." << std::endl; ++ ++ // Iterate carefully as erasing modifies the map ++ // One way is to collect keys then iterate, or use C++17 map::extract ++ std::vector keys_to_free; ++ for (const auto& pair : this->allocatedMap) { ++ keys_to_free.push_back(pair.first); ++ } ++ ++ for (uintptr_t hostPtr : keys_to_free) { ++ // No need to check if it exists here, as we are iterating over existing keys ++ auto record = this->allocatedMap.at(hostPtr); // Or find again, though at should be safe ++ std::cerr << "PinnedMemoryManager::freeAll() freeing hostPtr: " << hostPtr ++ << " with size: " << record.buffSize << std::endl; ++ this->innerFree(hostPtr, record.buffSize, record.device); ++ // The map entry will be removed below or after the loop ++ } ++ this->allocatedMap.clear(); // Clear the map after freeing all elements ++ } ++}; ++ ++uintptr_t get_device_ptr(uintptr_t hostptr) { ++ auto& pmm = lmc::PinnedMemoryManager::GetInstance(); ++ auto record = pmm.getRecord(hostptr); ++ if (record.devptr == 0) { ++ throw std::runtime_error("Unable to find host ptr: " + std::to_string(hostptr)); ++ } ++ return record.devptr; ++} ++ ++uintptr_t alloc_pinned_mem(size_t bufferSize) { ++ auto& pmm = lmc::PinnedMemoryManager::GetInstance(); ++ return pmm.allocPinned(bufferSize); ++} ++ ++ ++bool free_pinned_mem(uintptr_t hostptr) { ++ auto& pmm = lmc::PinnedMemoryManager::GetInstance(); ++ return pmm.freePinned(hostptr); ++} ++ ++void free_all() { ++ auto& pmm = lmc::PinnedMemoryManager::GetInstance(); ++ pmm.freeAll(); ++} ++ ++void pinned_memory_deleter(void* ptr) { ++ if(ptr) { ++ free_pinned_mem(reinterpret_cast(ptr)); ++ } ++} ++} // namespace lmc +\ No newline at end of file +diff --git a/csrc/ascend/pinned_mem.h b/csrc/ascend/pinned_mem.h +new file mode 100644 +index 0000000..66ecf69 +--- /dev/null ++++ b/csrc/ascend/pinned_mem.h +@@ -0,0 +1,69 @@ ++#pragma once ++#include ++#include ++#include ++#include ++#include "driver/ascend_hal_define.h" ++#include "driver/ascend_hal.h" ++#include "sys/mman.h" ++#include ++#include ++#include ++#include ++#include ++#include "acl/acl.h" ++ ++/* ++When aclrtHostRegister is supported, ++we could migrate the sharedmemory instantiation with the aclrtHostRegister call instead. 
++*/ ++namespace lmc { ++ ++constexpr int32_t PROT_FLAGS = static_cast(PROT_READ) | static_cast(PROT_WRITE); ++constexpr int32_t MAP_FLAGS = static_cast(MAP_PRIVATE) | static_cast(MAP_ANONYMOUS) | static_cast(MAP_POPULATE); ++ ++struct PinnedMemoryRecord { ++ uintptr_t ptr; ++ uintptr_t devptr; ++ size_t buffSize; ++ int8_t device; ++}; ++ ++/* We are not responsible for acl init and ctx initialization, ++ we assume the user responsible for ctx initialization ++ */ ++class PinnedMemoryManager { ++private: ++ PinnedMemoryManager(); ++ ++ // Delete copy constructor and assignment operator ++ PinnedMemoryManager(const PinnedMemoryManager&) = delete; ++ PinnedMemoryManager& operator=(const PinnedMemoryManager&) = delete; ++ PinnedMemoryManager(PinnedMemoryManager&&) = delete; ++ PinnedMemoryManager& operator=(PinnedMemoryManager&&) = delete; ++ ++ std::unordered_map allocatedMap; ++ mutable std::shared_mutex mux; ++ ++ bool innerFree(uintptr_t hostPtr, size_t bufferSize, int8_t device); ++ ++public: ++ static PinnedMemoryManager& GetInstance() ++ { ++ static PinnedMemoryManager instance; ++ return instance; ++ } ++ ~PinnedMemoryManager(); ++ ++ PinnedMemoryRecord getRecord(uintptr_t hostptr); ++ uintptr_t allocPinned(size_t bufferSize); ++ bool freePinned(uintptr_t hostPtr); ++ void freeAll(); ++}; ++ ++uintptr_t get_device_ptr(uintptr_t hostptr); ++uintptr_t alloc_pinned_mem(size_t bufferSize); ++bool free_pinned_mem(uintptr_t hostptr); ++void free_all(); ++void pinned_memory_deleter(void* ptr); ++} +diff --git a/csrc/ascend/pybind.cpp b/csrc/ascend/pybind.cpp +new file mode 100644 +index 0000000..1538e11 +--- /dev/null ++++ b/csrc/ascend/pybind.cpp +@@ -0,0 +1,38 @@ ++#include ++#include "pinned_mem.h" ++#ifdef USE_TORCH ++ #include "torch_tensor.h" ++#else ++ #include "np_tensor.h" ++#endif ++ ++ ++namespace py = pybind11; ++ ++PYBIND11_MODULE(lmcache_C, m) { ++ m.def("alloc_pinned_mem", &lmc::alloc_pinned_mem, py::arg("bufferSize")); ++ m.def("free_pinned_mem", &lmc::free_pinned_mem, py::arg("hostptr")); ++ #ifdef USE_TORCH ++ m.def("paged_layers_kv_transfer", &lmcache_ascend::paged_layers_kv_transfer_torch, ++ py::arg("lmc_buffer"), py::arg("paged_kv_ptrs"), py::arg("slot_mappings"), ++ py::arg("lmc_buffer_hostptr"), py::arg("num_pages"), py::arg("page_size"), ++ py::arg("kvs"), py::arg("page2L"), py::arg("is_reg_mem") = false); ++ m.def("create_pinned_tensor", &lmcache_ascend::create_pinned_torch_tensor, py::arg("buffer_size"), ++ "Create a tensor backed by custom pinned memory and accessible by the device with a deleter."); ++ m.def("create_pinned_tensor_with_infos", &lmcache_ascend::create_pinned_torch_tensor_with_infos, ++ py::arg("shape"), py::arg("dtype"), ++ "Create a tensor of shape and dtype, backed by custom pinned memory and accessible by the device with a deleter."); ++ #else ++ m.def("paged_layers_kv_transfer", &lmcache_ascend::paged_layers_kv_transfer_ms, ++ py::arg("lmc_buffer"), py::arg("paged_kv_ptrs"), py::arg("slot_mappings"), ++ py::arg("lmc_buffer_hostptr"), py::arg("num_pages"), py::arg("page_size"), ++ py::arg("kvs"), py::arg("page2L"), py::arg("is_reg_mem") = false); ++ m.def("create_pinned_tensor", &lmcache_ascend::create_mmapped_numpy, py::arg("buffer_size"), ++ "Create a numpy tensor backed by custom pinned memory and accessible by the device with a deleter."); ++ m.def("create_pinned_tensor_with_infos", &lmcache_ascend::create_mapped_numpy_with_dtype, ++ py::arg("shape"), py::arg("dtype"), ++ "Create a numpy tensor of shape and dtype, backed by custom pinned 
memory and accessible by the device with a deleter."); ++ #endif ++ m.def("free_all", &lmc::free_all); ++ m.def("get_device_ptr", &lmc::get_device_ptr, py::arg("ptr")); ++} +\ No newline at end of file +diff --git a/csrc/ascend/threading.h b/csrc/ascend/threading.h +new file mode 100644 +index 0000000..1919605 +--- /dev/null ++++ b/csrc/ascend/threading.h +@@ -0,0 +1,65 @@ ++#pragma once ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/** ++ * @brief An RAII class to temporarily set thread affinity and restore it on destruction. ++ * ++ * Usage: ++ * { ++ * ThreadAffinityGuard guard(8); // Binds thread to CPU 8 ++ * // ... do work on CPU 8 ... ++ * } // Original affinity is automatically restored here ++ */ ++class ThreadAffinityGuard { ++public: ++ /** ++ * @brief Saves the current thread affinity and sets a new affinity to a single CPU. ++ * @param cpu_id The CPU core to bind the current thread to. ++ */ ++ explicit ThreadAffinityGuard(int cpu_id) { ++ // 1. Get and save the original affinity mask. ++ if (pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &original_mask_) != 0) { ++ // If we can't get the original mask, we can't safely restore it. ++ throw std::runtime_error("Failed to get original thread affinity."); ++ } ++ ++ // 2. Create and set the new, temporary mask. ++ cpu_set_t new_mask; ++ CPU_ZERO(&new_mask); ++ CPU_SET(cpu_id, &new_mask); ++ ++ if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &new_mask) != 0) { ++ // We failed to set the new mask. For safety, restore the original immediately. ++ pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &original_mask_); // Best effort restore ++ throw std::runtime_error(std::string("Failed to set thread affinity to CPU ") + std::to_string(cpu_id)); ++ } ++ } ++ ++ /** ++ * @brief Destructor that automatically restores the original thread affinity. ++ */ ++ ~ThreadAffinityGuard() { ++ // 3. Restore the original affinity mask. ++ if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &original_mask_) != 0) { ++ // It's generally a bad idea to throw from a destructor. ++ // A warning is more appropriate here. ++ std::cerr << "Warning: Failed to restore original thread affinity." << std::endl; ++ } ++ } ++ ++ // Delete copy and move constructors/assignments to prevent misuse. 
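A hedged usage sketch for the bindings registered in the lmcache_C module above, assuming the built extension is importable as `from lmcache import lmcache_C` (the import used by the Python side of this patch); the buffer size and error handling are illustrative only.

from lmcache import lmcache_C

buffer_size = 1 << 20                            # 1 MiB of pinned host memory
host_ptr = lmcache_C.alloc_pinned_mem(buffer_size)
if host_ptr == 0:
    raise RuntimeError("pinned allocation failed")

dev_ptr = lmcache_C.get_device_ptr(host_ptr)     # device-visible alias of host_ptr
print(hex(host_ptr), hex(dev_ptr))

lmcache_C.free_pinned_mem(host_ptr)              # release one buffer ...
lmcache_C.free_all()                             # ... or everything at shutdown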
++ ThreadAffinityGuard(const ThreadAffinityGuard&) = delete; ++ ThreadAffinityGuard& operator=(const ThreadAffinityGuard&) = delete; ++ ThreadAffinityGuard(ThreadAffinityGuard&&) = delete; ++ ThreadAffinityGuard& operator=(ThreadAffinityGuard&&) = delete; ++ ++private: ++ cpu_set_t original_mask_; // Member variable to store the original mask ++}; +\ No newline at end of file +diff --git a/csrc/ascend/torch_tensor.h b/csrc/ascend/torch_tensor.h +new file mode 100644 +index 0000000..b737608 +--- /dev/null ++++ b/csrc/ascend/torch_tensor.h +@@ -0,0 +1,141 @@ ++#pragma once ++#include "pinned_mem.h" ++#ifdef PROF_ERROR ++#undef PROF_ERROR ++#endif ++#include ++#include ++#include ++#include ++#include "utils.h" ++#include ++#include ++#include ++#include "aclnn/opdev/platform.h" ++#include "tiling/platform/platform_ascendc.h" ++#include "tiling/tiling_api.h" ++#include "ops.h" ++ ++namespace lmcache_ascend { ++ torch::Tensor create_pinned_torch_tensor_with_infos(const std::vector& shape, ++ c10::ScalarType dtype) { ++ if (shape.empty()) { ++ throw std::runtime_error( ++ "Shape must not be empty."); ++ } ++ ++ for (int64_t dim: shape) { ++ if (dim <= 0) { ++ throw std::runtime_error( ++ "Dimensions must be greater than zero."); ++ } ++ } ++ ++ int64_t numel = std::accumulate(shape.begin(), shape.end(), 1LL, std::multiplies()); ++ ++ size_t element_size = c10::elementSize(dtype); ++ ++ if (element_size == 0 && numel > 0) { ++ throw std::runtime_error( ++ "Invalid dtype."); ++ } ++ ++ size_t bufferSize = element_size * static_cast(numel); ++ ++ uintptr_t ptr = lmc::alloc_pinned_mem(bufferSize); ++ ++ if (ptr == 0) { ++ throw std::runtime_error( ++ "Failed to allocate pinned memory for tensor."); ++ } ++ ++ torch::TensorOptions tensorOpsCpu = torch::TensorOptions() ++ .dtype(dtype) ++ .device(torch::kCPU) ++ .pinned_memory(true); ++ ++ return torch::from_blob(reinterpret_cast(ptr), shape, lmc::pinned_memory_deleter, tensorOpsCpu); ++ ++ } ++ ++ torch::Tensor create_pinned_torch_tensor(size_t buffer_size) { ++ torch::TensorOptions tensorOpsCpu = torch::TensorOptions() ++ .dtype(torch::kUInt8) ++ .device(torch::kCPU) ++ .pinned_memory(true); ++ if (buffer_size <= 0) { ++ throw std::runtime_error( ++ "Buffer size must be greater than zero."); ++ } ++ ++ // unlikely this would be greater than int64_t ++ int64_t numel = static_cast(buffer_size); ++ ++ uintptr_t ptr = lmc::alloc_pinned_mem(buffer_size); ++ if (ptr == 0) { ++ throw std::runtime_error( ++ "Failed to allocate pinned memory for tensor."); ++ } ++ ++ std::vector dims = {numel}; ++ ++ return torch::from_blob(reinterpret_cast(ptr), dims, lmc::pinned_memory_deleter, tensorOpsCpu); ++ } ++ ++ ++ void paged_layers_kv_transfer_torch( ++ torch::Tensor& lmc_buffer, ++ const torch::Tensor& paged_kv_ptrs, ++ const torch::Tensor& slot_mappings, ++ const uintptr_t lmc_buffer_hostptr, ++ const int num_pages, ++ const int page_size, ++ const int kvs, ++ const bool page2L, ++ const bool is_reg_mem) { ++ // reset ++ if (page2L) { ++ memset(lmc_buffer.data_ptr(), 0, lmc_buffer.nbytes()); ++ } ++ uintptr_t lmc_offset_dptr; ++ if (is_reg_mem) { ++ // 1. we need to get the dev ptr from the host ptr and the paged_kv_ptrs list. ++ uintptr_t lmc_base_dptr = lmc::get_device_ptr(lmc_buffer_hostptr); ++ // 2. 
calculate offset from the lmcbaseptr to current dataptr ++ uintptr_t current_ptr = reinterpret_cast(lmc_buffer.data_ptr()); ++ uintptr_t offset_ptr = current_ptr - lmc_buffer_hostptr; ++ if (offset_ptr < 0) { ++ throw std::runtime_error( ++ "Offset calculation failed."); ++ } ++ lmc_offset_dptr = lmc_base_dptr + offset_ptr; ++ } else { ++ // we just get the ptr from the lmc buf ++ lmc_offset_dptr = reinterpret_cast(lmc_buffer.data_ptr()); ++ } ++ ++ ++ // 3. get the paged_kv_ptrs ++ uintptr_t paged_kv_dptr = reinterpret_cast(paged_kv_ptrs.data_ptr()); ++ ++ // 4. we make sure slot mappings are on the device ++ int num_tokens = slot_mappings.size(0); ++ int num_layers = paged_kv_ptrs.size(0); ++ int hidden_dims = lmc_buffer.size(-1); ++ ++ aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); ++ at::ScalarType scalar_type = lmc_buffer.scalar_type(); ++ ++ auto dtype_num = vllm_ascend::get_dtype_from_torch(scalar_type); ++ const char* socName = aclrtGetSocName(); ++ auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socName); ++ uint32_t aivNum = ascendcPlatform->GetCoreNumAiv(); ++ void* slot_mappings_dptr = slot_mappings.data_ptr(); ++ ++ lmc_ops::paged_kv_copy_impl(dtype_num, aivNum, stream, reinterpret_cast(paged_kv_dptr), ++ reinterpret_cast(lmc_offset_dptr), reinterpret_cast(slot_mappings_dptr), ++ static_cast(num_pages), static_cast(hidden_dims), static_cast(page_size), ++ static_cast(kvs), static_cast(num_layers), static_cast(num_tokens), ++ static_cast(aivNum), page2L); ++ } ++} +\ No newline at end of file +diff --git a/csrc/ascend/utils.h b/csrc/ascend/utils.h +new file mode 100644 +index 0000000..3728d27 +--- /dev/null ++++ b/csrc/ascend/utils.h +@@ -0,0 +1,48 @@ ++#pragma once ++ ++#include "kernels/types.h" ++#include ++namespace vllm_ascend { ++ ++#ifdef USE_TORCH ++ #include ++ AscendType get_dtype_from_torch(at::ScalarType scalarType) ++ { ++ if (scalarType == at::ScalarType::Float) { ++ return AscendType::FP32; ++ } else if (scalarType == at::ScalarType::BFloat16) { ++ return AscendType::BF16; ++ } else { ++ return AscendType::FP16; ++ } ++ } ++#else ++ #include ++ #include ++ #include ++ ++ ++ AscendType get_dtype_from_np(const py::array& arr) { ++ py::object array_dtype_obj = arr.dtype(); ++ std::string array_dtype_repr = py::repr(array_dtype_obj).cast(); ++ ++ if (array_dtype_repr.find("bfloat16") != std::string::npos) { ++ // HACK: Mindspore np weirdness ++ return AscendType::BF16; ++ } ++ ++ // Fallback to format string for other common dtypes ++ std::string format_str = arr.request().format; ++ if (format_str == "f" || format_str == "f4") { // float32 ++ return AscendType::FP32; ++ } else if (format_str == "h" || format_str == "f2") { // float16 ++ return AscendType::FP16; ++ } else if (format_str == "b" || format_str == "i1") { // <--- ADD THIS for signed 8-bit integer ++ return AscendType::INT8; ++ } ++ ++ throw std::runtime_error( ++ "Unsupported numpy dtype: " + format_str + ". 
Only float32, float16, and int8 are supported."); ++ } ++#endif ++} // namespace vllm_ascend + +diff --git a/lmcache/integration/vllm/vllm_v1_adapter.py b/lmcache/integration/vllm/vllm_v1_adapter.py +index 167b316..a0196df 100644 +--- a/lmcache/integration/vllm/vllm_v1_adapter.py ++++ b/lmcache/integration/vllm/vllm_v1_adapter.py +@@ -30,6 +30,7 @@ from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder + import torch + import vllm.envs as envs + import zmq ++import numpy as np + + # First Party + from lmcache.integration.vllm.utils import ( +@@ -85,10 +86,13 @@ class LMCacheLookupClient: + ) + + def lookup(self, token_ids: torch.Tensor) -> int: +- request = self.encoder.encode(token_ids) ++ # NOTE: the default encoder and decoder does not decode the mindspore tensor properly ++ # using numpy seems to work ++ request = self.encoder.encode(token_ids.asnumpy()) + self.socket.send_multipart(request, copy=False) + resp = self.socket.recv() + result = int.from_bytes(resp, "big") ++ logger.debug(f"Client Received Lookup Result: {result}") + return result + + def close(self): +@@ -103,7 +107,8 @@ class LMCacheLookupServer: + is_tp: bool, + vllm_config: "VllmConfig", + ): +- self.decoder = MsgpackDecoder(torch.Tensor) ++ # NOTE: Instead of mindspore tensor, let's do ndarray ++ self.decoder = MsgpackDecoder(np.ndarray) + self.ctx = zmq.Context() # type: ignore[attr-defined] + socket_path = get_zmq_rpc_path_lmcache(role, is_tp, vllm_config) + self.socket = make_zmq_socket( +@@ -118,15 +123,17 @@ class LMCacheLookupServer: + + def process_request(): + while self.running: +- # try: ++ try: + # request = self.socket.recv() +- frames = self.socket.recv_multipart(copy=False) +- token_ids = self.decoder.decode(frames) +- result = self.lmcache_engine.lookup(token_ids, pin=True) +- response = result.to_bytes(4, "big") +- self.socket.send(response) +- # except Exception as e: +- # logger.error("Error in LMCache lookup server: %s", e) ++ frames = self.socket.recv_multipart(copy=False) ++ token_ids = self.decoder.decode(frames) ++ result = self.lmcache_engine.lookup(token_ids, pin=True) ++ response = result.to_bytes(4, "big") ++ logger.debug(f"Lookup Result: {result}") ++ self.socket.send(response) ++ except Exception as e: ++ logger.error("Error in LMCache lookup server: %s", e) ++ raise e + # break + # continue + +@@ -507,12 +514,12 @@ class LMCacheConnectorV1Impl: + if request.load_spec is None: + continue + +- tokens = request.token_ids ++ tokens = request.token_ids.asnumpy() + # TODO: have a pre-allocated buffer to hold the slot_mappings + slot_mapping = request.slot_mapping.cuda() + assert len(tokens) == len(slot_mapping) + +- token_mask = torch.ones_like(tokens, dtype=torch.bool) ++ token_mask = np.ones_like(tokens, dtype=np.bool_) + masked_token_count = ( + request.load_spec.vllm_cached_tokens + // self._lmcache_chunk_size +@@ -728,7 +735,10 @@ class LMCacheConnectorV1Impl: + + token_ids = request.token_ids + assert isinstance(token_ids, torch.Tensor) +- assert token_ids.is_cpu ++ ++ # NOTE: Mindspore TorchAdapter is_cpu is always false ++ # assert token_ids.is_cpu ++ token_ids_np = token_ids.asnumpy() + + slot_mapping = request.slot_mapping + assert isinstance(slot_mapping, torch.Tensor) +@@ -746,7 +756,7 @@ class LMCacheConnectorV1Impl: + # ) + skip_leading_tokens = save_spec.skip_leading_tokens + +- if skip_leading_tokens == len(token_ids): ++ if skip_leading_tokens == len(token_ids_np): + continue # skip this request + # Align to lmcache chunk size + skip_leading_tokens = ( +@@ -755,19 
+765,19 @@ class LMCacheConnectorV1Impl: + * self._lmcache_chunk_size + ) + +- store_mask = torch.ones_like(token_ids, dtype=torch.bool) ++ store_mask = np.ones_like(token_ids_np, dtype=np.bool_) + store_mask[:skip_leading_tokens] = False + + logger.info( + "Storing KV cache for %d out of %d tokens " + "(skip_leading_tokens=%d) for request %s", +- len(token_ids) - skip_leading_tokens, +- len(token_ids), ++ len(token_ids_np) - skip_leading_tokens, ++ len(token_ids_np), + skip_leading_tokens, + request.req_id, + ) + self.lmcache_engine.store( +- token_ids, ++ token_ids_np, + mask=store_mask, + kvcaches=kvcaches, + slot_mapping=slot_mapping, +diff --git a/lmcache/observability.py b/lmcache/observability.py +index 874d9aa..76ff9e2 100644 +--- a/lmcache/observability.py ++++ b/lmcache/observability.py +@@ -695,9 +695,11 @@ class LMCacheStatsLogger: + self.log_interval = log_interval + self.monitor = LMCStatsMonitor.GetOrCreate() + self.prometheus_logger = PrometheusLogger.GetOrCreate(metadata) +- self.is_running = True + ++ self.is_running = True ++ + self.thread = threading.Thread(target=self.log_worker, daemon=True) ++ + self.thread.start() + + def log_worker(self): +@@ -708,4 +710,5 @@ class LMCacheStatsLogger: + + def shutdown(self): + self.is_running = False +- self.thread.join() ++ if self.thread.is_alive(): ++ self.thread.join() +diff --git a/lmcache/storage_backend/serde/cachegen_decoder.py b/lmcache/storage_backend/serde/cachegen_decoder.py +index e109ae7..b128d51 100644 +--- a/lmcache/storage_backend/serde/cachegen_decoder.py ++++ b/lmcache/storage_backend/serde/cachegen_decoder.py +@@ -28,7 +28,7 @@ from lmcache.storage_backend.serde.cachegen_basics import ( + ) + from lmcache.storage_backend.serde.serde import Deserializer + from lmcache.utils import _lmcache_nvtx_annotate +-import lmcache.c_ops as lmc_ops ++# import lmcache.c_ops as lmc_ops + import lmcache.storage_backend.serde.cachegen_basics as CGBasics + + logger = init_logger(__name__) +@@ -76,6 +76,7 @@ def decode_chunk( + Write the decode output in target_buffer + Expected shape: [nlayers (kv in total), ntokens, nchannels] + """ ++ raise NotImplementedError() + bytes_tensor = data_chunk.bytestream + length_prefsum = ( + data_chunk.bytestream_lengths.flatten() +diff --git a/lmcache/storage_backend/serde/cachegen_encoder.py b/lmcache/storage_backend/serde/cachegen_encoder.py +index da5fa90..3745410 100644 +--- a/lmcache/storage_backend/serde/cachegen_encoder.py ++++ b/lmcache/storage_backend/serde/cachegen_encoder.py +@@ -28,7 +28,7 @@ from lmcache.storage_backend.serde.cachegen_basics import ( + ) + from lmcache.storage_backend.serde.serde import Serializer + from lmcache.utils import _lmcache_nvtx_annotate +-import lmcache.c_ops as lmc_ops ++# import lmcache.c_ops as lmc_ops + import lmcache.storage_backend.serde.cachegen_basics as CGBasics + + logger = init_logger(__name__) +@@ -273,6 +273,7 @@ def encode_ntokens( + + :return byte_tensor: the byte tensor + """ ++ raise NotImplementedError() + lmc_ops.encode_fast_new( + cdf_int, + encode_input, +diff --git a/lmcache/utils.py b/lmcache/utils.py +index dfdb6e4..e8412e5 100644 +--- a/lmcache/utils.py ++++ b/lmcache/utils.py +@@ -17,12 +17,13 @@ from __future__ import annotations + + # Standard + from dataclasses import dataclass +-from typing import TYPE_CHECKING, List, Optional, Tuple ++from typing import TYPE_CHECKING, List, Optional, Tuple, Dict, Any + import hashlib + import threading ++import json + + # Third Party +-from nvtx import annotate # type: ignore ++# from nvtx 
import annotate # type: ignore + import torch + + if TYPE_CHECKING: +@@ -31,7 +32,16 @@ if TYPE_CHECKING: + + # Type definition + KVCache = Tuple[Tuple[torch.Tensor, torch.Tensor], ...] ++from lmcache.logging import init_logger + ++logger = init_logger(__name__) ++ ++HAS_MS_TYPE = False ++try: ++ from mindspore.common import np_dtype ++ HAS_MS_TYPE = True ++except ImportError as ie: ++ pass + + @dataclass + class DiskCacheMetadata: +@@ -54,20 +64,100 @@ class DiskCacheMetadata: + def is_pinned(self) -> bool: + return self.is_pin + ++ def to_dict(self) -> Dict[str, Any]: ++ """ ++ Converts the metadata to a JSON-serializable dictionary. ++ """ ++ global TORCH_DTYPE_TO_STR_DTYPE ++ return { ++ "path": self.path, ++ "size": self.size, ++ # Convert torch.Size to a list, which is JSON-safe ++ "shape": list(self.shape) if self.shape is not None else None, ++ # Convert torch.dtype to a string, which is JSON-safe ++ "dtype": TORCH_DTYPE_TO_STR_DTYPE[self.dtype] if self.dtype is not None else None, ++ } ++ ++ @classmethod ++ def from_dict(cls, data_dict: Dict[str, Any]) -> 'DiskCacheMetadata': ++ """ ++ Creates a DiskCacheMetadata instance from a dictionary. ++ """ ++ path = data_dict.get("path") ++ size = data_dict.get("size") ++ global TORCH_STR_TO_DTYPE ++ # Convert shape from a list back to torch.Size ++ shape_list = data_dict.get("shape") ++ shape = torch.Size(shape_list) if shape_list is not None else None ++ ++ # Convert dtype from a string back to torch.dtype using the safe map ++ dtype_str = data_dict.get("dtype") ++ dtype = TORCH_STR_TO_DTYPE[dtype_str] if dtype_str is not None else None ++ ++ return cls(path=path, size=size, shape=shape, dtype=dtype) + + TORCH_DTYPE_TO_STR_DTYPE = { + torch.half: "half", + torch.float16: "half", + torch.bfloat16: "bfloat16", + torch.float: "float", +- torch.float32: "float", +- torch.float64: "double", ++ torch.float32: "float32", ++ torch.float64: "float64", + torch.double: "double", + torch.uint8: "fp8", +- torch.float8_e4m3fn: "fp8_e4m3", +- torch.float8_e5m2: "fp8_e5m2", + } + ++TORCH_STR_TO_DTYPE = { ++ "half": torch.float16, ++ "bfloat16": torch.bfloat16, ++ "float": torch.float, ++ "float32": torch.float32, ++ "float64": torch.float64, ++ "double": torch.double, ++ "fp8": torch.uint8, ++} ++ ++def update_dtypes(): ++ global TORCH_DTYPE_TO_STR_DTYPE ++ global HAS_MS_TYPE ++ try: ++ TORCH_DTYPE_TO_STR_DTYPE.update({torch.float8_e4m3fn: "fp8_e4m3"}) ++ TORCH_DTYPE_TO_STR_DTYPE.update({torch.float8_e5m2: "float8_e5m2"}) ++ TORCH_STR_TO_DTYPE.update({"fp8_e4m3": torch.float8_e4m3fn}) ++ TORCH_STR_TO_DTYPE.update({"float8_e5m2": torch.float8_e5m2}) ++ except AttributeError as ae: ++ if not HAS_MS_TYPE: ++ logger.error("Unable to update dtype: ", ae) ++ raise ae ++ else: ++ logger.warn("not using dtypes: torch.float8_e4m3fn and torch.float8_e5m2") ++ pass ++ ++ if HAS_MS_TYPE: ++ try: ++ TORCH_DTYPE_TO_STR_DTYPE.update({np_dtype.bfloat16: "np_bfloat16"}) ++ TORCH_STR_TO_DTYPE.update({"np_bfloat16": np_dtype.bfloat16}) ++ except AttributeError as ae: ++ logger.error("Unable to update dtype: ", ae) ++ pass ++ ++update_dtypes() ++ ++class TorchEncoderWithDecoderHook(json.JSONEncoder): ++ def default(self, obj): ++ if obj is not None and isinstance(obj, torch.dtype): ++ return TORCH_DTYPE_TO_STR_DTYPE[obj] ++ return json.JSONEncoder.default(self, obj) ++ ++ @staticmethod ++ def decode_hook(dct): ++ new_dct = {} ++ for key, value in dct.items(): ++ if isinstance(value, str) and value in TORCH_STR_TO_DTYPE: ++ new_dct[key] = TORCH_STR_TO_DTYPE[value] ++ else: 
++ new_dct[key] = value ++ return new_dct + + @dataclass(order=True) + class CacheEngineKey: +@@ -89,8 +179,9 @@ class CacheEngineKey: + ) + + def to_string(self): ++ model_name = self.model_name.replace("-", "#") + return ( +- f"{self.fmt}@{self.model_name}@{self.world_size}" ++ f"{self.fmt}@{model_name}@{self.world_size}" + f"@{self.worker_id}@{self.chunk_hash}" + ) + +@@ -127,8 +218,9 @@ class CacheEngineKey: + parts = s.split("@") + if len(parts) != 5: + raise ValueError(f"Invalid key string: {s}") ++ model_name = parts[1].replace("#", "-") + return CacheEngineKey( +- parts[0], parts[1], int(parts[2]), int(parts[3]), parts[4] ++ parts[0], model_name, int(parts[2]), int(parts[3]), parts[4] + ) + + def to_dict(self): +@@ -219,14 +311,9 @@ def _get_color_for_nvtx(name): + idx = hash_value % len(_NVTX_COLORS) + return _NVTX_COLORS[idx] + +- + def _lmcache_nvtx_annotate(func, domain="lmcache"): + """Decorator for applying nvtx annotations to methods in lmcache.""" +- return annotate( +- message=func.__qualname__, +- color=_get_color_for_nvtx(func.__qualname__), +- domain=domain, +- )(func) ++ return (func) + + + ##### Observability Threading related ##### +diff --git a/lmcache/v1/_tensor.py b/lmcache/v1/_tensor.py +new file mode 100644 +index 0000000..a7a2525 +--- /dev/null ++++ b/lmcache/v1/_tensor.py +@@ -0,0 +1,85 @@ ++""" ++ This file contain methods that can be used to get information about a torch.Tensor or np.ndarray. ++""" ++import torch ++import numpy as np ++from typing import Union ++import ctypes ++ ++USE_MS = False ++ ++try: ++ import mindspore as ms ++ from mindspore.common import np_dtype ++ ++ USE_MS = True ++except: ++ pass ++ ++ ++MS_DTYPE_SIZE = {} ++ ++def get_dtype_compat(dtype: torch.dtype): ++ global USE_MS ++ if USE_MS and isinstance(dtype, ms.dtype.Type): ++ return ms.dtype_to_nptype(dtype) ++ return dtype ++ ++ ++def get_itemsize(dtype: torch.dtype): ++ """FIXME: is there a better way to do this ? 
:/""" ++ m = getattr(dtype, "itemsize", None) ++ global USE_MS ++ global MS_DTYPE_SIZE ++ if USE_MS: ++ if m is None: ++ # we are probably at mindspore ++ if dtype in MS_DTYPE_SIZE: ++ return MS_DTYPE_SIZE[dtype] ++ tmp = ms.Tensor([1.0], dtype=dtype) ++ MS_DTYPE_SIZE[dtype] = tmp.itemsize ++ m = MS_DTYPE_SIZE[dtype] ++ elif dtype == np_dtype.bfloat16: ++ # np does not have bfloat16 ++ return 2 ++ return m ++ ++ ++def get_data_ptr(tensor: Union[torch.Tensor, np.ndarray]): ++ """Get the data pointer of a torch.Tensor or np.ndarray.""" ++ if isinstance(tensor, torch.Tensor): ++ return tensor.data_ptr() ++ elif isinstance(tensor, np.ndarray): ++ return tensor.ctypes.data_as(ctypes.c_void_p).value ++ else: ++ raise TypeError(f"Expected torch.Tensor or np.ndarray, got {type(tensor)}") ++ ++ ++def get_numel(tensor: Union[torch.Tensor, np.ndarray, torch.Size]): ++ """Get the number of elements in a torch.Tensor or np.ndarray.""" ++ if isinstance(tensor, torch.Tensor): ++ return tensor.numel() ++ elif isinstance(tensor, np.ndarray): ++ return tensor.size ++ elif isinstance(tensor, torch.Size): ++ return torch.numel(tensor) ++ else: ++ raise TypeError(f"Expected torch.Tensor or np.ndarray, got {type(tensor)}") ++ ++def get_element_size(tensor: Union[torch.Tensor, np.ndarray]): ++ """Get the size of each element in a torch.Tensor or np.ndarray.""" ++ if isinstance(tensor, torch.Tensor): ++ return tensor.elemenet_size() ++ elif isinstance(tensor, np.ndarray): ++ return tensor.itemsize ++ else: ++ raise TypeError(f"Expected torch.Tensor or np.ndarray, got {type(tensor)}") ++ ++def view_and_shape(tensor: Union[torch.Tensor, np.ndarray], dtype: torch.dtype, shape: torch.Size): ++ """Get the view and shape of a torch.Tensor or np.ndarray.""" ++ if isinstance(tensor, torch.Tensor): ++ return tensor.view(dtype).view(shape) ++ elif isinstance(tensor, np.ndarray): ++ return tensor.reshape(-1).view(dtype).reshape(shape) ++ else: ++ raise TypeError(f"Expected torch.Tensor or np.ndarray, got {type(tensor)}") +diff --git a/lmcache/v1/cache_engine.py b/lmcache/v1/cache_engine.py +index 349e6f6..93e3047 100644 +--- a/lmcache/v1/cache_engine.py ++++ b/lmcache/v1/cache_engine.py +@@ -25,7 +25,6 @@ import torch + from lmcache.config import LMCacheEngineMetadata + from lmcache.logging import init_logger + from lmcache.observability import LMCacheStatsLogger, LMCStatsMonitor +-from lmcache.usage_context import InitializeUsageContext + from lmcache.utils import CacheEngineKey, _lmcache_nvtx_annotate + from lmcache.v1.config import LMCacheEngineConfig + from lmcache.v1.distributed_server import ( +@@ -51,7 +50,8 @@ from lmcache.v1.token_database import ( + SegmentTokenDatabase, + TokenDatabase, + ) +- ++import time ++import numpy as np + logger = init_logger(__name__) + + +@@ -140,15 +140,14 @@ class LMCacheEngine: + else: + self.fmt = MemoryFormat.KV_T2D + +- InitializeUsageContext(config.to_original_config(), metadata) + self.stats_monitor = LMCStatsMonitor.GetOrCreate() + + @_lmcache_nvtx_annotate + @torch.inference_mode() + def store( + self, +- tokens: torch.Tensor, +- mask: Optional[torch.Tensor] = None, ++ tokens: np.ndarray, ++ mask: Optional[np.ndarray] = None, + **kwargs, + ) -> None: + """Store the tokens and mask into the cache engine. 
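A worked numpy example of the raw-buffer reinterpretation that view_and_shape() performs on its numpy branch: a flat uint8 backing buffer is viewed as the target dtype and reshaped, and writes go through to the backing store. Sizes and values are illustrative.

import numpy as np

raw = np.zeros(4 * 2 * 2, dtype=np.uint8)        # 16 bytes of backing store
typed = raw.reshape(-1).view(np.float16).reshape((4, 2))
assert typed.shape == (4, 2) and typed.dtype == np.float16
typed[0, 0] = 1.5                                # writes through to `raw`
assert raw[:2].any()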
+@@ -170,9 +169,9 @@ class LMCacheEngine: + """ + + if mask is not None: +- num_stored_tokens = torch.sum(mask).item() ++ num_stored_tokens = int(np.sum(mask)) + else: +- num_stored_tokens = len(tokens) ++ num_stored_tokens = tokens.size + monitor_req_id = self.stats_monitor.on_store_request(num_stored_tokens) + + starts = [] +@@ -343,10 +342,10 @@ class LMCacheEngine: + @torch.inference_mode() + def retrieve( + self, +- tokens: torch.Tensor, +- mask: Optional[torch.Tensor] = None, ++ tokens: np.ndarray, ++ mask: Optional[np.ndarray] = None, + **kwargs, +- ) -> torch.Tensor: ++ ) -> np.ndarray: + """Retrieve the KV caches from the cache engine. And put the retrieved + KV cache to the serving engine via the GPU connector. + +@@ -369,17 +368,18 @@ class LMCacheEngine: + multiple of the chunk size. + """ + if mask is not None: +- num_required_tokens = torch.sum(mask).item() ++ num_required_tokens = int(np.sum(mask)) + else: +- num_required_tokens = len(tokens) ++ num_required_tokens = tokens.size + monitor_req_id = self.stats_monitor.on_retrieve_request(num_required_tokens) + +- ret_mask = torch.zeros_like(tokens, dtype=torch.bool, device="cpu") ++ ret_mask = np.zeros(tokens.shape, dtype=np.bool_) ++ + for start, end, key in self.token_database.process_tokens(tokens, mask): + assert isinstance(key, CacheEngineKey) + + # Get the memory object from the storage backend +- memory_obj = self.storage_manager.get(key) ++ memory_obj = self.storage_manager.get(key, shape=self.metadata.kv_shape, dtype=self.metadata.kv_dtype) + + if memory_obj is None: + if self.enable_p2p: +@@ -397,7 +397,10 @@ class LMCacheEngine: + # cpu tensor for the sake of performance. + # For example, disk->gpu is faster than disk->cpu->gpu. + # RDMA is another example. ++ # t1 = time.perf_counter() + self.gpu_connector.to_gpu(memory_obj, start, end, **kwargs) ++ # t2 = time.perf_counter() ++ # print("To gpu: ", t2-t1) + memory_obj.ref_count_down() + + # NOTE (ApostaC): This is only for the current implementation: +@@ -408,7 +411,7 @@ class LMCacheEngine: + else: + self.storage_manager.batched_unpin([key]) + +- retrieved_tokens = torch.sum(ret_mask) ++ retrieved_tokens = np.sum(ret_mask) + self.stats_monitor.on_retrieve_finished(monitor_req_id, retrieved_tokens) + logger.debug( + f"Retrieved {retrieved_tokens} " +@@ -546,7 +549,7 @@ class LMCacheEngine: + @_lmcache_nvtx_annotate + def lookup( + self, +- tokens: Union[torch.Tensor, List[int]], ++ tokens: Union[torch.Tensor, List[int], np.ndarray], + search_range: Optional[List[str]] = None, + pin: bool = False, + ) -> int: +@@ -643,6 +646,7 @@ class LMCacheEngineBuilder: + def _Create_memory_allocator( + config: LMCacheEngineConfig, + metadata: LMCacheEngineMetadata, ++ framework: str = "torch", + ) -> MemoryAllocatorInterface: + if config.enable_nixl: + assert config.nixl_buffer_device is not None +@@ -653,7 +657,7 @@ class LMCacheEngineBuilder: + return CuFileMemoryAllocator(config.cufile_buffer_size * 1024**2) + + max_local_cpu_size = config.max_local_cpu_size +- return MixedMemoryAllocator(int(max_local_cpu_size * 1024**3)) ++ return MixedMemoryAllocator(int(max_local_cpu_size * 1024**3), framework) + + @staticmethod + def _Create_token_database( +@@ -671,6 +675,7 @@ class LMCacheEngineBuilder: + config: LMCacheEngineConfig, + metadata: LMCacheEngineMetadata, + gpu_connector: GPUConnectorInterface, ++ framework: str = "torch", + ) -> LMCacheEngine: + """ + Builds a new LMCacheEngine instance if it doesn't already exist for the +@@ -681,7 +686,7 @@ class LMCacheEngineBuilder: 
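A short numpy illustration of the mask handling these store()/retrieve() hunks switch to: int(np.sum(mask)) replaces torch.sum(mask).item(), and the retrieve-side mask starts all-False and is flipped as chunks are found. Token values are made up.

import numpy as np

tokens = np.arange(16, dtype=np.int64)           # stand-in token ids
mask = np.ones_like(tokens, dtype=np.bool_)
mask[:4] = False                                 # 4 leading tokens skipped
assert int(np.sum(mask)) == 12                   # tokens actually stored

ret_mask = np.zeros(tokens.shape, dtype=np.bool_)
ret_mask[:8] = True                              # pretend the first chunk hit
assert int(np.sum(ret_mask)) == 8                # tokens retrieved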
+ """ + logger.info(f"Creating LMCacheEngine instance {instance_id}") + if instance_id not in cls._instances: +- memory_allocator = cls._Create_memory_allocator(config, metadata) ++ memory_allocator = cls._Create_memory_allocator(config, metadata, framework) + token_database = cls._Create_token_database(config, metadata) + stat_logger = LMCacheStatsLogger(metadata, log_interval=10) + +diff --git a/lmcache/v1/config.py b/lmcache/v1/config.py +index d27696d..54378ff 100644 +--- a/lmcache/v1/config.py ++++ b/lmcache/v1/config.py +@@ -49,6 +49,7 @@ class LMCacheEngineConfig: + # value even if local_cpu is disabled + local_disk: Optional[str] + max_local_disk_size: float # in GB ++ use_fuse_driver: bool + + remote_url: Optional[str] + remote_serde: Optional[str] # Can be "naive" or "cachegen" +@@ -101,6 +102,10 @@ class LMCacheEngineConfig: + # The url of the actual remote lmcache instance for auditing + audit_actual_remote_url: Optional[str] = None + ++ # Path of falconfs ++ workspace: Optional[str] = None ++ config_path: Optional[str] = None ++ + # The path under the WekaFS mount that the cache will be stored + weka_path: Optional[str] = None + # (Optional) The path under the File-based backend cache will be stored +@@ -127,6 +132,7 @@ class LMCacheEngineConfig: + max_local_cpu_size: float = 5.0, + local_disk: Optional[str] = None, + max_local_disk_size: int = 0, ++ use_fuse_driver: bool = False, + remote_url: Optional[str] = "lm://localhost:65432", + remote_serde: Optional[str] = "naive", + use_layerwise: bool = False, +@@ -151,6 +157,8 @@ class LMCacheEngineConfig: + nixl_buffer_device: Optional[str] = None, + nixl_enable_gc: Optional[bool] = False, + audit_actual_remote_url: Optional[str] = None, ++ workspace: Optional[str] = None, ++ config_path: Optional[str] = None, + weka_path: Optional[str] = None, + gds_path: Optional[str] = None, + cufile_buffer_size: Optional[int] = None, +@@ -165,6 +173,7 @@ class LMCacheEngineConfig: + max_local_cpu_size, + local_disk, + max_local_disk_size, ++ use_fuse_driver, + remote_url, + remote_serde, + use_layerwise, +@@ -189,6 +198,8 @@ class LMCacheEngineConfig: + nixl_buffer_device, + nixl_enable_gc, + audit_actual_remote_url, ++ workspace, ++ config_path, + weka_path, + gds_path, + cufile_buffer_size, +@@ -229,6 +240,7 @@ class LMCacheEngineConfig: + local_disk = "/local/disk_test/local_disk/" + max_local_disk_size = 5 + remote_url = None ++ use_fuse_driver = False + elif backend == "local_cpu_disk": + local_cpu = True + max_local_cpu_size = 5 +@@ -262,6 +274,7 @@ class LMCacheEngineConfig: + max_local_cpu_size=max_local_cpu_size, + local_disk=local_disk, + max_local_disk_size=max_local_disk_size, ++ use_fuse_driver=False, + remote_url=remote_url, + remote_serde=remote_serde, + use_layerwise=use_layerwise, +@@ -295,6 +308,7 @@ class LMCacheEngineConfig: + + local_disk = config.get("local_disk", None) + max_local_disk_size = config.get("max_local_disk_size", 5) ++ use_fuse_driver = config.get("use_fuse_driver", False) + + remote_url = config.get("remote_url", None) + remote_serde = config.get("remote_serde", "naive") +@@ -352,6 +366,9 @@ class LMCacheEngineConfig: + + audit_actual_remote_url = config.get("audit_actual_remote_url", None) + ++ workspace = config.get("workspace", None) ++ config_path = config.get("config_path", None) ++ + weka_path = config.get("weka_path", None) + gds_path = config.get("gds_path", None) + cufile_buffer_size = config.get("cufile_buffer_size", None) +@@ -370,6 +387,13 @@ class LMCacheEngineConfig: + case _: + raise 
ValueError(f"Invalid remote storage url: {remote_url}") + ++ match workspace: ++ case None: ++ print("----------------------workspace is None----------------------") ++ case path if re.match(r"/(.*)/", ++ path): # local disk directory ++ print(f"----------------------workspace is {workspace}----------------------") ++ + return ( + LMCacheEngineConfig( + chunk_size, +@@ -377,6 +401,7 @@ class LMCacheEngineConfig: + max_local_cpu_size, + local_disk_path, + max_local_disk_size, ++ use_fuse_driver, + remote_url, + remote_serde, + use_layerwise, +@@ -401,6 +426,8 @@ class LMCacheEngineConfig: + nixl_buffer_device, + nixl_enable_gc, + audit_actual_remote_url, ++ workspace, ++ config_path, + weka_path, + gds_path, + cufile_buffer_size, +@@ -469,6 +496,8 @@ class LMCacheEngineConfig: + config.max_local_disk_size = to_float( + parse_env(get_env_name("max_local_disk_size"), config.max_local_disk_size) + ) ++ config.use_fuse_driver = parse_env(get_env_name("use_fuse_driver"), ++ config.use_fuse_driver) + config.remote_url = parse_env(get_env_name("remote_url"), config.remote_url) + config.remote_serde = parse_env( + get_env_name("remote_serde"), config.remote_serde +@@ -573,6 +602,14 @@ class LMCacheEngineConfig: + config.audit_actual_remote_url, + ) + ++ config.workspace = parse_env( ++ get_env_name("workspace"), ++ config.workspace) ++ ++ config.workspace = parse_env( ++ get_env_name("config_path"), ++ config.config_path) ++ + config.weka_path = parse_env( + get_env_name("weka_path"), + config.weka_path, +@@ -679,6 +716,8 @@ class LMCacheEngineConfig: + "extra_config": self.extra_config, + "save_unfull_chunk": self.save_unfull_chunk, + "blocking_timeout_secs": self.blocking_timeout_secs, ++ "workspace": self.workspace, ++ "config_path": self.config_path, + } + logger.info(f"LMCache Configuration: {config_dict}") + +diff --git a/lmcache/v1/envs.py b/lmcache/v1/envs.py +new file mode 100644 +index 0000000..ec5e788 +--- /dev/null ++++ b/lmcache/v1/envs.py +@@ -0,0 +1,6 @@ ++import os ++from functools import lru_cache ++ ++@lru_cache(maxsize=10) ++def is_disable_custom_ops(): ++ return bool(int(os.getenv("DISABLE_CUSTOM_OPS", "0"))) +\ No newline at end of file +diff --git a/lmcache/v1/gpu_connector.py b/lmcache/v1/gpu_connector.py +index 3998f33..ed8c519 100644 +--- a/lmcache/v1/gpu_connector.py ++++ b/lmcache/v1/gpu_connector.py +@@ -26,10 +26,15 @@ from lmcache.utils import _lmcache_nvtx_annotate + from lmcache.v1.compute.blend.utils import LMCBlenderBuilder + from lmcache.v1.memory_management import GPUMemoryAllocator # noqa: E501 + from lmcache.v1.memory_management import MemoryFormat, MemoryObj +-import lmcache.c_ops as lmc_ops ++# import lmcache.c_ops as lmc_ops + + logger = init_logger(__name__) + ++try: ++ from lmcache import lmcache_C ++ _USE_LMC_OPS = True ++except ImportError as ie: ++ logger.error("Not using lmcache_C ops") + + class GPUConnectorInterface(metaclass=abc.ABCMeta): + @abc.abstractmethod +@@ -321,13 +326,21 @@ class VLLMPagedMemGPUConnectorV2(GPUConnectorInterface): + self.kv_cache_pointers = torch.empty( + num_layers, dtype=torch.int64, device="cpu" + ) +- # Not sure we need a dict here. Maybe a single GPU connector always +- # works with a single device? 
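A small usage sketch for the DISABLE_CUSTOM_OPS switch added in lmcache/v1/envs.py: the variable must be set before the first call, because lru_cache memoizes the answer. The helper is reproduced here only so the snippet runs on its own.

import os
from functools import lru_cache

@lru_cache(maxsize=10)
def is_disable_custom_ops() -> bool:             # mirrors lmcache/v1/envs.py
    return bool(int(os.getenv("DISABLE_CUSTOM_OPS", "0")))

os.environ["DISABLE_CUSTOM_OPS"] = "1"           # must happen before first call
assert is_disable_custom_ops() is True           # cached from here on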
+- self.kv_cache_pointers_on_gpu: dict[int, torch.Tensor] = {} ++ ++ self.use_mla = "use_mla" in kwargs and kwargs["use_mla"] ++ kvsize = 1 if self.use_mla else 2 ++ ++ # NOTE: Mindspore has no index from the torch tensor API ++ # will use mindspore.hal in the future, ++ # so we only use 1 tensor and manage for its own device ++ self.kv_cache_pointers_on_gpu = torch.empty([num_layers, kvsize], ++ dtype=torch.int64, ++ device='Ascend', ++ pin_memory=True) ++ self._initialized_kv_ptrs = False + self.page_buffer_size = 0 + + self.gpu_buffer: Optional[torch.Tensor] = None +- self.use_mla = "use_mla" in kwargs and kwargs["use_mla"] + if use_gpu: + assert "chunk_size" in kwargs, ( + "chunk_size should be provided to create a GPU buffer." +@@ -342,25 +355,24 @@ class VLLMPagedMemGPUConnectorV2(GPUConnectorInterface): + ) + + def _initialize_pointers(self, kv_caches: List[torch.Tensor]) -> torch.Tensor: +- self.kv_cache_pointers.numpy()[:] = [t.data_ptr() for t in kv_caches] +- device = kv_caches[0].device +- assert device.type == "cuda", "The device should be CUDA." +- idx = device.index +- if idx not in self.kv_cache_pointers_on_gpu: +- self.kv_cache_pointers_on_gpu[idx] = torch.empty( +- self.num_layers, dtype=torch.int64, device=device +- ) +- self.kv_cache_pointers_on_gpu[idx].copy_(self.kv_cache_pointers) +- if self.use_mla: +- # kv_caches[0].shape: [num_pages, page_size, head_size] +- assert kv_caches[0].dim() == 3 +- self.page_buffer_size = kv_caches[0].shape[0] * kv_caches[0].shape[1] +- else: +- # kv_caches[0].shape: [2, num_pages, page_size, num_heads, head_size] +- assert kv_caches[0].dim() == 5 +- self.page_buffer_size = kv_caches[0].shape[1] * kv_caches[0].shape[2] +- +- return self.kv_cache_pointers_on_gpu[idx] ++ ++ logger.debug(f"{len(kv_caches[0])}, {kv_caches[0][0].shape}, {kv_caches[0][1].shape}") ++ ++ if self._initialized_kv_ptrs: ++ return ++ ++ kv_cache_pointers_cpus = torch.Tensor([[kvc.data_ptr() for kvc in layer_kv ]for layer_kv in kv_caches]) ++ self.kv_cache_pointers_on_gpu.copy_(kv_cache_pointers_cpus, non_blocking=False) ++ torch.cuda.synchronize() ++ ++ self._initialized_kv_ptrs = True ++ ++ # MINDSPORE store k and v separately ++ self.page_buffer_size = kv_caches[0][0].shape[0] * kv_caches[0][0].shape[1] ++ self.num_pages = kv_caches[0][0].shape[0] ++ self.page_size = kv_caches[0][0].shape[1] ++ self.kv_size = kv_cache_pointers_cpus.shape[1] ++ return self.kv_cache_pointers_on_gpu + + @_lmcache_nvtx_annotate + def to_gpu(self, memory_obj: MemoryObj, start: int, end: int, **kwargs): +@@ -404,17 +416,28 @@ class VLLMPagedMemGPUConnectorV2(GPUConnectorInterface): + kvcaches: List[torch.Tensor] = kwargs["kvcaches"] + slot_mapping: torch.Tensor = kwargs["slot_mapping"] + +- kv_cache_pointers = self._initialize_pointers(kvcaches) +- +- lmc_ops.multi_layer_kv_transfer( +- memory_obj.tensor, +- kv_cache_pointers, +- slot_mapping[start:end], +- kvcaches[0].device, +- self.page_buffer_size, +- False, +- self.use_mla, +- ) ++ self._initialize_pointers(kvcaches) ++ ++ slot_mapping_range = slot_mapping[start:end] ++ lmcache_C.paged_layers_kv_transfer(memory_obj.tensor, ++ self.kv_cache_pointers_on_gpu, ++ slot_mapping_range, ++ memory_obj.base_ptr, ++ self.num_pages, ++ self.page_size, ++ self.kv_size, ++ False, ++ True) ++ ++ # lmc_ops.multi_layer_kv_transfer( ++ # memory_obj.tensor, ++ # kv_cache_pointers, ++ # slot_mapping[start:end], ++ # kvcaches[0].device, ++ # self.page_buffer_size, ++ # False, ++ # self.use_mla, ++ # ) + + @_lmcache_nvtx_annotate + def from_gpu(self, 
memory_obj: MemoryObj, start: int, end: int, **kwargs): +@@ -446,38 +469,49 @@ class VLLMPagedMemGPUConnectorV2(GPUConnectorInterface): + kvcaches: List[torch.Tensor] = kwargs["kvcaches"] + slot_mapping: torch.Tensor = kwargs["slot_mapping"] + +- kv_cache_pointers = self._initialize_pointers(kvcaches) +- +- if self.gpu_buffer is None or end - start != self.gpu_buffer.shape[2]: +- lmc_ops.multi_layer_kv_transfer( +- memory_obj.tensor, +- kv_cache_pointers, +- slot_mapping[start:end], +- kvcaches[0].device, +- self.page_buffer_size, +- True, +- self.use_mla, +- ) +- else: +- # kvcaches -> gpu_buffer -> memobj +- assert self.gpu_buffer.device == kvcaches[0].device +- tmp_gpu_buffer = self.gpu_buffer[:, :, : end - start, :] +- lmc_ops.multi_layer_kv_transfer( +- tmp_gpu_buffer, +- kv_cache_pointers, +- slot_mapping[start:end], +- kvcaches[0].device, +- self.page_buffer_size, +- True, +- self.use_mla, +- ) +- memory_obj.tensor.copy_(tmp_gpu_buffer, non_blocking=True) +- +- if not memory_obj.tensor.is_cuda: +- # Force a synchronize if the target buffer is NOT CUDA device +- # NOTE: for better performance, we may not want to sync for every +- # memory object +- torch.cuda.synchronize() ++ self._initialize_pointers(kvcaches) ++ ++ slot_mapping_range = slot_mapping[start:end] ++ lmcache_C.paged_layers_kv_transfer(memory_obj.tensor, ++ self.kv_cache_pointers_on_gpu, ++ slot_mapping_range, ++ memory_obj.base_ptr, ++ self.num_pages, ++ self.page_size, ++ self.kv_size, ++ True, ++ True) ++ # if self.gpu_buffer is None or end - start != self.gpu_buffer.shape[2]: ++ # lmc_ops.multi_layer_kv_transfer( ++ # memory_obj.tensor, ++ # kv_cache_pointers, ++ # slot_mapping[start:end], ++ # kvcaches[0].device, ++ # self.page_buffer_size, ++ # True, ++ # self.use_mla, ++ # ) ++ # else: ++ # # kvcaches -> gpu_buffer -> memobj ++ # assert self.gpu_buffer.device == kvcaches[0].device ++ # tmp_gpu_buffer = self.gpu_buffer[:, :, : end - start, :] ++ # lmc_ops.multi_layer_kv_transfer( ++ # tmp_gpu_buffer, ++ # kv_cache_pointers, ++ # slot_mapping[start:end], ++ # kvcaches[0].device, ++ # self.page_buffer_size, ++ # True, ++ # self.use_mla, ++ # ) ++ # memory_obj.tensor.copy_(tmp_gpu_buffer, non_blocking=True) ++ ++ # NOTE: mindspore memory obj is a numpy tensor ++ # if not memory_obj.tensor.is_cuda: ++ # # Force a synchronize if the target buffer is NOT CUDA device ++ # # NOTE: for better performance, we may not want to sync for every ++ # # memory object ++ # torch.cuda.synchronize() + + if self.use_mla: + memory_obj.metadata.fmt = MemoryFormat.KV_MLA_FMT +diff --git a/lmcache/v1/memory_management.py b/lmcache/v1/memory_management.py +index 44958d4..a55820f 100644 +--- a/lmcache/v1/memory_management.py ++++ b/lmcache/v1/memory_management.py +@@ -20,6 +20,7 @@ from typing import List, Optional, Tuple, Union + import abc + import ctypes + import threading ++import numpy as np + + # Third Party + import sortedcontainers +@@ -29,6 +30,15 @@ import torch + from lmcache.logging import init_logger + from lmcache.observability import LMCStatsMonitor + from lmcache.utils import _lmcache_nvtx_annotate ++from .pin_buffer import create_pin_memory ++from ._tensor import ( ++ get_data_ptr, ++ get_numel, ++ get_element_size, ++ get_itemsize, ++ get_dtype_compat, ++ view_and_shape ++) + + logger = init_logger(__name__) + +@@ -109,12 +119,12 @@ class MemoryObjMetadata: + """ + Calculate the size of the memory object in bytes + """ +- if self.shape.numel() == 0: ++ num_elements = get_numel(self.shape) ++ if num_elements == 0: + return 0 
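An illustrative sketch of the pointer table the connector hunks above now hand to paged_layers_kv_transfer: one int64 row per layer, one column per K/V tensor, filled with data_ptr() values. Plain CPU tensors stand in for the real Ascend KV caches; shapes are made up.

import torch

num_layers, kv_size = 2, 2
kv_caches = [(torch.zeros(4, 16), torch.zeros(4, 16))   # (K, V) per layer
             for _ in range(num_layers)]

ptr_table = torch.tensor(
    [[kvc.data_ptr() for kvc in layer_kv] for layer_kv in kv_caches],
    dtype=torch.int64,
)
assert ptr_table.shape == (num_layers, kv_size)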
+ if self.dtype is None: + return 0 +- num_elements = self.shape.numel() +- element_size = self.dtype.itemsize ++ element_size = get_itemsize(self.dtype) + size_in_bytes = num_elements * element_size + return size_in_bytes + +@@ -236,6 +246,10 @@ class MemoryObj(metaclass=abc.ABCMeta): + """ + raise NotImplementedError + ++ @abc.abstractproperty ++ def framework(self): ++ raise NotImplementedError ++ + @property + @abc.abstractmethod + def metadata(self) -> MemoryObjMetadata: +@@ -268,6 +282,10 @@ class MemoryObj(metaclass=abc.ABCMeta): + """ + raise NotImplementedError + ++ @property ++ @abc.abstractmethod ++ def base_ptr(self) -> int: ++ raise NotImplementedError + + class TensorMemoryObj(MemoryObj): + """ +@@ -279,13 +297,16 @@ class TensorMemoryObj(MemoryObj): + raw_data: torch.Tensor, + metadata: MemoryObjMetadata, + parent_allocator: Optional["MemoryAllocatorInterface"] = None, ++ base_ptr: int = None, ++ framework: str = None + ): + self.raw_data = raw_data + self.meta = metadata + self.valid = True + self.lock = threading.Lock() + self.parent_allocator = parent_allocator +- ++ self._base_ptr = base_ptr ++ self._framework = framework + def invalidate(self): + self.valid = False + +@@ -293,8 +314,8 @@ class TensorMemoryObj(MemoryObj): + return self.valid + + def get_size(self) -> int: +- num_elements = self.raw_data.numel() +- element_size = self.raw_data.element_size() ++ num_elements = get_numel(self.raw_data) ++ element_size = get_element_size(self.raw_data) + size_in_bytes = num_elements * element_size + return size_in_bytes + +@@ -336,26 +357,28 @@ class TensorMemoryObj(MemoryObj): + def unpin(self) -> bool: + self.metadata.is_pin = False + return True +- ++ ++ def framework(self): ++ return self._framework + @property + def metadata(self) -> MemoryObjMetadata: + with self.lock: + return self.meta + + @property +- def tensor(self) -> Optional[torch.Tensor]: ++ def tensor(self) -> Optional[Union[torch.Tensor, np.ndarray]]: + if not self.valid: + logger.warning("Trying to access an invalidated MemoryObj") + return None + assert self.meta.dtype is not None +- return self.raw_data.view(self.meta.dtype).view(self.meta.shape) ++ return view_and_shape(self.raw_data, self.meta.dtype, self.meta.shape) + + @property + def byte_array(self) -> bytes: + kv_chunk = self.tensor + assert kv_chunk is not None +- num_bytes = kv_chunk.numel() * kv_chunk.element_size() +- ptr = kv_chunk.data_ptr() ++ num_bytes = get_numel(kv_chunk) * get_element_size(kv_chunk) ++ ptr = get_data_ptr(kv_chunk) + ubyte_ptr = ctypes.cast(ptr, ctypes.POINTER(ctypes.c_ubyte)) + byte_array = (ctypes.c_ubyte * num_bytes).from_address( + ctypes.addressof(ubyte_ptr.contents) +@@ -366,6 +389,9 @@ class TensorMemoryObj(MemoryObj): + def is_pinned(self) -> bool: + return self.metadata.is_pin + ++ @property ++ def base_ptr(self) -> int: ++ return self._base_ptr + + class BytesBufferMemoryObj(MemoryObj): + """ +@@ -446,6 +472,9 @@ class BytesBufferMemoryObj(MemoryObj): + def is_pinned(self) -> bool: + return self.metadata.is_pin + ++ @property ++ def base_ptr(self) -> int: ++ return 0 + + class MemoryAllocatorInterface(metaclass=abc.ABCMeta): + @abc.abstractmethod +@@ -521,13 +550,20 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + + ALIGN_BYTES = 512 + +- def __init__(self, tensor: torch.Tensor, align_bytes: int = ALIGN_BYTES): +- self.buffer = tensor.view(torch.uint8).flatten() ++ def __init__(self, tensor: Union[torch.Tensor, np.ndarray], align_bytes: int = ALIGN_BYTES): ++ # NOTE (Gingfung:) changed to reshape to 
enable ++ assert self._is_uint8_type(tensor) ++ self.buffer = tensor.reshape(-1) ++ self._base_ptr = get_data_ptr(self.buffer) ++ + self.align_bytes = align_bytes + + self.explicit_list = sortedcontainers.SortedList(key=lambda x: x.start) + +- self.explicit_list.add(FreeBlock(start=0, size=self.buffer.numel())) ++ self._buffer_is_np = isinstance(tensor, np.ndarray) ++ el_size = get_numel(self.buffer) ++ ++ self.explicit_list.add(FreeBlock(start=0, size=el_size)) + + # For debugging purposes + self.num_active_allocations = 0 +@@ -535,16 +571,28 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + + self.stats_monitor = LMCStatsMonitor.GetOrCreate() + ++ @property ++ def framework(self): ++ return "Mindspore" if self._buffer_is_np else "Torch" ++ + @staticmethod + @_lmcache_nvtx_annotate + def _Compute_raw_size(shape: torch.Size, dtype: torch.dtype) -> int: +- return shape.numel() * dtype.itemsize ++ return shape.numel() * get_itemsize(dtype) + + @staticmethod + @_lmcache_nvtx_annotate + def _Compute_aligned_size(raw_size: int, align: int) -> int: + return (raw_size + align - 1) & ~(align - 1) + ++ def _is_uint8_type(self, tensor: Union[torch.Tensor, np.ndarray]): ++ if isinstance(tensor, np.ndarray): ++ return tensor.dtype == np.uint8 ++ elif isinstance(tensor, torch.Tensor): ++ return tensor.dtype == torch.uint8 ++ else: ++ raise ValueError(f"tensor of type: {type(tensor)} not supported.") ++ + @_lmcache_nvtx_annotate + def _coalesce( + self, +@@ -595,6 +643,8 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + shape = torch.Size(shape) + + assert dtype is not None, "dtype must be specified" ++ dtype = get_dtype_compat(dtype) ++ + # Calculate the size of the tensor + raw_size = TensorMemoryAllocator._Compute_raw_size(shape, dtype) + if raw_size % self.align_bytes != 0: +@@ -640,6 +690,8 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + shape, dtype, block.start, aligned_size, 1, False, fmt + ), + parent_allocator=parent_allocator, ++ base_ptr=self._base_ptr, ++ framework=self.framework + ) + + @_lmcache_nvtx_annotate +@@ -658,6 +710,7 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + shape = torch.Size(shape) + + assert dtype is not None, "dtype must be specified" ++ dtype = get_dtype_compat(dtype) + + # Calculate the size of the tensor + unit_raw_size = TensorMemoryAllocator._Compute_raw_size(shape, dtype) +@@ -714,6 +767,7 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + shape, dtype, temp_start, unit_aligned_size, 1, False, fmt + ), + parent_allocator=parent_allocator, ++ framework=self.framework + ) + ) + temp_start += unit_aligned_size +@@ -820,7 +874,7 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + logger.info(f" - Total free size: {total_free_size / 1048576} MB") + + # Check if the numbers are consistent +- if total_free_size + self.total_allocated_size != self.buffer.numel(): ++ if total_free_size + self.total_allocated_size != get_numel(self.buffer): + logger.error("Memory allocator size is inconsistent") + logger.error("This implies a bug in the memory allocator") + clear = False +@@ -881,6 +935,9 @@ class BufferAllocator(MemoryAllocatorInterface): + def memcheck(self): + return True + ++ @property ++ def framework(self): ++ return "Torch" + + class HostMemoryAllocator(MemoryAllocatorInterface): + """Allocates memory in the pre-allocated Host memory.""" +@@ -929,6 +986,9 @@ class HostMemoryAllocator(MemoryAllocatorInterface): + with self.host_mem_lock: + return self.allocator.memcheck() + ++ @property ++ def 
framework(self): ++ return self.allocator.framework + + class PinMemoryAllocator(MemoryAllocatorInterface): + """Allocates memory in the pre-allocated pinned memory.""" +@@ -978,6 +1038,9 @@ class PinMemoryAllocator(MemoryAllocatorInterface): + with self.host_mem_lock: + return self.allocator.memcheck() + ++ @property ++ def framework(self): ++ return self.allocator.framework + + class MixedMemoryAllocator(MemoryAllocatorInterface): + """ +@@ -985,11 +1048,13 @@ class MixedMemoryAllocator(MemoryAllocatorInterface): + (2) byte_array buffer memory. + """ + +- def __init__(self, size: int): ++ def __init__(self, size: int, framework: str = "Torch"): + """ + :param int size: The size of the pinned memory in bytes. + """ +- buffer = torch.empty(size, dtype=torch.uint8, pin_memory=True) ++ ++ buffer = create_pin_memory(framework, size, dtype="uint8") ++ self._framework = framework + + self.pin_allocator = TensorMemoryAllocator(buffer) + self.buffer_allocator = BufferAllocator("cpu") +@@ -1076,6 +1141,9 @@ class MixedMemoryAllocator(MemoryAllocatorInterface): + with self.host_mem_lock: + return self.pin_allocator.memcheck() + ++ @property ++ def framework(self): ++ return self._framework + + class GPUMemoryAllocator(MemoryAllocatorInterface): + """Allocates memory in the pre-allocated GPU memory.""" +@@ -1126,6 +1194,9 @@ class GPUMemoryAllocator(MemoryAllocatorInterface): + with self.device_mem_lock: + return self.allocator.memcheck() + ++ @property ++ def framework(self): ++ return self.allocator.framework + + class AdHocMemoryAllocator(MemoryAllocatorInterface): + """ +@@ -1153,6 +1224,7 @@ class AdHocMemoryAllocator(MemoryAllocatorInterface): + shape = torch.Size(shape) + + assert dtype is not None, "dtype must be specified" ++ dtype = get_dtype_compat(dtype) + + # Return a dummy object with no actual memory allocation + return TensorMemoryObj( +@@ -1199,6 +1271,9 @@ class AdHocMemoryAllocator(MemoryAllocatorInterface): + def memcheck(self): + return True + ++ @property ++ def framework(self): ++ return "Torch" + + class CuFileMemoryAllocator(GPUMemoryAllocator): + def __init__(self, size: int, device=None): +diff --git a/lmcache/v1/npu_connector.py b/lmcache/v1/npu_connector.py +new file mode 100644 +index 0000000..8316a0b +--- /dev/null ++++ b/lmcache/v1/npu_connector.py +@@ -0,0 +1,291 @@ ++from lmcache.v1.gpu_connector import GPUConnectorInterface ++import torch ++from typing import List, Optional, Tuple, Union ++from lmcache.v1.memory_management import MemoryFormat, MemoryObj ++from lmcache.logging import init_logger ++logger = init_logger(__file__) ++from .envs import is_disable_custom_ops ++ ++_USE_LMC_OPS = False ++try: ++ from lmcache import lmcache_C ++ _USE_LMC_OPS = True ++except ImportError as ie: ++ logger.debug("Not using lmcache_C ops") ++ ++_USE_MS = False ++try: ++ from mindspore import mutable ++ _USE_MS = True ++except ImportError as ie: ++ logger.error("Unable to import mindspore") ++ ++class VLLMPagedMemNPUConnector(GPUConnectorInterface): ++ """ ++ FIXME: Currently a lot of the memcpy and transfer ops ++ are in sequence, we probably want to optimize this. ++ ++ The NPU KV cache should be a nested tuple of K and V tensors. ++ More specifically, we have: ++ - GPUTensor = Tuple[KVLayer, ...] 
++ - KVLayer = Tuple[Tensor, Tensor] ++ - Tensor: [num_blocks, block_size, num_heads, head_size] ++ ++ It will produce / consume memory object with KV_BLOB format ++ """ ++ ++ ACL_MEMCPY_DEVICE_TO_HOST=2 ++ ++ def __init__(self, ++ hidden_dim_size: int, ++ num_layers: int, ++ use_npu: bool = True, ++ **kwargs): ++ """ ++ If use_gpu is true, it will create a gpu intermediate buffer. In this ++ case, it requires the following kwargs: ++ - chunk_size: The MAX size of the chunk to be copied to GPU. ++ - dtype: The data type of the intermediate buffer. ++ """ ++ global _USE_LMC_OPS ++ global _USE_MS ++ self._use_lmc_ops = _USE_LMC_OPS and not is_disable_custom_ops() ++ logger.debug(f"NPU Connector using lmc ops: {self._use_lmc_ops}") ++ self.hidden_dim_size = hidden_dim_size ++ self.num_layers = num_layers ++ self.is_tuple_layout = kwargs["kv_is_tuple"] ++ self.kv_size = kwargs["kv_size"] ++ ++ if not self.is_tuple_layout: ++ self.kv_cache_pointers = torch.empty(num_layers, ++ dtype=torch.int64, ++ device='cpu', ++ pin_memory=True) ++ self._kv_cache_devptrs = None ++ else: ++ if not _USE_MS: ++ raise ImportError("MindSpore is required for tuple layout") ++ # per layer, kvs ++ self._kv_cache_devptrs = torch.empty([num_layers, self.kv_size], ++ dtype=torch.int64, ++ device='Ascend', ++ pin_memory=True) ++ self.pointers_initialized = False ++ self.page_buffer_size = 0 ++ self.num_pages = 0 ++ self.page_size = 0 ++ ++ self.npu_buffer: Optional[torch.Tensor] = None ++ if use_npu: ++ assert "chunk_size" in kwargs, \ ++ "chunk_size should be provided to create a GPU buffer." ++ assert "dtype" in kwargs, \ ++ "dtype should be provided to create a GPU buffer." ++ assert "device" in kwargs, \ ++ "device should be provided to create a GPU buffer." ++ self.npu_buffer = torch.empty((self.kv_size, num_layers, ++ kwargs["chunk_size"], ++ self.hidden_dim_size), ++ dtype=kwargs["dtype"], ++ device=kwargs["device"]) ++ self.npu_buffer.zero_() ++ ++ def _initialize_pointers(self, kv_caches: List[torch.Tensor]): ++ if not self.is_tuple_layout: ++ for i in range(self.num_layers): ++ self.kv_cache_pointers[i] = kv_caches[i].data_ptr() ++ self._kv_cache_devptrs = self.kv_cache_pointers.npu() ++ else: ++ kv_cache_pointers_cpus = torch.Tensor([[kvc.data_ptr() for kvc in layer_kv ]for layer_kv in kv_caches]) ++ self._kv_cache_devptrs.copy_(kv_cache_pointers_cpus, non_blocking=False) ++ ++ torch.cuda.synchronize() ++ ++ self.pointers_initialized = True ++ kv_one_layer = kv_caches[0][0] if self.is_tuple_layout else kv_caches[0] ++ if not self.is_tuple_layout: ++ # kv_caches layer.shape: [2, num_pages, page_size, num_heads, head_size] ++ self.page_buffer_size = kv_one_layer.shape[1] * kv_one_layer.shape[2] ++ self.num_pages = kv_one_layer.shape[1] ++ self.page_size = kv_one_layer.shape[2] ++ else: ++ # kv_caches layer.shape: [num_pages, page_size, num_heads, head_size] ++ self.page_buffer_size = kv_one_layer.shape[0] * kv_one_layer.shape[1] ++ self.num_pages = kv_one_layer.shape[0] ++ self.page_size = kv_one_layer.shape[1] ++ ++ ++ def _pointers_are_good(self, kv_caches: Union[List[torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]]): ++ """ ++ Check if the initialized pointers are the same as the pointers in ++ the KV caches. ++ ++ Returns: ++ bool: True if the pointers are the same, False otherwise ( ++ including uninitialized). 
++ """ ++ if not self.pointers_initialized: ++ return False ++ ++ for i in range(self.num_layers): ++ if not self.is_tuple_layout: ++ if self._kv_cache_devptrs[i] != kv_caches[i].data_ptr(): ++ return False ++ else: ++ for j in range(self.kv_size): ++ if self._kv_cache_devptrs[i][j] != kv_caches[i][j].data_ptr(): ++ return False ++ ++ return True ++ ++ def to_gpu(self, memory_obj: MemoryObj, start: int, end: int, **kwargs): ++ """Expect a kwarg 'kvcaches' which is a nested tuple of K and V tensors. ++ The kvcaches should correspond to the "WHOLE token sequence". ++ ++ Note: ++ 1. This function expects the 'slot_mapping' is a "full slot mapping" ++ where it's length is the same as the whole token sequence. ++ 2. In the case that there is prefix caching, slot_mapping will starts ++ with -1s until the end of the matched prefix. The start and end ++ should NEVER overlap with the prefix caching (which means the ++ underlying CUDA kernel will never see -1 in slot_mapping) ++ ++ ++ :raises ValueError: If 'kvcaches' is not provided in kwargs. ++ :raises AssertionError: If the memory object does not have a tensor. ++ :raises ValueError: If 'slot_mapping' is not provided in kwargs. ++ """ ++ assert memory_obj.tensor is not None ++ ++ if memory_obj.metadata.fmt != MemoryFormat.KV_BLOB: ++ raise ValueError( ++ "The memory object should be in KV_BLOB format in" ++ " order to be processed by VLLMPagedMemGPUConnector") ++ ++ if "kvcaches" not in kwargs: ++ raise ValueError("'kvcaches' should be provided in kwargs.") ++ ++ if "slot_mapping" not in kwargs: ++ raise ValueError("'slot_mapping' should be provided in kwargs.") ++ ++ kvcaches: Union[List[torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]] = kwargs["kvcaches"] ++ slot_mapping: torch.Tensor = kwargs["slot_mapping"] ++ ++ if not self.pointers_initialized: ++ self._initialize_pointers(kvcaches) ++ ++ if self._use_lmc_ops and memory_obj.base_ptr is not None: ++ slot_mapping_range = slot_mapping[start:end] ++ lmcache_C.paged_layers_kv_transfer(memory_obj.tensor, ++ self._kv_cache_devptrs, ++ slot_mapping_range, ++ memory_obj.base_ptr, ++ self.num_pages, ++ self.page_size, ++ self.kv_size, ++ False, ++ True) ++ else: ++ if self.is_tuple_layout: ++ raise NotImplementedError() ++ ++ self.npu_buffer.zero_() ++ ++ # FIXME: This seems inefficient ++ # Memobj -> npu_buffer -> kvcaches ++ # atm the main overhead is probably to do with the cpu overhead here ++ assert self.npu_buffer.device == kvcaches[0].device ++ tmp_npu_buffer = self.npu_buffer[:, :, :end-start, :] ++ tmp_npu_buffer.copy_(memory_obj.tensor, non_blocking=True) ++ ++ slots_tokens = slot_mapping[start:end] ++ valid_tokens = slots_tokens >= 0 ++ valid_idx = torch.nonzero(valid_tokens, as_tuple=True)[0] ++ physical_slots = slots_tokens[valid_tokens] ++ if physical_slots.numel() <= 0: ++ logger.error("Physical slots are zeros, this should not happened.") ++ return ++ for layer_id in range(len(kvcaches)): ++ # [blocks*block_size, num_heads*head_size] ++ for i in range(self.kv_size): ++ _layer = kvcaches[layer_id][i].view(self.page_buffer_size, -1) ++ _layer[physical_slots] = tmp_npu_buffer[i, layer_id, valid_idx] ++ ++ def from_gpu(self, memory_obj: MemoryObj, start: int, end: int, **kwargs): ++ """Expect a kwarg 'kvcaches' which is a nested tuple of K and V tensors. ++ The kvcaches should correspond to the "WHOLE token sequence". ++ ++ Will set the memory_obj.metadata.fmt to MemoryFormat.KV_BLOB. ++ ++ Note: ++ 1. 
This function expects the 'slot_mapping' is a "full slot mapping" ++ where it's length is the same as the whole token sequence. ++ 2. In the case that there is prefix caching, slot_mapping will starts ++ with -1s until the end of the matched prefix. The start and end ++ should NEVER overlap with the prefix caching ++ ++ :raises ValueError: If 'kvcaches' is not provided in kwargs, ++ :raises AssertionError: If the memory object does not have a tensor. ++ :raises ValueError: If 'slot_mapping' is not provided in kwargs. ++ """ ++ assert memory_obj.tensor is not None ++ ++ if "kvcaches" not in kwargs: ++ raise ValueError("'kvcaches' should be provided in kwargs.") ++ ++ if "slot_mapping" not in kwargs: ++ raise ValueError("'slot_mapping' should be provided in kwargs.") ++ ++ kvcaches: List[torch.Tensor] = kwargs["kvcaches"] ++ slot_mapping: torch.Tensor = kwargs["slot_mapping"] ++ ++ if not self.pointers_initialized: ++ self._initialize_pointers(kvcaches) ++ ++ if self._use_lmc_ops and memory_obj.base_ptr is not None: ++ slot_mapping_range = slot_mapping[start:end] ++ lmcache_C.paged_layers_kv_transfer(memory_obj.tensor, ++ self._kv_cache_devptrs, ++ slot_mapping_range, ++ memory_obj.base_ptr, ++ self.num_pages, ++ self.page_size, ++ self.kv_size, ++ True, ++ True) ++ else: ++ if self.is_tuple_layout: ++ raise NotImplementedError() ++ ++ self.npu_buffer.zero_() ++ temp_buf = self.npu_buffer[:, :, :end-start, :] ++ ++ # FIXME: Work on ascendc kernel copy version ++ # currently this is quite slow ++ # atm the main overhead is probably to do with the cpu overhead here ++ # copy from kvs into temp_buf ++ slots_tokens = slot_mapping[start:end] ++ valid_tokens = slots_tokens >= 0 ++ physical_slots = slots_tokens[valid_tokens] ++ if physical_slots.numel() <= 0: ++ logger.error("Physical slots are zeros, this should not happened.") ++ return ++ valid_idx = torch.nonzero(valid_tokens, as_tuple=True)[0] ++ for layer_id in range(len(kvcaches)): ++ for i in range(self.kv_size): ++ _layer = kvcaches[layer_id][i].view(self.page_buffer_size, -1) ++ temp_buf[i, layer_id, valid_idx] = _layer[physical_slots] ++ ++ memory_obj.tensor.zero_() ++ # sync memcpy ++ # size_in_bytes = temp_buf.numel() * temp_buf.element_size() ++ # ret = memcpy(memory_obj.tensor.data_ptr(), size_in_bytes, ++ # temp_buf.data_ptr(), size_in_bytes, self.ACL_MEMCPY_DEVICE_TO_HOST) ++ memory_obj.tensor.copy_(temp_buf.contiguous(), non_blocking=True) ++ memory_obj.metadata.fmt = MemoryFormat.KV_BLOB ++ # assert ret == 0, f"Memcpy failed, Got: {ret}." 
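++ # After the copy above, memory_obj.tensor holds this chunk laid out as
++ # [kv_size, num_layers, num_tokens, hidden_dim_size], matching get_shape().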
++ # reset the npu buffer ++ ++ def get_shape(self, num_tokens: int) -> torch.Size: ++ return torch.Size( ++ [self.kv_size, self.num_layers, num_tokens, self.hidden_dim_size]) +diff --git a/lmcache/v1/pin_buffer.py b/lmcache/v1/pin_buffer.py +new file mode 100644 +index 0000000..197815d +--- /dev/null ++++ b/lmcache/v1/pin_buffer.py +@@ -0,0 +1,138 @@ ++import os ++import torch ++import numpy as np ++from lmcache.logging import init_logger ++import atexit ++import ctypes ++from .envs import is_disable_custom_ops ++ ++logger = init_logger(__file__) ++ ++ ++_HAS_ACL = False ++try: ++ import acl ++ _HAS_ACL = True ++except ImportError as ie: ++ logger.warning("Unable to import acl") ++ ++_HAS_MS = False ++try: ++ import mindspore as ms ++ _HAS_MS = True ++except ImportError as ie: ++ logger.warning("Unable to import mindspore") ++ ++_USE_LMCACHE_C = False ++try: ++ from lmcache import lmcache_C ++ _USE_LMCACHE_C = True ++except ImportError as ie: ++ logger.warning("unable to import LMCache_C") ++ ++STR_TO_TORCH_TYPE = { ++ "uint8" : torch.uint8 ++} ++ ++STR_TO_NP_TYPE = { ++ "uint8": np.uint8 ++} ++ ++STR_TO_CTYPES = { ++ "uint8": ctypes.c_uint8 ++} ++ ++__all__ = [ ++ "create_pin_memory" ++] ++ ++# TODO (Gingfung): Get rid of this global ? ++ACL_PIN_MEMORY_MANAGER = None ++ ++class aclPinnedMemoryManager: ++ def __init__(self): ++ assert _HAS_ACL, "Cannot create aclPinnedMemoryManager without acl" ++ ++ self._init() ++ ++ # TODO (Gingfung): Double check whether this will free the memory ++ atexit.register(self.clean_up) ++ ++ def _init(self): ++ """Since we only take care of alloc memory, ++ regardless whether we are initialized within a framework or not, ++ best we create our current context here. ++ """ ++ # NOTE (Gingfung): Just in case user haven't called acl ini ++ ret = acl.init() ++ dev = torch.cuda.current_device() ++ if dev == "Ascend": ++ # we are in mindspore land, ++ dev = ms.get_current_device().device_id ++ ++ if ret == 0: ++ self._to_finalized = True ++ ++ self._ctx, ret = acl.rt.create_context(dev) ++ assert ret == 0, "Cannot create context" ++ elif ret == 100002: ++ # already init by framework ++ self._to_finalized = False ++ self._ctx = None ++ else: ++ raise RuntimeError(f"Acl Init Failed: {ret}") ++ ++ ++ self._managed_ptrs = {} ++ ++ ++ def alloc(self, size: int): ++ _p, ret = acl.rt.malloc_host(size) ++ if ret != 0: ++ logger.error("Unable to malloc pinned host memory: %d", ret) ++ return None ++ ++ self._managed_ptrs[_p] = size ++ return _p ++ ++ def clean_up(self): ++ _ptrs = list(self._managed_ptrs.keys()) ++ for k in _ptrs: ++ ret = acl.rt.free_host(k) ++ if ret != 0: ++ logger.error("Unable to free host ptr: %d", ret) ++ ++ self._managed_ptrs.pop(k, None) ++ ++ if self._to_finalized: ++ ret = acl.rt.destroy_context(self._ctx) ++ assert ret == 0 ++ ret = acl.finalize() ++ assert ret == 0 ++ self._to_finalized = False ++ ++def _get_pinned_mem_manager(): ++ global ACL_PIN_MEMORY_MANAGER ++ if ACL_PIN_MEMORY_MANAGER is None: ++ ACL_PIN_MEMORY_MANAGER = aclPinnedMemoryManager() ++ return ACL_PIN_MEMORY_MANAGER ++ ++def get_np_pin_buffer(size: int, dtype: str): ++ pin_mem_manager = _get_pinned_mem_manager() ++ h_ptr = pin_mem_manager.alloc(size) ++ c_t = STR_TO_CTYPES[dtype] ++ el_size = ctypes.sizeof(c_t) ++ np_ptr = ctypes.cast(h_ptr, ctypes.POINTER(c_t * el_size)) ++ np_buf = np.ctypeslib.as_array(np_ptr, shape=(size,)).reshape(-1) ++ assert h_ptr == np_buf.ctypes.data_as(ctypes.c_void_p).value ++ return np_buf ++ ++def create_pin_memory(framework: str, size: int, 
dtype: str = "uint8"): ++ # if framework == "Mindspore": ++ # we need to use acl and numpy to represent our buffer. ++ # return get_np_pin_buffer(size, dtype) ++ global _USE_LMCACHE_C ++ if _USE_LMCACHE_C and not is_disable_custom_ops(): ++ logger.debug(f"create pinned tensor of size: {size} with lmcache_C") ++ return lmcache_C.create_pinned_tensor(size) ++ return torch.empty(size, dtype=torch.uint8, pin_memory=True) +diff --git a/lmcache/v1/protocol.py b/lmcache/v1/protocol.py +index 6e7583d..0f161d0 100644 +--- a/lmcache/v1/protocol.py ++++ b/lmcache/v1/protocol.py +@@ -48,8 +48,6 @@ DTYPE_TO_INT = { + torch.float64: 5, + torch.double: 5, + torch.uint8: 6, +- torch.float8_e4m3fn: 7, +- torch.float8_e5m2: 8, + } + + INT_TO_DTYPE = { +@@ -60,11 +58,16 @@ INT_TO_DTYPE = { + 4: torch.float, + 5: torch.float64, + 6: torch.uint8, +- 7: torch.float8_e4m3fn, +- 8: torch.float8_e5m2, + } + + ++try: ++ DTYPE_TO_INT.update({torch.float8_e4m3fn: 7, torch.float8_e5m2: 8}) ++ INT_TO_DTYPE.update({7: torch.float8_e4m3fn, 8: torch.float8_e5m2}) ++except AttributeError as ae: ++ # MSAdapter does not have these versions yet ++ pass ++ + @dataclass + class RemoteMetadata: + length: int +diff --git a/lmcache/v1/storage_backend/abstract_backend.py b/lmcache/v1/storage_backend/abstract_backend.py +index ce9be36..5c00add 100644 +--- a/lmcache/v1/storage_backend/abstract_backend.py ++++ b/lmcache/v1/storage_backend/abstract_backend.py +@@ -101,6 +101,7 @@ class StorageBackendInterface(metaclass=abc.ABCMeta): + def get_blocking( + self, + key: CacheEngineKey, ++ **kwargs + ) -> Optional[MemoryObj]: + """ + A blcocking function to get the kv cache from the storage backend. +diff --git a/lmcache/v1/storage_backend/connector/falconfs_connector.py b/lmcache/v1/storage_backend/connector/falconfs_connector.py +new file mode 100644 +index 0000000..d75f523 +--- /dev/null ++++ b/lmcache/v1/storage_backend/connector/falconfs_connector.py +@@ -0,0 +1,98 @@ ++# Copyright 2024-2025 LMCache Authors. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
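++
++# NOTE: FalconFS storage layout: each KV chunk is kept as a flat file named
++# "/data_py/<key with '/' replaced by '-'>.pt" (see fmtFile below) and is
++# read/written through the asynchronous pyfalconfs API.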
++ ++import uuid ++import time ++import asyncio ++import os ++import weakref ++from typing import List, Optional, Union, Tuple, no_type_check ++ ++import pyfalconfs ++import torch ++import operator ++from functools import reduce ++ ++from lmcache.v1.memory_management import (CopyLessMemoryObj, ++ MemoryAllocatorInterface, ++ MemoryObj, ++ MemoryFormat, ++ MemoryObjMetadata, ++ TensorMemoryAllocator) ++from lmcache.v1.protocol import RemoteMetadata ++from lmcache.v1.storage_backend.connector.base_connector import \ ++ RemoteConnector ++from lmcache.logging import init_logger ++from lmcache.utils import CacheEngineKey ++from concurrent.futures import ThreadPoolExecutor ++ ++logger = init_logger(__name__) ++ ++MAX_BUFFER_SIZE = 40 << 20 # 40MB ++METADATA_BYTES_LEN = 28 ++MAX_BUFFER_CNT = 500 ++ ++def fmtFile( ++ key: str, ++ ) -> str: ++ return "/data_py/" + key.replace("/", "-") + ".pt" ++ ++class FalconFsConnector(RemoteConnector): ++ """ ++ The remote url should start with "falconfs://" ++ """ ++ def __init__(self, workspace: str, config_path: str, ++ memory_allocator: MemoryAllocatorInterface, thread_pool_size = 256): ++ random_dir = workspace + str(uuid.uuid4()) ++ os.makedirs(random_dir) ++ #logger.info(f"sleeping 30: {os.getpid()}") ++ #time.sleep(30) ++ ++ self.connector = pyfalconfs.AsyncConnector(random_dir, config_path) ++ self.memory_allocator = memory_allocator ++ ++ async def exists(self, key: CacheEngineKey) -> bool: ++ ret = await self.connector.AsyncExists(fmtFile(key.to_string())) ++ return ret == 0 ++ ++ async def get(self, key: CacheEngineKey, shape, dtype) -> Optional[MemoryObj]: ++ key_str = key.to_string() ++ ++ memory_obj = self.memory_allocator.allocate(shape=shape, dtype=dtype) ++ ret = await self.connector.AsyncGet(fmtFile(key_str), memory_obj.byte_array, memory_obj.get_size(), 0) ++ if (ret < 0): ++ logger.warning(f"get: AsyncGet failed, ret = {ret}") ++ self.memory_allocator.ref_count_down(memory_obj) ++ return None ++ ++ return memory_obj ++ ++ async def put(self, key: CacheEngineKey, memory_obj: MemoryObj): ++ kv_bytes = memory_obj.byte_array ++ key_str = key.to_string() ++ ++ ret = await self.connector.AsyncPut(fmtFile(key_str), kv_bytes, len(kv_bytes), 0) ++ if ret < 0: ++ logger.warning(f"put: AsyncPut failed, ret = {ret}") ++ ++ self.memory_allocator.ref_count_down(memory_obj) ++ ++ # TODO ++ @no_type_check ++ async def list(self) -> List[str]: ++ pass ++ ++ async def close(self): ++ pass ++ +diff --git a/lmcache/v1/storage_backend/local_cpu_backend.py b/lmcache/v1/storage_backend/local_cpu_backend.py +index 9538014..b031dc2 100644 +--- a/lmcache/v1/storage_backend/local_cpu_backend.py ++++ b/lmcache/v1/storage_backend/local_cpu_backend.py +@@ -146,6 +146,7 @@ class LocalCPUBackend(StorageBackendInterface): + def get_blocking( + self, + key: CacheEngineKey, ++ **kwargs + ) -> Optional[MemoryObj]: + with self.cpu_lock: + if key not in self.hot_cache: +diff --git a/lmcache/v1/storage_backend/local_disk_backend.py b/lmcache/v1/storage_backend/local_disk_backend.py +index e7880f5..3f81838 100644 +--- a/lmcache/v1/storage_backend/local_disk_backend.py ++++ b/lmcache/v1/storage_backend/local_disk_backend.py +@@ -23,6 +23,8 @@ import threading + # Third Party + import aiofiles + import torch ++import json ++from dataclasses import asdict + + # First Party + from lmcache.logging import init_logger +@@ -65,6 +67,12 @@ class LocalDiskBackend(StorageBackendInterface): + os.makedirs(self.path) + logger.info(f"Created local disk cache directory: {self.path}") + ++ # 
NOTE: currently the pinned memory tensor ++ # seems to causes bad address error with FUSE driver ++ self.is_fuse_driver = config.use_fuse_driver ++ ++ self._rebuild_idx() ++ + self.lookup_server = lookup_server + + # Initialize the evictor +@@ -80,7 +88,28 @@ class LocalDiskBackend(StorageBackendInterface): + + def __str__(self): + return self.__class__.__name__ +- ++ ++ def _rebuild_idx(self): ++ """Rebuild Metadata across instances""" ++ for kv_file in os.listdir(self.path): ++ if "@meta" in kv_file: ++ with open(os.path.join(self.path, kv_file), "r") as f: ++ json_str = f.readlines() ++ if len(json_str) == 0: ++ continue ++ json_dct = json.loads(json_str[0]) ++ meta = DiskCacheMetadata.from_dict(json_dct) ++ key = self._path_to_key(meta.path) ++ with self.disk_lock: ++ self.dict[key] = meta ++ ++ def _path_to_key(self, p: str) -> CacheEngineKey: ++ """ ++ -3 = ".pt" ++ """ ++ _key = p[len(self.path):-3].replace("-", "/") ++ return CacheEngineKey.from_string(_key) ++ + def _key_to_path( + self, + key: CacheEngineKey, +@@ -140,12 +169,13 @@ class LocalDiskBackend(StorageBackendInterface): + KVEvictMsg(self.instance_id, key.worker_id, key.chunk_hash, "disk") + ) + +- def insert_key(self, key: CacheEngineKey, memory_obj: MemoryObj) -> None: ++ def insert_key(self, key: CacheEngineKey, memory_obj: MemoryObj) -> DiskCacheMetadata: + path = self._key_to_path(key) + size = memory_obj.get_size() + shape = memory_obj.metadata.shape + dtype = memory_obj.metadata.dtype + fmt = memory_obj.metadata.fmt ++ meta = DiskCacheMetadata(path, size, shape, dtype, fmt, False) + + has_stored = False + with self.disk_lock: +@@ -154,13 +184,14 @@ class LocalDiskBackend(StorageBackendInterface): + self.dict.pop(key) + has_stored = True + +- self.dict[key] = DiskCacheMetadata(path, size, shape, dtype, fmt, False) ++ self.dict[key] = meta + + # push kv admit msg + if self.lmcache_worker is not None and not has_stored: + self.lmcache_worker.put_msg( + KVAdmitMsg(self.instance_id, key.worker_id, key.chunk_hash, "disk") + ) ++ return meta + + def submit_put_task( + self, +@@ -229,6 +260,7 @@ class LocalDiskBackend(StorageBackendInterface): + def get_blocking( + self, + key: CacheEngineKey, ++ **kwargs + ) -> Optional[MemoryObj]: + """ + Blocking get function. +@@ -262,7 +294,6 @@ class LocalDiskBackend(StorageBackendInterface): + # TODO(Jiayi): Need to align prefetch and get_non_blocking + return self.submit_prefetch_task(key) + +- @_lmcache_nvtx_annotate + @torch.inference_mode() + async def async_save_bytes_to_disk( + self, +@@ -281,10 +312,25 @@ class LocalDiskBackend(StorageBackendInterface): + self.usage += size + self.stats_monitor.update_local_storage_usage(self.usage) + +- async with aiofiles.open(path, "wb") as f: +- await f.write(byte_array) +- +- self.insert_key(key, memory_obj) ++ # step1. save kv bytes ++ try: ++ async with aiofiles.open(path, 'wb') as f: ++ # works when made a copy, bytes(byte_array) ++ if self.is_fuse_driver: ++ await f.write(bytes(byte_array)) ++ else: ++ await f.write(byte_array) ++ except Exception as e: ++ logger.error("unable to save KVBytes: ", e) ++ meta = self.insert_key(key, memory_obj) ++ ++ # step2. 
save meta ++ meta_path = path.replace(".pt", "@meta.json") ++ try: ++ async with aiofiles.open(meta_path, 'w') as f: ++ await f.write(json.dumps(meta.to_dict())) ++ except Exception as e: ++ logger.error("unable to save DiskCacheMetadata: ", e) + + memory_obj.ref_count_down() + +@@ -304,14 +350,26 @@ class LocalDiskBackend(StorageBackendInterface): + if memory_obj is None: + logger.debug("Memory allocation failed during async disk load.") + return None +- buffer = memory_obj.byte_array ++ if not self.is_fuse_driver: ++ buffer = memory_obj.byte_array ++ else: ++ # create a temporary buffer ++ buffer = bytearray(memory_obj.get_size()) ++ + async with aiofiles.open(path, "rb") as f: + await f.readinto(buffer) ++ ++ if self.is_fuse_driver: ++ temp_tensor = torch.frombuffer(buffer, dtype=memory_obj.get_dtype()).view(shape) ++ memory_obj.tensor.copy_(temp_tensor) ++ + return memory_obj + + # TODO(Jiayi): use memory allocator to redeuce cpu buffer allocation + # TODO(Jiayi): the pinned cpu memory_obj should directly be passed into + # gpu connector; this gpu buffer could be avoided ++ # TODO(Gingfung): the frombuffer tensor copied is needed for FUSE driver ++ # however seems to cause overhead here, will need to optimize further + def load_bytes_from_disk( + self, path: str, dtype: torch.dtype, shape: torch.Size, fmt: MemoryFormat + ) -> Optional[MemoryObj]: +@@ -322,9 +380,26 @@ class LocalDiskBackend(StorageBackendInterface): + if memory_obj is None: + logger.debug("Memory allocation failed during async disk load.") + return None +- buffer = memory_obj.byte_array +- with open(path, "rb") as f: +- f.readinto(buffer) ++ ++ if not self.is_fuse_driver: ++ buffer = memory_obj.byte_array ++ else: ++ # create a temporary buffer ++ buffer = bytearray(memory_obj.get_size()) ++ ++ try: ++ with open(path, "rb") as f: ++ f.readinto(buffer) ++ ++ except Exception as e: ++ logger.error("unable to load KVBytes due to: ", e) ++ self.memory_allocator.ref_count_down(memory_obj) ++ return None ++ ++ if self.is_fuse_driver: ++ temp_tensor = torch.frombuffer(buffer, dtype=memory_obj.get_dtype()).view(shape) ++ memory_obj.tensor.copy_(temp_tensor, non_blocking=True) ++ + return memory_obj + + @_lmcache_nvtx_annotate +diff --git a/lmcache/v1/storage_backend/remote_backend.py b/lmcache/v1/storage_backend/remote_backend.py +index 4c15f9c..a90a417 100644 +--- a/lmcache/v1/storage_backend/remote_backend.py ++++ b/lmcache/v1/storage_backend/remote_backend.py +@@ -168,6 +168,10 @@ class RemoteBackend(StorageBackendInterface): + """ + Callback function for put tasks. + """ ++ exception = future.exception() ++ if exception is not None: ++ logger.error("During put remote, encounter exception: %s", exception) ++ raise exception + self.lock.acquire() + self.put_tasks.remove(key) + self.lock.release() +@@ -221,6 +225,7 @@ class RemoteBackend(StorageBackendInterface): + def get_blocking( + self, + key: CacheEngineKey, ++ **kwargs + ) -> Optional[MemoryObj]: + """ + Blocking get function. 
+@@ -235,7 +240,7 @@ class RemoteBackend(StorageBackendInterface): + key.fmt, key.model_name, key.world_size, 0, key.chunk_hash + ) + t1 = time.perf_counter() +- future = asyncio.run_coroutine_threadsafe(self.connection.get(key), self.loop) ++ future = asyncio.run_coroutine_threadsafe(self.connection.get(key, **kwargs), self.loop) + + try: + memory_obj = future.result(self.blocking_timeout_secs) +diff --git a/lmcache/v1/storage_backend/storage_manager.py b/lmcache/v1/storage_backend/storage_manager.py +index da2b261..7c79dc9 100644 +--- a/lmcache/v1/storage_backend/storage_manager.py ++++ b/lmcache/v1/storage_backend/storage_manager.py +@@ -188,7 +188,7 @@ class StorageManager: + for memory_obj in memory_objs: + memory_obj.ref_count_down() + +- def get(self, key: CacheEngineKey) -> Optional[MemoryObj]: ++ def get(self, key: CacheEngineKey, **kwargs) -> Optional[MemoryObj]: + """ + Blocking function to get the memory object from the storages. + """ +@@ -212,7 +212,7 @@ class StorageManager: + # Search all backends for blocking get + for backend_name, backend in self.storage_backends.items(): + # NOTE(Jiayi): bypass the allocator for now +- memory_obj = backend.get_blocking(key) ++ memory_obj = backend.get_blocking(key, **kwargs) + if memory_obj is not None: + if backend_name not in ["LocalCPUBackend", "NixlBackend"]: + local_cpu_backend = self.storage_backends["LocalCPUBackend"] +diff --git a/lmcache/v1/token_database.py b/lmcache/v1/token_database.py +index 5d026a2..9a8536f 100644 +--- a/lmcache/v1/token_database.py ++++ b/lmcache/v1/token_database.py +@@ -26,7 +26,11 @@ import torch + from lmcache.config import LMCacheEngineMetadata + from lmcache.utils import CacheEngineKey, _lmcache_nvtx_annotate + from lmcache.v1.config import LMCacheEngineConfig ++import numpy as np ++import xxhash + ++from lmcache.logging import init_logger ++logger = init_logger(__name__) + + class TokenDatabase(metaclass=abc.ABCMeta): + """TokenDatabase is used to convert input tokens into list of +@@ -42,8 +46,8 @@ class TokenDatabase(metaclass=abc.ABCMeta): + @abc.abstractmethod + def process_tokens( + self, +- tokens: Union[torch.Tensor, List[int]], +- mask: Optional[torch.Tensor] = None, ++ tokens: Union[np.ndarray, List[int]], ++ mask: Optional[np.ndarray] = None, + make_key: bool = True, + ) -> Iterable[Tuple[int, int, Union[CacheEngineKey, str]]]: + """Process the tokens and return the corresponding cache engine keys. +@@ -90,20 +94,25 @@ class ChunkedTokenDatabase(TokenDatabase): + + def _hash( + self, +- tokens: Union[torch.Tensor, List[int]], ++ tokens: Union[np.ndarray, List[int]], + prefix_hash: str, + ) -> str: + # TODO: change it to a more efficient hash function +- if isinstance(tokens, torch.Tensor): +- tokens_bytes = tokens.cpu().to(torch.uint32).numpy().tobytes() ++ if isinstance(tokens, np.ndarray): ++ tokens_bytes = tokens.tobytes() + elif isinstance(tokens, list): + tokens_bytes = array.array("I", tokens).tobytes() +- return hashlib.sha256(prefix_hash.encode("ascii") + tokens_bytes).hexdigest() ++ else: ++ raise TypeError(f"Unsupported type for tokens: {type(tokens)}") + ++ # # NOTE (GingFung): shouldn't really have collision... 
2^64 tokens after ++ # return xxhash.xxh3_128_hexdigest(prefix_hash.encode("ascii") + tokens_bytes) ++ hashes_ = hashlib.sha256(prefix_hash.encode("ascii") + tokens_bytes).hexdigest() ++ return hashes_ + def _chunk_tokens( + self, +- tokens: Union[torch.Tensor, List[int]], +- ) -> Iterable[Union[torch.Tensor, List[int]]]: ++ tokens: Union[np.ndarray, List[int]], ++ ) -> Iterable[Union[np.ndarray, List[int]]]: + """ + Chunk the tokens into chunks of size self.chunk_size. + +@@ -114,16 +123,16 @@ class ChunkedTokenDatabase(TokenDatabase): + shape [chunk_size] + """ + end = ( +- len(tokens) ++ tokens.size + if self.save_unfull_chunk +- else (len(tokens) - len(tokens) % self.chunk_size) ++ else (tokens.size - tokens.size % self.chunk_size) + ) + for i in range(0, end, self.chunk_size): + yield tokens[i : i + self.chunk_size] + + def _prefix_hash( + self, +- token_chunks: Iterable[Union[torch.Tensor, List[int]]], ++ token_chunks: Iterable[Union[np.ndarray, List[int]]], + ) -> Iterable[str]: + prefix_hash = self._get_init_hash() + for token_chunk in token_chunks: +@@ -133,8 +142,8 @@ class ChunkedTokenDatabase(TokenDatabase): + @_lmcache_nvtx_annotate + def process_tokens( + self, +- tokens: Union[torch.Tensor, List[int]], +- mask: Optional[torch.Tensor] = None, ++ tokens: Union[np.ndarray, List[int]], ++ mask: Optional[np.ndarray] = None, + make_key: bool = True, + ) -> Iterable[Tuple[int, int, Union[CacheEngineKey, str]]]: + """Process the tokens and return the corresponding cache engine keys. +@@ -158,7 +167,7 @@ class ChunkedTokenDatabase(TokenDatabase): + multiple of the chunk size. + """ + if mask is not None: +- num_falses = mask.numel() - mask.long().sum().item() ++ num_falses = mask.size - int(np.sum(mask)) + else: + num_falses = 0 + +@@ -166,7 +175,7 @@ class ChunkedTokenDatabase(TokenDatabase): + raise ValueError( + "The number of Falses in the mask is not a multiple of the chunk size." + ) +- total_len = len(tokens) ++ total_len = tokens.size + + token_chunks = self._chunk_tokens(tokens) + prefix_hashes = self._prefix_hash(token_chunks) +diff --git a/pyproject.toml b/pyproject.toml +index 3c2ed13..9fa2626 100644 +--- a/pyproject.toml ++++ b/pyproject.toml +@@ -5,7 +5,7 @@ requires = [ + "packaging>=24.2", + "setuptools>=77.0.3,<81.0.0", + "setuptools_scm>=8", +- "torch>=2.5.1", ++ # "torch>=2.5.1", + "wheel", + ] + build-backend = "setuptools.build_meta" +@@ -13,8 +13,7 @@ build-backend = "setuptools.build_meta" + [project] + name = "lmcache" + authors = [{name = "LMCache Team", email = "lmcacheteam@gmail.com"}] +-license = "Apache-2.0" +-license-files = ["LICENSE"] ++license = { text = "Apache-2.0" } + readme = "README.md" + description = "A LLM serving engine extension to reduce TTFT and increase throughput, especially under long-context scenarios." 
+ classifiers = [ +diff --git a/requirements/ascend.txt b/requirements/ascend.txt +new file mode 100644 +index 0000000..645bc46 +--- /dev/null ++++ b/requirements/ascend.txt +@@ -0,0 +1,2 @@ ++-r common.txt ++pybind11 +\ No newline at end of file +diff --git a/requirements/build.txt b/requirements/build.txt +index ccb5518..eea6e85 100644 +--- a/requirements/build.txt ++++ b/requirements/build.txt +@@ -4,5 +4,5 @@ ninja + packaging>=24.2 + setuptools>=77.0.3,<81.0.0 + setuptools_scm>=8 +-torch>=2.5.1 # Corresponds to the version used by vLLM main branch ++# torch>=2.5.1 # Corresponds to the version used by vLLM main branch + wheel +diff --git a/requirements/common.txt b/requirements/common.txt +index ba3c3a7..f2cd517 100644 +--- a/requirements/common.txt ++++ b/requirements/common.txt +@@ -4,7 +4,6 @@ aiohttp + infinistore; platform_machine == 'x86_64' + msgspec + numpy +-nvtx + prometheus_client >= 0.18.0 + psutil + pyyaml +@@ -14,5 +13,6 @@ safetensors + setuptools>=77.0.3,<81.0.0 + setuptools_scm>=8 + sortedcontainers +-torch>=2.5.1 # Should correspond to the version used by vLLM main branch ++# torch>=2.5.1 # Should correspond to the version used by vLLM main branch + transformers >= 4.51.1 ++xxhash +diff --git a/setup.py b/setup.py +index 12aad15..3b56457 100644 +--- a/setup.py ++++ b/setup.py +@@ -4,7 +4,17 @@ import os + import sys + + # Third Party +-from setuptools import find_packages, setup ++from setuptools import find_packages, setup, Extension ++from setuptools.command.build_ext import build_ext ++from setuptools.command.develop import develop ++from setuptools.command.install import install ++ ++import logging ++import sysconfig ++import subprocess ++import platform ++import shutil ++ + + ROOT_DIR = Path(__file__).parent + HIPIFY_DIR = os.path.join(ROOT_DIR, "csrc/") +@@ -14,10 +24,159 @@ HIPIFY_OUT_DIR = os.path.join(ROOT_DIR, "csrc_hip/") + # will run python setup.py sdist --dist-dir dist + BUILDING_SDIST = "sdist" in sys.argv or os.environ.get("NO_CUDA_EXT", "0") == "1" + +-# New environment variable to choose between CUDA and HIP +-BUILD_WITH_HIP = os.environ.get("BUILD_WITH_HIP", "0") == "1" ++# Environment variable to choose between CUDA, HIP, Ascend ++TARGET_DEVICE = os.environ.get("LMCACHE_TARGET_DEVICE", "ASCEND") ++ ++USE_TORCH_ENV = bool(os.getenv("USE_TORCH", "0") == "1") ++ ++logging.basicConfig(level=logging.INFO) ++logger = logging.getLogger(__name__) ++ ++def _get_ascend_home_path(): ++ # NOTE: standard Ascend CANN toolkit path ++ return os.environ.get("ASCEND_HOME_PATH", "/usr/local/Ascend/ascend-toolkit/latest") ++ ++def _get_ascend_env_path(): ++ # NOTE: standard Ascend Environment variable setup path ++ env_script_path = os.path.realpath(os.path.join(_get_ascend_home_path(), "..", "set_env.sh")) ++ if not os.path.exists(env_script_path): ++ raise ValueError(f"The file '{env_script_path}' is not found, " ++ "please make sure environment variable 'ASCEND_HOME_PATH' is set correctly.") ++ return env_script_path ++ ++def _get_npu_soc(): ++ _soc_version = os.getenv("SOC_VERSION", None) ++ if _soc_version is None: ++ npu_smi_cmd = [ ++ "bash", ++ "-c", ++ "npu-smi info | grep OK | awk '{print $3}' | head -n 1", ++ ] ++ try: ++ _soc_version = subprocess.check_output(npu_smi_cmd, ++ text=True).strip() ++ _soc_version = _soc_version.split("-")[0] ++ _soc_version = "Ascend"+_soc_version ++ return _soc_version ++ except subprocess.CalledProcessError as e: ++ raise RuntimeError(f"Retrieve SoC version failed: {e}") ++ ++ ++class CMakeExtension(Extension): ++ ++ def 
__init__(self, ++ name: str, ++ cmake_lists_dir: str = ".", ++ **kwargs) -> None: ++ super().__init__(name, sources=[], py_limited_api=False, **kwargs) ++ self.cmake_lists_dir = os.path.abspath(cmake_lists_dir) ++ ++class custom_install(install): ++ def run(self): ++ self.run_command("build_ext") ++ install.run(self) ++ ++class CustomAscendCmakeBuildExt(build_ext): ++ ++ def build_extension(self, ext): ++ # build the so as c_ops ++ ext_name = ext.name.split(".")[-1] ++ so_name = ext_name + ".so" ++ logger.info(f"Building {so_name} ...") ++ OPS_DIR = os.path.join(ROOT_DIR) ++ BUILD_OPS_DIR = os.path.join(ROOT_DIR, "build", "ascend") ++ os.makedirs(BUILD_OPS_DIR, exist_ok=True) ++ ++ ascend_home_path = _get_ascend_home_path() ++ env_path = _get_ascend_env_path() ++ _soc_version = _get_npu_soc() ++ _cxx_compiler = os.getenv("CXX") ++ _cc_compiler = os.getenv("CC") ++ python_executable = sys.executable + ++ try: ++ # if pybind11 is installed via pip ++ pybind11_cmake_path = (subprocess.check_output( ++ [python_executable, "-m", "pybind11", ++ "--cmakedir"]).decode().strip()) ++ except subprocess.CalledProcessError as e: ++ # else specify pybind11 path installed from source code on CI container ++ raise RuntimeError(f"CMake configuration failed: {e}") ++ ++ # import torch_npu ++ # torch_npu_path = os.path.dirname(os.path.abspath(torch_npu.__file__)) ++ # import torch ++ # torch_path = os.path.dirname(os.path.abspath(torch.__file__)) + ++ import mindspore ++ ms_path = os.path.dirname(os.path.abspath(mindspore.__file__)) ++ ++ # python include ++ python_include_path = sysconfig.get_path('include', scheme='posix_prefix') ++ ++ arch = platform.machine() ++ install_path = os.path.join(BUILD_OPS_DIR, 'install') ++ if isinstance(self.distribution.get_command_obj("develop"), develop): ++ install_path=BUILD_OPS_DIR ++ ++ cmake_cmd = [ ++ f"source {env_path} && " ++ f"cmake -S {OPS_DIR} -B {BUILD_OPS_DIR}" ++ f" -DSOC_VERSION={_soc_version}" ++ f" -DARCH={arch}" ++ " -DUSE_ASCEND=1" ++ f" -DPYTHON_EXECUTABLE={python_executable}" ++ f" -DCMAKE_PREFIX_PATH={pybind11_cmake_path}" ++ f" -DCMAKE_BUILD_TYPE=Release" ++ f" -DCMAKE_INSTALL_PREFIX={install_path}" ++ f" -DPYTHON_INCLUDE_PATH={python_include_path}" ++ # f" -DTORCH_NPU_PATH={torch_npu_path}" ++ # f" -DTORCH_PATH={torch_path}" ++ f" -DMS_NPU_DIR={ms_path}" ++ f" -DASCEND_CANN_PACKAGE_PATH={ascend_home_path}" ++ " -DCMAKE_VERBOSE_MAKEFILE=ON" ++ ] ++ ++ if _cxx_compiler is not None: ++ cmake_cmd += [f" -DCMAKE_CXX_COMPILER={_cxx_compiler}"] ++ ++ if _cc_compiler is not None: ++ cmake_cmd += [f" -DCMAKE_C_COMPILER={_cc_compiler}"] ++ ++ cmake_cmd += [f" && cmake --build {BUILD_OPS_DIR} -j --verbose"] ++ cmake_cmd += [f" && cmake --install {BUILD_OPS_DIR}"] ++ cmake_cmd = "".join(cmake_cmd) ++ ++ logger.info(f"Start running CMake commands:\n{cmake_cmd}") ++ try: ++ result = subprocess.run(cmake_cmd, cwd=ROOT_DIR, text=True, shell=True, check=True) ++ except subprocess.CalledProcessError as e: ++ raise RuntimeError(f"Failed to build {so_name}: {e}") ++ ++ build_lib_dir = self.get_ext_fullpath(ext.name) ++ os.makedirs(os.path.dirname(build_lib_dir), exist_ok=True) ++ ++ package_name = ext.name.split('.')[0] # e.g., 'lmcache' ++ src_dir = os.path.join(ROOT_DIR, package_name) ++ ++ for root, _, files in os.walk(install_path): ++ for file in files: ++ if file.endswith(".so"): ++ src_path = os.path.join(root, file) ++ dst_path = os.path.join(os.path.dirname(build_lib_dir), file) ++ if os.path.exists(dst_path): ++ os.remove(dst_path) ++ ++ if 
isinstance(self.distribution.get_command_obj("develop"), develop): ++ # For the ascend kernels ++ src_dir_file = os.path.join(src_dir, file) ++ shutil.copy(src_path, src_dir_file) ++ shutil.copy(src_path, dst_path) ++ ++ logger.info(f"Copied {file} to {dst_path}") ++ ++ ++ + def hipify_wrapper() -> None: + # Third Party + from torch.utils.hipify.hipify_python import hipify +@@ -58,7 +217,7 @@ def hipify_wrapper() -> None: + + + # Taken from https://github.com/vllm-project/vllm/blob/main/setup.py +-def get_requirements() -> list[str]: ++def get_requirements(target_device) -> list[str]: + """Get Python package dependencies from requirements.txt.""" + requirements_dir = ROOT_DIR / "requirements" + +@@ -76,8 +235,15 @@ def get_requirements() -> list[str]: + ): + resolved_requirements.append(line) + return resolved_requirements +- +- requirements = _read_requirements("common.txt") ++ ++ if target_device == "EMPTY": ++ requirements = _read_requirements("common.txt") ++ elif target_device == "CUDA" or target_device == "HIP": ++ requirements = _read_requirements("cuda.txt") ++ elif target_device == "ASCEND": ++ requirements = _read_requirements("ascend.txt") ++ else: ++ raise ValueError(f"Unknown target device: {target_device}") + return requirements + + +@@ -158,6 +324,12 @@ def rocm_extension() -> tuple[list, dict]: + return ext_modules, cmdclass + + ++def ascend_extension(): ++ print("Building Ascend extensions") ++ return [CMakeExtension(name="lmcache.lmcache_C")], \ ++ {"build_ext": CustomAscendCmakeBuildExt} ++ ++ + def source_dist_extension() -> tuple[list, dict]: + print("Not building CUDA/HIP extensions for sdist") + return [], {} +@@ -166,19 +338,31 @@ def source_dist_extension() -> tuple[list, dict]: + if __name__ == "__main__": + if BUILDING_SDIST: + get_extension = source_dist_extension +- elif BUILD_WITH_HIP: ++ TARGET_DEVICE = "EMPTY" ++ elif TARGET_DEVICE == "HIP": + get_extension = rocm_extension +- else: ++ elif TARGET_DEVICE == "CUDA": + get_extension = cuda_extension +- ++ elif TARGET_DEVICE == "ASCEND": ++ get_extension = ascend_extension ++ + ext_modules, cmdclass = get_extension() ++ requirments_list_origin = get_requirements(TARGET_DEVICE) ++ ++ if USE_TORCH_ENV: ++ requirments_list = requirments_list_origin ++ else: ++ requirments_list = ["mindspore >= 2.6.0" if s.startswith("torch") else s for s in requirments_list_origin] ++ + ++ logger.info("Python package dependencies: %s", requirments_list) + setup( + packages=find_packages( + exclude=("csrc",) + ), # Ensure csrc is excluded if it only contains sources +- install_requires=get_requirements(), ++ install_requires=requirments_list, + ext_modules=ext_modules, + cmdclass=cmdclass, + include_package_data=True, ++ package_data={ "lmcache": ['*.so'] } + ) +diff --git a/tests/v1/test_ms_mem_kernels.py b/tests/v1/test_ms_mem_kernels.py +new file mode 100644 +index 0000000..b032961 +--- /dev/null ++++ b/tests/v1/test_ms_mem_kernels.py +@@ -0,0 +1,190 @@ ++import random ++from typing import List ++ ++import torch ++import numpy as np ++ ++from lmcache import lmcache_C ++from mindspore import Tensor, nn, mutable, mint ++from lmcache.experimental.memory_management import MixedMemoryAllocator ++import debugpy ++import acl ++import time ++import mindspore as ms ++ms.set_context(mode=ms.PYNATIVE_MODE) ++ms.set_context(debug_level=ms.context.DEBUG) ++ ++ ++def generate_kv_cache_tuple_paged_list_tensors( ++ num_blocks, ++ device, ++ block_size=16, ++ num_layers=32, ++ num_heads=8, ++ head_size=128, ++ kvs_dim=2, ++ dtype=torch.bfloat16 
++): ++ """ ++ Return List[Tuple[Tensor, Tensor]] ++ """ ++ shape = [num_blocks, block_size, num_heads, head_size] ++ # calculate and print size ++ size = np.prod(shape) * 2 * num_layers * kvs_dim ++ print(f"KVCache size: {size / 1024 / 1024} MB") ++ kvcaches = [] ++ for i in range(num_layers): ++ current_cache = [] ++ for _ in range(kvs_dim): ++ t = mint.randn(shape, dtype=dtype) ++ t = t.move_to(device) ++ current_cache.append(mutable(t)) ++ kvcaches.append(mutable(tuple(current_cache))) ++ return mutable(kvcaches) ++ ++def check_paged_tuple_kv_cache_equal(left, ++ right, ++ num_tokens, ++ slot_mapping, ++ num_heads=8, ++ head_size=128): ++ """ ++ check whether two paged kv caches are the same at slot_mapping ++ """ ++ token_dim = 0 ++ for left_kv, right_kv in zip(left, right): ++ left_k = left_kv[0].reshape(-1, num_heads, head_size) ++ left_v = left_kv[1].reshape(-1, num_heads, head_size) ++ right_k = right_kv[0].reshape(-1, num_heads, head_size) ++ right_v = right_kv[1].reshape(-1, num_heads, head_size) ++ ++ assert len(left_k.shape) == 3 ++ assert len(left_v.shape) == 3 ++ assert len(right_k.shape) == 3 ++ assert len(right_v.shape) == 3 ++ ++ assert left_k.shape[token_dim] >= num_tokens ++ assert left_v.shape[token_dim] >= num_tokens ++ assert right_k.shape[token_dim] >= num_tokens ++ assert right_v.shape[token_dim] >= num_tokens ++ ++ # 1. Convert MindSpore Tensors to NumPy arrays ++ left_k_np = left_k.asnumpy() ++ right_k_np = right_k.asnumpy() ++ slot_mapping_np = slot_mapping.asnumpy() ++ ++ comp_left_k = left_k_np[slot_mapping_np, :, :] ++ comp_right_k = right_k_np[slot_mapping_np, :, :] ++ result_np_k = ( comp_left_k == comp_right_k ).all() ++ assert result_np_k ++ ++ left_v_np = left_v.asnumpy() ++ right_v_np = right_v.asnumpy() ++ ++ comp_left_v = left_v_np[slot_mapping_np, :, :] ++ comp_right_v = right_v_np[slot_mapping_np, :, :] ++ result_np_v = ( comp_left_v == comp_right_v ).all() ++ assert result_np_v ++ return True ++ ++ ++ ++def test_ms_multi_layer_kernel(mem_allocator, num_tokens, layers=60, kvs=2, ++ head_size=128, num_heads=10, ++ chunk_size=256, block_size=64): ++ device = "Ascend" ++ num_blocks = (num_tokens // block_size) + 100 ++ assert num_tokens >= chunk_size and num_tokens % chunk_size == 0 ++ dtype = torch.bfloat16 ++ kv_cache = generate_kv_cache_tuple_paged_list_tensors( ++ num_blocks, ++ device, ++ block_size=block_size, ++ num_layers=layers, ++ num_heads=num_heads, ++ head_size=head_size, ++ dtype=dtype ++ ) ++ slot_mapping = np.arange(0, num_tokens) ++ slot_mapping = torch.tensor(slot_mapping, device=device, dtype=torch.int32) ++ ++ # New extract with multi layer kernel ++ t1 = time.perf_counter() ++ kv_cache_pointers_dev = torch.empty([layers, kvs], ++ dtype=torch.int64, ++ device='Ascend', ++ pin_memory=True) ++ t2 = time.perf_counter() ++ kv_cache_pointers_cpus = torch.Tensor([[kvc.data_ptr() for kvc in layer_kv ]for layer_kv in kv_cache]) ++ kv_cache_pointers_dev.copy_(kv_cache_pointers_cpus, non_blocking=False) ++ t3 = time.perf_counter() ++ print("Cost cpu alloc: ", (t2-t1)*1000, "Cost KVCache ptrs: ", (t3 - t2)*1000) ++ ++ memory_obj_new_list = [] ++ start_event = torch.cuda.Event(enable_timing=True) ++ end_event = torch.cuda.Event(enable_timing=True) ++ start_event.record() ++ slot_mapping_chunked = torch.split(slot_mapping, chunk_size) ++ for chunk_id, slot_mapping_temp in enumerate(slot_mapping_chunked): ++ mem_obj_shape = [kvs, layers, len(slot_mapping_temp), num_heads * head_size] ++ memory_obj_new = mem_allocator.allocate(mem_obj_shape, 
dtype) ++ lmcache_C.paged_layers_kv_transfer(memory_obj_new.tensor, ++ kv_cache_pointers_dev, slot_mapping_temp, ++ memory_obj_new.base_ptr, num_blocks, ++ block_size, kvs, True, True) ++ memory_obj_new_list.append(memory_obj_new) ++ end_event.record() ++ torch.cuda.synchronize() ++ elapsed_time_ms = start_event.elapsed_time(end_event) ++ print("extract time: ", elapsed_time_ms / 1000) ++ ++ kv_cache_new = generate_kv_cache_tuple_paged_list_tensors( ++ num_blocks, ++ device, ++ block_size=block_size, ++ num_layers=layers, ++ num_heads=num_heads, ++ head_size=head_size, ++ dtype=dtype ++ ) ++ ++ kv_cache_pointers_dev_new = torch.empty([layers, kvs], ++ dtype=torch.int64, ++ device='Ascend', ++ pin_memory=True) ++ kv_cache_pointers_cpus_new = torch.Tensor([[kvc.data_ptr() for kvc in layer_kv ]for layer_kv in kv_cache_new]) ++ kv_cache_pointers_dev_new.copy_(kv_cache_pointers_cpus_new, non_blocking=False) ++ torch.cuda.synchronize() ++ for chunk_id, slot_mapping_temp in enumerate(slot_mapping_chunked): ++ memory_obj_new = memory_obj_new_list[chunk_id] ++ ++ lmcache_C.paged_layers_kv_transfer(memory_obj_new.tensor, ++ kv_cache_pointers_dev_new, slot_mapping_temp, ++ memory_obj_new.base_ptr, num_blocks, ++ block_size, kvs, False, True) ++ ++ valid_check_t1 = time.perf_counter() ++ valid = check_paged_tuple_kv_cache_equal( ++ kv_cache, ++ kv_cache_new, ++ num_tokens, ++ slot_mapping, ++ num_heads=num_heads, ++ head_size=head_size ++ ) ++ valid_check_time = (time.perf_counter() - valid_check_t1) * 1000 ++ print(valid, valid_check_time) ++ ++ del kv_cache_pointers_dev, kv_cache_pointers_dev_new, kv_cache, kv_cache_new ++ ms.runtime.synchronize() ++ ms.runtime.empty_cache() ++ ++if __name__ == "__main__": ++ # debugpy.listen(('0.0.0.0', 5678)) ++ # debugpy.wait_for_client() ++ pinned_cpu_size = 30 * 1024 * 1024 * 1024 ++ mem_allocator = MixedMemoryAllocator(pinned_cpu_size) ++ ++ for token in [8192]: ++ test_ms_multi_layer_kernel(mem_allocator, token) ++ time.sleep(10) +\ No newline at end of file diff --git a/fix-build-requirements-for-ascend.patch b/fix-build-requirements-for-ascend.patch new file mode 100644 index 0000000000000000000000000000000000000000..79ab28970a2b300ee2b2388e3bfdb9ec9a95283b --- /dev/null +++ b/fix-build-requirements-for-ascend.patch @@ -0,0 +1,54 @@ +diff --git a/pyproject.toml b/pyproject.toml +index 9a1c936..3c2ed13 100644 +--- a/pyproject.toml ++++ b/pyproject.toml +@@ -5,7 +5,7 @@ requires = [ + "packaging>=24.2", + "setuptools>=77.0.3,<81.0.0", + "setuptools_scm>=8", +- "torch==2.7.0", ++ "torch>=2.5.1", + "wheel", + ] + build-backend = "setuptools.build_meta" +diff --git a/requirements/build.txt b/requirements/build.txt +index a2db38b..ccb5518 100644 +--- a/requirements/build.txt ++++ b/requirements/build.txt +@@ -4,5 +4,5 @@ ninja + packaging>=24.2 + setuptools>=77.0.3,<81.0.0 + setuptools_scm>=8 +-torch==2.7.0 # Corresponds to the version used by vLLM main branch ++torch>=2.5.1 # Corresponds to the version used by vLLM main branch + wheel +diff --git a/requirements/common.txt b/requirements/common.txt +index 9894e01..ba3c3a7 100644 +--- a/requirements/common.txt ++++ b/requirements/common.txt +@@ -1,8 +1,7 @@ + aiofile + aiofiles + aiohttp +-cufile-python +-infinistore ++infinistore; platform_machine == 'x86_64' + msgspec + numpy + nvtx +@@ -15,5 +14,5 @@ safetensors + setuptools>=77.0.3,<81.0.0 + setuptools_scm>=8 + sortedcontainers +-torch==2.7.0 # Should correspond to the version used by vLLM main branch ++torch>=2.5.1 # Should correspond to the version used by 
vLLM main branch + transformers >= 4.51.1 +diff --git a/requirements/cuda.txt b/requirements/cuda.txt +index 719261d..4912df9 100644 +--- a/requirements/cuda.txt ++++ b/requirements/cuda.txt +@@ -9,3 +9,4 @@ nvidia-ml-py # for pynvml package + torch == 2.7.0 + torchvision == 0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version + xformers == 0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.7.0 ++cufile-python diff --git a/fix-dependance-issue-on-ascend.patch b/fix-dependance-issue-on-ascend.patch deleted file mode 100644 index 05a158638d383324086c991815ebf5b095df5c68..0000000000000000000000000000000000000000 --- a/fix-dependance-issue-on-ascend.patch +++ /dev/null @@ -1,13 +0,0 @@ -diff --git a/setup.py b/setup.py -index 5b1aa4b..1895953 100644 ---- a/setup.py -+++ b/setup.py -@@ -17,7 +17,7 @@ setup( - "nvtx", - "safetensors", - "transformers", -- "torchac_cuda >= 0.2.5", -+ # "torchac_cuda >= 0.2.5", - ], - classifiers=[ - # Trove classifiers diff --git a/lmcache.spec b/lmcache.spec index c1dc836e6d611c2048e6a0ea79c5db825bb39c0a..6791f1ce8ef5bdba599247a76d8b83c281cbf5f7 100644 --- a/lmcache.spec +++ b/lmcache.spec @@ -1,20 +1,19 @@ %define debug_package %{nil} Name: python-LMCache -Version: 0.1.4.alpha +Version: 0.3.1.post1 Release: 1 Summary: LMCache - A lightweight memory cache system License: Apache 2.0 URL: https://github.com/LMCache/LMCache Source0: LMCache-%{version}.tar.gz -Patch0: fix-dependance-issue-on-ascend.patch - -BuildArch: noarch +Patch0: fix-build-requirements-for-ascend.patch +Patch1: adapt-mindspore.patch +BuildArch: aarch64 BuildRequires: python3 BuildRequires: python3-setuptools python3-pip - Requires: python3 %description @@ -25,17 +24,26 @@ simple key-value storage with minimal overhead. %autosetup -n %{name}-%{version} -p1 -Sgit %build -ln -s /usr/bin/python3 /usr/bin/python || : +# ln -s /usr/bin/python3 /usr/bin/python || : + +export CPLUS_INCLUDE_PATH=/usr/include/c++/12/aarch64-openEuler-linux:/usr/include/c++/12:$CPLUS_INCLUDE_PATH +export LMCACHE_TARGET_DEVICE="ASCEND" +export USE_TORCH=0 +export PYTHONPATH=/usr/local/lib64/python3.11/site-packages:/usr/local/lib/python3.11/site-packages:/usr/lib/python3.11/site-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:$LD_LIBRARY_PATH + %py3_build + %install %py3_install + %files -%{python3_sitelib}/* +/usr/lib64/python3.11/site-packages/* %{_bindir}/* -%changelog -* Tue Jun 24 2025 jingjunyuan - 0.1.4.alpha-1 -- Initial package build +%changelog +* Thu Jul 24 2025 jingjunyuan - 0.3.1.post1-1 +- Initial package build \ No newline at end of file
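For reviewers, a minimal usage sketch of the NPU paged-memory connector that adapt-mindspore.patch introduces. The import path and class name are assumptions for illustration only; the keyword arguments mirror the ones the constructor asserts on above (kv_is_tuple, kv_size, chunk_size, dtype, device), and running it requires a MindSpore/MSAdapter environment on an Ascend device.

# Hypothetical usage sketch (not part of the patch); names marked "assumed"
# are illustrative only.
import torch  # resolves to the MSAdapter torch shim when USE_TORCH=0

from lmcache.v1.gpu_connector import VLLMPagedMemNPUConnector  # assumed module/class name

connector = VLLMPagedMemNPUConnector(
    hidden_dim_size=8 * 128,   # num_heads * head_size
    num_layers=32,
    use_npu=True,
    kv_is_tuple=True,          # MindSpore tuple KV layout
    kv_size=2,                 # K and V
    chunk_size=256,            # max tokens copied per chunk
    dtype=torch.bfloat16,
    device="Ascend",
)

# Shape of the memory object produced for a 256-token chunk:
# torch.Size([2, 32, 256, 1024]), i.e. the KV_BLOB layout
# [kv_size, num_layers, num_tokens, hidden_dim_size].
print(connector.get_shape(256))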