diff --git a/LMCache-0.1.4.alpha.tar.gz b/LMCache-0.1.4.alpha.tar.gz deleted file mode 100644 index c242010c5335191bfa5298c757768528425427ec..0000000000000000000000000000000000000000 Binary files a/LMCache-0.1.4.alpha.tar.gz and /dev/null differ diff --git a/LMCache-0.3.1.post1.tar.gz b/LMCache-0.3.1.post1.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..e0afd2f6a44fabe6c591b1acf8893f2d866ac754 Binary files /dev/null and b/LMCache-0.3.1.post1.tar.gz differ diff --git a/adapt-mindspore.patch b/adapt-mindspore.patch new file mode 100644 index 0000000000000000000000000000000000000000..a3c06e308f813dae4dc31784645c32782d592209 --- /dev/null +++ b/adapt-mindspore.patch @@ -0,0 +1,4688 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +new file mode 100644 +index 0000000..a77b665 +--- /dev/null ++++ b/CMakeLists.txt +@@ -0,0 +1,252 @@ ++cmake_minimum_required(VERSION 3.16) ++project(lmcache_C) ++ ++# include(CheckCXXcompilerFlag) ++# check_cxx_compiler_flag("-std=c++17", COMPILER_SUPPORTS_CXX17) ++set(CMAKE_CXX_STANDARD 17) ++ ++include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) ++ ++# Suppress potential warnings about unused manually-specified variables ++set(ignoreMe "${VLLM_PYTHON_PATH}") ++ ++# TODO: Add 3.12 back when torch-npu support 3.12 ++set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11") ++ ++find_package(pybind11 REQUIRED) ++ ++# append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") ++set(LMCACHE_ASCEND_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}") ++ ++# find_package(Torch REQUIRED) ++ ++set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") ++message(STATUS "Detected SOC version: ${SOC_VERSION}") ++ ++if (NOT CMAKE_BUILD_TYPE) ++ set(CMAKE_BUILD_TYPE "Release" CACHE STRINGS "Build type Release/Debug (default Release)" FORCE) ++endif() ++ ++if (CMAKE_INSTALL_PREFIX STREQUAL /usr/local) ++ set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRINGS "path to install()") ++endif() ++ ++message(STATUS "ASCEND_CANN_PACKAGE_PATH: ${ASCEND_CANN_PACKAGE_PATH}") ++ ++set(ASCEND_HOME_PATH ${ASCEND_CANN_PACKAGE_PATH}) ++if(EXISTS ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake) ++ set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake) ++elseif(EXISTS ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake) ++ set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake) ++elseif(EXISTS ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake) ++ set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake) ++elseif(EXISTS ${ASCEND_HOME_PATH}/aarch64-linux/tikcpp/ascendc_kernel_cmake) ++ set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/aarch64-linux/tikcpp/ascendc_kernel_cmake) ++else() ++ message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the cann package is installed.") ++endif() ++ ++include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) ++file(GLOB KERNEL_FILES ++${CMAKE_CURRENT_SOURCE_DIR}/csrc/ascend/kernels/*.cpp) ++ ++ascendc_library(lmcache_ascend_kernels SHARED ++ ${KERNEL_FILES} ++) ++ ++ ++set(ASCENDC_TARGET_NAME lmcache_ascend_kernels PARENT_SCOPE) ++set(ASCENDC_TARGET_DIR "${CMAKE_BINARY_DIR}/lib" PARENT_SCOPE) ++ ++if(NOT PYTHON_EXECUTABLE AND CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") ++ find_package(PythonInterp QUIET) # For older CMake or Windows specific cases ++elseif(NOT PYTHON_EXECUTABLE) ++ find_package(Python COMPONENTS Interpreter QUIET) # Modern CMake ++endif() ++ ++if (NOT PYTHON_EXECUTABLE) ++ set(PYTHON_EXECUTABLE "python3") # Fallback to system python3 if not found 
by CMake ++ message(WARNING "PYTHON_EXECUTABLE not found by CMake, falling back to 'python3'. Ensure this is the correct interpreter.") ++endif() ++ ++ ++if (USE_TORCH) ++ If (DEFINED TORCH_NPU_DIR AND IS_DIRECTORY "${TORCH_NPU_DIR}") ++ set(TORCH_NPU_PATH ${TORCH_NPU_DIR}) # This is the base path to the torch_npu package ++ message(STATUS "Using TORCH_NPU_PATH (from TORCH_NPU_DIR via setup.py): ${TORCH_NPU_PATH}") ++ ++ # Check for lib subdirectory ++ if(IS_DIRECTORY "${TORCH_NPU_PATH}/lib") ++ message(STATUS "Found Torch NPU 'lib' directory: ${TORCH_NPU_PATH}/lib") ++ else() ++ message(WARNING "Torch NPU 'lib' directory ('${TORCH_NPU_PATH}/lib') NOT found. Linker may fail.") ++ endif() ++ ++ # Check for include subdirectory ++ if(IS_DIRECTORY "${TORCH_NPU_PATH}/include") ++ message(STATUS "Found Torch NPU 'include' directory: ${TORCH_NPU_PATH}/include") ++ else() ++ message(WARNING "Torch NPU 'include' directory ('${TORCH_NPU_PATH}/include') NOT found.") ++ endif() ++ else() ++ execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import os; import torch_npu; print(os.path.dirname(torch_npu.__file__))" ++ OUTPUT_STRIP_TRAILING_WHITESPACE ++ OUTPUT_VARIABLE TORCH_NPU_PATH ++ ) ++ endif() ++ # make sure TORCH_NPU_PATH is set ++ if(NOT TORCH_NPU_PATH) ++ message(FATAL_ERROR "TORCH_NPU_PATH is not set") ++ endif() ++ message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}") ++else() ++ If (DEFINED MS_NPU_DIR AND IS_DIRECTORY "${MS_NPU_DIR}") ++ set(MS_NPU_PATH ${MS_NPU_DIR}) # This is the base path to the torch_npu package ++ message(STATUS "Using MS_NPU_DIR (from MS_NPU_DIR via setup.py): ${MS_NPU_PATH}") ++ else() ++ execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import os; import mindspore as ms; print(os.path.dirname(ms.__file__))" ++ OUTPUT_STRIP_TRAILING_WHITESPACE ++ OUTPUT_VARIABLE MS_NPU_PATH ++ ) ++ endif() ++ if(NOT MS_NPU_PATH) ++ message(FATAL_ERROR "MS_NPU_PATH is not set") ++ endif() ++ message("MS_NPU_PATH is ${MS_NPU_PATH}") ++endif() ++ ++file(GLOB LMCACHE_ASCEND_SRC ++${CMAKE_CURRENT_SOURCE_DIR}/csrc/ascend/*.cpp) ++pybind11_add_module(lmcache_C ${LMCACHE_ASCEND_SRC}) ++ ++get_filename_component(ASCEND_TOOLKIT_PATH ${ASCEND_HOME_PATH} DIRECTORY) ++get_filename_component(ASCEND_DIR_PATH ${ASCEND_TOOLKIT_PATH} DIRECTORY) ++ ++target_compile_options(lmcache_C PRIVATE -Wno-deprecated-declarations) ++ ++if (USE_TORCH) # This 'if' checks the *CMake variable* USE_TORCH (TRUE/FALSE) ++ # Only add the preprocessor definition if the CMake variable USE_TORCH is TRUE ++ target_compile_definitions(lmcache_C PRIVATE USE_TORCH) ++ message(STATUS "CMake: USE_TORCH variable is TRUE. Adding -DUSE_TORCH compile definition.") ++else() ++ # No need to explicitly undefine it, just don't add the -D flag. ++ find_package(Threads REQUIRED) ++ ++ target_compile_definitions(lmcache_C PRIVATE ++ -D_GLIBCXX_USE_CXX11_ABI=0 ++ ENABLE_FAST_HASH_TABLE=1 ++ CUSTOM_ASCEND_OP ++ ) ++ message(STATUS "CMake: USE_TORCH variable is FALSE. 
NOT adding -DUSE_TORCH compile definition.") ++endif() ++ ++set( ++ INCLUDES ++ ${pybind11_INCLUDE_DIRS} ++ ${PYTHON_INCLUDE_PATH} ++ ${ASCEND_HOME_PATH}/include ++ ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform ++ ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/ascend_hal ++ ${ASCEND_DIR_PATH}/driver/include ++) ++ ++if (USE_TORCH) ++ list(APPEND INCLUDES ${TORCH_INCLUDE_DIRS}) ++ list(APPEND INCLUDES ${TORCH_NPU_PATH}/include) ++else() ++ list(APPEND INCLUDES ${MS_NPU_PATH}) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/third_party) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/third_party/robin_hood_hashing) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/third_party/securec/include) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/core/include) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/core/mindrt/include) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/core/mindrt) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/ops) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/ops/kernel/include) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/ccsrc) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/ccsrc/include) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/include/mindspore/ccsrc/minddata/mindrecord/include) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/lib) ++ list(APPEND INCLUDES ${MS_NPU_PATH}/lib/plugin) ++endif() ++ ++ ++target_include_directories(lmcache_C PRIVATE ${INCLUDES}) ++ ++set( ++ PLINK_DIRS ++ ${ASCEND_HOME_PATH}/lib64 ++ ${ASCEND_HOME_PATH}/aarch64-linux/devlib ++ ${ASCEND_HOME_PATH}/aarch64-linux/devlib/linux/x86_64 ++ ${ASCEND_HOME_PATH}/aarch64-linux/devlib/linux/aarch64 ++ /usr/local/Ascend/driver/lib64/driver ++) ++ ++if (USE_TORCH) ++ list(APPEND PLINK_DIRS ${TORCH_NPU_PATH}/lib) ++ set_target_properties(lmcache_C PROPERTIES ++ BUILD_RPATH "${TORCH_NPU_PATH}/lib;$$ORIGIN;$$ORIGIN/lib" ++ INSTALL_RPATH "$${TORCH_NPU_PATH}/lib;$$ORIGIN;$$ORIGIN/lib" ++ INSTALL_RPATH_USE_LINK_PATH TRUE # Important for ensuring RPATH is set correctly on install ++ ) ++else() ++ list(APPEND PLINK_DIRS ${MS_NPU_PATH}/lib) ++ list(APPEND PLINK_DIRS ${MS_NPU_PATH}/lib/plugin) ++ set_target_properties(lmcache_C PROPERTIES ++ BUILD_RPATH "${MS_NPU_PATH}/lib;${MS_NPU_PATH}/lib/plugin;$$ORIGIN;$$ORIGIN/lib" ++ INSTALL_RPATH "${MS_NPU_PATH}/lib;${MS_NPU_PATH}/lib/plugin;$$ORIGIN;$$ORIGIN/lib" ++ INSTALL_RPATH_USE_LINK_PATH TRUE ++ ) ++ ++endif() ++ ++target_link_directories( ++ lmcache_C ++ PRIVATE ++ ${PLINK_DIRS} ++) ++ ++ ++set( ++ PUBLINK_LIBS ++ lmcache_ascend_kernels ++ tiling_api ++ ascendcl ++ platform ++ ascend_hal ++ dcmi ++ drvdsmi_host ++) ++ ++if (USE_TORCH) ++ list(APPEND PUBLINK_LIBS ${TORCH_LIBRARIES}) ++ list(APPEND PUBLINK_LIBS libtorch_npu.so) ++else() ++ list(APPEND PUBLINK_LIBS mindspore_common) ++ list(APPEND PUBLINK_LIBS mindspore_ops) ++ list(APPEND PUBLINK_LIBS mindspore_pyboost) ++ list(APPEND PUBLINK_LIBS mindspore_frontend) ++ list(APPEND PUBLINK_LIBS mindspore_res_manager) ++ list(APPEND PUBLINK_LIBS mindspore_backend_manager) ++ list(APPEND PUBLINK_LIBS mindspore_ops_kernel_common) ++ list(APPEND PUBLINK_LIBS mindspore_runtime_pipeline) ++ list(APPEND PUBLINK_LIBS pthread) ++ # Define the absolute path to libmindspore_ascend.so.2 ++# This path was confirmed by your 'ls -l' output. 
++ set(MINDSPORE_ASCEND_LIB_FULL_PATH "${MS_NPU_PATH}/lib/plugin/libmindspore_ascend.so.2") ++ list(APPEND PUBLINK_LIBS ${MINDSPORE_ASCEND_LIB_FULL_PATH}) ++endif() ++ ++message(STATUS "DEBUG: PUBLINK_LIBS content before linking: ${PUBLINK_LIBS}") ++ ++target_link_libraries( ++ lmcache_C ++ PUBLIC ++ ${PUBLINK_LIBS} ++) ++ ++ ++install(TARGETS lmcache_C lmcache_ascend_kernels DESTINATION ${LMCACHE_ASCEND_INSTALL_PATH}) ++ ++ +diff --git a/cmake/utils.cmake b/cmake/utils.cmake +new file mode 100644 +index 0000000..62078fd +--- /dev/null ++++ b/cmake/utils.cmake +@@ -0,0 +1,133 @@ ++# ++# Attempt to find the python package that uses the same python executable as ++# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. ++# ++macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) ++ file(REAL_PATH ${EXECUTABLE} EXECUTABLE) ++ set(Python_EXECUTABLE ${EXECUTABLE}) ++ find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule) ++ if (NOT Python_FOUND) ++ message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") ++ endif() ++ set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") ++ set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) ++ if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST) ++ message(FATAL_ERROR ++ "Python version (${_VER}) is not one of the supported versions: " ++ "${_SUPPORTED_VERSIONS_LIST}.") ++ endif() ++ message(STATUS "Found python matching: ${EXECUTABLE}.") ++endmacro() ++ ++# ++# Run `EXPR` in python. The standard output of python is stored in `OUT` and ++# has trailing whitespace stripped. If an error is encountered when running ++# python, a fatal message `ERR_MSG` is issued. ++# ++function (run_python OUT EXPR ERR_MSG) ++ execute_process( ++ COMMAND ++ "${PYTHON_EXECUTABLE}" "-c" "${EXPR}" ++ OUTPUT_VARIABLE PYTHON_OUT ++ RESULT_VARIABLE PYTHON_ERROR_CODE ++ ERROR_VARIABLE PYTHON_STDERR ++ OUTPUT_STRIP_TRAILING_WHITESPACE) ++ ++ if(NOT PYTHON_ERROR_CODE EQUAL 0) ++ message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") ++ endif() ++ set(${OUT} ${PYTHON_OUT} PARENT_SCOPE) ++endfunction() ++ ++# Run `EXPR` in python after importing `PKG`. Use the result of this to extend ++# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. ++macro (append_cmake_prefix_path PKG EXPR) ++ run_python(_PREFIX_PATH ++ "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") ++ list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH}) ++endmacro() ++ ++ ++# This cmake function is adapted from vllm /Users/ganyi/workspace/vllm-ascend/cmake/utils.cmake ++# Define a target named `GPU_MOD_NAME` for a single extension. The ++# arguments are: ++# ++# DESTINATION - Module destination directory. ++# LANGUAGE - The GPU language for this module, e.g CUDA, HIP, ++# etc. ++# SOURCES - List of source files relative to CMakeLists.txt ++# directory. ++# ++# Optional arguments: ++# ++# ARCHITECTURES - A list of target GPU architectures in cmake ++# format. ++# Refer `CMAKE_CUDA_ARCHITECTURES` documentation ++# and `CMAKE_HIP_ARCHITECTURES` for more info. ++# ARCHITECTURES will use cmake's defaults if ++# not provided. ++# COMPILE_FLAGS - Extra compiler flags passed to NVCC/hip. ++# INCLUDE_DIRECTORIES - Extra include directories. ++# LIBRARIES - Extra link libraries. ++# WITH_SOABI - Generate library with python SOABI suffix name. ++# USE_SABI - Use python stable api ++# ++# Note: optimization level/debug info is set via cmake build type. 
++# ++function (define_gpu_extension_target GPU_MOD_NAME) ++ cmake_parse_arguments(PARSE_ARGV 1 ++ GPU ++ "WITH_SOABI" ++ "DESTINATION;LANGUAGE;USE_SABI" ++ "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES") ++ ++ # Add hipify preprocessing step when building with HIP/ROCm. ++ if (GPU_LANGUAGE STREQUAL "HIP") ++ hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}") ++ endif() ++ ++ if (GPU_WITH_SOABI) ++ set(GPU_WITH_SOABI WITH_SOABI) ++ else() ++ set(GPU_WITH_SOABI) ++ endif() ++ ++ if (GPU_USE_SABI) ++ Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}") ++ else() ++ Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}") ++ endif() ++ ++ if (GPU_LANGUAGE STREQUAL "HIP") ++ # Make this target dependent on the hipify preprocessor step. ++ add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME}) ++ endif() ++ ++ if (GPU_ARCHITECTURES) ++ set_target_properties(${GPU_MOD_NAME} PROPERTIES ++ ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}") ++ endif() ++ ++ set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17) ++ ++ target_compile_options(${GPU_MOD_NAME} PRIVATE ++ $<$:${GPU_COMPILE_FLAGS}>) ++ ++ target_compile_definitions(${GPU_MOD_NAME} PRIVATE ++ "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}") ++ ++ target_include_directories(${GPU_MOD_NAME} PRIVATE csrc ++ ${GPU_INCLUDE_DIRECTORIES}) ++ ++ target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES}) ++ ++ # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of ++ # dependencies that are not necessary and may not be installed. ++ if (GPU_LANGUAGE STREQUAL "CUDA") ++ target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver) ++ else() ++ target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) ++ endif() ++ ++ install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME}) ++endfunction() +diff --git a/csrc/ascend/dcmi_management.cpp b/csrc/ascend/dcmi_management.cpp +new file mode 100644 +index 0000000..64f3d7b +--- /dev/null ++++ b/csrc/ascend/dcmi_management.cpp +@@ -0,0 +1,35 @@ ++#include "dcmi_management.h" ++#include ++#include ++ ++namespace lmc { ++DCMIManager::DCMIManager() { ++ auto ret = dcmi_init(); ++ this->initialized = false; ++ if (ret == 0) { ++ this->initialized = true; ++ } else { ++ std::cerr << "Unable to initialize DCMI: " << std::to_string(ret) << std::endl; ++ } ++}; ++ ++DCMIManager::~DCMIManager() { ++}; ++ ++std::string DCMIManager::getCPUAffinityFromDeviceId(int8_t cardId, int8_t devId) { ++ if (!this->initialized) { ++ throw std::runtime_error("DCMI not initialized\n"); ++ } ++ ++ char affinityCpu[TOPO_INFO_MAX_LENTH] ={0}; ++ int length = 0; ++ ++ auto dcmiErr = dcmi_get_affinity_cpu_info_by_device_id(static_cast(cardId), ++ static_cast(devId), affinityCpu, &length); ++ if (dcmiErr != 0 ){ ++ throw std::runtime_error("Error in getting affinity CPU info: " + std::to_string(dcmiErr) + "\n"); ++ } ++ ++ return std::string(affinityCpu, length); ++} ++} +\ No newline at end of file +diff --git a/csrc/ascend/dcmi_management.h b/csrc/ascend/dcmi_management.h +new file mode 100644 +index 0000000..83dcf05 +--- /dev/null ++++ b/csrc/ascend/dcmi_management.h +@@ -0,0 +1,32 @@ ++#pragma once ++#include "dcmi_interface_api.h" ++#include ++ ++namespace lmc { ++class DCMIManager { ++private: ++ DCMIManager(); ++ ++ // Delete Copy constructor and assignment operator ++ DCMIManager(const DCMIManager&) = delete; ++ DCMIManager& 
operator=(const DCMIManager&) = delete; ++ DCMIManager(DCMIManager&&) = delete; ++ DCMIManager& operator=(DCMIManager&&) = delete; ++ ++ bool initialized; ++ std::string cpuAffinity; ++ ++public: ++ static DCMIManager& GetInstance() ++ { ++ static DCMIManager instance; ++ return instance; ++ } ++ ~DCMIManager(); ++ ++ // NOTE: at the moment we assume card and devId are the same. ++ // there might be scenario you won't have the same card and devId. ++ // we should indeed do this properly ? ++ std::string getCPUAffinityFromDeviceId(int8_t cardId, int8_t devId); ++}; ++} +diff --git a/csrc/ascend/framework_hal.h b/csrc/ascend/framework_hal.h +new file mode 100644 +index 0000000..5d22d91 +--- /dev/null ++++ b/csrc/ascend/framework_hal.h +@@ -0,0 +1,78 @@ ++#pragma once ++ ++#ifdef USE_TORCH ++ #include ++ #include ++#else ++ #include ++ #include ++ #include ++ #include ++ #include ++#endif ++ ++ ++namespace py = pybind11; ++namespace framework_hal { ++#ifdef USE_TORCH ++#else ++// NOTE (Gingfung): This function encapsulates the logic of calling MindSpore's Python API ++// The reason we had to do this were because when I directly included the mindspore runtime apis for building, ++// it kept erroring with linking issues. I was in touch with a couple of MindSpore guys but never really got it working. ++// Anyway, we will revisit using Mindspore runtime API once we are on 2.7. ++int32_t _get_device_id_from_mindspore_hal_python_api_impl() { ++ int device_id = -1; ++ ++ try { ++ py::module_ ms = py::module_::import("mindspore"); ++ py::module_ hal = ms.attr("hal"); ++ py::object current_stream_obj = hal.attr("current_stream")(); ++ py::object device_id_py_obj = current_stream_obj.attr("device_id"); ++ device_id = device_id_py_obj.cast(); ++ ++ // NOTE: Mindspore current_device_id starts from 0 no matter what, so we should shift the device id ++ // iff users specify the ASCEND_RT_VISIBLE_DEVICES ++ const char* env_visible_devices_p = std::getenv("ASCEND_RT_VISIBLE_DEVICES"); ++ if (env_visible_devices_p != nullptr) { ++ std::string env_visible_devices = env_visible_devices_p; ++ std::vector list_visible_devices; ++ std::stringstream ss(env_visible_devices); ++ std::string item; ++ while (std::getline(ss, item, ',')) { ++ list_visible_devices.push_back(std::stoi(item)); ++ } ++ std::sort(list_visible_devices.begin(), list_visible_devices.end()); ++ // from what I have seen, there are two cases: ++ // 1. no hccl, we just use current_device, even though we have specify the ASCEND_RT_VISIBLE_DEVICES ++ // 2. hccl, and we use current_device that seems to be correct ++ // for case 2, since the current_device would have been correct anyway, obtaining from the list would be fine. ++ // for case 1, we have shifted the device to the RT_VISIBLE_DEVICES, so it should be corrected. ++ device_id = list_visible_devices[device_id]; ++ } ++ ++ } catch (const py::error_already_set &e) { ++ std::cerr << "C++ Internal: Python error in _get_device_id_from_mindspore_hal_python_api_impl: " << e.what() << std::endl; ++ PyErr_Print(); // Print Python traceback ++ device_id = -1; ++ } catch (const std::exception &e) { ++ std::cerr << "C++ Internal: Standard C++ exception in _get_device_id_from_mindspore_hal_python_api_impl: " << e.what() << std::endl; ++ device_id = -1; ++ } ++ return device_id; ++} ++#endif ++ ++ ++int8_t GetDeviceIdx() { ++#ifdef USE_TORCH ++ return c10_npu::getCurrentNPUStream().device_index(); ++#else ++ // FIXME: should be always within 127 ? 
++ auto devId = _get_device_id_from_mindspore_hal_python_api_impl(); ++ if (devId == -1) { ++ throw std::runtime_error("Failed to get device ID from MindSpore's Python API."); ++ } ++ return devId; ++#endif ++}; ++} // namespace framework_hal +\ No newline at end of file +diff --git a/csrc/ascend/kernels/paged_kv_copy.cpp b/csrc/ascend/kernels/paged_kv_copy.cpp +new file mode 100644 +index 0000000..d36b415 +--- /dev/null ++++ b/csrc/ascend/kernels/paged_kv_copy.cpp +@@ -0,0 +1,170 @@ ++/* NOTE: tested fp16, but not the rest yet. */ ++#include "kernel_operator.h" ++#include ++#include "types.h" ++#include "utils.h" ++ ++template class PagedKVCopy { ++ using local_scalar_t = AscendC::LocalTensor; ++ ++public: ++ __aicore__ inline PagedKVCopy() ++ { ++ } ++ ++ __aicore__ inline void init(GM_ADDR pagedKVCaches, GM_ADDR cacheTensor, GM_ADDR slotmappings, ++ const int64_t numPages, const int64_t hiddenDims, const int32_t pagedSize, ++ const int32_t numLayers, const int32_t numTokensChunk, const bool page2L, ++ AscendC::TPipe *pipe) ++ { ++ this->pipe_ = pipe; ++ this->numLayers_ = numLayers; ++ this->numPages_ = numPages; ++ this->hiddenDims_ = hiddenDims; ++ this->numTokensChunk_ = numTokensChunk; ++ this->pagedSize_ = pagedSize; ++ this->page2L_ = page2L; ++ this->invalid_ = false; ++ // TODO: Not sure whether we need double buffering since we are mainly copying data ++ // might only make sense if we have compute ++ this->pipe_->InitBuffer(pagedTokenQue_, 2, this->hiddenDims_*sizeof(scalar_t)); ++ } ++ ++ __aicore__ inline void updateMemOffset(__gm__ uint8_t *pagedKVCaches, __gm__ uint8_t* cacheTensor, ++ __gm__ uint8_t *slotmappings, const int tokenIdx, ++ const int kvIdx, const int layerIdx) ++ { ++ __gm__ int32_t *slotmappingPtr = reinterpret_cast<__gm__ int32_t*>(slotmappings); ++ int64_t slot = slotmappingPtr[tokenIdx]; ++ if (slot == -1) { ++ this->invalid_ = true; ++ return; ++ } ++ // its a pointer within the GM addr space, that point to another GM addr space ++ __gm__ uint8_t * __gm__ *pagedKVCachesPtr = reinterpret_cast<__gm__ uint8_t* __gm__ *>(pagedKVCaches); ++ ++ // getting the right ptr to the paged kvcache layer ++ __gm__ uint8_t *pagedLayerKVCaches = pagedKVCachesPtr[layerIdx]; ++ ++ int64_t pagedIdx = slot / this->pagedSize_; ++ int64_t withinPagedIdx = slot % this->pagedSize_; ++ ++ int64_t pagedIdxOffset = kvIdx * this->numPages_ * this->pagedSize_ * this->hiddenDims_ + ++ pagedIdx * this->pagedSize_ * this->hiddenDims_ + ++ withinPagedIdx * this->hiddenDims_; ++ ++ int64_t dstTensorIdxOffset = kvIdx * this->numLayers_ * this->numTokensChunk_ * this->hiddenDims_ + ++ layerIdx * this->numTokensChunk_ * this->hiddenDims_ + ++ tokenIdx * this->hiddenDims_; ++ ++ this->pagedTokenGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(pagedLayerKVCaches) + pagedIdxOffset, ++ this->hiddenDims_); ++ this->lmcBufferGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(cacheTensor) + dstTensorIdxOffset, ++ this->hiddenDims_); ++ } ++ ++ __aicore__ inline void processFunc() { ++ if (this->invalid_) { ++ return; ++ } ++ // 1. Alloc Tensor for local page ++ local_scalar_t hiddenDimTensor = pagedTokenQue_.AllocTensor(); ++ ++ // 2. copy from global tensor into local ++ if (this->page2L_) { ++ AscendC::DataCopy(hiddenDimTensor, this->pagedTokenGlobal_, this->hiddenDims_); ++ } else { ++ AscendC::DataCopy(hiddenDimTensor, this->lmcBufferGlobal_, this->hiddenDims_); ++ } ++ ++ // 3. enque vecin ++ pagedTokenQue_.EnQue(hiddenDimTensor); ++ // 4. 
deque vecin, possible to reuse due to QueBind ++ hiddenDimTensor = pagedTokenQue_.DeQue(); ++ ++ // 5. datacopy into GM ++ if (this->page2L_) { ++ AscendC::DataCopy(this->lmcBufferGlobal_, hiddenDimTensor, this->hiddenDims_); ++ } else { ++ AscendC::DataCopy(this->pagedTokenGlobal_, hiddenDimTensor, this->hiddenDims_); ++ } ++ ++ // 6. free alloced Tensor ++ pagedTokenQue_.FreeTensor(hiddenDimTensor); ++ } ++ ++ __aicore__ inline void reset() { ++ this->invalid_ = false; ++ } ++ ++private: ++ AscendC::TPipe *pipe_; ++ // copying from NPUBuffer to the Chunked GM Tensor ++ AscendC::TQueBind pagedTokenQue_; ++ ++ // [layers * [kvs, numPages * pagedSize, heads*headsize]] ++ AscendC::GlobalTensor pagedTokenGlobal_; ++ // [kvs, layers, numTokensChunk, heads*headsize] ++ AscendC::GlobalTensor lmcBufferGlobal_; ++ int64_t numPages_; // num vllm npu blocks ++ int32_t pagedSize_; // per npu block tokens ++ int32_t numLayers_; // num layers ++ int64_t hiddenDims_; // heads * headsize ++ int64_t hiddenDimsAligned_; // heads *headsize aligned to datablock byte ++ int32_t numTokensChunk_; // num tokens in the cache tensor chunk ++ bool page2L_; // whether the direction of copy is from page to lmc ++ bool invalid_; // keep track of whether current iter is invalid ++}; ++ ++#define PAGED_KV_COPY_TYPE_DECLARE(TYPE) \ ++ extern "C" __global__ __aicore__ void paged_kv_copy_##TYPE( \ ++ __gm__ uint8_t* pagedKVCaches, __gm__ uint8_t* dstCacheTensor, __gm__ uint8_t* slotmappings, \ ++ const int64_t numPages, const int64_t hiddenDims, const int32_t pagedSize, \ ++ const int32_t kvs, const int32_t numLayers, const int32_t numTokensChunk, const int coreNum, \ ++ const bool page2L) \ ++ { \ ++ AscendC::TPipe pipe; \ ++ PagedKVCopy op{}; \ ++ op.init(pagedKVCaches, dstCacheTensor, slotmappings, numPages, hiddenDims, pagedSize, \ ++ numLayers, numTokensChunk, page2L, &pipe); \ ++ int64_t bIdx = AscendC::GetBlockIdx(); \ ++ for (int64_t i = bIdx; i < numTokensChunk; i+=coreNum) { \ ++ for (int32_t kvIdx = 0; kvIdx < kvs; kvIdx ++) { \ ++ for (int32_t layerIdx = 0; layerIdx < numLayers; layerIdx++) { \ ++ op.reset(); \ ++ op.updateMemOffset(pagedKVCaches, dstCacheTensor, slotmappings, i, kvIdx, layerIdx); \ ++ op.processFunc(); \ ++ } \ ++ } \ ++ } \ ++ } ++ ++// Declare support kernel entry ++PAGED_KV_COPY_TYPE_DECLARE(half); ++PAGED_KV_COPY_TYPE_DECLARE(bfloat16_t); ++PAGED_KV_COPY_TYPE_DECLARE(int8_t); ++ ++namespace lmc_ops { ++ ++#define PAGED_KV_COPY_KERNEL_CALL(TYPE) \ ++ paged_kv_copy_##TYPE<<>>(pagedKVCaches, dstCacheTensor, slotmappings, numPages, \ ++ hiddenDims, pagedSize, kvs, numLayers, numTokensChunk, \ ++ aivNum, page2L); ++ ++extern void paged_kv_copy_impl(vllm_ascend::AscendType type, uint32_t blockDim, void *stream, uint8_t *pagedKVCaches, uint8_t *dstCacheTensor, ++ uint8_t *slotmappings, const int64_t numPages, const int64_t hiddenDims, ++ const int32_t pagedSize, const int32_t kvs, const int32_t numLayers, const int32_t numTokensChunk, ++ const int aivNum, const bool page2L) ++{ ++ if (type == vllm_ascend::AscendType::FP16) { ++ PAGED_KV_COPY_KERNEL_CALL(half); ++ } else if (type == vllm_ascend::AscendType::BF16) { ++ PAGED_KV_COPY_KERNEL_CALL(bfloat16_t); ++ } else if (type == vllm_ascend::AscendType::INT8) { ++ PAGED_KV_COPY_KERNEL_CALL(int8_t); ++ } else { ++ return; ++ } ++} ++ ++} // namespace lmc_ops +diff --git a/csrc/ascend/kernels/paged_kv_tuple_copy.cpp b/csrc/ascend/kernels/paged_kv_tuple_copy.cpp +new file mode 100644 +index 0000000..5769c4d +--- /dev/null ++++ 
b/csrc/ascend/kernels/paged_kv_tuple_copy.cpp +@@ -0,0 +1,176 @@ ++/* NOTE: tested fp16, but not the rest yet. */ ++#include "kernel_operator.h" ++#include ++#include "types.h" ++#include "utils.h" ++ ++template class PagedKVTupleCopy { ++ using local_scalar_t = AscendC::LocalTensor; ++ ++public: ++ __aicore__ inline PagedKVTupleCopy() ++ { ++ } ++ ++ __aicore__ inline void init(GM_ADDR pagedKVCaches, GM_ADDR cacheTensor, GM_ADDR slotmappings, ++ const int64_t numPages, const int64_t hiddenDims, const int32_t pagedSize, ++ const int32_t numLayers, const int32_t numTokensChunk, const bool page2L, ++ AscendC::TPipe *pipe) ++ { ++ this->pipe_ = pipe; ++ this->numLayers_ = numLayers; ++ this->numPages_ = numPages; ++ this->hiddenDims_ = hiddenDims; ++ this->numTokensChunk_ = numTokensChunk; ++ this->pagedSize_ = pagedSize; ++ this->page2L_ = page2L; ++ this->invalid_ = false; ++ // TODO: Not sure whether we need double buffering since we are mainly copying data ++ // might only make sense if we have compute ++ this->pipe_->InitBuffer(pagedTokenQue_, 2, this->hiddenDims_*sizeof(scalar_t)); ++ } ++ ++ __aicore__ inline void updateMemOffset(__gm__ uint8_t *pagedKVCaches, __gm__ uint8_t* cacheTensor, ++ __gm__ uint8_t *slotmappings, const int tokenIdx, ++ const int kvIdx, const int layerIdx, const int kvs) ++ { ++ __gm__ int32_t *slotmappingPtr = reinterpret_cast<__gm__ int32_t*>(slotmappings); ++ int32_t slot = slotmappingPtr[tokenIdx]; ++ if (slot == -1) { ++ this->invalid_ = true; ++ return; ++ } ++ // its a pointer within the GM addr space, that point to another tuple of GM addr space ++ __gm__ uint8_t * __gm__ *pagedKVCachesPtr = reinterpret_cast<__gm__ uint8_t* __gm__ *>(pagedKVCaches); ++ ++ // getting the right ptr to the paged kvcache layer ++ // the pagedKVCachesPtr is a list of list of ptrs ++ int layerKVIdx = layerIdx * kvs + kvIdx; ++ ++ __gm__ uint8_t *pagedLayerKVCaches = pagedKVCachesPtr[layerKVIdx]; ++ ++ int64_t pagedIdx = slot / this->pagedSize_; ++ int64_t withinPagedIdx = slot % this->pagedSize_; ++ ++ int64_t pagedIdxOffset = pagedIdx * this->pagedSize_ * this->hiddenDims_ + ++ withinPagedIdx * this->hiddenDims_; ++ ++ int64_t dstTensorIdxOffset = kvIdx * this->numLayers_ * this->numTokensChunk_ * this->hiddenDims_ + ++ layerIdx * this->numTokensChunk_ * this->hiddenDims_ + ++ tokenIdx * this->hiddenDims_; ++ ++ this->pagedTokenGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(pagedLayerKVCaches) + pagedIdxOffset, ++ this->hiddenDims_); ++ this->lmcBufferGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(cacheTensor) + dstTensorIdxOffset, ++ this->hiddenDims_); ++ } ++ ++ __aicore__ inline void processFunc() { ++ if (this->invalid_) { ++ return; ++ } ++ // 1. Alloc Tensor for local page ++ local_scalar_t hiddenDimTensor = pagedTokenQue_.AllocTensor(); ++ ++ // 2. copy from global tensor into local ++ if (this->page2L_) { ++ // pagetoken -> local ++ AscendC::DataCopy(hiddenDimTensor, this->pagedTokenGlobal_, this->hiddenDims_); ++ } else { ++ // lmc -> local ++ AscendC::DataCopy(hiddenDimTensor, this->lmcBufferGlobal_, this->hiddenDims_); ++ } ++ ++ // 3. enque vecin ++ pagedTokenQue_.EnQue(hiddenDimTensor); ++ // 4. deque vecin, possible to reuse due to QueBind ++ hiddenDimTensor = pagedTokenQue_.DeQue(); ++ ++ // 5. 
datacopy into GM ++ if (this->page2L_) { ++ // local -> lmc ++ AscendC::DataCopy(this->lmcBufferGlobal_, hiddenDimTensor, this->hiddenDims_); ++ } else { ++ // local -> pagetoken ++ AscendC::DataCopy(this->pagedTokenGlobal_, hiddenDimTensor, this->hiddenDims_); ++ } ++ ++ // 6. free alloced Tensor ++ pagedTokenQue_.FreeTensor(hiddenDimTensor); ++ } ++ ++ __aicore__ inline void reset() { ++ this->invalid_ = false; ++ } ++ ++private: ++ AscendC::TPipe *pipe_; ++ // copying from NPUBuffer to the Chunked GM Tensor ++ AscendC::TQueBind pagedTokenQue_; ++ ++ // [layers * [kvs, numPages * pagedSize, heads*headsize]] ++ AscendC::GlobalTensor pagedTokenGlobal_; ++ // [kvs, layers, numTokensChunk, heads*headsize] ++ AscendC::GlobalTensor lmcBufferGlobal_; ++ int64_t numPages_; // num vllm npu blocks ++ int32_t pagedSize_; // per npu block tokens ++ int32_t numLayers_; // num layers ++ int64_t hiddenDims_; // heads * headsize ++ int64_t hiddenDimsAligned_; // heads *headsize aligned to datablock byte ++ int32_t numTokensChunk_; // num tokens in the cache tensor chunk ++ bool page2L_; // whether the direction of copy is from page tensor to lmc ++ bool invalid_; // keep track of whether current iter is invalid ++}; ++ ++#define PAGED_KV_TUPLE_COPY_TYPE_DECLARE(TYPE) \ ++ extern "C" __global__ __aicore__ void paged_kv_tuple_copy_##TYPE( \ ++ __gm__ uint8_t* pagedKVCaches, __gm__ uint8_t* dstCacheTensor, __gm__ uint8_t* slotmappings, \ ++ const int64_t numPages, const int64_t hiddenDims, const int32_t pagedSize, \ ++ const int32_t kvs, const int32_t numLayers, const int32_t numTokensChunk, const int coreNum, \ ++ const bool page2L) \ ++ { \ ++ AscendC::TPipe pipe; \ ++ PagedKVTupleCopy op{}; \ ++ op.init(pagedKVCaches, dstCacheTensor, slotmappings, numPages, hiddenDims, pagedSize, \ ++ numLayers, numTokensChunk, page2L, &pipe); \ ++ int64_t bIdx = AscendC::GetBlockIdx(); \ ++ for (int64_t i = bIdx; i < numTokensChunk; i+=coreNum) { \ ++ for (int32_t kvIdx = 0; kvIdx < kvs; kvIdx ++) { \ ++ for (int32_t layerIdx = 0; layerIdx < numLayers; layerIdx++) { \ ++ op.reset(); \ ++ op.updateMemOffset(pagedKVCaches, dstCacheTensor, slotmappings, i, kvIdx, layerIdx, kvs); \ ++ op.processFunc(); \ ++ } \ ++ } \ ++ } \ ++ } ++ ++// Declare support kernel entry ++PAGED_KV_TUPLE_COPY_TYPE_DECLARE(half); ++PAGED_KV_TUPLE_COPY_TYPE_DECLARE(bfloat16_t); ++PAGED_KV_TUPLE_COPY_TYPE_DECLARE(int8_t); ++ ++namespace lmc_ops { ++ ++#define PAGED_KV_TUPLE_COPY_KERNEL_CALL(TYPE) \ ++ paged_kv_tuple_copy_##TYPE<<>>(pagedKVCaches, dstCacheTensor, slotmappings, numPages, \ ++ hiddenDims, pagedSize, kvs, numLayers, numTokensChunk, \ ++ aivNum, page2L); ++ ++extern void paged_kv_tuple_copy_impl(vllm_ascend::AscendType type, uint32_t blockDim, void *stream, uint8_t *pagedKVCaches, uint8_t *dstCacheTensor, ++ uint8_t *slotmappings, const int64_t numPages, const int64_t hiddenDims, ++ const int32_t pagedSize, const int32_t kvs, const int32_t numLayers, const int32_t numTokensChunk, ++ const int aivNum, const bool page2L) ++{ ++ if (type == vllm_ascend::AscendType::FP16) { ++ PAGED_KV_TUPLE_COPY_KERNEL_CALL(half); ++ } else if (type == vllm_ascend::AscendType::BF16) { ++ PAGED_KV_TUPLE_COPY_KERNEL_CALL(bfloat16_t); ++ } else if (type == vllm_ascend::AscendType::INT8) { ++ PAGED_KV_TUPLE_COPY_KERNEL_CALL(int8_t); ++ } else { ++ return; ++ } ++} ++ ++} // namespace lmc_ops +diff --git a/csrc/ascend/kernels/types.h b/csrc/ascend/kernels/types.h +new file mode 100644 +index 0000000..d272781 +--- /dev/null ++++ b/csrc/ascend/kernels/types.h 
+@@ -0,0 +1,26 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++namespace vllm_ascend { ++enum struct AscendType { ++ FP16 = 0, ++ BF16 = 1, ++ FP32 = 2, ++ INT8 = 3, ++}; ++} +\ No newline at end of file +diff --git a/csrc/ascend/kernels/utils.h b/csrc/ascend/kernels/utils.h +new file mode 100644 +index 0000000..a6dd3f3 +--- /dev/null ++++ b/csrc/ascend/kernels/utils.h +@@ -0,0 +1,38 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++#include "kernel_type.h" ++namespace vllm_ascend { ++ ++template struct AccType; ++ ++template <> struct AccType { ++ using type = float; ++}; ++ ++template <> struct AccType { ++ using type = half; ++}; ++ ++template <> struct AccType { ++ using type = float; ++}; ++ ++template <> struct AccType { ++ using type = int; ++}; ++}// namespace vllm_ascend +\ No newline at end of file +diff --git a/csrc/ascend/np_tensor.h b/csrc/ascend/np_tensor.h +new file mode 100644 +index 0000000..41c23f7 +--- /dev/null ++++ b/csrc/ascend/np_tensor.h +@@ -0,0 +1,219 @@ ++#pragma once ++#include ++#include ++#include ++#include "pinned_mem.h" ++#include "ms_extension.h" ++#include "utils.h" ++#include "aclnn/opdev/platform.h" ++#include "tiling/platform/platform_ascendc.h" ++#include "tiling/tiling_api.h" ++#include "ops.h" ++ ++using BaseTensor = mindspore::tensor::BaseTensor; ++using BaseTensorPtr = mindspore::tensor::BaseTensorPtr; ++using PyBoostUtils = mindspore::kernel::pyboost::PyBoostUtils; ++ ++namespace py = pybind11; ++ ++struct DtypeCaster { ++ BaseTensorPtr CheckAndCast(const BaseTensorPtr &t, const std::string &name = "") { ++ mindspore::Int64ImmPtr dst_type = std::make_shared(mindspore::TypeId::kNumberTypeInt32); ++ if (t->data_type() != mindspore::TypeId::kNumberTypeInt32) { ++ if (!name.empty()) { ++ tensor_map_[name] = t; ++ } ++ return mindspore::kernel::pyboost::cast(t, dst_type); ++ } ++ return t; ++ } ++ BaseTensorPtr RecoveryTensorDtype(const BaseTensorPtr &t, const std::string &name) { ++ auto iter = tensor_map_.find(name); ++ if (iter == tensor_map_.end()) { ++ return t; ++ } ++ auto ori_tensor = iter->second; ++ auto ori_dtype = std::make_shared(ori_tensor->data_type()); ++ auto ret = mindspore::kernel::pyboost::cast(t, ori_dtype); ++ ori_tensor->AssignValue(*ret); ++ return ori_tensor; ++ } ++ 
std::map tensor_map_; ++}; ++ ++namespace lmcache_ascend { ++ ++ uint32_t get_static_aiv_core_num() { ++ static uint32_t aiv_num = []() -> uint32_t { ++ ++ const char* socName = aclrtGetSocName(); ++ if (socName == nullptr) { ++ throw std::runtime_error("Failed to get SoC name from ACL runtime."); ++ } ++ ++ platform_ascendc::PlatformAscendC* ascendcPlatform = nullptr; ++ try { ++ ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socName); ++ } catch (const std::exception& e) { ++ std::cerr << "Internal Error getting PlatformAscendCManager instance: " << e.what() << std::endl; ++ throw; ++ } ++ ++ if (ascendcPlatform == nullptr) { ++ throw std::runtime_error("Failed to get PlatformAscendCManager instance."); ++ } ++ ++ uint32_t calculated_aiv_num = ascendcPlatform->GetCoreNumAiv(); ++ return calculated_aiv_num; ++ }(); ++ return aiv_num; ++ } ++ ++ uint8_t *GetMSDataPtr(const BaseTensorPtr &t) { ++ return static_cast(t->device_address()->GetMutablePtr()) + t->data().itemsize() * t->storage_offset(); ++ } ++ ++ py::array create_mapped_numpy_with_dtype(const std::vector& shape, ++ py::object dtype_obj) { ++ if (shape.empty()) { ++ throw std::runtime_error("Shape must not be empty"); ++ } ++ ++ // 2. Calculate total number of elements and required bytes ++ size_t num_elements = 1; ++ std::vector py_shape; // Use py::ssize_t for numpy shape ++ for (int64_t dim : shape) { ++ if (dim <= 0) { ++ throw std::runtime_error("Shape dimensions must be positive."); ++ } ++ num_elements *= dim; ++ py_shape.push_back(static_cast(dim)); ++ } ++ ++ py::dtype np_dtype = py::dtype::from_args(dtype_obj); ++ size_t item_size = np_dtype.itemsize(); ++ ++ size_t size_bytes = num_elements * item_size; ++ if (size_bytes == 0) { ++ throw std::runtime_error("Calculated memory size is zero. Check shape and dtype."); ++ } ++ ++ uintptr_t ptr = lmc::alloc_pinned_mem(size_bytes); ++ if (ptr == 0) { ++ throw std::runtime_error("Failed to allocate pinned memory."); ++ } ++ ++ // 4. Calculate strides for a row-major (C-style) array ++ std::vector strides; ++ strides.reserve(py_shape.size()); ++ py::ssize_t current_stride = item_size; ++ for (int i = py_shape.size() - 1; i >= 0; --i) { ++ strides.insert(strides.begin(), current_stride); // Insert at beginning to reverse order ++ current_stride *= py_shape[i]; ++ } ++ ++ // 5. Create a py::capsule that will call MMapManager::release_mmap when destroyed ++ py::capsule free_mmap_capsule( ++ reinterpret_cast(ptr), // Payload: the ID of the mmap'd region ++ [](void* _ptr) { ++ lmc::free_pinned_mem(reinterpret_cast(_ptr)); ++ } ++ ); ++ ++ // 6. 
Create the NumPy array, passing the capsule as the base ++ // Use the generic py::array constructor (not py::array_t) since dtype is dynamic ++ return py::array(np_dtype, py_shape, strides, reinterpret_cast(ptr), free_mmap_capsule); ++ } ++ ++ py::array create_mmapped_numpy(size_t buffer_size) { ++ if (buffer_size <= 0 ) { ++ throw std::runtime_error("Buffer size must be positive."); ++ } ++ ++ int64_t numel = static_cast(buffer_size); ++ ++ uintptr_t ptr = lmc::alloc_pinned_mem(buffer_size); ++ if (ptr == 0) { ++ throw std::runtime_error("Failed to allocate pinned memory."); ++ } ++ ++ py::capsule free_mmap_capsule( ++ reinterpret_cast(ptr), // Payload: the ID of the mmap'd region ++ [](void* _ptr) { ++ lmc::free_pinned_mem(reinterpret_cast(_ptr)); ++ } ++ ); ++ ++ return py::array(py::dtype::from_args(py::str("uint8")), {numel}, {1}, ++ reinterpret_cast(ptr), free_mmap_capsule); ++ } ++ ++ // NOTE: Does not currently support in graph execution ++ void paged_layers_kv_transfer_ms( ++ py::array& lmc_buffer, ++ BaseTensorPtr& paged_kv_ptrs, ++ BaseTensorPtr& slot_mappings, ++ const uintptr_t lmc_buffer_hostptr, ++ const int num_pages, ++ const int page_size, ++ const int kvs, ++ const bool page2L, ++ const bool is_reg_mem) { ++ auto stream_id = PyBoostUtils::cur_stream_id(); ++ auto device_context = mindspore::runtime::OpRunner::GetDeviceContext("Ascend"); ++ // reset ++ if (page2L) { ++ memset(static_cast(lmc_buffer.mutable_data()), 0, lmc_buffer.nbytes()); ++ } ++ ++ uintptr_t lmc_offset_dptr; ++ if (is_reg_mem) { ++ // 1. we need to get the dev ptr from the host ptr and the paged_kv_ptrs list. ++ uintptr_t lmc_base_dptr = lmc::get_device_ptr(lmc_buffer_hostptr); ++ // 2. calculate offset from the lmcbaseptr to current dataptr ++ uintptr_t current_ptr = reinterpret_cast(lmc_buffer.mutable_data()); ++ uintptr_t offset_ptr = current_ptr - lmc_buffer_hostptr; ++ if (offset_ptr < 0) { ++ throw std::runtime_error( ++ "Offset calculation failed."); ++ } ++ lmc_offset_dptr = lmc_base_dptr + offset_ptr; ++ } else { ++ // we just get the ptr from the lmc buf ++ lmc_offset_dptr = reinterpret_cast(lmc_buffer.mutable_data()); ++ } ++ ++ // only support int32 slotmapping for now. 
++ DtypeCaster caster; ++ slot_mappings = caster.CheckAndCast(slot_mappings, "slot_mapping"); ++ ++ ++ int num_tokens = slot_mappings->shape()[0]; ++ int num_layers = paged_kv_ptrs->shape()[0]; ++ int ndim = lmc_buffer.ndim(); ++ int hidden_dims = static_cast(lmc_buffer.shape(ndim - 1)); ++ auto ascend_type = vllm_ascend::get_dtype_from_np(lmc_buffer); ++ ++ uint32_t aivNum = get_static_aiv_core_num(); ++ ++ PyBoostUtils::PrepareOpInputs(device_context, stream_id, paged_kv_ptrs, slot_mappings); ++ PyBoostUtils::DispatchRun(std::make_shared([=]() { ++ PyBoostUtils::MallocOpInputs(device_context, paged_kv_ptrs, slot_mappings); ++ ++ uint8_t* paged_kv_dev_ptr = GetMSDataPtr(paged_kv_ptrs); ++ uint8_t* slot_mappings_dptr = GetMSDataPtr(slot_mappings); ++ auto acl_stream = device_context->device_res_manager_->GetStream(stream_id); ++ ++ mindspore::runtime::OpExecutor::DispatchLaunchTask([=]() { ++ lmc_ops::paged_kv_tuple_copy_impl(ascend_type, aivNum, acl_stream, paged_kv_dev_ptr, ++ reinterpret_cast(lmc_offset_dptr), slot_mappings_dptr, ++ static_cast(num_pages), static_cast(hidden_dims), ++ static_cast(page_size), static_cast(kvs), ++ static_cast(num_layers), static_cast(num_tokens), ++ static_cast(aivNum), page2L); ++ }); ++ })); ++ slot_mappings = caster.RecoveryTensorDtype(slot_mappings, "slot_mapping"); ++ } ++ ++} +diff --git a/csrc/ascend/ops.h b/csrc/ascend/ops.h +new file mode 100644 +index 0000000..e027107 +--- /dev/null ++++ b/csrc/ascend/ops.h +@@ -0,0 +1,33 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++#include ++#include ++#include "kernels/types.h" ++ ++namespace lmc_ops { ++ extern void paged_kv_copy_impl(vllm_ascend::AscendType type, uint32_t blockDim, void *stream, uint8_t *pagedKVCaches, uint8_t *dstCacheTensor, ++ uint8_t *slotmappings, const int64_t numPages, const int64_t hiddenDims, ++ const int32_t pagedSize, const int32_t kvs, const int32_t numLayers, const int32_t numTokensChunk, ++ const int aivNum, const bool page2L); ++ ++ extern void paged_kv_tuple_copy_impl(vllm_ascend::AscendType type, uint32_t blockDim, void *stream, uint8_t *pagedKVCaches, uint8_t *dstCacheTensor, ++ uint8_t *slotmappings, const int64_t numPages, const int64_t hiddenDims, ++ const int32_t pagedSize, const int32_t kvs, const int32_t numLayers, const int32_t numTokensChunk, ++ const int aivNum, const bool page2L); ++} +\ No newline at end of file +diff --git a/csrc/ascend/pinned_mem.cpp b/csrc/ascend/pinned_mem.cpp +new file mode 100644 +index 0000000..eba74b4 +--- /dev/null ++++ b/csrc/ascend/pinned_mem.cpp +@@ -0,0 +1,225 @@ ++#include "pinned_mem.h" ++#include ++#include ++// HACK: undefine the version from ascend_hal.h. 
++#ifdef PROF_ERROR ++ // You can add a pragma message to see this in your build log if you want: ++ // #pragma message("Undefining PROF_ERROR from ascend_hal.h before NPU headers") ++ #undef PROF_ERROR ++#endif ++ ++#include ++#include "acl/acl.h" ++#include "framework_hal.h" ++#include "dcmi_management.h" ++#include ++#include ++#include "threading.h" ++#include ++ ++namespace lmc { ++ ++ ++// A more robust check for your specific environment. ++// numa_available() sometimes does not work in openEuler? ++bool is_numa_system_present() { ++ struct stat st; ++ // Check if the directory that the kernel uses to expose NUMA nodes exists. ++ if (stat("/sys/devices/system/node", &st) != 0) { ++ return false; // The path doesn't exist. ++ } ++ // Check if it's actually a directory. ++ return S_ISDIR(st.st_mode); ++} ++ ++// Function to parse a CPU range string (e.g., "144-167") ++// and return the first CPU in the range. ++int parse_first_cpu(const std::string& cpu_str) { ++ try { ++ size_t dash_pos = cpu_str.find('-'); ++ if (dash_pos != std::string::npos) { ++ return std::stoi(cpu_str.substr(0, dash_pos)); ++ } ++ // If it's a single CPU, not a range ++ return std::stoi(cpu_str); ++ } catch (const std::invalid_argument& e) { ++ throw std::runtime_error("Invalid CPU string format. Could not parse number."); ++ } catch (const std::out_of_range& e) { ++ throw std::runtime_error("CPU number is out of range."); ++ } ++} ++ ++PinnedMemoryManager::PinnedMemoryManager() { ++}; ++ ++PinnedMemoryManager::~PinnedMemoryManager() { ++ this->freeAll(); ++}; ++ ++PinnedMemoryRecord PinnedMemoryManager::getRecord(uintptr_t hostptr) { ++ const std::shared_lock lock(this->mux); ++ if (this->allocatedMap.find(hostptr) == this->allocatedMap.end()) { ++ std::cerr << "Host ptr: "<< hostptr << " does not exists. This should not happened." 
<< std::endl; ++ return PinnedMemoryRecord{0, 0, 0, 0}; ++ } ++ return this->allocatedMap.at(hostptr); ++} ++ ++bool PinnedMemoryManager::innerFree(uintptr_t hostPtr, size_t bufferSize, int8_t device) { ++ // since this is a pinned and dev accessible mem ++ // we need to halhostunregister first ++ auto ret = halHostUnregisterEx(reinterpret_cast(hostPtr), ++ static_cast(device), HOST_MEM_MAP_DEV_PCIE_TH); ++ ++ if (ret != 0) { ++ std::cout << "Unable to hal host unregister: "<< ret << std::endl; ++ return false; ++ } ++ ++ auto mret = munmap(reinterpret_cast(hostPtr), bufferSize); ++ if (mret != 0) { ++ std::cout << "Unable to unmap memory: "<< ret << std::endl; ++ return false; ++ } ++ ++ return true; ++}; ++ ++bool PinnedMemoryManager::freePinned(uintptr_t hostPtr) { ++ const std::unique_lock lock(this->mux); ++ ++ // make sure this hostptr is in our allocated map ++ if (this->allocatedMap.find(hostPtr) == this->allocatedMap.end()) { ++ std::cerr << "HostPtr "<allocatedMap.at(hostPtr); ++ auto freed = this->innerFree(hostPtr, record.buffSize, record.device); ++ if (freed) { ++ this->allocatedMap.erase(hostPtr); ++ return true; ++ } ++ return false; ++}; ++ ++uintptr_t PinnedMemoryManager::allocPinned(size_t bufferSize) { ++ auto device = framework_hal::GetDeviceIdx(); ++ ++ int cpu = -1; ++ if (is_numa_system_present()) { ++ long numaErr; ++ int numa_node; ++ auto& dcmiManger = lmc::DCMIManager::GetInstance(); ++ auto cpuAffinityStr = dcmiManger.getCPUAffinityFromDeviceId(static_cast(device), 0); ++ cpu = lmc::parse_first_cpu(cpuAffinityStr); ++ } ++ ++ std::unique_ptr guard; ++ if (cpu >= 0) { ++ guard = std::make_unique(cpu); ++ } ++ ++ uintptr_t hostPtr; ++ int adviseErr; ++ ++ hostPtr = reinterpret_cast(mmap(nullptr, bufferSize, PROT_FLAGS, MAP_FLAGS, -1, 0)); ++ if ((void*) hostPtr == MAP_FAILED) { ++ throw std::runtime_error("Unable to alloc memory with mmap."); ++ } ++ ++ adviseErr = madvise(reinterpret_cast(hostPtr), bufferSize, MADV_HUGEPAGE); ++ ++ if (adviseErr != 0) { ++ // should be okay to continue ++ std::cerr << "Unable to get madvise with HugePages: "<< adviseErr << std::endl; ++ } ++ ++ // set to all zeros ++ memset(reinterpret_cast(hostPtr), 0, bufferSize); ++ ++ void* devPtr; ++ drvError_t drvRet; ++ drvRet = halHostRegister(reinterpret_cast(hostPtr), static_cast(bufferSize), ++ HOST_MEM_MAP_DEV_PCIE_TH, static_cast(device), (void**)&devPtr); ++ ++ if (drvRet != 0) { ++ throw std::runtime_error(std::string("Unable to register host memory with hal: ") + std::to_string(drvRet) + \ ++ " on device: " + std::to_string(device)); ++ } ++ ++ auto lockErr = mlock(reinterpret_cast(hostPtr), bufferSize); ++ if (lockErr == -1) { ++ std::cerr << "Unable to pin host memory with error code: "<< std::to_string(lockErr) << std::endl; ++ // this can happen in non-privileged mode or not enough rlimit, ++ // let's not proceed since we wanted to guarantee pinned ++ // because we already alloced, let's free ++ this->innerFree(hostPtr, bufferSize, static_cast(device)); ++ return 0; ++ } ++ ++ { ++ const std::unique_lock lock(this->mux); ++ this->allocatedMap.emplace(hostPtr, PinnedMemoryRecord{hostPtr, reinterpret_cast(devPtr), bufferSize, static_cast(device)}); ++ } ++ return hostPtr; ++}; ++ ++void PinnedMemoryManager::freeAll() { ++ const std::shared_lock lock(this->mux); ++ if (!this->allocatedMap.empty()) { ++ std::cerr << "PinnedMemoryManager::freeAll() called. " ++ << this->allocatedMap.size() ++ << " block(s) were still allocated. 
This might indicate " ++ << "that not all PyTorch tensor deleters were invoked." << std::endl; ++ ++ // Iterate carefully as erasing modifies the map ++ // One way is to collect keys then iterate, or use C++17 map::extract ++ std::vector keys_to_free; ++ for (const auto& pair : this->allocatedMap) { ++ keys_to_free.push_back(pair.first); ++ } ++ ++ for (uintptr_t hostPtr : keys_to_free) { ++ // No need to check if it exists here, as we are iterating over existing keys ++ auto record = this->allocatedMap.at(hostPtr); // Or find again, though at should be safe ++ std::cerr << "PinnedMemoryManager::freeAll() freeing hostPtr: " << hostPtr ++ << " with size: " << record.buffSize << std::endl; ++ this->innerFree(hostPtr, record.buffSize, record.device); ++ // The map entry will be removed below or after the loop ++ } ++ this->allocatedMap.clear(); // Clear the map after freeing all elements ++ } ++}; ++ ++uintptr_t get_device_ptr(uintptr_t hostptr) { ++ auto& pmm = lmc::PinnedMemoryManager::GetInstance(); ++ auto record = pmm.getRecord(hostptr); ++ if (record.devptr == 0) { ++ throw std::runtime_error("Unable to find host ptr: " + std::to_string(hostptr)); ++ } ++ return record.devptr; ++} ++ ++uintptr_t alloc_pinned_mem(size_t bufferSize) { ++ auto& pmm = lmc::PinnedMemoryManager::GetInstance(); ++ return pmm.allocPinned(bufferSize); ++} ++ ++ ++bool free_pinned_mem(uintptr_t hostptr) { ++ auto& pmm = lmc::PinnedMemoryManager::GetInstance(); ++ return pmm.freePinned(hostptr); ++} ++ ++void free_all() { ++ auto& pmm = lmc::PinnedMemoryManager::GetInstance(); ++ pmm.freeAll(); ++} ++ ++void pinned_memory_deleter(void* ptr) { ++ if(ptr) { ++ free_pinned_mem(reinterpret_cast(ptr)); ++ } ++} ++} // namespace lmc +\ No newline at end of file +diff --git a/csrc/ascend/pinned_mem.h b/csrc/ascend/pinned_mem.h +new file mode 100644 +index 0000000..66ecf69 +--- /dev/null ++++ b/csrc/ascend/pinned_mem.h +@@ -0,0 +1,69 @@ ++#pragma once ++#include ++#include ++#include ++#include ++#include "driver/ascend_hal_define.h" ++#include "driver/ascend_hal.h" ++#include "sys/mman.h" ++#include ++#include ++#include ++#include ++#include ++#include "acl/acl.h" ++ ++/* ++When aclrtHostRegister is supported, ++we could migrate the sharedmemory instantiation with the aclrtHostRegister call instead. 
++*/ ++namespace lmc { ++ ++constexpr int32_t PROT_FLAGS = static_cast(PROT_READ) | static_cast(PROT_WRITE); ++constexpr int32_t MAP_FLAGS = static_cast(MAP_PRIVATE) | static_cast(MAP_ANONYMOUS) | static_cast(MAP_POPULATE); ++ ++struct PinnedMemoryRecord { ++ uintptr_t ptr; ++ uintptr_t devptr; ++ size_t buffSize; ++ int8_t device; ++}; ++ ++/* We are not responsible for acl init and ctx initialization, ++ we assume the user responsible for ctx initialization ++ */ ++class PinnedMemoryManager { ++private: ++ PinnedMemoryManager(); ++ ++ // Delete copy constructor and assignment operator ++ PinnedMemoryManager(const PinnedMemoryManager&) = delete; ++ PinnedMemoryManager& operator=(const PinnedMemoryManager&) = delete; ++ PinnedMemoryManager(PinnedMemoryManager&&) = delete; ++ PinnedMemoryManager& operator=(PinnedMemoryManager&&) = delete; ++ ++ std::unordered_map allocatedMap; ++ mutable std::shared_mutex mux; ++ ++ bool innerFree(uintptr_t hostPtr, size_t bufferSize, int8_t device); ++ ++public: ++ static PinnedMemoryManager& GetInstance() ++ { ++ static PinnedMemoryManager instance; ++ return instance; ++ } ++ ~PinnedMemoryManager(); ++ ++ PinnedMemoryRecord getRecord(uintptr_t hostptr); ++ uintptr_t allocPinned(size_t bufferSize); ++ bool freePinned(uintptr_t hostPtr); ++ void freeAll(); ++}; ++ ++uintptr_t get_device_ptr(uintptr_t hostptr); ++uintptr_t alloc_pinned_mem(size_t bufferSize); ++bool free_pinned_mem(uintptr_t hostptr); ++void free_all(); ++void pinned_memory_deleter(void* ptr); ++} +diff --git a/csrc/ascend/pybind.cpp b/csrc/ascend/pybind.cpp +new file mode 100644 +index 0000000..1538e11 +--- /dev/null ++++ b/csrc/ascend/pybind.cpp +@@ -0,0 +1,38 @@ ++#include ++#include "pinned_mem.h" ++#ifdef USE_TORCH ++ #include "torch_tensor.h" ++#else ++ #include "np_tensor.h" ++#endif ++ ++ ++namespace py = pybind11; ++ ++PYBIND11_MODULE(lmcache_C, m) { ++ m.def("alloc_pinned_mem", &lmc::alloc_pinned_mem, py::arg("bufferSize")); ++ m.def("free_pinned_mem", &lmc::free_pinned_mem, py::arg("hostptr")); ++ #ifdef USE_TORCH ++ m.def("paged_layers_kv_transfer", &lmcache_ascend::paged_layers_kv_transfer_torch, ++ py::arg("lmc_buffer"), py::arg("paged_kv_ptrs"), py::arg("slot_mappings"), ++ py::arg("lmc_buffer_hostptr"), py::arg("num_pages"), py::arg("page_size"), ++ py::arg("kvs"), py::arg("page2L"), py::arg("is_reg_mem") = false); ++ m.def("create_pinned_tensor", &lmcache_ascend::create_pinned_torch_tensor, py::arg("buffer_size"), ++ "Create a tensor backed by custom pinned memory and accessible by the device with a deleter."); ++ m.def("create_pinned_tensor_with_infos", &lmcache_ascend::create_pinned_torch_tensor_with_infos, ++ py::arg("shape"), py::arg("dtype"), ++ "Create a tensor of shape and dtype, backed by custom pinned memory and accessible by the device with a deleter."); ++ #else ++ m.def("paged_layers_kv_transfer", &lmcache_ascend::paged_layers_kv_transfer_ms, ++ py::arg("lmc_buffer"), py::arg("paged_kv_ptrs"), py::arg("slot_mappings"), ++ py::arg("lmc_buffer_hostptr"), py::arg("num_pages"), py::arg("page_size"), ++ py::arg("kvs"), py::arg("page2L"), py::arg("is_reg_mem") = false); ++ m.def("create_pinned_tensor", &lmcache_ascend::create_mmapped_numpy, py::arg("buffer_size"), ++ "Create a numpy tensor backed by custom pinned memory and accessible by the device with a deleter."); ++ m.def("create_pinned_tensor_with_infos", &lmcache_ascend::create_mapped_numpy_with_dtype, ++ py::arg("shape"), py::arg("dtype"), ++ "Create a numpy tensor of shape and dtype, backed by custom pinned 
memory and accessible by the device with a deleter."); ++ #endif ++ m.def("free_all", &lmc::free_all); ++ m.def("get_device_ptr", &lmc::get_device_ptr, py::arg("ptr")); ++} +\ No newline at end of file +diff --git a/csrc/ascend/threading.h b/csrc/ascend/threading.h +new file mode 100644 +index 0000000..1919605 +--- /dev/null ++++ b/csrc/ascend/threading.h +@@ -0,0 +1,65 @@ ++#pragma once ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/** ++ * @brief An RAII class to temporarily set thread affinity and restore it on destruction. ++ * ++ * Usage: ++ * { ++ * ThreadAffinityGuard guard(8); // Binds thread to CPU 8 ++ * // ... do work on CPU 8 ... ++ * } // Original affinity is automatically restored here ++ */ ++class ThreadAffinityGuard { ++public: ++ /** ++ * @brief Saves the current thread affinity and sets a new affinity to a single CPU. ++ * @param cpu_id The CPU core to bind the current thread to. ++ */ ++ explicit ThreadAffinityGuard(int cpu_id) { ++ // 1. Get and save the original affinity mask. ++ if (pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &original_mask_) != 0) { ++ // If we can't get the original mask, we can't safely restore it. ++ throw std::runtime_error("Failed to get original thread affinity."); ++ } ++ ++ // 2. Create and set the new, temporary mask. ++ cpu_set_t new_mask; ++ CPU_ZERO(&new_mask); ++ CPU_SET(cpu_id, &new_mask); ++ ++ if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &new_mask) != 0) { ++ // We failed to set the new mask. For safety, restore the original immediately. ++ pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &original_mask_); // Best effort restore ++ throw std::runtime_error(std::string("Failed to set thread affinity to CPU ") + std::to_string(cpu_id)); ++ } ++ } ++ ++ /** ++ * @brief Destructor that automatically restores the original thread affinity. ++ */ ++ ~ThreadAffinityGuard() { ++ // 3. Restore the original affinity mask. ++ if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &original_mask_) != 0) { ++ // It's generally a bad idea to throw from a destructor. ++ // A warning is more appropriate here. ++ std::cerr << "Warning: Failed to restore original thread affinity." << std::endl; ++ } ++ } ++ ++ // Delete copy and move constructors/assignments to prevent misuse. 
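A hedged usage sketch for the bindings registered in the lmcache_C module above, assuming the built extension is importable as `from lmcache import lmcache_C` (the import used by the Python side of this patch); the buffer size and error handling are illustrative only.

from lmcache import lmcache_C

buffer_size = 1 << 20                            # 1 MiB of pinned host memory
host_ptr = lmcache_C.alloc_pinned_mem(buffer_size)
if host_ptr == 0:
    raise RuntimeError("pinned allocation failed")

dev_ptr = lmcache_C.get_device_ptr(host_ptr)     # device-visible alias of host_ptr
print(hex(host_ptr), hex(dev_ptr))

lmcache_C.free_pinned_mem(host_ptr)              # release one buffer ...
lmcache_C.free_all()                             # ... or everything at shutdown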
++ ThreadAffinityGuard(const ThreadAffinityGuard&) = delete; ++ ThreadAffinityGuard& operator=(const ThreadAffinityGuard&) = delete; ++ ThreadAffinityGuard(ThreadAffinityGuard&&) = delete; ++ ThreadAffinityGuard& operator=(ThreadAffinityGuard&&) = delete; ++ ++private: ++ cpu_set_t original_mask_; // Member variable to store the original mask ++}; +\ No newline at end of file +diff --git a/csrc/ascend/torch_tensor.h b/csrc/ascend/torch_tensor.h +new file mode 100644 +index 0000000..b737608 +--- /dev/null ++++ b/csrc/ascend/torch_tensor.h +@@ -0,0 +1,141 @@ ++#pragma once ++#include "pinned_mem.h" ++#ifdef PROF_ERROR ++#undef PROF_ERROR ++#endif ++#include ++#include ++#include ++#include ++#include "utils.h" ++#include ++#include ++#include ++#include "aclnn/opdev/platform.h" ++#include "tiling/platform/platform_ascendc.h" ++#include "tiling/tiling_api.h" ++#include "ops.h" ++ ++namespace lmcache_ascend { ++ torch::Tensor create_pinned_torch_tensor_with_infos(const std::vector& shape, ++ c10::ScalarType dtype) { ++ if (shape.empty()) { ++ throw std::runtime_error( ++ "Shape must not be empty."); ++ } ++ ++ for (int64_t dim: shape) { ++ if (dim <= 0) { ++ throw std::runtime_error( ++ "Dimensions must be greater than zero."); ++ } ++ } ++ ++ int64_t numel = std::accumulate(shape.begin(), shape.end(), 1LL, std::multiplies()); ++ ++ size_t element_size = c10::elementSize(dtype); ++ ++ if (element_size == 0 && numel > 0) { ++ throw std::runtime_error( ++ "Invalid dtype."); ++ } ++ ++ size_t bufferSize = element_size * static_cast(numel); ++ ++ uintptr_t ptr = lmc::alloc_pinned_mem(bufferSize); ++ ++ if (ptr == 0) { ++ throw std::runtime_error( ++ "Failed to allocate pinned memory for tensor."); ++ } ++ ++ torch::TensorOptions tensorOpsCpu = torch::TensorOptions() ++ .dtype(dtype) ++ .device(torch::kCPU) ++ .pinned_memory(true); ++ ++ return torch::from_blob(reinterpret_cast(ptr), shape, lmc::pinned_memory_deleter, tensorOpsCpu); ++ ++ } ++ ++ torch::Tensor create_pinned_torch_tensor(size_t buffer_size) { ++ torch::TensorOptions tensorOpsCpu = torch::TensorOptions() ++ .dtype(torch::kUInt8) ++ .device(torch::kCPU) ++ .pinned_memory(true); ++ if (buffer_size <= 0) { ++ throw std::runtime_error( ++ "Buffer size must be greater than zero."); ++ } ++ ++ // unlikely this would be greater than int64_t ++ int64_t numel = static_cast(buffer_size); ++ ++ uintptr_t ptr = lmc::alloc_pinned_mem(buffer_size); ++ if (ptr == 0) { ++ throw std::runtime_error( ++ "Failed to allocate pinned memory for tensor."); ++ } ++ ++ std::vector dims = {numel}; ++ ++ return torch::from_blob(reinterpret_cast(ptr), dims, lmc::pinned_memory_deleter, tensorOpsCpu); ++ } ++ ++ ++ void paged_layers_kv_transfer_torch( ++ torch::Tensor& lmc_buffer, ++ const torch::Tensor& paged_kv_ptrs, ++ const torch::Tensor& slot_mappings, ++ const uintptr_t lmc_buffer_hostptr, ++ const int num_pages, ++ const int page_size, ++ const int kvs, ++ const bool page2L, ++ const bool is_reg_mem) { ++ // reset ++ if (page2L) { ++ memset(lmc_buffer.data_ptr(), 0, lmc_buffer.nbytes()); ++ } ++ uintptr_t lmc_offset_dptr; ++ if (is_reg_mem) { ++ // 1. we need to get the dev ptr from the host ptr and the paged_kv_ptrs list. ++ uintptr_t lmc_base_dptr = lmc::get_device_ptr(lmc_buffer_hostptr); ++ // 2. 
calculate offset from the lmcbaseptr to current dataptr ++ uintptr_t current_ptr = reinterpret_cast(lmc_buffer.data_ptr()); ++ uintptr_t offset_ptr = current_ptr - lmc_buffer_hostptr; ++ if (offset_ptr < 0) { ++ throw std::runtime_error( ++ "Offset calculation failed."); ++ } ++ lmc_offset_dptr = lmc_base_dptr + offset_ptr; ++ } else { ++ // we just get the ptr from the lmc buf ++ lmc_offset_dptr = reinterpret_cast(lmc_buffer.data_ptr()); ++ } ++ ++ ++ // 3. get the paged_kv_ptrs ++ uintptr_t paged_kv_dptr = reinterpret_cast(paged_kv_ptrs.data_ptr()); ++ ++ // 4. we make sure slot mappings are on the device ++ int num_tokens = slot_mappings.size(0); ++ int num_layers = paged_kv_ptrs.size(0); ++ int hidden_dims = lmc_buffer.size(-1); ++ ++ aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); ++ at::ScalarType scalar_type = lmc_buffer.scalar_type(); ++ ++ auto dtype_num = vllm_ascend::get_dtype_from_torch(scalar_type); ++ const char* socName = aclrtGetSocName(); ++ auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socName); ++ uint32_t aivNum = ascendcPlatform->GetCoreNumAiv(); ++ void* slot_mappings_dptr = slot_mappings.data_ptr(); ++ ++ lmc_ops::paged_kv_copy_impl(dtype_num, aivNum, stream, reinterpret_cast(paged_kv_dptr), ++ reinterpret_cast(lmc_offset_dptr), reinterpret_cast(slot_mappings_dptr), ++ static_cast(num_pages), static_cast(hidden_dims), static_cast(page_size), ++ static_cast(kvs), static_cast(num_layers), static_cast(num_tokens), ++ static_cast(aivNum), page2L); ++ } ++} +\ No newline at end of file +diff --git a/csrc/ascend/utils.h b/csrc/ascend/utils.h +new file mode 100644 +index 0000000..3728d27 +--- /dev/null ++++ b/csrc/ascend/utils.h +@@ -0,0 +1,48 @@ ++#pragma once ++ ++#include "kernels/types.h" ++#include ++namespace vllm_ascend { ++ ++#ifdef USE_TORCH ++ #include ++ AscendType get_dtype_from_torch(at::ScalarType scalarType) ++ { ++ if (scalarType == at::ScalarType::Float) { ++ return AscendType::FP32; ++ } else if (scalarType == at::ScalarType::BFloat16) { ++ return AscendType::BF16; ++ } else { ++ return AscendType::FP16; ++ } ++ } ++#else ++ #include ++ #include ++ #include ++ ++ ++ AscendType get_dtype_from_np(const py::array& arr) { ++ py::object array_dtype_obj = arr.dtype(); ++ std::string array_dtype_repr = py::repr(array_dtype_obj).cast(); ++ ++ if (array_dtype_repr.find("bfloat16") != std::string::npos) { ++ // HACK: Mindspore np weirdness ++ return AscendType::BF16; ++ } ++ ++ // Fallback to format string for other common dtypes ++ std::string format_str = arr.request().format; ++ if (format_str == "f" || format_str == "f4") { // float32 ++ return AscendType::FP32; ++ } else if (format_str == "h" || format_str == "f2") { // float16 ++ return AscendType::FP16; ++ } else if (format_str == "b" || format_str == "i1") { // <--- ADD THIS for signed 8-bit integer ++ return AscendType::INT8; ++ } ++ ++ throw std::runtime_error( ++ "Unsupported numpy dtype: " + format_str + ". 
Only float32, float16, and int8 are supported."); ++ } ++#endif ++} // namespace vllm_ascend + +diff --git a/lmcache/integration/vllm/vllm_v1_adapter.py b/lmcache/integration/vllm/vllm_v1_adapter.py +index 167b316..a0196df 100644 +--- a/lmcache/integration/vllm/vllm_v1_adapter.py ++++ b/lmcache/integration/vllm/vllm_v1_adapter.py +@@ -30,6 +30,7 @@ from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder + import torch + import vllm.envs as envs + import zmq ++import numpy as np + + # First Party + from lmcache.integration.vllm.utils import ( +@@ -85,10 +86,13 @@ class LMCacheLookupClient: + ) + + def lookup(self, token_ids: torch.Tensor) -> int: +- request = self.encoder.encode(token_ids) ++ # NOTE: the default encoder and decoder does not decode the mindspore tensor properly ++ # using numpy seems to work ++ request = self.encoder.encode(token_ids.asnumpy()) + self.socket.send_multipart(request, copy=False) + resp = self.socket.recv() + result = int.from_bytes(resp, "big") ++ logger.debug(f"Client Received Lookup Result: {result}") + return result + + def close(self): +@@ -103,7 +107,8 @@ class LMCacheLookupServer: + is_tp: bool, + vllm_config: "VllmConfig", + ): +- self.decoder = MsgpackDecoder(torch.Tensor) ++ # NOTE: Instead of mindspore tensor, let's do ndarray ++ self.decoder = MsgpackDecoder(np.ndarray) + self.ctx = zmq.Context() # type: ignore[attr-defined] + socket_path = get_zmq_rpc_path_lmcache(role, is_tp, vllm_config) + self.socket = make_zmq_socket( +@@ -118,15 +123,17 @@ class LMCacheLookupServer: + + def process_request(): + while self.running: +- # try: ++ try: + # request = self.socket.recv() +- frames = self.socket.recv_multipart(copy=False) +- token_ids = self.decoder.decode(frames) +- result = self.lmcache_engine.lookup(token_ids, pin=True) +- response = result.to_bytes(4, "big") +- self.socket.send(response) +- # except Exception as e: +- # logger.error("Error in LMCache lookup server: %s", e) ++ frames = self.socket.recv_multipart(copy=False) ++ token_ids = self.decoder.decode(frames) ++ result = self.lmcache_engine.lookup(token_ids, pin=True) ++ response = result.to_bytes(4, "big") ++ logger.debug(f"Lookup Result: {result}") ++ self.socket.send(response) ++ except Exception as e: ++ logger.error("Error in LMCache lookup server: %s", e) ++ raise e + # break + # continue + +@@ -507,12 +514,12 @@ class LMCacheConnectorV1Impl: + if request.load_spec is None: + continue + +- tokens = request.token_ids ++ tokens = request.token_ids.asnumpy() + # TODO: have a pre-allocated buffer to hold the slot_mappings + slot_mapping = request.slot_mapping.cuda() + assert len(tokens) == len(slot_mapping) + +- token_mask = torch.ones_like(tokens, dtype=torch.bool) ++ token_mask = np.ones_like(tokens, dtype=np.bool_) + masked_token_count = ( + request.load_spec.vllm_cached_tokens + // self._lmcache_chunk_size +@@ -728,7 +735,10 @@ class LMCacheConnectorV1Impl: + + token_ids = request.token_ids + assert isinstance(token_ids, torch.Tensor) +- assert token_ids.is_cpu ++ ++ # NOTE: Mindspore TorchAdapter is_cpu is always false ++ # assert token_ids.is_cpu ++ token_ids_np = token_ids.asnumpy() + + slot_mapping = request.slot_mapping + assert isinstance(slot_mapping, torch.Tensor) +@@ -746,7 +756,7 @@ class LMCacheConnectorV1Impl: + # ) + skip_leading_tokens = save_spec.skip_leading_tokens + +- if skip_leading_tokens == len(token_ids): ++ if skip_leading_tokens == len(token_ids_np): + continue # skip this request + # Align to lmcache chunk size + skip_leading_tokens = ( +@@ -755,19 
+765,19 @@ class LMCacheConnectorV1Impl: + * self._lmcache_chunk_size + ) + +- store_mask = torch.ones_like(token_ids, dtype=torch.bool) ++ store_mask = np.ones_like(token_ids_np, dtype=np.bool_) + store_mask[:skip_leading_tokens] = False + + logger.info( + "Storing KV cache for %d out of %d tokens " + "(skip_leading_tokens=%d) for request %s", +- len(token_ids) - skip_leading_tokens, +- len(token_ids), ++ len(token_ids_np) - skip_leading_tokens, ++ len(token_ids_np), + skip_leading_tokens, + request.req_id, + ) + self.lmcache_engine.store( +- token_ids, ++ token_ids_np, + mask=store_mask, + kvcaches=kvcaches, + slot_mapping=slot_mapping, +diff --git a/lmcache/observability.py b/lmcache/observability.py +index 874d9aa..76ff9e2 100644 +--- a/lmcache/observability.py ++++ b/lmcache/observability.py +@@ -695,9 +695,11 @@ class LMCacheStatsLogger: + self.log_interval = log_interval + self.monitor = LMCStatsMonitor.GetOrCreate() + self.prometheus_logger = PrometheusLogger.GetOrCreate(metadata) +- self.is_running = True + ++ self.is_running = True ++ + self.thread = threading.Thread(target=self.log_worker, daemon=True) ++ + self.thread.start() + + def log_worker(self): +@@ -708,4 +710,5 @@ class LMCacheStatsLogger: + + def shutdown(self): + self.is_running = False +- self.thread.join() ++ if self.thread.is_alive(): ++ self.thread.join() +diff --git a/lmcache/storage_backend/serde/cachegen_decoder.py b/lmcache/storage_backend/serde/cachegen_decoder.py +index e109ae7..b128d51 100644 +--- a/lmcache/storage_backend/serde/cachegen_decoder.py ++++ b/lmcache/storage_backend/serde/cachegen_decoder.py +@@ -28,7 +28,7 @@ from lmcache.storage_backend.serde.cachegen_basics import ( + ) + from lmcache.storage_backend.serde.serde import Deserializer + from lmcache.utils import _lmcache_nvtx_annotate +-import lmcache.c_ops as lmc_ops ++# import lmcache.c_ops as lmc_ops + import lmcache.storage_backend.serde.cachegen_basics as CGBasics + + logger = init_logger(__name__) +@@ -76,6 +76,7 @@ def decode_chunk( + Write the decode output in target_buffer + Expected shape: [nlayers (kv in total), ntokens, nchannels] + """ ++ raise NotImplementedError() + bytes_tensor = data_chunk.bytestream + length_prefsum = ( + data_chunk.bytestream_lengths.flatten() +diff --git a/lmcache/storage_backend/serde/cachegen_encoder.py b/lmcache/storage_backend/serde/cachegen_encoder.py +index da5fa90..3745410 100644 +--- a/lmcache/storage_backend/serde/cachegen_encoder.py ++++ b/lmcache/storage_backend/serde/cachegen_encoder.py +@@ -28,7 +28,7 @@ from lmcache.storage_backend.serde.cachegen_basics import ( + ) + from lmcache.storage_backend.serde.serde import Serializer + from lmcache.utils import _lmcache_nvtx_annotate +-import lmcache.c_ops as lmc_ops ++# import lmcache.c_ops as lmc_ops + import lmcache.storage_backend.serde.cachegen_basics as CGBasics + + logger = init_logger(__name__) +@@ -273,6 +273,7 @@ def encode_ntokens( + + :return byte_tensor: the byte tensor + """ ++ raise NotImplementedError() + lmc_ops.encode_fast_new( + cdf_int, + encode_input, +diff --git a/lmcache/utils.py b/lmcache/utils.py +index dfdb6e4..e8412e5 100644 +--- a/lmcache/utils.py ++++ b/lmcache/utils.py +@@ -17,12 +17,13 @@ from __future__ import annotations + + # Standard + from dataclasses import dataclass +-from typing import TYPE_CHECKING, List, Optional, Tuple ++from typing import TYPE_CHECKING, List, Optional, Tuple, Dict, Any + import hashlib + import threading ++import json + + # Third Party +-from nvtx import annotate # type: ignore ++# from nvtx 
import annotate # type: ignore + import torch + + if TYPE_CHECKING: +@@ -31,7 +32,16 @@ if TYPE_CHECKING: + + # Type definition + KVCache = Tuple[Tuple[torch.Tensor, torch.Tensor], ...] ++from lmcache.logging import init_logger + ++logger = init_logger(__name__) ++ ++HAS_MS_TYPE = False ++try: ++ from mindspore.common import np_dtype ++ HAS_MS_TYPE = True ++except ImportError as ie: ++ pass + + @dataclass + class DiskCacheMetadata: +@@ -54,20 +64,100 @@ class DiskCacheMetadata: + def is_pinned(self) -> bool: + return self.is_pin + ++ def to_dict(self) -> Dict[str, Any]: ++ """ ++ Converts the metadata to a JSON-serializable dictionary. ++ """ ++ global TORCH_DTYPE_TO_STR_DTYPE ++ return { ++ "path": self.path, ++ "size": self.size, ++ # Convert torch.Size to a list, which is JSON-safe ++ "shape": list(self.shape) if self.shape is not None else None, ++ # Convert torch.dtype to a string, which is JSON-safe ++ "dtype": TORCH_DTYPE_TO_STR_DTYPE[self.dtype] if self.dtype is not None else None, ++ } ++ ++ @classmethod ++ def from_dict(cls, data_dict: Dict[str, Any]) -> 'DiskCacheMetadata': ++ """ ++ Creates a DiskCacheMetadata instance from a dictionary. ++ """ ++ path = data_dict.get("path") ++ size = data_dict.get("size") ++ global TORCH_STR_TO_DTYPE ++ # Convert shape from a list back to torch.Size ++ shape_list = data_dict.get("shape") ++ shape = torch.Size(shape_list) if shape_list is not None else None ++ ++ # Convert dtype from a string back to torch.dtype using the safe map ++ dtype_str = data_dict.get("dtype") ++ dtype = TORCH_STR_TO_DTYPE[dtype_str] if dtype_str is not None else None ++ ++ return cls(path=path, size=size, shape=shape, dtype=dtype) + + TORCH_DTYPE_TO_STR_DTYPE = { + torch.half: "half", + torch.float16: "half", + torch.bfloat16: "bfloat16", + torch.float: "float", +- torch.float32: "float", +- torch.float64: "double", ++ torch.float32: "float32", ++ torch.float64: "float64", + torch.double: "double", + torch.uint8: "fp8", +- torch.float8_e4m3fn: "fp8_e4m3", +- torch.float8_e5m2: "fp8_e5m2", + } + ++TORCH_STR_TO_DTYPE = { ++ "half": torch.float16, ++ "bfloat16": torch.bfloat16, ++ "float": torch.float, ++ "float32": torch.float32, ++ "float64": torch.float64, ++ "double": torch.double, ++ "fp8": torch.uint8, ++} ++ ++def update_dtypes(): ++ global TORCH_DTYPE_TO_STR_DTYPE ++ global HAS_MS_TYPE ++ try: ++ TORCH_DTYPE_TO_STR_DTYPE.update({torch.float8_e4m3fn: "fp8_e4m3"}) ++ TORCH_DTYPE_TO_STR_DTYPE.update({torch.float8_e5m2: "float8_e5m2"}) ++ TORCH_STR_TO_DTYPE.update({"fp8_e4m3": torch.float8_e4m3fn}) ++ TORCH_STR_TO_DTYPE.update({"float8_e5m2": torch.float8_e5m2}) ++ except AttributeError as ae: ++ if not HAS_MS_TYPE: ++ logger.error("Unable to update dtype: ", ae) ++ raise ae ++ else: ++ logger.warn("not using dtypes: torch.float8_e4m3fn and torch.float8_e5m2") ++ pass ++ ++ if HAS_MS_TYPE: ++ try: ++ TORCH_DTYPE_TO_STR_DTYPE.update({np_dtype.bfloat16: "np_bfloat16"}) ++ TORCH_STR_TO_DTYPE.update({"np_bfloat16": np_dtype.bfloat16}) ++ except AttributeError as ae: ++ logger.error("Unable to update dtype: ", ae) ++ pass ++ ++update_dtypes() ++ ++class TorchEncoderWithDecoderHook(json.JSONEncoder): ++ def default(self, obj): ++ if obj is not None and isinstance(obj, torch.dtype): ++ return TORCH_DTYPE_TO_STR_DTYPE[obj] ++ return json.JSONEncoder.default(self, obj) ++ ++ @staticmethod ++ def decode_hook(dct): ++ new_dct = {} ++ for key, value in dct.items(): ++ if isinstance(value, str) and value in TORCH_STR_TO_DTYPE: ++ new_dct[key] = TORCH_STR_TO_DTYPE[value] ++ else: 
++ new_dct[key] = value ++ return new_dct + + @dataclass(order=True) + class CacheEngineKey: +@@ -89,8 +179,9 @@ class CacheEngineKey: + ) + + def to_string(self): ++ model_name = self.model_name.replace("-", "#") + return ( +- f"{self.fmt}@{self.model_name}@{self.world_size}" ++ f"{self.fmt}@{model_name}@{self.world_size}" + f"@{self.worker_id}@{self.chunk_hash}" + ) + +@@ -127,8 +218,9 @@ class CacheEngineKey: + parts = s.split("@") + if len(parts) != 5: + raise ValueError(f"Invalid key string: {s}") ++ model_name = parts[1].replace("#", "-") + return CacheEngineKey( +- parts[0], parts[1], int(parts[2]), int(parts[3]), parts[4] ++ parts[0], model_name, int(parts[2]), int(parts[3]), parts[4] + ) + + def to_dict(self): +@@ -219,14 +311,9 @@ def _get_color_for_nvtx(name): + idx = hash_value % len(_NVTX_COLORS) + return _NVTX_COLORS[idx] + +- + def _lmcache_nvtx_annotate(func, domain="lmcache"): + """Decorator for applying nvtx annotations to methods in lmcache.""" +- return annotate( +- message=func.__qualname__, +- color=_get_color_for_nvtx(func.__qualname__), +- domain=domain, +- )(func) ++ return (func) + + + ##### Observability Threading related ##### +diff --git a/lmcache/v1/_tensor.py b/lmcache/v1/_tensor.py +new file mode 100644 +index 0000000..a7a2525 +--- /dev/null ++++ b/lmcache/v1/_tensor.py +@@ -0,0 +1,85 @@ ++""" ++ This file contain methods that can be used to get information about a torch.Tensor or np.ndarray. ++""" ++import torch ++import numpy as np ++from typing import Union ++import ctypes ++ ++USE_MS = False ++ ++try: ++ import mindspore as ms ++ from mindspore.common import np_dtype ++ ++ USE_MS = True ++except: ++ pass ++ ++ ++MS_DTYPE_SIZE = {} ++ ++def get_dtype_compat(dtype: torch.dtype): ++ global USE_MS ++ if USE_MS and isinstance(dtype, ms.dtype.Type): ++ return ms.dtype_to_nptype(dtype) ++ return dtype ++ ++ ++def get_itemsize(dtype: torch.dtype): ++ """FIXME: is there a better way to do this ? 
:/""" ++ m = getattr(dtype, "itemsize", None) ++ global USE_MS ++ global MS_DTYPE_SIZE ++ if USE_MS: ++ if m is None: ++ # we are probably at mindspore ++ if dtype in MS_DTYPE_SIZE: ++ return MS_DTYPE_SIZE[dtype] ++ tmp = ms.Tensor([1.0], dtype=dtype) ++ MS_DTYPE_SIZE[dtype] = tmp.itemsize ++ m = MS_DTYPE_SIZE[dtype] ++ elif dtype == np_dtype.bfloat16: ++ # np does not have bfloat16 ++ return 2 ++ return m ++ ++ ++def get_data_ptr(tensor: Union[torch.Tensor, np.ndarray]): ++ """Get the data pointer of a torch.Tensor or np.ndarray.""" ++ if isinstance(tensor, torch.Tensor): ++ return tensor.data_ptr() ++ elif isinstance(tensor, np.ndarray): ++ return tensor.ctypes.data_as(ctypes.c_void_p).value ++ else: ++ raise TypeError(f"Expected torch.Tensor or np.ndarray, got {type(tensor)}") ++ ++ ++def get_numel(tensor: Union[torch.Tensor, np.ndarray, torch.Size]): ++ """Get the number of elements in a torch.Tensor or np.ndarray.""" ++ if isinstance(tensor, torch.Tensor): ++ return tensor.numel() ++ elif isinstance(tensor, np.ndarray): ++ return tensor.size ++ elif isinstance(tensor, torch.Size): ++ return torch.numel(tensor) ++ else: ++ raise TypeError(f"Expected torch.Tensor or np.ndarray, got {type(tensor)}") ++ ++def get_element_size(tensor: Union[torch.Tensor, np.ndarray]): ++ """Get the size of each element in a torch.Tensor or np.ndarray.""" ++ if isinstance(tensor, torch.Tensor): ++ return tensor.elemenet_size() ++ elif isinstance(tensor, np.ndarray): ++ return tensor.itemsize ++ else: ++ raise TypeError(f"Expected torch.Tensor or np.ndarray, got {type(tensor)}") ++ ++def view_and_shape(tensor: Union[torch.Tensor, np.ndarray], dtype: torch.dtype, shape: torch.Size): ++ """Get the view and shape of a torch.Tensor or np.ndarray.""" ++ if isinstance(tensor, torch.Tensor): ++ return tensor.view(dtype).view(shape) ++ elif isinstance(tensor, np.ndarray): ++ return tensor.reshape(-1).view(dtype).reshape(shape) ++ else: ++ raise TypeError(f"Expected torch.Tensor or np.ndarray, got {type(tensor)}") +diff --git a/lmcache/v1/cache_engine.py b/lmcache/v1/cache_engine.py +index 349e6f6..93e3047 100644 +--- a/lmcache/v1/cache_engine.py ++++ b/lmcache/v1/cache_engine.py +@@ -25,7 +25,6 @@ import torch + from lmcache.config import LMCacheEngineMetadata + from lmcache.logging import init_logger + from lmcache.observability import LMCacheStatsLogger, LMCStatsMonitor +-from lmcache.usage_context import InitializeUsageContext + from lmcache.utils import CacheEngineKey, _lmcache_nvtx_annotate + from lmcache.v1.config import LMCacheEngineConfig + from lmcache.v1.distributed_server import ( +@@ -51,7 +50,8 @@ from lmcache.v1.token_database import ( + SegmentTokenDatabase, + TokenDatabase, + ) +- ++import time ++import numpy as np + logger = init_logger(__name__) + + +@@ -140,15 +140,14 @@ class LMCacheEngine: + else: + self.fmt = MemoryFormat.KV_T2D + +- InitializeUsageContext(config.to_original_config(), metadata) + self.stats_monitor = LMCStatsMonitor.GetOrCreate() + + @_lmcache_nvtx_annotate + @torch.inference_mode() + def store( + self, +- tokens: torch.Tensor, +- mask: Optional[torch.Tensor] = None, ++ tokens: np.ndarray, ++ mask: Optional[np.ndarray] = None, + **kwargs, + ) -> None: + """Store the tokens and mask into the cache engine. 
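A worked numpy example of the raw-buffer reinterpretation that view_and_shape() performs on its numpy branch: a flat uint8 backing buffer is viewed as the target dtype and reshaped, and writes go through to the backing store. Sizes and values are illustrative.

import numpy as np

raw = np.zeros(4 * 2 * 2, dtype=np.uint8)        # 16 bytes of backing store
typed = raw.reshape(-1).view(np.float16).reshape((4, 2))
assert typed.shape == (4, 2) and typed.dtype == np.float16
typed[0, 0] = 1.5                                # writes through to `raw`
assert raw[:2].any()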
+@@ -170,9 +169,9 @@ class LMCacheEngine: + """ + + if mask is not None: +- num_stored_tokens = torch.sum(mask).item() ++ num_stored_tokens = int(np.sum(mask)) + else: +- num_stored_tokens = len(tokens) ++ num_stored_tokens = tokens.size + monitor_req_id = self.stats_monitor.on_store_request(num_stored_tokens) + + starts = [] +@@ -343,10 +342,10 @@ class LMCacheEngine: + @torch.inference_mode() + def retrieve( + self, +- tokens: torch.Tensor, +- mask: Optional[torch.Tensor] = None, ++ tokens: np.ndarray, ++ mask: Optional[np.ndarray] = None, + **kwargs, +- ) -> torch.Tensor: ++ ) -> np.ndarray: + """Retrieve the KV caches from the cache engine. And put the retrieved + KV cache to the serving engine via the GPU connector. + +@@ -369,17 +368,18 @@ class LMCacheEngine: + multiple of the chunk size. + """ + if mask is not None: +- num_required_tokens = torch.sum(mask).item() ++ num_required_tokens = int(np.sum(mask)) + else: +- num_required_tokens = len(tokens) ++ num_required_tokens = tokens.size + monitor_req_id = self.stats_monitor.on_retrieve_request(num_required_tokens) + +- ret_mask = torch.zeros_like(tokens, dtype=torch.bool, device="cpu") ++ ret_mask = np.zeros(tokens.shape, dtype=np.bool_) ++ + for start, end, key in self.token_database.process_tokens(tokens, mask): + assert isinstance(key, CacheEngineKey) + + # Get the memory object from the storage backend +- memory_obj = self.storage_manager.get(key) ++ memory_obj = self.storage_manager.get(key, shape=self.metadata.kv_shape, dtype=self.metadata.kv_dtype) + + if memory_obj is None: + if self.enable_p2p: +@@ -397,7 +397,10 @@ class LMCacheEngine: + # cpu tensor for the sake of performance. + # For example, disk->gpu is faster than disk->cpu->gpu. + # RDMA is another example. ++ # t1 = time.perf_counter() + self.gpu_connector.to_gpu(memory_obj, start, end, **kwargs) ++ # t2 = time.perf_counter() ++ # print("To gpu: ", t2-t1) + memory_obj.ref_count_down() + + # NOTE (ApostaC): This is only for the current implementation: +@@ -408,7 +411,7 @@ class LMCacheEngine: + else: + self.storage_manager.batched_unpin([key]) + +- retrieved_tokens = torch.sum(ret_mask) ++ retrieved_tokens = np.sum(ret_mask) + self.stats_monitor.on_retrieve_finished(monitor_req_id, retrieved_tokens) + logger.debug( + f"Retrieved {retrieved_tokens} " +@@ -546,7 +549,7 @@ class LMCacheEngine: + @_lmcache_nvtx_annotate + def lookup( + self, +- tokens: Union[torch.Tensor, List[int]], ++ tokens: Union[torch.Tensor, List[int], np.ndarray], + search_range: Optional[List[str]] = None, + pin: bool = False, + ) -> int: +@@ -643,6 +646,7 @@ class LMCacheEngineBuilder: + def _Create_memory_allocator( + config: LMCacheEngineConfig, + metadata: LMCacheEngineMetadata, ++ framework: str = "torch", + ) -> MemoryAllocatorInterface: + if config.enable_nixl: + assert config.nixl_buffer_device is not None +@@ -653,7 +657,7 @@ class LMCacheEngineBuilder: + return CuFileMemoryAllocator(config.cufile_buffer_size * 1024**2) + + max_local_cpu_size = config.max_local_cpu_size +- return MixedMemoryAllocator(int(max_local_cpu_size * 1024**3)) ++ return MixedMemoryAllocator(int(max_local_cpu_size * 1024**3), framework) + + @staticmethod + def _Create_token_database( +@@ -671,6 +675,7 @@ class LMCacheEngineBuilder: + config: LMCacheEngineConfig, + metadata: LMCacheEngineMetadata, + gpu_connector: GPUConnectorInterface, ++ framework: str = "torch", + ) -> LMCacheEngine: + """ + Builds a new LMCacheEngine instance if it doesn't already exist for the +@@ -681,7 +686,7 @@ class LMCacheEngineBuilder: 
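A short numpy illustration of the mask handling these store()/retrieve() hunks switch to: int(np.sum(mask)) replaces torch.sum(mask).item(), and the retrieve-side mask starts all-False and is flipped as chunks are found. Token values are made up.

import numpy as np

tokens = np.arange(16, dtype=np.int64)           # stand-in token ids
mask = np.ones_like(tokens, dtype=np.bool_)
mask[:4] = False                                 # 4 leading tokens skipped
assert int(np.sum(mask)) == 12                   # tokens actually stored

ret_mask = np.zeros(tokens.shape, dtype=np.bool_)
ret_mask[:8] = True                              # pretend the first chunk hit
assert int(np.sum(ret_mask)) == 8                # tokens retrieved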
+ """ + logger.info(f"Creating LMCacheEngine instance {instance_id}") + if instance_id not in cls._instances: +- memory_allocator = cls._Create_memory_allocator(config, metadata) ++ memory_allocator = cls._Create_memory_allocator(config, metadata, framework) + token_database = cls._Create_token_database(config, metadata) + stat_logger = LMCacheStatsLogger(metadata, log_interval=10) + +diff --git a/lmcache/v1/config.py b/lmcache/v1/config.py +index d27696d..54378ff 100644 +--- a/lmcache/v1/config.py ++++ b/lmcache/v1/config.py +@@ -49,6 +49,7 @@ class LMCacheEngineConfig: + # value even if local_cpu is disabled + local_disk: Optional[str] + max_local_disk_size: float # in GB ++ use_fuse_driver: bool + + remote_url: Optional[str] + remote_serde: Optional[str] # Can be "naive" or "cachegen" +@@ -101,6 +102,10 @@ class LMCacheEngineConfig: + # The url of the actual remote lmcache instance for auditing + audit_actual_remote_url: Optional[str] = None + ++ # Path of falconfs ++ workspace: Optional[str] = None ++ config_path: Optional[str] = None ++ + # The path under the WekaFS mount that the cache will be stored + weka_path: Optional[str] = None + # (Optional) The path under the File-based backend cache will be stored +@@ -127,6 +132,7 @@ class LMCacheEngineConfig: + max_local_cpu_size: float = 5.0, + local_disk: Optional[str] = None, + max_local_disk_size: int = 0, ++ use_fuse_driver: bool = False, + remote_url: Optional[str] = "lm://localhost:65432", + remote_serde: Optional[str] = "naive", + use_layerwise: bool = False, +@@ -151,6 +157,8 @@ class LMCacheEngineConfig: + nixl_buffer_device: Optional[str] = None, + nixl_enable_gc: Optional[bool] = False, + audit_actual_remote_url: Optional[str] = None, ++ workspace: Optional[str] = None, ++ config_path: Optional[str] = None, + weka_path: Optional[str] = None, + gds_path: Optional[str] = None, + cufile_buffer_size: Optional[int] = None, +@@ -165,6 +173,7 @@ class LMCacheEngineConfig: + max_local_cpu_size, + local_disk, + max_local_disk_size, ++ use_fuse_driver, + remote_url, + remote_serde, + use_layerwise, +@@ -189,6 +198,8 @@ class LMCacheEngineConfig: + nixl_buffer_device, + nixl_enable_gc, + audit_actual_remote_url, ++ workspace, ++ config_path, + weka_path, + gds_path, + cufile_buffer_size, +@@ -229,6 +240,7 @@ class LMCacheEngineConfig: + local_disk = "/local/disk_test/local_disk/" + max_local_disk_size = 5 + remote_url = None ++ use_fuse_driver = False + elif backend == "local_cpu_disk": + local_cpu = True + max_local_cpu_size = 5 +@@ -262,6 +274,7 @@ class LMCacheEngineConfig: + max_local_cpu_size=max_local_cpu_size, + local_disk=local_disk, + max_local_disk_size=max_local_disk_size, ++ use_fuse_driver=False, + remote_url=remote_url, + remote_serde=remote_serde, + use_layerwise=use_layerwise, +@@ -295,6 +308,7 @@ class LMCacheEngineConfig: + + local_disk = config.get("local_disk", None) + max_local_disk_size = config.get("max_local_disk_size", 5) ++ use_fuse_driver = config.get("use_fuse_driver", False) + + remote_url = config.get("remote_url", None) + remote_serde = config.get("remote_serde", "naive") +@@ -352,6 +366,9 @@ class LMCacheEngineConfig: + + audit_actual_remote_url = config.get("audit_actual_remote_url", None) + ++ workspace = config.get("workspace", None) ++ config_path = config.get("config_path", None) ++ + weka_path = config.get("weka_path", None) + gds_path = config.get("gds_path", None) + cufile_buffer_size = config.get("cufile_buffer_size", None) +@@ -370,6 +387,13 @@ class LMCacheEngineConfig: + case _: + raise 
ValueError(f"Invalid remote storage url: {remote_url}") + ++ match workspace: ++ case None: ++ print("----------------------workspace is None----------------------") ++ case path if re.match(r"/(.*)/", ++ path): # local disk directory ++ print(f"----------------------workspace is {workspace}----------------------") ++ + return ( + LMCacheEngineConfig( + chunk_size, +@@ -377,6 +401,7 @@ class LMCacheEngineConfig: + max_local_cpu_size, + local_disk_path, + max_local_disk_size, ++ use_fuse_driver, + remote_url, + remote_serde, + use_layerwise, +@@ -401,6 +426,8 @@ class LMCacheEngineConfig: + nixl_buffer_device, + nixl_enable_gc, + audit_actual_remote_url, ++ workspace, ++ config_path, + weka_path, + gds_path, + cufile_buffer_size, +@@ -469,6 +496,8 @@ class LMCacheEngineConfig: + config.max_local_disk_size = to_float( + parse_env(get_env_name("max_local_disk_size"), config.max_local_disk_size) + ) ++ config.use_fuse_driver = parse_env(get_env_name("use_fuse_driver"), ++ config.use_fuse_driver) + config.remote_url = parse_env(get_env_name("remote_url"), config.remote_url) + config.remote_serde = parse_env( + get_env_name("remote_serde"), config.remote_serde +@@ -573,6 +602,14 @@ class LMCacheEngineConfig: + config.audit_actual_remote_url, + ) + ++ config.workspace = parse_env( ++ get_env_name("workspace"), ++ config.workspace) ++ ++ config.workspace = parse_env( ++ get_env_name("config_path"), ++ config.config_path) ++ + config.weka_path = parse_env( + get_env_name("weka_path"), + config.weka_path, +@@ -679,6 +716,8 @@ class LMCacheEngineConfig: + "extra_config": self.extra_config, + "save_unfull_chunk": self.save_unfull_chunk, + "blocking_timeout_secs": self.blocking_timeout_secs, ++ "workspace": self.workspace, ++ "config_path": self.config_path, + } + logger.info(f"LMCache Configuration: {config_dict}") + +diff --git a/lmcache/v1/envs.py b/lmcache/v1/envs.py +new file mode 100644 +index 0000000..ec5e788 +--- /dev/null ++++ b/lmcache/v1/envs.py +@@ -0,0 +1,6 @@ ++import os ++from functools import lru_cache ++ ++@lru_cache(maxsize=10) ++def is_disable_custom_ops(): ++ return bool(int(os.getenv("DISABLE_CUSTOM_OPS", "0"))) +\ No newline at end of file +diff --git a/lmcache/v1/gpu_connector.py b/lmcache/v1/gpu_connector.py +index 3998f33..ed8c519 100644 +--- a/lmcache/v1/gpu_connector.py ++++ b/lmcache/v1/gpu_connector.py +@@ -26,10 +26,15 @@ from lmcache.utils import _lmcache_nvtx_annotate + from lmcache.v1.compute.blend.utils import LMCBlenderBuilder + from lmcache.v1.memory_management import GPUMemoryAllocator # noqa: E501 + from lmcache.v1.memory_management import MemoryFormat, MemoryObj +-import lmcache.c_ops as lmc_ops ++# import lmcache.c_ops as lmc_ops + + logger = init_logger(__name__) + ++try: ++ from lmcache import lmcache_C ++ _USE_LMC_OPS = True ++except ImportError as ie: ++ logger.error("Not using lmcache_C ops") + + class GPUConnectorInterface(metaclass=abc.ABCMeta): + @abc.abstractmethod +@@ -321,13 +326,21 @@ class VLLMPagedMemGPUConnectorV2(GPUConnectorInterface): + self.kv_cache_pointers = torch.empty( + num_layers, dtype=torch.int64, device="cpu" + ) +- # Not sure we need a dict here. Maybe a single GPU connector always +- # works with a single device? 
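A small usage sketch for the DISABLE_CUSTOM_OPS switch added in lmcache/v1/envs.py: the variable must be set before the first call, because lru_cache memoizes the answer. The helper is reproduced here only so the snippet runs on its own.

import os
from functools import lru_cache

@lru_cache(maxsize=10)
def is_disable_custom_ops() -> bool:             # mirrors lmcache/v1/envs.py
    return bool(int(os.getenv("DISABLE_CUSTOM_OPS", "0")))

os.environ["DISABLE_CUSTOM_OPS"] = "1"           # must happen before first call
assert is_disable_custom_ops() is True           # cached from here on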
+- self.kv_cache_pointers_on_gpu: dict[int, torch.Tensor] = {} ++ ++ self.use_mla = "use_mla" in kwargs and kwargs["use_mla"] ++ kvsize = 1 if self.use_mla else 2 ++ ++ # NOTE: Mindspore has no index from the torch tensor API ++ # will use mindspore.hal in the future, ++ # so we only use 1 tensor and manage for its own device ++ self.kv_cache_pointers_on_gpu = torch.empty([num_layers, kvsize], ++ dtype=torch.int64, ++ device='Ascend', ++ pin_memory=True) ++ self._initialized_kv_ptrs = False + self.page_buffer_size = 0 + + self.gpu_buffer: Optional[torch.Tensor] = None +- self.use_mla = "use_mla" in kwargs and kwargs["use_mla"] + if use_gpu: + assert "chunk_size" in kwargs, ( + "chunk_size should be provided to create a GPU buffer." +@@ -342,25 +355,24 @@ class VLLMPagedMemGPUConnectorV2(GPUConnectorInterface): + ) + + def _initialize_pointers(self, kv_caches: List[torch.Tensor]) -> torch.Tensor: +- self.kv_cache_pointers.numpy()[:] = [t.data_ptr() for t in kv_caches] +- device = kv_caches[0].device +- assert device.type == "cuda", "The device should be CUDA." +- idx = device.index +- if idx not in self.kv_cache_pointers_on_gpu: +- self.kv_cache_pointers_on_gpu[idx] = torch.empty( +- self.num_layers, dtype=torch.int64, device=device +- ) +- self.kv_cache_pointers_on_gpu[idx].copy_(self.kv_cache_pointers) +- if self.use_mla: +- # kv_caches[0].shape: [num_pages, page_size, head_size] +- assert kv_caches[0].dim() == 3 +- self.page_buffer_size = kv_caches[0].shape[0] * kv_caches[0].shape[1] +- else: +- # kv_caches[0].shape: [2, num_pages, page_size, num_heads, head_size] +- assert kv_caches[0].dim() == 5 +- self.page_buffer_size = kv_caches[0].shape[1] * kv_caches[0].shape[2] +- +- return self.kv_cache_pointers_on_gpu[idx] ++ ++ logger.debug(f"{len(kv_caches[0])}, {kv_caches[0][0].shape}, {kv_caches[0][1].shape}") ++ ++ if self._initialized_kv_ptrs: ++ return ++ ++ kv_cache_pointers_cpus = torch.Tensor([[kvc.data_ptr() for kvc in layer_kv ]for layer_kv in kv_caches]) ++ self.kv_cache_pointers_on_gpu.copy_(kv_cache_pointers_cpus, non_blocking=False) ++ torch.cuda.synchronize() ++ ++ self._initialized_kv_ptrs = True ++ ++ # MINDSPORE store k and v separately ++ self.page_buffer_size = kv_caches[0][0].shape[0] * kv_caches[0][0].shape[1] ++ self.num_pages = kv_caches[0][0].shape[0] ++ self.page_size = kv_caches[0][0].shape[1] ++ self.kv_size = kv_cache_pointers_cpus.shape[1] ++ return self.kv_cache_pointers_on_gpu + + @_lmcache_nvtx_annotate + def to_gpu(self, memory_obj: MemoryObj, start: int, end: int, **kwargs): +@@ -404,17 +416,28 @@ class VLLMPagedMemGPUConnectorV2(GPUConnectorInterface): + kvcaches: List[torch.Tensor] = kwargs["kvcaches"] + slot_mapping: torch.Tensor = kwargs["slot_mapping"] + +- kv_cache_pointers = self._initialize_pointers(kvcaches) +- +- lmc_ops.multi_layer_kv_transfer( +- memory_obj.tensor, +- kv_cache_pointers, +- slot_mapping[start:end], +- kvcaches[0].device, +- self.page_buffer_size, +- False, +- self.use_mla, +- ) ++ self._initialize_pointers(kvcaches) ++ ++ slot_mapping_range = slot_mapping[start:end] ++ lmcache_C.paged_layers_kv_transfer(memory_obj.tensor, ++ self.kv_cache_pointers_on_gpu, ++ slot_mapping_range, ++ memory_obj.base_ptr, ++ self.num_pages, ++ self.page_size, ++ self.kv_size, ++ False, ++ True) ++ ++ # lmc_ops.multi_layer_kv_transfer( ++ # memory_obj.tensor, ++ # kv_cache_pointers, ++ # slot_mapping[start:end], ++ # kvcaches[0].device, ++ # self.page_buffer_size, ++ # False, ++ # self.use_mla, ++ # ) + + @_lmcache_nvtx_annotate + def from_gpu(self, 
memory_obj: MemoryObj, start: int, end: int, **kwargs): +@@ -446,38 +469,49 @@ class VLLMPagedMemGPUConnectorV2(GPUConnectorInterface): + kvcaches: List[torch.Tensor] = kwargs["kvcaches"] + slot_mapping: torch.Tensor = kwargs["slot_mapping"] + +- kv_cache_pointers = self._initialize_pointers(kvcaches) +- +- if self.gpu_buffer is None or end - start != self.gpu_buffer.shape[2]: +- lmc_ops.multi_layer_kv_transfer( +- memory_obj.tensor, +- kv_cache_pointers, +- slot_mapping[start:end], +- kvcaches[0].device, +- self.page_buffer_size, +- True, +- self.use_mla, +- ) +- else: +- # kvcaches -> gpu_buffer -> memobj +- assert self.gpu_buffer.device == kvcaches[0].device +- tmp_gpu_buffer = self.gpu_buffer[:, :, : end - start, :] +- lmc_ops.multi_layer_kv_transfer( +- tmp_gpu_buffer, +- kv_cache_pointers, +- slot_mapping[start:end], +- kvcaches[0].device, +- self.page_buffer_size, +- True, +- self.use_mla, +- ) +- memory_obj.tensor.copy_(tmp_gpu_buffer, non_blocking=True) +- +- if not memory_obj.tensor.is_cuda: +- # Force a synchronize if the target buffer is NOT CUDA device +- # NOTE: for better performance, we may not want to sync for every +- # memory object +- torch.cuda.synchronize() ++ self._initialize_pointers(kvcaches) ++ ++ slot_mapping_range = slot_mapping[start:end] ++ lmcache_C.paged_layers_kv_transfer(memory_obj.tensor, ++ self.kv_cache_pointers_on_gpu, ++ slot_mapping_range, ++ memory_obj.base_ptr, ++ self.num_pages, ++ self.page_size, ++ self.kv_size, ++ True, ++ True) ++ # if self.gpu_buffer is None or end - start != self.gpu_buffer.shape[2]: ++ # lmc_ops.multi_layer_kv_transfer( ++ # memory_obj.tensor, ++ # kv_cache_pointers, ++ # slot_mapping[start:end], ++ # kvcaches[0].device, ++ # self.page_buffer_size, ++ # True, ++ # self.use_mla, ++ # ) ++ # else: ++ # # kvcaches -> gpu_buffer -> memobj ++ # assert self.gpu_buffer.device == kvcaches[0].device ++ # tmp_gpu_buffer = self.gpu_buffer[:, :, : end - start, :] ++ # lmc_ops.multi_layer_kv_transfer( ++ # tmp_gpu_buffer, ++ # kv_cache_pointers, ++ # slot_mapping[start:end], ++ # kvcaches[0].device, ++ # self.page_buffer_size, ++ # True, ++ # self.use_mla, ++ # ) ++ # memory_obj.tensor.copy_(tmp_gpu_buffer, non_blocking=True) ++ ++ # NOTE: mindspore memory obj is a numpy tensor ++ # if not memory_obj.tensor.is_cuda: ++ # # Force a synchronize if the target buffer is NOT CUDA device ++ # # NOTE: for better performance, we may not want to sync for every ++ # # memory object ++ # torch.cuda.synchronize() + + if self.use_mla: + memory_obj.metadata.fmt = MemoryFormat.KV_MLA_FMT +diff --git a/lmcache/v1/memory_management.py b/lmcache/v1/memory_management.py +index 44958d4..a55820f 100644 +--- a/lmcache/v1/memory_management.py ++++ b/lmcache/v1/memory_management.py +@@ -20,6 +20,7 @@ from typing import List, Optional, Tuple, Union + import abc + import ctypes + import threading ++import numpy as np + + # Third Party + import sortedcontainers +@@ -29,6 +30,15 @@ import torch + from lmcache.logging import init_logger + from lmcache.observability import LMCStatsMonitor + from lmcache.utils import _lmcache_nvtx_annotate ++from .pin_buffer import create_pin_memory ++from ._tensor import ( ++ get_data_ptr, ++ get_numel, ++ get_element_size, ++ get_itemsize, ++ get_dtype_compat, ++ view_and_shape ++) + + logger = init_logger(__name__) + +@@ -109,12 +119,12 @@ class MemoryObjMetadata: + """ + Calculate the size of the memory object in bytes + """ +- if self.shape.numel() == 0: ++ num_elements = get_numel(self.shape) ++ if num_elements == 0: + return 0 
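An illustrative sketch of the pointer table the connector hunks above now hand to paged_layers_kv_transfer: one int64 row per layer, one column per K/V tensor, filled with data_ptr() values. Plain CPU tensors stand in for the real Ascend KV caches; shapes are made up.

import torch

num_layers, kv_size = 2, 2
kv_caches = [(torch.zeros(4, 16), torch.zeros(4, 16))   # (K, V) per layer
             for _ in range(num_layers)]

ptr_table = torch.tensor(
    [[kvc.data_ptr() for kvc in layer_kv] for layer_kv in kv_caches],
    dtype=torch.int64,
)
assert ptr_table.shape == (num_layers, kv_size)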
+ if self.dtype is None: + return 0 +- num_elements = self.shape.numel() +- element_size = self.dtype.itemsize ++ element_size = get_itemsize(self.dtype) + size_in_bytes = num_elements * element_size + return size_in_bytes + +@@ -236,6 +246,10 @@ class MemoryObj(metaclass=abc.ABCMeta): + """ + raise NotImplementedError + ++ @abc.abstractproperty ++ def framework(self): ++ raise NotImplementedError ++ + @property + @abc.abstractmethod + def metadata(self) -> MemoryObjMetadata: +@@ -268,6 +282,10 @@ class MemoryObj(metaclass=abc.ABCMeta): + """ + raise NotImplementedError + ++ @property ++ @abc.abstractmethod ++ def base_ptr(self) -> int: ++ raise NotImplementedError + + class TensorMemoryObj(MemoryObj): + """ +@@ -279,13 +297,16 @@ class TensorMemoryObj(MemoryObj): + raw_data: torch.Tensor, + metadata: MemoryObjMetadata, + parent_allocator: Optional["MemoryAllocatorInterface"] = None, ++ base_ptr: int = None, ++ framework: str = None + ): + self.raw_data = raw_data + self.meta = metadata + self.valid = True + self.lock = threading.Lock() + self.parent_allocator = parent_allocator +- ++ self._base_ptr = base_ptr ++ self._framework = framework + def invalidate(self): + self.valid = False + +@@ -293,8 +314,8 @@ class TensorMemoryObj(MemoryObj): + return self.valid + + def get_size(self) -> int: +- num_elements = self.raw_data.numel() +- element_size = self.raw_data.element_size() ++ num_elements = get_numel(self.raw_data) ++ element_size = get_element_size(self.raw_data) + size_in_bytes = num_elements * element_size + return size_in_bytes + +@@ -336,26 +357,28 @@ class TensorMemoryObj(MemoryObj): + def unpin(self) -> bool: + self.metadata.is_pin = False + return True +- ++ ++ def framework(self): ++ return self._framework + @property + def metadata(self) -> MemoryObjMetadata: + with self.lock: + return self.meta + + @property +- def tensor(self) -> Optional[torch.Tensor]: ++ def tensor(self) -> Optional[Union[torch.Tensor, np.ndarray]]: + if not self.valid: + logger.warning("Trying to access an invalidated MemoryObj") + return None + assert self.meta.dtype is not None +- return self.raw_data.view(self.meta.dtype).view(self.meta.shape) ++ return view_and_shape(self.raw_data, self.meta.dtype, self.meta.shape) + + @property + def byte_array(self) -> bytes: + kv_chunk = self.tensor + assert kv_chunk is not None +- num_bytes = kv_chunk.numel() * kv_chunk.element_size() +- ptr = kv_chunk.data_ptr() ++ num_bytes = get_numel(kv_chunk) * get_element_size(kv_chunk) ++ ptr = get_data_ptr(kv_chunk) + ubyte_ptr = ctypes.cast(ptr, ctypes.POINTER(ctypes.c_ubyte)) + byte_array = (ctypes.c_ubyte * num_bytes).from_address( + ctypes.addressof(ubyte_ptr.contents) +@@ -366,6 +389,9 @@ class TensorMemoryObj(MemoryObj): + def is_pinned(self) -> bool: + return self.metadata.is_pin + ++ @property ++ def base_ptr(self) -> int: ++ return self._base_ptr + + class BytesBufferMemoryObj(MemoryObj): + """ +@@ -446,6 +472,9 @@ class BytesBufferMemoryObj(MemoryObj): + def is_pinned(self) -> bool: + return self.metadata.is_pin + ++ @property ++ def base_ptr(self) -> int: ++ return 0 + + class MemoryAllocatorInterface(metaclass=abc.ABCMeta): + @abc.abstractmethod +@@ -521,13 +550,20 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + + ALIGN_BYTES = 512 + +- def __init__(self, tensor: torch.Tensor, align_bytes: int = ALIGN_BYTES): +- self.buffer = tensor.view(torch.uint8).flatten() ++ def __init__(self, tensor: Union[torch.Tensor, np.ndarray], align_bytes: int = ALIGN_BYTES): ++ # NOTE (Gingfung:) changed to reshape to 
enable ++ assert self._is_uint8_type(tensor) ++ self.buffer = tensor.reshape(-1) ++ self._base_ptr = get_data_ptr(self.buffer) ++ + self.align_bytes = align_bytes + + self.explicit_list = sortedcontainers.SortedList(key=lambda x: x.start) + +- self.explicit_list.add(FreeBlock(start=0, size=self.buffer.numel())) ++ self._buffer_is_np = isinstance(tensor, np.ndarray) ++ el_size = get_numel(self.buffer) ++ ++ self.explicit_list.add(FreeBlock(start=0, size=el_size)) + + # For debugging purposes + self.num_active_allocations = 0 +@@ -535,16 +571,28 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + + self.stats_monitor = LMCStatsMonitor.GetOrCreate() + ++ @property ++ def framework(self): ++ return "Mindspore" if self._buffer_is_np else "Torch" ++ + @staticmethod + @_lmcache_nvtx_annotate + def _Compute_raw_size(shape: torch.Size, dtype: torch.dtype) -> int: +- return shape.numel() * dtype.itemsize ++ return shape.numel() * get_itemsize(dtype) + + @staticmethod + @_lmcache_nvtx_annotate + def _Compute_aligned_size(raw_size: int, align: int) -> int: + return (raw_size + align - 1) & ~(align - 1) + ++ def _is_uint8_type(self, tensor: Union[torch.Tensor, np.ndarray]): ++ if isinstance(tensor, np.ndarray): ++ return tensor.dtype == np.uint8 ++ elif isinstance(tensor, torch.Tensor): ++ return tensor.dtype == torch.uint8 ++ else: ++ raise ValueError(f"tensor of type: {type(tensor)} not supported.") ++ + @_lmcache_nvtx_annotate + def _coalesce( + self, +@@ -595,6 +643,8 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + shape = torch.Size(shape) + + assert dtype is not None, "dtype must be specified" ++ dtype = get_dtype_compat(dtype) ++ + # Calculate the size of the tensor + raw_size = TensorMemoryAllocator._Compute_raw_size(shape, dtype) + if raw_size % self.align_bytes != 0: +@@ -640,6 +690,8 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + shape, dtype, block.start, aligned_size, 1, False, fmt + ), + parent_allocator=parent_allocator, ++ base_ptr=self._base_ptr, ++ framework=self.framework + ) + + @_lmcache_nvtx_annotate +@@ -658,6 +710,7 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + shape = torch.Size(shape) + + assert dtype is not None, "dtype must be specified" ++ dtype = get_dtype_compat(dtype) + + # Calculate the size of the tensor + unit_raw_size = TensorMemoryAllocator._Compute_raw_size(shape, dtype) +@@ -714,6 +767,7 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + shape, dtype, temp_start, unit_aligned_size, 1, False, fmt + ), + parent_allocator=parent_allocator, ++ framework=self.framework + ) + ) + temp_start += unit_aligned_size +@@ -820,7 +874,7 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + logger.info(f" - Total free size: {total_free_size / 1048576} MB") + + # Check if the numbers are consistent +- if total_free_size + self.total_allocated_size != self.buffer.numel(): ++ if total_free_size + self.total_allocated_size != get_numel(self.buffer): + logger.error("Memory allocator size is inconsistent") + logger.error("This implies a bug in the memory allocator") + clear = False +@@ -881,6 +935,9 @@ class BufferAllocator(MemoryAllocatorInterface): + def memcheck(self): + return True + ++ @property ++ def framework(self): ++ return "Torch" + + class HostMemoryAllocator(MemoryAllocatorInterface): + """Allocates memory in the pre-allocated Host memory.""" +@@ -929,6 +986,9 @@ class HostMemoryAllocator(MemoryAllocatorInterface): + with self.host_mem_lock: + return self.allocator.memcheck() + ++ @property ++ def 
framework(self): ++ return self.allocator.framework + + class PinMemoryAllocator(MemoryAllocatorInterface): + """Allocates memory in the pre-allocated pinned memory.""" +@@ -978,6 +1038,9 @@ class PinMemoryAllocator(MemoryAllocatorInterface): + with self.host_mem_lock: + return self.allocator.memcheck() + ++ @property ++ def framework(self): ++ return self.allocator.framework + + class MixedMemoryAllocator(MemoryAllocatorInterface): + """ +@@ -985,11 +1048,13 @@ class MixedMemoryAllocator(MemoryAllocatorInterface): + (2) byte_array buffer memory. + """ + +- def __init__(self, size: int): ++ def __init__(self, size: int, framework: str = "Torch"): + """ + :param int size: The size of the pinned memory in bytes. + """ +- buffer = torch.empty(size, dtype=torch.uint8, pin_memory=True) ++ ++ buffer = create_pin_memory(framework, size, dtype="uint8") ++ self._framework = framework + + self.pin_allocator = TensorMemoryAllocator(buffer) + self.buffer_allocator = BufferAllocator("cpu") +@@ -1076,6 +1141,9 @@ class MixedMemoryAllocator(MemoryAllocatorInterface): + with self.host_mem_lock: + return self.pin_allocator.memcheck() + ++ @property ++ def framework(self): ++ return self._framework + + class GPUMemoryAllocator(MemoryAllocatorInterface): + """Allocates memory in the pre-allocated GPU memory.""" +@@ -1126,6 +1194,9 @@ class GPUMemoryAllocator(MemoryAllocatorInterface): + with self.device_mem_lock: + return self.allocator.memcheck() + ++ @property ++ def framework(self): ++ return self.allocator.framework + + class AdHocMemoryAllocator(MemoryAllocatorInterface): + """ +@@ -1153,6 +1224,7 @@ class AdHocMemoryAllocator(MemoryAllocatorInterface): + shape = torch.Size(shape) + + assert dtype is not None, "dtype must be specified" ++ dtype = get_dtype_compat(dtype) + + # Return a dummy object with no actual memory allocation + return TensorMemoryObj( +@@ -1199,6 +1271,9 @@ class AdHocMemoryAllocator(MemoryAllocatorInterface): + def memcheck(self): + return True + ++ @property ++ def framework(self): ++ return "Torch" + + class CuFileMemoryAllocator(GPUMemoryAllocator): + def __init__(self, size: int, device=None): +diff --git a/lmcache/v1/npu_connector.py b/lmcache/v1/npu_connector.py +new file mode 100644 +index 0000000..8316a0b +--- /dev/null ++++ b/lmcache/v1/npu_connector.py +@@ -0,0 +1,291 @@ ++from lmcache.v1.gpu_connector import GPUConnectorInterface ++import torch ++from typing import List, Optional, Tuple, Union ++from lmcache.v1.memory_management import MemoryFormat, MemoryObj ++from lmcache.logging import init_logger ++logger = init_logger(__file__) ++from .envs import is_disable_custom_ops ++ ++_USE_LMC_OPS = False ++try: ++ from lmcache import lmcache_C ++ _USE_LMC_OPS = True ++except ImportError as ie: ++ logger.debug("Not using lmcache_C ops") ++ ++_USE_MS = False ++try: ++ from mindspore import mutable ++ _USE_MS = True ++except ImportError as ie: ++ logger.error("Unable to import mindspore") ++ ++class VLLMPagedMemNPUConnector(GPUConnectorInterface): ++ """ ++ FIXME: Currently a lot of the memcpy and transfer ops ++ are in sequence, we probably want to optimize this. ++ ++ The NPU KV cache should be a nested tuple of K and V tensors. ++ More specifically, we have: ++ - GPUTensor = Tuple[KVLayer, ...] 
++ - KVLayer = Tuple[Tensor, Tensor] ++ - Tensor: [num_blocks, block_size, num_heads, head_size] ++ ++ It will produce / consume memory object with KV_BLOB format ++ """ ++ ++ ACL_MEMCPY_DEVICE_TO_HOST=2 ++ ++ def __init__(self, ++ hidden_dim_size: int, ++ num_layers: int, ++ use_npu: bool = True, ++ **kwargs): ++ """ ++ If use_gpu is true, it will create a gpu intermediate buffer. In this ++ case, it requires the following kwargs: ++ - chunk_size: The MAX size of the chunk to be copied to GPU. ++ - dtype: The data type of the intermediate buffer. ++ """ ++ global _USE_LMC_OPS ++ global _USE_MS ++ self._use_lmc_ops = _USE_LMC_OPS and not is_disable_custom_ops() ++ logger.debug(f"NPU Connector using lmc ops: {self._use_lmc_ops}") ++ self.hidden_dim_size = hidden_dim_size ++ self.num_layers = num_layers ++ self.is_tuple_layout = kwargs["kv_is_tuple"] ++ self.kv_size = kwargs["kv_size"] ++ ++ if not self.is_tuple_layout: ++ self.kv_cache_pointers = torch.empty(num_layers, ++ dtype=torch.int64, ++ device='cpu', ++ pin_memory=True) ++ self._kv_cache_devptrs = None ++ else: ++ if not _USE_MS: ++ raise ImportError("MindSpore is required for tuple layout") ++ # per layer, kvs ++ self._kv_cache_devptrs = torch.empty([num_layers, self.kv_size], ++ dtype=torch.int64, ++ device='Ascend', ++ pin_memory=True) ++ self.pointers_initialized = False ++ self.page_buffer_size = 0 ++ self.num_pages = 0 ++ self.page_size = 0 ++ ++ self.npu_buffer: Optional[torch.Tensor] = None ++ if use_npu: ++ assert "chunk_size" in kwargs, \ ++ "chunk_size should be provided to create a GPU buffer." ++ assert "dtype" in kwargs, \ ++ "dtype should be provided to create a GPU buffer." ++ assert "device" in kwargs, \ ++ "device should be provided to create a GPU buffer." ++ self.npu_buffer = torch.empty((self.kv_size, num_layers, ++ kwargs["chunk_size"], ++ self.hidden_dim_size), ++ dtype=kwargs["dtype"], ++ device=kwargs["device"]) ++ self.npu_buffer.zero_() ++ ++ def _initialize_pointers(self, kv_caches: List[torch.Tensor]): ++ if not self.is_tuple_layout: ++ for i in range(self.num_layers): ++ self.kv_cache_pointers[i] = kv_caches[i].data_ptr() ++ self._kv_cache_devptrs = self.kv_cache_pointers.npu() ++ else: ++ kv_cache_pointers_cpus = torch.Tensor([[kvc.data_ptr() for kvc in layer_kv ]for layer_kv in kv_caches]) ++ self._kv_cache_devptrs.copy_(kv_cache_pointers_cpus, non_blocking=False) ++ ++ torch.cuda.synchronize() ++ ++ self.pointers_initialized = True ++ kv_one_layer = kv_caches[0][0] if self.is_tuple_layout else kv_caches[0] ++ if not self.is_tuple_layout: ++ # kv_caches layer.shape: [2, num_pages, page_size, num_heads, head_size] ++ self.page_buffer_size = kv_one_layer.shape[1] * kv_one_layer.shape[2] ++ self.num_pages = kv_one_layer.shape[1] ++ self.page_size = kv_one_layer.shape[2] ++ else: ++ # kv_caches layer.shape: [num_pages, page_size, num_heads, head_size] ++ self.page_buffer_size = kv_one_layer.shape[0] * kv_one_layer.shape[1] ++ self.num_pages = kv_one_layer.shape[0] ++ self.page_size = kv_one_layer.shape[1] ++ ++ ++ def _pointers_are_good(self, kv_caches: Union[List[torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]]): ++ """ ++ Check if the initialized pointers are the same as the pointers in ++ the KV caches. ++ ++ Returns: ++ bool: True if the pointers are the same, False otherwise ( ++ including uninitialized). 
++ """ ++ if not self.pointers_initialized: ++ return False ++ ++ for i in range(self.num_layers): ++ if not self.is_tuple_layout: ++ if self._kv_cache_devptrs[i] != kv_caches[i].data_ptr(): ++ return False ++ else: ++ for j in range(self.kv_size): ++ if self._kv_cache_devptrs[i][j] != kv_caches[i][j].data_ptr(): ++ return False ++ ++ return True ++ ++ def to_gpu(self, memory_obj: MemoryObj, start: int, end: int, **kwargs): ++ """Expect a kwarg 'kvcaches' which is a nested tuple of K and V tensors. ++ The kvcaches should correspond to the "WHOLE token sequence". ++ ++ Note: ++ 1. This function expects the 'slot_mapping' is a "full slot mapping" ++ where it's length is the same as the whole token sequence. ++ 2. In the case that there is prefix caching, slot_mapping will starts ++ with -1s until the end of the matched prefix. The start and end ++ should NEVER overlap with the prefix caching (which means the ++ underlying CUDA kernel will never see -1 in slot_mapping) ++ ++ ++ :raises ValueError: If 'kvcaches' is not provided in kwargs. ++ :raises AssertionError: If the memory object does not have a tensor. ++ :raises ValueError: If 'slot_mapping' is not provided in kwargs. ++ """ ++ assert memory_obj.tensor is not None ++ ++ if memory_obj.metadata.fmt != MemoryFormat.KV_BLOB: ++ raise ValueError( ++ "The memory object should be in KV_BLOB format in" ++ " order to be processed by VLLMPagedMemGPUConnector") ++ ++ if "kvcaches" not in kwargs: ++ raise ValueError("'kvcaches' should be provided in kwargs.") ++ ++ if "slot_mapping" not in kwargs: ++ raise ValueError("'slot_mapping' should be provided in kwargs.") ++ ++ kvcaches: Union[List[torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]] = kwargs["kvcaches"] ++ slot_mapping: torch.Tensor = kwargs["slot_mapping"] ++ ++ if not self.pointers_initialized: ++ self._initialize_pointers(kvcaches) ++ ++ if self._use_lmc_ops and memory_obj.base_ptr is not None: ++ slot_mapping_range = slot_mapping[start:end] ++ lmcache_C.paged_layers_kv_transfer(memory_obj.tensor, ++ self._kv_cache_devptrs, ++ slot_mapping_range, ++ memory_obj.base_ptr, ++ self.num_pages, ++ self.page_size, ++ self.kv_size, ++ False, ++ True) ++ else: ++ if self.is_tuple_layout: ++ raise NotImplementedError() ++ ++ self.npu_buffer.zero_() ++ ++ # FIXME: This seems inefficient ++ # Memobj -> npu_buffer -> kvcaches ++ # atm the main overhead is probably to do with the cpu overhead here ++ assert self.npu_buffer.device == kvcaches[0].device ++ tmp_npu_buffer = self.npu_buffer[:, :, :end-start, :] ++ tmp_npu_buffer.copy_(memory_obj.tensor, non_blocking=True) ++ ++ slots_tokens = slot_mapping[start:end] ++ valid_tokens = slots_tokens >= 0 ++ valid_idx = torch.nonzero(valid_tokens, as_tuple=True)[0] ++ physical_slots = slots_tokens[valid_tokens] ++ if physical_slots.numel() <= 0: ++ logger.error("Physical slots are zeros, this should not happened.") ++ return ++ for layer_id in range(len(kvcaches)): ++ # [blocks*block_size, num_heads*head_size] ++ for i in range(self.kv_size): ++ _layer = kvcaches[layer_id][i].view(self.page_buffer_size, -1) ++ _layer[physical_slots] = tmp_npu_buffer[i, layer_id, valid_idx] ++ ++ def from_gpu(self, memory_obj: MemoryObj, start: int, end: int, **kwargs): ++ """Expect a kwarg 'kvcaches' which is a nested tuple of K and V tensors. ++ The kvcaches should correspond to the "WHOLE token sequence". ++ ++ Will set the memory_obj.metadata.fmt to MemoryFormat.KV_BLOB. ++ ++ Note: ++ 1. 
This function expects the 'slot_mapping' is a "full slot mapping" ++ where it's length is the same as the whole token sequence. ++ 2. In the case that there is prefix caching, slot_mapping will starts ++ with -1s until the end of the matched prefix. The start and end ++ should NEVER overlap with the prefix caching ++ ++ :raises ValueError: If 'kvcaches' is not provided in kwargs, ++ :raises AssertionError: If the memory object does not have a tensor. ++ :raises ValueError: If 'slot_mapping' is not provided in kwargs. ++ """ ++ assert memory_obj.tensor is not None ++ ++ if "kvcaches" not in kwargs: ++ raise ValueError("'kvcaches' should be provided in kwargs.") ++ ++ if "slot_mapping" not in kwargs: ++ raise ValueError("'slot_mapping' should be provided in kwargs.") ++ ++ kvcaches: List[torch.Tensor] = kwargs["kvcaches"] ++ slot_mapping: torch.Tensor = kwargs["slot_mapping"] ++ ++ if not self.pointers_initialized: ++ self._initialize_pointers(kvcaches) ++ ++ if self._use_lmc_ops and memory_obj.base_ptr is not None: ++ slot_mapping_range = slot_mapping[start:end] ++ lmcache_C.paged_layers_kv_transfer(memory_obj.tensor, ++ self._kv_cache_devptrs, ++ slot_mapping_range, ++ memory_obj.base_ptr, ++ self.num_pages, ++ self.page_size, ++ self.kv_size, ++ True, ++ True) ++ else: ++ if self.is_tuple_layout: ++ raise NotImplementedError() ++ ++ self.npu_buffer.zero_() ++ temp_buf = self.npu_buffer[:, :, :end-start, :] ++ ++ # FIXME: Work on ascendc kernel copy version ++ # currently this is quite slow ++ # atm the main overhead is probably to do with the cpu overhead here ++ # copy from kvs into temp_buf ++ slots_tokens = slot_mapping[start:end] ++ valid_tokens = slots_tokens >= 0 ++ physical_slots = slots_tokens[valid_tokens] ++ if physical_slots.numel() <= 0: ++ logger.error("Physical slots are zeros, this should not happened.") ++ return ++ valid_idx = torch.nonzero(valid_tokens, as_tuple=True)[0] ++ for layer_id in range(len(kvcaches)): ++ for i in range(self.kv_size): ++ _layer = kvcaches[layer_id][i].view(self.page_buffer_size, -1) ++ temp_buf[i, layer_id, valid_idx] = _layer[physical_slots] ++ ++ memory_obj.tensor.zero_() ++ # sync memcpy ++ # size_in_bytes = temp_buf.numel() * temp_buf.element_size() ++ # ret = memcpy(memory_obj.tensor.data_ptr(), size_in_bytes, ++ # temp_buf.data_ptr(), size_in_bytes, self.ACL_MEMCPY_DEVICE_TO_HOST) ++ memory_obj.tensor.copy_(temp_buf.contiguous(), non_blocking=True) ++ memory_obj.metadata.fmt = MemoryFormat.KV_BLOB ++ # assert ret == 0, f"Memcpy failed, Got: {ret}." 
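++ # After the copy above, memory_obj.tensor holds this chunk laid out as
++ # [kv_size, num_layers, num_tokens, hidden_dim_size], matching get_shape().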
++ # reset the npu buffer ++ ++ def get_shape(self, num_tokens: int) -> torch.Size: ++ return torch.Size( ++ [self.kv_size, self.num_layers, num_tokens, self.hidden_dim_size]) +diff --git a/lmcache/v1/pin_buffer.py b/lmcache/v1/pin_buffer.py +new file mode 100644 +index 0000000..197815d +--- /dev/null ++++ b/lmcache/v1/pin_buffer.py +@@ -0,0 +1,138 @@ ++import os ++import torch ++import numpy as np ++from lmcache.logging import init_logger ++import atexit ++import ctypes ++from .envs import is_disable_custom_ops ++ ++logger = init_logger(__file__) ++ ++ ++_HAS_ACL = False ++try: ++ import acl ++ _HAS_ACL = True ++except ImportError as ie: ++ logger.warning("Unable to import acl") ++ ++_HAS_MS = False ++try: ++ import mindspore as ms ++ _HAS_MS = True ++except ImportError as ie: ++ logger.warning("Unable to import mindspore") ++ ++_USE_LMCACHE_C = False ++try: ++ from lmcache import lmcache_C ++ _USE_LMCACHE_C = True ++except ImportError as ie: ++ logger.warning("unable to import LMCache_C") ++ ++STR_TO_TORCH_TYPE = { ++ "uint8" : torch.uint8 ++} ++ ++STR_TO_NP_TYPE = { ++ "uint8": np.uint8 ++} ++ ++STR_TO_CTYPES = { ++ "uint8": ctypes.c_uint8 ++} ++ ++__all__ = [ ++ "create_pin_memory" ++] ++ ++# TODO (Gingfung): Get rid of this global ? ++ACL_PIN_MEMORY_MANAGER = None ++ ++class aclPinnedMemoryManager: ++ def __init__(self): ++ assert _HAS_ACL, "Cannot create aclPinnedMemoryManager without acl" ++ ++ self._init() ++ ++ # TODO (Gingfung): Double check whether this will free the memory ++ atexit.register(self.clean_up) ++ ++ def _init(self): ++ """Since we only take care of alloc memory, ++ regardless whether we are initialized within a framework or not, ++ best we create our current context here. ++ """ ++ # NOTE (Gingfung): Just in case user haven't called acl ini ++ ret = acl.init() ++ dev = torch.cuda.current_device() ++ if dev == "Ascend": ++ # we are in mindspore land, ++ dev = ms.get_current_device().device_id ++ ++ if ret == 0: ++ self._to_finalized = True ++ ++ self._ctx, ret = acl.rt.create_context(dev) ++ assert ret == 0, "Cannot create context" ++ elif ret == 100002: ++ # already init by framework ++ self._to_finalized = False ++ self._ctx = None ++ else: ++ raise RuntimeError(f"Acl Init Failed: {ret}") ++ ++ ++ self._managed_ptrs = {} ++ ++ ++ def alloc(self, size: int): ++ _p, ret = acl.rt.malloc_host(size) ++ if ret != 0: ++ logger.error("Unable to malloc pinned host memory: %d", ret) ++ return None ++ ++ self._managed_ptrs[_p] = size ++ return _p ++ ++ def clean_up(self): ++ _ptrs = list(self._managed_ptrs.keys()) ++ for k in _ptrs: ++ ret = acl.rt.free_host(k) ++ if ret != 0: ++ logger.error("Unable to free host ptr: %d", ret) ++ ++ self._managed_ptrs.pop(k, None) ++ ++ if self._to_finalized: ++ ret = acl.rt.destroy_context(self._ctx) ++ assert ret == 0 ++ ret = acl.finalize() ++ assert ret == 0 ++ self._to_finalized = False ++ ++def _get_pinned_mem_manager(): ++ global ACL_PIN_MEMORY_MANAGER ++ if ACL_PIN_MEMORY_MANAGER is None: ++ ACL_PIN_MEMORY_MANAGER = aclPinnedMemoryManager() ++ return ACL_PIN_MEMORY_MANAGER ++ ++def get_np_pin_buffer(size: int, dtype: str): ++ pin_mem_manager = _get_pinned_mem_manager() ++ h_ptr = pin_mem_manager.alloc(size) ++ c_t = STR_TO_CTYPES[dtype] ++ el_size = ctypes.sizeof(c_t) ++ np_ptr = ctypes.cast(h_ptr, ctypes.POINTER(c_t * el_size)) ++ np_buf = np.ctypeslib.as_array(np_ptr, shape=(size,)).reshape(-1) ++ assert h_ptr == np_buf.ctypes.data_as(ctypes.c_void_p).value ++ return np_buf ++ ++def create_pin_memory(framework: str, size: int, 
dtype: str = "uint8"): ++ # if framework == "Mindspore": ++ # we need to use acl and numpy to represent our buffer. ++ # return get_np_pin_buffer(size, dtype) ++ global _USE_LMCACHE_C ++ if _USE_LMCACHE_C and not is_disable_custom_ops(): ++ logger.debug(f"create pinned tensor of size: {size} with lmcache_C") ++ return lmcache_C.create_pinned_tensor(size) ++ return torch.empty(size, dtype=torch.uint8, pin_memory=True) +diff --git a/lmcache/v1/protocol.py b/lmcache/v1/protocol.py +index 6e7583d..0f161d0 100644 +--- a/lmcache/v1/protocol.py ++++ b/lmcache/v1/protocol.py +@@ -48,8 +48,6 @@ DTYPE_TO_INT = { + torch.float64: 5, + torch.double: 5, + torch.uint8: 6, +- torch.float8_e4m3fn: 7, +- torch.float8_e5m2: 8, + } + + INT_TO_DTYPE = { +@@ -60,11 +58,16 @@ INT_TO_DTYPE = { + 4: torch.float, + 5: torch.float64, + 6: torch.uint8, +- 7: torch.float8_e4m3fn, +- 8: torch.float8_e5m2, + } + + ++try: ++ DTYPE_TO_INT.update({torch.float8_e4m3fn: 7, torch.float8_e5m2: 8}) ++ INT_TO_DTYPE.update({7: torch.float8_e4m3fn, 8: torch.float8_e5m2}) ++except AttributeError as ae: ++ # MSAdapter does not have these versions yet ++ pass ++ + @dataclass + class RemoteMetadata: + length: int +diff --git a/lmcache/v1/storage_backend/abstract_backend.py b/lmcache/v1/storage_backend/abstract_backend.py +index ce9be36..5c00add 100644 +--- a/lmcache/v1/storage_backend/abstract_backend.py ++++ b/lmcache/v1/storage_backend/abstract_backend.py +@@ -101,6 +101,7 @@ class StorageBackendInterface(metaclass=abc.ABCMeta): + def get_blocking( + self, + key: CacheEngineKey, ++ **kwargs + ) -> Optional[MemoryObj]: + """ + A blcocking function to get the kv cache from the storage backend. +diff --git a/lmcache/v1/storage_backend/connector/falconfs_connector.py b/lmcache/v1/storage_backend/connector/falconfs_connector.py +new file mode 100644 +index 0000000..d75f523 +--- /dev/null ++++ b/lmcache/v1/storage_backend/connector/falconfs_connector.py +@@ -0,0 +1,98 @@ ++# Copyright 2024-2025 LMCache Authors. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
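++
++# NOTE: FalconFS storage layout: each KV chunk is kept as a flat file named
++# "/data_py/<key with '/' replaced by '-'>.pt" (see fmtFile below) and is
++# read/written through the asynchronous pyfalconfs API.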
++ ++import uuid ++import time ++import asyncio ++import os ++import weakref ++from typing import List, Optional, Union, Tuple, no_type_check ++ ++import pyfalconfs ++import torch ++import operator ++from functools import reduce ++ ++from lmcache.v1.memory_management import (CopyLessMemoryObj, ++ MemoryAllocatorInterface, ++ MemoryObj, ++ MemoryFormat, ++ MemoryObjMetadata, ++ TensorMemoryAllocator) ++from lmcache.v1.protocol import RemoteMetadata ++from lmcache.v1.storage_backend.connector.base_connector import \ ++ RemoteConnector ++from lmcache.logging import init_logger ++from lmcache.utils import CacheEngineKey ++from concurrent.futures import ThreadPoolExecutor ++ ++logger = init_logger(__name__) ++ ++MAX_BUFFER_SIZE = 40 << 20 # 40MB ++METADATA_BYTES_LEN = 28 ++MAX_BUFFER_CNT = 500 ++ ++def fmtFile( ++ key: str, ++ ) -> str: ++ return "/data_py/" + key.replace("/", "-") + ".pt" ++ ++class FalconFsConnector(RemoteConnector): ++ """ ++ The remote url should start with "falconfs://" ++ """ ++ def __init__(self, workspace: str, config_path: str, ++ memory_allocator: MemoryAllocatorInterface, thread_pool_size = 256): ++ random_dir = workspace + str(uuid.uuid4()) ++ os.makedirs(random_dir) ++ #logger.info(f"sleeping 30: {os.getpid()}") ++ #time.sleep(30) ++ ++ self.connector = pyfalconfs.AsyncConnector(random_dir, config_path) ++ self.memory_allocator = memory_allocator ++ ++ async def exists(self, key: CacheEngineKey) -> bool: ++ ret = await self.connector.AsyncExists(fmtFile(key.to_string())) ++ return ret == 0 ++ ++ async def get(self, key: CacheEngineKey, shape, dtype) -> Optional[MemoryObj]: ++ key_str = key.to_string() ++ ++ memory_obj = self.memory_allocator.allocate(shape=shape, dtype=dtype) ++ ret = await self.connector.AsyncGet(fmtFile(key_str), memory_obj.byte_array, memory_obj.get_size(), 0) ++ if (ret < 0): ++ logger.warning(f"get: AsyncGet failed, ret = {ret}") ++ self.memory_allocator.ref_count_down(memory_obj) ++ return None ++ ++ return memory_obj ++ ++ async def put(self, key: CacheEngineKey, memory_obj: MemoryObj): ++ kv_bytes = memory_obj.byte_array ++ key_str = key.to_string() ++ ++ ret = await self.connector.AsyncPut(fmtFile(key_str), kv_bytes, len(kv_bytes), 0) ++ if ret < 0: ++ logger.warning(f"put: AsyncPut failed, ret = {ret}") ++ ++ self.memory_allocator.ref_count_down(memory_obj) ++ ++ # TODO ++ @no_type_check ++ async def list(self) -> List[str]: ++ pass ++ ++ async def close(self): ++ pass ++ +diff --git a/lmcache/v1/storage_backend/local_cpu_backend.py b/lmcache/v1/storage_backend/local_cpu_backend.py +index 9538014..b031dc2 100644 +--- a/lmcache/v1/storage_backend/local_cpu_backend.py ++++ b/lmcache/v1/storage_backend/local_cpu_backend.py +@@ -146,6 +146,7 @@ class LocalCPUBackend(StorageBackendInterface): + def get_blocking( + self, + key: CacheEngineKey, ++ **kwargs + ) -> Optional[MemoryObj]: + with self.cpu_lock: + if key not in self.hot_cache: +diff --git a/lmcache/v1/storage_backend/local_disk_backend.py b/lmcache/v1/storage_backend/local_disk_backend.py +index e7880f5..3f81838 100644 +--- a/lmcache/v1/storage_backend/local_disk_backend.py ++++ b/lmcache/v1/storage_backend/local_disk_backend.py +@@ -23,6 +23,8 @@ import threading + # Third Party + import aiofiles + import torch ++import json ++from dataclasses import asdict + + # First Party + from lmcache.logging import init_logger +@@ -65,6 +67,12 @@ class LocalDiskBackend(StorageBackendInterface): + os.makedirs(self.path) + logger.info(f"Created local disk cache directory: {self.path}") + ++ # 
NOTE: currently the pinned memory tensor ++ # seems to causes bad address error with FUSE driver ++ self.is_fuse_driver = config.use_fuse_driver ++ ++ self._rebuild_idx() ++ + self.lookup_server = lookup_server + + # Initialize the evictor +@@ -80,7 +88,28 @@ class LocalDiskBackend(StorageBackendInterface): + + def __str__(self): + return self.__class__.__name__ +- ++ ++ def _rebuild_idx(self): ++ """Rebuild Metadata across instances""" ++ for kv_file in os.listdir(self.path): ++ if "@meta" in kv_file: ++ with open(os.path.join(self.path, kv_file), "r") as f: ++ json_str = f.readlines() ++ if len(json_str) == 0: ++ continue ++ json_dct = json.loads(json_str[0]) ++ meta = DiskCacheMetadata.from_dict(json_dct) ++ key = self._path_to_key(meta.path) ++ with self.disk_lock: ++ self.dict[key] = meta ++ ++ def _path_to_key(self, p: str) -> CacheEngineKey: ++ """ ++ -3 = ".pt" ++ """ ++ _key = p[len(self.path):-3].replace("-", "/") ++ return CacheEngineKey.from_string(_key) ++ + def _key_to_path( + self, + key: CacheEngineKey, +@@ -140,12 +169,13 @@ class LocalDiskBackend(StorageBackendInterface): + KVEvictMsg(self.instance_id, key.worker_id, key.chunk_hash, "disk") + ) + +- def insert_key(self, key: CacheEngineKey, memory_obj: MemoryObj) -> None: ++ def insert_key(self, key: CacheEngineKey, memory_obj: MemoryObj) -> DiskCacheMetadata: + path = self._key_to_path(key) + size = memory_obj.get_size() + shape = memory_obj.metadata.shape + dtype = memory_obj.metadata.dtype + fmt = memory_obj.metadata.fmt ++ meta = DiskCacheMetadata(path, size, shape, dtype, fmt, False) + + has_stored = False + with self.disk_lock: +@@ -154,13 +184,14 @@ class LocalDiskBackend(StorageBackendInterface): + self.dict.pop(key) + has_stored = True + +- self.dict[key] = DiskCacheMetadata(path, size, shape, dtype, fmt, False) ++ self.dict[key] = meta + + # push kv admit msg + if self.lmcache_worker is not None and not has_stored: + self.lmcache_worker.put_msg( + KVAdmitMsg(self.instance_id, key.worker_id, key.chunk_hash, "disk") + ) ++ return meta + + def submit_put_task( + self, +@@ -229,6 +260,7 @@ class LocalDiskBackend(StorageBackendInterface): + def get_blocking( + self, + key: CacheEngineKey, ++ **kwargs + ) -> Optional[MemoryObj]: + """ + Blocking get function. +@@ -262,7 +294,6 @@ class LocalDiskBackend(StorageBackendInterface): + # TODO(Jiayi): Need to align prefetch and get_non_blocking + return self.submit_prefetch_task(key) + +- @_lmcache_nvtx_annotate + @torch.inference_mode() + async def async_save_bytes_to_disk( + self, +@@ -281,10 +312,25 @@ class LocalDiskBackend(StorageBackendInterface): + self.usage += size + self.stats_monitor.update_local_storage_usage(self.usage) + +- async with aiofiles.open(path, "wb") as f: +- await f.write(byte_array) +- +- self.insert_key(key, memory_obj) ++ # step1. save kv bytes ++ try: ++ async with aiofiles.open(path, 'wb') as f: ++ # works when made a copy, bytes(byte_array) ++ if self.is_fuse_driver: ++ await f.write(bytes(byte_array)) ++ else: ++ await f.write(byte_array) ++ except Exception as e: ++ logger.error("unable to save KVBytes: ", e) ++ meta = self.insert_key(key, memory_obj) ++ ++ # step2. 
save meta ++ meta_path = path.replace(".pt", "@meta.json") ++ try: ++ async with aiofiles.open(meta_path, 'w') as f: ++ await f.write(json.dumps(meta.to_dict())) ++ except Exception as e: ++ logger.error("unable to save DiskCacheMetadata: ", e) + + memory_obj.ref_count_down() + +@@ -304,14 +350,26 @@ class LocalDiskBackend(StorageBackendInterface): + if memory_obj is None: + logger.debug("Memory allocation failed during async disk load.") + return None +- buffer = memory_obj.byte_array ++ if not self.is_fuse_driver: ++ buffer = memory_obj.byte_array ++ else: ++ # create a temporary buffer ++ buffer = bytearray(memory_obj.get_size()) ++ + async with aiofiles.open(path, "rb") as f: + await f.readinto(buffer) ++ ++ if self.is_fuse_driver: ++ temp_tensor = torch.frombuffer(buffer, dtype=memory_obj.get_dtype()).view(shape) ++ memory_obj.tensor.copy_(temp_tensor) ++ + return memory_obj + + # TODO(Jiayi): use memory allocator to redeuce cpu buffer allocation + # TODO(Jiayi): the pinned cpu memory_obj should directly be passed into + # gpu connector; this gpu buffer could be avoided ++ # TODO(Gingfung): the frombuffer tensor copied is needed for FUSE driver ++ # however seems to cause overhead here, will need to optimize further + def load_bytes_from_disk( + self, path: str, dtype: torch.dtype, shape: torch.Size, fmt: MemoryFormat + ) -> Optional[MemoryObj]: +@@ -322,9 +380,26 @@ class LocalDiskBackend(StorageBackendInterface): + if memory_obj is None: + logger.debug("Memory allocation failed during async disk load.") + return None +- buffer = memory_obj.byte_array +- with open(path, "rb") as f: +- f.readinto(buffer) ++ ++ if not self.is_fuse_driver: ++ buffer = memory_obj.byte_array ++ else: ++ # create a temporary buffer ++ buffer = bytearray(memory_obj.get_size()) ++ ++ try: ++ with open(path, "rb") as f: ++ f.readinto(buffer) ++ ++ except Exception as e: ++ logger.error("unable to load KVBytes due to: ", e) ++ self.memory_allocator.ref_count_down(memory_obj) ++ return None ++ ++ if self.is_fuse_driver: ++ temp_tensor = torch.frombuffer(buffer, dtype=memory_obj.get_dtype()).view(shape) ++ memory_obj.tensor.copy_(temp_tensor, non_blocking=True) ++ + return memory_obj + + @_lmcache_nvtx_annotate +diff --git a/lmcache/v1/storage_backend/remote_backend.py b/lmcache/v1/storage_backend/remote_backend.py +index 4c15f9c..a90a417 100644 +--- a/lmcache/v1/storage_backend/remote_backend.py ++++ b/lmcache/v1/storage_backend/remote_backend.py +@@ -168,6 +168,10 @@ class RemoteBackend(StorageBackendInterface): + """ + Callback function for put tasks. + """ ++ exception = future.exception() ++ if exception is not None: ++ logger.error("During put remote, encounter exception: %s", exception) ++ raise exception + self.lock.acquire() + self.put_tasks.remove(key) + self.lock.release() +@@ -221,6 +225,7 @@ class RemoteBackend(StorageBackendInterface): + def get_blocking( + self, + key: CacheEngineKey, ++ **kwargs + ) -> Optional[MemoryObj]: + """ + Blocking get function. 
+@@ -235,7 +240,7 @@ class RemoteBackend(StorageBackendInterface): + key.fmt, key.model_name, key.world_size, 0, key.chunk_hash + ) + t1 = time.perf_counter() +- future = asyncio.run_coroutine_threadsafe(self.connection.get(key), self.loop) ++ future = asyncio.run_coroutine_threadsafe(self.connection.get(key, **kwargs), self.loop) + + try: + memory_obj = future.result(self.blocking_timeout_secs) +diff --git a/lmcache/v1/storage_backend/storage_manager.py b/lmcache/v1/storage_backend/storage_manager.py +index da2b261..7c79dc9 100644 +--- a/lmcache/v1/storage_backend/storage_manager.py ++++ b/lmcache/v1/storage_backend/storage_manager.py +@@ -188,7 +188,7 @@ class StorageManager: + for memory_obj in memory_objs: + memory_obj.ref_count_down() + +- def get(self, key: CacheEngineKey) -> Optional[MemoryObj]: ++ def get(self, key: CacheEngineKey, **kwargs) -> Optional[MemoryObj]: + """ + Blocking function to get the memory object from the storages. + """ +@@ -212,7 +212,7 @@ class StorageManager: + # Search all backends for blocking get + for backend_name, backend in self.storage_backends.items(): + # NOTE(Jiayi): bypass the allocator for now +- memory_obj = backend.get_blocking(key) ++ memory_obj = backend.get_blocking(key, **kwargs) + if memory_obj is not None: + if backend_name not in ["LocalCPUBackend", "NixlBackend"]: + local_cpu_backend = self.storage_backends["LocalCPUBackend"] +diff --git a/lmcache/v1/token_database.py b/lmcache/v1/token_database.py +index 5d026a2..9a8536f 100644 +--- a/lmcache/v1/token_database.py ++++ b/lmcache/v1/token_database.py +@@ -26,7 +26,11 @@ import torch + from lmcache.config import LMCacheEngineMetadata + from lmcache.utils import CacheEngineKey, _lmcache_nvtx_annotate + from lmcache.v1.config import LMCacheEngineConfig ++import numpy as np ++import xxhash + ++from lmcache.logging import init_logger ++logger = init_logger(__name__) + + class TokenDatabase(metaclass=abc.ABCMeta): + """TokenDatabase is used to convert input tokens into list of +@@ -42,8 +46,8 @@ class TokenDatabase(metaclass=abc.ABCMeta): + @abc.abstractmethod + def process_tokens( + self, +- tokens: Union[torch.Tensor, List[int]], +- mask: Optional[torch.Tensor] = None, ++ tokens: Union[np.ndarray, List[int]], ++ mask: Optional[np.ndarray] = None, + make_key: bool = True, + ) -> Iterable[Tuple[int, int, Union[CacheEngineKey, str]]]: + """Process the tokens and return the corresponding cache engine keys. +@@ -90,20 +94,25 @@ class ChunkedTokenDatabase(TokenDatabase): + + def _hash( + self, +- tokens: Union[torch.Tensor, List[int]], ++ tokens: Union[np.ndarray, List[int]], + prefix_hash: str, + ) -> str: + # TODO: change it to a more efficient hash function +- if isinstance(tokens, torch.Tensor): +- tokens_bytes = tokens.cpu().to(torch.uint32).numpy().tobytes() ++ if isinstance(tokens, np.ndarray): ++ tokens_bytes = tokens.tobytes() + elif isinstance(tokens, list): + tokens_bytes = array.array("I", tokens).tobytes() +- return hashlib.sha256(prefix_hash.encode("ascii") + tokens_bytes).hexdigest() ++ else: ++ raise TypeError(f"Unsupported type for tokens: {type(tokens)}") + ++ # # NOTE (GingFung): shouldn't really have collision... 
2^64 tokens after ++ # return xxhash.xxh3_128_hexdigest(prefix_hash.encode("ascii") + tokens_bytes) ++ hashes_ = hashlib.sha256(prefix_hash.encode("ascii") + tokens_bytes).hexdigest() ++ return hashes_ + def _chunk_tokens( + self, +- tokens: Union[torch.Tensor, List[int]], +- ) -> Iterable[Union[torch.Tensor, List[int]]]: ++ tokens: Union[np.ndarray, List[int]], ++ ) -> Iterable[Union[np.ndarray, List[int]]]: + """ + Chunk the tokens into chunks of size self.chunk_size. + +@@ -114,16 +123,16 @@ class ChunkedTokenDatabase(TokenDatabase): + shape [chunk_size] + """ + end = ( +- len(tokens) ++ tokens.size + if self.save_unfull_chunk +- else (len(tokens) - len(tokens) % self.chunk_size) ++ else (tokens.size - tokens.size % self.chunk_size) + ) + for i in range(0, end, self.chunk_size): + yield tokens[i : i + self.chunk_size] + + def _prefix_hash( + self, +- token_chunks: Iterable[Union[torch.Tensor, List[int]]], ++ token_chunks: Iterable[Union[np.ndarray, List[int]]], + ) -> Iterable[str]: + prefix_hash = self._get_init_hash() + for token_chunk in token_chunks: +@@ -133,8 +142,8 @@ class ChunkedTokenDatabase(TokenDatabase): + @_lmcache_nvtx_annotate + def process_tokens( + self, +- tokens: Union[torch.Tensor, List[int]], +- mask: Optional[torch.Tensor] = None, ++ tokens: Union[np.ndarray, List[int]], ++ mask: Optional[np.ndarray] = None, + make_key: bool = True, + ) -> Iterable[Tuple[int, int, Union[CacheEngineKey, str]]]: + """Process the tokens and return the corresponding cache engine keys. +@@ -158,7 +167,7 @@ class ChunkedTokenDatabase(TokenDatabase): + multiple of the chunk size. + """ + if mask is not None: +- num_falses = mask.numel() - mask.long().sum().item() ++ num_falses = mask.size - int(np.sum(mask)) + else: + num_falses = 0 + +@@ -166,7 +175,7 @@ class ChunkedTokenDatabase(TokenDatabase): + raise ValueError( + "The number of Falses in the mask is not a multiple of the chunk size." + ) +- total_len = len(tokens) ++ total_len = tokens.size + + token_chunks = self._chunk_tokens(tokens) + prefix_hashes = self._prefix_hash(token_chunks) +diff --git a/pyproject.toml b/pyproject.toml +index 3c2ed13..9fa2626 100644 +--- a/pyproject.toml ++++ b/pyproject.toml +@@ -5,7 +5,7 @@ requires = [ + "packaging>=24.2", + "setuptools>=77.0.3,<81.0.0", + "setuptools_scm>=8", +- "torch>=2.5.1", ++ # "torch>=2.5.1", + "wheel", + ] + build-backend = "setuptools.build_meta" +@@ -13,8 +13,7 @@ build-backend = "setuptools.build_meta" + [project] + name = "lmcache" + authors = [{name = "LMCache Team", email = "lmcacheteam@gmail.com"}] +-license = "Apache-2.0" +-license-files = ["LICENSE"] ++license = { text = "Apache-2.0" } + readme = "README.md" + description = "A LLM serving engine extension to reduce TTFT and increase throughput, especially under long-context scenarios." 
+ classifiers = [ +diff --git a/requirements/ascend.txt b/requirements/ascend.txt +new file mode 100644 +index 0000000..645bc46 +--- /dev/null ++++ b/requirements/ascend.txt +@@ -0,0 +1,2 @@ ++-r common.txt ++pybind11 +\ No newline at end of file +diff --git a/requirements/build.txt b/requirements/build.txt +index ccb5518..eea6e85 100644 +--- a/requirements/build.txt ++++ b/requirements/build.txt +@@ -4,5 +4,5 @@ ninja + packaging>=24.2 + setuptools>=77.0.3,<81.0.0 + setuptools_scm>=8 +-torch>=2.5.1 # Corresponds to the version used by vLLM main branch ++# torch>=2.5.1 # Corresponds to the version used by vLLM main branch + wheel +diff --git a/requirements/common.txt b/requirements/common.txt +index ba3c3a7..f2cd517 100644 +--- a/requirements/common.txt ++++ b/requirements/common.txt +@@ -4,7 +4,6 @@ aiohttp + infinistore; platform_machine == 'x86_64' + msgspec + numpy +-nvtx + prometheus_client >= 0.18.0 + psutil + pyyaml +@@ -14,5 +13,6 @@ safetensors + setuptools>=77.0.3,<81.0.0 + setuptools_scm>=8 + sortedcontainers +-torch>=2.5.1 # Should correspond to the version used by vLLM main branch ++# torch>=2.5.1 # Should correspond to the version used by vLLM main branch + transformers >= 4.51.1 ++xxhash +diff --git a/setup.py b/setup.py +index 12aad15..3b56457 100644 +--- a/setup.py ++++ b/setup.py +@@ -4,7 +4,17 @@ import os + import sys + + # Third Party +-from setuptools import find_packages, setup ++from setuptools import find_packages, setup, Extension ++from setuptools.command.build_ext import build_ext ++from setuptools.command.develop import develop ++from setuptools.command.install import install ++ ++import logging ++import sysconfig ++import subprocess ++import platform ++import shutil ++ + + ROOT_DIR = Path(__file__).parent + HIPIFY_DIR = os.path.join(ROOT_DIR, "csrc/") +@@ -14,10 +24,159 @@ HIPIFY_OUT_DIR = os.path.join(ROOT_DIR, "csrc_hip/") + # will run python setup.py sdist --dist-dir dist + BUILDING_SDIST = "sdist" in sys.argv or os.environ.get("NO_CUDA_EXT", "0") == "1" + +-# New environment variable to choose between CUDA and HIP +-BUILD_WITH_HIP = os.environ.get("BUILD_WITH_HIP", "0") == "1" ++# Environment variable to choose between CUDA, HIP, Ascend ++TARGET_DEVICE = os.environ.get("LMCACHE_TARGET_DEVICE", "ASCEND") ++ ++USE_TORCH_ENV = bool(os.getenv("USE_TORCH", "0") == "1") ++ ++logging.basicConfig(level=logging.INFO) ++logger = logging.getLogger(__name__) ++ ++def _get_ascend_home_path(): ++ # NOTE: standard Ascend CANN toolkit path ++ return os.environ.get("ASCEND_HOME_PATH", "/usr/local/Ascend/ascend-toolkit/latest") ++ ++def _get_ascend_env_path(): ++ # NOTE: standard Ascend Environment variable setup path ++ env_script_path = os.path.realpath(os.path.join(_get_ascend_home_path(), "..", "set_env.sh")) ++ if not os.path.exists(env_script_path): ++ raise ValueError(f"The file '{env_script_path}' is not found, " ++ "please make sure environment variable 'ASCEND_HOME_PATH' is set correctly.") ++ return env_script_path ++ ++def _get_npu_soc(): ++ _soc_version = os.getenv("SOC_VERSION", None) ++ if _soc_version is None: ++ npu_smi_cmd = [ ++ "bash", ++ "-c", ++ "npu-smi info | grep OK | awk '{print $3}' | head -n 1", ++ ] ++ try: ++ _soc_version = subprocess.check_output(npu_smi_cmd, ++ text=True).strip() ++ _soc_version = _soc_version.split("-")[0] ++ _soc_version = "Ascend"+_soc_version ++ return _soc_version ++ except subprocess.CalledProcessError as e: ++ raise RuntimeError(f"Retrieve SoC version failed: {e}") ++ ++ ++class CMakeExtension(Extension): ++ ++ def 
__init__(self, ++ name: str, ++ cmake_lists_dir: str = ".", ++ **kwargs) -> None: ++ super().__init__(name, sources=[], py_limited_api=False, **kwargs) ++ self.cmake_lists_dir = os.path.abspath(cmake_lists_dir) ++ ++class custom_install(install): ++ def run(self): ++ self.run_command("build_ext") ++ install.run(self) ++ ++class CustomAscendCmakeBuildExt(build_ext): ++ ++ def build_extension(self, ext): ++ # build the so as c_ops ++ ext_name = ext.name.split(".")[-1] ++ so_name = ext_name + ".so" ++ logger.info(f"Building {so_name} ...") ++ OPS_DIR = os.path.join(ROOT_DIR) ++ BUILD_OPS_DIR = os.path.join(ROOT_DIR, "build", "ascend") ++ os.makedirs(BUILD_OPS_DIR, exist_ok=True) ++ ++ ascend_home_path = _get_ascend_home_path() ++ env_path = _get_ascend_env_path() ++ _soc_version = _get_npu_soc() ++ _cxx_compiler = os.getenv("CXX") ++ _cc_compiler = os.getenv("CC") ++ python_executable = sys.executable + ++ try: ++ # if pybind11 is installed via pip ++ pybind11_cmake_path = (subprocess.check_output( ++ [python_executable, "-m", "pybind11", ++ "--cmakedir"]).decode().strip()) ++ except subprocess.CalledProcessError as e: ++ # else specify pybind11 path installed from source code on CI container ++ raise RuntimeError(f"CMake configuration failed: {e}") ++ ++ # import torch_npu ++ # torch_npu_path = os.path.dirname(os.path.abspath(torch_npu.__file__)) ++ # import torch ++ # torch_path = os.path.dirname(os.path.abspath(torch.__file__)) + ++ import mindspore ++ ms_path = os.path.dirname(os.path.abspath(mindspore.__file__)) ++ ++ # python include ++ python_include_path = sysconfig.get_path('include', scheme='posix_prefix') ++ ++ arch = platform.machine() ++ install_path = os.path.join(BUILD_OPS_DIR, 'install') ++ if isinstance(self.distribution.get_command_obj("develop"), develop): ++ install_path=BUILD_OPS_DIR ++ ++ cmake_cmd = [ ++ f"source {env_path} && " ++ f"cmake -S {OPS_DIR} -B {BUILD_OPS_DIR}" ++ f" -DSOC_VERSION={_soc_version}" ++ f" -DARCH={arch}" ++ " -DUSE_ASCEND=1" ++ f" -DPYTHON_EXECUTABLE={python_executable}" ++ f" -DCMAKE_PREFIX_PATH={pybind11_cmake_path}" ++ f" -DCMAKE_BUILD_TYPE=Release" ++ f" -DCMAKE_INSTALL_PREFIX={install_path}" ++ f" -DPYTHON_INCLUDE_PATH={python_include_path}" ++ # f" -DTORCH_NPU_PATH={torch_npu_path}" ++ # f" -DTORCH_PATH={torch_path}" ++ f" -DMS_NPU_DIR={ms_path}" ++ f" -DASCEND_CANN_PACKAGE_PATH={ascend_home_path}" ++ " -DCMAKE_VERBOSE_MAKEFILE=ON" ++ ] ++ ++ if _cxx_compiler is not None: ++ cmake_cmd += [f" -DCMAKE_CXX_COMPILER={_cxx_compiler}"] ++ ++ if _cc_compiler is not None: ++ cmake_cmd += [f" -DCMAKE_C_COMPILER={_cc_compiler}"] ++ ++ cmake_cmd += [f" && cmake --build {BUILD_OPS_DIR} -j --verbose"] ++ cmake_cmd += [f" && cmake --install {BUILD_OPS_DIR}"] ++ cmake_cmd = "".join(cmake_cmd) ++ ++ logger.info(f"Start running CMake commands:\n{cmake_cmd}") ++ try: ++ result = subprocess.run(cmake_cmd, cwd=ROOT_DIR, text=True, shell=True, check=True) ++ except subprocess.CalledProcessError as e: ++ raise RuntimeError(f"Failed to build {so_name}: {e}") ++ ++ build_lib_dir = self.get_ext_fullpath(ext.name) ++ os.makedirs(os.path.dirname(build_lib_dir), exist_ok=True) ++ ++ package_name = ext.name.split('.')[0] # e.g., 'lmcache' ++ src_dir = os.path.join(ROOT_DIR, package_name) ++ ++ for root, _, files in os.walk(install_path): ++ for file in files: ++ if file.endswith(".so"): ++ src_path = os.path.join(root, file) ++ dst_path = os.path.join(os.path.dirname(build_lib_dir), file) ++ if os.path.exists(dst_path): ++ os.remove(dst_path) ++ ++ if 
isinstance(self.distribution.get_command_obj("develop"), develop): ++ # For the ascend kernels ++ src_dir_file = os.path.join(src_dir, file) ++ shutil.copy(src_path, src_dir_file) ++ shutil.copy(src_path, dst_path) ++ ++ logger.info(f"Copied {file} to {dst_path}") ++ ++ ++ + def hipify_wrapper() -> None: + # Third Party + from torch.utils.hipify.hipify_python import hipify +@@ -58,7 +217,7 @@ def hipify_wrapper() -> None: + + + # Taken from https://github.com/vllm-project/vllm/blob/main/setup.py +-def get_requirements() -> list[str]: ++def get_requirements(target_device) -> list[str]: + """Get Python package dependencies from requirements.txt.""" + requirements_dir = ROOT_DIR / "requirements" + +@@ -76,8 +235,15 @@ def get_requirements() -> list[str]: + ): + resolved_requirements.append(line) + return resolved_requirements +- +- requirements = _read_requirements("common.txt") ++ ++ if target_device == "EMPTY": ++ requirements = _read_requirements("common.txt") ++ elif target_device == "CUDA" or target_device == "HIP": ++ requirements = _read_requirements("cuda.txt") ++ elif target_device == "ASCEND": ++ requirements = _read_requirements("ascend.txt") ++ else: ++ raise ValueError(f"Unknown target device: {target_device}") + return requirements + + +@@ -158,6 +324,12 @@ def rocm_extension() -> tuple[list, dict]: + return ext_modules, cmdclass + + ++def ascend_extension(): ++ print("Building Ascend extensions") ++ return [CMakeExtension(name="lmcache.lmcache_C")], \ ++ {"build_ext": CustomAscendCmakeBuildExt} ++ ++ + def source_dist_extension() -> tuple[list, dict]: + print("Not building CUDA/HIP extensions for sdist") + return [], {} +@@ -166,19 +338,31 @@ def source_dist_extension() -> tuple[list, dict]: + if __name__ == "__main__": + if BUILDING_SDIST: + get_extension = source_dist_extension +- elif BUILD_WITH_HIP: ++ TARGET_DEVICE = "EMPTY" ++ elif TARGET_DEVICE == "HIP": + get_extension = rocm_extension +- else: ++ elif TARGET_DEVICE == "CUDA": + get_extension = cuda_extension +- ++ elif TARGET_DEVICE == "ASCEND": ++ get_extension = ascend_extension ++ + ext_modules, cmdclass = get_extension() ++ requirments_list_origin = get_requirements(TARGET_DEVICE) ++ ++ if USE_TORCH_ENV: ++ requirments_list = requirments_list_origin ++ else: ++ requirments_list = ["mindspore >= 2.6.0" if s.startswith("torch") else s for s in requirments_list_origin] ++ + ++ logger.info("Python package dependencies: %s", requirments_list) + setup( + packages=find_packages( + exclude=("csrc",) + ), # Ensure csrc is excluded if it only contains sources +- install_requires=get_requirements(), ++ install_requires=requirments_list, + ext_modules=ext_modules, + cmdclass=cmdclass, + include_package_data=True, ++ package_data={ "lmcache": ['*.so'] } + ) +diff --git a/tests/v1/test_ms_mem_kernels.py b/tests/v1/test_ms_mem_kernels.py +new file mode 100644 +index 0000000..b032961 +--- /dev/null ++++ b/tests/v1/test_ms_mem_kernels.py +@@ -0,0 +1,190 @@ ++import random ++from typing import List ++ ++import torch ++import numpy as np ++ ++from lmcache import lmcache_C ++from mindspore import Tensor, nn, mutable, mint ++from lmcache.experimental.memory_management import MixedMemoryAllocator ++import debugpy ++import acl ++import time ++import mindspore as ms ++ms.set_context(mode=ms.PYNATIVE_MODE) ++ms.set_context(debug_level=ms.context.DEBUG) ++ ++ ++def generate_kv_cache_tuple_paged_list_tensors( ++ num_blocks, ++ device, ++ block_size=16, ++ num_layers=32, ++ num_heads=8, ++ head_size=128, ++ kvs_dim=2, ++ dtype=torch.bfloat16 
++): ++ """ ++ Return List[Tuple[Tensor, Tensor]] ++ """ ++ shape = [num_blocks, block_size, num_heads, head_size] ++ # calculate and print size ++ size = np.prod(shape) * 2 * num_layers * kvs_dim ++ print(f"KVCache size: {size / 1024 / 1024} MB") ++ kvcaches = [] ++ for i in range(num_layers): ++ current_cache = [] ++ for _ in range(kvs_dim): ++ t = mint.randn(shape, dtype=dtype) ++ t = t.move_to(device) ++ current_cache.append(mutable(t)) ++ kvcaches.append(mutable(tuple(current_cache))) ++ return mutable(kvcaches) ++ ++def check_paged_tuple_kv_cache_equal(left, ++ right, ++ num_tokens, ++ slot_mapping, ++ num_heads=8, ++ head_size=128): ++ """ ++ check whether two paged kv caches are the same at slot_mapping ++ """ ++ token_dim = 0 ++ for left_kv, right_kv in zip(left, right): ++ left_k = left_kv[0].reshape(-1, num_heads, head_size) ++ left_v = left_kv[1].reshape(-1, num_heads, head_size) ++ right_k = right_kv[0].reshape(-1, num_heads, head_size) ++ right_v = right_kv[1].reshape(-1, num_heads, head_size) ++ ++ assert len(left_k.shape) == 3 ++ assert len(left_v.shape) == 3 ++ assert len(right_k.shape) == 3 ++ assert len(right_v.shape) == 3 ++ ++ assert left_k.shape[token_dim] >= num_tokens ++ assert left_v.shape[token_dim] >= num_tokens ++ assert right_k.shape[token_dim] >= num_tokens ++ assert right_v.shape[token_dim] >= num_tokens ++ ++ # 1. Convert MindSpore Tensors to NumPy arrays ++ left_k_np = left_k.asnumpy() ++ right_k_np = right_k.asnumpy() ++ slot_mapping_np = slot_mapping.asnumpy() ++ ++ comp_left_k = left_k_np[slot_mapping_np, :, :] ++ comp_right_k = right_k_np[slot_mapping_np, :, :] ++ result_np_k = ( comp_left_k == comp_right_k ).all() ++ assert result_np_k ++ ++ left_v_np = left_v.asnumpy() ++ right_v_np = right_v.asnumpy() ++ ++ comp_left_v = left_v_np[slot_mapping_np, :, :] ++ comp_right_v = right_v_np[slot_mapping_np, :, :] ++ result_np_v = ( comp_left_v == comp_right_v ).all() ++ assert result_np_v ++ return True ++ ++ ++ ++def test_ms_multi_layer_kernel(mem_allocator, num_tokens, layers=60, kvs=2, ++ head_size=128, num_heads=10, ++ chunk_size=256, block_size=64): ++ device = "Ascend" ++ num_blocks = (num_tokens // block_size) + 100 ++ assert num_tokens >= chunk_size and num_tokens % chunk_size == 0 ++ dtype = torch.bfloat16 ++ kv_cache = generate_kv_cache_tuple_paged_list_tensors( ++ num_blocks, ++ device, ++ block_size=block_size, ++ num_layers=layers, ++ num_heads=num_heads, ++ head_size=head_size, ++ dtype=dtype ++ ) ++ slot_mapping = np.arange(0, num_tokens) ++ slot_mapping = torch.tensor(slot_mapping, device=device, dtype=torch.int32) ++ ++ # New extract with multi layer kernel ++ t1 = time.perf_counter() ++ kv_cache_pointers_dev = torch.empty([layers, kvs], ++ dtype=torch.int64, ++ device='Ascend', ++ pin_memory=True) ++ t2 = time.perf_counter() ++ kv_cache_pointers_cpus = torch.Tensor([[kvc.data_ptr() for kvc in layer_kv ]for layer_kv in kv_cache]) ++ kv_cache_pointers_dev.copy_(kv_cache_pointers_cpus, non_blocking=False) ++ t3 = time.perf_counter() ++ print("Cost cpu alloc: ", (t2-t1)*1000, "Cost KVCache ptrs: ", (t3 - t2)*1000) ++ ++ memory_obj_new_list = [] ++ start_event = torch.cuda.Event(enable_timing=True) ++ end_event = torch.cuda.Event(enable_timing=True) ++ start_event.record() ++ slot_mapping_chunked = torch.split(slot_mapping, chunk_size) ++ for chunk_id, slot_mapping_temp in enumerate(slot_mapping_chunked): ++ mem_obj_shape = [kvs, layers, len(slot_mapping_temp), num_heads * head_size] ++ memory_obj_new = mem_allocator.allocate(mem_obj_shape, 
dtype) ++ lmcache_C.paged_layers_kv_transfer(memory_obj_new.tensor, ++ kv_cache_pointers_dev, slot_mapping_temp, ++ memory_obj_new.base_ptr, num_blocks, ++ block_size, kvs, True, True) ++ memory_obj_new_list.append(memory_obj_new) ++ end_event.record() ++ torch.cuda.synchronize() ++ elapsed_time_ms = start_event.elapsed_time(end_event) ++ print("extract time: ", elapsed_time_ms / 1000) ++ ++ kv_cache_new = generate_kv_cache_tuple_paged_list_tensors( ++ num_blocks, ++ device, ++ block_size=block_size, ++ num_layers=layers, ++ num_heads=num_heads, ++ head_size=head_size, ++ dtype=dtype ++ ) ++ ++ kv_cache_pointers_dev_new = torch.empty([layers, kvs], ++ dtype=torch.int64, ++ device='Ascend', ++ pin_memory=True) ++ kv_cache_pointers_cpus_new = torch.Tensor([[kvc.data_ptr() for kvc in layer_kv ]for layer_kv in kv_cache_new]) ++ kv_cache_pointers_dev_new.copy_(kv_cache_pointers_cpus_new, non_blocking=False) ++ torch.cuda.synchronize() ++ for chunk_id, slot_mapping_temp in enumerate(slot_mapping_chunked): ++ memory_obj_new = memory_obj_new_list[chunk_id] ++ ++ lmcache_C.paged_layers_kv_transfer(memory_obj_new.tensor, ++ kv_cache_pointers_dev_new, slot_mapping_temp, ++ memory_obj_new.base_ptr, num_blocks, ++ block_size, kvs, False, True) ++ ++ valid_check_t1 = time.perf_counter() ++ valid = check_paged_tuple_kv_cache_equal( ++ kv_cache, ++ kv_cache_new, ++ num_tokens, ++ slot_mapping, ++ num_heads=num_heads, ++ head_size=head_size ++ ) ++ valid_check_time = (time.perf_counter() - valid_check_t1) * 1000 ++ print(valid, valid_check_time) ++ ++ del kv_cache_pointers_dev, kv_cache_pointers_dev_new, kv_cache, kv_cache_new ++ ms.runtime.synchronize() ++ ms.runtime.empty_cache() ++ ++if __name__ == "__main__": ++ # debugpy.listen(('0.0.0.0', 5678)) ++ # debugpy.wait_for_client() ++ pinned_cpu_size = 30 * 1024 * 1024 * 1024 ++ mem_allocator = MixedMemoryAllocator(pinned_cpu_size) ++ ++ for token in [8192]: ++ test_ms_multi_layer_kernel(mem_allocator, token) ++ time.sleep(10) +\ No newline at end of file diff --git a/fix-build-requirements-for-ascend.patch b/fix-build-requirements-for-ascend.patch new file mode 100644 index 0000000000000000000000000000000000000000..79ab28970a2b300ee2b2388e3bfdb9ec9a95283b --- /dev/null +++ b/fix-build-requirements-for-ascend.patch @@ -0,0 +1,54 @@ +diff --git a/pyproject.toml b/pyproject.toml +index 9a1c936..3c2ed13 100644 +--- a/pyproject.toml ++++ b/pyproject.toml +@@ -5,7 +5,7 @@ requires = [ + "packaging>=24.2", + "setuptools>=77.0.3,<81.0.0", + "setuptools_scm>=8", +- "torch==2.7.0", ++ "torch>=2.5.1", + "wheel", + ] + build-backend = "setuptools.build_meta" +diff --git a/requirements/build.txt b/requirements/build.txt +index a2db38b..ccb5518 100644 +--- a/requirements/build.txt ++++ b/requirements/build.txt +@@ -4,5 +4,5 @@ ninja + packaging>=24.2 + setuptools>=77.0.3,<81.0.0 + setuptools_scm>=8 +-torch==2.7.0 # Corresponds to the version used by vLLM main branch ++torch>=2.5.1 # Corresponds to the version used by vLLM main branch + wheel +diff --git a/requirements/common.txt b/requirements/common.txt +index 9894e01..ba3c3a7 100644 +--- a/requirements/common.txt ++++ b/requirements/common.txt +@@ -1,8 +1,7 @@ + aiofile + aiofiles + aiohttp +-cufile-python +-infinistore ++infinistore; platform_machine == 'x86_64' + msgspec + numpy + nvtx +@@ -15,5 +14,5 @@ safetensors + setuptools>=77.0.3,<81.0.0 + setuptools_scm>=8 + sortedcontainers +-torch==2.7.0 # Should correspond to the version used by vLLM main branch ++torch>=2.5.1 # Should correspond to the version used by 
vLLM main branch + transformers >= 4.51.1 +diff --git a/requirements/cuda.txt b/requirements/cuda.txt +index 719261d..4912df9 100644 +--- a/requirements/cuda.txt ++++ b/requirements/cuda.txt +@@ -9,3 +9,4 @@ nvidia-ml-py # for pynvml package + torch == 2.7.0 + torchvision == 0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version + xformers == 0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.7.0 ++cufile-python diff --git a/fix-dependance-issue-on-ascend.patch b/fix-dependance-issue-on-ascend.patch deleted file mode 100644 index 05a158638d383324086c991815ebf5b095df5c68..0000000000000000000000000000000000000000 --- a/fix-dependance-issue-on-ascend.patch +++ /dev/null @@ -1,13 +0,0 @@ -diff --git a/setup.py b/setup.py -index 5b1aa4b..1895953 100644 ---- a/setup.py -+++ b/setup.py -@@ -17,7 +17,7 @@ setup( - "nvtx", - "safetensors", - "transformers", -- "torchac_cuda >= 0.2.5", -+ # "torchac_cuda >= 0.2.5", - ], - classifiers=[ - # Trove classifiers diff --git a/lmcache.spec b/lmcache.spec index c1dc836e6d611c2048e6a0ea79c5db825bb39c0a..6791f1ce8ef5bdba599247a76d8b83c281cbf5f7 100644 --- a/lmcache.spec +++ b/lmcache.spec @@ -1,20 +1,19 @@ %define debug_package %{nil} Name: python-LMCache -Version: 0.1.4.alpha +Version: 0.3.1.post1 Release: 1 Summary: LMCache - A lightweight memory cache system License: Apache 2.0 URL: https://github.com/LMCache/LMCache Source0: LMCache-%{version}.tar.gz -Patch0: fix-dependance-issue-on-ascend.patch - -BuildArch: noarch +Patch0: fix-build-requirements-for-ascend.patch +Patch1: adapt-mindspore.patch +BuildArch: aarch64 BuildRequires: python3 BuildRequires: python3-setuptools python3-pip - Requires: python3 %description @@ -25,17 +24,26 @@ simple key-value storage with minimal overhead. %autosetup -n %{name}-%{version} -p1 -Sgit %build -ln -s /usr/bin/python3 /usr/bin/python || : +# ln -s /usr/bin/python3 /usr/bin/python || : + +export CPLUS_INCLUDE_PATH=/usr/include/c++/12/aarch64-openEuler-linux:/usr/include/c++/12:$CPLUS_INCLUDE_PATH +export LMCACHE_TARGET_DEVICE="ASCEND" +export USE_TORCH=0 +export PYTHONPATH=/usr/local/lib64/python3.11/site-packages:/usr/local/lib/python3.11/site-packages:/usr/lib/python3.11/site-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:$LD_LIBRARY_PATH + %py3_build + %install %py3_install + %files -%{python3_sitelib}/* +/usr/lib64/python3.11/site-packages/* %{_bindir}/* -%changelog -* Tue Jun 24 2025 jingjunyuan - 0.1.4.alpha-1 -- Initial package build +%changelog +* Thu Jul 24 2025 jingjunyuan - 0.3.1.post1-1 +- Initial package build \ No newline at end of file
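For reviewers, a minimal usage sketch of the NPU paged-memory connector that adapt-mindspore.patch introduces. The import path and class name are assumptions for illustration only; the keyword arguments mirror the ones the constructor asserts on above (kv_is_tuple, kv_size, chunk_size, dtype, device), and running it requires a MindSpore/MSAdapter environment on an Ascend device.

# Hypothetical usage sketch (not part of the patch); names marked "assumed"
# are illustrative only.
import torch  # resolves to the MSAdapter torch shim when USE_TORCH=0

from lmcache.v1.gpu_connector import VLLMPagedMemNPUConnector  # assumed module/class name

connector = VLLMPagedMemNPUConnector(
    hidden_dim_size=8 * 128,   # num_heads * head_size
    num_layers=32,
    use_npu=True,
    kv_is_tuple=True,          # MindSpore tuple KV layout
    kv_size=2,                 # K and V
    chunk_size=256,            # max tokens copied per chunk
    dtype=torch.bfloat16,
    device="Ascend",
)

# Shape of the memory object produced for a 256-token chunk:
# torch.Size([2, 32, 256, 1024]), i.e. the KV_BLOB layout
# [kv_size, num_layers, num_tokens, hidden_dim_size].
print(connector.get_shape(256))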