diff --git a/0001-npu-picked-support.patch b/0001-npu-picked-support.patch new file mode 100644 index 0000000000000000000000000000000000000000..e22bb123bb9a670db883f53e2ba388a29aecd63e --- /dev/null +++ b/0001-npu-picked-support.patch @@ -0,0 +1,3380 @@ +diff --git a/csrc/ascend/CMakeLists.txt b/csrc/ascend/CMakeLists.txt +new file mode 100644 +index 0000000..b434723 +--- /dev/null ++++ b/csrc/ascend/CMakeLists.txt +@@ -0,0 +1,130 @@ ++# Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. ++ ++# CMake lowest version requirement ++cmake_minimum_required(VERSION 3.16.0) ++# project information ++project(c_ops) ++ ++set(CMAKE_CXX_STANDARD 17) ++include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake) ++ ++find_package(Python3 COMPONENTS Interpreter Development REQUIRED) ++set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11") ++find_package(pybind11 REQUIRED) ++ ++append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") ++set(LMC_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}") ++ ++set(SOC_VERSION ${SOC_VERSION}) ++set(ARCH ${ARCH}) ++ ++if (NOT CMAKE_BUILD_TYPE) ++ set(CMAKE_BUILD_TYPE "Release" CACHE STRINGS "Build type Release/Debug (default Release)" FORCE) ++endif() ++ ++set(ASCEND_HOME_PATH ${ASCEND_CANN_PACKAGE_PATH}) ++if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) ++ set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) ++elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) ++ set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) ++elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) ++ set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) ++else() ++ message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the cann package is installed.") ++endif() ++ ++include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) ++ ++# ${KERNEL_FILES} are used to compile library, push files written by ascendc in ${KERNEL_FILES}. 
++# ref to cmake/npu.cmake ascendc_library, cmake/cpu.cmake add_library ++file(GLOB KERNEL_FILES ++${CMAKE_CURRENT_SOURCE_DIR}/kernels/*.cpp) ++ ++message(STATUS "kernel files: ${KERNEL_FILES}") ++ ++# ascendc_library use to add kernel file to generate ascendc library ++ascendc_library(ascend_kernels SHARED ++ ${KERNEL_FILES} ++) ++ ++message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}") ++ ++file(GLOB SRC_FILES ++${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) ++ ++set(PYBIND11_SOURCE_FILE ++${CMAKE_CURRENT_SOURCE_DIR}/mem_kernels.cpp ++${CMAKE_CURRENT_SOURCE_DIR}/managed_mem.cpp ++${CMAKE_CURRENT_SOURCE_DIR}/../pybind.cpp ++) ++ ++find_package(Torch REQUIRED) ++ ++include_directories( ++ ${CMAKE_CURRENT_SOURCE_DIR} ++ ${pybind11_INCLUDE_DIRS} ++ ${PYTHON_INCLUDE_PATH} ++ ${TORCH_INCLUDE_DIRS} ++ ${TORCH_NPU_PATH}/include ++ ${ASCEND_HOME_PATH}/include ++ ${ASCEND_HOME_PATH}/aarch64-linux/ascendc/include ++ ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform ++ ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/ascend_hal ++ ${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform ++ ${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/ascend_hal ++) ++ ++ ++set( ++ INCLUDES ++ ${TORCH_INCLUDE_DIRS} ++ ${TORCH_NPU_PATH}/include ++ ${ASCEND_HOME_PATH}/include ++ ${ASCEND_HOME_PATH}/aarch64-linux/ascendc/include ++ ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform ++ ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/ascend_hal ++) ++ ++set(PYMODULE_FILES ++ ${SRC_FILES} ++ ${PYBIND11_SOURCE_FILE} ++) ++ ++pybind11_add_module(c_ops ${PYMODULE_FILES}) ++ ++message(STATUS "CMake: Adding -DUSE_ASCEND compile definition.") ++target_compile_definitions(c_ops PRIVATE USE_ASCEND) ++ ++set(TORCH_NPU_LIBS_DIR "${TORCH_NPU_PATH}/lib") ++set(ASCEND_CANN_LIBS_DIR "${ASCEND_HOME_PATH}/lib64") ++set(TORCH_LIBS_DIR "${TORCH_PATH}/lib") ++ ++ ++target_link_options(c_ops PRIVATE ++ "-Wl,-rpath,$ORIGIN:$ORIGIN/lib" ++ "-Wl,-rpath,${LMC_INSTALL_PATH}" ++) ++ ++target_link_directories( ++ c_ops ++ PRIVATE ++ ${TORCH_LIBS_DIR} ++ ${TORCH_NPU_PATH}/lib/ ++ ${ASCEND_HOME_PATH}/lib64 ++ ${ASCEND_DRIVER_PATH}/lib64/driver ++) ++ ++target_link_libraries( ++ c_ops ++ PUBLIC ++ ${TORCH_LIBRARIES} ++ libtorch_npu.so ++ ascend_kernels ++ ascendcl ++ platform ++ ascend_hal ++ tiling_api ++) ++ ++ ++install(TARGETS c_ops ascend_kernels DESTINATION ${LMC_INSTALL_PATH}) +diff --git a/csrc/ascend/cachegen_kernels.cpp b/csrc/ascend/cachegen_kernels.cpp +new file mode 100644 +index 0000000..fdf865f +--- /dev/null ++++ b/csrc/ascend/cachegen_kernels.cpp +@@ -0,0 +1,32 @@ ++#include "cachegen_kernels.h" ++#include ++#include ++ ++namespace py = pybind11; ++ ++void encode_cuda_new(const at::Tensor& cdf, const at::Tensor& input_sym, ++ at::Tensor& output_buffer, at::Tensor& output_lengths) { ++ // TODO: ++ PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend."); ++ throw py::error_already_set(); ++}; ++ ++void decode_cuda_new(const at::Tensor& cdf, const at::Tensor& bytestreams, ++ const at::Tensor& lengths, at::Tensor& output) { ++ // TODO: ++ PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend."); ++ throw py::error_already_set(); ++}; ++ ++void decode_cuda_prefsum(const at::Tensor& cdf, const at::Tensor& bytestreams, ++ const at::Tensor& lengths, at::Tensor& output) { ++ // TODO: ++ PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend."); ++ throw py::error_already_set(); ++}; ++ ++at::Tensor calculate_cdf(const at::Tensor& input, const int max_bins) { ++ 
// TODO: ++ PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend."); ++ throw py::error_already_set(); ++}; +\ No newline at end of file +diff --git a/csrc/ascend/cachegen_kernels.h b/csrc/ascend/cachegen_kernels.h +new file mode 100644 +index 0000000..d1a1701 +--- /dev/null ++++ b/csrc/ascend/cachegen_kernels.h +@@ -0,0 +1,16 @@ ++#pragma once ++#include ++#include ++#include ++#include ++ ++void encode_cuda_new(const at::Tensor& cdf, const at::Tensor& input_sym, ++ at::Tensor& output_buffer, at::Tensor& output_lengths); ++ ++void decode_cuda_new(const at::Tensor& cdf, const at::Tensor& bytestreams, ++ const at::Tensor& lengths, at::Tensor& output); ++ ++void decode_cuda_prefsum(const at::Tensor& cdf, const at::Tensor& bytestreams, ++ const at::Tensor& lengths, at::Tensor& output); ++ ++at::Tensor calculate_cdf(const at::Tensor& input, const int max_bins); +\ No newline at end of file +diff --git a/csrc/ascend/kernels/load_and_reshape_flash.cpp b/csrc/ascend/kernels/load_and_reshape_flash.cpp +new file mode 100644 +index 0000000..7e04e6f +--- /dev/null ++++ b/csrc/ascend/kernels/load_and_reshape_flash.cpp +@@ -0,0 +1,264 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++#include "kernel_operator.h" ++#include ++#include "types.h" ++#include "utils.h" ++ ++template class LoadAndReshapeFlashCopy { ++ using local_scalar_t = AscendC::LocalTensor; ++ ++public: ++ __aicore__ inline LoadAndReshapeFlashCopy() ++ { ++ } ++ ++ __aicore__ inline void init(GM_ADDR cacheTensor, GM_ADDR keyCachePtr, GM_ADDR valueCachePtr, GM_ADDR slotmappings, ++ const int64_t numPages, const int64_t hiddenDims, const int32_t pagedSize, ++ const int32_t numTokens, const int32_t numLayers, const int32_t layerIdx, ++ const bool page2L, AscendC::TPipe *pipe) ++ { ++ this->pipe_ = pipe; ++ this->numPages_ = numPages; ++ this->hiddenDims_ = hiddenDims; ++ this->numTokens_ = numTokens; ++ this->pagedSize_ = pagedSize; ++ this->numLayers_ = numLayers; ++ this->layerIdx_ = layerIdx; ++ this->valid_ = true; ++ this->page2L_ = page2L; ++ ++ // TODO: Not sure how many to allocate, but let's do 4 blocks of hiddenDims_ ++ // if it was fp16, 2048, we would get 16kb.? ++ // should check whether hiddenDims_ is > 192KB. 
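++        // Worked example with illustrative numbers: for hiddenDims_ = 2048 and fp16
++        // (2 bytes per element), each queue block is 2048 * 2 = 4 KiB, so the 4 blocks
++        // requested below take 16 KiB of unified buffer, well under the 192 KiB noted above.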
++ this->pipe_->InitBuffer(this->pagedTokenQue_, 4, this->hiddenDims_*sizeof(scalar_t)); ++ } ++ ++ __aicore__ inline void reset(){ ++ this->valid_ = true; ++ } ++ ++ __aicore__ inline void updateTensorMemOffsetAndProcess(__gm__ uint8_t *pagedKeyTensor, ++ __gm__ uint8_t *pagedValueTensor, ++ __gm__ uint8_t* nonPagedTensor, ++ __gm__ uint8_t *slotmappings, const int tokenIdx) ++ { ++ __gm__ slot_t *slotmappingPtr = reinterpret_cast<__gm__ slot_t*>(slotmappings); ++ int64_t slot = static_cast(slotmappingPtr[tokenIdx]); ++ ++ if (slot == -1) { ++ this->valid_ = false; ++ return; ++ } ++ ++ // for the page tensor ++ int64_t pagedIdxOffset = slot * this->hiddenDims_; ++ ++ // for the lmc tensor ++ int64_t nonPagedKeyOffset = this->layerIdx_ * this->numTokens_ * this->hiddenDims_ + ++ tokenIdx * this->hiddenDims_; ++ ++ // values are stored after keys in the non-paged tensor ++ int64_t nonPagedValueOffset = this->numLayers_ * this->numTokens_ * this->hiddenDims_ + ++ this->layerIdx_ * this->numTokens_ * this->hiddenDims_ + ++ tokenIdx * this->hiddenDims_; ++ ++ // keys ++ this->keyTokensGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(pagedKeyTensor) + pagedIdxOffset, ++ this->hiddenDims_); ++ this->lmcBufferKeyGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(nonPagedTensor) + nonPagedKeyOffset, ++ this->hiddenDims_); ++ // values ++ this->valueTokensGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(pagedValueTensor) + pagedIdxOffset, ++ this->hiddenDims_); ++ this->lmcBufferValueGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(nonPagedTensor) + nonPagedValueOffset, ++ this->hiddenDims_); ++ } ++ ++ __aicore__ inline void processFunc() { ++ if (!this->valid_) { ++ return; ++ } ++ // 1. Alloc Tensor for local page ++ local_scalar_t hiddenKeysDimTensor = this->pagedTokenQue_.template AllocTensor(); ++ local_scalar_t hiddenValuesDimTensor = this->pagedTokenQue_.template AllocTensor();; ++ ++ // 2. copy from global tensor into local (GM -> UB) ++ if (this->page2L_) { ++ AscendC::DataCopy(hiddenKeysDimTensor, this->keyTokensGlobal_, this->hiddenDims_); ++ AscendC::DataCopy(hiddenValuesDimTensor, this->valueTokensGlobal_, this->hiddenDims_); ++ } else { ++ AscendC::DataCopy(hiddenKeysDimTensor, this->lmcBufferKeyGlobal_, this->hiddenDims_); ++ AscendC::DataCopy(hiddenValuesDimTensor, this->lmcBufferValueGlobal_, this->hiddenDims_); ++ } ++ ++ // 3. enque vecin ++ pagedTokenQue_.EnQue(hiddenKeysDimTensor); ++ pagedTokenQue_.EnQue(hiddenValuesDimTensor); ++ ++ // 4. deque vecin, possible to reuse due to QueBind ++ hiddenKeysDimTensor = pagedTokenQue_.DeQue(); ++ hiddenValuesDimTensor = pagedTokenQue_.DeQue(); ++ ++ // 5. datacopy into GM ++ if (this->page2L_) { ++ AscendC::DataCopy(this->lmcBufferKeyGlobal_, hiddenKeysDimTensor, this->hiddenDims_); ++ AscendC::DataCopy(this->lmcBufferValueGlobal_, hiddenValuesDimTensor, this->hiddenDims_); ++ } else { ++ AscendC::DataCopy(this->keyTokensGlobal_, hiddenKeysDimTensor, this->hiddenDims_); ++ AscendC::DataCopy(this->valueTokensGlobal_, hiddenValuesDimTensor, this->hiddenDims_); ++ } ++ // 6. free alloced Tensor ++ pagedTokenQue_.FreeTensor(hiddenKeysDimTensor); ++ pagedTokenQue_.FreeTensor(hiddenValuesDimTensor); ++ } ++ ++private: ++ AscendC::TPipe *pipe_; ++ AscendC::TQueBind pagedTokenQue_; ++ ++ // [numPages, pagedSize, heads*headsize] ++ AscendC::GlobalTensor keyTokensGlobal_; ++ AscendC::GlobalTensor valueTokensGlobal_; ++ ++ // Depends on LMC setting whether we store in tokensMajor or not. 
++ // the layout would be the followings: ++ // [tokens, kvs, heads*headsize] or [kvs, tokens, heads*headsize] ++ // TODO: check whether should combine the two and use a loop ++ AscendC::GlobalTensor lmcBufferKeyGlobal_; ++ AscendC::GlobalTensor lmcBufferValueGlobal_; ++ ++ int64_t numPages_; // num vllm npu blocks ++ int32_t pagedSize_; // per npu block tokens ++ int64_t hiddenDims_; // heads * headsize ++ int32_t numTokens_; // num tokens in the cache tensor chunk ++ int32_t numLayers_; // num layers in the cache tensor ++ int32_t layerIdx_; // layer idx in the cache tensor ++ bool valid_; ++ bool page2L_; // true, from pagedTensor to LMC, false otherwise ++}; ++ ++#define LOAD_AND_RESHAPE_FLASH_COPY_TYPE_DECLARE(TYPE, SLOTTYPE) \ ++ extern "C" __global__ __aicore__ void load_and_reshape_flash_copy_##TYPE##_##SLOTTYPE( \ ++ __gm__ uint8_t* dstCacheTensor, __gm__ uint8_t* keyCachePtr, __gm__ uint8_t* valueCachePtr, \ ++ __gm__ uint8_t* slotmappings, const int64_t hiddenDims, const int64_t numPages, const int32_t pagedSize, \ ++ const int32_t numTokens, const int32_t numLayers, const int32_t layerIdx, const bool page2L, \ ++ const int blockNum) \ ++ { \ ++ AscendC::TPipe pipe; \ ++ LoadAndReshapeFlashCopy op{}; \ ++ op.init(dstCacheTensor, keyCachePtr, valueCachePtr, slotmappings, numPages, hiddenDims, pagedSize, \ ++ numTokens, numLayers, layerIdx, page2L, &pipe); \ ++ int64_t bIdx = AscendC::GetBlockIdx(); \ ++ for (int64_t i = bIdx; i < numTokens; i+=blockNum) \ ++ { \ ++ op.reset(); \ ++ op.updateTensorMemOffsetAndProcess(keyCachePtr, valueCachePtr, dstCacheTensor, slotmappings, i); \ ++ op.processFunc(); \ ++ } \ ++ } ++ ++// Declare support kernel entry ++LOAD_AND_RESHAPE_FLASH_COPY_TYPE_DECLARE(half, int32_t); ++LOAD_AND_RESHAPE_FLASH_COPY_TYPE_DECLARE(half, int64_t); ++LOAD_AND_RESHAPE_FLASH_COPY_TYPE_DECLARE(bfloat16_t, int32_t); ++LOAD_AND_RESHAPE_FLASH_COPY_TYPE_DECLARE(bfloat16_t, int64_t); ++LOAD_AND_RESHAPE_FLASH_COPY_TYPE_DECLARE(int8_t, int32_t); ++LOAD_AND_RESHAPE_FLASH_COPY_TYPE_DECLARE(int8_t, int64_t); ++ ++namespace lmc { ++ ++#define LOAD_AND_RESHAPE_FLASH_COPY_KERNEL_CALL(TYPE, SLOTTYPE) \ ++ load_and_reshape_flash_copy_##TYPE##_##SLOTTYPE<<>>(dstCacheTensor, keyCachePtr, \ ++ valueCachePtr, slotmappings, hiddenDims, numPages, pagedSize, \ ++ numTokens, numLayers, layerIdx, page2L, blockDim); ++ ++template ++void load_and_reshape_kernel_call(uint32_t blockDim, void *stream, uint8_t *dstCacheTensor, uint8_t *keyCachePtr, ++ uint8_t *valueCachePtr, uint8_t *slotmappings, const int64_t hiddenDims, const int64_t numPages, ++ const int32_t pagedSize, const int32_t numTokens, const int32_t numLayers, ++ const int32_t layerIdx, const bool page2L); ++ ++ ++#define LOAD_AND_RESHAPE_KERNEL_CALL_TYPE_DECLARE(TYPE, SLOTTYPE) \ ++template<> \ ++void load_and_reshape_kernel_call(uint32_t blockDim, void *stream, uint8_t *dstCacheTensor, \ ++ uint8_t *keyCachePtr, uint8_t *valueCachePtr, uint8_t *slotmappings, \ ++ const int64_t hiddenDims, const int64_t numPages, \ ++ const int32_t pagedSize, const int32_t numTokens, \ ++ const int32_t numLayers, const int32_t layerIdx, \ ++ const bool page2L) { \ ++ LOAD_AND_RESHAPE_FLASH_COPY_KERNEL_CALL(TYPE, SLOTTYPE); \ ++} ++ ++LOAD_AND_RESHAPE_KERNEL_CALL_TYPE_DECLARE(half, int32_t); ++LOAD_AND_RESHAPE_KERNEL_CALL_TYPE_DECLARE(half, int64_t); ++LOAD_AND_RESHAPE_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int32_t); ++LOAD_AND_RESHAPE_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int64_t); ++LOAD_AND_RESHAPE_KERNEL_CALL_TYPE_DECLARE(int8_t, int32_t); 
++LOAD_AND_RESHAPE_KERNEL_CALL_TYPE_DECLARE(int8_t, int64_t); ++ ++template ++void dispatch_on_slot_type(vllm_ascend::AscendType slotType, uint32_t blockDim, void *stream, ++ uint8_t *dstCacheTensor, uint8_t *keyCachePtr, uint8_t *valueCachePtr, ++ uint8_t *slotmappings, const int64_t hiddenDims, const int64_t numPages, ++ const int32_t pagedSize, const int32_t numTokens, const int32_t numLayers, ++ const int32_t layerIdx, const bool page2L) { ++ switch(slotType) { ++ case vllm_ascend::AscendType::INT32: ++ load_and_reshape_kernel_call(blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numPages, pagedSize, numTokens, numLayers, layerIdx, ++ page2L); ++ break; ++ case vllm_ascend::AscendType::INT64: ++ load_and_reshape_kernel_call(blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numPages, pagedSize, numTokens, numLayers, layerIdx, ++ page2L); ++ break; ++ default: ++ return; ++ } ++} ++ ++extern void load_and_reshape_flash_kernel(vllm_ascend::AscendType type, vllm_ascend::AscendType slotType, ++ uint32_t blockDim, void *stream, ++ uint8_t *dstCacheTensor, uint8_t *keyCachePtr, uint8_t *valueCachePtr, ++ uint8_t *slotmappings, const int64_t hiddenDims, const int64_t numPages, ++ const int32_t pagedSize, const int32_t numTokens, const int32_t numLayers, ++ const int32_t layerIdx, bool page2L) ++{ ++ switch(type) { ++ case vllm_ascend::AscendType::FP16: ++ dispatch_on_slot_type(slotType, blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numPages, pagedSize, numTokens, numLayers, layerIdx, ++ page2L); ++ break; ++ case vllm_ascend::AscendType::BF16: ++ dispatch_on_slot_type(slotType, blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numPages, pagedSize, numTokens, numLayers, layerIdx, ++ page2L); ++ break; ++ case vllm_ascend::AscendType::INT8: ++ dispatch_on_slot_type(slotType, blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numPages, pagedSize, numTokens, numLayers, layerIdx, ++ page2L); ++ break; ++ default: ++ return; ++ } ++} ++ ++} // namespace lmc +diff --git a/csrc/ascend/kernels/multi_layer_mem_kernels.cpp b/csrc/ascend/kernels/multi_layer_mem_kernels.cpp +new file mode 100644 +index 0000000..3e94f63 +--- /dev/null ++++ b/csrc/ascend/kernels/multi_layer_mem_kernels.cpp +@@ -0,0 +1,234 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++#include "kernel_operator.h" ++#include ++#include "types.h" ++#include "utils.h" ++ ++template class MultiLayerPagedKVCopy { ++ using local_scalar_t = AscendC::LocalTensor; ++ ++public: ++ __aicore__ inline MultiLayerPagedKVCopy() ++ { ++ } ++ ++ __aicore__ inline void init(GM_ADDR pagedKVCaches, GM_ADDR cacheTensor, GM_ADDR slotmappings, ++ const int64_t hiddenDims, const int32_t numLayers, const int64_t pageBuffSize, ++ const int32_t numTokensChunk, const bool page2L, ++ AscendC::TPipe *pipe) ++ { ++ this->pipe_ = pipe; ++ this->numLayers_ = numLayers; ++ this->hiddenDims_ = hiddenDims; ++ this->pageBuffSize_ = pageBuffSize; ++ this->numTokensChunk_ = numTokensChunk; ++ this->page2L_ = page2L; ++ this->valid_ = true; ++ ++ this->pipe_->InitBuffer(pagedTokenQue_, 4, this->hiddenDims_*sizeof(scalar_t)); ++ } ++ ++ __aicore__ inline void reset(){ ++ this->valid_ = true; ++ } ++ ++ __aicore__ inline void updateMemOffset(__gm__ uint8_t *pagedKVCaches, __gm__ uint8_t* cacheTensor, ++ __gm__ uint8_t *slotmappings, const int tokenIdx, ++ const int kvIdx, const int layerIdx) ++ { ++ __gm__ slot_t *slotmappingPtr = reinterpret_cast<__gm__ slot_t*>(slotmappings); ++ int64_t slot = static_cast(slotmappingPtr[tokenIdx]); ++ ++ if (slot == -1) { ++ this->valid_ = false; ++ return; ++ } ++ ++ // its a pointer within the GM addr space, that point to another GM addr space ++ __gm__ uint8_t * __gm__ *pagedKVCachesPtr = reinterpret_cast<__gm__ uint8_t* __gm__ *>(pagedKVCaches); ++ ++ // getting the right ptr to the paged kvcache layer ++ __gm__ uint8_t *pagedLayerKVCaches = pagedKVCachesPtr[layerIdx]; ++ ++ int64_t pagedIdxOffset = kvIdx * this->pageBuffSize_ * this->hiddenDims_ + ++ slot * this->hiddenDims_; ++ ++ int64_t dstTensorIdxOffset = kvIdx * this->numLayers_ * this->numTokensChunk_ * this->hiddenDims_ + ++ layerIdx * this->numTokensChunk_ * this->hiddenDims_ + ++ tokenIdx * this->hiddenDims_; ++ ++ this->pagedTokenGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(pagedLayerKVCaches) + pagedIdxOffset, ++ this->hiddenDims_); ++ this->lmcBufferGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(cacheTensor) + dstTensorIdxOffset, ++ this->hiddenDims_); ++ } ++ ++ __aicore__ inline void processFunc() { ++ if (!this->valid_) { ++ return; ++ } ++ // 1. Alloc Tensor for local page ++ local_scalar_t hiddenDimTensor = pagedTokenQue_.AllocTensor(); ++ ++ // 2. copy from global tensor into local ++ if (this->page2L_) { ++ AscendC::DataCopy(hiddenDimTensor, this->pagedTokenGlobal_, this->hiddenDims_); ++ } else { ++ AscendC::DataCopy(hiddenDimTensor, this->lmcBufferGlobal_, this->hiddenDims_); ++ } ++ ++ // 3. enque vecin ++ pagedTokenQue_.EnQue(hiddenDimTensor); ++ // 4. deque vecin, possible to reuse due to QueBind ++ hiddenDimTensor = pagedTokenQue_.DeQue(); ++ ++ // 5. datacopy into GM ++ if (this->page2L_) { ++ AscendC::DataCopy(this->lmcBufferGlobal_, hiddenDimTensor, this->hiddenDims_); ++ } else { ++ AscendC::DataCopy(this->pagedTokenGlobal_, hiddenDimTensor, this->hiddenDims_); ++ } ++ ++ // 6. 
free alloced Tensor ++ pagedTokenQue_.FreeTensor(hiddenDimTensor); ++ } ++ ++ ++private: ++ AscendC::TPipe *pipe_; ++ AscendC::TQueBind pagedTokenQue_; ++ ++ // [layers * [kvs, numPages * pagedSize, heads*headsize]] ++ AscendC::GlobalTensor pagedTokenGlobal_; ++ // [kvs, layers, numTokensChunk, heads*headsize] ++ AscendC::GlobalTensor lmcBufferGlobal_; ++ int32_t numLayers_; // num layers ++ int64_t pageBuffSize_; // pages * pageSize ++ int64_t hiddenDims_; // heads * headSize ++ int32_t numTokensChunk_; // num tokens in the cache tensor chunk ++ bool valid_; ++ bool page2L_; // true, from pagedTensor to LMC, false otherwise ++}; ++ ++// NOTE: there are potential micro optimizaiton here. ++#define MULTI_LAYER_PAGED_KV_COPY_TYPE_DECLARE(TYPE, SLOTTYPE) \ ++ extern "C" __global__ __aicore__ void multi_layer_paged_kv_copy_##TYPE##_##SLOTTYPE( \ ++ __gm__ uint8_t* pagedKVCaches, __gm__ uint8_t* dstCacheTensor, __gm__ uint8_t* slotmappings, \ ++ const int64_t hiddenDims, const int32_t kvs, const int32_t numLayers, \ ++ const int64_t pageBuffSize, const int32_t numTokensChunk, const int coreNum, const bool page2L) \ ++ { \ ++ AscendC::TPipe pipe; \ ++ MultiLayerPagedKVCopy op{}; \ ++ op.init(pagedKVCaches, dstCacheTensor, slotmappings, hiddenDims, \ ++ numLayers, pageBuffSize, numTokensChunk, page2L, &pipe); \ ++ int64_t bIdx = AscendC::GetBlockIdx(); \ ++ for (int64_t i = bIdx; i < numTokensChunk; i+=coreNum) { \ ++ for (int32_t kvIdx = 0; kvIdx < kvs; kvIdx ++) { \ ++ for (int32_t layerIdx = 0; layerIdx < numLayers; layerIdx++) { \ ++ op.reset(); \ ++ op.updateMemOffset(pagedKVCaches, dstCacheTensor, slotmappings, i, kvIdx, layerIdx); \ ++ op.processFunc(); \ ++ } \ ++ } \ ++ } \ ++ } ++ ++// Declare support kernel entry ++MULTI_LAYER_PAGED_KV_COPY_TYPE_DECLARE(half, int32_t); ++MULTI_LAYER_PAGED_KV_COPY_TYPE_DECLARE(half, int64_t); ++MULTI_LAYER_PAGED_KV_COPY_TYPE_DECLARE(bfloat16_t, int32_t); ++MULTI_LAYER_PAGED_KV_COPY_TYPE_DECLARE(bfloat16_t, int64_t); ++MULTI_LAYER_PAGED_KV_COPY_TYPE_DECLARE(int8_t, int32_t); ++MULTI_LAYER_PAGED_KV_COPY_TYPE_DECLARE(int8_t, int64_t); ++ ++namespace lmc { ++ ++#define MULTI_LAYER_PAGED_KV_COPY_KERNEL_CALL(TYPE, SLOTTYPE) \ ++ multi_layer_paged_kv_copy_##TYPE##_##SLOTTYPE<<>>(pagedKVCaches, dstCacheTensor, \ ++ slotmappings, hiddenDims, kvs, \ ++ numLayers, pageBuffSize, \ ++ numTokensChunk, blockDim, page2L); ++ ++template ++void multi_layer_paged_kernel(uint32_t blockDim, void *stream, uint8_t *pagedKVCaches, uint8_t *dstCacheTensor, ++ uint8_t *slotmappings, const int64_t hiddenDims, const int32_t kvs, const int32_t numLayers, ++ const int64_t pageBuffSize, const int32_t numTokensChunk, const bool page2L); ++ ++#define MULTI_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(TYPE, SLOTTYPE) \ ++template<> \ ++void multi_layer_paged_kernel(uint32_t blockDim, void *stream, uint8_t *pagedKVCaches, \ ++ uint8_t *dstCacheTensor, uint8_t *slotmappings, \ ++ const int64_t hiddenDims, const int32_t kvs, const int32_t numLayers, \ ++ const int64_t pageBuffSize, const int32_t numTokensChunk, \ ++ const bool page2L){ \ ++ MULTI_LAYER_PAGED_KV_COPY_KERNEL_CALL(TYPE, SLOTTYPE); \ ++} ++ ++MULTI_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(half, int32_t); ++MULTI_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(half, int64_t); ++MULTI_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int32_t); ++MULTI_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int64_t); ++MULTI_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(int8_t, int32_t); ++MULTI_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(int8_t, int64_t); ++ ++template 
++void dispatch_paged_kernel_on_slot_type(vllm_ascend::AscendType slotType, uint32_t blockDim, void *stream, ++ uint8_t *pagedKVCaches, uint8_t *dstCacheTensor, uint8_t *slotmappings, ++ const int64_t hiddenDims, const int32_t kvs, const int32_t numLayers, ++ const int64_t pageBuffSize, const int32_t numTokensChunk, const bool page2L) { ++ switch(slotType) { ++ case vllm_ascend::AscendType::INT32: ++ multi_layer_paged_kernel(blockDim, stream, pagedKVCaches, dstCacheTensor, slotmappings, ++ hiddenDims, kvs, numLayers, pageBuffSize, numTokensChunk, page2L); ++ break; ++ case vllm_ascend::AscendType::INT64: ++ multi_layer_paged_kernel(blockDim, stream, pagedKVCaches, dstCacheTensor, slotmappings, ++ hiddenDims, kvs, numLayers, pageBuffSize, numTokensChunk, page2L); ++ break; ++ default: ++ return; ++ } ++} ++ ++extern void multi_layer_kv_transfer_kernel(vllm_ascend::AscendType type, vllm_ascend::AscendType slotType, ++ uint32_t blockDim, void *stream, uint8_t *pagedKVCaches, ++ uint8_t *dstCacheTensor, uint8_t *slotmappings, ++ const int64_t hiddenDims, const int32_t kvs, const int32_t numLayers, ++ const int64_t pageBuffSize, const int32_t numTokensChunk, const bool page2L) ++{ ++ switch(type) { ++ case vllm_ascend::AscendType::FP16: ++ dispatch_paged_kernel_on_slot_type(slotType, blockDim, stream, pagedKVCaches, dstCacheTensor, ++ slotmappings, hiddenDims, kvs, numLayers, pageBuffSize, ++ numTokensChunk, page2L); ++ break; ++ case vllm_ascend::AscendType::BF16: ++ dispatch_paged_kernel_on_slot_type(slotType, blockDim, stream, pagedKVCaches, dstCacheTensor, ++ slotmappings, hiddenDims, kvs, numLayers, pageBuffSize, ++ numTokensChunk, page2L); ++ break; ++ case vllm_ascend::AscendType::INT8: ++ dispatch_paged_kernel_on_slot_type(slotType, blockDim, stream, pagedKVCaches, dstCacheTensor, ++ slotmappings, hiddenDims, kvs, numLayers, pageBuffSize, ++ numTokensChunk, page2L); ++ break; ++ default: ++ return; ++ } ++} ++ ++} // namespace lmc +diff --git a/csrc/ascend/kernels/single_layer_mem_kernels.cpp b/csrc/ascend/kernels/single_layer_mem_kernels.cpp +new file mode 100644 +index 0000000..a1cc30d +--- /dev/null ++++ b/csrc/ascend/kernels/single_layer_mem_kernels.cpp +@@ -0,0 +1,310 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++#include "kernel_operator.h" ++#include ++#include "types.h" ++#include "utils.h" ++ ++template class SingleLayerPagedKVCopy { ++ using local_scalar_t = AscendC::LocalTensor; ++ ++public: ++ __aicore__ inline SingleLayerPagedKVCopy() ++ { ++ } ++ ++ __aicore__ inline void init(GM_ADDR cacheTensor, GM_ADDR keyCachePtr, GM_ADDR valueCachePtr, GM_ADDR slotmappings, ++ const int64_t hiddenDims, const int32_t numTokens, const bool page2L, ++ const bool tokenMajor, AscendC::TPipe *pipe) ++ { ++ this->pipe_ = pipe; ++ this->hiddenDims_ = hiddenDims; ++ this->numTokens_ = numTokens; ++ this->tokenMajor_ = tokenMajor; ++ this->valid_ = true; ++ this->page2L_ = page2L; ++ if constexpr (IsMLA) { ++ this->numKvs_ = 1; ++ } else { ++ this->numKvs_ = 2; ++ } ++ // TODO: Not sure how many to allocate, but let's do 4 blocks of hiddenDims_ ++ // if it was fp16, 2048, we would get 16kb ? ++ this->pipe_->InitBuffer(this->pagedTokenQue_, 4, this->hiddenDims_*sizeof(scalar_t)); ++ } ++ ++ __aicore__ inline void reset(){ ++ this->valid_ = true; ++ } ++ ++ __aicore__ inline void updateTensorMemOffsetAndProcess(__gm__ uint8_t *pagedTensor, __gm__ uint8_t* nonPagedTensor, ++ __gm__ uint8_t *slotmappings, const int tokenIdx, const int kvIdx) ++ { ++ __gm__ slot_t *slotmappingPtr = reinterpret_cast<__gm__ slot_t*>(slotmappings); ++ int64_t slot = slotmappingPtr[tokenIdx]; ++ ++ if (slot == -1) { ++ this->valid_ = false; ++ return; ++ } ++ ++ // for the page tensor ++ int64_t pagedIdxOffset = slot * this->hiddenDims_; ++ ++ // for the lmc tensor ++ int64_t nonPagedIdxOffset = -1; ++ if (this->tokenMajor_) { ++ nonPagedIdxOffset = tokenIdx * this->numKvs_ * this->hiddenDims_ + ++ kvIdx * this->hiddenDims_; ++ } else { ++ nonPagedIdxOffset = kvIdx * this->numTokens_ * this -> hiddenDims_ + ++ tokenIdx * this->hiddenDims_; ++ } ++ ++ if (kvIdx == 0) { ++ // keys ++ this->keyTokensGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(pagedTensor) + pagedIdxOffset, ++ this->hiddenDims_); ++ this->lmcBufferKeyGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(nonPagedTensor) + nonPagedIdxOffset, ++ this->hiddenDims_); ++ } else { ++ // values ++ this->valueTokensGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(pagedTensor) + pagedIdxOffset, ++ this->hiddenDims_); ++ this->lmcBufferValueGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(nonPagedTensor) + nonPagedIdxOffset, ++ this->hiddenDims_); ++ } ++ } ++ ++ __aicore__ inline void processFunc() { ++ if (!this->valid_) { ++ return; ++ } ++ // 1. Alloc Tensor for local page ++ local_scalar_t hiddenKeysDimTensor = this->pagedTokenQue_.template AllocTensor(); ++ local_scalar_t hiddenValuesDimTensor; ++ if constexpr(!IsMLA) { ++ hiddenValuesDimTensor = this->pagedTokenQue_.template AllocTensor(); ++ } ++ ++ // 2. copy from global tensor into local ++ if (this->page2L_) { ++ AscendC::DataCopy(hiddenKeysDimTensor, this->keyTokensGlobal_, this->hiddenDims_); ++ if constexpr (!IsMLA) { ++ AscendC::DataCopy(hiddenValuesDimTensor, this->valueTokensGlobal_, this->hiddenDims_); ++ } ++ } else { ++ AscendC::DataCopy(hiddenKeysDimTensor, this->lmcBufferKeyGlobal_, this->hiddenDims_); ++ if constexpr(!IsMLA) { ++ AscendC::DataCopy(hiddenValuesDimTensor, this->lmcBufferValueGlobal_, this->hiddenDims_); ++ } ++ } ++ ++ // 3. enque vecin ++ pagedTokenQue_.EnQue(hiddenKeysDimTensor); ++ if constexpr(!IsMLA) { ++ pagedTokenQue_.EnQue(hiddenValuesDimTensor); ++ } ++ ++ // 4. 
deque vecin, possible to reuse due to QueBind ++ hiddenKeysDimTensor = pagedTokenQue_.DeQue(); ++ if constexpr(!IsMLA) { ++ hiddenValuesDimTensor = pagedTokenQue_.DeQue(); ++ } ++ ++ // 5. datacopy into GM ++ if (this->page2L_) { ++ AscendC::DataCopy(this->lmcBufferKeyGlobal_, hiddenKeysDimTensor, this->hiddenDims_); ++ if constexpr(!IsMLA) { ++ AscendC::DataCopy(this->lmcBufferValueGlobal_, hiddenValuesDimTensor, this->hiddenDims_); ++ } ++ } else { ++ AscendC::DataCopy(this->keyTokensGlobal_, hiddenKeysDimTensor, this->hiddenDims_); ++ if constexpr(!IsMLA) { ++ AscendC::DataCopy(this->valueTokensGlobal_, hiddenValuesDimTensor, this->hiddenDims_); ++ } ++ } ++ ++ // 6. free alloced Tensor ++ pagedTokenQue_.FreeTensor(hiddenKeysDimTensor); ++ if constexpr(!IsMLA) { ++ pagedTokenQue_.FreeTensor(hiddenValuesDimTensor); ++ } ++ } ++ ++private: ++ AscendC::TPipe *pipe_; ++ // a depth of 2 ++ AscendC::TQueBind pagedTokenQue_; ++ ++ // [kvs, numPages * pagedSize, heads*headsize] ++ AscendC::GlobalTensor keyTokensGlobal_; ++ // iff !isMLA ++ AscendC::GlobalTensor valueTokensGlobal_; ++ ++ // Depends on LMC setting whether we store in tokensMajor or not. ++ // the layout would be the followings: ++ // [tokens, kvs, heads*headsize] or [kvs, tokens, heads*headsize] ++ // TODO: check whether should combine the two and use a loop ++ AscendC::GlobalTensor lmcBufferKeyGlobal_; ++ AscendC::GlobalTensor lmcBufferValueGlobal_; ++ ++ int64_t hiddenDims_; // heads * headsize ++ int32_t numTokens_; // num tokens in the cache tensor chunk ++ int16_t numKvs_; // 1 if MLA else 2 ++ bool page2L_; // whether the direction of copy is from page to lmc ++ bool tokenMajor_; // whether the lmc buffer is in token major. ++ bool valid_; ++}; ++ ++#define SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(TYPE, SLOTTYPE, ISMLA) \ ++ extern "C" __global__ __aicore__ void single_layer_paged_kv_copy_##TYPE##_##SLOTTYPE##_##ISMLA( \ ++ __gm__ uint8_t* dstCacheTensor, __gm__ uint8_t* keyCachePtr, __gm__ uint8_t* valueCachePtr, \ ++ __gm__ uint8_t* slotmappings, const int64_t hiddenDims, const int32_t numTokens, const int coreNums, \ ++ const bool page2L, const bool tokenMajor) \ ++ { \ ++ AscendC::TPipe pipe; \ ++ SingleLayerPagedKVCopy op{}; \ ++ op.init(dstCacheTensor, keyCachePtr, valueCachePtr, slotmappings, hiddenDims, numTokens, \ ++ page2L, tokenMajor, &pipe); \ ++ int64_t bIdx = AscendC::GetBlockIdx(); \ ++ for (int64_t i = bIdx; i < numTokens; i+=coreNums) \ ++ { \ ++ op.reset(); \ ++ op.updateTensorMemOffsetAndProcess(keyCachePtr, dstCacheTensor, slotmappings, i, 0); \ ++ if constexpr(!ISMLA) { \ ++ op.updateTensorMemOffsetAndProcess(valueCachePtr, dstCacheTensor, slotmappings, i, 1); \ ++ } \ ++ op.processFunc(); \ ++ } \ ++ } ++ ++// Declare support kernel entry ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(half, int32_t, false); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(half, int32_t, true); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(bfloat16_t, int32_t, false); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(bfloat16_t, int32_t, true); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(int8_t, int32_t, false); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(int8_t, int32_t, true); ++ ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(half, int64_t, false); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(half, int64_t, true); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(bfloat16_t, int64_t, false); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(bfloat16_t, int64_t, true); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(int8_t, int64_t, false); 
++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(int8_t, int64_t, true); ++ ++namespace lmc { ++ ++#define SINGLE_LAYER_PAGED_KV_COPY_KERNEL_CALL(TYPE, SLOTTYPE, ISMLA) \ ++ single_layer_paged_kv_copy_##TYPE##_##SLOTTYPE##_##ISMLA<<>>(dstCacheTensor, \ ++ keyCachePtr, valueCachePtr, slotmappings, hiddenDims, \ ++ numTokens, blockDim, page2L, tokenMajor); ++ ++template ++void single_layer_paged_kernel(uint32_t blockDim, void *stream, uint8_t *dstCacheTensor, uint8_t *keyCachePtr, ++ uint8_t *valueCachePtr, uint8_t *slotmappings, const int64_t hiddenDims, ++ const int32_t numTokens, const bool page2L, const bool tokenMajor); ++ ++#define SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(TYPE, SLOTTYPE, ISMLA) \ ++template<> \ ++void single_layer_paged_kernel(uint32_t blockDim, void *stream, uint8_t *dstCacheTensor, \ ++ uint8_t *keyCachePtr, uint8_t *valueCachePtr, uint8_t *slotmappings, \ ++ const int64_t hiddenDims, const int32_t numTokens, const bool page2L, \ ++ const bool tokenMajor){ \ ++ SINGLE_LAYER_PAGED_KV_COPY_KERNEL_CALL(TYPE, SLOTTYPE, ISMLA); \ ++} ++ ++ ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(half, int32_t, false); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(half, int64_t, false); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int32_t, false); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int64_t, false); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(int8_t, int32_t, false); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(int8_t, int64_t, false); ++ ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(half, int32_t, true); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(half, int64_t, true); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int32_t, true); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int64_t, true); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(int8_t, int32_t, true); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(int8_t, int64_t, true); ++ ++ ++ ++template ++void dispatch_single_layer_kernel_on_slot_type(vllm_ascend::AscendType slotType, uint32_t blockDim, void *stream, ++ uint8_t *dstCacheTensor, uint8_t *keyCachePtr, uint8_t *valueCachePtr, ++ uint8_t *slotmappings, const int64_t hiddenDims, const int32_t numTokens, ++ const bool page2L, const bool tokenMajor, const bool isMLA) { ++ if (isMLA) { ++ switch(slotType) { ++ case vllm_ascend::AscendType::INT32: ++ single_layer_paged_kernel(blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numTokens, page2L, tokenMajor); ++ break; ++ case vllm_ascend::AscendType::INT64: ++ single_layer_paged_kernel(blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numTokens, page2L, tokenMajor); ++ break; ++ default: ++ return; ++ } ++ } else { ++ switch(slotType) { ++ case vllm_ascend::AscendType::INT32: ++ single_layer_paged_kernel(blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numTokens, page2L, tokenMajor); ++ break; ++ case vllm_ascend::AscendType::INT64: ++ single_layer_paged_kernel(blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numTokens, page2L, tokenMajor); ++ break; ++ default: ++ return; ++ } ++ } ++ ++} ++ ++ ++extern void single_layer_kv_transfer_kernel(vllm_ascend::AscendType type, vllm_ascend::AscendType slotType, ++ uint32_t blockDim, void *stream, uint8_t *dstCacheTensor, ++ uint8_t *keyCachePtr, uint8_t *valueCachePtr, ++ uint8_t *slotmappings, const int64_t hiddenDims, const int32_t numTokens, ++ const bool page2L, 
const bool tokenMajor, const bool isMLA) ++{ ++ switch(type) { ++ case vllm_ascend::AscendType::FP16: ++ dispatch_single_layer_kernel_on_slot_type(slotType, blockDim, stream, dstCacheTensor, keyCachePtr, ++ valueCachePtr, slotmappings, hiddenDims, numTokens, page2L, ++ tokenMajor, isMLA); ++ break; ++ case vllm_ascend::AscendType::BF16: ++ dispatch_single_layer_kernel_on_slot_type(slotType, blockDim, stream, dstCacheTensor, keyCachePtr, ++ valueCachePtr, slotmappings, hiddenDims, numTokens, ++ page2L, tokenMajor, isMLA); ++ break; ++ case vllm_ascend::AscendType::INT8: ++ dispatch_single_layer_kernel_on_slot_type(slotType, blockDim, stream, dstCacheTensor, keyCachePtr, ++ valueCachePtr, slotmappings, hiddenDims, numTokens, page2L, ++ tokenMajor, isMLA); ++ default: ++ return; ++ } ++} ++ ++} // namespace lmc +diff --git a/csrc/ascend/kernels/types.h b/csrc/ascend/kernels/types.h +new file mode 100644 +index 0000000..7c6c46e +--- /dev/null ++++ b/csrc/ascend/kernels/types.h +@@ -0,0 +1,28 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++namespace vllm_ascend { ++enum struct AscendType { ++ FP16 = 0, ++ BF16 = 1, ++ FP32 = 2, ++ INT8 = 3, ++ INT32 = 4, ++ INT64 = 5, ++}; ++} +\ No newline at end of file +diff --git a/csrc/ascend/kernels/utils.h b/csrc/ascend/kernels/utils.h +new file mode 100644 +index 0000000..a6dd3f3 +--- /dev/null ++++ b/csrc/ascend/kernels/utils.h +@@ -0,0 +1,38 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#pragma once ++#include "kernel_type.h" ++namespace vllm_ascend { ++ ++template struct AccType; ++ ++template <> struct AccType { ++ using type = float; ++}; ++ ++template <> struct AccType { ++ using type = half; ++}; ++ ++template <> struct AccType { ++ using type = float; ++}; ++ ++template <> struct AccType { ++ using type = int; ++}; ++}// namespace vllm_ascend +\ No newline at end of file +diff --git a/csrc/ascend/managed_mem.cpp b/csrc/ascend/managed_mem.cpp +new file mode 100644 +index 0000000..4c79f54 +--- /dev/null ++++ b/csrc/ascend/managed_mem.cpp +@@ -0,0 +1,326 @@ ++#include "managed_mem.h" ++#include ++// Only required for old driver version (look at registerHostPtr) ++#ifdef PROF_ERROR ++ // You can add a pragma message to see this in your build log if you want: ++ // #pragma message("Undefining PROF_ERROR from ascend_hal.h before NPU headers") ++ #undef PROF_ERROR ++#endif ++ ++#include ++#include ++#include "driver/ascend_hal_define.h" ++#include "driver/ascend_hal.h" ++#include ++#include "torch/torch.h" ++#include "torch/extension.h" ++ ++namespace lmc { ++constexpr int32_t PROT_FLAGS = static_cast(PROT_READ) | static_cast(PROT_WRITE); ++constexpr int32_t MAP_FLAGS = static_cast(MAP_PRIVATE) | static_cast(MAP_ANONYMOUS) | static_cast(MAP_POPULATE); ++ ++// Signatures for internal helper functions ++ ++// Get the version of the NPU driver as a string ++std::string get_driver_version(); ++// Checks whether the major version of the NPU is greater or equal 25 to support aclrtHostRegister ++bool is_version_at_least_25(const std::string& version_str); ++// Gets the current device offsetting on ASCEND_RT_VISIBLE_DEVICES when needed ++int get_device(); ++// Uregisters the malloced hostPtr ++void unregisterPtr(void* ptr); ++// Swaps the host memory allocated to a tensor with the given hostPtr ++void swap_tensor_ptr(void* hostPtr, torch::Tensor& original_tensor); ++ ++// Class implementations ++ ++HostRegisteredMemoryManager::HostRegisteredMemoryManager(){ ++}; ++ ++HostRegisteredMemoryManager::~HostRegisteredMemoryManager() { ++ this->unregisterAll(); ++}; ++ ++void HostRegisteredMemoryManager::unregisterAll(){ ++ const std::unique_lock guard(this->mux); ++ ++ // Iterate through each key-value pair in the map. ++ for (const auto& pair : this->allocatedMap) { ++ void* hostPtr = pair.first; ++ aclrtHostUnregister(hostPtr); ++ } ++ ++ // After unregistering all pointers, clear the map completely. ++ this->allocatedMap.clear(); ++}; ++ ++// Register a pointer through high level APIs (aclrt) return devPtr ++// Returns the created RegisteredMemoryRecord ++RegisteredMemoryRecord HostRegisteredMemoryManager::registerHostPtr(void* hostPtr, size_t bufferSize) { // torch::Tensor& tensor){ ++ TORCH_CHECK(!(hostPtr == nullptr || bufferSize == 0), "Error: hostPtr cannot be null and bufferSize must be greater than 0."); ++ const std::unique_lock guard(this->mux); ++ ++ // Check if the host pointer is already registered ++ if (this->allocatedMap.count(hostPtr)) { ++ return this->allocatedMap[hostPtr]; ++ } ++ ++ void* devPtr; ++ aclError err = aclrtHostRegister(hostPtr, static_cast(bufferSize), ++ ACL_HOST_REGISTER_MAPPED, (void**)&devPtr); ++ TORCH_CHECK(err == 0, "Unable to host register the host ptr: " + std::to_string(err)); ++ ++ this->allocatedMap.emplace(hostPtr, RegisteredMemoryRecord{reinterpret_cast(hostPtr), ++ reinterpret_cast(devPtr), bufferSize}); ++ ++ return this->allocatedMap[hostPtr]; ++}; ++ ++// Register a pointer through low level APIs (HAL). 
Allocates a new pinned host memory ++// This should be used for driver versions, where cannot rely on aclrtHostRegister() ++// Returns the created RegisteredMemoryRecord ++RegisteredMemoryRecord HostRegisteredMemoryManager::halRegisterHostPtr(size_t bufferSize){ ++ // We allocate a new chunk of memory, register it, and replace the tensor. ++ // Essentially, the halHostRegister function requires a ptr given by mmap. ++ TORCH_CHECK((bufferSize >= 0), "Error: bufferSize must be greater than 0."); ++ const std::unique_lock guard(this->mux); ++ ++ void* devPtr; ++ int device = get_device(); ++ void* hostPtr; ++ // Allocate and register ++ hostPtr = mmap(nullptr, bufferSize, PROT_FLAGS, MAP_FLAGS, -1, 0); ++ TORCH_CHECK(hostPtr != MAP_FAILED, "Unable to alloc memory with mmap."); ++ auto ret = madvise(reinterpret_cast(hostPtr), bufferSize, MADV_HUGEPAGE); ++ auto drvRet = halHostRegister((void*)hostPtr, static_cast(bufferSize), ++ HOST_MEM_MAP_DEV_PCIE_TH, (UINT32)device, (void**)&devPtr); ++ TORCH_CHECK(drvRet == 0, "Unable to register host memory with hal: " + std::to_string(drvRet)) ++ ++ // Lock the memory and fail if impossible to lock ++ auto lockErr = mlock(reinterpret_cast(hostPtr), bufferSize); ++ if (lockErr == -1) { ++ // This can happen in non-privileged mode or not enough rlimit, ++ // let's not proceed since we wanted to guarantee pinned ++ // because we already alloced, let's free ++ auto ret = halHostUnregisterEx(reinterpret_cast(hostPtr), ++ static_cast(device), HOST_MEM_MAP_DEV_PCIE_TH); ++ TORCH_CHECK(ret==0, "Unable to pin host memory, unable to unregister. Error code: " + std::to_string(ret)) ++ auto mret = munmap(reinterpret_cast(hostPtr), bufferSize); ++ TORCH_CHECK(false, "Unable to pin host memory with error code: " + std::to_string(lockErr)) ++ } ++ ++ this->allocatedMap.emplace(hostPtr, RegisteredMemoryRecord{reinterpret_cast(hostPtr), ++ reinterpret_cast(devPtr), bufferSize}); ++ ++ return this->allocatedMap[hostPtr]; ++}; ++ ++void HostRegisteredMemoryManager::unregisterMemory(void* hostPtr) { ++ TORCH_CHECK(hostPtr != nullptr, "Error: hostPtr cannot be null."); ++ ++ // we don't actually mind if it doesn't unregister, ++ // at context destroy it should be unregister anyway. ++ const std::unique_lock guard(this->mux); ++ aclError err = aclrtHostUnregister(hostPtr); ++ this->allocatedMap.erase(hostPtr); ++}; ++ ++/* ++* For now we only do a linear search as we probably won't have a long list of ptrs ++* we go through each record and check whether we are in range, if so ++* we calculate the offset from the host ptr and apply to the device ptr ++* finally we return the device ptr. 
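++* Example with made-up addresses: for a record { ptr = 0x1000, devptr = 0x9000, buffSize = 0x200 },
++* a lookup of hostPtr = 0x1040 falls inside the range, the offset is 0x40, and the returned
++* device pointer is 0x9000 + 0x40 = 0x9040.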
++*/ ++void* HostRegisteredMemoryManager::getDevicePtr(void* hostPtr) { ++ if (hostPtr == nullptr) { ++ return nullptr; ++ } ++ const std::shared_lock guard(this->mux); ++ ++ const uintptr_t hostAddrPtr = reinterpret_cast(hostPtr); ++ ++ for (const auto& pair: this->allocatedMap) { ++ const RegisteredMemoryRecord& record = pair.second; ++ ++ if (hostAddrPtr >= record.ptr && hostAddrPtr < (record.ptr + record.buffSize)) { ++ const size_t offset = hostAddrPtr - record.ptr; ++ ++ const uintptr_t deviceAddrPtr = record.devptr + offset; ++ ++ return reinterpret_cast(deviceAddrPtr); ++ } ++ } ++ ++ return nullptr; ++}; ++ ++ ++size_t HostRegisteredMemoryManager::getRecordSize(void* hostPtr){ ++ if (hostPtr == nullptr) { ++ return 0; ++ } ++ const std::shared_lock guard(this->mux); ++ ++ const uintptr_t hostAddrPtr = reinterpret_cast(hostPtr); ++ ++ for (const auto& pair: this->allocatedMap) { ++ const RegisteredMemoryRecord& record = pair.second; ++ ++ if (hostAddrPtr >= record.ptr && hostAddrPtr < (record.ptr + record.buffSize)) { ++ return record.buffSize; ++ } ++ } ++ return 0; ++}; ++ ++std::string get_driver_version() { ++ void* handle = nullptr; ++ int (*dsmi_get_version)(int, char*, unsigned int, unsigned int*) = nullptr; ++ std::string result; ++ ++ handle = dlopen("libdrvdsmi_host.so", RTLD_LAZY); ++ if (!handle) { ++ TORCH_CHECK(false, std::string("Error opening libdrvdsmi_host.so: ") + dlerror() ); ++ return result; ++ } ++ dlerror(); ++ ++ // Load the function ++ *(void**) (&dsmi_get_version) = dlsym(handle, "dsmi_get_version"); ++ const char* dlsym_error = dlerror(); ++ if (dlsym_error) { ++ dlclose(handle); ++ TORCH_CHECK(false, std::string("Error loading dsmi_get_version: ") + dlsym_error); ++ return result; ++ } ++ ++ // Call the function ++ int device_id = c10_npu::getCurrentNPUStream().device_index(); ++ const unsigned int buffer_size = 256; ++ std::vector version_buffer(buffer_size); ++ unsigned int ret_len = 0; ++ int ret = dsmi_get_version(device_id, version_buffer.data(), buffer_size, &ret_len); ++ if (ret == 0) { ++ if (ret_len > 0 && ret_len <= buffer_size) { ++ version_buffer[ret_len] = '\0'; // Ensure null-termination ++ result = version_buffer.data(); ++ } else { ++ TORCH_CHECK(false, "Error: Invalid length returned: " + std::to_string(ret_len)); ++ } ++ } else { ++ TORCH_CHECK(false, "Error: dsmi_get_version returned " + std::to_string(ret)); ++ } ++ ++ dlclose(handle); ++ ++ return result; ++} ++ ++// To be on the safe side, returns false in case of uncertainties ++bool is_version_at_least_25(const std::string& version_str) { ++ if (version_str.empty()) { ++ return false; ++ } ++ ++ size_t num_end = 0; ++ long major_version = 0; ++ ++ try { ++ major_version = std::stol(version_str, &num_end); ++ } catch (const std::invalid_argument&) { ++ // No valid number at start ++ return false; ++ } catch (const std::out_of_range&) { ++ // Should never happen, here for robustness ++ return false; ++ } ++ return major_version >= 25; ++} ++ ++int get_device(){ ++ int device = c10_npu::getCurrentNPUStream().device_index(); ++ const char* env_visible_devices_p = std::getenv("ASCEND_RT_VISIBLE_DEVICES"); ++ // If we are using a custom list of visible devices, the index refers to that ++ if (env_visible_devices_p != nullptr) { ++ std::string env_visible_devices = env_visible_devices_p; ++ std::vector list_visible_devices; ++ std::stringstream ss(env_visible_devices); ++ std::string item; ++ while (std::getline(ss, item, ',')) { ++ list_visible_devices.push_back(std::stoi(item)); ++ } 
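++        // Illustrative example (hypothetical values): with ASCEND_RT_VISIBLE_DEVICES="4,2,6"
++        // and device_index() == 1, the sorted list below is {2, 4, 6}, so the physical
++        // device resolved here is 4.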
++ std::sort(list_visible_devices.begin(), list_visible_devices.end()); ++ // Here two cases are possible: ++ // 1. no hccl, we just use current_device, even though we have specify the ASCEND_RT_VISIBLE_DEVICES ++ // 2. hccl, and we use current_device that seems to be correct ++ // for case 2, since the current_device would have been correct anyway, obtaining from the list would be fine. ++ // for case 1, we have shifted the device to the RT_VISIBLE_DEVICES, so it should be corrected. ++ device = list_visible_devices[device]; ++ } ++ return device; ++} ++ ++void unregisterPtr(void* ptr) { ++ if (ptr){ ++ int device = get_device(); ++ auto& hmm = HostRegisteredMemoryManager::GetInstance(); ++ size_t bufferSize = hmm.getRecordSize(ptr); ++ auto ret = halHostUnregisterEx(reinterpret_cast(ptr), ++ static_cast(device), HOST_MEM_MAP_DEV_PCIE_TH); ++ if (ret != 0) { ++ std::cout << "Unable to hal host unregister: "<< ret << std::endl; ++ } ++ auto mret = munmap(reinterpret_cast(ptr), bufferSize); ++ if (mret != 0) { ++ std::cout << "Unable to unmap memory: "<< ret << std::endl; ++ } ++ } ++} ++ ++ ++void swap_tensor_ptr(void* hostPtr, torch::Tensor& original_tensor){ ++ torch::TensorOptions tensorOpsCpu = torch::TensorOptions() ++ .dtype(original_tensor.dtype()) ++ .device(original_tensor.device()) ++ .pinned_memory(true); ++ int64_t numel = static_cast(original_tensor.nbytes()); ++ std::vector dims = {numel}; ++ torch::Tensor new_tensor_from_myptr = torch::from_blob( ++ hostPtr, dims, unregisterPtr, tensorOpsCpu); ++ ++ original_tensor.set_(new_tensor_from_myptr.storage(), original_tensor.storage_offset(), ++ original_tensor.sizes(), original_tensor.strides()); ++} ++ ++} // namespace lmc ++ ++ ++void* register_memory(torch::Tensor& tensor) { ++ torch::Device device = tensor.device(); ++ if (!device.is_cpu() || !tensor.is_pinned()) { ++ TORCH_CHECK(false, "Invalid device. Device must be CPU and tensor must be pinned."); ++ } ++ auto& hmm = lmc::HostRegisteredMemoryManager::GetInstance(); ++ size_t tensorSize = tensor.nbytes(); ++ std::string verString = lmc::get_driver_version(); ++ if (lmc::is_version_at_least_25(verString)) { // New driver version, supports aclrtHostRegister() ++ void* hostPtr = static_cast(tensor.data_ptr()); ++ return (void*) hmm.registerHostPtr(hostPtr, tensorSize).devptr; ++ } else { // Old driver version, does not support aclrtHostRegister(), we have to use HAL. ++ // We ask for a new registerd memory and substitute with the previously allocated. 
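++        // swap_tensor_ptr() below rebinds the tensor's storage to the freshly mmap'ed,
++        // HAL-registered buffer, so callers keep using the same tensor object while its
++        // data now lives in the pinned, device-visible allocation.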
++ lmc::RegisteredMemoryRecord record = hmm.halRegisterHostPtr(tensorSize); ++ lmc::swap_tensor_ptr((void*) record.ptr, tensor); ++ return (void*) record.devptr; ++ } ++}; ++ ++void unregister_memory(torch::Tensor& tensor) { ++ void* hostPtr = static_cast(tensor.data_ptr()); ++ auto& hmm = lmc::HostRegisteredMemoryManager::GetInstance(); ++ hmm.unregisterMemory(hostPtr); ++}; ++ ++void* get_device_ptr(void* ptr) { ++ auto& hmm = lmc::HostRegisteredMemoryManager::GetInstance(); ++ return hmm.getDevicePtr(ptr); ++}; +diff --git a/csrc/ascend/managed_mem.h b/csrc/ascend/managed_mem.h +new file mode 100644 +index 0000000..ae42364 +--- /dev/null ++++ b/csrc/ascend/managed_mem.h +@@ -0,0 +1,69 @@ ++#pragma once ++#include ++#include ++#include ++#include ++ ++namespace lmc { ++ ++struct RegisteredMemoryRecord { ++ uintptr_t ptr; ++ uintptr_t devptr; ++ size_t buffSize; ++}; ++ ++/* ++* We are not responsible for acl init and ctx initialization, ++* we assume the user responsible for ctx initialization ++*/ ++class HostRegisteredMemoryManager { ++private: ++ HostRegisteredMemoryManager(); ++ ++ // Delete copy constructor and assignment operator ++ HostRegisteredMemoryManager(const HostRegisteredMemoryManager&) = delete; ++ HostRegisteredMemoryManager& operator=(const HostRegisteredMemoryManager&) = delete; ++ HostRegisteredMemoryManager(HostRegisteredMemoryManager&&) = delete; ++ HostRegisteredMemoryManager& operator=(HostRegisteredMemoryManager&&) = delete; ++ ++ std::map allocatedMap; ++ mutable std::shared_mutex mux; ++ ++public: ++ static HostRegisteredMemoryManager& GetInstance() ++ { ++ static HostRegisteredMemoryManager instance; ++ return instance; ++ } ++ ~HostRegisteredMemoryManager(); ++ ++ // Register a pointer through high level APIs (aclrt) return devPtr ++ // Returns an already existing RegisteredMemoryRecord or the newly created one ++ // Inputs: ++ // -hostPtr: host pointer of the allocated memory area to register on device ++ // -bufferSize: size of the allocated memory area to register on device ++ RegisteredMemoryRecord registerHostPtr(void* hostPtr, size_t bufferSize); //torch::Tensor& tensor); // ++ // Register a pointer through low level APIs (hal) ++ // This should be used for driver versions, where cannot rely on aclrtHostRegister() ++ // Returns the created RegisteredMemoryRecord ++ // Inputs: ++ // -bufferSize: size of the allocated memory area to register on device ++ RegisteredMemoryRecord halRegisterHostPtr(size_t bufferSize); ++ void unregisterMemory(void* hostPtr); ++ void* getDevicePtr(void* hostPtr); ++ size_t getRecordSize(void* hostPtr); ++ void unregisterAll(); ++}; ++} // namespace lmc ++ ++// Register a tensor on the current device ++// Inputs: ++// -tensor: The tensor to register on the device ++// Returns the device ptr for that tensor ++void* register_memory(torch::Tensor& tensor); ++// Reverse of register ++// Inputs: ++// -tensor: The tensor to register on the device ++void unregister_memory(torch::Tensor& tensor); ++// Takes in input a host pointer, returns the corresponding device pointer ++void* get_device_ptr(void* ptr); +diff --git a/csrc/ascend/mem_kernels.cpp b/csrc/ascend/mem_kernels.cpp +new file mode 100644 +index 0000000..6526836 +--- /dev/null ++++ b/csrc/ascend/mem_kernels.cpp +@@ -0,0 +1,245 @@ ++#include "mem_kernels.h" ++#include ++#include ++#include ++#include ++#include "utils.h" ++#include "tiling/platform/platform_ascendc.h" ++#include ++#include ++ ++template ++T* get_kernel_ptr(TENSOR_TYPE& tensor) { ++ torch::Device device = 
tensor.device(); ++ // NPU should be using PrivateUse1 ++ if (device.is_privateuseone() || device.is_cuda()) { ++ return static_cast(tensor.data_ptr()); ++ } else if (device.is_cpu()) { ++ // find device ptr based on the host pinned ptr ++ // because acl does not currently support HostGetDevicePointer API ++ void* devPtr = get_device_ptr(tensor.data_ptr()); ++ TORCH_CHECK(devPtr != nullptr, "Unable to retrieve device ptr, is this a host registered pointer ?"); ++ return reinterpret_cast(devPtr); ++ } else { ++ TORCH_CHECK(false, "Invalid device. Device must be ascend (PrivateUseOne) or pinned cpu."); ++ } ++} ++ ++/** ++ * Quickly offload KV cache from vLLM paged memory to the offloading buffer ++ * Processes all the layers at the same time ++ * ++ * Each layer in vLLM's KV buffer has a shape of ++ * [2, PAGE_BUFFER_SIZE, num_heads*head_size] ++ * ++ * Each AIV Core processes the copy for a token ++ * ++ * Therefore: ++ * AIV Core - token ++ * ++ * The function does: ++ * slot_id = slot_mapping[tokenId] ++ * ptrs[mem_offset(kv, layer, tokenId, hiddenDims)] = key_value[mem_offset(kv, layer, pages, pageSize, slot_id, hiddenDims)] ++ * ++ * Param: ++ * - direction: false means LMCache to PagedBuffer, true means PagedBuffer to ++ * LMCache ++ */ ++void multi_layer_kv_transfer(torch::Tensor& key_value, // [kv, num_layer, num_tokens, hidden] ++ const torch::Tensor& key_value_ptrs, // [num_layers] ++ const torch::Tensor& slot_mapping, // [num_tokens] ++ const torch::Device& paged_memory_device, ++ const int page_buffer_size, const bool direction, ++ const bool use_mla) { ++ uint8_t* key_value_ptr = get_kernel_ptr(key_value); ++ // it is actually a uint8_t**. we will reinterpret it inside the kernel ++ uint8_t* page_buffer_ptrs = get_kernel_ptr(key_value_ptrs); ++ uint8_t* slot_mapping_ptr = get_kernel_ptr(slot_mapping); ++ ++ int num_layers = key_value.size(1); ++ int num_tokens = slot_mapping.size(0); ++ int hidden_dims = key_value.size(-1); ++ int kv_size = 2; ++ if (use_mla) { ++ kv_size = 1; ++ } ++ ++ const c10::OptionalDeviceGuard device_guard(paged_memory_device); ++ // we require the kv ptr list to be on the device too ++ const c10::OptionalDeviceGuard kv_device_guard(device_of(key_value_ptrs)); ++ ++ const aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); ++ at::ScalarType scalar_type = key_value.scalar_type(); ++ at::ScalarType slot_type = slot_mapping.scalar_type(); ++ const char* socName = aclrtGetSocName(); ++ ++ at_npu::native::OpCommand cmd; ++ cmd.Name("multi_layer_kv_transfer_kernel"); ++ cmd.SetCustomHandler([scalar_type, slot_type, socName, stream, page_buffer_ptrs, key_value_ptr, ++ slot_mapping_ptr, hidden_dims, kv_size, num_layers, page_buffer_size, ++ num_tokens, direction]()->int{ ++ auto slot_num = vllm_ascend::get_dtype_from_torch(slot_type); ++ auto dtype_num = vllm_ascend::get_dtype_from_torch(scalar_type); ++ auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socName); ++ uint32_t aiv_num = ascendcPlatform->GetCoreNumAiv(); ++ lmc::multi_layer_kv_transfer_kernel(dtype_num, slot_num, aiv_num, stream, page_buffer_ptrs, key_value_ptr, ++ slot_mapping_ptr, hidden_dims, kv_size, num_layers, page_buffer_size, ++ num_tokens, direction); ++ return 0; ++ }); ++ cmd.Run(); ++ return ; ++}; ++ ++ ++void multi_layer_kv_transfer_unilateral(torch::Tensor& key_value, ++ const torch::Tensor& key_ptrs, ++ const torch::Tensor& value_ptrs, ++ const torch::Tensor& slot_mapping, ++ const torch::Device& paged_memory_device, ++ const int 
page_buffer_size, ++ const bool direction){ ++ // TODO: ++ PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend."); ++ throw py::error_already_set(); ++}; ++ ++ ++void single_layer_kv_transfer(torch::Tensor& lmc_key_value_cache, // [num_tokens, 2, num_heads*head_size] ++ // or ++ // [2, num_tokens, num_heads*head_size] ++ torch::Tensor& vllm_key_cache, // [num_blocks, block_size, num_heads, head_size] ++ torch::Tensor& vllm_value_cache, // [....] ++ torch::Tensor& slot_mapping, // [num_tokens] ++ const bool direction, // false: LMCache to PagedBuffer, true: PagedBuffer to LMCache ++ const bool token_major // true: lmc_key_value_cache is [num_tokens, 2, num_heads*head_size] ++ // false: otherwise ++) { ++ uint8_t *lmc_key_value_cache_ptr = get_kernel_ptr(lmc_key_value_cache); ++ uint8_t *vllm_key_cache_ptr = get_kernel_ptr(vllm_key_cache); ++ uint8_t *vllm_value_cache_ptr = get_kernel_ptr(vllm_value_cache); ++ uint8_t *slot_mapping_ptr = get_kernel_ptr(slot_mapping); ++ ++ int num_tokens = slot_mapping.size(0); ++ int hidden_dims = lmc_key_value_cache.size(-1); ++ ++ const c10::OptionalDeviceGuard device_guard(device_of(vllm_key_cache)); ++ const c10::OptionalDeviceGuard slot_device_guard(device_of(slot_mapping)); ++ const aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); ++ ++ at::ScalarType scalar_type = vllm_key_cache.scalar_type(); ++ at::ScalarType slot_type = slot_mapping.scalar_type(); ++ ++ const char* socName = aclrtGetSocName(); ++ ++ at_npu::native::OpCommand cmd; ++ cmd.Name("single_layer_kv_transfer_kernel"); ++ cmd.SetCustomHandler([scalar_type, slot_type, socName, stream, lmc_key_value_cache_ptr, ++ vllm_key_cache_ptr, vllm_value_cache_ptr, slot_mapping_ptr, ++ hidden_dims, num_tokens, direction, token_major]() -> int { ++ auto slot_num = vllm_ascend::get_dtype_from_torch(slot_type); ++ auto dtype_num = vllm_ascend::get_dtype_from_torch(scalar_type); ++ auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socName); ++ uint32_t aiv_num = ascendcPlatform->GetCoreNumAiv(); ++ // TODO: We will add the isMLA argument once the signature have support for the MLA. 
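++    // The trailing 'false' passed below is the isMLA flag of single_layer_kv_transfer_kernel;
++    // it stays hard-coded until the host-side signature exposes MLA (see the TODO above).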
++ lmc::single_layer_kv_transfer_kernel(dtype_num, slot_num, aiv_num, stream, lmc_key_value_cache_ptr, ++ vllm_key_cache_ptr, vllm_value_cache_ptr, slot_mapping_ptr, ++ hidden_dims, num_tokens, direction, token_major, false); ++ return 0; ++ }); ++ cmd.Run(); ++ return ; ++}; ++ ++void load_and_reshape_flash( ++ torch::Tensor& key_value, // [2, num_layer, num_tokens, num_heads*head_size] ++ // must be one gpu / pinned cpu ++ torch::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size] ++ torch::Tensor& value_cache, // [num_blocks, block_size, num_heads, head_size] ++ torch::Tensor& slot_mapping, // [num_tokens], ++ const int layer_idx) { ++ ++ uint8_t* key_value_ptr = get_kernel_ptr(key_value); ++ uint8_t* key_cache_ptr = get_kernel_ptr(key_cache); ++ uint8_t* value_cache_ptr = get_kernel_ptr(value_cache); ++ ++ uint8_t* slot_mapping_ptr = get_kernel_ptr(slot_mapping); ++ ++ int num_tokens = slot_mapping.size(0); ++ int num_layers = key_value.size(1); ++ int block_size = key_cache.size(1); ++ int num_blocks = key_cache.size(0); ++ int hidden_dims = key_value.size(-1); ++ const c10::OptionalDeviceGuard device_guard(device_of(key_cache)); ++ const aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); ++ ++ at::ScalarType scalar_type = key_value.scalar_type(); ++ at::ScalarType slot_type = slot_mapping.scalar_type(); ++ const char* socName = aclrtGetSocName(); ++ ++ at_npu::native::OpCommand cmd; ++ cmd.Name("load_and_reshape_flash_kernel"); ++ cmd.SetCustomHandler([scalar_type, slot_type, socName, stream, key_value_ptr, ++ key_cache_ptr, value_cache_ptr, slot_mapping_ptr, ++ hidden_dims, num_blocks, block_size, ++ num_tokens, num_layers, layer_idx]()->int { ++ auto slot_num = vllm_ascend::get_dtype_from_torch(slot_type); ++ auto dtype_num = vllm_ascend::get_dtype_from_torch(scalar_type); ++ auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socName); ++ uint32_t aiv_num = ascendcPlatform->GetCoreNumAiv(); ++ lmc::load_and_reshape_flash_kernel(dtype_num, slot_num, aiv_num, stream, key_value_ptr, ++ key_cache_ptr, value_cache_ptr, slot_mapping_ptr, ++ hidden_dims, num_blocks, block_size, ++ num_tokens, num_layers, layer_idx, true); ++ return 0; ++ }); ++ cmd.Run(); ++ return; ++}; ++ ++void reshape_and_cache_back_flash( ++ torch::Tensor& key_value, // [2, num_layer, num_tokens, num_heads*head_size] ++ // must be one gpu / pinned cpu ++ torch::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size] ++ torch::Tensor& value_cache, // [num_blocks, block_size, num_heads, head_size] ++ torch::Tensor& slot_mapping, // [num_tokens], ++ const int layer_idx) { ++ ++ uint8_t* key_value_ptr = get_kernel_ptr(key_value); ++ uint8_t* key_cache_ptr = get_kernel_ptr(key_cache); ++ uint8_t* value_cache_ptr = get_kernel_ptr(value_cache); ++ ++ uint8_t* slot_mapping_ptr = get_kernel_ptr(slot_mapping); ++ ++ int num_tokens = slot_mapping.size(0); ++ int num_layers = key_value.size(1); ++ int block_size = key_cache.size(1); ++ int num_blocks = key_cache.size(0); ++ int hidden_dims = key_value.size(-1); ++ const c10::OptionalDeviceGuard device_guard(device_of(key_cache)); ++ const aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); ++ ++ at::ScalarType scalar_type = key_value.scalar_type(); ++ at::ScalarType slot_type = slot_mapping.scalar_type(); ++ ++ const char* socName = aclrtGetSocName(); ++ ++ at_npu::native::OpCommand cmd; ++ cmd.Name("reshape_and_cache_back_flash"); ++ cmd.SetCustomHandler([scalar_type, slot_type, socName, stream, 
key_value_ptr, ++ key_cache_ptr, value_cache_ptr, slot_mapping_ptr, ++ hidden_dims, num_blocks, block_size, ++ num_tokens, num_layers, layer_idx]() -> int { ++ auto slot_num = vllm_ascend::get_dtype_from_torch(slot_type); ++ auto dtype_num = vllm_ascend::get_dtype_from_torch(scalar_type); ++ auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socName); ++ uint32_t aiv_num = ascendcPlatform->GetCoreNumAiv(); ++ lmc::load_and_reshape_flash_kernel(dtype_num, slot_num, aiv_num, stream, key_value_ptr, ++ key_cache_ptr, value_cache_ptr, slot_mapping_ptr, ++ hidden_dims, num_blocks, block_size, ++ num_tokens, num_layers, layer_idx, false); ++ return 0; ++ }); ++ cmd.Run(); ++ return; ++}; +diff --git a/csrc/ascend/mem_kernels.h b/csrc/ascend/mem_kernels.h +new file mode 100644 +index 0000000..01e0494 +--- /dev/null ++++ b/csrc/ascend/mem_kernels.h +@@ -0,0 +1,58 @@ ++#pragma once ++#include ++#include ++#include "managed_mem.h" ++#include "kernels/types.h" ++ ++namespace lmc { ++void multi_layer_kv_transfer_kernel(vllm_ascend::AscendType type, vllm_ascend::AscendType slotType, uint32_t blockDim, ++ void *stream, uint8_t *pagedKVCaches, uint8_t *dstCacheTensor, ++ uint8_t *slotmappings, const int64_t hiddenDims, const int32_t kvs, ++ const int32_t numLayers, const int64_t pageBuffSize, const int32_t numTokensChunk, ++ const bool page2L); ++ ++void single_layer_kv_transfer_kernel(vllm_ascend::AscendType type, vllm_ascend::AscendType slotType, ++ uint32_t blockDim, void *stream, uint8_t *dstCacheTensor, ++ uint8_t *keyCachePtr, uint8_t *valueCachePtr, ++ uint8_t *slotmappings, const int64_t hiddenDims, const int32_t numTokens, ++ const bool page2L, const bool tokenMajor, const bool isMLA); ++ ++void load_and_reshape_flash_kernel(vllm_ascend::AscendType type, vllm_ascend::AscendType slotType, ++ uint32_t blockDim, void *stream, uint8_t *dstCacheTensor, uint8_t *keyCachePtr, ++ uint8_t *valueCachePtr, uint8_t *slotmappings, const int64_t hiddenDims, ++ const int64_t numPages, const int32_t pagedSize, const int32_t numTokens, ++ const int32_t numLayers, const int32_t layerIdx, const bool page2L); ++} ++ ++ ++void multi_layer_kv_transfer(torch::Tensor& key_value, // [kv, num_layer, num_tokens, hidden] ++ const torch::Tensor& key_value_ptrs, // [num_layers] ++ const torch::Tensor& slot_mapping, // [num_tokens] ++ const torch::Device& paged_memory_device, ++ const int page_buffer_size, const bool direction, ++ const bool use_mla); ++ ++void multi_layer_kv_transfer_unilateral(torch::Tensor& key_value, ++ const torch::Tensor& key_ptrs, ++ const torch::Tensor& value_ptrs, ++ const torch::Tensor& slot_mapping, ++ const torch::Device& paged_memory_device, ++ const int page_buffer_size, ++ const bool direction); ++ ++void single_layer_kv_transfer(torch::Tensor& lmc_key_value_cache, ++ torch::Tensor& vllm_key_cache, ++ torch::Tensor& vllm_value_cache, ++ torch::Tensor& slot_mapping, ++ const bool direction, ++ const bool token_major = false); ++ ++void load_and_reshape_flash(torch::Tensor& key_value, torch::Tensor& key_cache, ++ torch::Tensor& value_cache, ++ torch::Tensor& slot_mapping, const int layer_idx); ++ ++void reshape_and_cache_back_flash(torch::Tensor& key_value, ++ torch::Tensor& key_cache, ++ torch::Tensor& value_cache, ++ torch::Tensor& slot_mapping, ++ const int layer_idx); +\ No newline at end of file +diff --git a/csrc/ascend/pos_kernels.cpp b/csrc/ascend/pos_kernels.cpp +new file mode 100644 +index 0000000..01ee897 +--- /dev/null ++++ b/csrc/ascend/pos_kernels.cpp +@@ 
-0,0 +1,14 @@ ++#include "pos_kernels.h" ++#include ++#include ++ ++namespace py = pybind11; ++ ++void rotary_embedding_k_fused(const torch::Tensor& old_positions, ++ const torch::Tensor& new_positions, ++ torch::Tensor& key, int64_t head_size, ++ const torch::Tensor& cos_sin_cache, bool is_neox) { ++ // TODO: ++ PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend."); ++ throw py::error_already_set(); ++}; +\ No newline at end of file +diff --git a/csrc/ascend/pos_kernels.h b/csrc/ascend/pos_kernels.h +new file mode 100644 +index 0000000..b0bcaf1 +--- /dev/null ++++ b/csrc/ascend/pos_kernels.h +@@ -0,0 +1,10 @@ ++#pragma once ++#include ++#include ++#include ++#include ++ ++void rotary_embedding_k_fused(const torch::Tensor& old_positions, ++ const torch::Tensor& new_positions, ++ torch::Tensor& key, int64_t head_size, ++ const torch::Tensor& cos_sin_cache, bool is_neox); +\ No newline at end of file +diff --git a/csrc/ascend/torch_tensor.h b/csrc/ascend/torch_tensor.h +new file mode 100644 +index 0000000..f1842dd +--- /dev/null ++++ b/csrc/ascend/torch_tensor.h +@@ -0,0 +1,33 @@ ++#pragma once ++#include ++#include ++#include ++#include "managed_mem.h" ++ ++void unregisterPtr(void* ptr) { ++ if (ptr) { ++ auto& hmm = lmc::HostRegisteredMemoryManager::GetInstance(); ++ hmm.unregisterMemory(ptr); ++ } ++} ++ ++torch::Tensor create_pinned_host_registered_tensor(size_t bufferSize) { ++ torch::TensorOptions tensorOpsCpu = torch::TensorOptions() ++ .dtype(torch::kUInt8) ++ .device(torch::kCPU) ++ .pinned_memory(true); ++ TORCH_CHECK(bufferSize > 0, "Buffer size must be greater than zero. Got: " + std::to_string(bufferSize)); ++ ++ // unlikely this would be greater than int64_t ++ int64_t numel = static_cast(bufferSize); ++ ++ void* hostPtr; ++ aclError err = aclrtMallocHost((void**)&hostPtr, bufferSize); ++ TORCH_CHECK(err == 0, "Unable to malloc host buffer, error: " + std::to_string(err)); ++ ++ auto& hmm = lmc::HostRegisteredMemoryManager::GetInstance(); ++ hmm.registerHostPtr(hostPtr, bufferSize); ++ ++ std::vector dims = {numel}; ++ return torch::from_blob(hostPtr, dims, unregisterPtr, tensorOpsCpu); ++} +\ No newline at end of file +diff --git a/csrc/ascend/utils.cmake b/csrc/ascend/utils.cmake +new file mode 100644 +index 0000000..ebf06c6 +--- /dev/null ++++ b/csrc/ascend/utils.cmake +@@ -0,0 +1,28 @@ ++# ++# Run `EXPR` in python. The standard output of python is stored in `OUT` and ++# has trailing whitespace stripped. If an error is encountered when running ++# python, a fatal message `ERR_MSG` is issued. ++# ++function (run_python OUT EXPR ERR_MSG) ++ execute_process( ++ COMMAND ++ "${PYTHON_EXECUTABLE}" "-c" "${EXPR}" ++ OUTPUT_VARIABLE PYTHON_OUT ++ RESULT_VARIABLE PYTHON_ERROR_CODE ++ ERROR_VARIABLE PYTHON_STDERR ++ OUTPUT_STRIP_TRAILING_WHITESPACE) ++ ++ if(NOT PYTHON_ERROR_CODE EQUAL 0) ++ message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") ++ endif() ++ set(${OUT} ${PYTHON_OUT} PARENT_SCOPE) ++endfunction() ++ ++# Run `EXPR` in python after importing `PKG`. Use the result of this to extend ++# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. 
++macro (append_cmake_prefix_path PKG EXPR) ++ run_python(_PREFIX_PATH ++ "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") ++ list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH}) ++endmacro() ++ +diff --git a/csrc/ascend/utils.h b/csrc/ascend/utils.h +new file mode 100644 +index 0000000..c1c02cd +--- /dev/null ++++ b/csrc/ascend/utils.h +@@ -0,0 +1,39 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++#pragma once ++ ++#include "kernels/types.h" ++#include ++#include ++ ++namespace vllm_ascend { ++AscendType get_dtype_from_torch(at::ScalarType scalarType) ++{ ++ if (scalarType == at::ScalarType::Float) { ++ return AscendType::FP32; ++ } else if (scalarType == at::ScalarType::BFloat16) { ++ return AscendType::BF16; ++ } else if (scalarType == at::ScalarType::Half) { ++ return AscendType::FP16; ++ } else if (scalarType == at::ScalarType::Long) { ++ return AscendType::INT64; ++ } else if (scalarType == at::ScalarType::Int) { ++ return AscendType::INT32; ++ } else { ++ TORCH_CHECK(false, "ScalarType not supported."); ++ } ++}; ++} // namespace vllm_ascend +diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp +index 4b4b900..d7ad00d 100644 +--- a/csrc/pybind.cpp ++++ b/csrc/pybind.cpp +@@ -15,19 +15,29 @@ + */ + + #include +-#include "mem_kernels.cuh" +-#include "cachegen_kernels.cuh" +-#include "pos_kernels.cuh" ++#ifdef USE_ASCEND ++ #include "ascend/mem_kernels.h" ++ #include "ascend/managed_mem.h" ++ #include "ascend/cachegen_kernels.h" ++ #include "ascend/pos_kernels.h" ++#else ++ #include "mem_kernels.cuh" ++ #include "cachegen_kernels.cuh" ++ #include "pos_kernels.cuh" ++#endif + #include + #include + + namespace py = pybind11; + + PYBIND11_MODULE(c_ops, m) { ++#ifdef USE_ASCEND ++ m.def("host_register", ®ister_memory); ++#endif + m.def("multi_layer_kv_transfer", &multi_layer_kv_transfer); ++ m.def("single_layer_kv_transfer", &single_layer_kv_transfer); + m.def("multi_layer_kv_transfer_unilateral", + &multi_layer_kv_transfer_unilateral); +- m.def("single_layer_kv_transfer", &single_layer_kv_transfer); + m.def("load_and_reshape_flash", &load_and_reshape_flash); + m.def("reshape_and_cache_back_flash", &reshape_and_cache_back_flash); + m.def("encode_fast_new", &encode_cuda_new); +diff --git a/lmcache/integration/vllm/vllm_adapter.py b/lmcache/integration/vllm/vllm_adapter.py +index a316561..8527d02 100644 +--- a/lmcache/integration/vllm/vllm_adapter.py ++++ b/lmcache/integration/vllm/vllm_adapter.py +@@ -23,25 +23,35 @@ from torch.nn.utils.rnn import pad_sequence + import torch + import torch.distributed as dist + +-if TYPE_CHECKING: +- from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata +- + # Third Party + from vllm.attention import AttentionMetadata + +-# from vllm.attention.backends.flash_attn import FlashAttentionMetadata +-try: ++from vllm.platforms import current_platform ++if current_platform.is_cuda() or current_platform.device_type == "cuda": ++ if 
TYPE_CHECKING: ++ from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata ++ ++ # from vllm.attention.backends.flash_attn import FlashAttentionMetadata ++ try: ++ # Third Party ++ from vllm.attention.backends.flash_attn import FlashAttentionMetadata ++ except (ModuleNotFoundError, ImportError): ++ # vllm_flash_attn is not installed, try the ROCm FA metadata ++ from vllm.attention.backends.rocm_flash_attn import ( ++ ROCmFlashAttentionMetadata as FlashAttentionMetadata, ++ ) ++ + # Third Party +- from vllm.attention.backends.flash_attn import FlashAttentionMetadata +-except (ModuleNotFoundError, ImportError): +- # vllm_flash_attn is not installed, try the ROCm FA metadata +- from vllm.attention.backends.rocm_flash_attn import ( +- ROCmFlashAttentionMetadata as FlashAttentionMetadata, +- ) ++ from vllm.attention.backends.flashmla import FlashMLAMetadata ++ from vllm.attention.backends.mla.common import MLACommonMetadata ++elif current_platform.device_name == "npu": ++ if TYPE_CHECKING: ++ from vllm_ascend.worker.model_runner import ModelInputForNPUWithSamplingMetadata as ModelInputForGPUWithSamplingMetadata ++ from vllm_ascend.attention.attention_v1 import AscendMetadata as FlashAttentionMetadata ++ from vllm_ascend.attention.mla_v1 import AscendMLAMetadata as FlashMLAMetadata, AscendMLAMetadata as MLACommonMetadata ++ from torch_npu.contrib import transfer_to_npu ++ import lmcache.c_ops as lmc_ops + +-# Third Party +-from vllm.attention.backends.flashmla import FlashMLAMetadata +-from vllm.attention.backends.mla.common import MLACommonMetadata + from vllm.config import ( + CacheConfig, + ModelConfig, +@@ -224,9 +234,9 @@ def init_lmcache_engine( + use_mla=use_mla, + ) + engine = LMCacheEngineBuilder.get_or_create( +- ENGINE_NAME, config, metadata, vllm_gpu_connector ++ ENGINE_NAME, config, metadata, vllm_gpu_connector, device=current_platform.device_name + ) +- ++ + return engine + + +diff --git a/lmcache/v1/cache_engine.py b/lmcache/v1/cache_engine.py +index eaba752..4fb57a3 100644 +--- a/lmcache/v1/cache_engine.py ++++ b/lmcache/v1/cache_engine.py +@@ -769,6 +769,7 @@ class LMCacheEngineBuilder: + def _Create_memory_allocator( + config: LMCacheEngineConfig, + metadata: LMCacheEngineMetadata, ++ device: str = "cuda", + ) -> MemoryAllocatorInterface: + if config.enable_nixl: + assert config.nixl_buffer_device is not None +@@ -810,7 +811,7 @@ class LMCacheEngineBuilder: + return CuFileMemoryAllocator(config.cufile_buffer_size * 1024**2) + + max_local_cpu_size = config.max_local_cpu_size +- return MixedMemoryAllocator(int(max_local_cpu_size * 1024**3)) ++ return MixedMemoryAllocator(int(max_local_cpu_size * 1024**3), device=device) + + @staticmethod + def _Create_token_database( +@@ -828,6 +829,7 @@ class LMCacheEngineBuilder: + config: LMCacheEngineConfig, + metadata: LMCacheEngineMetadata, + gpu_connector: GPUConnectorInterface, ++ device: str = "cuda" + ) -> LMCacheEngine: + """ + Builds a new LMCacheEngine instance if it doesn't already exist for the +@@ -838,7 +840,7 @@ class LMCacheEngineBuilder: + """ + logger.info(f"Creating LMCacheEngine instance {instance_id}") + if instance_id not in cls._instances: +- memory_allocator = cls._Create_memory_allocator(config, metadata) ++ memory_allocator = cls._Create_memory_allocator(config, metadata, device=device) + token_database = cls._Create_token_database(config, metadata) + stat_logger = LMCacheStatsLogger(metadata, log_interval=10) + +@@ -885,4 +887,4 @@ class LMCacheEngineBuilder: + cls._cfgs.pop(instance_id, None) + 
cls._metadatas.pop(instance_id, None) + cls._stat_loggers.pop(instance_id, None) +- LMCStatsMonitor.DestroyInstance() ++ LMCStatsMonitor.DestroyInstance() +\ No newline at end of file +diff --git a/lmcache/v1/gpu_connector.py b/lmcache/v1/gpu_connector.py +index 62fe7d5..8b257c9 100644 +--- a/lmcache/v1/gpu_connector.py ++++ b/lmcache/v1/gpu_connector.py +@@ -18,6 +18,10 @@ import abc + + # Third Party + import torch ++try: ++ from torch_npu.contrib import transfer_to_npu ++except (ModuleNotFoundError, ImportError): ++ print("Not importing NPU packages. We are not running on Ascend") + + # First Party + from lmcache.integration.vllm.utils import ENGINE_NAME +@@ -153,7 +157,8 @@ class VLLMPagedMemGPUConnectorV2(GPUConnectorInterface): + def _initialize_pointers(self, kv_caches: List[torch.Tensor]) -> torch.Tensor: + self.kv_cache_pointers.numpy()[:] = [t.data_ptr() for t in kv_caches] + device = kv_caches[0].device +- assert device.type == "cuda", "The device should be CUDA." ++ from vllm.platforms import current_platform ++ assert device.type == current_platform.device_name # The device should be CUDA or Ascend based on vllm. + idx = device.index + if idx not in self.kv_cache_pointers_on_gpu: + self.kv_cache_pointers_on_gpu[idx] = torch.empty( +@@ -161,9 +166,9 @@ class VLLMPagedMemGPUConnectorV2(GPUConnectorInterface): + ) + self.kv_cache_pointers_on_gpu[idx].copy_(self.kv_cache_pointers) + if self.use_mla: +- # kv_caches[0].shape: [num_pages, page_size, head_size] +- assert kv_caches[0].dim() == 3 +- self.page_buffer_size = kv_caches[0].shape[0] * kv_caches[0].shape[1] ++ # kv_caches[0].shape: [num_pages, page_size, head_size] (vllm) or ++ # kv_caches[0].shape: [1, num_pages, page_size, head_size] (vllm-Ascend) ++ self.page_buffer_size = kv_caches[0].shape[-3] * kv_caches[0].shape[-2] + else: + # kv_caches[0].shape: [2, num_pages, page_size, num_heads, head_size] + assert kv_caches[0].dim() == 5 +@@ -798,7 +803,7 @@ class VLLMPagedMemLayerwiseGPUConnector(GPUConnectorInterface): + memory_obj.tensor, + self.kvcaches[layer_id][0], + self.kvcaches[layer_id][1], +- slot_mapping_full, ++ slot_mapping[start:end], + False, + True, + ) +@@ -819,7 +824,8 @@ class VLLMPagedMemLayerwiseGPUConnector(GPUConnectorInterface): + current_stream.wait_stream(self.load_stream) + + # free the buffer memory +- tmp_gpu_buffer_obj.ref_count_down() ++ if self.use_gpu: ++ tmp_gpu_buffer_obj.ref_count_down() + + logger.debug(f"Finished loading layer {layer_id}") + yield +@@ -932,7 +938,8 @@ class VLLMPagedMemLayerwiseGPUConnector(GPUConnectorInterface): + logger.debug(f"Finished offloading layer {layer_id}") + + # free the buffer memory +- tmp_gpu_buffer_obj.ref_count_down() ++ if self.use_gpu: ++ tmp_gpu_buffer_obj.ref_count_down() + yield + + def get_shape(self, num_tokens: int) -> torch.Size: +diff --git a/lmcache/v1/memory_management.py b/lmcache/v1/memory_management.py +index 68b6f12..a17cc8a 100644 +--- a/lmcache/v1/memory_management.py ++++ b/lmcache/v1/memory_management.py +@@ -31,6 +31,10 @@ import torch + from lmcache.logging import init_logger + from lmcache.observability import LMCStatsMonitor + from lmcache.utils import _lmcache_nvtx_annotate ++import lmcache.c_ops as lmc_ops ++ ++# Third Party ++from vllm.platforms import current_platform + + logger = init_logger(__name__) + +@@ -563,6 +567,19 @@ class MemoryAllocatorInterface(metaclass=abc.ABCMeta): + """ + raise NotImplementedError + ++ @abc.abstractmethod ++ def registerHostPtr( ++ self, ++ device: str = "cuda", ++ size: int = 0 ++ ): ++ """ ++ 
Register the host ptr with the device using device specific APIs. ++ ++ :param str device: The device type to register the host ptr with. ++ """ ++ raise NotImplementedError ++ + def close(self): + """ + Closes the memory allocator. +@@ -900,6 +917,10 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + clear = False + return clear + ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ # do nothing for tensor memory allocator ++ pass ++ + + class PagedTensorMemoryAllocator(MemoryAllocatorInterface): + """ +@@ -1151,6 +1172,10 @@ class PagedTensorMemoryAllocator(MemoryAllocatorInterface): + # FIXME: NIXL-related memory leak should be handled somewhere (else). + del self.buffer + ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ # do nothing for paged tensor memory allocator ++ pass ++ + + class BufferAllocator(MemoryAllocatorInterface): + """Allocates memory in the pre-allocated pinned memory.""" +@@ -1201,6 +1226,9 @@ class BufferAllocator(MemoryAllocatorInterface): + def memcheck(self): + return True + ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ pass ++ + + class HostMemoryAllocator(MemoryAllocatorInterface): + """Allocates memory in the pre-allocated Host memory.""" +@@ -1272,21 +1300,20 @@ class HostMemoryAllocator(MemoryAllocatorInterface): + with self.host_mem_lock: + return self.allocator.memcheck() + ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ pass + + class PinMemoryAllocator(MemoryAllocatorInterface): + """Allocates memory in the pre-allocated pinned memory.""" + +- def __init__(self, size: int, use_paging: bool = False, **kwargs): ++ def __init__(self, size: int, use_paging: bool = False, device: str = "cuda", **kwargs): + """ + :param int size: The size of the pinned memory in bytes. + """ + + self.buffer = torch.empty(size, dtype=torch.uint8) +- ptr = self.buffer.data_ptr() +- err = torch.cuda.cudart().cudaHostRegister(ptr, size, 0) +- assert err == 0, ( +- f"cudaHostRegister failed: {torch.cuda.cudart().cudaGetErrorString(err)}" +- ) ++ self._device = device ++ self.registerHostPtr(device, size) + self._unregistered = False + + if use_paging: +@@ -1353,9 +1380,22 @@ class PinMemoryAllocator(MemoryAllocatorInterface): + def close(self): + if not self._unregistered: + torch.cuda.synchronize() +- torch.cuda.cudart().cudaHostUnregister(self.buffer.data_ptr()) ++ if self._device == "cuda": ++ torch.cuda.cudart().cudaHostUnregister(self.buffer.data_ptr()) + self._unregistered = True + ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ ptr = self.buffer.data_ptr() ++ if device == "cuda": ++ assert size >= 0, ("size must be non-negative and greater than 0.") ++ err = torch.cuda.cudart().cudaHostRegister(ptr, size, 0) ++ assert err == 0, ( ++ f"cudaHostRegister failed: {torch.cuda.cudart().cudaGetErrorString(err)}" ++ ) ++ elif device == "npu": ++ # Ascend need to manually manage host register API and memory is pinned ++ self.buffer = self.buffer.pin_memory() ++ lmc_ops.host_register(self.buffer) + + class MixedMemoryAllocator(MemoryAllocatorInterface): + """ +@@ -1363,17 +1403,14 @@ class MixedMemoryAllocator(MemoryAllocatorInterface): + (2) byte_array buffer memory. + """ + +- def __init__(self, size: int, use_paging: bool = False, **kwargs): ++ def __init__(self, size: int, use_paging: bool = False, device: str = "cuda", **kwargs): + """ + :param int size: The size of the pinned memory in bytes. 
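++        :param str device: Device type ("cuda" or "npu") that selects which
++            host-registration path registerHostPtr takes.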
+ """ + + self.buffer = torch.empty(size, dtype=torch.uint8) +- ptr = self.buffer.data_ptr() +- err = torch.cuda.cudart().cudaHostRegister(ptr, size, 0) +- assert err == 0, ( +- f"cudaHostRegister failed: {torch.cuda.cudart().cudaGetErrorString(err)}" +- ) ++ self._device = device ++ self.registerHostPtr(device, size) + self._unregistered = False + + if use_paging: +@@ -1487,8 +1524,22 @@ class MixedMemoryAllocator(MemoryAllocatorInterface): + def close(self): + if not self._unregistered: + torch.cuda.synchronize() +- torch.cuda.cudart().cudaHostUnregister(self.buffer.data_ptr()) ++ if self._device == "cuda": ++ torch.cuda.cudart().cudaHostUnregister(self.buffer.data_ptr()) + self._unregistered = True ++ ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ ptr = self.buffer.data_ptr() ++ if device == "cuda": ++ assert size >= 0, ("size must be non-negative and greater than 0.") ++ err = torch.cuda.cudart().cudaHostRegister(ptr, size, 0) ++ assert err == 0, ( ++ f"cudaHostRegister failed: {torch.cuda.cudart().cudaGetErrorString(err)}" ++ ) ++ elif device == "npu": ++ # Ascend need to manually manage host register API and memory is pinned ++ self.buffer = self.buffer.pin_memory() ++ lmc_ops.host_register(self.buffer) + + + class GPUMemoryAllocator(MemoryAllocatorInterface): +@@ -1570,6 +1621,8 @@ class GPUMemoryAllocator(MemoryAllocatorInterface): + with self.device_mem_lock: + return self.allocator.memcheck() + ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ pass + + class AdHocMemoryAllocator(MemoryAllocatorInterface): + """ +@@ -1650,6 +1703,9 @@ class AdHocMemoryAllocator(MemoryAllocatorInterface): + def memcheck(self): + return True + ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ pass ++ + + class CuFileMemoryAllocator(GPUMemoryAllocator): + def __init__(self, size: int, device=None): +@@ -1749,3 +1805,6 @@ class NixlCPUMemoryAllocator(MemoryAllocatorInterface): + self.cpu_allocator.batched_free(memory_objs, update_stats=update_stats) + else: + raise ValueError(f"Unsupported allocator type: {allocator_type}") ++ ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ pass +\ No newline at end of file +diff --git a/lmcache/v1/storage_backend/storage_manager.py b/lmcache/v1/storage_backend/storage_manager.py +index 82d6edc..ba0b103 100644 +--- a/lmcache/v1/storage_backend/storage_manager.py ++++ b/lmcache/v1/storage_backend/storage_manager.py +@@ -425,4 +425,4 @@ class StorageManager: + if self.thread.is_alive(): + self.thread.join() + +- logger.info("Storage manager closed.") ++ logger.info("Storage manager closed.") +\ No newline at end of file +diff --git a/pyproject.toml b/pyproject.toml +index f8ee543..8a362c1 100644 +--- a/pyproject.toml ++++ b/pyproject.toml +@@ -6,7 +6,7 @@ requires = [ + "packaging>=24.2", + "setuptools>=77.0.3,<81.0.0", + "setuptools_scm>=8", +- "torch==2.7.0", ++ "torch>=2.5.1", + "wheel", + ] + build-backend = "setuptools.build_meta" +@@ -14,8 +14,8 @@ + [project] + name = "lmcache" + authors = [{name = "LMCache Team", email = "lmcacheteam@gmail.com"}] +-license = "Apache-2.0" +-license-files = ["LICENSE"] ++license = { file = "LICENSE" } ++ + readme = "README.md" + description = "A LLM serving engine extension to reduce TTFT and increase throughput, especially under long-context scenarios." 
+ classifiers = [ + +diff --git a/requirements/ascend.txt b/requirements/ascend.txt +new file mode 100644 +index 0000000..ad04d52 +--- /dev/null ++++ b/requirements/ascend.txt +@@ -0,0 +1,7 @@ ++torch>=2.5.1 ++# Common project dependencies ++-r common.txt ++ ++ ++# These must be updated alongside torch to correspond to vLLM-ascend versions ++torch-npu>=2.5.1.post1.dev20250619 +\ No newline at end of file +diff --git a/requirements/build.txt b/requirements/build.txt +index 85aacb4..0923a0b 100644 +--- a/requirements/build.txt ++++ b/requirements/build.txt +@@ -4,6 +4,6 @@ ninja + packaging>=24.2 + setuptools>=77.0.3,<81.0.0 + setuptools_scm>=8 +-torch==2.7.0 # Corresponds to the version used by vLLM main branch ++torch>=2.5.1 # Corresponds to the version used by vLLM main branch + wheel + +diff --git a/requirements/common.txt b/requirements/common.txt +index 34e30c2..35011ad 100644 +--- a/requirements/common.txt ++++ b/requirements/common.txt +@@ -1,8 +1,7 @@ + aiofile + aiofiles + aiohttp +-cufile-python +-infinistore ++infinistore; platform_machine == 'x86_64' + msgspec + numpy + nvtx +@@ -15,6 +14,6 @@ safetensors + setuptools>=77.0.3,<81.0.0 + setuptools_scm>=8 + sortedcontainers +-torch==2.7.0 # Should correspond to the version used by vLLM main branch ++torch>=2.5.1 # Should correspond to the version used by vLLM main branch + transformers >= 4.51.1 + xxhash==3.5.0 +diff --git a/requirements/cuda.txt b/requirements/cuda.txt +index 719261d..4912df9 100644 +--- a/requirements/cuda.txt ++++ b/requirements/cuda.txt +@@ -9,3 +9,4 @@ nvidia-ml-py # for pynvml package + torch == 2.7.0 + torchvision == 0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version + xformers == 0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.7.0 ++cufile-python +diff --git a/setup.py b/setup.py +index 00a4980..b964aff 100644 +--- a/setup.py ++++ b/setup.py +@@ -4,7 +4,17 @@ import os + import sys + + # Third Party +-from setuptools import find_packages, setup ++from setuptools import find_packages, setup, Extension ++from setuptools.command.build_ext import build_ext ++from setuptools.command.develop import develop ++from setuptools.command.install import install ++ ++import logging ++import sysconfig ++import subprocess ++import platform ++import shutil ++ + + ROOT_DIR = Path(__file__).parent + HIPIFY_DIR = os.path.join(ROOT_DIR, "csrc/") +@@ -14,10 +24,159 @@ HIPIFY_OUT_DIR = os.path.join(ROOT_DIR, "csrc_hip/") + # will run python setup.py sdist --dist-dir dist + BUILDING_SDIST = "sdist" in sys.argv or os.environ.get("NO_CUDA_EXT", "0") == "1" + +-# New environment variable to choose between CUDA and HIP +-BUILD_WITH_HIP = os.environ.get("BUILD_WITH_HIP", "0") == "1" ++# Environment variable to choose between CUDA, HIP, Ascend ++TARGET_DEVICE = os.environ.get("LMCACHE_TARGET_DEVICE", "CUDA") ++ ++logging.basicConfig(level=logging.INFO) ++logger = logging.getLogger(__name__) ++ ++def _get_ascend_home_path(): ++ # NOTE: standard Ascend CANN toolkit path ++ return os.environ.get("ASCEND_HOME_PATH", "/usr/local/Ascend/ascend-toolkit/latest") ++ ++def _get_ascend_driver_path(): ++ # NOTE: standard Ascend path ++ return os.environ.get("ASCEND_DRIVER_PATH", "/usr/local/Ascend/driver") ++ ++def _get_ascend_env_path(): ++ # NOTE: standard Ascend Environment variable setup path ++ env_script_path = os.path.realpath(os.path.join(_get_ascend_home_path(), "..", "set_env.sh")) ++ if not 
os.path.exists(env_script_path): ++ raise ValueError(f"The file '{env_script_path}' is not found, " ++ "please make sure environment variable 'ASCEND_HOME_PATH' is set correctly.") ++ return env_script_path ++ ++def _get_npu_soc(): ++ _soc_version = os.getenv("SOC_VERSION", None) ++ if _soc_version is None: ++ npu_smi_cmd = [ ++ "bash", ++ "-c", ++ "npu-smi info | grep OK | awk '{print $3}' | head -n 1", ++ ] ++ try: ++ _soc_version = subprocess.check_output(npu_smi_cmd, ++ text=True).strip() ++ _soc_version = _soc_version.split("-")[0] ++ _soc_version = "Ascend"+_soc_version ++ return _soc_version ++ except subprocess.CalledProcessError as e: ++ raise RuntimeError(f"Retrieve SoC version failed: {e}") ++ return _soc_version + ++class CMakeExtension(Extension): + ++ def __init__(self, ++ name: str, ++ cmake_lists_dir: str = ".", ++ **kwargs) -> None: ++ super().__init__(name, sources=[], py_limited_api=False, **kwargs) ++ self.cmake_lists_dir = os.path.abspath(cmake_lists_dir) ++ ++class custom_install(install): ++ def run(self): ++ self.run_command("build_ext") ++ install.run(self) ++ ++class CustomAscendCmakeBuildExt(build_ext): ++ ++ def build_extension(self, ext): ++ # build the so as c_ops ++ ext_name = ext.name.split(".")[-1] ++ so_name = ext_name + ".so" ++ logger.info(f"Building {so_name} ...") ++ OPS_DIR = os.path.join(ROOT_DIR, "csrc", "ascend") ++ BUILD_OPS_DIR = os.path.join(ROOT_DIR, "build", "ascend") ++ os.makedirs(BUILD_OPS_DIR, exist_ok=True) ++ ++ ascend_home_path = _get_ascend_home_path() ++ ascend_driver_path = _get_ascend_driver_path() ++ env_path = _get_ascend_env_path() ++ _soc_version = _get_npu_soc() ++ _cxx_compiler = os.getenv("CXX") ++ _cc_compiler = os.getenv("CC") ++ python_executable = sys.executable ++ ++ try: ++ # if pybind11 is installed via pip ++ pybind11_cmake_path = (subprocess.check_output( ++ [python_executable, "-m", "pybind11", ++ "--cmakedir"]).decode().strip()) ++ except subprocess.CalledProcessError as e: ++ # else specify pybind11 path installed from source code on CI container ++ raise RuntimeError(f"CMake configuration failed: {e}") ++ ++ import torch_npu ++ torch_npu_path = os.path.dirname(os.path.abspath(torch_npu.__file__)) ++ import torch ++ torch_path = os.path.dirname(os.path.abspath(torch.__file__)) ++ ++ # python include ++ python_include_path = sysconfig.get_path('include', scheme='posix_prefix') ++ ++ arch = platform.machine() ++ install_path = os.path.join(BUILD_OPS_DIR, 'install') ++ if isinstance(self.distribution.get_command_obj("develop"), develop): ++ install_path=BUILD_OPS_DIR ++ ++ cmake_cmd = [ ++ f"source {env_path} && " ++ f"cmake -S {OPS_DIR} -B {BUILD_OPS_DIR}" ++ f" -DSOC_VERSION={_soc_version}" ++ f" -DARCH={arch}" ++ " -DUSE_ASCEND=1" ++ f" -DPYTHON_EXECUTABLE={python_executable}" ++ f" -DCMAKE_PREFIX_PATH={pybind11_cmake_path}" ++ f" -DCMAKE_BUILD_TYPE=Release" ++ f" -DCMAKE_INSTALL_PREFIX={install_path}" ++ f" -DPYTHON_INCLUDE_PATH={python_include_path}" ++ f" -DTORCH_NPU_PATH={torch_npu_path}" ++ f" -DTORCH_PATH={torch_path}" ++ f" -DASCEND_CANN_PACKAGE_PATH={ascend_home_path}" ++ f" -DASCEND_DRIVER_PATH={ascend_driver_path}" ++ " -DCMAKE_VERBOSE_MAKEFILE=ON" ++ ] ++ ++ if _cxx_compiler is not None: ++ cmake_cmd += [f" -DCMAKE_CXX_COMPILER={_cxx_compiler}"] ++ ++ if _cc_compiler is not None: ++ cmake_cmd += [f" -DCMAKE_C_COMPILER={_cc_compiler}"] ++ ++ cmake_cmd += [f" && cmake --build {BUILD_OPS_DIR} -j --verbose"] ++ cmake_cmd += [f" && cmake --install {BUILD_OPS_DIR}"] ++ cmake_cmd = "".join(cmake_cmd) ++ ++ 
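++        # The pieces above are joined into one shell string because the command is run with
++        # shell=True below: `source {env_path}` must export the CANN environment in the same
++        # shell that invokes cmake.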
logger.info(f"Start running CMake commands:\n{cmake_cmd}") ++ try: ++ result = subprocess.run(cmake_cmd, cwd=ROOT_DIR, text=True, shell=True, check=True) ++ except subprocess.CalledProcessError as e: ++ raise RuntimeError(f"Failed to build {so_name}: {e}") ++ ++ build_lib_dir = self.get_ext_fullpath(ext.name) ++ os.makedirs(os.path.dirname(build_lib_dir), exist_ok=True) ++ ++ package_name = ext.name.split('.')[0] # e.g., 'lmcache' ++ src_dir = os.path.join(ROOT_DIR, package_name) ++ ++ for root, _, files in os.walk(install_path): ++ for file in files: ++ if file.endswith(".so"): ++ src_path = os.path.join(root, file) ++ dst_path = os.path.join(os.path.dirname(build_lib_dir), file) ++ if os.path.exists(dst_path): ++ os.remove(dst_path) ++ ++ if isinstance(self.distribution.get_command_obj("develop"), develop): ++ # For the ascend kernels ++ src_dir_file = os.path.join(src_dir, file) ++ shutil.copy(src_path, src_dir_file) ++ shutil.copy(src_path, dst_path) ++ ++ logger.info(f"Copied {file} to {dst_path}") ++ ++ ++ + def hipify_wrapper() -> None: + # Third Party + from torch.utils.hipify.hipify_python import hipify +@@ -134,6 +293,12 @@ def rocm_extension() -> tuple[list, dict]: + return ext_modules, cmdclass + + ++def ascend_extension(): ++ print("Building Ascend extensions") ++ return [CMakeExtension(name="lmcache.c_ops")], \ ++ {"build_ext": CustomAscendCmakeBuildExt} ++ ++ + def source_dist_extension() -> tuple[list, dict]: + print("Not building CUDA/HIP extensions for sdist") + return [], {} +@@ -142,11 +307,14 @@ def source_dist_extension() -> tuple[list, dict]: + if __name__ == "__main__": + if BUILDING_SDIST: + get_extension = source_dist_extension +- elif BUILD_WITH_HIP: ++ TARGET_DEVICE = "EMPTY" ++ elif TARGET_DEVICE == "HIP": + get_extension = rocm_extension +- else: ++ elif TARGET_DEVICE == "CUDA": + get_extension = cuda_extension +- ++ elif TARGET_DEVICE == "ASCEND": ++ get_extension = ascend_extension ++ + ext_modules, cmdclass = get_extension() + + setup( +diff --git a/tests/conftest.py b/tests/conftest.py +index e7aafc8..c6f9805 100644 +--- a/tests/conftest.py ++++ b/tests/conftest.py +@@ -247,3 +247,18 @@ def autorelease_v1(request): + # Cleanup all objects created by the factory + # for obj in objects: + # obj.close() ++ ++@pytest.fixture(scope="module") ++def has_npu(request): ++ try: ++ import torch_npu ++ from torch_npu.contrib import transfer_to_npu ++ return True ++ except ImportError as ie: ++ return False ++ ++@pytest.fixture(scope="module") ++def skip_if_npu(has_npu): ++ if has_npu: ++ pytest.skip("Skipped. 
NPU does not support current function.") ++ return has_npu +\ No newline at end of file +diff --git a/tests/v1/storage_backend/test_local_cpu_backend.py b/tests/v1/storage_backend/test_local_cpu_backend.py +index 6c7fbb1..061b277 100644 +--- a/tests/v1/storage_backend/test_local_cpu_backend.py ++++ b/tests/v1/storage_backend/test_local_cpu_backend.py +@@ -64,9 +64,10 @@ def create_test_memory_obj(shape=(2, 16, 8, 128), dtype=torch.bfloat16) -> Memor + + + @pytest.fixture +-def memory_allocator(): ++def memory_allocator(has_npu): + """Create a memory allocator for testing.""" +- return MixedMemoryAllocator(1024 * 1024 * 1024) # 1GB ++ device = "npu" if has_npu else "cuda" ++ return MixedMemoryAllocator(1024 * 1024 * 1024, device=device) # 1GB + + + @pytest.fixture +@@ -589,4 +590,4 @@ class TestLocalCPUBackend: + # The accessed key should be at the end (MRU) + assert retrieved_keys[-1] == keys[0] + +- local_cpu_backend.memory_allocator.close() ++ local_cpu_backend.memory_allocator.close() +\ No newline at end of file +diff --git a/tests/v1/storage_backend/test_local_disk_backend.py b/tests/v1/storage_backend/test_local_disk_backend.py +index 197c195..77474da 100644 +--- a/tests/v1/storage_backend/test_local_disk_backend.py ++++ b/tests/v1/storage_backend/test_local_disk_backend.py +@@ -619,4 +619,4 @@ class TestLocalDiskBackend: + # The backend should still be in a consistent state + assert local_disk_backend.contains(key) + +- local_disk_backend.local_cpu_backend.memory_allocator.close() ++ local_disk_backend.local_cpu_backend.memory_allocator.close() +\ No newline at end of file +diff --git a/tests/v1/test_cache_engine.py b/tests/v1/test_cache_engine.py +index ba9721f..0827e85 100644 +--- a/tests/v1/test_cache_engine.py ++++ b/tests/v1/test_cache_engine.py +@@ -19,10 +19,11 @@ import torch + # First Party + from lmcache.v1.cache_engine import LMCacheEngineBuilder + from lmcache.v1.config import LMCacheEngineConfig ++import lmcache.c_ops as lmc_ops + + +-def test_paged_same_retrieve_store(autorelease_v1): +- device = "cuda" ++def test_paged_same_retrieve_store(autorelease_v1, has_npu): ++ device = "npu" if has_npu else "cuda" + fmt = "vllm" + num_tokens = 2000 + num_blocks = 1000 +@@ -57,9 +58,10 @@ def test_paged_same_retrieve_store(autorelease_v1): + + engine = autorelease_v1( + LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) ++ + """ test retrieve empty """ + ret_mask = engine.retrieve( + tokens, kvcaches=retrieved_cache, slot_mapping=slot_mapping +@@ -93,7 +95,7 @@ def test_paged_same_retrieve_store(autorelease_v1): + @pytest.mark.parametrize("backend", ["cpu", "local_disk", "remote", "remote_cachegen"]) + @pytest.mark.parametrize("lmserver_v1_process", ["cpu"], indirect=True) + def test_paged_retrieve_prefix( +- fmt, chunk_size, backend, lmserver_v1_process, autorelease_v1 ++ fmt, chunk_size, backend, lmserver_v1_process, autorelease_v1, has_npu + ): + url = None + remote_serde = None +@@ -101,12 +103,14 @@ def test_paged_retrieve_prefix( + if "remote" in backend: + url = lmserver_v1_process.server_url + if backend == "remote_cachegen": ++ if has_npu: ++ pytest.skip("NPU backend: Not implemented for cachegen") + backend = "remote" + remote_serde = "cachegen" + check_equality = False + else: + remote_serde = "naive" +- device = "cuda" ++ device = "npu" if has_npu else "cuda" + num_tokens = 2000 + new_num_tokens = 1000 + kv_shape = (32, 2, chunk_size, 8, 128) +@@ 
-139,9 +143,10 @@ def test_paged_retrieve_prefix( + + engine = autorelease_v1( + LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) ++ + """ test store """ + t1 = time.perf_counter() + engine.store(tokens, kvcaches=kv_cache, slot_mapping=slot_mapping) +@@ -199,12 +204,12 @@ def test_paged_retrieve_prefix( + ) + @pytest.mark.parametrize("lmserver_v1_process", ["cpu"], indirect=True) + def test_paged_store_offset( +- fmt, chunk_size, backend, lmserver_v1_process, autorelease_v1 ++ fmt, chunk_size, backend, lmserver_v1_process, autorelease_v1, has_npu + ): + url = None + if backend == "remote": + url = lmserver_v1_process.server_url +- device = "cuda" ++ device = "npu" if has_npu else "cuda" + num_tokens = 2000 + num_suffix_tokens = 500 + num_total_tokens = 3000 +@@ -231,9 +236,10 @@ def test_paged_store_offset( + + engine = autorelease_v1( + LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) ++ + """ test store """ + engine.store( + tokens[:num_tokens], +@@ -295,8 +301,8 @@ def test_paged_store_offset( + "local_disk" + ], + ) +-def test_paged_mixed_retrieve(fmt, chunk_size, backend, autorelease_v1): +- device = "cuda" ++def test_paged_mixed_retrieve(fmt, chunk_size, backend, autorelease_v1, has_npu): ++ device = "npu" if has_npu else "cuda" + num_tokens = 2000 + new_num_tokens = 1000 + num_blocks = 1000 +@@ -327,9 +333,10 @@ def test_paged_mixed_retrieve(fmt, chunk_size, backend, autorelease_v1): + + engine = autorelease_v1( + LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) ++ + """ test store """ + engine.store(tokens, kvcaches=kv_cache, slot_mapping=slot_mapping) + engine.store(new_tokens, kvcaches=kv_cache, slot_mapping=new_slot_mapping) +@@ -421,8 +428,8 @@ def test_paged_mixed_retrieve(fmt, chunk_size, backend, autorelease_v1): + + + @pytest.mark.parametrize("fmt", ["vllm"]) +-def test_paged_store_kv_tensors_mask(fmt, autorelease_v1): +- device = "cuda" ++def test_paged_store_kv_tensors_mask(fmt, autorelease_v1, has_npu): ++ device = "npu" if has_npu else "cuda" + num_tokens = 1000 + new_num_tokens = 2000 + num_blocks = 1000 +@@ -452,9 +459,10 @@ def test_paged_store_kv_tensors_mask(fmt, autorelease_v1): + + engine = autorelease_v1( + LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) ++ + """ Store some tokens with mask """ + engine.store(tokens, kvcaches=kv_cache, slot_mapping=slot_mapping) + """Wait until store finishes""" +@@ -570,12 +578,12 @@ def test_paged_store_kv_tensors_mask(fmt, autorelease_v1): + ) + @pytest.mark.parametrize("lmserver_v1_process", ["cpu"], indirect=True) + def test_paged_hierarchy_retrieve( +- fmt, chunk_size, backend, retrieve_from, lmserver_v1_process, autorelease_v1 ++ fmt, chunk_size, backend, retrieve_from, lmserver_v1_process, autorelease_v1, has_npu + ): + url = None + if backend == "local_cpu_disk_remote": + url = lmserver_v1_process.server_url +- device = "cuda" ++ device = "npu" if has_npu else "cuda" + num_tokens = 2000 + new_num_tokens = 1000 + kv_shape = (32, 2, chunk_size, 8, 128) +@@ -609,9 +617,10 @@ def test_paged_hierarchy_retrieve( + + engine = autorelease_v1( + 
LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) ++ + """ test store """ + t1 = time.perf_counter() + engine.store(tokens, kvcaches=kv_cache, slot_mapping=slot_mapping) +@@ -692,8 +701,8 @@ def test_paged_hierarchy_retrieve( + "local_disk", + ], + ) +-def test_paged_prefetch_retrieve(backend, prefetch_from, autorelease_v1): +- device = "cuda" ++def test_paged_prefetch_retrieve(backend, prefetch_from, autorelease_v1, has_npu): ++ device = "npu" if has_npu else "cuda" + num_tokens = 2000 + new_num_tokens = 1000 + num_blocks = 1000 +@@ -726,7 +735,7 @@ def test_paged_prefetch_retrieve(backend, prefetch_from, autorelease_v1): + + engine = autorelease_v1( + LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) + """ test store """ +@@ -804,12 +813,12 @@ def test_paged_prefetch_retrieve(backend, prefetch_from, autorelease_v1): + ], + ) + @pytest.mark.parametrize("lmserver_v1_process", ["cpu"], indirect=True) +-def test_paged_mem_leak(fmt, chunk_size, backend, lmserver_v1_process, autorelease_v1): ++def test_paged_mem_leak(fmt, chunk_size, backend, lmserver_v1_process, autorelease_v1, has_npu): + url = None + if "remote" in backend: + url = lmserver_v1_process.server_url + +- device = "cuda" ++ device = "npu" if has_npu else "cuda" + num_tokens = 2000 + kv_shape = (32, 2, chunk_size, 8, 128) + num_blocks = 1000 +@@ -830,7 +839,7 @@ def test_paged_mem_leak(fmt, chunk_size, backend, lmserver_v1_process, autorelea + + engine = autorelease_v1( + LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) + +@@ -872,20 +881,20 @@ def test_paged_mem_leak(fmt, chunk_size, backend, lmserver_v1_process, autorelea + LMCacheEngineBuilder.destroy("test") + + +-def test_builder(autorelease_v1): ++def test_builder(autorelease_v1, has_npu): + instance_id = "test" + cfg = LMCacheEngineConfig.from_legacy(chunk_size=256) + cfg2 = LMCacheEngineConfig.from_legacy(chunk_size=512) + connector = None + should_be_none = LMCacheEngineBuilder.get(instance_id) + assert should_be_none is None +- ++ device = "npu" if has_npu else "cuda" + _engine = autorelease_v1( +- LMCacheEngineBuilder.get_or_create(instance_id, cfg, dumb_metadata(), connector) ++ LMCacheEngineBuilder.get_or_create(instance_id, cfg, dumb_metadata(), connector, device=device) + ) + _engine2 = autorelease_v1(LMCacheEngineBuilder.get(instance_id)) # noqa + + with pytest.raises(ValueError): + LMCacheEngineBuilder.get_or_create( +- instance_id, cfg2, dumb_metadata(), connector ++ instance_id, cfg2, dumb_metadata(), connector, device=device + ) +diff --git a/tests/v1/test_connector.py b/tests/v1/test_connector.py +index 9dbfe72..3e538e6 100644 +--- a/tests/v1/test_connector.py ++++ b/tests/v1/test_connector.py +@@ -16,7 +16,7 @@ import torch + # First Party + from lmcache.v1.memory_management import PinMemoryAllocator + from lmcache.v1.storage_backend.connector import CreateConnector +- ++import lmcache.c_ops as lmc_ops + + @pytest.mark.parametrize("lmserver_v1_process", ["cpu"], indirect=True) + @pytest.mark.parametrize( +@@ -25,12 +25,14 @@ from lmcache.v1.storage_backend.connector import CreateConnector + "lm://localhost:65000", + ], + ) +-def test_lm_connector(url, autorelease_v1, lmserver_v1_process): ++def 
test_lm_connector(url, autorelease_v1, lmserver_v1_process, has_npu): + if url.startswith("lm"): + url = lmserver_v1_process.server_url + + async_loop, async_thread = init_asyncio_loop() +- memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024) ++ device = "npu" if has_npu else "cuda" ++ memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024, device=device) ++ + connector = autorelease_v1(CreateConnector(url, async_loop, memory_allocator)) + + random_key = dumb_cache_engine_key() +@@ -70,14 +72,15 @@ def test_lm_connector(url, autorelease_v1, lmserver_v1_process): + + + @pytest.mark.parametrize("lmserver_v1_process", ["cpu"], indirect=True) +-def test_fs_connector(lmserver_v1_process, autorelease_v1): ++def test_fs_connector(lmserver_v1_process, autorelease_v1, has_npu): + """Test filesystem connector: exists, put, get, list, and file store.""" + + with tempfile.TemporaryDirectory() as temp_dir: + # Setup + url = f"fs://host:0/{temp_dir}/" + async_loop, async_thread = init_asyncio_loop() +- memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024) ++ device = "npu" if has_npu else "cuda" ++ memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024, device=device) + connector = autorelease_v1(CreateConnector(url, async_loop, memory_allocator)) + random_key = dumb_cache_engine_key() + +@@ -138,7 +141,7 @@ def test_fs_connector(lmserver_v1_process, autorelease_v1): + "unix:///tmp/redis.sock", + ], + ) +-def test_redis_connector(url, autorelease_v1): ++def test_redis_connector(url, autorelease_v1, has_npu): + """Test Redis connector: exists, put, get operations. + + This test uses the MockRedis from conftest.py to simulate +@@ -146,7 +149,8 @@ def test_redis_connector(url, autorelease_v1): + """ + + async_loop, async_thread = init_asyncio_loop() +- memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024) ++ device = "npu" if has_npu else "cuda" ++ memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024, device=device) + connector = autorelease_v1(CreateConnector(url, async_loop, memory_allocator)) + + random_key = dumb_cache_engine_key() +@@ -199,7 +203,7 @@ def test_redis_connector(url, autorelease_v1): + "redis-sentinel://localhost:26379", + ], + ) +-def test_redis_sentinel_connector(url, autorelease_v1): ++def test_redis_sentinel_connector(url, autorelease_v1, has_npu): + """Test Redis Sentinel connector: exists, put, get operations. + + This test uses the MockRedisSentinel from conftest.py to simulate +@@ -213,7 +217,8 @@ def test_redis_sentinel_connector(url, autorelease_v1): + os.environ["REDIS_TIMEOUT"] = "5" + + async_loop, async_thread = init_asyncio_loop() +- memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024) ++ device = "npu" if has_npu else "cuda" ++ memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024, device=device) + connector = autorelease_v1(CreateConnector(url, async_loop, memory_allocator)) + + random_key = dumb_cache_engine_key() +diff --git a/tests/v1/test_gds.py b/tests/v1/test_gds.py +index 5fda653..ee0c093 100644 +--- a/tests/v1/test_gds.py ++++ b/tests/v1/test_gds.py +@@ -20,7 +20,7 @@ from lmcache.v1.storage_backend import CreateStorageBackends + from lmcache.v1.storage_backend.gds_backend import pack_metadata, unpack_metadata + + +-def test_gds_backend_metadata(): ++def test_gds_backend_metadata(skip_if_npu): + # This is a sanity check that packing and unpacking works. We can add + # more tensor types to be sure. 
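++    # torch.randn(3, 10) defaults to float32, so 3 * 10 elements * 4 bytes each = 120 expected bytes.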
+     for [tensor, expected_nbytes] in [(torch.randn(3, 10), 120)]:
+@@ -46,7 +46,7 @@ def test_gds_backend_metadata():
+
+
+ @pytest.mark.skip(reason="We need to add this test back after implementing prefetch")
+-def test_gds_backend_sanity():
++def test_gds_backend_sanity(skip_if_npu):
+     BASE_DIR = Path(__file__).parent
+     GDS_DIR = "/tmp/gds/test-cache"
+     TEST_KEY = CacheEngineKey(
+diff --git a/tests/v1/test_gpu_connector.py b/tests/v1/test_gpu_connector.py
+index 8fcac95..6ab3bfa 100644
+--- a/tests/v1/test_gpu_connector.py
++++ b/tests/v1/test_gpu_connector.py
+@@ -25,10 +25,9 @@ from lmcache.v1.memory_management import (
+     PinMemoryAllocator,
+ )
+
+-
+ @pytest.mark.parametrize("use_gpu", [True, False])
+ @pytest.mark.parametrize("use_mla", [True, False])
+-def test_vllm_paged_connector_v2_with_gpu_and_mla(use_gpu, use_mla):
++def test_vllm_paged_connector_v2_with_gpu_and_mla(use_gpu, use_mla, has_npu):
+     num_blocks = 100
+     block_size = 16
+     num_layers = 32
+@@ -39,8 +38,8 @@ def test_vllm_paged_connector_v2_with_gpu_and_mla(use_gpu, use_mla):
+
+     num_tokens = 800
+     chunk_size = 256
+-
+-    allocator = PinMemoryAllocator(1024 * 1024 * 1024)
++    device = "npu" if has_npu else "cuda"
++    allocator = PinMemoryAllocator(1024 * 1024 * 1024, device=device)
+
+     gpu_kv_src = generate_kv_cache_paged_list_tensors(
+         num_blocks=num_blocks, device=device, block_size=block_size, use_mla=use_mla
+     )
+@@ -122,7 +121,7 @@ def test_vllm_paged_connector_v2_with_gpu_and_mla(use_gpu, use_mla):
+
+
+ @pytest.mark.parametrize("use_gpu", [True])
+-def test_layerwise_vllm_paged_connector_with_gpu(use_gpu):
++def test_layerwise_vllm_paged_connector_with_gpu(use_gpu, has_npu):
+     num_blocks = 100
+     block_size = 16
+     num_layers = 32
+@@ -133,8 +132,8 @@ def test_layerwise_vllm_paged_connector_with_gpu(use_gpu):
+
+     num_tokens = 800
+     chunk_size = 256
+-
+-    allocator = PinMemoryAllocator(1024 * 1024 * 1024)
++    device = "npu" if has_npu else "cuda"
++    allocator = PinMemoryAllocator(1024 * 1024 * 1024, device=device)
+
+     gpu_kv_src = generate_kv_cache_paged_list_tensors(num_blocks, device, block_size)
+     gpu_kv_dst = generate_kv_cache_paged_list_tensors(num_blocks, device, block_size)
+@@ -222,7 +221,7 @@ def test_layerwise_vllm_paged_connector_with_gpu(use_gpu):
+
+
+ @pytest.mark.parametrize("use_gpu", [True])
+-def test_batched_layerwise_vllm_paged_connector_with_gpu(use_gpu):
++def test_batched_layerwise_vllm_paged_connector_with_gpu(use_gpu, has_npu):
+     num_blocks = 100
+     block_size = 16
+     num_layers = 32
+@@ -235,8 +234,8 @@ def test_batched_layerwise_vllm_paged_connector_with_gpu(use_gpu):
+
+     num_tokens_1 = 500
+     num_tokens_2 = 500
+     num_tokens_total = num_tokens_1 + num_tokens_2
+     chunk_size = 256
+-
+-    allocator = PinMemoryAllocator(1024 * 1024 * 1024)
++    device = "npu" if has_npu else "cuda"
++    allocator = PinMemoryAllocator(1024 * 1024 * 1024, device=device)
+
+     gpu_kv_src = generate_kv_cache_paged_list_tensors(num_blocks, device, block_size)
+     gpu_kv_dst = generate_kv_cache_paged_list_tensors(num_blocks, device, block_size)
+@@ -540,7 +539,7 @@ def test_vllm_paged_connector_v2_to_gpu_bench(benchmark):
+
+ @pytest.mark.parametrize("use_gpu", [True, False])
+ @pytest.mark.parametrize("use_mla", [True, False])
+-def test_sglang_connector_with_gpu_and_mla(use_gpu, use_mla):
++def test_sglang_connector_with_gpu_and_mla(use_gpu, use_mla, skip_if_npu):
+     num_blocks = 100
+     block_size = 16
+     num_layers = 32
+diff --git a/tests/v1/test_mem_kernels.py b/tests/v1/test_mem_kernels.py
+index 2cabd67..94440df 100644
+--- a/tests/v1/test_mem_kernels.py
++++ b/tests/v1/test_mem_kernels.py
+@@ -13,6 +13,7 @@ from utils import (
+ import pytest
+ import torch
+
++
+ # First Party
+ from lmcache.v1.memory_management import PinMemoryAllocator
+ import lmcache.c_ops as lmc_ops
+@@ -54,7 +55,7 @@ def _slice_kv_at(
+
+
+ @pytest.mark.parametrize("num_tokens", [256, 500, 1024, 8000])
+-def test_extract_and_load_back(num_tokens):
++def test_extract_and_load_back(num_tokens, has_npu):
+     device = "cuda"
+
+     num_blocks = 1000
+@@ -68,7 +69,8 @@ def test_extract_and_load_back(num_tokens):
+     slot_mapping = torch.tensor(slot_mapping, device=device)
+
+     pinned_cpu_size = 4 * 1024 * 1024 * 1024  # 4GB
+-    mem_allocator = PinMemoryAllocator(pinned_cpu_size)
++    device = "npu" if has_npu else "cuda"
++    mem_allocator = PinMemoryAllocator(pinned_cpu_size, device=device)
+
+     # Old extract
+     kv_tuple_list = []
+@@ -154,7 +156,7 @@ def test_extract_and_load_back(num_tokens):
+
+
+ @pytest.mark.parametrize("num_tokens", [256, 500, 1024, 8000])
+-def test_multi_layer_kernel(num_tokens):
++def test_multi_layer_kernel(num_tokens, has_npu):
+     device = "cuda"
+
+     num_blocks = 1000
+@@ -172,7 +174,8 @@ def test_multi_layer_kernel(num_tokens):
+     slot_mapping = torch.tensor(slot_mapping, device=device)
+
+     pinned_cpu_size = 4 * 1024 * 1024 * 1024  # 4GB
+-    mem_allocator = PinMemoryAllocator(pinned_cpu_size)
++    device = "npu" if has_npu else "cuda"
++    mem_allocator = PinMemoryAllocator(pinned_cpu_size, device=device)
+
+     # lmc_ops.multi_layer_kv_transfer(memory_obj_new.tensor,
+     #                                 kv_cache_pointers, # TODO: initialize this
+@@ -209,8 +212,12 @@ def test_multi_layer_kernel(num_tokens):
+     kv_cache_pointers = torch.empty(
+         32, dtype=torch.int64, device="cpu", pin_memory=True
+     )
++
+     for i in range(32):
+         kv_cache_pointers[i] = kv_cache[i].data_ptr()
++
++    if has_npu:
++        kv_cache_pointers = kv_cache_pointers.to(device)
+
+     memory_obj_new_list = []
+     start_event = torch.cuda.Event(enable_timing=True)
+@@ -253,6 +260,9 @@ def test_multi_layer_kernel(num_tokens):
+     )
+     for i in range(32):
+         kv_cache_pointers_new[i] = kv_cache_new[i].data_ptr()
++
++    if has_npu:
++        kv_cache_pointers_new = kv_cache_pointers_new.to(device)
+
+     for chunk_id, slot_mapping_temp in enumerate(slot_mapping_chunked):
+         memory_obj_new = memory_obj_new_list[chunk_id]
+@@ -276,7 +286,7 @@ def test_multi_layer_kernel(num_tokens):
+
+
+ @pytest.mark.parametrize("num_tokens", [256, 500, 1024, 8000])
+-def test_multi_layer_kernel_use_mla(num_tokens):
++def test_multi_layer_kernel_use_mla(num_tokens, has_npu):
+     device = "cuda"
+
+     num_blocks = 1000
+@@ -293,8 +303,9 @@ def test_multi_layer_kernel_use_mla(num_tokens):
+     slot_mapping = torch.tensor(slot_mapping, device=device)
+
+     pinned_cpu_size = 4 * 1024 * 1024 * 1024  # 4GB
+-    mem_allocator = PinMemoryAllocator(pinned_cpu_size)
+-
++    device = "npu" if has_npu else "cuda"
++    mem_allocator = PinMemoryAllocator(pinned_cpu_size, device=device)
++
+     # layer by layer extract
+     memory_obj_old_list = []
+     start_event = torch.cuda.Event(enable_timing=True)
+@@ -329,6 +340,8 @@ def test_multi_layer_kernel_use_mla(num_tokens):
+     )
+     for i in range(num_layers):
+         kv_cache_pointers[i] = kv_cache[i].data_ptr()
++    if has_npu:
++        kv_cache_pointers = kv_cache_pointers.to(device)
+
+     memory_obj_new_list = []
+     start_event = torch.cuda.Event(enable_timing=True)
+@@ -377,6 +390,9 @@ def test_multi_layer_kernel_use_mla(num_tokens):
+     )
+     for i in range(num_layers):
+         kv_cache_pointers_new[i] = kv_cache_new[i].data_ptr()
++
++    if has_npu:
++        kv_cache_pointers_new = kv_cache_pointers_new.to(device)
+
+     for chunk_id, slot_mapping_temp in enumerate(slot_mapping_chunked):
+         memory_obj_new = memory_obj_new_list[chunk_id]
+diff --git a/tests/v1/test_memory_management.py b/tests/v1/test_memory_management.py
+index 2f70517..660e74f 100644
+--- a/tests/v1/test_memory_management.py
++++ b/tests/v1/test_memory_management.py
+@@ -142,15 +142,15 @@ def test_tensor_allocator(use_paging):
+         True,
+     ],
+ )
+-def test_device_allocators(alloc_cls, use_paging):
++def test_device_allocators(alloc_cls, use_paging, has_npu):
+     total_size = 1024 * 1024 * 128  # 128MB
+
+     shape = torch.Size([2, 32, 16, 1024])  # 64 pages
+     dtype = torch.bfloat16
+     fmt = MemoryFormat.KV_2LTD
+-
++    device = "npu" if has_npu else "cuda"
+     allocator = alloc_cls(
+-        total_size, use_paging=use_paging, shape=shape, dtype=dtype, fmt=fmt
++        total_size, use_paging=use_paging, shape=shape, dtype=dtype, fmt=fmt, device=device
+     )
+
+     if use_paging:
+@@ -171,9 +171,10 @@ def test_device_allocators(alloc_cls, use_paging):
+         MixedMemoryAllocator,
+     ],
+ )
+-def test_inplace_modification(alloc_cls):
++def test_inplace_modification(alloc_cls, has_npu):
+     total_size = 1024
+-    allocator = alloc_cls(total_size)
++    device = "npu" if has_npu else "cuda"
++    allocator = alloc_cls(total_size, device=device)
+
+     data = allocator.allocate([10], torch.float)
+     assert data is not None
+@@ -198,9 +199,10 @@ def test_inplace_modification(alloc_cls):
+         MixedMemoryAllocator,
+     ],
+ )
+-def test_boundary_alloc(alloc_cls):
++def test_boundary_alloc(alloc_cls, has_npu):
+     total_size = 1 << 25
+-    allocator = alloc_cls(total_size)
++    device = "npu" if has_npu else "cuda"
++    allocator = alloc_cls(total_size, device=device)
+     data1 = allocator.allocate([512, 10], torch.float)
+     allocator.allocate([512, 10], torch.float)
+     allocator.free(data1)
+@@ -225,10 +227,11 @@ def test_boundary_alloc(alloc_cls):
+         MixedMemoryAllocator,
+     ],
+ )
+-def test_batched_alloc(alloc_cls):
++def test_batched_alloc(alloc_cls, has_npu):
+     total_size = 32 * 100 * 2 * 1024 * 2
+     batch_size = 32
+-    allocator = alloc_cls(total_size)
++    device = "npu" if has_npu else "cuda"
++    allocator = alloc_cls(total_size, device=device)
+     objs = allocator.batched_allocate(
+         [100, 2, 1024], torch.bfloat16, batch_size, MemoryFormat.KV_T2D
+     )
+@@ -255,9 +258,10 @@ def test_batched_alloc(alloc_cls):
+         MixedMemoryAllocator,
+     ],
+ )
+-def test_mixed_alloc(alloc_cls):
++def test_mixed_alloc(alloc_cls, has_npu):
+     total_size = 1 << 25
+-    allocator = alloc_cls(total_size)
++    device = "npu" if has_npu else "cuda"
++    allocator = alloc_cls(total_size, device=device)
+     data1 = allocator.allocate([512, 0], None, MemoryFormat.BINARY_BUFFER)
+     allocator.allocate([512, 10], torch.float)
+     allocator.free(data1)
+diff --git a/tests/v1/test_weka.py b/tests/v1/test_weka.py
+index eab8e1b..7cf052b 100644
+--- a/tests/v1/test_weka.py
++++ b/tests/v1/test_weka.py
+@@ -18,7 +18,7 @@ from lmcache.v1.storage_backend import CreateStorageBackends
+
+
+ @pytest.mark.skip(reason="We need to add this test back after implementing prefetch")
+-def test_weka_backend_sanity():
++def test_weka_backend_sanity(skip_if_npu):
+     BASE_DIR = Path(__file__).parent
+     WEKA_DIR = "/tmp/weka/test-cache"
+     TEST_KEY = CacheEngineKey(
diff --git a/LMCache-0.1.4.alpha.tar.gz b/LMCache-0.1.4.alpha.tar.gz
deleted file mode 100644
index c242010c5335191bfa5298c757768528425427ec..0000000000000000000000000000000000000000
Binary files a/LMCache-0.1.4.alpha.tar.gz and /dev/null differ
diff --git a/LMCache.tar.gz b/LMCache.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..b90311279cb52f716ed1e852c87f098057c755ec
Binary files /dev/null and b/LMCache.tar.gz differ
diff --git a/fix-dependance-issue-on-ascend.patch b/fix-dependance-issue-on-ascend.patch
deleted file mode 100644
index 05a158638d383324086c991815ebf5b095df5c68..0000000000000000000000000000000000000000
--- a/fix-dependance-issue-on-ascend.patch
+++ /dev/null
@@ -1,13 +0,0 @@
-diff --git a/setup.py b/setup.py
-index 5b1aa4b..1895953 100644
---- a/setup.py
-+++ b/setup.py
-@@ -17,7 +17,7 @@ setup(
-         "nvtx",
-         "safetensors",
-         "transformers",
--        "torchac_cuda >= 0.2.5",
-+        # "torchac_cuda >= 0.2.5",
-     ],
-     classifiers=[
-         # Trove classifiers
diff --git a/lmcache.spec b/lmcache.spec
index c1dc836e6d611c2048e6a0ea79c5db825bb39c0a..673cd34593459d79286583f081ce1f684c6a810a 100644
--- a/lmcache.spec
+++ b/lmcache.spec
@@ -1,41 +1,44 @@
 %define debug_package %{nil}
-Name: python-LMCache
-Version: 0.1.4.alpha
+Name: LMCache
+Version: 0.3.1.post1
 Release: 1
 Summary: LMCache - A lightweight memory cache system
 
 License: Apache 2.0
 URL: https://github.com/LMCache/LMCache
-Source0: LMCache-%{version}.tar.gz
+Source0: LMCache.tar.gz
 
-Patch0: fix-dependance-issue-on-ascend.patch
-
-BuildArch: noarch
+Patch0: 0001-npu-picked-support.patch
+BuildArch: aarch64
 
 BuildRequires: python3
 BuildRequires: python3-setuptools python3-pip
-
 Requires: python3
+
 %description
 LMCache is a lightweight, high-performance memory cache system
 designed for simple key-value storage with minimal overhead.
 
 %prep
-%autosetup -n %{name}-%{version} -p1 -Sgit
+%autosetup -n %{name} -p1 -Sgit
 
 %build
-ln -s /usr/bin/python3 /usr/bin/python || :
+# ln -s /usr/bin/python3 /usr/bin/python || :
+
+export LMCACHE_TARGET_DEVICE="ASCEND"
+export PYTHONPATH=/usr/local/lib64/python3.11/site-packages:$PYTHONPATH
+
 %py3_build
 
 %install
 %py3_install
 
 %files
-%{python3_sitelib}/*
+/usr/lib64/python3.11/site-packages/*
 %{_bindir}/*
 
-%changelog
-* Tue Jun 24 2025 jingjunyuan - 0.1.4.alpha-1
-- Initial package build
+%changelog
+* Tue Jul 29 2025 gingfung.matthew.yeung - 0.3.1.post1-1
+- Initial package build
\ No newline at end of file