diff --git a/0001-npu-picked-support.patch b/0001-npu-picked-support.patch new file mode 100644 index 0000000000000000000000000000000000000000..e22bb123bb9a670db883f53e2ba388a29aecd63e --- /dev/null +++ b/0001-npu-picked-support.patch @@ -0,0 +1,3380 @@ +diff --git a/csrc/ascend/CMakeLists.txt b/csrc/ascend/CMakeLists.txt +new file mode 100644 +index 0000000..b434723 +--- /dev/null ++++ b/csrc/ascend/CMakeLists.txt +@@ -0,0 +1,130 @@ ++# Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. ++ ++# CMake lowest version requirement ++cmake_minimum_required(VERSION 3.16.0) ++# project information ++project(c_ops) ++ ++set(CMAKE_CXX_STANDARD 17) ++include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake) ++ ++find_package(Python3 COMPONENTS Interpreter Development REQUIRED) ++set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11") ++find_package(pybind11 REQUIRED) ++ ++append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") ++set(LMC_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}") ++ ++set(SOC_VERSION ${SOC_VERSION}) ++set(ARCH ${ARCH}) ++ ++if (NOT CMAKE_BUILD_TYPE) ++ set(CMAKE_BUILD_TYPE "Release" CACHE STRINGS "Build type Release/Debug (default Release)" FORCE) ++endif() ++ ++set(ASCEND_HOME_PATH ${ASCEND_CANN_PACKAGE_PATH}) ++if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) ++ set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) ++elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) ++ set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) ++elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) ++ set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) ++else() ++ message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the cann package is installed.") ++endif() ++ ++include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) ++ ++# ${KERNEL_FILES} are used to compile library, push files written by ascendc in ${KERNEL_FILES}. 
++# ref to cmake/npu.cmake ascendc_library, cmake/cpu.cmake add_library ++file(GLOB KERNEL_FILES ++${CMAKE_CURRENT_SOURCE_DIR}/kernels/*.cpp) ++ ++message(STATUS "kernel files: ${KERNEL_FILES}") ++ ++# ascendc_library use to add kernel file to generate ascendc library ++ascendc_library(ascend_kernels SHARED ++ ${KERNEL_FILES} ++) ++ ++message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}") ++ ++file(GLOB SRC_FILES ++${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) ++ ++set(PYBIND11_SOURCE_FILE ++${CMAKE_CURRENT_SOURCE_DIR}/mem_kernels.cpp ++${CMAKE_CURRENT_SOURCE_DIR}/managed_mem.cpp ++${CMAKE_CURRENT_SOURCE_DIR}/../pybind.cpp ++) ++ ++find_package(Torch REQUIRED) ++ ++include_directories( ++ ${CMAKE_CURRENT_SOURCE_DIR} ++ ${pybind11_INCLUDE_DIRS} ++ ${PYTHON_INCLUDE_PATH} ++ ${TORCH_INCLUDE_DIRS} ++ ${TORCH_NPU_PATH}/include ++ ${ASCEND_HOME_PATH}/include ++ ${ASCEND_HOME_PATH}/aarch64-linux/ascendc/include ++ ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform ++ ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/ascend_hal ++ ${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform ++ ${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/ascend_hal ++) ++ ++ ++set( ++ INCLUDES ++ ${TORCH_INCLUDE_DIRS} ++ ${TORCH_NPU_PATH}/include ++ ${ASCEND_HOME_PATH}/include ++ ${ASCEND_HOME_PATH}/aarch64-linux/ascendc/include ++ ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform ++ ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/ascend_hal ++) ++ ++set(PYMODULE_FILES ++ ${SRC_FILES} ++ ${PYBIND11_SOURCE_FILE} ++) ++ ++pybind11_add_module(c_ops ${PYMODULE_FILES}) ++ ++message(STATUS "CMake: Adding -DUSE_ASCEND compile definition.") ++target_compile_definitions(c_ops PRIVATE USE_ASCEND) ++ ++set(TORCH_NPU_LIBS_DIR "${TORCH_NPU_PATH}/lib") ++set(ASCEND_CANN_LIBS_DIR "${ASCEND_HOME_PATH}/lib64") ++set(TORCH_LIBS_DIR "${TORCH_PATH}/lib") ++ ++ ++target_link_options(c_ops PRIVATE ++ "-Wl,-rpath,$ORIGIN:$ORIGIN/lib" ++ "-Wl,-rpath,${LMC_INSTALL_PATH}" ++) ++ ++target_link_directories( ++ c_ops ++ PRIVATE ++ ${TORCH_LIBS_DIR} ++ ${TORCH_NPU_PATH}/lib/ ++ ${ASCEND_HOME_PATH}/lib64 ++ ${ASCEND_DRIVER_PATH}/lib64/driver ++) ++ ++target_link_libraries( ++ c_ops ++ PUBLIC ++ ${TORCH_LIBRARIES} ++ libtorch_npu.so ++ ascend_kernels ++ ascendcl ++ platform ++ ascend_hal ++ tiling_api ++) ++ ++ ++install(TARGETS c_ops ascend_kernels DESTINATION ${LMC_INSTALL_PATH}) +diff --git a/csrc/ascend/cachegen_kernels.cpp b/csrc/ascend/cachegen_kernels.cpp +new file mode 100644 +index 0000000..fdf865f +--- /dev/null ++++ b/csrc/ascend/cachegen_kernels.cpp +@@ -0,0 +1,32 @@ ++#include "cachegen_kernels.h" ++#include ++#include ++ ++namespace py = pybind11; ++ ++void encode_cuda_new(const at::Tensor& cdf, const at::Tensor& input_sym, ++ at::Tensor& output_buffer, at::Tensor& output_lengths) { ++ // TODO: ++ PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend."); ++ throw py::error_already_set(); ++}; ++ ++void decode_cuda_new(const at::Tensor& cdf, const at::Tensor& bytestreams, ++ const at::Tensor& lengths, at::Tensor& output) { ++ // TODO: ++ PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend."); ++ throw py::error_already_set(); ++}; ++ ++void decode_cuda_prefsum(const at::Tensor& cdf, const at::Tensor& bytestreams, ++ const at::Tensor& lengths, at::Tensor& output) { ++ // TODO: ++ PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend."); ++ throw py::error_already_set(); ++}; ++ ++at::Tensor calculate_cdf(const at::Tensor& input, const int max_bins) { ++ 
// TODO: ++ PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend."); ++ throw py::error_already_set(); ++}; +\ No newline at end of file +diff --git a/csrc/ascend/cachegen_kernels.h b/csrc/ascend/cachegen_kernels.h +new file mode 100644 +index 0000000..d1a1701 +--- /dev/null ++++ b/csrc/ascend/cachegen_kernels.h +@@ -0,0 +1,16 @@ ++#pragma once ++#include ++#include ++#include ++#include ++ ++void encode_cuda_new(const at::Tensor& cdf, const at::Tensor& input_sym, ++ at::Tensor& output_buffer, at::Tensor& output_lengths); ++ ++void decode_cuda_new(const at::Tensor& cdf, const at::Tensor& bytestreams, ++ const at::Tensor& lengths, at::Tensor& output); ++ ++void decode_cuda_prefsum(const at::Tensor& cdf, const at::Tensor& bytestreams, ++ const at::Tensor& lengths, at::Tensor& output); ++ ++at::Tensor calculate_cdf(const at::Tensor& input, const int max_bins); +\ No newline at end of file +diff --git a/csrc/ascend/kernels/load_and_reshape_flash.cpp b/csrc/ascend/kernels/load_and_reshape_flash.cpp +new file mode 100644 +index 0000000..7e04e6f +--- /dev/null ++++ b/csrc/ascend/kernels/load_and_reshape_flash.cpp +@@ -0,0 +1,264 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++#include "kernel_operator.h" ++#include ++#include "types.h" ++#include "utils.h" ++ ++template class LoadAndReshapeFlashCopy { ++ using local_scalar_t = AscendC::LocalTensor; ++ ++public: ++ __aicore__ inline LoadAndReshapeFlashCopy() ++ { ++ } ++ ++ __aicore__ inline void init(GM_ADDR cacheTensor, GM_ADDR keyCachePtr, GM_ADDR valueCachePtr, GM_ADDR slotmappings, ++ const int64_t numPages, const int64_t hiddenDims, const int32_t pagedSize, ++ const int32_t numTokens, const int32_t numLayers, const int32_t layerIdx, ++ const bool page2L, AscendC::TPipe *pipe) ++ { ++ this->pipe_ = pipe; ++ this->numPages_ = numPages; ++ this->hiddenDims_ = hiddenDims; ++ this->numTokens_ = numTokens; ++ this->pagedSize_ = pagedSize; ++ this->numLayers_ = numLayers; ++ this->layerIdx_ = layerIdx; ++ this->valid_ = true; ++ this->page2L_ = page2L; ++ ++ // TODO: Not sure how many to allocate, but let's do 4 blocks of hiddenDims_ ++ // if it was fp16, 2048, we would get 16kb.? ++ // should check whether hiddenDims_ is > 192KB. 
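++        // Worked example with illustrative numbers: for hiddenDims_ = 2048 and fp16
++        // (2 bytes per element), each queue block is 2048 * 2 = 4 KiB, so the 4 blocks
++        // requested below take 16 KiB of unified buffer, well under the 192 KiB noted above.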
++ this->pipe_->InitBuffer(this->pagedTokenQue_, 4, this->hiddenDims_*sizeof(scalar_t)); ++ } ++ ++ __aicore__ inline void reset(){ ++ this->valid_ = true; ++ } ++ ++ __aicore__ inline void updateTensorMemOffsetAndProcess(__gm__ uint8_t *pagedKeyTensor, ++ __gm__ uint8_t *pagedValueTensor, ++ __gm__ uint8_t* nonPagedTensor, ++ __gm__ uint8_t *slotmappings, const int tokenIdx) ++ { ++ __gm__ slot_t *slotmappingPtr = reinterpret_cast<__gm__ slot_t*>(slotmappings); ++ int64_t slot = static_cast(slotmappingPtr[tokenIdx]); ++ ++ if (slot == -1) { ++ this->valid_ = false; ++ return; ++ } ++ ++ // for the page tensor ++ int64_t pagedIdxOffset = slot * this->hiddenDims_; ++ ++ // for the lmc tensor ++ int64_t nonPagedKeyOffset = this->layerIdx_ * this->numTokens_ * this->hiddenDims_ + ++ tokenIdx * this->hiddenDims_; ++ ++ // values are stored after keys in the non-paged tensor ++ int64_t nonPagedValueOffset = this->numLayers_ * this->numTokens_ * this->hiddenDims_ + ++ this->layerIdx_ * this->numTokens_ * this->hiddenDims_ + ++ tokenIdx * this->hiddenDims_; ++ ++ // keys ++ this->keyTokensGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(pagedKeyTensor) + pagedIdxOffset, ++ this->hiddenDims_); ++ this->lmcBufferKeyGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(nonPagedTensor) + nonPagedKeyOffset, ++ this->hiddenDims_); ++ // values ++ this->valueTokensGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(pagedValueTensor) + pagedIdxOffset, ++ this->hiddenDims_); ++ this->lmcBufferValueGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(nonPagedTensor) + nonPagedValueOffset, ++ this->hiddenDims_); ++ } ++ ++ __aicore__ inline void processFunc() { ++ if (!this->valid_) { ++ return; ++ } ++ // 1. Alloc Tensor for local page ++ local_scalar_t hiddenKeysDimTensor = this->pagedTokenQue_.template AllocTensor(); ++ local_scalar_t hiddenValuesDimTensor = this->pagedTokenQue_.template AllocTensor();; ++ ++ // 2. copy from global tensor into local (GM -> UB) ++ if (this->page2L_) { ++ AscendC::DataCopy(hiddenKeysDimTensor, this->keyTokensGlobal_, this->hiddenDims_); ++ AscendC::DataCopy(hiddenValuesDimTensor, this->valueTokensGlobal_, this->hiddenDims_); ++ } else { ++ AscendC::DataCopy(hiddenKeysDimTensor, this->lmcBufferKeyGlobal_, this->hiddenDims_); ++ AscendC::DataCopy(hiddenValuesDimTensor, this->lmcBufferValueGlobal_, this->hiddenDims_); ++ } ++ ++ // 3. enque vecin ++ pagedTokenQue_.EnQue(hiddenKeysDimTensor); ++ pagedTokenQue_.EnQue(hiddenValuesDimTensor); ++ ++ // 4. deque vecin, possible to reuse due to QueBind ++ hiddenKeysDimTensor = pagedTokenQue_.DeQue(); ++ hiddenValuesDimTensor = pagedTokenQue_.DeQue(); ++ ++ // 5. datacopy into GM ++ if (this->page2L_) { ++ AscendC::DataCopy(this->lmcBufferKeyGlobal_, hiddenKeysDimTensor, this->hiddenDims_); ++ AscendC::DataCopy(this->lmcBufferValueGlobal_, hiddenValuesDimTensor, this->hiddenDims_); ++ } else { ++ AscendC::DataCopy(this->keyTokensGlobal_, hiddenKeysDimTensor, this->hiddenDims_); ++ AscendC::DataCopy(this->valueTokensGlobal_, hiddenValuesDimTensor, this->hiddenDims_); ++ } ++ // 6. free alloced Tensor ++ pagedTokenQue_.FreeTensor(hiddenKeysDimTensor); ++ pagedTokenQue_.FreeTensor(hiddenValuesDimTensor); ++ } ++ ++private: ++ AscendC::TPipe *pipe_; ++ AscendC::TQueBind pagedTokenQue_; ++ ++ // [numPages, pagedSize, heads*headsize] ++ AscendC::GlobalTensor keyTokensGlobal_; ++ AscendC::GlobalTensor valueTokensGlobal_; ++ ++ // Depends on LMC setting whether we store in tokensMajor or not. 
++ // the layout would be the followings: ++ // [tokens, kvs, heads*headsize] or [kvs, tokens, heads*headsize] ++ // TODO: check whether should combine the two and use a loop ++ AscendC::GlobalTensor lmcBufferKeyGlobal_; ++ AscendC::GlobalTensor lmcBufferValueGlobal_; ++ ++ int64_t numPages_; // num vllm npu blocks ++ int32_t pagedSize_; // per npu block tokens ++ int64_t hiddenDims_; // heads * headsize ++ int32_t numTokens_; // num tokens in the cache tensor chunk ++ int32_t numLayers_; // num layers in the cache tensor ++ int32_t layerIdx_; // layer idx in the cache tensor ++ bool valid_; ++ bool page2L_; // true, from pagedTensor to LMC, false otherwise ++}; ++ ++#define LOAD_AND_RESHAPE_FLASH_COPY_TYPE_DECLARE(TYPE, SLOTTYPE) \ ++ extern "C" __global__ __aicore__ void load_and_reshape_flash_copy_##TYPE##_##SLOTTYPE( \ ++ __gm__ uint8_t* dstCacheTensor, __gm__ uint8_t* keyCachePtr, __gm__ uint8_t* valueCachePtr, \ ++ __gm__ uint8_t* slotmappings, const int64_t hiddenDims, const int64_t numPages, const int32_t pagedSize, \ ++ const int32_t numTokens, const int32_t numLayers, const int32_t layerIdx, const bool page2L, \ ++ const int blockNum) \ ++ { \ ++ AscendC::TPipe pipe; \ ++ LoadAndReshapeFlashCopy op{}; \ ++ op.init(dstCacheTensor, keyCachePtr, valueCachePtr, slotmappings, numPages, hiddenDims, pagedSize, \ ++ numTokens, numLayers, layerIdx, page2L, &pipe); \ ++ int64_t bIdx = AscendC::GetBlockIdx(); \ ++ for (int64_t i = bIdx; i < numTokens; i+=blockNum) \ ++ { \ ++ op.reset(); \ ++ op.updateTensorMemOffsetAndProcess(keyCachePtr, valueCachePtr, dstCacheTensor, slotmappings, i); \ ++ op.processFunc(); \ ++ } \ ++ } ++ ++// Declare support kernel entry ++LOAD_AND_RESHAPE_FLASH_COPY_TYPE_DECLARE(half, int32_t); ++LOAD_AND_RESHAPE_FLASH_COPY_TYPE_DECLARE(half, int64_t); ++LOAD_AND_RESHAPE_FLASH_COPY_TYPE_DECLARE(bfloat16_t, int32_t); ++LOAD_AND_RESHAPE_FLASH_COPY_TYPE_DECLARE(bfloat16_t, int64_t); ++LOAD_AND_RESHAPE_FLASH_COPY_TYPE_DECLARE(int8_t, int32_t); ++LOAD_AND_RESHAPE_FLASH_COPY_TYPE_DECLARE(int8_t, int64_t); ++ ++namespace lmc { ++ ++#define LOAD_AND_RESHAPE_FLASH_COPY_KERNEL_CALL(TYPE, SLOTTYPE) \ ++ load_and_reshape_flash_copy_##TYPE##_##SLOTTYPE<<>>(dstCacheTensor, keyCachePtr, \ ++ valueCachePtr, slotmappings, hiddenDims, numPages, pagedSize, \ ++ numTokens, numLayers, layerIdx, page2L, blockDim); ++ ++template ++void load_and_reshape_kernel_call(uint32_t blockDim, void *stream, uint8_t *dstCacheTensor, uint8_t *keyCachePtr, ++ uint8_t *valueCachePtr, uint8_t *slotmappings, const int64_t hiddenDims, const int64_t numPages, ++ const int32_t pagedSize, const int32_t numTokens, const int32_t numLayers, ++ const int32_t layerIdx, const bool page2L); ++ ++ ++#define LOAD_AND_RESHAPE_KERNEL_CALL_TYPE_DECLARE(TYPE, SLOTTYPE) \ ++template<> \ ++void load_and_reshape_kernel_call(uint32_t blockDim, void *stream, uint8_t *dstCacheTensor, \ ++ uint8_t *keyCachePtr, uint8_t *valueCachePtr, uint8_t *slotmappings, \ ++ const int64_t hiddenDims, const int64_t numPages, \ ++ const int32_t pagedSize, const int32_t numTokens, \ ++ const int32_t numLayers, const int32_t layerIdx, \ ++ const bool page2L) { \ ++ LOAD_AND_RESHAPE_FLASH_COPY_KERNEL_CALL(TYPE, SLOTTYPE); \ ++} ++ ++LOAD_AND_RESHAPE_KERNEL_CALL_TYPE_DECLARE(half, int32_t); ++LOAD_AND_RESHAPE_KERNEL_CALL_TYPE_DECLARE(half, int64_t); ++LOAD_AND_RESHAPE_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int32_t); ++LOAD_AND_RESHAPE_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int64_t); ++LOAD_AND_RESHAPE_KERNEL_CALL_TYPE_DECLARE(int8_t, int32_t); 
++LOAD_AND_RESHAPE_KERNEL_CALL_TYPE_DECLARE(int8_t, int64_t); ++ ++template ++void dispatch_on_slot_type(vllm_ascend::AscendType slotType, uint32_t blockDim, void *stream, ++ uint8_t *dstCacheTensor, uint8_t *keyCachePtr, uint8_t *valueCachePtr, ++ uint8_t *slotmappings, const int64_t hiddenDims, const int64_t numPages, ++ const int32_t pagedSize, const int32_t numTokens, const int32_t numLayers, ++ const int32_t layerIdx, const bool page2L) { ++ switch(slotType) { ++ case vllm_ascend::AscendType::INT32: ++ load_and_reshape_kernel_call(blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numPages, pagedSize, numTokens, numLayers, layerIdx, ++ page2L); ++ break; ++ case vllm_ascend::AscendType::INT64: ++ load_and_reshape_kernel_call(blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numPages, pagedSize, numTokens, numLayers, layerIdx, ++ page2L); ++ break; ++ default: ++ return; ++ } ++} ++ ++extern void load_and_reshape_flash_kernel(vllm_ascend::AscendType type, vllm_ascend::AscendType slotType, ++ uint32_t blockDim, void *stream, ++ uint8_t *dstCacheTensor, uint8_t *keyCachePtr, uint8_t *valueCachePtr, ++ uint8_t *slotmappings, const int64_t hiddenDims, const int64_t numPages, ++ const int32_t pagedSize, const int32_t numTokens, const int32_t numLayers, ++ const int32_t layerIdx, bool page2L) ++{ ++ switch(type) { ++ case vllm_ascend::AscendType::FP16: ++ dispatch_on_slot_type(slotType, blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numPages, pagedSize, numTokens, numLayers, layerIdx, ++ page2L); ++ break; ++ case vllm_ascend::AscendType::BF16: ++ dispatch_on_slot_type(slotType, blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numPages, pagedSize, numTokens, numLayers, layerIdx, ++ page2L); ++ break; ++ case vllm_ascend::AscendType::INT8: ++ dispatch_on_slot_type(slotType, blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numPages, pagedSize, numTokens, numLayers, layerIdx, ++ page2L); ++ break; ++ default: ++ return; ++ } ++} ++ ++} // namespace lmc +diff --git a/csrc/ascend/kernels/multi_layer_mem_kernels.cpp b/csrc/ascend/kernels/multi_layer_mem_kernels.cpp +new file mode 100644 +index 0000000..3e94f63 +--- /dev/null ++++ b/csrc/ascend/kernels/multi_layer_mem_kernels.cpp +@@ -0,0 +1,234 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++#include "kernel_operator.h" ++#include ++#include "types.h" ++#include "utils.h" ++ ++template class MultiLayerPagedKVCopy { ++ using local_scalar_t = AscendC::LocalTensor; ++ ++public: ++ __aicore__ inline MultiLayerPagedKVCopy() ++ { ++ } ++ ++ __aicore__ inline void init(GM_ADDR pagedKVCaches, GM_ADDR cacheTensor, GM_ADDR slotmappings, ++ const int64_t hiddenDims, const int32_t numLayers, const int64_t pageBuffSize, ++ const int32_t numTokensChunk, const bool page2L, ++ AscendC::TPipe *pipe) ++ { ++ this->pipe_ = pipe; ++ this->numLayers_ = numLayers; ++ this->hiddenDims_ = hiddenDims; ++ this->pageBuffSize_ = pageBuffSize; ++ this->numTokensChunk_ = numTokensChunk; ++ this->page2L_ = page2L; ++ this->valid_ = true; ++ ++ this->pipe_->InitBuffer(pagedTokenQue_, 4, this->hiddenDims_*sizeof(scalar_t)); ++ } ++ ++ __aicore__ inline void reset(){ ++ this->valid_ = true; ++ } ++ ++ __aicore__ inline void updateMemOffset(__gm__ uint8_t *pagedKVCaches, __gm__ uint8_t* cacheTensor, ++ __gm__ uint8_t *slotmappings, const int tokenIdx, ++ const int kvIdx, const int layerIdx) ++ { ++ __gm__ slot_t *slotmappingPtr = reinterpret_cast<__gm__ slot_t*>(slotmappings); ++ int64_t slot = static_cast(slotmappingPtr[tokenIdx]); ++ ++ if (slot == -1) { ++ this->valid_ = false; ++ return; ++ } ++ ++ // its a pointer within the GM addr space, that point to another GM addr space ++ __gm__ uint8_t * __gm__ *pagedKVCachesPtr = reinterpret_cast<__gm__ uint8_t* __gm__ *>(pagedKVCaches); ++ ++ // getting the right ptr to the paged kvcache layer ++ __gm__ uint8_t *pagedLayerKVCaches = pagedKVCachesPtr[layerIdx]; ++ ++ int64_t pagedIdxOffset = kvIdx * this->pageBuffSize_ * this->hiddenDims_ + ++ slot * this->hiddenDims_; ++ ++ int64_t dstTensorIdxOffset = kvIdx * this->numLayers_ * this->numTokensChunk_ * this->hiddenDims_ + ++ layerIdx * this->numTokensChunk_ * this->hiddenDims_ + ++ tokenIdx * this->hiddenDims_; ++ ++ this->pagedTokenGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(pagedLayerKVCaches) + pagedIdxOffset, ++ this->hiddenDims_); ++ this->lmcBufferGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(cacheTensor) + dstTensorIdxOffset, ++ this->hiddenDims_); ++ } ++ ++ __aicore__ inline void processFunc() { ++ if (!this->valid_) { ++ return; ++ } ++ // 1. Alloc Tensor for local page ++ local_scalar_t hiddenDimTensor = pagedTokenQue_.AllocTensor(); ++ ++ // 2. copy from global tensor into local ++ if (this->page2L_) { ++ AscendC::DataCopy(hiddenDimTensor, this->pagedTokenGlobal_, this->hiddenDims_); ++ } else { ++ AscendC::DataCopy(hiddenDimTensor, this->lmcBufferGlobal_, this->hiddenDims_); ++ } ++ ++ // 3. enque vecin ++ pagedTokenQue_.EnQue(hiddenDimTensor); ++ // 4. deque vecin, possible to reuse due to QueBind ++ hiddenDimTensor = pagedTokenQue_.DeQue(); ++ ++ // 5. datacopy into GM ++ if (this->page2L_) { ++ AscendC::DataCopy(this->lmcBufferGlobal_, hiddenDimTensor, this->hiddenDims_); ++ } else { ++ AscendC::DataCopy(this->pagedTokenGlobal_, hiddenDimTensor, this->hiddenDims_); ++ } ++ ++ // 6. 
free alloced Tensor ++ pagedTokenQue_.FreeTensor(hiddenDimTensor); ++ } ++ ++ ++private: ++ AscendC::TPipe *pipe_; ++ AscendC::TQueBind pagedTokenQue_; ++ ++ // [layers * [kvs, numPages * pagedSize, heads*headsize]] ++ AscendC::GlobalTensor pagedTokenGlobal_; ++ // [kvs, layers, numTokensChunk, heads*headsize] ++ AscendC::GlobalTensor lmcBufferGlobal_; ++ int32_t numLayers_; // num layers ++ int64_t pageBuffSize_; // pages * pageSize ++ int64_t hiddenDims_; // heads * headSize ++ int32_t numTokensChunk_; // num tokens in the cache tensor chunk ++ bool valid_; ++ bool page2L_; // true, from pagedTensor to LMC, false otherwise ++}; ++ ++// NOTE: there are potential micro optimizaiton here. ++#define MULTI_LAYER_PAGED_KV_COPY_TYPE_DECLARE(TYPE, SLOTTYPE) \ ++ extern "C" __global__ __aicore__ void multi_layer_paged_kv_copy_##TYPE##_##SLOTTYPE( \ ++ __gm__ uint8_t* pagedKVCaches, __gm__ uint8_t* dstCacheTensor, __gm__ uint8_t* slotmappings, \ ++ const int64_t hiddenDims, const int32_t kvs, const int32_t numLayers, \ ++ const int64_t pageBuffSize, const int32_t numTokensChunk, const int coreNum, const bool page2L) \ ++ { \ ++ AscendC::TPipe pipe; \ ++ MultiLayerPagedKVCopy op{}; \ ++ op.init(pagedKVCaches, dstCacheTensor, slotmappings, hiddenDims, \ ++ numLayers, pageBuffSize, numTokensChunk, page2L, &pipe); \ ++ int64_t bIdx = AscendC::GetBlockIdx(); \ ++ for (int64_t i = bIdx; i < numTokensChunk; i+=coreNum) { \ ++ for (int32_t kvIdx = 0; kvIdx < kvs; kvIdx ++) { \ ++ for (int32_t layerIdx = 0; layerIdx < numLayers; layerIdx++) { \ ++ op.reset(); \ ++ op.updateMemOffset(pagedKVCaches, dstCacheTensor, slotmappings, i, kvIdx, layerIdx); \ ++ op.processFunc(); \ ++ } \ ++ } \ ++ } \ ++ } ++ ++// Declare support kernel entry ++MULTI_LAYER_PAGED_KV_COPY_TYPE_DECLARE(half, int32_t); ++MULTI_LAYER_PAGED_KV_COPY_TYPE_DECLARE(half, int64_t); ++MULTI_LAYER_PAGED_KV_COPY_TYPE_DECLARE(bfloat16_t, int32_t); ++MULTI_LAYER_PAGED_KV_COPY_TYPE_DECLARE(bfloat16_t, int64_t); ++MULTI_LAYER_PAGED_KV_COPY_TYPE_DECLARE(int8_t, int32_t); ++MULTI_LAYER_PAGED_KV_COPY_TYPE_DECLARE(int8_t, int64_t); ++ ++namespace lmc { ++ ++#define MULTI_LAYER_PAGED_KV_COPY_KERNEL_CALL(TYPE, SLOTTYPE) \ ++ multi_layer_paged_kv_copy_##TYPE##_##SLOTTYPE<<>>(pagedKVCaches, dstCacheTensor, \ ++ slotmappings, hiddenDims, kvs, \ ++ numLayers, pageBuffSize, \ ++ numTokensChunk, blockDim, page2L); ++ ++template ++void multi_layer_paged_kernel(uint32_t blockDim, void *stream, uint8_t *pagedKVCaches, uint8_t *dstCacheTensor, ++ uint8_t *slotmappings, const int64_t hiddenDims, const int32_t kvs, const int32_t numLayers, ++ const int64_t pageBuffSize, const int32_t numTokensChunk, const bool page2L); ++ ++#define MULTI_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(TYPE, SLOTTYPE) \ ++template<> \ ++void multi_layer_paged_kernel(uint32_t blockDim, void *stream, uint8_t *pagedKVCaches, \ ++ uint8_t *dstCacheTensor, uint8_t *slotmappings, \ ++ const int64_t hiddenDims, const int32_t kvs, const int32_t numLayers, \ ++ const int64_t pageBuffSize, const int32_t numTokensChunk, \ ++ const bool page2L){ \ ++ MULTI_LAYER_PAGED_KV_COPY_KERNEL_CALL(TYPE, SLOTTYPE); \ ++} ++ ++MULTI_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(half, int32_t); ++MULTI_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(half, int64_t); ++MULTI_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int32_t); ++MULTI_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int64_t); ++MULTI_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(int8_t, int32_t); ++MULTI_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(int8_t, int64_t); ++ ++template 
++void dispatch_paged_kernel_on_slot_type(vllm_ascend::AscendType slotType, uint32_t blockDim, void *stream, ++ uint8_t *pagedKVCaches, uint8_t *dstCacheTensor, uint8_t *slotmappings, ++ const int64_t hiddenDims, const int32_t kvs, const int32_t numLayers, ++ const int64_t pageBuffSize, const int32_t numTokensChunk, const bool page2L) { ++ switch(slotType) { ++ case vllm_ascend::AscendType::INT32: ++ multi_layer_paged_kernel(blockDim, stream, pagedKVCaches, dstCacheTensor, slotmappings, ++ hiddenDims, kvs, numLayers, pageBuffSize, numTokensChunk, page2L); ++ break; ++ case vllm_ascend::AscendType::INT64: ++ multi_layer_paged_kernel(blockDim, stream, pagedKVCaches, dstCacheTensor, slotmappings, ++ hiddenDims, kvs, numLayers, pageBuffSize, numTokensChunk, page2L); ++ break; ++ default: ++ return; ++ } ++} ++ ++extern void multi_layer_kv_transfer_kernel(vllm_ascend::AscendType type, vllm_ascend::AscendType slotType, ++ uint32_t blockDim, void *stream, uint8_t *pagedKVCaches, ++ uint8_t *dstCacheTensor, uint8_t *slotmappings, ++ const int64_t hiddenDims, const int32_t kvs, const int32_t numLayers, ++ const int64_t pageBuffSize, const int32_t numTokensChunk, const bool page2L) ++{ ++ switch(type) { ++ case vllm_ascend::AscendType::FP16: ++ dispatch_paged_kernel_on_slot_type(slotType, blockDim, stream, pagedKVCaches, dstCacheTensor, ++ slotmappings, hiddenDims, kvs, numLayers, pageBuffSize, ++ numTokensChunk, page2L); ++ break; ++ case vllm_ascend::AscendType::BF16: ++ dispatch_paged_kernel_on_slot_type(slotType, blockDim, stream, pagedKVCaches, dstCacheTensor, ++ slotmappings, hiddenDims, kvs, numLayers, pageBuffSize, ++ numTokensChunk, page2L); ++ break; ++ case vllm_ascend::AscendType::INT8: ++ dispatch_paged_kernel_on_slot_type(slotType, blockDim, stream, pagedKVCaches, dstCacheTensor, ++ slotmappings, hiddenDims, kvs, numLayers, pageBuffSize, ++ numTokensChunk, page2L); ++ break; ++ default: ++ return; ++ } ++} ++ ++} // namespace lmc +diff --git a/csrc/ascend/kernels/single_layer_mem_kernels.cpp b/csrc/ascend/kernels/single_layer_mem_kernels.cpp +new file mode 100644 +index 0000000..a1cc30d +--- /dev/null ++++ b/csrc/ascend/kernels/single_layer_mem_kernels.cpp +@@ -0,0 +1,310 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++#include "kernel_operator.h" ++#include ++#include "types.h" ++#include "utils.h" ++ ++template class SingleLayerPagedKVCopy { ++ using local_scalar_t = AscendC::LocalTensor; ++ ++public: ++ __aicore__ inline SingleLayerPagedKVCopy() ++ { ++ } ++ ++ __aicore__ inline void init(GM_ADDR cacheTensor, GM_ADDR keyCachePtr, GM_ADDR valueCachePtr, GM_ADDR slotmappings, ++ const int64_t hiddenDims, const int32_t numTokens, const bool page2L, ++ const bool tokenMajor, AscendC::TPipe *pipe) ++ { ++ this->pipe_ = pipe; ++ this->hiddenDims_ = hiddenDims; ++ this->numTokens_ = numTokens; ++ this->tokenMajor_ = tokenMajor; ++ this->valid_ = true; ++ this->page2L_ = page2L; ++ if constexpr (IsMLA) { ++ this->numKvs_ = 1; ++ } else { ++ this->numKvs_ = 2; ++ } ++ // TODO: Not sure how many to allocate, but let's do 4 blocks of hiddenDims_ ++ // if it was fp16, 2048, we would get 16kb ? ++ this->pipe_->InitBuffer(this->pagedTokenQue_, 4, this->hiddenDims_*sizeof(scalar_t)); ++ } ++ ++ __aicore__ inline void reset(){ ++ this->valid_ = true; ++ } ++ ++ __aicore__ inline void updateTensorMemOffsetAndProcess(__gm__ uint8_t *pagedTensor, __gm__ uint8_t* nonPagedTensor, ++ __gm__ uint8_t *slotmappings, const int tokenIdx, const int kvIdx) ++ { ++ __gm__ slot_t *slotmappingPtr = reinterpret_cast<__gm__ slot_t*>(slotmappings); ++ int64_t slot = slotmappingPtr[tokenIdx]; ++ ++ if (slot == -1) { ++ this->valid_ = false; ++ return; ++ } ++ ++ // for the page tensor ++ int64_t pagedIdxOffset = slot * this->hiddenDims_; ++ ++ // for the lmc tensor ++ int64_t nonPagedIdxOffset = -1; ++ if (this->tokenMajor_) { ++ nonPagedIdxOffset = tokenIdx * this->numKvs_ * this->hiddenDims_ + ++ kvIdx * this->hiddenDims_; ++ } else { ++ nonPagedIdxOffset = kvIdx * this->numTokens_ * this -> hiddenDims_ + ++ tokenIdx * this->hiddenDims_; ++ } ++ ++ if (kvIdx == 0) { ++ // keys ++ this->keyTokensGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(pagedTensor) + pagedIdxOffset, ++ this->hiddenDims_); ++ this->lmcBufferKeyGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(nonPagedTensor) + nonPagedIdxOffset, ++ this->hiddenDims_); ++ } else { ++ // values ++ this->valueTokensGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(pagedTensor) + pagedIdxOffset, ++ this->hiddenDims_); ++ this->lmcBufferValueGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ scalar_t*>(nonPagedTensor) + nonPagedIdxOffset, ++ this->hiddenDims_); ++ } ++ } ++ ++ __aicore__ inline void processFunc() { ++ if (!this->valid_) { ++ return; ++ } ++ // 1. Alloc Tensor for local page ++ local_scalar_t hiddenKeysDimTensor = this->pagedTokenQue_.template AllocTensor(); ++ local_scalar_t hiddenValuesDimTensor; ++ if constexpr(!IsMLA) { ++ hiddenValuesDimTensor = this->pagedTokenQue_.template AllocTensor(); ++ } ++ ++ // 2. copy from global tensor into local ++ if (this->page2L_) { ++ AscendC::DataCopy(hiddenKeysDimTensor, this->keyTokensGlobal_, this->hiddenDims_); ++ if constexpr (!IsMLA) { ++ AscendC::DataCopy(hiddenValuesDimTensor, this->valueTokensGlobal_, this->hiddenDims_); ++ } ++ } else { ++ AscendC::DataCopy(hiddenKeysDimTensor, this->lmcBufferKeyGlobal_, this->hiddenDims_); ++ if constexpr(!IsMLA) { ++ AscendC::DataCopy(hiddenValuesDimTensor, this->lmcBufferValueGlobal_, this->hiddenDims_); ++ } ++ } ++ ++ // 3. enque vecin ++ pagedTokenQue_.EnQue(hiddenKeysDimTensor); ++ if constexpr(!IsMLA) { ++ pagedTokenQue_.EnQue(hiddenValuesDimTensor); ++ } ++ ++ // 4. 
deque vecin, possible to reuse due to QueBind ++ hiddenKeysDimTensor = pagedTokenQue_.DeQue(); ++ if constexpr(!IsMLA) { ++ hiddenValuesDimTensor = pagedTokenQue_.DeQue(); ++ } ++ ++ // 5. datacopy into GM ++ if (this->page2L_) { ++ AscendC::DataCopy(this->lmcBufferKeyGlobal_, hiddenKeysDimTensor, this->hiddenDims_); ++ if constexpr(!IsMLA) { ++ AscendC::DataCopy(this->lmcBufferValueGlobal_, hiddenValuesDimTensor, this->hiddenDims_); ++ } ++ } else { ++ AscendC::DataCopy(this->keyTokensGlobal_, hiddenKeysDimTensor, this->hiddenDims_); ++ if constexpr(!IsMLA) { ++ AscendC::DataCopy(this->valueTokensGlobal_, hiddenValuesDimTensor, this->hiddenDims_); ++ } ++ } ++ ++ // 6. free alloced Tensor ++ pagedTokenQue_.FreeTensor(hiddenKeysDimTensor); ++ if constexpr(!IsMLA) { ++ pagedTokenQue_.FreeTensor(hiddenValuesDimTensor); ++ } ++ } ++ ++private: ++ AscendC::TPipe *pipe_; ++ // a depth of 2 ++ AscendC::TQueBind pagedTokenQue_; ++ ++ // [kvs, numPages * pagedSize, heads*headsize] ++ AscendC::GlobalTensor keyTokensGlobal_; ++ // iff !isMLA ++ AscendC::GlobalTensor valueTokensGlobal_; ++ ++ // Depends on LMC setting whether we store in tokensMajor or not. ++ // the layout would be the followings: ++ // [tokens, kvs, heads*headsize] or [kvs, tokens, heads*headsize] ++ // TODO: check whether should combine the two and use a loop ++ AscendC::GlobalTensor lmcBufferKeyGlobal_; ++ AscendC::GlobalTensor lmcBufferValueGlobal_; ++ ++ int64_t hiddenDims_; // heads * headsize ++ int32_t numTokens_; // num tokens in the cache tensor chunk ++ int16_t numKvs_; // 1 if MLA else 2 ++ bool page2L_; // whether the direction of copy is from page to lmc ++ bool tokenMajor_; // whether the lmc buffer is in token major. ++ bool valid_; ++}; ++ ++#define SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(TYPE, SLOTTYPE, ISMLA) \ ++ extern "C" __global__ __aicore__ void single_layer_paged_kv_copy_##TYPE##_##SLOTTYPE##_##ISMLA( \ ++ __gm__ uint8_t* dstCacheTensor, __gm__ uint8_t* keyCachePtr, __gm__ uint8_t* valueCachePtr, \ ++ __gm__ uint8_t* slotmappings, const int64_t hiddenDims, const int32_t numTokens, const int coreNums, \ ++ const bool page2L, const bool tokenMajor) \ ++ { \ ++ AscendC::TPipe pipe; \ ++ SingleLayerPagedKVCopy op{}; \ ++ op.init(dstCacheTensor, keyCachePtr, valueCachePtr, slotmappings, hiddenDims, numTokens, \ ++ page2L, tokenMajor, &pipe); \ ++ int64_t bIdx = AscendC::GetBlockIdx(); \ ++ for (int64_t i = bIdx; i < numTokens; i+=coreNums) \ ++ { \ ++ op.reset(); \ ++ op.updateTensorMemOffsetAndProcess(keyCachePtr, dstCacheTensor, slotmappings, i, 0); \ ++ if constexpr(!ISMLA) { \ ++ op.updateTensorMemOffsetAndProcess(valueCachePtr, dstCacheTensor, slotmappings, i, 1); \ ++ } \ ++ op.processFunc(); \ ++ } \ ++ } ++ ++// Declare support kernel entry ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(half, int32_t, false); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(half, int32_t, true); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(bfloat16_t, int32_t, false); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(bfloat16_t, int32_t, true); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(int8_t, int32_t, false); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(int8_t, int32_t, true); ++ ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(half, int64_t, false); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(half, int64_t, true); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(bfloat16_t, int64_t, false); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(bfloat16_t, int64_t, true); ++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(int8_t, int64_t, false); 
++SINGLE_LAYER_PAGED_KV_COPY_TYPE_DECLARE(int8_t, int64_t, true); ++ ++namespace lmc { ++ ++#define SINGLE_LAYER_PAGED_KV_COPY_KERNEL_CALL(TYPE, SLOTTYPE, ISMLA) \ ++ single_layer_paged_kv_copy_##TYPE##_##SLOTTYPE##_##ISMLA<<>>(dstCacheTensor, \ ++ keyCachePtr, valueCachePtr, slotmappings, hiddenDims, \ ++ numTokens, blockDim, page2L, tokenMajor); ++ ++template ++void single_layer_paged_kernel(uint32_t blockDim, void *stream, uint8_t *dstCacheTensor, uint8_t *keyCachePtr, ++ uint8_t *valueCachePtr, uint8_t *slotmappings, const int64_t hiddenDims, ++ const int32_t numTokens, const bool page2L, const bool tokenMajor); ++ ++#define SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(TYPE, SLOTTYPE, ISMLA) \ ++template<> \ ++void single_layer_paged_kernel(uint32_t blockDim, void *stream, uint8_t *dstCacheTensor, \ ++ uint8_t *keyCachePtr, uint8_t *valueCachePtr, uint8_t *slotmappings, \ ++ const int64_t hiddenDims, const int32_t numTokens, const bool page2L, \ ++ const bool tokenMajor){ \ ++ SINGLE_LAYER_PAGED_KV_COPY_KERNEL_CALL(TYPE, SLOTTYPE, ISMLA); \ ++} ++ ++ ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(half, int32_t, false); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(half, int64_t, false); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int32_t, false); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int64_t, false); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(int8_t, int32_t, false); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(int8_t, int64_t, false); ++ ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(half, int32_t, true); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(half, int64_t, true); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int32_t, true); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(bfloat16_t, int64_t, true); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(int8_t, int32_t, true); ++SINGLE_LAYER_PAGED_KERNEL_CALL_TYPE_DECLARE(int8_t, int64_t, true); ++ ++ ++ ++template ++void dispatch_single_layer_kernel_on_slot_type(vllm_ascend::AscendType slotType, uint32_t blockDim, void *stream, ++ uint8_t *dstCacheTensor, uint8_t *keyCachePtr, uint8_t *valueCachePtr, ++ uint8_t *slotmappings, const int64_t hiddenDims, const int32_t numTokens, ++ const bool page2L, const bool tokenMajor, const bool isMLA) { ++ if (isMLA) { ++ switch(slotType) { ++ case vllm_ascend::AscendType::INT32: ++ single_layer_paged_kernel(blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numTokens, page2L, tokenMajor); ++ break; ++ case vllm_ascend::AscendType::INT64: ++ single_layer_paged_kernel(blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numTokens, page2L, tokenMajor); ++ break; ++ default: ++ return; ++ } ++ } else { ++ switch(slotType) { ++ case vllm_ascend::AscendType::INT32: ++ single_layer_paged_kernel(blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numTokens, page2L, tokenMajor); ++ break; ++ case vllm_ascend::AscendType::INT64: ++ single_layer_paged_kernel(blockDim, stream, dstCacheTensor, keyCachePtr, valueCachePtr, ++ slotmappings, hiddenDims, numTokens, page2L, tokenMajor); ++ break; ++ default: ++ return; ++ } ++ } ++ ++} ++ ++ ++extern void single_layer_kv_transfer_kernel(vllm_ascend::AscendType type, vllm_ascend::AscendType slotType, ++ uint32_t blockDim, void *stream, uint8_t *dstCacheTensor, ++ uint8_t *keyCachePtr, uint8_t *valueCachePtr, ++ uint8_t *slotmappings, const int64_t hiddenDims, const int32_t numTokens, ++ const bool page2L, 
const bool tokenMajor, const bool isMLA) ++{ ++ switch(type) { ++ case vllm_ascend::AscendType::FP16: ++ dispatch_single_layer_kernel_on_slot_type(slotType, blockDim, stream, dstCacheTensor, keyCachePtr, ++ valueCachePtr, slotmappings, hiddenDims, numTokens, page2L, ++ tokenMajor, isMLA); ++ break; ++ case vllm_ascend::AscendType::BF16: ++ dispatch_single_layer_kernel_on_slot_type(slotType, blockDim, stream, dstCacheTensor, keyCachePtr, ++ valueCachePtr, slotmappings, hiddenDims, numTokens, ++ page2L, tokenMajor, isMLA); ++ break; ++ case vllm_ascend::AscendType::INT8: ++ dispatch_single_layer_kernel_on_slot_type(slotType, blockDim, stream, dstCacheTensor, keyCachePtr, ++ valueCachePtr, slotmappings, hiddenDims, numTokens, page2L, ++ tokenMajor, isMLA); ++ default: ++ return; ++ } ++} ++ ++} // namespace lmc +diff --git a/csrc/ascend/kernels/types.h b/csrc/ascend/kernels/types.h +new file mode 100644 +index 0000000..7c6c46e +--- /dev/null ++++ b/csrc/ascend/kernels/types.h +@@ -0,0 +1,28 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++namespace vllm_ascend { ++enum struct AscendType { ++ FP16 = 0, ++ BF16 = 1, ++ FP32 = 2, ++ INT8 = 3, ++ INT32 = 4, ++ INT64 = 5, ++}; ++} +\ No newline at end of file +diff --git a/csrc/ascend/kernels/utils.h b/csrc/ascend/kernels/utils.h +new file mode 100644 +index 0000000..a6dd3f3 +--- /dev/null ++++ b/csrc/ascend/kernels/utils.h +@@ -0,0 +1,38 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#pragma once ++#include "kernel_type.h" ++namespace vllm_ascend { ++ ++template struct AccType; ++ ++template <> struct AccType { ++ using type = float; ++}; ++ ++template <> struct AccType { ++ using type = half; ++}; ++ ++template <> struct AccType { ++ using type = float; ++}; ++ ++template <> struct AccType { ++ using type = int; ++}; ++}// namespace vllm_ascend +\ No newline at end of file +diff --git a/csrc/ascend/managed_mem.cpp b/csrc/ascend/managed_mem.cpp +new file mode 100644 +index 0000000..4c79f54 +--- /dev/null ++++ b/csrc/ascend/managed_mem.cpp +@@ -0,0 +1,326 @@ ++#include "managed_mem.h" ++#include ++// Only required for old driver version (look at registerHostPtr) ++#ifdef PROF_ERROR ++ // You can add a pragma message to see this in your build log if you want: ++ // #pragma message("Undefining PROF_ERROR from ascend_hal.h before NPU headers") ++ #undef PROF_ERROR ++#endif ++ ++#include ++#include ++#include "driver/ascend_hal_define.h" ++#include "driver/ascend_hal.h" ++#include ++#include "torch/torch.h" ++#include "torch/extension.h" ++ ++namespace lmc { ++constexpr int32_t PROT_FLAGS = static_cast(PROT_READ) | static_cast(PROT_WRITE); ++constexpr int32_t MAP_FLAGS = static_cast(MAP_PRIVATE) | static_cast(MAP_ANONYMOUS) | static_cast(MAP_POPULATE); ++ ++// Signatures for internal helper functions ++ ++// Get the version of the NPU driver as a string ++std::string get_driver_version(); ++// Checks whether the major version of the NPU is greater or equal 25 to support aclrtHostRegister ++bool is_version_at_least_25(const std::string& version_str); ++// Gets the current device offsetting on ASCEND_RT_VISIBLE_DEVICES when needed ++int get_device(); ++// Uregisters the malloced hostPtr ++void unregisterPtr(void* ptr); ++// Swaps the host memory allocated to a tensor with the given hostPtr ++void swap_tensor_ptr(void* hostPtr, torch::Tensor& original_tensor); ++ ++// Class implementations ++ ++HostRegisteredMemoryManager::HostRegisteredMemoryManager(){ ++}; ++ ++HostRegisteredMemoryManager::~HostRegisteredMemoryManager() { ++ this->unregisterAll(); ++}; ++ ++void HostRegisteredMemoryManager::unregisterAll(){ ++ const std::unique_lock guard(this->mux); ++ ++ // Iterate through each key-value pair in the map. ++ for (const auto& pair : this->allocatedMap) { ++ void* hostPtr = pair.first; ++ aclrtHostUnregister(hostPtr); ++ } ++ ++ // After unregistering all pointers, clear the map completely. ++ this->allocatedMap.clear(); ++}; ++ ++// Register a pointer through high level APIs (aclrt) return devPtr ++// Returns the created RegisteredMemoryRecord ++RegisteredMemoryRecord HostRegisteredMemoryManager::registerHostPtr(void* hostPtr, size_t bufferSize) { // torch::Tensor& tensor){ ++ TORCH_CHECK(!(hostPtr == nullptr || bufferSize == 0), "Error: hostPtr cannot be null and bufferSize must be greater than 0."); ++ const std::unique_lock guard(this->mux); ++ ++ // Check if the host pointer is already registered ++ if (this->allocatedMap.count(hostPtr)) { ++ return this->allocatedMap[hostPtr]; ++ } ++ ++ void* devPtr; ++ aclError err = aclrtHostRegister(hostPtr, static_cast(bufferSize), ++ ACL_HOST_REGISTER_MAPPED, (void**)&devPtr); ++ TORCH_CHECK(err == 0, "Unable to host register the host ptr: " + std::to_string(err)); ++ ++ this->allocatedMap.emplace(hostPtr, RegisteredMemoryRecord{reinterpret_cast(hostPtr), ++ reinterpret_cast(devPtr), bufferSize}); ++ ++ return this->allocatedMap[hostPtr]; ++}; ++ ++// Register a pointer through low level APIs (HAL). 
Allocates a new pinned host memory ++// This should be used for driver versions, where cannot rely on aclrtHostRegister() ++// Returns the created RegisteredMemoryRecord ++RegisteredMemoryRecord HostRegisteredMemoryManager::halRegisterHostPtr(size_t bufferSize){ ++ // We allocate a new chunk of memory, register it, and replace the tensor. ++ // Essentially, the halHostRegister function requires a ptr given by mmap. ++ TORCH_CHECK((bufferSize >= 0), "Error: bufferSize must be greater than 0."); ++ const std::unique_lock guard(this->mux); ++ ++ void* devPtr; ++ int device = get_device(); ++ void* hostPtr; ++ // Allocate and register ++ hostPtr = mmap(nullptr, bufferSize, PROT_FLAGS, MAP_FLAGS, -1, 0); ++ TORCH_CHECK(hostPtr != MAP_FAILED, "Unable to alloc memory with mmap."); ++ auto ret = madvise(reinterpret_cast(hostPtr), bufferSize, MADV_HUGEPAGE); ++ auto drvRet = halHostRegister((void*)hostPtr, static_cast(bufferSize), ++ HOST_MEM_MAP_DEV_PCIE_TH, (UINT32)device, (void**)&devPtr); ++ TORCH_CHECK(drvRet == 0, "Unable to register host memory with hal: " + std::to_string(drvRet)) ++ ++ // Lock the memory and fail if impossible to lock ++ auto lockErr = mlock(reinterpret_cast(hostPtr), bufferSize); ++ if (lockErr == -1) { ++ // This can happen in non-privileged mode or not enough rlimit, ++ // let's not proceed since we wanted to guarantee pinned ++ // because we already alloced, let's free ++ auto ret = halHostUnregisterEx(reinterpret_cast(hostPtr), ++ static_cast(device), HOST_MEM_MAP_DEV_PCIE_TH); ++ TORCH_CHECK(ret==0, "Unable to pin host memory, unable to unregister. Error code: " + std::to_string(ret)) ++ auto mret = munmap(reinterpret_cast(hostPtr), bufferSize); ++ TORCH_CHECK(false, "Unable to pin host memory with error code: " + std::to_string(lockErr)) ++ } ++ ++ this->allocatedMap.emplace(hostPtr, RegisteredMemoryRecord{reinterpret_cast(hostPtr), ++ reinterpret_cast(devPtr), bufferSize}); ++ ++ return this->allocatedMap[hostPtr]; ++}; ++ ++void HostRegisteredMemoryManager::unregisterMemory(void* hostPtr) { ++ TORCH_CHECK(hostPtr != nullptr, "Error: hostPtr cannot be null."); ++ ++ // we don't actually mind if it doesn't unregister, ++ // at context destroy it should be unregister anyway. ++ const std::unique_lock guard(this->mux); ++ aclError err = aclrtHostUnregister(hostPtr); ++ this->allocatedMap.erase(hostPtr); ++}; ++ ++/* ++* For now we only do a linear search as we probably won't have a long list of ptrs ++* we go through each record and check whether we are in range, if so ++* we calculate the offset from the host ptr and apply to the device ptr ++* finally we return the device ptr. 
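++* Example with made-up addresses: for a record { ptr = 0x1000, devptr = 0x9000, buffSize = 0x200 },
++* a lookup of hostPtr = 0x1040 falls inside the range, the offset is 0x40, and the returned
++* device pointer is 0x9000 + 0x40 = 0x9040.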
++*/ ++void* HostRegisteredMemoryManager::getDevicePtr(void* hostPtr) { ++ if (hostPtr == nullptr) { ++ return nullptr; ++ } ++ const std::shared_lock guard(this->mux); ++ ++ const uintptr_t hostAddrPtr = reinterpret_cast(hostPtr); ++ ++ for (const auto& pair: this->allocatedMap) { ++ const RegisteredMemoryRecord& record = pair.second; ++ ++ if (hostAddrPtr >= record.ptr && hostAddrPtr < (record.ptr + record.buffSize)) { ++ const size_t offset = hostAddrPtr - record.ptr; ++ ++ const uintptr_t deviceAddrPtr = record.devptr + offset; ++ ++ return reinterpret_cast(deviceAddrPtr); ++ } ++ } ++ ++ return nullptr; ++}; ++ ++ ++size_t HostRegisteredMemoryManager::getRecordSize(void* hostPtr){ ++ if (hostPtr == nullptr) { ++ return 0; ++ } ++ const std::shared_lock guard(this->mux); ++ ++ const uintptr_t hostAddrPtr = reinterpret_cast(hostPtr); ++ ++ for (const auto& pair: this->allocatedMap) { ++ const RegisteredMemoryRecord& record = pair.second; ++ ++ if (hostAddrPtr >= record.ptr && hostAddrPtr < (record.ptr + record.buffSize)) { ++ return record.buffSize; ++ } ++ } ++ return 0; ++}; ++ ++std::string get_driver_version() { ++ void* handle = nullptr; ++ int (*dsmi_get_version)(int, char*, unsigned int, unsigned int*) = nullptr; ++ std::string result; ++ ++ handle = dlopen("libdrvdsmi_host.so", RTLD_LAZY); ++ if (!handle) { ++ TORCH_CHECK(false, std::string("Error opening libdrvdsmi_host.so: ") + dlerror() ); ++ return result; ++ } ++ dlerror(); ++ ++ // Load the function ++ *(void**) (&dsmi_get_version) = dlsym(handle, "dsmi_get_version"); ++ const char* dlsym_error = dlerror(); ++ if (dlsym_error) { ++ dlclose(handle); ++ TORCH_CHECK(false, std::string("Error loading dsmi_get_version: ") + dlsym_error); ++ return result; ++ } ++ ++ // Call the function ++ int device_id = c10_npu::getCurrentNPUStream().device_index(); ++ const unsigned int buffer_size = 256; ++ std::vector version_buffer(buffer_size); ++ unsigned int ret_len = 0; ++ int ret = dsmi_get_version(device_id, version_buffer.data(), buffer_size, &ret_len); ++ if (ret == 0) { ++ if (ret_len > 0 && ret_len <= buffer_size) { ++ version_buffer[ret_len] = '\0'; // Ensure null-termination ++ result = version_buffer.data(); ++ } else { ++ TORCH_CHECK(false, "Error: Invalid length returned: " + std::to_string(ret_len)); ++ } ++ } else { ++ TORCH_CHECK(false, "Error: dsmi_get_version returned " + std::to_string(ret)); ++ } ++ ++ dlclose(handle); ++ ++ return result; ++} ++ ++// To be on the safe side, returns false in case of uncertainties ++bool is_version_at_least_25(const std::string& version_str) { ++ if (version_str.empty()) { ++ return false; ++ } ++ ++ size_t num_end = 0; ++ long major_version = 0; ++ ++ try { ++ major_version = std::stol(version_str, &num_end); ++ } catch (const std::invalid_argument&) { ++ // No valid number at start ++ return false; ++ } catch (const std::out_of_range&) { ++ // Should never happen, here for robustness ++ return false; ++ } ++ return major_version >= 25; ++} ++ ++int get_device(){ ++ int device = c10_npu::getCurrentNPUStream().device_index(); ++ const char* env_visible_devices_p = std::getenv("ASCEND_RT_VISIBLE_DEVICES"); ++ // If we are using a custom list of visible devices, the index refers to that ++ if (env_visible_devices_p != nullptr) { ++ std::string env_visible_devices = env_visible_devices_p; ++ std::vector list_visible_devices; ++ std::stringstream ss(env_visible_devices); ++ std::string item; ++ while (std::getline(ss, item, ',')) { ++ list_visible_devices.push_back(std::stoi(item)); ++ } 
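++        // Illustrative example (hypothetical values): with ASCEND_RT_VISIBLE_DEVICES="4,2,6"
++        // and device_index() == 1, the sorted list below is {2, 4, 6}, so the physical
++        // device resolved here is 4.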
++ std::sort(list_visible_devices.begin(), list_visible_devices.end()); ++ // Here two cases are possible: ++ // 1. no hccl, we just use current_device, even though we have specify the ASCEND_RT_VISIBLE_DEVICES ++ // 2. hccl, and we use current_device that seems to be correct ++ // for case 2, since the current_device would have been correct anyway, obtaining from the list would be fine. ++ // for case 1, we have shifted the device to the RT_VISIBLE_DEVICES, so it should be corrected. ++ device = list_visible_devices[device]; ++ } ++ return device; ++} ++ ++void unregisterPtr(void* ptr) { ++ if (ptr){ ++ int device = get_device(); ++ auto& hmm = HostRegisteredMemoryManager::GetInstance(); ++ size_t bufferSize = hmm.getRecordSize(ptr); ++ auto ret = halHostUnregisterEx(reinterpret_cast(ptr), ++ static_cast(device), HOST_MEM_MAP_DEV_PCIE_TH); ++ if (ret != 0) { ++ std::cout << "Unable to hal host unregister: "<< ret << std::endl; ++ } ++ auto mret = munmap(reinterpret_cast(ptr), bufferSize); ++ if (mret != 0) { ++ std::cout << "Unable to unmap memory: "<< ret << std::endl; ++ } ++ } ++} ++ ++ ++void swap_tensor_ptr(void* hostPtr, torch::Tensor& original_tensor){ ++ torch::TensorOptions tensorOpsCpu = torch::TensorOptions() ++ .dtype(original_tensor.dtype()) ++ .device(original_tensor.device()) ++ .pinned_memory(true); ++ int64_t numel = static_cast(original_tensor.nbytes()); ++ std::vector dims = {numel}; ++ torch::Tensor new_tensor_from_myptr = torch::from_blob( ++ hostPtr, dims, unregisterPtr, tensorOpsCpu); ++ ++ original_tensor.set_(new_tensor_from_myptr.storage(), original_tensor.storage_offset(), ++ original_tensor.sizes(), original_tensor.strides()); ++} ++ ++} // namespace lmc ++ ++ ++void* register_memory(torch::Tensor& tensor) { ++ torch::Device device = tensor.device(); ++ if (!device.is_cpu() || !tensor.is_pinned()) { ++ TORCH_CHECK(false, "Invalid device. Device must be CPU and tensor must be pinned."); ++ } ++ auto& hmm = lmc::HostRegisteredMemoryManager::GetInstance(); ++ size_t tensorSize = tensor.nbytes(); ++ std::string verString = lmc::get_driver_version(); ++ if (lmc::is_version_at_least_25(verString)) { // New driver version, supports aclrtHostRegister() ++ void* hostPtr = static_cast(tensor.data_ptr()); ++ return (void*) hmm.registerHostPtr(hostPtr, tensorSize).devptr; ++ } else { // Old driver version, does not support aclrtHostRegister(), we have to use HAL. ++ // We ask for a new registerd memory and substitute with the previously allocated. 
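++        // swap_tensor_ptr() below rebinds the tensor's storage to the freshly mmap'ed,
++        // HAL-registered buffer, so callers keep using the same tensor object while its
++        // data now lives in the pinned, device-visible allocation.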
++ lmc::RegisteredMemoryRecord record = hmm.halRegisterHostPtr(tensorSize); ++ lmc::swap_tensor_ptr((void*) record.ptr, tensor); ++ return (void*) record.devptr; ++ } ++}; ++ ++void unregister_memory(torch::Tensor& tensor) { ++ void* hostPtr = static_cast(tensor.data_ptr()); ++ auto& hmm = lmc::HostRegisteredMemoryManager::GetInstance(); ++ hmm.unregisterMemory(hostPtr); ++}; ++ ++void* get_device_ptr(void* ptr) { ++ auto& hmm = lmc::HostRegisteredMemoryManager::GetInstance(); ++ return hmm.getDevicePtr(ptr); ++}; +diff --git a/csrc/ascend/managed_mem.h b/csrc/ascend/managed_mem.h +new file mode 100644 +index 0000000..ae42364 +--- /dev/null ++++ b/csrc/ascend/managed_mem.h +@@ -0,0 +1,69 @@ ++#pragma once ++#include ++#include ++#include ++#include ++ ++namespace lmc { ++ ++struct RegisteredMemoryRecord { ++ uintptr_t ptr; ++ uintptr_t devptr; ++ size_t buffSize; ++}; ++ ++/* ++* We are not responsible for acl init and ctx initialization, ++* we assume the user responsible for ctx initialization ++*/ ++class HostRegisteredMemoryManager { ++private: ++ HostRegisteredMemoryManager(); ++ ++ // Delete copy constructor and assignment operator ++ HostRegisteredMemoryManager(const HostRegisteredMemoryManager&) = delete; ++ HostRegisteredMemoryManager& operator=(const HostRegisteredMemoryManager&) = delete; ++ HostRegisteredMemoryManager(HostRegisteredMemoryManager&&) = delete; ++ HostRegisteredMemoryManager& operator=(HostRegisteredMemoryManager&&) = delete; ++ ++ std::map allocatedMap; ++ mutable std::shared_mutex mux; ++ ++public: ++ static HostRegisteredMemoryManager& GetInstance() ++ { ++ static HostRegisteredMemoryManager instance; ++ return instance; ++ } ++ ~HostRegisteredMemoryManager(); ++ ++ // Register a pointer through high level APIs (aclrt) return devPtr ++ // Returns an already existing RegisteredMemoryRecord or the newly created one ++ // Inputs: ++ // -hostPtr: host pointer of the allocated memory area to register on device ++ // -bufferSize: size of the allocated memory area to register on device ++ RegisteredMemoryRecord registerHostPtr(void* hostPtr, size_t bufferSize); //torch::Tensor& tensor); // ++ // Register a pointer through low level APIs (hal) ++ // This should be used for driver versions, where cannot rely on aclrtHostRegister() ++ // Returns the created RegisteredMemoryRecord ++ // Inputs: ++ // -bufferSize: size of the allocated memory area to register on device ++ RegisteredMemoryRecord halRegisterHostPtr(size_t bufferSize); ++ void unregisterMemory(void* hostPtr); ++ void* getDevicePtr(void* hostPtr); ++ size_t getRecordSize(void* hostPtr); ++ void unregisterAll(); ++}; ++} // namespace lmc ++ ++// Register a tensor on the current device ++// Inputs: ++// -tensor: The tensor to register on the device ++// Returns the device ptr for that tensor ++void* register_memory(torch::Tensor& tensor); ++// Reverse of register ++// Inputs: ++// -tensor: The tensor to register on the device ++void unregister_memory(torch::Tensor& tensor); ++// Takes in input a host pointer, returns the corresponding device pointer ++void* get_device_ptr(void* ptr); +diff --git a/csrc/ascend/mem_kernels.cpp b/csrc/ascend/mem_kernels.cpp +new file mode 100644 +index 0000000..6526836 +--- /dev/null ++++ b/csrc/ascend/mem_kernels.cpp +@@ -0,0 +1,245 @@ ++#include "mem_kernels.h" ++#include ++#include ++#include ++#include ++#include "utils.h" ++#include "tiling/platform/platform_ascendc.h" ++#include ++#include ++ ++template ++T* get_kernel_ptr(TENSOR_TYPE& tensor) { ++ torch::Device device = 
tensor.device(); ++ // NPU should be using PrivateUse1 ++ if (device.is_privateuseone() || device.is_cuda()) { ++ return static_cast(tensor.data_ptr()); ++ } else if (device.is_cpu()) { ++ // find device ptr based on the host pinned ptr ++ // because acl does not currently support HostGetDevicePointer API ++ void* devPtr = get_device_ptr(tensor.data_ptr()); ++ TORCH_CHECK(devPtr != nullptr, "Unable to retrieve device ptr, is this a host registered pointer ?"); ++ return reinterpret_cast(devPtr); ++ } else { ++ TORCH_CHECK(false, "Invalid device. Device must be ascend (PrivateUseOne) or pinned cpu."); ++ } ++} ++ ++/** ++ * Quickly offload KV cache from vLLM paged memory to the offloading buffer ++ * Processes all the layers at the same time ++ * ++ * Each layer in vLLM's KV buffer has a shape of ++ * [2, PAGE_BUFFER_SIZE, num_heads*head_size] ++ * ++ * Each AIV Core processes the copy for a token ++ * ++ * Therefore: ++ * AIV Core - token ++ * ++ * The function does: ++ * slot_id = slot_mapping[tokenId] ++ * ptrs[mem_offset(kv, layer, tokenId, hiddenDims)] = key_value[mem_offset(kv, layer, pages, pageSize, slot_id, hiddenDims)] ++ * ++ * Param: ++ * - direction: false means LMCache to PagedBuffer, true means PagedBuffer to ++ * LMCache ++ */ ++void multi_layer_kv_transfer(torch::Tensor& key_value, // [kv, num_layer, num_tokens, hidden] ++ const torch::Tensor& key_value_ptrs, // [num_layers] ++ const torch::Tensor& slot_mapping, // [num_tokens] ++ const torch::Device& paged_memory_device, ++ const int page_buffer_size, const bool direction, ++ const bool use_mla) { ++ uint8_t* key_value_ptr = get_kernel_ptr(key_value); ++ // it is actually a uint8_t**. we will reinterpret it inside the kernel ++ uint8_t* page_buffer_ptrs = get_kernel_ptr(key_value_ptrs); ++ uint8_t* slot_mapping_ptr = get_kernel_ptr(slot_mapping); ++ ++ int num_layers = key_value.size(1); ++ int num_tokens = slot_mapping.size(0); ++ int hidden_dims = key_value.size(-1); ++ int kv_size = 2; ++ if (use_mla) { ++ kv_size = 1; ++ } ++ ++ const c10::OptionalDeviceGuard device_guard(paged_memory_device); ++ // we require the kv ptr list to be on the device too ++ const c10::OptionalDeviceGuard kv_device_guard(device_of(key_value_ptrs)); ++ ++ const aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); ++ at::ScalarType scalar_type = key_value.scalar_type(); ++ at::ScalarType slot_type = slot_mapping.scalar_type(); ++ const char* socName = aclrtGetSocName(); ++ ++ at_npu::native::OpCommand cmd; ++ cmd.Name("multi_layer_kv_transfer_kernel"); ++ cmd.SetCustomHandler([scalar_type, slot_type, socName, stream, page_buffer_ptrs, key_value_ptr, ++ slot_mapping_ptr, hidden_dims, kv_size, num_layers, page_buffer_size, ++ num_tokens, direction]()->int{ ++ auto slot_num = vllm_ascend::get_dtype_from_torch(slot_type); ++ auto dtype_num = vllm_ascend::get_dtype_from_torch(scalar_type); ++ auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socName); ++ uint32_t aiv_num = ascendcPlatform->GetCoreNumAiv(); ++ lmc::multi_layer_kv_transfer_kernel(dtype_num, slot_num, aiv_num, stream, page_buffer_ptrs, key_value_ptr, ++ slot_mapping_ptr, hidden_dims, kv_size, num_layers, page_buffer_size, ++ num_tokens, direction); ++ return 0; ++ }); ++ cmd.Run(); ++ return ; ++}; ++ ++ ++void multi_layer_kv_transfer_unilateral(torch::Tensor& key_value, ++ const torch::Tensor& key_ptrs, ++ const torch::Tensor& value_ptrs, ++ const torch::Tensor& slot_mapping, ++ const torch::Device& paged_memory_device, ++ const int 
page_buffer_size, ++ const bool direction){ ++ // TODO: ++ PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend."); ++ throw py::error_already_set(); ++}; ++ ++ ++void single_layer_kv_transfer(torch::Tensor& lmc_key_value_cache, // [num_tokens, 2, num_heads*head_size] ++ // or ++ // [2, num_tokens, num_heads*head_size] ++ torch::Tensor& vllm_key_cache, // [num_blocks, block_size, num_heads, head_size] ++ torch::Tensor& vllm_value_cache, // [....] ++ torch::Tensor& slot_mapping, // [num_tokens] ++ const bool direction, // false: LMCache to PagedBuffer, true: PagedBuffer to LMCache ++ const bool token_major // true: lmc_key_value_cache is [num_tokens, 2, num_heads*head_size] ++ // false: otherwise ++) { ++ uint8_t *lmc_key_value_cache_ptr = get_kernel_ptr(lmc_key_value_cache); ++ uint8_t *vllm_key_cache_ptr = get_kernel_ptr(vllm_key_cache); ++ uint8_t *vllm_value_cache_ptr = get_kernel_ptr(vllm_value_cache); ++ uint8_t *slot_mapping_ptr = get_kernel_ptr(slot_mapping); ++ ++ int num_tokens = slot_mapping.size(0); ++ int hidden_dims = lmc_key_value_cache.size(-1); ++ ++ const c10::OptionalDeviceGuard device_guard(device_of(vllm_key_cache)); ++ const c10::OptionalDeviceGuard slot_device_guard(device_of(slot_mapping)); ++ const aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); ++ ++ at::ScalarType scalar_type = vllm_key_cache.scalar_type(); ++ at::ScalarType slot_type = slot_mapping.scalar_type(); ++ ++ const char* socName = aclrtGetSocName(); ++ ++ at_npu::native::OpCommand cmd; ++ cmd.Name("single_layer_kv_transfer_kernel"); ++ cmd.SetCustomHandler([scalar_type, slot_type, socName, stream, lmc_key_value_cache_ptr, ++ vllm_key_cache_ptr, vllm_value_cache_ptr, slot_mapping_ptr, ++ hidden_dims, num_tokens, direction, token_major]() -> int { ++ auto slot_num = vllm_ascend::get_dtype_from_torch(slot_type); ++ auto dtype_num = vllm_ascend::get_dtype_from_torch(scalar_type); ++ auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socName); ++ uint32_t aiv_num = ascendcPlatform->GetCoreNumAiv(); ++ // TODO: We will add the isMLA argument once the signature have support for the MLA. 
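++    // The trailing 'false' passed below is the isMLA flag of single_layer_kv_transfer_kernel;
++    // it stays hard-coded until the host-side signature exposes MLA (see the TODO above).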
++ lmc::single_layer_kv_transfer_kernel(dtype_num, slot_num, aiv_num, stream, lmc_key_value_cache_ptr, ++ vllm_key_cache_ptr, vllm_value_cache_ptr, slot_mapping_ptr, ++ hidden_dims, num_tokens, direction, token_major, false); ++ return 0; ++ }); ++ cmd.Run(); ++ return ; ++}; ++ ++void load_and_reshape_flash( ++ torch::Tensor& key_value, // [2, num_layer, num_tokens, num_heads*head_size] ++ // must be one gpu / pinned cpu ++ torch::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size] ++ torch::Tensor& value_cache, // [num_blocks, block_size, num_heads, head_size] ++ torch::Tensor& slot_mapping, // [num_tokens], ++ const int layer_idx) { ++ ++ uint8_t* key_value_ptr = get_kernel_ptr(key_value); ++ uint8_t* key_cache_ptr = get_kernel_ptr(key_cache); ++ uint8_t* value_cache_ptr = get_kernel_ptr(value_cache); ++ ++ uint8_t* slot_mapping_ptr = get_kernel_ptr(slot_mapping); ++ ++ int num_tokens = slot_mapping.size(0); ++ int num_layers = key_value.size(1); ++ int block_size = key_cache.size(1); ++ int num_blocks = key_cache.size(0); ++ int hidden_dims = key_value.size(-1); ++ const c10::OptionalDeviceGuard device_guard(device_of(key_cache)); ++ const aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); ++ ++ at::ScalarType scalar_type = key_value.scalar_type(); ++ at::ScalarType slot_type = slot_mapping.scalar_type(); ++ const char* socName = aclrtGetSocName(); ++ ++ at_npu::native::OpCommand cmd; ++ cmd.Name("load_and_reshape_flash_kernel"); ++ cmd.SetCustomHandler([scalar_type, slot_type, socName, stream, key_value_ptr, ++ key_cache_ptr, value_cache_ptr, slot_mapping_ptr, ++ hidden_dims, num_blocks, block_size, ++ num_tokens, num_layers, layer_idx]()->int { ++ auto slot_num = vllm_ascend::get_dtype_from_torch(slot_type); ++ auto dtype_num = vllm_ascend::get_dtype_from_torch(scalar_type); ++ auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socName); ++ uint32_t aiv_num = ascendcPlatform->GetCoreNumAiv(); ++ lmc::load_and_reshape_flash_kernel(dtype_num, slot_num, aiv_num, stream, key_value_ptr, ++ key_cache_ptr, value_cache_ptr, slot_mapping_ptr, ++ hidden_dims, num_blocks, block_size, ++ num_tokens, num_layers, layer_idx, true); ++ return 0; ++ }); ++ cmd.Run(); ++ return; ++}; ++ ++void reshape_and_cache_back_flash( ++ torch::Tensor& key_value, // [2, num_layer, num_tokens, num_heads*head_size] ++ // must be one gpu / pinned cpu ++ torch::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size] ++ torch::Tensor& value_cache, // [num_blocks, block_size, num_heads, head_size] ++ torch::Tensor& slot_mapping, // [num_tokens], ++ const int layer_idx) { ++ ++ uint8_t* key_value_ptr = get_kernel_ptr(key_value); ++ uint8_t* key_cache_ptr = get_kernel_ptr(key_cache); ++ uint8_t* value_cache_ptr = get_kernel_ptr(value_cache); ++ ++ uint8_t* slot_mapping_ptr = get_kernel_ptr(slot_mapping); ++ ++ int num_tokens = slot_mapping.size(0); ++ int num_layers = key_value.size(1); ++ int block_size = key_cache.size(1); ++ int num_blocks = key_cache.size(0); ++ int hidden_dims = key_value.size(-1); ++ const c10::OptionalDeviceGuard device_guard(device_of(key_cache)); ++ const aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); ++ ++ at::ScalarType scalar_type = key_value.scalar_type(); ++ at::ScalarType slot_type = slot_mapping.scalar_type(); ++ ++ const char* socName = aclrtGetSocName(); ++ ++ at_npu::native::OpCommand cmd; ++ cmd.Name("reshape_and_cache_back_flash"); ++ cmd.SetCustomHandler([scalar_type, slot_type, socName, stream, 
key_value_ptr, ++ key_cache_ptr, value_cache_ptr, slot_mapping_ptr, ++ hidden_dims, num_blocks, block_size, ++ num_tokens, num_layers, layer_idx]() -> int { ++ auto slot_num = vllm_ascend::get_dtype_from_torch(slot_type); ++ auto dtype_num = vllm_ascend::get_dtype_from_torch(scalar_type); ++ auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socName); ++ uint32_t aiv_num = ascendcPlatform->GetCoreNumAiv(); ++ lmc::load_and_reshape_flash_kernel(dtype_num, slot_num, aiv_num, stream, key_value_ptr, ++ key_cache_ptr, value_cache_ptr, slot_mapping_ptr, ++ hidden_dims, num_blocks, block_size, ++ num_tokens, num_layers, layer_idx, false); ++ return 0; ++ }); ++ cmd.Run(); ++ return; ++}; +diff --git a/csrc/ascend/mem_kernels.h b/csrc/ascend/mem_kernels.h +new file mode 100644 +index 0000000..01e0494 +--- /dev/null ++++ b/csrc/ascend/mem_kernels.h +@@ -0,0 +1,58 @@ ++#pragma once ++#include ++#include ++#include "managed_mem.h" ++#include "kernels/types.h" ++ ++namespace lmc { ++void multi_layer_kv_transfer_kernel(vllm_ascend::AscendType type, vllm_ascend::AscendType slotType, uint32_t blockDim, ++ void *stream, uint8_t *pagedKVCaches, uint8_t *dstCacheTensor, ++ uint8_t *slotmappings, const int64_t hiddenDims, const int32_t kvs, ++ const int32_t numLayers, const int64_t pageBuffSize, const int32_t numTokensChunk, ++ const bool page2L); ++ ++void single_layer_kv_transfer_kernel(vllm_ascend::AscendType type, vllm_ascend::AscendType slotType, ++ uint32_t blockDim, void *stream, uint8_t *dstCacheTensor, ++ uint8_t *keyCachePtr, uint8_t *valueCachePtr, ++ uint8_t *slotmappings, const int64_t hiddenDims, const int32_t numTokens, ++ const bool page2L, const bool tokenMajor, const bool isMLA); ++ ++void load_and_reshape_flash_kernel(vllm_ascend::AscendType type, vllm_ascend::AscendType slotType, ++ uint32_t blockDim, void *stream, uint8_t *dstCacheTensor, uint8_t *keyCachePtr, ++ uint8_t *valueCachePtr, uint8_t *slotmappings, const int64_t hiddenDims, ++ const int64_t numPages, const int32_t pagedSize, const int32_t numTokens, ++ const int32_t numLayers, const int32_t layerIdx, const bool page2L); ++} ++ ++ ++void multi_layer_kv_transfer(torch::Tensor& key_value, // [kv, num_layer, num_tokens, hidden] ++ const torch::Tensor& key_value_ptrs, // [num_layers] ++ const torch::Tensor& slot_mapping, // [num_tokens] ++ const torch::Device& paged_memory_device, ++ const int page_buffer_size, const bool direction, ++ const bool use_mla); ++ ++void multi_layer_kv_transfer_unilateral(torch::Tensor& key_value, ++ const torch::Tensor& key_ptrs, ++ const torch::Tensor& value_ptrs, ++ const torch::Tensor& slot_mapping, ++ const torch::Device& paged_memory_device, ++ const int page_buffer_size, ++ const bool direction); ++ ++void single_layer_kv_transfer(torch::Tensor& lmc_key_value_cache, ++ torch::Tensor& vllm_key_cache, ++ torch::Tensor& vllm_value_cache, ++ torch::Tensor& slot_mapping, ++ const bool direction, ++ const bool token_major = false); ++ ++void load_and_reshape_flash(torch::Tensor& key_value, torch::Tensor& key_cache, ++ torch::Tensor& value_cache, ++ torch::Tensor& slot_mapping, const int layer_idx); ++ ++void reshape_and_cache_back_flash(torch::Tensor& key_value, ++ torch::Tensor& key_cache, ++ torch::Tensor& value_cache, ++ torch::Tensor& slot_mapping, ++ const int layer_idx); +\ No newline at end of file +diff --git a/csrc/ascend/pos_kernels.cpp b/csrc/ascend/pos_kernels.cpp +new file mode 100644 +index 0000000..01ee897 +--- /dev/null ++++ b/csrc/ascend/pos_kernels.cpp +@@ 
-0,0 +1,14 @@ ++#include "pos_kernels.h" ++#include ++#include ++ ++namespace py = pybind11; ++ ++void rotary_embedding_k_fused(const torch::Tensor& old_positions, ++ const torch::Tensor& new_positions, ++ torch::Tensor& key, int64_t head_size, ++ const torch::Tensor& cos_sin_cache, bool is_neox) { ++ // TODO: ++ PyErr_SetString(PyExc_NotImplementedError, "Please contact LMCache Ascend."); ++ throw py::error_already_set(); ++}; +\ No newline at end of file +diff --git a/csrc/ascend/pos_kernels.h b/csrc/ascend/pos_kernels.h +new file mode 100644 +index 0000000..b0bcaf1 +--- /dev/null ++++ b/csrc/ascend/pos_kernels.h +@@ -0,0 +1,10 @@ ++#pragma once ++#include ++#include ++#include ++#include ++ ++void rotary_embedding_k_fused(const torch::Tensor& old_positions, ++ const torch::Tensor& new_positions, ++ torch::Tensor& key, int64_t head_size, ++ const torch::Tensor& cos_sin_cache, bool is_neox); +\ No newline at end of file +diff --git a/csrc/ascend/torch_tensor.h b/csrc/ascend/torch_tensor.h +new file mode 100644 +index 0000000..f1842dd +--- /dev/null ++++ b/csrc/ascend/torch_tensor.h +@@ -0,0 +1,33 @@ ++#pragma once ++#include ++#include ++#include ++#include "managed_mem.h" ++ ++void unregisterPtr(void* ptr) { ++ if (ptr) { ++ auto& hmm = lmc::HostRegisteredMemoryManager::GetInstance(); ++ hmm.unregisterMemory(ptr); ++ } ++} ++ ++torch::Tensor create_pinned_host_registered_tensor(size_t bufferSize) { ++ torch::TensorOptions tensorOpsCpu = torch::TensorOptions() ++ .dtype(torch::kUInt8) ++ .device(torch::kCPU) ++ .pinned_memory(true); ++ TORCH_CHECK(bufferSize > 0, "Buffer size must be greater than zero. Got: " + std::to_string(bufferSize)); ++ ++ // unlikely this would be greater than int64_t ++ int64_t numel = static_cast(bufferSize); ++ ++ void* hostPtr; ++ aclError err = aclrtMallocHost((void**)&hostPtr, bufferSize); ++ TORCH_CHECK(err == 0, "Unable to malloc host buffer, error: " + std::to_string(err)); ++ ++ auto& hmm = lmc::HostRegisteredMemoryManager::GetInstance(); ++ hmm.registerHostPtr(hostPtr, bufferSize); ++ ++ std::vector dims = {numel}; ++ return torch::from_blob(hostPtr, dims, unregisterPtr, tensorOpsCpu); ++} +\ No newline at end of file +diff --git a/csrc/ascend/utils.cmake b/csrc/ascend/utils.cmake +new file mode 100644 +index 0000000..ebf06c6 +--- /dev/null ++++ b/csrc/ascend/utils.cmake +@@ -0,0 +1,28 @@ ++# ++# Run `EXPR` in python. The standard output of python is stored in `OUT` and ++# has trailing whitespace stripped. If an error is encountered when running ++# python, a fatal message `ERR_MSG` is issued. ++# ++function (run_python OUT EXPR ERR_MSG) ++ execute_process( ++ COMMAND ++ "${PYTHON_EXECUTABLE}" "-c" "${EXPR}" ++ OUTPUT_VARIABLE PYTHON_OUT ++ RESULT_VARIABLE PYTHON_ERROR_CODE ++ ERROR_VARIABLE PYTHON_STDERR ++ OUTPUT_STRIP_TRAILING_WHITESPACE) ++ ++ if(NOT PYTHON_ERROR_CODE EQUAL 0) ++ message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") ++ endif() ++ set(${OUT} ${PYTHON_OUT} PARENT_SCOPE) ++endfunction() ++ ++# Run `EXPR` in python after importing `PKG`. Use the result of this to extend ++# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. 
++macro (append_cmake_prefix_path PKG EXPR) ++ run_python(_PREFIX_PATH ++ "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") ++ list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH}) ++endmacro() ++ +diff --git a/csrc/ascend/utils.h b/csrc/ascend/utils.h +new file mode 100644 +index 0000000..c1c02cd +--- /dev/null ++++ b/csrc/ascend/utils.h +@@ -0,0 +1,39 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++#pragma once ++ ++#include "kernels/types.h" ++#include ++#include ++ ++namespace vllm_ascend { ++AscendType get_dtype_from_torch(at::ScalarType scalarType) ++{ ++ if (scalarType == at::ScalarType::Float) { ++ return AscendType::FP32; ++ } else if (scalarType == at::ScalarType::BFloat16) { ++ return AscendType::BF16; ++ } else if (scalarType == at::ScalarType::Half) { ++ return AscendType::FP16; ++ } else if (scalarType == at::ScalarType::Long) { ++ return AscendType::INT64; ++ } else if (scalarType == at::ScalarType::Int) { ++ return AscendType::INT32; ++ } else { ++ TORCH_CHECK(false, "ScalarType not supported."); ++ } ++}; ++} // namespace vllm_ascend +diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp +index 4b4b900..d7ad00d 100644 +--- a/csrc/pybind.cpp ++++ b/csrc/pybind.cpp +@@ -15,19 +15,29 @@ + */ + + #include +-#include "mem_kernels.cuh" +-#include "cachegen_kernels.cuh" +-#include "pos_kernels.cuh" ++#ifdef USE_ASCEND ++ #include "ascend/mem_kernels.h" ++ #include "ascend/managed_mem.h" ++ #include "ascend/cachegen_kernels.h" ++ #include "ascend/pos_kernels.h" ++#else ++ #include "mem_kernels.cuh" ++ #include "cachegen_kernels.cuh" ++ #include "pos_kernels.cuh" ++#endif + #include + #include + + namespace py = pybind11; + + PYBIND11_MODULE(c_ops, m) { ++#ifdef USE_ASCEND ++ m.def("host_register", ®ister_memory); ++#endif + m.def("multi_layer_kv_transfer", &multi_layer_kv_transfer); ++ m.def("single_layer_kv_transfer", &single_layer_kv_transfer); + m.def("multi_layer_kv_transfer_unilateral", + &multi_layer_kv_transfer_unilateral); +- m.def("single_layer_kv_transfer", &single_layer_kv_transfer); + m.def("load_and_reshape_flash", &load_and_reshape_flash); + m.def("reshape_and_cache_back_flash", &reshape_and_cache_back_flash); + m.def("encode_fast_new", &encode_cuda_new); +diff --git a/lmcache/integration/vllm/vllm_adapter.py b/lmcache/integration/vllm/vllm_adapter.py +index a316561..8527d02 100644 +--- a/lmcache/integration/vllm/vllm_adapter.py ++++ b/lmcache/integration/vllm/vllm_adapter.py +@@ -23,25 +23,35 @@ from torch.nn.utils.rnn import pad_sequence + import torch + import torch.distributed as dist + +-if TYPE_CHECKING: +- from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata +- + # Third Party + from vllm.attention import AttentionMetadata + +-# from vllm.attention.backends.flash_attn import FlashAttentionMetadata +-try: ++from vllm.platforms import current_platform ++if current_platform.is_cuda() or current_platform.device_type == "cuda": ++ if 
TYPE_CHECKING: ++ from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata ++ ++ # from vllm.attention.backends.flash_attn import FlashAttentionMetadata ++ try: ++ # Third Party ++ from vllm.attention.backends.flash_attn import FlashAttentionMetadata ++ except (ModuleNotFoundError, ImportError): ++ # vllm_flash_attn is not installed, try the ROCm FA metadata ++ from vllm.attention.backends.rocm_flash_attn import ( ++ ROCmFlashAttentionMetadata as FlashAttentionMetadata, ++ ) ++ + # Third Party +- from vllm.attention.backends.flash_attn import FlashAttentionMetadata +-except (ModuleNotFoundError, ImportError): +- # vllm_flash_attn is not installed, try the ROCm FA metadata +- from vllm.attention.backends.rocm_flash_attn import ( +- ROCmFlashAttentionMetadata as FlashAttentionMetadata, +- ) ++ from vllm.attention.backends.flashmla import FlashMLAMetadata ++ from vllm.attention.backends.mla.common import MLACommonMetadata ++elif current_platform.device_name == "npu": ++ if TYPE_CHECKING: ++ from vllm_ascend.worker.model_runner import ModelInputForNPUWithSamplingMetadata as ModelInputForGPUWithSamplingMetadata ++ from vllm_ascend.attention.attention_v1 import AscendMetadata as FlashAttentionMetadata ++ from vllm_ascend.attention.mla_v1 import AscendMLAMetadata as FlashMLAMetadata, AscendMLAMetadata as MLACommonMetadata ++ from torch_npu.contrib import transfer_to_npu ++ import lmcache.c_ops as lmc_ops + +-# Third Party +-from vllm.attention.backends.flashmla import FlashMLAMetadata +-from vllm.attention.backends.mla.common import MLACommonMetadata + from vllm.config import ( + CacheConfig, + ModelConfig, +@@ -224,9 +234,9 @@ def init_lmcache_engine( + use_mla=use_mla, + ) + engine = LMCacheEngineBuilder.get_or_create( +- ENGINE_NAME, config, metadata, vllm_gpu_connector ++ ENGINE_NAME, config, metadata, vllm_gpu_connector, device=current_platform.device_name + ) +- ++ + return engine + + +diff --git a/lmcache/v1/cache_engine.py b/lmcache/v1/cache_engine.py +index eaba752..4fb57a3 100644 +--- a/lmcache/v1/cache_engine.py ++++ b/lmcache/v1/cache_engine.py +@@ -769,6 +769,7 @@ class LMCacheEngineBuilder: + def _Create_memory_allocator( + config: LMCacheEngineConfig, + metadata: LMCacheEngineMetadata, ++ device: str = "cuda", + ) -> MemoryAllocatorInterface: + if config.enable_nixl: + assert config.nixl_buffer_device is not None +@@ -810,7 +811,7 @@ class LMCacheEngineBuilder: + return CuFileMemoryAllocator(config.cufile_buffer_size * 1024**2) + + max_local_cpu_size = config.max_local_cpu_size +- return MixedMemoryAllocator(int(max_local_cpu_size * 1024**3)) ++ return MixedMemoryAllocator(int(max_local_cpu_size * 1024**3), device=device) + + @staticmethod + def _Create_token_database( +@@ -828,6 +829,7 @@ class LMCacheEngineBuilder: + config: LMCacheEngineConfig, + metadata: LMCacheEngineMetadata, + gpu_connector: GPUConnectorInterface, ++ device: str = "cuda" + ) -> LMCacheEngine: + """ + Builds a new LMCacheEngine instance if it doesn't already exist for the +@@ -838,7 +840,7 @@ class LMCacheEngineBuilder: + """ + logger.info(f"Creating LMCacheEngine instance {instance_id}") + if instance_id not in cls._instances: +- memory_allocator = cls._Create_memory_allocator(config, metadata) ++ memory_allocator = cls._Create_memory_allocator(config, metadata, device=device) + token_database = cls._Create_token_database(config, metadata) + stat_logger = LMCacheStatsLogger(metadata, log_interval=10) + +@@ -885,4 +887,4 @@ class LMCacheEngineBuilder: + cls._cfgs.pop(instance_id, None) + 
cls._metadatas.pop(instance_id, None) + cls._stat_loggers.pop(instance_id, None) +- LMCStatsMonitor.DestroyInstance() ++ LMCStatsMonitor.DestroyInstance() +\ No newline at end of file +diff --git a/lmcache/v1/gpu_connector.py b/lmcache/v1/gpu_connector.py +index 62fe7d5..8b257c9 100644 +--- a/lmcache/v1/gpu_connector.py ++++ b/lmcache/v1/gpu_connector.py +@@ -18,6 +18,10 @@ import abc + + # Third Party + import torch ++try: ++ from torch_npu.contrib import transfer_to_npu ++except (ModuleNotFoundError, ImportError): ++ print("Not importing NPU packages. We are not running on Ascend") + + # First Party + from lmcache.integration.vllm.utils import ENGINE_NAME +@@ -153,7 +157,8 @@ class VLLMPagedMemGPUConnectorV2(GPUConnectorInterface): + def _initialize_pointers(self, kv_caches: List[torch.Tensor]) -> torch.Tensor: + self.kv_cache_pointers.numpy()[:] = [t.data_ptr() for t in kv_caches] + device = kv_caches[0].device +- assert device.type == "cuda", "The device should be CUDA." ++ from vllm.platforms import current_platform ++ assert device.type == current_platform.device_name # The device should be CUDA or Ascend based on vllm. + idx = device.index + if idx not in self.kv_cache_pointers_on_gpu: + self.kv_cache_pointers_on_gpu[idx] = torch.empty( +@@ -161,9 +166,9 @@ class VLLMPagedMemGPUConnectorV2(GPUConnectorInterface): + ) + self.kv_cache_pointers_on_gpu[idx].copy_(self.kv_cache_pointers) + if self.use_mla: +- # kv_caches[0].shape: [num_pages, page_size, head_size] +- assert kv_caches[0].dim() == 3 +- self.page_buffer_size = kv_caches[0].shape[0] * kv_caches[0].shape[1] ++ # kv_caches[0].shape: [num_pages, page_size, head_size] (vllm) or ++ # kv_caches[0].shape: [1, num_pages, page_size, head_size] (vllm-Ascend) ++ self.page_buffer_size = kv_caches[0].shape[-3] * kv_caches[0].shape[-2] + else: + # kv_caches[0].shape: [2, num_pages, page_size, num_heads, head_size] + assert kv_caches[0].dim() == 5 +@@ -798,7 +803,7 @@ class VLLMPagedMemLayerwiseGPUConnector(GPUConnectorInterface): + memory_obj.tensor, + self.kvcaches[layer_id][0], + self.kvcaches[layer_id][1], +- slot_mapping_full, ++ slot_mapping[start:end], + False, + True, + ) +@@ -819,7 +824,8 @@ class VLLMPagedMemLayerwiseGPUConnector(GPUConnectorInterface): + current_stream.wait_stream(self.load_stream) + + # free the buffer memory +- tmp_gpu_buffer_obj.ref_count_down() ++ if self.use_gpu: ++ tmp_gpu_buffer_obj.ref_count_down() + + logger.debug(f"Finished loading layer {layer_id}") + yield +@@ -932,7 +938,8 @@ class VLLMPagedMemLayerwiseGPUConnector(GPUConnectorInterface): + logger.debug(f"Finished offloading layer {layer_id}") + + # free the buffer memory +- tmp_gpu_buffer_obj.ref_count_down() ++ if self.use_gpu: ++ tmp_gpu_buffer_obj.ref_count_down() + yield + + def get_shape(self, num_tokens: int) -> torch.Size: +diff --git a/lmcache/v1/memory_management.py b/lmcache/v1/memory_management.py +index 68b6f12..a17cc8a 100644 +--- a/lmcache/v1/memory_management.py ++++ b/lmcache/v1/memory_management.py +@@ -31,6 +31,10 @@ import torch + from lmcache.logging import init_logger + from lmcache.observability import LMCStatsMonitor + from lmcache.utils import _lmcache_nvtx_annotate ++import lmcache.c_ops as lmc_ops ++ ++# Third Party ++from vllm.platforms import current_platform + + logger = init_logger(__name__) + +@@ -563,6 +567,19 @@ class MemoryAllocatorInterface(metaclass=abc.ABCMeta): + """ + raise NotImplementedError + ++ @abc.abstractmethod ++ def registerHostPtr( ++ self, ++ device: str = "cuda", ++ size: int = 0 ++ ): ++ """ ++ 
Register the host ptr with the device using device specific APIs. ++ ++ :param str device: The device type to register the host ptr with. ++ """ ++ raise NotImplementedError ++ + def close(self): + """ + Closes the memory allocator. +@@ -900,6 +917,10 @@ class TensorMemoryAllocator(MemoryAllocatorInterface): + clear = False + return clear + ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ # do nothing for tensor memory allocator ++ pass ++ + + class PagedTensorMemoryAllocator(MemoryAllocatorInterface): + """ +@@ -1151,6 +1172,10 @@ class PagedTensorMemoryAllocator(MemoryAllocatorInterface): + # FIXME: NIXL-related memory leak should be handled somewhere (else). + del self.buffer + ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ # do nothing for paged tensor memory allocator ++ pass ++ + + class BufferAllocator(MemoryAllocatorInterface): + """Allocates memory in the pre-allocated pinned memory.""" +@@ -1201,6 +1226,9 @@ class BufferAllocator(MemoryAllocatorInterface): + def memcheck(self): + return True + ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ pass ++ + + class HostMemoryAllocator(MemoryAllocatorInterface): + """Allocates memory in the pre-allocated Host memory.""" +@@ -1272,21 +1300,20 @@ class HostMemoryAllocator(MemoryAllocatorInterface): + with self.host_mem_lock: + return self.allocator.memcheck() + ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ pass + + class PinMemoryAllocator(MemoryAllocatorInterface): + """Allocates memory in the pre-allocated pinned memory.""" + +- def __init__(self, size: int, use_paging: bool = False, **kwargs): ++ def __init__(self, size: int, use_paging: bool = False, device: str = "cuda", **kwargs): + """ + :param int size: The size of the pinned memory in bytes. + """ + + self.buffer = torch.empty(size, dtype=torch.uint8) +- ptr = self.buffer.data_ptr() +- err = torch.cuda.cudart().cudaHostRegister(ptr, size, 0) +- assert err == 0, ( +- f"cudaHostRegister failed: {torch.cuda.cudart().cudaGetErrorString(err)}" +- ) ++ self._device = device ++ self.registerHostPtr(device, size) + self._unregistered = False + + if use_paging: +@@ -1353,9 +1380,22 @@ class PinMemoryAllocator(MemoryAllocatorInterface): + def close(self): + if not self._unregistered: + torch.cuda.synchronize() +- torch.cuda.cudart().cudaHostUnregister(self.buffer.data_ptr()) ++ if self._device == "cuda": ++ torch.cuda.cudart().cudaHostUnregister(self.buffer.data_ptr()) + self._unregistered = True + ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ ptr = self.buffer.data_ptr() ++ if device == "cuda": ++ assert size >= 0, ("size must be non-negative and greater than 0.") ++ err = torch.cuda.cudart().cudaHostRegister(ptr, size, 0) ++ assert err == 0, ( ++ f"cudaHostRegister failed: {torch.cuda.cudart().cudaGetErrorString(err)}" ++ ) ++ elif device == "npu": ++ # Ascend need to manually manage host register API and memory is pinned ++ self.buffer = self.buffer.pin_memory() ++ lmc_ops.host_register(self.buffer) + + class MixedMemoryAllocator(MemoryAllocatorInterface): + """ +@@ -1363,17 +1403,14 @@ class MixedMemoryAllocator(MemoryAllocatorInterface): + (2) byte_array buffer memory. + """ + +- def __init__(self, size: int, use_paging: bool = False, **kwargs): ++ def __init__(self, size: int, use_paging: bool = False, device: str = "cuda", **kwargs): + """ + :param int size: The size of the pinned memory in bytes. 
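++        :param str device: Device type ("cuda" or "npu") that selects which
++            host-registration path registerHostPtr takes.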
+ """ + + self.buffer = torch.empty(size, dtype=torch.uint8) +- ptr = self.buffer.data_ptr() +- err = torch.cuda.cudart().cudaHostRegister(ptr, size, 0) +- assert err == 0, ( +- f"cudaHostRegister failed: {torch.cuda.cudart().cudaGetErrorString(err)}" +- ) ++ self._device = device ++ self.registerHostPtr(device, size) + self._unregistered = False + + if use_paging: +@@ -1487,8 +1524,22 @@ class MixedMemoryAllocator(MemoryAllocatorInterface): + def close(self): + if not self._unregistered: + torch.cuda.synchronize() +- torch.cuda.cudart().cudaHostUnregister(self.buffer.data_ptr()) ++ if self._device == "cuda": ++ torch.cuda.cudart().cudaHostUnregister(self.buffer.data_ptr()) + self._unregistered = True ++ ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ ptr = self.buffer.data_ptr() ++ if device == "cuda": ++ assert size >= 0, ("size must be non-negative and greater than 0.") ++ err = torch.cuda.cudart().cudaHostRegister(ptr, size, 0) ++ assert err == 0, ( ++ f"cudaHostRegister failed: {torch.cuda.cudart().cudaGetErrorString(err)}" ++ ) ++ elif device == "npu": ++ # Ascend need to manually manage host register API and memory is pinned ++ self.buffer = self.buffer.pin_memory() ++ lmc_ops.host_register(self.buffer) + + + class GPUMemoryAllocator(MemoryAllocatorInterface): +@@ -1570,6 +1621,8 @@ class GPUMemoryAllocator(MemoryAllocatorInterface): + with self.device_mem_lock: + return self.allocator.memcheck() + ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ pass + + class AdHocMemoryAllocator(MemoryAllocatorInterface): + """ +@@ -1650,6 +1703,9 @@ class AdHocMemoryAllocator(MemoryAllocatorInterface): + def memcheck(self): + return True + ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ pass ++ + + class CuFileMemoryAllocator(GPUMemoryAllocator): + def __init__(self, size: int, device=None): +@@ -1749,3 +1805,6 @@ class NixlCPUMemoryAllocator(MemoryAllocatorInterface): + self.cpu_allocator.batched_free(memory_objs, update_stats=update_stats) + else: + raise ValueError(f"Unsupported allocator type: {allocator_type}") ++ ++ def registerHostPtr(self, device: str = "cuda", size: int = 0): ++ pass +\ No newline at end of file +diff --git a/lmcache/v1/storage_backend/storage_manager.py b/lmcache/v1/storage_backend/storage_manager.py +index 82d6edc..ba0b103 100644 +--- a/lmcache/v1/storage_backend/storage_manager.py ++++ b/lmcache/v1/storage_backend/storage_manager.py +@@ -425,4 +425,4 @@ class StorageManager: + if self.thread.is_alive(): + self.thread.join() + +- logger.info("Storage manager closed.") ++ logger.info("Storage manager closed.") +\ No newline at end of file +diff --git a/pyproject.toml b/pyproject.toml +index f8ee543..8a362c1 100644 +--- a/pyproject.toml ++++ b/pyproject.toml +@@ -6,7 +6,7 @@ requires = [ + "packaging>=24.2", + "setuptools>=77.0.3,<81.0.0", + "setuptools_scm>=8", +- "torch==2.7.0", ++ "torch>=2.5.1", + "wheel", + ] + build-backend = "setuptools.build_meta" +@@ -14,8 +14,8 @@ + [project] + name = "lmcache" + authors = [{name = "LMCache Team", email = "lmcacheteam@gmail.com"}] +-license = "Apache-2.0" +-license-files = ["LICENSE"] ++license = { file = "LICENSE" } ++ + readme = "README.md" + description = "A LLM serving engine extension to reduce TTFT and increase throughput, especially under long-context scenarios." 
+ classifiers = [ + +diff --git a/requirements/ascend.txt b/requirements/ascend.txt +new file mode 100644 +index 0000000..ad04d52 +--- /dev/null ++++ b/requirements/ascend.txt +@@ -0,0 +1,7 @@ ++torch>=2.5.1 ++# Common project dependencies ++-r common.txt ++ ++ ++# These must be updated alongside torch to correspond to vLLM-ascend versions ++torch-npu>=2.5.1.post1.dev20250619 +\ No newline at end of file +diff --git a/requirements/build.txt b/requirements/build.txt +index 85aacb4..0923a0b 100644 +--- a/requirements/build.txt ++++ b/requirements/build.txt +@@ -4,6 +4,6 @@ ninja + packaging>=24.2 + setuptools>=77.0.3,<81.0.0 + setuptools_scm>=8 +-torch==2.7.0 # Corresponds to the version used by vLLM main branch ++torch>=2.5.1 # Corresponds to the version used by vLLM main branch + wheel + +diff --git a/requirements/common.txt b/requirements/common.txt +index 34e30c2..35011ad 100644 +--- a/requirements/common.txt ++++ b/requirements/common.txt +@@ -1,8 +1,7 @@ + aiofile + aiofiles + aiohttp +-cufile-python +-infinistore ++infinistore; platform_machine == 'x86_64' + msgspec + numpy + nvtx +@@ -15,6 +14,6 @@ safetensors + setuptools>=77.0.3,<81.0.0 + setuptools_scm>=8 + sortedcontainers +-torch==2.7.0 # Should correspond to the version used by vLLM main branch ++torch>=2.5.1 # Should correspond to the version used by vLLM main branch + transformers >= 4.51.1 + xxhash==3.5.0 +diff --git a/requirements/cuda.txt b/requirements/cuda.txt +index 719261d..4912df9 100644 +--- a/requirements/cuda.txt ++++ b/requirements/cuda.txt +@@ -9,3 +9,4 @@ nvidia-ml-py # for pynvml package + torch == 2.7.0 + torchvision == 0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version + xformers == 0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.7.0 ++cufile-python +diff --git a/setup.py b/setup.py +index 00a4980..b964aff 100644 +--- a/setup.py ++++ b/setup.py +@@ -4,7 +4,17 @@ import os + import sys + + # Third Party +-from setuptools import find_packages, setup ++from setuptools import find_packages, setup, Extension ++from setuptools.command.build_ext import build_ext ++from setuptools.command.develop import develop ++from setuptools.command.install import install ++ ++import logging ++import sysconfig ++import subprocess ++import platform ++import shutil ++ + + ROOT_DIR = Path(__file__).parent + HIPIFY_DIR = os.path.join(ROOT_DIR, "csrc/") +@@ -14,10 +24,159 @@ HIPIFY_OUT_DIR = os.path.join(ROOT_DIR, "csrc_hip/") + # will run python setup.py sdist --dist-dir dist + BUILDING_SDIST = "sdist" in sys.argv or os.environ.get("NO_CUDA_EXT", "0") == "1" + +-# New environment variable to choose between CUDA and HIP +-BUILD_WITH_HIP = os.environ.get("BUILD_WITH_HIP", "0") == "1" ++# Environment variable to choose between CUDA, HIP, Ascend ++TARGET_DEVICE = os.environ.get("LMCACHE_TARGET_DEVICE", "CUDA") ++ ++logging.basicConfig(level=logging.INFO) ++logger = logging.getLogger(__name__) ++ ++def _get_ascend_home_path(): ++ # NOTE: standard Ascend CANN toolkit path ++ return os.environ.get("ASCEND_HOME_PATH", "/usr/local/Ascend/ascend-toolkit/latest") ++ ++def _get_ascend_driver_path(): ++ # NOTE: standard Ascend path ++ return os.environ.get("ASCEND_DRIVER_PATH", "/usr/local/Ascend/driver") ++ ++def _get_ascend_env_path(): ++ # NOTE: standard Ascend Environment variable setup path ++ env_script_path = os.path.realpath(os.path.join(_get_ascend_home_path(), "..", "set_env.sh")) ++ if not 
os.path.exists(env_script_path): ++ raise ValueError(f"The file '{env_script_path}' is not found, " ++ "please make sure environment variable 'ASCEND_HOME_PATH' is set correctly.") ++ return env_script_path ++ ++def _get_npu_soc(): ++ _soc_version = os.getenv("SOC_VERSION", None) ++ if _soc_version is None: ++ npu_smi_cmd = [ ++ "bash", ++ "-c", ++ "npu-smi info | grep OK | awk '{print $3}' | head -n 1", ++ ] ++ try: ++ _soc_version = subprocess.check_output(npu_smi_cmd, ++ text=True).strip() ++ _soc_version = _soc_version.split("-")[0] ++ _soc_version = "Ascend"+_soc_version ++ return _soc_version ++ except subprocess.CalledProcessError as e: ++ raise RuntimeError(f"Retrieve SoC version failed: {e}") ++ return _soc_version + ++class CMakeExtension(Extension): + ++ def __init__(self, ++ name: str, ++ cmake_lists_dir: str = ".", ++ **kwargs) -> None: ++ super().__init__(name, sources=[], py_limited_api=False, **kwargs) ++ self.cmake_lists_dir = os.path.abspath(cmake_lists_dir) ++ ++class custom_install(install): ++ def run(self): ++ self.run_command("build_ext") ++ install.run(self) ++ ++class CustomAscendCmakeBuildExt(build_ext): ++ ++ def build_extension(self, ext): ++ # build the so as c_ops ++ ext_name = ext.name.split(".")[-1] ++ so_name = ext_name + ".so" ++ logger.info(f"Building {so_name} ...") ++ OPS_DIR = os.path.join(ROOT_DIR, "csrc", "ascend") ++ BUILD_OPS_DIR = os.path.join(ROOT_DIR, "build", "ascend") ++ os.makedirs(BUILD_OPS_DIR, exist_ok=True) ++ ++ ascend_home_path = _get_ascend_home_path() ++ ascend_driver_path = _get_ascend_driver_path() ++ env_path = _get_ascend_env_path() ++ _soc_version = _get_npu_soc() ++ _cxx_compiler = os.getenv("CXX") ++ _cc_compiler = os.getenv("CC") ++ python_executable = sys.executable ++ ++ try: ++ # if pybind11 is installed via pip ++ pybind11_cmake_path = (subprocess.check_output( ++ [python_executable, "-m", "pybind11", ++ "--cmakedir"]).decode().strip()) ++ except subprocess.CalledProcessError as e: ++ # else specify pybind11 path installed from source code on CI container ++ raise RuntimeError(f"CMake configuration failed: {e}") ++ ++ import torch_npu ++ torch_npu_path = os.path.dirname(os.path.abspath(torch_npu.__file__)) ++ import torch ++ torch_path = os.path.dirname(os.path.abspath(torch.__file__)) ++ ++ # python include ++ python_include_path = sysconfig.get_path('include', scheme='posix_prefix') ++ ++ arch = platform.machine() ++ install_path = os.path.join(BUILD_OPS_DIR, 'install') ++ if isinstance(self.distribution.get_command_obj("develop"), develop): ++ install_path=BUILD_OPS_DIR ++ ++ cmake_cmd = [ ++ f"source {env_path} && " ++ f"cmake -S {OPS_DIR} -B {BUILD_OPS_DIR}" ++ f" -DSOC_VERSION={_soc_version}" ++ f" -DARCH={arch}" ++ " -DUSE_ASCEND=1" ++ f" -DPYTHON_EXECUTABLE={python_executable}" ++ f" -DCMAKE_PREFIX_PATH={pybind11_cmake_path}" ++ f" -DCMAKE_BUILD_TYPE=Release" ++ f" -DCMAKE_INSTALL_PREFIX={install_path}" ++ f" -DPYTHON_INCLUDE_PATH={python_include_path}" ++ f" -DTORCH_NPU_PATH={torch_npu_path}" ++ f" -DTORCH_PATH={torch_path}" ++ f" -DASCEND_CANN_PACKAGE_PATH={ascend_home_path}" ++ f" -DASCEND_DRIVER_PATH={ascend_driver_path}" ++ " -DCMAKE_VERBOSE_MAKEFILE=ON" ++ ] ++ ++ if _cxx_compiler is not None: ++ cmake_cmd += [f" -DCMAKE_CXX_COMPILER={_cxx_compiler}"] ++ ++ if _cc_compiler is not None: ++ cmake_cmd += [f" -DCMAKE_C_COMPILER={_cc_compiler}"] ++ ++ cmake_cmd += [f" && cmake --build {BUILD_OPS_DIR} -j --verbose"] ++ cmake_cmd += [f" && cmake --install {BUILD_OPS_DIR}"] ++ cmake_cmd = "".join(cmake_cmd) ++ ++ 
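++        # The pieces above are joined into one shell string because the command is run with
++        # shell=True below: `source {env_path}` must export the CANN environment in the same
++        # shell that invokes cmake.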
logger.info(f"Start running CMake commands:\n{cmake_cmd}") ++ try: ++ result = subprocess.run(cmake_cmd, cwd=ROOT_DIR, text=True, shell=True, check=True) ++ except subprocess.CalledProcessError as e: ++ raise RuntimeError(f"Failed to build {so_name}: {e}") ++ ++ build_lib_dir = self.get_ext_fullpath(ext.name) ++ os.makedirs(os.path.dirname(build_lib_dir), exist_ok=True) ++ ++ package_name = ext.name.split('.')[0] # e.g., 'lmcache' ++ src_dir = os.path.join(ROOT_DIR, package_name) ++ ++ for root, _, files in os.walk(install_path): ++ for file in files: ++ if file.endswith(".so"): ++ src_path = os.path.join(root, file) ++ dst_path = os.path.join(os.path.dirname(build_lib_dir), file) ++ if os.path.exists(dst_path): ++ os.remove(dst_path) ++ ++ if isinstance(self.distribution.get_command_obj("develop"), develop): ++ # For the ascend kernels ++ src_dir_file = os.path.join(src_dir, file) ++ shutil.copy(src_path, src_dir_file) ++ shutil.copy(src_path, dst_path) ++ ++ logger.info(f"Copied {file} to {dst_path}") ++ ++ ++ + def hipify_wrapper() -> None: + # Third Party + from torch.utils.hipify.hipify_python import hipify +@@ -134,6 +293,12 @@ def rocm_extension() -> tuple[list, dict]: + return ext_modules, cmdclass + + ++def ascend_extension(): ++ print("Building Ascend extensions") ++ return [CMakeExtension(name="lmcache.c_ops")], \ ++ {"build_ext": CustomAscendCmakeBuildExt} ++ ++ + def source_dist_extension() -> tuple[list, dict]: + print("Not building CUDA/HIP extensions for sdist") + return [], {} +@@ -142,11 +307,14 @@ def source_dist_extension() -> tuple[list, dict]: + if __name__ == "__main__": + if BUILDING_SDIST: + get_extension = source_dist_extension +- elif BUILD_WITH_HIP: ++ TARGET_DEVICE = "EMPTY" ++ elif TARGET_DEVICE == "HIP": + get_extension = rocm_extension +- else: ++ elif TARGET_DEVICE == "CUDA": + get_extension = cuda_extension +- ++ elif TARGET_DEVICE == "ASCEND": ++ get_extension = ascend_extension ++ + ext_modules, cmdclass = get_extension() + + setup( +diff --git a/tests/conftest.py b/tests/conftest.py +index e7aafc8..c6f9805 100644 +--- a/tests/conftest.py ++++ b/tests/conftest.py +@@ -247,3 +247,18 @@ def autorelease_v1(request): + # Cleanup all objects created by the factory + # for obj in objects: + # obj.close() ++ ++@pytest.fixture(scope="module") ++def has_npu(request): ++ try: ++ import torch_npu ++ from torch_npu.contrib import transfer_to_npu ++ return True ++ except ImportError as ie: ++ return False ++ ++@pytest.fixture(scope="module") ++def skip_if_npu(has_npu): ++ if has_npu: ++ pytest.skip("Skipped. 
NPU does not support current function.") ++ return has_npu +\ No newline at end of file +diff --git a/tests/v1/storage_backend/test_local_cpu_backend.py b/tests/v1/storage_backend/test_local_cpu_backend.py +index 6c7fbb1..061b277 100644 +--- a/tests/v1/storage_backend/test_local_cpu_backend.py ++++ b/tests/v1/storage_backend/test_local_cpu_backend.py +@@ -64,9 +64,10 @@ def create_test_memory_obj(shape=(2, 16, 8, 128), dtype=torch.bfloat16) -> Memor + + + @pytest.fixture +-def memory_allocator(): ++def memory_allocator(has_npu): + """Create a memory allocator for testing.""" +- return MixedMemoryAllocator(1024 * 1024 * 1024) # 1GB ++ device = "npu" if has_npu else "cuda" ++ return MixedMemoryAllocator(1024 * 1024 * 1024, device=device) # 1GB + + + @pytest.fixture +@@ -589,4 +590,4 @@ class TestLocalCPUBackend: + # The accessed key should be at the end (MRU) + assert retrieved_keys[-1] == keys[0] + +- local_cpu_backend.memory_allocator.close() ++ local_cpu_backend.memory_allocator.close() +\ No newline at end of file +diff --git a/tests/v1/storage_backend/test_local_disk_backend.py b/tests/v1/storage_backend/test_local_disk_backend.py +index 197c195..77474da 100644 +--- a/tests/v1/storage_backend/test_local_disk_backend.py ++++ b/tests/v1/storage_backend/test_local_disk_backend.py +@@ -619,4 +619,4 @@ class TestLocalDiskBackend: + # The backend should still be in a consistent state + assert local_disk_backend.contains(key) + +- local_disk_backend.local_cpu_backend.memory_allocator.close() ++ local_disk_backend.local_cpu_backend.memory_allocator.close() +\ No newline at end of file +diff --git a/tests/v1/test_cache_engine.py b/tests/v1/test_cache_engine.py +index ba9721f..0827e85 100644 +--- a/tests/v1/test_cache_engine.py ++++ b/tests/v1/test_cache_engine.py +@@ -19,10 +19,11 @@ import torch + # First Party + from lmcache.v1.cache_engine import LMCacheEngineBuilder + from lmcache.v1.config import LMCacheEngineConfig ++import lmcache.c_ops as lmc_ops + + +-def test_paged_same_retrieve_store(autorelease_v1): +- device = "cuda" ++def test_paged_same_retrieve_store(autorelease_v1, has_npu): ++ device = "npu" if has_npu else "cuda" + fmt = "vllm" + num_tokens = 2000 + num_blocks = 1000 +@@ -57,9 +58,10 @@ def test_paged_same_retrieve_store(autorelease_v1): + + engine = autorelease_v1( + LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) ++ + """ test retrieve empty """ + ret_mask = engine.retrieve( + tokens, kvcaches=retrieved_cache, slot_mapping=slot_mapping +@@ -93,7 +95,7 @@ def test_paged_same_retrieve_store(autorelease_v1): + @pytest.mark.parametrize("backend", ["cpu", "local_disk", "remote", "remote_cachegen"]) + @pytest.mark.parametrize("lmserver_v1_process", ["cpu"], indirect=True) + def test_paged_retrieve_prefix( +- fmt, chunk_size, backend, lmserver_v1_process, autorelease_v1 ++ fmt, chunk_size, backend, lmserver_v1_process, autorelease_v1, has_npu + ): + url = None + remote_serde = None +@@ -101,12 +103,14 @@ def test_paged_retrieve_prefix( + if "remote" in backend: + url = lmserver_v1_process.server_url + if backend == "remote_cachegen": ++ if has_npu: ++ pytest.skip("NPU backend: Not implemented for cachegen") + backend = "remote" + remote_serde = "cachegen" + check_equality = False + else: + remote_serde = "naive" +- device = "cuda" ++ device = "npu" if has_npu else "cuda" + num_tokens = 2000 + new_num_tokens = 1000 + kv_shape = (32, 2, chunk_size, 8, 128) +@@ 
-139,9 +143,10 @@ def test_paged_retrieve_prefix( + + engine = autorelease_v1( + LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) ++ + """ test store """ + t1 = time.perf_counter() + engine.store(tokens, kvcaches=kv_cache, slot_mapping=slot_mapping) +@@ -199,12 +204,12 @@ def test_paged_retrieve_prefix( + ) + @pytest.mark.parametrize("lmserver_v1_process", ["cpu"], indirect=True) + def test_paged_store_offset( +- fmt, chunk_size, backend, lmserver_v1_process, autorelease_v1 ++ fmt, chunk_size, backend, lmserver_v1_process, autorelease_v1, has_npu + ): + url = None + if backend == "remote": + url = lmserver_v1_process.server_url +- device = "cuda" ++ device = "npu" if has_npu else "cuda" + num_tokens = 2000 + num_suffix_tokens = 500 + num_total_tokens = 3000 +@@ -231,9 +236,10 @@ def test_paged_store_offset( + + engine = autorelease_v1( + LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) ++ + """ test store """ + engine.store( + tokens[:num_tokens], +@@ -295,8 +301,8 @@ def test_paged_store_offset( + "local_disk" + ], + ) +-def test_paged_mixed_retrieve(fmt, chunk_size, backend, autorelease_v1): +- device = "cuda" ++def test_paged_mixed_retrieve(fmt, chunk_size, backend, autorelease_v1, has_npu): ++ device = "npu" if has_npu else "cuda" + num_tokens = 2000 + new_num_tokens = 1000 + num_blocks = 1000 +@@ -327,9 +333,10 @@ def test_paged_mixed_retrieve(fmt, chunk_size, backend, autorelease_v1): + + engine = autorelease_v1( + LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) ++ + """ test store """ + engine.store(tokens, kvcaches=kv_cache, slot_mapping=slot_mapping) + engine.store(new_tokens, kvcaches=kv_cache, slot_mapping=new_slot_mapping) +@@ -421,8 +428,8 @@ def test_paged_mixed_retrieve(fmt, chunk_size, backend, autorelease_v1): + + + @pytest.mark.parametrize("fmt", ["vllm"]) +-def test_paged_store_kv_tensors_mask(fmt, autorelease_v1): +- device = "cuda" ++def test_paged_store_kv_tensors_mask(fmt, autorelease_v1, has_npu): ++ device = "npu" if has_npu else "cuda" + num_tokens = 1000 + new_num_tokens = 2000 + num_blocks = 1000 +@@ -452,9 +459,10 @@ def test_paged_store_kv_tensors_mask(fmt, autorelease_v1): + + engine = autorelease_v1( + LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) ++ + """ Store some tokens with mask """ + engine.store(tokens, kvcaches=kv_cache, slot_mapping=slot_mapping) + """Wait until store finishes""" +@@ -570,12 +578,12 @@ def test_paged_store_kv_tensors_mask(fmt, autorelease_v1): + ) + @pytest.mark.parametrize("lmserver_v1_process", ["cpu"], indirect=True) + def test_paged_hierarchy_retrieve( +- fmt, chunk_size, backend, retrieve_from, lmserver_v1_process, autorelease_v1 ++ fmt, chunk_size, backend, retrieve_from, lmserver_v1_process, autorelease_v1, has_npu + ): + url = None + if backend == "local_cpu_disk_remote": + url = lmserver_v1_process.server_url +- device = "cuda" ++ device = "npu" if has_npu else "cuda" + num_tokens = 2000 + new_num_tokens = 1000 + kv_shape = (32, 2, chunk_size, 8, 128) +@@ -609,9 +617,10 @@ def test_paged_hierarchy_retrieve( + + engine = autorelease_v1( + 
LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) ++ + """ test store """ + t1 = time.perf_counter() + engine.store(tokens, kvcaches=kv_cache, slot_mapping=slot_mapping) +@@ -692,8 +701,8 @@ def test_paged_hierarchy_retrieve( + "local_disk", + ], + ) +-def test_paged_prefetch_retrieve(backend, prefetch_from, autorelease_v1): +- device = "cuda" ++def test_paged_prefetch_retrieve(backend, prefetch_from, autorelease_v1, has_npu): ++ device = "npu" if has_npu else "cuda" + num_tokens = 2000 + new_num_tokens = 1000 + num_blocks = 1000 +@@ -726,7 +735,7 @@ def test_paged_prefetch_retrieve(backend, prefetch_from, autorelease_v1): + + engine = autorelease_v1( + LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) + """ test store """ +@@ -804,12 +813,12 @@ def test_paged_prefetch_retrieve(backend, prefetch_from, autorelease_v1): + ], + ) + @pytest.mark.parametrize("lmserver_v1_process", ["cpu"], indirect=True) +-def test_paged_mem_leak(fmt, chunk_size, backend, lmserver_v1_process, autorelease_v1): ++def test_paged_mem_leak(fmt, chunk_size, backend, lmserver_v1_process, autorelease_v1, has_npu): + url = None + if "remote" in backend: + url = lmserver_v1_process.server_url + +- device = "cuda" ++ device = "npu" if has_npu else "cuda" + num_tokens = 2000 + kv_shape = (32, 2, chunk_size, 8, 128) + num_blocks = 1000 +@@ -830,7 +839,7 @@ def test_paged_mem_leak(fmt, chunk_size, backend, lmserver_v1_process, autorelea + + engine = autorelease_v1( + LMCacheEngineBuilder.get_or_create( +- "test", cfg, dumb_metadata(fmt, kv_shape), connector ++ "test", cfg, dumb_metadata(fmt, kv_shape), connector, device=device + ) + ) + +@@ -872,20 +881,20 @@ def test_paged_mem_leak(fmt, chunk_size, backend, lmserver_v1_process, autorelea + LMCacheEngineBuilder.destroy("test") + + +-def test_builder(autorelease_v1): ++def test_builder(autorelease_v1, has_npu): + instance_id = "test" + cfg = LMCacheEngineConfig.from_legacy(chunk_size=256) + cfg2 = LMCacheEngineConfig.from_legacy(chunk_size=512) + connector = None + should_be_none = LMCacheEngineBuilder.get(instance_id) + assert should_be_none is None +- ++ device = "npu" if has_npu else "cuda" + _engine = autorelease_v1( +- LMCacheEngineBuilder.get_or_create(instance_id, cfg, dumb_metadata(), connector) ++ LMCacheEngineBuilder.get_or_create(instance_id, cfg, dumb_metadata(), connector, device=device) + ) + _engine2 = autorelease_v1(LMCacheEngineBuilder.get(instance_id)) # noqa + + with pytest.raises(ValueError): + LMCacheEngineBuilder.get_or_create( +- instance_id, cfg2, dumb_metadata(), connector ++ instance_id, cfg2, dumb_metadata(), connector, device=device + ) +diff --git a/tests/v1/test_connector.py b/tests/v1/test_connector.py +index 9dbfe72..3e538e6 100644 +--- a/tests/v1/test_connector.py ++++ b/tests/v1/test_connector.py +@@ -16,7 +16,7 @@ import torch + # First Party + from lmcache.v1.memory_management import PinMemoryAllocator + from lmcache.v1.storage_backend.connector import CreateConnector +- ++import lmcache.c_ops as lmc_ops + + @pytest.mark.parametrize("lmserver_v1_process", ["cpu"], indirect=True) + @pytest.mark.parametrize( +@@ -25,12 +25,14 @@ from lmcache.v1.storage_backend.connector import CreateConnector + "lm://localhost:65000", + ], + ) +-def test_lm_connector(url, autorelease_v1, lmserver_v1_process): ++def 
test_lm_connector(url, autorelease_v1, lmserver_v1_process, has_npu): + if url.startswith("lm"): + url = lmserver_v1_process.server_url + + async_loop, async_thread = init_asyncio_loop() +- memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024) ++ device = "npu" if has_npu else "cuda" ++ memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024, device=device) ++ + connector = autorelease_v1(CreateConnector(url, async_loop, memory_allocator)) + + random_key = dumb_cache_engine_key() +@@ -70,14 +72,15 @@ def test_lm_connector(url, autorelease_v1, lmserver_v1_process): + + + @pytest.mark.parametrize("lmserver_v1_process", ["cpu"], indirect=True) +-def test_fs_connector(lmserver_v1_process, autorelease_v1): ++def test_fs_connector(lmserver_v1_process, autorelease_v1, has_npu): + """Test filesystem connector: exists, put, get, list, and file store.""" + + with tempfile.TemporaryDirectory() as temp_dir: + # Setup + url = f"fs://host:0/{temp_dir}/" + async_loop, async_thread = init_asyncio_loop() +- memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024) ++ device = "npu" if has_npu else "cuda" ++ memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024, device=device) + connector = autorelease_v1(CreateConnector(url, async_loop, memory_allocator)) + random_key = dumb_cache_engine_key() + +@@ -138,7 +141,7 @@ def test_fs_connector(lmserver_v1_process, autorelease_v1): + "unix:///tmp/redis.sock", + ], + ) +-def test_redis_connector(url, autorelease_v1): ++def test_redis_connector(url, autorelease_v1, has_npu): + """Test Redis connector: exists, put, get operations. + + This test uses the MockRedis from conftest.py to simulate +@@ -146,7 +149,8 @@ def test_redis_connector(url, autorelease_v1): + """ + + async_loop, async_thread = init_asyncio_loop() +- memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024) ++ device = "npu" if has_npu else "cuda" ++ memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024, device=device) + connector = autorelease_v1(CreateConnector(url, async_loop, memory_allocator)) + + random_key = dumb_cache_engine_key() +@@ -199,7 +203,7 @@ def test_redis_connector(url, autorelease_v1): + "redis-sentinel://localhost:26379", + ], + ) +-def test_redis_sentinel_connector(url, autorelease_v1): ++def test_redis_sentinel_connector(url, autorelease_v1, has_npu): + """Test Redis Sentinel connector: exists, put, get operations. + + This test uses the MockRedisSentinel from conftest.py to simulate +@@ -213,7 +217,8 @@ def test_redis_sentinel_connector(url, autorelease_v1): + os.environ["REDIS_TIMEOUT"] = "5" + + async_loop, async_thread = init_asyncio_loop() +- memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024) ++ device = "npu" if has_npu else "cuda" ++ memory_allocator = PinMemoryAllocator(1024 * 1024 * 1024, device=device) + connector = autorelease_v1(CreateConnector(url, async_loop, memory_allocator)) + + random_key = dumb_cache_engine_key() +diff --git a/tests/v1/test_gds.py b/tests/v1/test_gds.py +index 5fda653..ee0c093 100644 +--- a/tests/v1/test_gds.py ++++ b/tests/v1/test_gds.py +@@ -20,7 +20,7 @@ from lmcache.v1.storage_backend import CreateStorageBackends + from lmcache.v1.storage_backend.gds_backend import pack_metadata, unpack_metadata + + +-def test_gds_backend_metadata(): ++def test_gds_backend_metadata(skip_if_npu): + # This is a sanity check that packing and unpacking works. We can add + # more tensor types to be sure. 
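++    # torch.randn(3, 10) defaults to float32, so 3 * 10 elements * 4 bytes each = 120 expected bytes.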
+     for [tensor, expected_nbytes] in [(torch.randn(3, 10), 120)]:
+@@ -46,7 +46,7 @@ def test_gds_backend_metadata():
+
+
+ @pytest.mark.skip(reason="We need to add this test back after implementing prefetch")
+-def test_gds_backend_sanity():
++def test_gds_backend_sanity(skip_if_npu):
+     BASE_DIR = Path(__file__).parent
+     GDS_DIR = "/tmp/gds/test-cache"
+     TEST_KEY = CacheEngineKey(
+diff --git a/tests/v1/test_gpu_connector.py b/tests/v1/test_gpu_connector.py
+index 8fcac95..6ab3bfa 100644
+--- a/tests/v1/test_gpu_connector.py
++++ b/tests/v1/test_gpu_connector.py
+@@ -25,10 +25,9 @@ from lmcache.v1.memory_management import (
+     PinMemoryAllocator,
+ )
+
+-
+ @pytest.mark.parametrize("use_gpu", [True, False])
+ @pytest.mark.parametrize("use_mla", [True, False])
+-def test_vllm_paged_connector_v2_with_gpu_and_mla(use_gpu, use_mla):
++def test_vllm_paged_connector_v2_with_gpu_and_mla(use_gpu, use_mla, has_npu):
+     num_blocks = 100
+     block_size = 16
+     num_layers = 32
+@@ -39,8 +38,8 @@ def test_vllm_paged_connector_v2_with_gpu_and_mla(use_gpu, use_mla):
+
+     num_tokens = 800
+     chunk_size = 256
+-
+-    allocator = PinMemoryAllocator(1024 * 1024 * 1024)
++    device = "npu" if has_npu else "cuda"
++    allocator = PinMemoryAllocator(1024 * 1024 * 1024, device=device)
+
+     gpu_kv_src = generate_kv_cache_paged_list_tensors(
+         num_blocks=num_blocks, device=device, block_size=block_size, use_mla=use_mla
+     )
+@@ -122,7 +121,7 @@ def test_vllm_paged_connector_v2_with_gpu_and_mla(use_gpu, use_mla):
+
+
+ @pytest.mark.parametrize("use_gpu", [True])
+-def test_layerwise_vllm_paged_connector_with_gpu(use_gpu):
++def test_layerwise_vllm_paged_connector_with_gpu(use_gpu, has_npu):
+     num_blocks = 100
+     block_size = 16
+     num_layers = 32
+@@ -133,8 +132,8 @@ def test_layerwise_vllm_paged_connector_with_gpu(use_gpu):
+
+     num_tokens = 800
+     chunk_size = 256
+-
+-    allocator = PinMemoryAllocator(1024 * 1024 * 1024)
++    device = "npu" if has_npu else "cuda"
++    allocator = PinMemoryAllocator(1024 * 1024 * 1024, device=device)
+
+     gpu_kv_src = generate_kv_cache_paged_list_tensors(num_blocks, device, block_size)
+     gpu_kv_dst = generate_kv_cache_paged_list_tensors(num_blocks, device, block_size)
+@@ -222,7 +221,7 @@ def test_layerwise_vllm_paged_connector_with_gpu(use_gpu):
+
+
+ @pytest.mark.parametrize("use_gpu", [True])
+-def test_batched_layerwise_vllm_paged_connector_with_gpu(use_gpu):
++def test_batched_layerwise_vllm_paged_connector_with_gpu(use_gpu, has_npu):
+     num_blocks = 100
+     block_size = 16
+     num_layers = 32
+@@ -235,8 +234,8 @@ def test_batched_layerwise_vllm_paged_connector_with_gpu(use_gpu):
+
+     num_tokens_1 = 500
+     num_tokens_2 = 500
+     num_tokens_total = num_tokens_1 + num_tokens_2
+     chunk_size = 256
+-
+-    allocator = PinMemoryAllocator(1024 * 1024 * 1024)
++    device = "npu" if has_npu else "cuda"
++    allocator = PinMemoryAllocator(1024 * 1024 * 1024, device=device)
+
+     gpu_kv_src = generate_kv_cache_paged_list_tensors(num_blocks, device, block_size)
+     gpu_kv_dst = generate_kv_cache_paged_list_tensors(num_blocks, device, block_size)
+@@ -540,7 +539,7 @@ def test_vllm_paged_connector_v2_to_gpu_bench(benchmark):
+
+ @pytest.mark.parametrize("use_gpu", [True, False])
+ @pytest.mark.parametrize("use_mla", [True, False])
+-def test_sglang_connector_with_gpu_and_mla(use_gpu, use_mla):
++def test_sglang_connector_with_gpu_and_mla(use_gpu, use_mla, skip_if_npu):
+     num_blocks = 100
+     block_size = 16
+     num_layers = 32
+diff --git a/tests/v1/test_mem_kernels.py b/tests/v1/test_mem_kernels.py
+index 2cabd67..94440df 100644
+--- a/tests/v1/test_mem_kernels.py
++++ b/tests/v1/test_mem_kernels.py
+@@ -13,6 +13,7 @@ from utils import (
+ import pytest
+ import torch
+
++
+ # First Party
+ from lmcache.v1.memory_management import PinMemoryAllocator
+ import lmcache.c_ops as lmc_ops
+@@ -54,7 +55,7 @@ def _slice_kv_at(
+
+
+ @pytest.mark.parametrize("num_tokens", [256, 500, 1024, 8000])
+-def test_extract_and_load_back(num_tokens):
++def test_extract_and_load_back(num_tokens, has_npu):
+     device = "cuda"
+
+     num_blocks = 1000
+@@ -68,7 +69,8 @@ def test_extract_and_load_back(num_tokens):
+     slot_mapping = torch.tensor(slot_mapping, device=device)
+
+     pinned_cpu_size = 4 * 1024 * 1024 * 1024  # 4GB
+-    mem_allocator = PinMemoryAllocator(pinned_cpu_size)
++    device = "npu" if has_npu else "cuda"
++    mem_allocator = PinMemoryAllocator(pinned_cpu_size, device=device)
+
+     # Old extract
+     kv_tuple_list = []
+@@ -154,7 +156,7 @@ def test_extract_and_load_back(num_tokens):
+
+
+ @pytest.mark.parametrize("num_tokens", [256, 500, 1024, 8000])
+-def test_multi_layer_kernel(num_tokens):
++def test_multi_layer_kernel(num_tokens, has_npu):
+     device = "cuda"
+
+     num_blocks = 1000
+@@ -172,7 +174,8 @@ def test_multi_layer_kernel(num_tokens):
+     slot_mapping = torch.tensor(slot_mapping, device=device)
+
+     pinned_cpu_size = 4 * 1024 * 1024 * 1024  # 4GB
+-    mem_allocator = PinMemoryAllocator(pinned_cpu_size)
++    device = "npu" if has_npu else "cuda"
++    mem_allocator = PinMemoryAllocator(pinned_cpu_size, device=device)
+
+     # lmc_ops.multi_layer_kv_transfer(memory_obj_new.tensor,
+     #                                 kv_cache_pointers, # TODO: initialize this
+@@ -209,8 +212,12 @@ def test_multi_layer_kernel(num_tokens):
+     kv_cache_pointers = torch.empty(
+         32, dtype=torch.int64, device="cpu", pin_memory=True
+     )
++
+     for i in range(32):
+         kv_cache_pointers[i] = kv_cache[i].data_ptr()
++
++    if has_npu:
++        kv_cache_pointers = kv_cache_pointers.to(device)
+
+     memory_obj_new_list = []
+     start_event = torch.cuda.Event(enable_timing=True)
+@@ -253,6 +260,9 @@ def test_multi_layer_kernel(num_tokens):
+     )
+     for i in range(32):
+         kv_cache_pointers_new[i] = kv_cache_new[i].data_ptr()
++
++    if has_npu:
++        kv_cache_pointers_new = kv_cache_pointers_new.to(device)
+
+     for chunk_id, slot_mapping_temp in enumerate(slot_mapping_chunked):
+         memory_obj_new = memory_obj_new_list[chunk_id]
+@@ -276,7 +286,7 @@ def test_multi_layer_kernel(num_tokens):
+
+
+ @pytest.mark.parametrize("num_tokens", [256, 500, 1024, 8000])
+-def test_multi_layer_kernel_use_mla(num_tokens):
++def test_multi_layer_kernel_use_mla(num_tokens, has_npu):
+     device = "cuda"
+
+     num_blocks = 1000
+@@ -293,8 +303,9 @@ def test_multi_layer_kernel_use_mla(num_tokens):
+     slot_mapping = torch.tensor(slot_mapping, device=device)
+
+     pinned_cpu_size = 4 * 1024 * 1024 * 1024  # 4GB
+-    mem_allocator = PinMemoryAllocator(pinned_cpu_size)
+-
++    device = "npu" if has_npu else "cuda"
++    mem_allocator = PinMemoryAllocator(pinned_cpu_size, device=device)
++
+     # layer by layer extract
+     memory_obj_old_list = []
+     start_event = torch.cuda.Event(enable_timing=True)
+@@ -329,6 +340,8 @@ def test_multi_layer_kernel_use_mla(num_tokens):
+     )
+     for i in range(num_layers):
+         kv_cache_pointers[i] = kv_cache[i].data_ptr()
++    if has_npu:
++        kv_cache_pointers = kv_cache_pointers.to(device)
+
+     memory_obj_new_list = []
+     start_event = torch.cuda.Event(enable_timing=True)
+@@ -377,6 +390,9 @@ def test_multi_layer_kernel_use_mla(num_tokens):
+     )
+     for i in range(num_layers):
+         kv_cache_pointers_new[i] = kv_cache_new[i].data_ptr()
++
++    if has_npu:
++        kv_cache_pointers_new = kv_cache_pointers_new.to(device)
+
+     for chunk_id, slot_mapping_temp in enumerate(slot_mapping_chunked):
+         memory_obj_new = memory_obj_new_list[chunk_id]
+diff --git a/tests/v1/test_memory_management.py b/tests/v1/test_memory_management.py
+index 2f70517..660e74f 100644
+--- a/tests/v1/test_memory_management.py
++++ b/tests/v1/test_memory_management.py
+@@ -142,15 +142,15 @@ def test_tensor_allocator(use_paging):
+         True,
+     ],
+ )
+-def test_device_allocators(alloc_cls, use_paging):
++def test_device_allocators(alloc_cls, use_paging, has_npu):
+     total_size = 1024 * 1024 * 128  # 128MB
+
+     shape = torch.Size([2, 32, 16, 1024])  # 64 pages
+     dtype = torch.bfloat16
+     fmt = MemoryFormat.KV_2LTD
+-
++    device = "npu" if has_npu else "cuda"
+     allocator = alloc_cls(
+-        total_size, use_paging=use_paging, shape=shape, dtype=dtype, fmt=fmt
++        total_size, use_paging=use_paging, shape=shape, dtype=dtype, fmt=fmt, device=device
+     )
+
+     if use_paging:
+@@ -171,9 +171,10 @@ def test_device_allocators(alloc_cls, use_paging):
+         MixedMemoryAllocator,
+     ],
+ )
+-def test_inplace_modification(alloc_cls):
++def test_inplace_modification(alloc_cls, has_npu):
+     total_size = 1024
+-    allocator = alloc_cls(total_size)
++    device = "npu" if has_npu else "cuda"
++    allocator = alloc_cls(total_size, device=device)
+
+     data = allocator.allocate([10], torch.float)
+     assert data is not None
+@@ -198,9 +199,10 @@ def test_inplace_modification(alloc_cls):
+         MixedMemoryAllocator,
+     ],
+ )
+-def test_boundary_alloc(alloc_cls):
++def test_boundary_alloc(alloc_cls, has_npu):
+     total_size = 1 << 25
+-    allocator = alloc_cls(total_size)
++    device = "npu" if has_npu else "cuda"
++    allocator = alloc_cls(total_size, device=device)
+     data1 = allocator.allocate([512, 10], torch.float)
+     allocator.allocate([512, 10], torch.float)
+     allocator.free(data1)
+@@ -225,10 +227,11 @@ def test_boundary_alloc(alloc_cls):
+         MixedMemoryAllocator,
+     ],
+ )
+-def test_batched_alloc(alloc_cls):
++def test_batched_alloc(alloc_cls, has_npu):
+     total_size = 32 * 100 * 2 * 1024 * 2
+     batch_size = 32
+-    allocator = alloc_cls(total_size)
++    device = "npu" if has_npu else "cuda"
++    allocator = alloc_cls(total_size, device=device)
+     objs = allocator.batched_allocate(
+         [100, 2, 1024], torch.bfloat16, batch_size, MemoryFormat.KV_T2D
+     )
+@@ -255,9 +258,10 @@ def test_batched_alloc(alloc_cls):
+         MixedMemoryAllocator,
+     ],
+ )
+-def test_mixed_alloc(alloc_cls):
++def test_mixed_alloc(alloc_cls, has_npu):
+     total_size = 1 << 25
+-    allocator = alloc_cls(total_size)
++    device = "npu" if has_npu else "cuda"
++    allocator = alloc_cls(total_size, device=device)
+     data1 = allocator.allocate([512, 0], None, MemoryFormat.BINARY_BUFFER)
+     allocator.allocate([512, 10], torch.float)
+     allocator.free(data1)
+diff --git a/tests/v1/test_weka.py b/tests/v1/test_weka.py
+index eab8e1b..7cf052b 100644
+--- a/tests/v1/test_weka.py
++++ b/tests/v1/test_weka.py
+@@ -18,7 +18,7 @@ from lmcache.v1.storage_backend import CreateStorageBackends
+
+
+ @pytest.mark.skip(reason="We need to add this test back after implementing prefetch")
+-def test_weka_backend_sanity():
++def test_weka_backend_sanity(skip_if_npu):
+     BASE_DIR = Path(__file__).parent
+     WEKA_DIR = "/tmp/weka/test-cache"
+     TEST_KEY = CacheEngineKey(
diff --git a/LMCache-0.1.4.alpha.tar.gz b/LMCache-0.1.4.alpha.tar.gz
deleted file mode 100644
index c242010c5335191bfa5298c757768528425427ec..0000000000000000000000000000000000000000
Binary files a/LMCache-0.1.4.alpha.tar.gz and /dev/null differ
diff --git a/LMCache.tar.gz b/LMCache.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..b90311279cb52f716ed1e852c87f098057c755ec
Binary files /dev/null and b/LMCache.tar.gz differ
diff --git a/fix-dependance-issue-on-ascend.patch b/fix-dependance-issue-on-ascend.patch
deleted file mode 100644
index 05a158638d383324086c991815ebf5b095df5c68..0000000000000000000000000000000000000000
--- a/fix-dependance-issue-on-ascend.patch
+++ /dev/null
@@ -1,13 +0,0 @@
-diff --git a/setup.py b/setup.py
-index 5b1aa4b..1895953 100644
---- a/setup.py
-+++ b/setup.py
-@@ -17,7 +17,7 @@ setup(
-         "nvtx",
-         "safetensors",
-         "transformers",
--        "torchac_cuda >= 0.2.5",
-+        # "torchac_cuda >= 0.2.5",
-     ],
-     classifiers=[
-         # Trove classifiers
diff --git a/lmcache.spec b/lmcache.spec
index c1dc836e6d611c2048e6a0ea79c5db825bb39c0a..673cd34593459d79286583f081ce1f684c6a810a 100644
--- a/lmcache.spec
+++ b/lmcache.spec
@@ -1,41 +1,44 @@
 %define debug_package %{nil}
-Name: python-LMCache
-Version: 0.1.4.alpha
+Name: LMCache
+Version: 0.3.1.post1
 Release: 1
 Summary: LMCache - A lightweight memory cache system
 
 License: Apache 2.0
 URL: https://github.com/LMCache/LMCache
-Source0: LMCache-%{version}.tar.gz
+Source0: LMCache.tar.gz
 
-Patch0: fix-dependance-issue-on-ascend.patch
-
-BuildArch: noarch
+Patch0: 0001-npu-picked-support.patch
+BuildArch: aarch64
 
 BuildRequires: python3
 BuildRequires: python3-setuptools python3-pip
-
 Requires: python3
+
 %description
 LMCache is a lightweight, high-performance memory cache system
 designed for simple key-value storage with minimal overhead.
 
 %prep
-%autosetup -n %{name}-%{version} -p1 -Sgit
+%autosetup -n %{name} -p1 -Sgit
 
 %build
-ln -s /usr/bin/python3 /usr/bin/python || :
+# ln -s /usr/bin/python3 /usr/bin/python || :
+
+export LMCACHE_TARGET_DEVICE="ASCEND"
+export PYTHONPATH=/usr/local/lib64/python3.11/site-packages:$PYTHONPATH
+
 %py3_build
 
 %install
 %py3_install
 
 %files
-%{python3_sitelib}/*
+/usr/lib64/python3.11/site-packages/*
 %{_bindir}/*
 
-%changelog
-* Tue Jun 24 2025 jingjunyuan - 0.1.4.alpha-1
-- Initial package build
+%changelog
+* Tue Jul 29 2025 gingfung.matthew.yeung - 0.3.1.post1-1
+- Initial package build
\ No newline at end of file