diff --git a/adapt-310p.patch b/adapt-310p.patch
new file mode 100644
index 0000000000000000000000000000000000000000..820088a313706bd81a781de4b23013fb9033a55c
--- /dev/null
+++ b/adapt-310p.patch
@@ -0,0 +1,398 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index a77b665..09158d6 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -23,6 +23,16 @@ set(LMCACHE_ASCEND_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}")
+ set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
+ message(STATUS "Detected SOC version: ${SOC_VERSION}")
+ 
++if (SOC_VERSION STREQUAL "Ascend310P3")
++    message(STATUS "Building for Ascend 310P3.")
++    add_compile_definitions(_ASCEND_310P3_)
++elseif (SOC_VERSION STREQUAL "Ascend910B4")
++    message(STATUS "Building for Ascend 910B4.")
++    add_compile_definitions(_ASCEND_910B4_)
++else()
++    message(WARNING "Unknown or unsupported SOC version: ${SOC_VERSION}. Building with generic settings.")
++endif()
++
+ if (NOT CMAKE_BUILD_TYPE)
+     set(CMAKE_BUILD_TYPE "Release" CACHE STRINGS "Build type Release/Debug (default Release)" FORCE)
+ endif()
+diff --git a/csrc/ascend/kernels/paged_kv_copy.cpp b/csrc/ascend/kernels/paged_kv_copy.cpp
+index d36b415..610741c 100644
+--- a/csrc/ascend/kernels/paged_kv_copy.cpp
++++ b/csrc/ascend/kernels/paged_kv_copy.cpp
+@@ -141,7 +141,6 @@ private:
+ 
+ // Declare support kernel entry
+ PAGED_KV_COPY_TYPE_DECLARE(half);
+-PAGED_KV_COPY_TYPE_DECLARE(bfloat16_t);
+ PAGED_KV_COPY_TYPE_DECLARE(int8_t);
+ 
+ namespace lmc_ops {
+@@ -158,8 +157,6 @@ extern void paged_kv_copy_impl(vllm_ascend::AscendType type, uint32_t blockDim,
+ {
+     if (type == vllm_ascend::AscendType::FP16) {
+         PAGED_KV_COPY_KERNEL_CALL(half);
+-    } else if (type == vllm_ascend::AscendType::BF16) {
+-        PAGED_KV_COPY_KERNEL_CALL(bfloat16_t);
+     } else if (type == vllm_ascend::AscendType::INT8) {
+         PAGED_KV_COPY_KERNEL_CALL(int8_t);
+     } else {
+diff --git a/csrc/ascend/kernels/paged_kv_tuple_copy.cpp b/csrc/ascend/kernels/paged_kv_tuple_copy.cpp
+index 5769c4d..f21f40e 100644
+--- a/csrc/ascend/kernels/paged_kv_tuple_copy.cpp
++++ b/csrc/ascend/kernels/paged_kv_tuple_copy.cpp
+@@ -147,7 +147,6 @@ private:
+ 
+ // Declare support kernel entry
+ PAGED_KV_TUPLE_COPY_TYPE_DECLARE(half);
+-PAGED_KV_TUPLE_COPY_TYPE_DECLARE(bfloat16_t);
+ PAGED_KV_TUPLE_COPY_TYPE_DECLARE(int8_t);
+ 
+ namespace lmc_ops {
+@@ -164,9 +163,7 @@ extern void paged_kv_tuple_copy_impl(vllm_ascend::AscendType type, uint32_t blockDim,
+ {
+     if (type == vllm_ascend::AscendType::FP16) {
+         PAGED_KV_TUPLE_COPY_KERNEL_CALL(half);
+-    } else if (type == vllm_ascend::AscendType::BF16) {
+-        PAGED_KV_TUPLE_COPY_KERNEL_CALL(bfloat16_t);
+-    } else if (type == vllm_ascend::AscendType::INT8) {
++    } else if (type == vllm_ascend::AscendType::INT8) {
+         PAGED_KV_TUPLE_COPY_KERNEL_CALL(int8_t);
+     } else {
+         return;
+diff --git a/csrc/ascend/kernels/utils.h b/csrc/ascend/kernels/utils.h
+index a6dd3f3..ff271f2 100644
+--- a/csrc/ascend/kernels/utils.h
++++ b/csrc/ascend/kernels/utils.h
+@@ -20,10 +20,6 @@ namespace vllm_ascend {
+ 
+ template <typename T> struct AccType;
+ 
+-template <> struct AccType<bfloat16_t> {
+-    using type = float;
+-};
+-
+ template <> struct AccType<half> {
+     using type = half;
+ };
+diff --git a/csrc/ascend/np_tensor.h b/csrc/ascend/np_tensor.h
+index 41c23f7..c56b237 100644
+--- a/csrc/ascend/np_tensor.h
++++ b/csrc/ascend/np_tensor.h
+@@ -216,4 +216,50 @@ namespace lmcache_ascend {
+         slot_mappings = caster.RecoveryTensorDtype(slot_mappings, "slot_mapping");
+     }
+ 
++    void paged_layers_kv_transfer_ms_unregistered(
++        BaseTensorPtr& lmc_buffer,  // tmp_gpu_buffer
++        BaseTensorPtr& paged_kv_ptrs,
++        BaseTensorPtr& slot_mappings,
++        const int num_pages,
++        const int page_size,
++        const int kvs,
++        const int hidden_dims,
++        const bool page2L
++    ) {
++        auto stream_id = PyBoostUtils::cur_stream_id();
++        auto device_context = mindspore::runtime::OpRunner::GetDeviceContext("Ascend");
++
++        // only support int32 slotmapping for now.
++        DtypeCaster caster;
++        slot_mappings = caster.CheckAndCast(slot_mappings, "slot_mapping");
++
++        int num_tokens = slot_mappings->shape()[0];
++        int num_layers = paged_kv_ptrs->shape()[0];
++
++        // int hidden_dims = lmc_buffer.size(-1);
++        auto ascend_type = vllm_ascend::AscendType::FP16;
++        uint32_t aivNum = get_static_aiv_core_num();
++
++        PyBoostUtils::PrepareOpInputs(device_context, stream_id, paged_kv_ptrs, slot_mappings);
++        PyBoostUtils::DispatchRun(std::make_shared<mindspore::runtime::PyBoostDeviceTask>([=]() {
++            PyBoostUtils::MallocOpInputs(device_context, paged_kv_ptrs, slot_mappings);
++
++            uint8_t* paged_kv_dev_ptr = GetMSDataPtr(paged_kv_ptrs);
++            uint8_t* slot_mappings_dptr = GetMSDataPtr(slot_mappings);
++            uint8_t* lmc_offset_dptr = GetMSDataPtr(lmc_buffer);
++
++            auto acl_stream = device_context->device_res_manager_->GetStream(stream_id);
++
++            mindspore::runtime::OpExecutor::DispatchLaunchTask([=]() {
++                lmc_ops::paged_kv_tuple_copy_impl(ascend_type, aivNum, acl_stream, paged_kv_dev_ptr,
++                                                  reinterpret_cast<uint8_t*>(lmc_offset_dptr), slot_mappings_dptr,
++                                                  static_cast<int64_t>(num_pages), static_cast<int64_t>(hidden_dims),
++                                                  static_cast<int64_t>(page_size), static_cast<int64_t>(kvs),
++                                                  static_cast<int64_t>(num_layers), static_cast<int64_t>(num_tokens),
++                                                  static_cast<int64_t>(aivNum), page2L);
++            });
++        }));
++        slot_mappings = caster.RecoveryTensorDtype(slot_mappings, "slot_mapping");
++    }
++
+ }
+diff --git a/csrc/ascend/pinned_mem.cpp b/csrc/ascend/pinned_mem.cpp
+index eba74b4..04cf2da 100644
+--- a/csrc/ascend/pinned_mem.cpp
++++ b/csrc/ascend/pinned_mem.cpp
+@@ -106,19 +106,21 @@ bool PinnedMemoryManager::freePinned(uintptr_t hostPtr) {
+ uintptr_t PinnedMemoryManager::allocPinned(size_t bufferSize) {
+     auto device = framework_hal::GetDeviceIdx();
+ 
+-    int cpu = -1;
+-    if (is_numa_system_present()) {
+-        long numaErr;
+-        int numa_node;
+-        auto& dcmiManger = lmc::DCMIManager::GetInstance();
+-        auto cpuAffinityStr = dcmiManger.getCPUAffinityFromDeviceId(static_cast<int>(device), 0);
+-        cpu = lmc::parse_first_cpu(cpuAffinityStr);
+-    }
++    #ifndef _ASCEND_310P3_
++    int cpu = -1;
++    if (is_numa_system_present()) {
++        long numaErr;
++        int numa_node;
++        auto& dcmiManger = lmc::DCMIManager::GetInstance();
++        auto cpuAffinityStr = dcmiManger.getCPUAffinityFromDeviceId(static_cast<int>(device), 0);
++        cpu = lmc::parse_first_cpu(cpuAffinityStr);
++    }
+ 
+-    std::unique_ptr guard;
+-    if (cpu >= 0) {
+-        guard = std::make_unique(cpu);
+-    }
++    std::unique_ptr guard;
++    if (cpu >= 0) {
++        guard = std::make_unique(cpu);
++    }
++    #endif
+ 
+     uintptr_t hostPtr;
+     int adviseErr;
+@@ -138,15 +140,18 @@ uintptr_t PinnedMemoryManager::allocPinned(size_t bufferSize) {
+     // set to all zeros
+     memset(reinterpret_cast<void*>(hostPtr), 0, bufferSize);
+ 
+-    void* devPtr;
+-    drvError_t drvRet;
+-    drvRet = halHostRegister(reinterpret_cast<void*>(hostPtr), static_cast<uint64_t>(bufferSize),
+-                             HOST_MEM_MAP_DEV_PCIE_TH, static_cast<int>(device), (void**)&devPtr);
+ 
+-    if (drvRet != 0) {
+-        throw std::runtime_error(std::string("Unable to register host memory with hal: ") + std::to_string(drvRet) + \
+-                                 " on device: " + std::to_string(device));
+-    }
++    void* devPtr = nullptr;
+ 
++    #ifndef _ASCEND_310P3_
++    drvError_t drvRet;
++    drvRet = halHostRegister(reinterpret_cast<void*>(hostPtr),
++                             static_cast<uint64_t>(bufferSize), HOST_MEM_MAP_DEV_PCIE_TH, static_cast<int>(device), (void**)&devPtr);
++
++    if (drvRet != 0) {
++        throw std::runtime_error(std::string("Unable to register host memory with hal: ") + std::to_string(drvRet) + \
++                                 " on device: " + std::to_string(device));
++    }
++    #endif
+ 
+     auto lockErr = mlock(reinterpret_cast<void*>(hostPtr), bufferSize);
+     if (lockErr == -1) {
+diff --git a/csrc/ascend/pybind.cpp b/csrc/ascend/pybind.cpp
+index 1538e11..1d55a15 100644
+--- a/csrc/ascend/pybind.cpp
++++ b/csrc/ascend/pybind.cpp
+@@ -27,6 +27,9 @@ PYBIND11_MODULE(lmcache_C, m) {
+           py::arg("lmc_buffer"), py::arg("paged_kv_ptrs"), py::arg("slot_mappings"),
+           py::arg("lmc_buffer_hostptr"), py::arg("num_pages"), py::arg("page_size"),
+           py::arg("kvs"), py::arg("page2L"), py::arg("is_reg_mem") = false);
++    m.def("paged_layers_kv_transfer_ms_unregistered", &lmcache_ascend::paged_layers_kv_transfer_ms_unregistered,
++          py::arg("lmc_buffer"), py::arg("paged_kv_ptrs"), py::arg("slot_mappings"),
++          py::arg("num_pages"), py::arg("page_size"), py::arg("kvs"), py::arg("hidden_dims"), py::arg("page2L"));
+     m.def("create_pinned_tensor", &lmcache_ascend::create_mmapped_numpy, py::arg("buffer_size"),
+           "Create a numpy tensor backed by custom pinned memory and accessible by the device with a deleter.");
+     m.def("create_pinned_tensor_with_infos", &lmcache_ascend::create_mapped_numpy_with_dtype,
+diff --git a/lmcache/v1/_tensor.py b/lmcache/v1/_tensor.py
+index a7a2525..a52f645 100644
+--- a/lmcache/v1/_tensor.py
++++ b/lmcache/v1/_tensor.py
+@@ -42,6 +42,8 @@ def get_itemsize(dtype: torch.dtype):
+     elif dtype == np_dtype.bfloat16:
+         # np does not have bfloat16
+         return 2
++    elif dtype == np.float16:
++        return 2
+     return m
+ 
+ 
+diff --git a/lmcache/v1/gpu_connector.py b/lmcache/v1/gpu_connector.py
+index 25fc8ea..ed9b0ad 100644
+--- a/lmcache/v1/gpu_connector.py
++++ b/lmcache/v1/gpu_connector.py
+@@ -18,6 +18,7 @@ import abc
+ 
+ # Third Party
+ import torch
++import numpy as np
+ 
+ # First Party
+ from lmcache.integration.vllm.utils import ENGINE_NAME
+@@ -30,6 +31,15 @@ from lmcache.v1.memory_management import MemoryFormat, MemoryObj
+ 
+ logger = init_logger(__name__)
+ 
++_IS_310P = None
++
++def is_310p():
++    global _IS_310P
++    if _IS_310P is None:
++        from lmcache import _build_info
++        _IS_310P = _build_info.__soc_version__.lower().startswith("ascend310p")
++    return _IS_310P
++
+ try:
+     from lmcache import lmcache_C
+     _USE_LMC_OPS = True
+@@ -419,15 +429,34 @@ class VLLMPagedMemGPUConnectorV2(GPUConnectorInterface):
+             self._initialize_pointers(kvcaches)
+ 
+         slot_mapping_range = slot_mapping[start:end]
+-        lmcache_C.paged_layers_kv_transfer(memory_obj.tensor,
+-                                           self.kv_cache_pointers_on_gpu,
+-                                           slot_mapping_range,
+-                                           memory_obj.base_ptr,
+-                                           self.num_pages,
+-                                           self.page_size,
+-                                           self.kv_size,
+-                                           False,
+-                                           True)
++
++        if is_310p():
++            logger.debug("VLLMPagedMemGPUConnectorV2.to_gpu: using the 310P unregistered transfer path")
++            assert self.gpu_buffer.device == kvcaches[0][0].device
++            self.gpu_buffer.zero_()
++            tmp_gpu_buffer = self.gpu_buffer[:, :, : end - start, :]
++            hidden_dims = tmp_gpu_buffer.size(-1)
++            tmp_gpu_buffer.copy_(torch.from_numpy(memory_obj.tensor))
++
++            lmcache_C.paged_layers_kv_transfer_ms_unregistered(tmp_gpu_buffer,  # a tensor replaces the np-typed memory_obj
++                                                               self.kv_cache_pointers_on_gpu,
++                                                               slot_mapping_range,
++                                                               self.num_pages,
++                                                               self.page_size,
++                                                               self.kv_size,
++                                                               hidden_dims,
++                                                               True  # page2L
++                                                               )
++        else:
++            lmcache_C.paged_layers_kv_transfer(memory_obj.tensor,
++                                               self.kv_cache_pointers_on_gpu,
++                                               slot_mapping_range,
++                                               memory_obj.base_ptr,
++                                               self.num_pages,
++                                               self.page_size,
++                                               self.kv_size,
++                                               False,
++                                               True)
+ 
+         # lmc_ops.multi_layer_kv_transfer(
+         #     memory_obj.tensor,
+@@ -472,15 +501,37 @@
+             self._initialize_pointers(kvcaches)
+ 
+         slot_mapping_range = slot_mapping[start:end]
+-        lmcache_C.paged_layers_kv_transfer(memory_obj.tensor,
+-                                           self.kv_cache_pointers_on_gpu,
+-                                           slot_mapping_range,
+-                                           memory_obj.base_ptr,
+-                                           self.num_pages,
+-                                           self.page_size,
+-                                           self.kv_size,
+-                                           True,
+-                                           True)
++
++        if is_310p():
++            # kvcaches -> gpu_buffer -> memobj
++            assert self.gpu_buffer.device == kvcaches[0][0].device
++            self.gpu_buffer.zero_()
++            tmp_gpu_buffer = self.gpu_buffer[:, :, : end - start, :]
++            hidden_dims = tmp_gpu_buffer.size(-1)
++            lmcache_C.paged_layers_kv_transfer_ms_unregistered(tmp_gpu_buffer,  # a tensor replaces the np-typed memory_obj
++                                                               self.kv_cache_pointers_on_gpu,
++                                                               slot_mapping_range,
++                                                               self.num_pages,
++                                                               self.page_size,
++                                                               self.kv_size,
++                                                               hidden_dims,
++                                                               True  # page2L
++                                                               )
++
++            cpu_tensor = tmp_gpu_buffer.cpu()
++            cpu_np = cpu_tensor.numpy()
++            np.copyto(memory_obj.tensor, cpu_np)
++
++        else:
++            lmcache_C.paged_layers_kv_transfer(memory_obj.tensor,
++                                               self.kv_cache_pointers_on_gpu,
++                                               slot_mapping_range,
++                                               memory_obj.base_ptr,
++                                               self.num_pages,
++                                               self.page_size,
++                                               self.kv_size,
++                                               True,
++                                               True)
+         # if self.gpu_buffer is None or end - start != self.gpu_buffer.shape[2]:
+         #     lmc_ops.multi_layer_kv_transfer(
+         #         memory_obj.tensor,
+diff --git a/setup.py b/setup.py
+index 99d2b00..b5c9496 100644
+--- a/setup.py
++++ b/setup.py
+@@ -6,6 +6,7 @@ import sys
+ 
+ # Third Party
+ from setuptools import find_packages, setup, Extension
+ from setuptools.command.build_ext import build_ext
++from setuptools.command.build_py import build_py
+ from setuptools.command.develop import develop
+ from setuptools.command.install import install
+@@ -57,10 +58,25 @@ def _get_npu_soc():
+                                   text=True).strip()
+         _soc_version = _soc_version.split("-")[0]
+         _soc_version = "Ascend"+_soc_version
+-        return _soc_version
+     except subprocess.CalledProcessError as e:
+         raise RuntimeError(f"Retrieve SoC version failed: {e}")
++    return _soc_version
+ 
++class custom_build_info(build_py):
++
++    def run(self):
++        soc_version = _get_npu_soc()
++        if not soc_version:
++            raise ValueError(
++                "SOC version is not set. Please set SOC_VERSION environment variable."
++            )
++        package_dir = os.path.join(ROOT_DIR, "lmcache", "_build_info.py")
++        with open(package_dir, "w+") as f:
++            f.write('# Auto-generated file\n')
++            f.write(f"__soc_version__ = '{soc_version}'\n")
++        logging.info(
++            f"Generated _build_info.py with SOC version: {soc_version}")
++        super().run()
+ 
+ 
+ class CMakeExtension(Extension):
+@@ -326,8 +342,10 @@ def rocm_extension() -> tuple[list, dict]:
+ 
+ def ascend_extension():
+     print("Building Ascend extensions")
+-    return [CMakeExtension(name="lmcache.lmcache_C")], \
+-        {"build_ext": CustomAscendCmakeBuildExt}
++    return [CMakeExtension(name="lmcache.lmcache_C")], {
++        "build_py": custom_build_info,
++        "build_ext": CustomAscendCmakeBuildExt
++    }
+ 
+ 
+ def source_dist_extension() -> tuple[list, dict]: