diff --git a/src/ixrt_model_instance.cc b/src/ixrt_model_instance.cc
index a0542ec83b108580fb2efe7aa91b9de8efc0dfa3..f391342dbdf1550679eede237307de0eec6e6eb9 100644
--- a/src/ixrt_model_instance.cc
+++ b/src/ixrt_model_instance.cc
@@ -55,7 +55,7 @@ bool ModelInstanceState::SupportsDynamicBatching() {
 void ModelInstanceState::LoadModule(const std::string& file_name, const int& device_id) {
   std::vector engine_buffer;
   LoadBufferFromDisk(file_name, &engine_buffer);
-  cudaSetDevice(device_id);
+  CHECK(cudaSetDevice(device_id));
   initLibNvInferPlugins(&logger_, "");
 
   UniquePtr<nvinfer1::IRuntime> runtime{nvinfer1::createInferRuntime(logger_)};
@@ -311,10 +311,20 @@ ModelInstanceState::Create(
  LOG_MESSAGE(
      TRITONSERVER_LOG_INFO,
      (std::string("Load model from path ") + model_path).c_str());
- std::string str;
- str = std::to_string((*state)->DeviceId());
+ size_t DeviceID;
+ DeviceID = (*state)->DeviceId();
  //LOG_MESSAGE(TRITONSERVER_LOG_INFO, ("DeviceId: " + std::to_string(DeviceId())).c_str());
- LOG_MESSAGE(TRITONSERVER_LOG_INFO, ("DeviceId: " + str).c_str());
+ LOG_MESSAGE(TRITONSERVER_LOG_INFO, ("DeviceId: " + std::to_string(DeviceID)).c_str());
+ cudaStream_t copy_stream_ = nullptr;
+ CHECK(cudaSetDevice(DeviceID));
+ CHECK(cudaStreamCreate(&copy_stream_));
+ // CreateCudaStream(DeviceID, 0, &copy_stream_);
+ // CreateCudaStream(DeviceID, 0, &copy_stream_);
+ std::unordered_map<size_t, cudaStream_t> stream_map;
+ stream_map[DeviceID] = copy_stream_;
+ (*state)->stream_list_.emplace_back(stream_map);
+
+
  (*state)->LoadModule(model_path, (*state)->DeviceId());
 
  (*state)->InitBindingBuffers();
@@ -332,6 +342,16 @@ ModelInstanceState::~ModelInstanceState() {
   for (auto& buffer : binding_buffers_) {
     CHECK(cudaFree(buffer));
   }
+
+  LOG_MESSAGE(
+      TRITONSERVER_LOG_INFO,
+      std::string("Destroy GPU CUDA streams").c_str());
+
+  for (const auto& stream_map : stream_list_) {
+    for (const auto& kv : stream_map) {
+      CHECK(cudaStreamDestroy(kv.second));
+    }
+  }
 }
 
 TRITONSERVER_DataType ModelInstanceState::convertStrDataType2Datatype(const std::string& strDataType){
@@ -397,7 +417,22 @@ void ModelInstanceState::ProcessRequests(
   const int max_batch_size = model_state_->MaxBatchSize();
   LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, ("max_batch_size: " + std::to_string(max_batch_size)).c_str());
 
-  cudaSetDevice(DeviceId());
+  CHECK(cudaSetDevice(DeviceId()));
+  size_t device_id = DeviceId();
+  cudaStream_t stream = nullptr;
+  for (const auto& stream_map : stream_list_) {
+    auto it = stream_map.find(device_id);
+    if (it != stream_map.end()) {
+      stream = it->second;
+      break;  // stop once the stream for this device is found
+    }
+  }
+  if (stream == nullptr) {
+    LOG_MESSAGE(
+        TRITONSERVER_LOG_ERROR,
+        std::string("No stream found for device ID: " + std::to_string(device_id)).c_str());
+    return;
+  }
 
   size_t total_batch_size = 0;
 
@@ -472,14 +507,11 @@ void ModelInstanceState::ProcessRequests(
   uint32_t req_input_count;
   TRITONBACKEND_RequestInputCount(*requests, &req_input_count);
 
-  CreateCudaStream(DeviceId(), 0, &output_copy_stream_);
-  CreateCudaStream(DeviceId(), 0, &input_copy_stream_);
-
   std::vector input_names;
   BackendInputCollector collector(
       requests, request_count, &responses, model_state->TritonMemoryManager(),
-      false /* pinned_enabled */, input_copy_stream_ /* stream*/, nullptr, nullptr, 0, nullptr, true, true);
+      false /* pinned_enabled */, stream /* stream*/, nullptr, nullptr, 0, nullptr, true, true);
 
  // std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>> allowed_input_types =
  //     {{TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}, {TRITONSERVER_MEMORY_GPU, 0}};
 
@@ -559,7 +591,7 @@ void ModelInstanceState::ProcessRequests(
     offset += input_byte_size;
 
-  cudaStreamSynchronize(input_copy_stream_);
+  CHECK(cudaStreamSynchronize(stream));
 
   const bool need_cuda_input_sync = collector.Finalize();
   // if (need_cuda_input_sync) {
 
@@ -590,7 +622,7 @@ void ModelInstanceState::ProcessRequests(
   BackendOutputResponder responder(
       requests, request_count, &responses, model_state->TritonMemoryManager(),
       false, false /* pinned_enabled */,
-      output_copy_stream_ /* stream*/, nullptr, true);
+      stream /* stream*/, nullptr, true);
 
   std::map mapOutputAttr;
   for(std::vector::iterator it = config_outputs_.begin(); it != config_outputs_.end(); ++it){
@@ -645,7 +677,7 @@ void ModelInstanceState::ProcessRequests(
     output_offset += output_byte_size;
 
-  cudaStreamSynchronize(output_copy_stream_);
+  CHECK(cudaStreamSynchronize(stream));
 
   const bool need_cuda_output_sync = responder.Finalize();
 }
 
diff --git a/src/ixrt_model_instance.h b/src/ixrt_model_instance.h
index bbfaffbb7a71b18b1a30d54c79d9424150760bab..3be8560a975b48c4c3d9ef3359a5c5e3316d83ab 100644
--- a/src/ixrt_model_instance.h
+++ b/src/ixrt_model_instance.h
@@ -38,9 +38,7 @@ public:
   int cuda_stream_priority_{0};
 
-  cudaStream_t input_copy_stream_{};
-  cudaStream_t output_copy_stream_{};
-
+  std::vector<std::unordered_map<size_t, cudaStream_t>> stream_list_;
   UniquePtr context_;
 
   std::unordered_map model_inputs_;