diff --git a/src/ixrt_model_instance.cc b/src/ixrt_model_instance.cc
index a0542ec83b108580fb2efe7aa91b9de8efc0dfa3..f391342dbdf1550679eede237307de0eec6e6eb9 100644
--- a/src/ixrt_model_instance.cc
+++ b/src/ixrt_model_instance.cc
@@ -55,7 +55,7 @@ bool ModelInstanceState::SupportsDynamicBatching() {
 void ModelInstanceState::LoadModule(const std::string& file_name, const int& device_id) {
   std::vector engine_buffer;
   LoadBufferFromDisk(file_name, &engine_buffer);
-  cudaSetDevice(device_id);
+  CHECK(cudaSetDevice(device_id));
   initLibNvInferPlugins(&logger_, "");
 
   UniquePtr<nvinfer1::IRuntime> runtime{nvinfer1::createInferRuntime(logger_)};
@@ -311,10 +311,20 @@ ModelInstanceState::Create(
  LOG_MESSAGE(
      TRITONSERVER_LOG_INFO,
      (std::string("Load model from path ") + model_path).c_str());
- std::string str;
- str = std::to_string((*state)->DeviceId());
+ size_t DeviceID;
+ DeviceID = (*state)->DeviceId();
  //LOG_MESSAGE(TRITONSERVER_LOG_INFO, ("DeviceId: " + std::to_string(DeviceId())).c_str());
- LOG_MESSAGE(TRITONSERVER_LOG_INFO, ("DeviceId: " + str).c_str());
+ LOG_MESSAGE(TRITONSERVER_LOG_INFO, ("DeviceId: " + std::to_string(DeviceID)).c_str());
+ cudaStream_t copy_stream_ = nullptr;
+ CHECK(cudaSetDevice(DeviceID));
+ CHECK(cudaStreamCreate(&copy_stream_));
+ // CreateCudaStream(DeviceID, 0, &copy_stream_);
+ // CreateCudaStream(DeviceID, 0, &copy_stream_);
+ std::unordered_map<size_t, cudaStream_t> stream_map;
+ stream_map[DeviceID] = copy_stream_;
+ (*state)->stream_list_.emplace_back(stream_map);
+
+
  (*state)->LoadModule(model_path, (*state)->DeviceId());
 
  (*state)->InitBindingBuffers();
@@ -332,6 +342,16 @@ ModelInstanceState::~ModelInstanceState() {
   for (auto& buffer : binding_buffers_) {
     CHECK(cudaFree(buffer));
   }
+
+  LOG_MESSAGE(
+      TRITONSERVER_LOG_INFO,
+      std::string("Destroy GPU CUDA streams").c_str());
+
+  for (const auto& stream_map : stream_list_) {
+    for (const auto& kv : stream_map) {
+      CHECK(cudaStreamDestroy(kv.second));
+    }
+  }
 }
 
 TRITONSERVER_DataType ModelInstanceState::convertStrDataType2Datatype(const std::string& strDataType){
@@ -397,7 +417,22 @@ void ModelInstanceState::ProcessRequests(
   const int max_batch_size = model_state_->MaxBatchSize();
   LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, ("max_batch_size: " + std::to_string(max_batch_size)).c_str());
 
-  cudaSetDevice(DeviceId());
+  CHECK(cudaSetDevice(DeviceId()));
+  size_t device_id = DeviceId();
+  cudaStream_t stream = nullptr;
+  for (const auto& stream_map : stream_list_) {
+    auto it = stream_map.find(device_id);
+    if (it != stream_map.end()) {
+      stream = it->second;
+      break;  // stop once the stream for this device is found
+    }
+  }
+  if (stream == nullptr) {
+    LOG_MESSAGE(
+        TRITONSERVER_LOG_ERROR,
+        std::string("No stream found for device ID: " + std::to_string(device_id)).c_str());
+    return;
+  }
 
   size_t total_batch_size = 0;
 
@@ -472,14 +507,11 @@ void ModelInstanceState::ProcessRequests(
   uint32_t req_input_count;
   TRITONBACKEND_RequestInputCount(*requests, &req_input_count);
 
-  CreateCudaStream(DeviceId(), 0, &output_copy_stream_);
-  CreateCudaStream(DeviceId(), 0, &input_copy_stream_);
-
   std::vector input_names;
   BackendInputCollector collector(
       requests, request_count, &responses, model_state->TritonMemoryManager(),
-      false /* pinned_enabled */, input_copy_stream_ /* stream*/, nullptr, nullptr, 0, nullptr, true, true);
+      false /* pinned_enabled */, stream /* stream*/, nullptr, nullptr, 0, nullptr, true, true);
 
  // std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>> allowed_input_types =
  //     {{TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}, {TRITONSERVER_MEMORY_GPU, 0}};
 
@@ -559,7 +591,7 @@ void ModelInstanceState::ProcessRequests(
     offset += input_byte_size;
 
-  cudaStreamSynchronize(input_copy_stream_);
+  CHECK(cudaStreamSynchronize(stream));
 
   const bool need_cuda_input_sync = collector.Finalize();
   // if (need_cuda_input_sync) {
 
@@ -590,7 +622,7 @@ void ModelInstanceState::ProcessRequests(
   BackendOutputResponder responder(
       requests, request_count, &responses, model_state->TritonMemoryManager(),
       false, false /* pinned_enabled */,
-      output_copy_stream_ /* stream*/, nullptr, true);
+      stream /* stream*/, nullptr, true);
 
   std::map mapOutputAttr;
   for(std::vector::iterator it = config_outputs_.begin(); it != config_outputs_.end(); ++it){
@@ -645,7 +677,7 @@ void ModelInstanceState::ProcessRequests(
     output_offset += output_byte_size;
 
-  cudaStreamSynchronize(output_copy_stream_);
+  CHECK(cudaStreamSynchronize(stream));
 
   const bool need_cuda_output_sync = responder.Finalize();
 }
 
diff --git a/src/ixrt_model_instance.h b/src/ixrt_model_instance.h
index bbfaffbb7a71b18b1a30d54c79d9424150760bab..3be8560a975b48c4c3d9ef3359a5c5e3316d83ab 100644
--- a/src/ixrt_model_instance.h
+++ b/src/ixrt_model_instance.h
@@ -38,9 +38,7 @@ public:
   int cuda_stream_priority_{0};
 
-  cudaStream_t input_copy_stream_{};
-  cudaStream_t output_copy_stream_{};
-
+  std::vector<std::unordered_map<size_t, cudaStream_t>> stream_list_;
   UniquePtr context_;
 
   std::unordered_map model_inputs_;