From 0a1e0266d92a7f8ae77914d304bdc583a11cf766 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan"
Date: Mon, 8 Sep 2025 09:28:55 +0800
Subject: [PATCH] sync 2025-09-08 resnet model

---
 .../run_train_distributed_imagenette.sh       | 47 ++++++++++++-------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh b/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh
index fcf39ec1b..9598c60d5 100644
--- a/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh
+++ b/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh
@@ -28,8 +28,8 @@ done
 
 echo "## Training model: ${model}"
 
-: ${BATCH_SIZE:=16}
-# TRAIN_EPOCHS=10
+: ${BATCH_SIZE:=160}
+TRAIN_EPOCHS=30
 # optional optimizer: momentum, rmsprop, momentum, sgd
 OPTIMIZER=momentum
 DATE=`date +%Y%m%d%H%M%S`
@@ -54,15 +54,27 @@ check_status()
 #################################################
 # Prepare devices
 #################################################
-# devices=$CUDA_VISIBLE_DEVICES
-# if [ -n "$devices" ]; then
-#     devices=(${devices//,/ })
-#     num_devices=${#devices[@]}
-# else
-devices=(0 1)
-num_devices=2
-# fi
+devices=$CUDA_VISIBLE_DEVICES
 echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
+if [ -n "$devices" ]; then
+    devices=(${devices//,/ })
+    num_devices=${#devices[@]}
+
+    if [ "${num_devices}" -gt "2" ]; then
+        devices=(${devices[@]:0:2})
+        num_devices=2
+
+        VALID_DEVICES="${devices[0]},${devices[1]}"
+        export CUDA_VISIBLE_DEVICES=${VALID_DEVICES}
+        echo "Distributed_replicated mode, use 2 gpus train: ${VALID_DEVICES}"
+    fi
+else
+    devices=(0 1)
+    VALID_DEVICES="0,1"
+    export CUDA_VISIBLE_DEVICES=${VALID_DEVICES}
+    num_devices=2
+fi
+echo "devices: ${devices}"
 echo "num_devices: ${num_devices}"
 
 if [ "${num_devices}" == "1" ]; then
@@ -82,7 +94,8 @@ do
         continue
     fi
     let i++
-    worker_hosts="${worker_hosts},127.0.0.1:5000${device}"
+    port=$((53400 + i))
+    worker_hosts="${worker_hosts},127.0.0.1:${port}"
 done
 worker_hosts=${worker_hosts#*,}
 echo "worker_hosts: ${worker_hosts}"
@@ -130,9 +143,10 @@ do
         --local_parameter_device=gpu --num_gpus=${num_devices}\
         --batch_size=${BATCH_SIZE} --model=${model} \
         --variable_update=distributed_replicated \
-        --job_name=${job_name} --ps_hosts=127.0.0.1:40000 --worker_hosts="${worker_hosts}"\
+        --num_epochs=${TRAIN_EPOCHS} \
+        --job_name=${job_name} --ps_hosts=127.0.0.1:53400 --worker_hosts="${worker_hosts}"\
         --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit
-        echo "Distributed training PID ($!) on device ${device} where job name = ${job_name}"
+        echo "Distributed training on device ${device} where job name = ${job_name}"
     else
         echo "device: ${device}"
         UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\
@@ -142,14 +156,15 @@ do
         --local_parameter_device=gpu --num_gpus=${num_devices}\
         --batch_size=${BATCH_SIZE} --model=${model}\
         --variable_update=distributed_replicated\
-        --job_name=${job_name} --ps_hosts=127.0.0.1:40000 --worker_hosts="${worker_hosts}"\
+        --num_epochs=${TRAIN_EPOCHS} \
+        --job_name=${job_name} --ps_hosts=127.0.0.1:53400 --worker_hosts="${worker_hosts}"\
         --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" &
         echo "Distributed training PID ($!) on device ${device} where job name = ${job_name} and task_index = ${task_index}"
+        pid_list+=($!)
     fi
     let i++
-    pid_list+=($!)
 done
-echo "All subprocess: ${pid_list[*]}"
+echo "All background subprocess: ${pid_list[*]}"
 
 ctrl_c
 exit ${EXIT_STATUS}
-- 
Gitee
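
Reviewer note (not part of the commit): the new device-selection hunk caps distributed_replicated training at two GPUs, truncating a longer CUDA_VISIBLE_DEVICES list to its first two entries and re-exporting it so child processes see the same set. A minimal standalone sketch of that branch, for sanity-checking outside the training script; the final echo is illustrative, the rest mirrors the hunk:

    #!/usr/bin/env bash
    # Cap at 2 GPUs and re-export CUDA_VISIBLE_DEVICES, as in the patch.
    devices=$CUDA_VISIBLE_DEVICES
    if [ -n "$devices" ]; then
        devices=(${devices//,/ })        # split "0,1,2,3" on commas
        num_devices=${#devices[@]}
        if [ "${num_devices}" -gt "2" ]; then
            devices=(${devices[@]:0:2})  # keep only the first two entries
            num_devices=2
            export CUDA_VISIBLE_DEVICES="${devices[0]},${devices[1]}"
        fi
    else
        devices=(0 1)                    # default when nothing is set
        num_devices=2
        export CUDA_VISIBLE_DEVICES="0,1"
    fi
    echo "using ${num_devices} device(s): ${devices[*]}"

Saved under any filename (sketch.sh is arbitrary), running CUDA_VISIBLE_DEVICES=0,1,2,3 bash sketch.sh should print "using 2 device(s): 0 1" and leave CUDA_VISIBLE_DEVICES=0,1 exported for the process tree.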
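The worker-port rework fixes a latent bug rather than just renumbering: the old expression built each port by string concatenation (5000${device}), so a two-digit device ID such as 12 would yield 500012, outside the valid TCP port range, and ports were tied to device IDs rather than worker slots. The new expression derives ports arithmetically from the worker index, starting one above the parameter-server port 53400. A small before/after illustration; the device IDs are made up:

    #!/usr/bin/env bash
    # Old vs. new worker-port derivation from this patch.
    i=0
    for device in 0 1 12; do
        let i++
        old_port="5000${device}"    # concatenation: 50000, 50001, 500012 (invalid)
        new_port=$((53400 + i))     # arithmetic: 53401, 53402, 53403
        echo "device ${device}: old=${old_port} new=${new_port}"
    done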
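Moving pid_list+=($!) into the background branch is likewise a correctness fix: in bash, $! holds the PID of the most recent background job only, and the branch that pipes through tee runs the trainer in the foreground, so appending $! there recorded a stale or empty value (which is also why its echo no longer prints ($!)). A short demonstration of that semantics:

    #!/usr/bin/env bash
    # $! tracks only the last background job; foreground pipelines do not touch it.
    sleep 1 &
    echo "after background job: $!"       # PID of the sleep
    true | cat                            # foreground pipeline, like the tee branch
    echo "after foreground pipeline: $!"  # unchanged: still the sleep's PID
    wait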