From 7a2e1e4a3b32ef331121ff3bae38f490b08bb048 Mon Sep 17 00:00:00 2001 From: tiankaijin Date: Wed, 31 Jul 2024 11:09:45 +0800 Subject: [PATCH 1/4] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=91=E6=9F=A5=E6=89=BEpod=E6=97=B6=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E4=B8=80=E6=AC=A1=E7=9B=B4=E6=8E=A5=E4=BB=8Eetcd?= =?UTF-8?q?=E6=9F=A5=E8=AF=A2=E7=9A=84=E9=87=8D=E8=AF=95=EF=BC=8C=E9=81=BF?= =?UTF-8?q?=E5=85=8D=E7=BC=93=E5=AD=98=E4=B8=80=E8=87=B4=E6=80=A7=E9=97=AE?= =?UTF-8?q?=E9=A2=98=E5=BD=B1=E5=93=8D=E8=AE=BE=E5=A4=87=E6=8C=82=E8=BD=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/constants.go | 2 +- pkg/kubeclient/kubeclient.go | 10 +++++----- pkg/server/plugin.go | 11 ++++++++--- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/pkg/common/constants.go b/pkg/common/constants.go index d580943b..7f13855a 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -36,7 +36,7 @@ const ( // RetryUpdateCount is max number of retry resource update RetryUpdateCount = 3 // GetPodFromInformerTime is max number of get pod from informer - GetPodFromInformerTime = 3 + GetPodFromInformerTime = 5 // MaxDeviceNameLen max length of device name, like "Ascend310P-4c.3cpu-100-0" MaxDeviceNameLen = 50 // MaxGRPCRecvMsgSize 4MB diff --git a/pkg/kubeclient/kubeclient.go b/pkg/kubeclient/kubeclient.go index a15eaac2..1c31089b 100644 --- a/pkg/kubeclient/kubeclient.go +++ b/pkg/kubeclient/kubeclient.go @@ -123,13 +123,13 @@ func (ki *ClientK8s) PatchPod(pod *v1.Pod, data []byte) (*v1.Pod, error) { } // GetActivePodList is to get active pod list -func (ki *ClientK8s) GetActivePodList() ([]v1.Pod, error) { +func (ki *ClientK8s) GetActivePodList(resourceVersion string) ([]v1.Pod, error) { fieldSelector, err := fields.ParseSelector("spec.nodeName=" + ki.NodeName + "," + "status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed)) if err != nil { return nil, err } - podList, err := ki.getPodListByCondition(fieldSelector) + podList, err := ki.getPodListByCondition(fieldSelector, resourceVersion) if err != nil { return nil, err } @@ -139,7 +139,7 @@ func (ki *ClientK8s) GetActivePodList() ([]v1.Pod, error) { // GetAllPodList get pod list by field selector func (ki *ClientK8s) GetAllPodList() (*v1.PodList, error) { selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": ki.NodeName}) - v1PodList, err := ki.getPodListByCondition(selector) + v1PodList, err := ki.getPodListByCondition(selector, "0") if err != nil { hwlog.RunLog.Errorf("get pod list failed, err: %v", err) return nil, err @@ -152,10 +152,10 @@ func (ki *ClientK8s) GetAllPodList() (*v1.PodList, error) { } // getPodListByCondition get pod list by field selector -func (ki *ClientK8s) getPodListByCondition(selector fields.Selector) (*v1.PodList, error) { +func (ki *ClientK8s) getPodListByCondition(selector fields.Selector, resourceVersion string) (*v1.PodList, error) { newPodList, err := ki.Clientset.CoreV1().Pods(v1.NamespaceAll).List(context.Background(), metav1.ListOptions{ FieldSelector: selector.String(), - ResourceVersion: "0", + ResourceVersion: resourceVersion, }) if err != nil && strings.Contains(err.Error(), common.ApiServerPort) { ki.IsApiErr = true diff --git a/pkg/server/plugin.go b/pkg/server/plugin.go index c3908bcb..980158d2 100644 --- a/pkg/server/plugin.go +++ b/pkg/server/plugin.go @@ -681,10 +681,15 @@ func (ps *PluginServer) doWithVolcanoSchedule(requestDevices []string) ([]string } var filteredPods []v1.Pod var allPods []v1.Pod + resourceVersion := "0" for i := 0; i < common.GetPodFromInformerTime; i++ { - if i == common.GetPodFromInformerTime-1 { - // in the last time of retry, get the pod from api server instead of cache - noneCachedPod, err := ps.manager.GetKubeClient().GetActivePodList() + // if retry times bigger than GetPodFromInformerTime/2, get the pod from api server cache + if i > common.GetPodFromInformerTime/2 { + // in the last time of retry, get the pod from etcd instead of api server cache + if i == common.GetPodFromInformerTime-1 { + resourceVersion = "" + } + noneCachedPod, err := ps.manager.GetKubeClient().GetActivePodList(resourceVersion) if err != nil { hwlog.RunLog.Errorf("get active pod from api server failed") return nil, err -- Gitee From 5bdc46291c3a4745a8a27f360c9bdf1ca24d00aa Mon Sep 17 00:00:00 2001 From: tiankaijin Date: Wed, 31 Jul 2024 15:02:22 +0800 Subject: [PATCH 2/4] add test & adjust sleep time --- pkg/common/constants.go | 2 +- pkg/common/utils.go | 30 ++++++++++++++++++++++++++++++ pkg/common/utils_test.go | 37 +++++++++++++++++++++++++++++++++++++ pkg/server/plugin.go | 10 +++++----- 4 files changed, 73 insertions(+), 6 deletions(-) create mode 100644 pkg/common/utils.go create mode 100644 pkg/common/utils_test.go diff --git a/pkg/common/constants.go b/pkg/common/constants.go index 7f13855a..6f32126f 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -36,7 +36,7 @@ const ( // RetryUpdateCount is max number of retry resource update RetryUpdateCount = 3 // GetPodFromInformerTime is max number of get pod from informer - GetPodFromInformerTime = 5 + GetPodFromInformerTime = 8 // MaxDeviceNameLen max length of device name, like "Ascend310P-4c.3cpu-100-0" MaxDeviceNameLen = 50 // MaxGRPCRecvMsgSize 4MB diff --git a/pkg/common/utils.go b/pkg/common/utils.go new file mode 100644 index 00000000..30a01e8a --- /dev/null +++ b/pkg/common/utils.go @@ -0,0 +1,30 @@ +/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common a series of common function +package common + +import ( + "time" +) + +// exponent back off +func ExpBackoffDuration(retryTimes int, maxSleepTime time.Duration) time.Duration { + secondCount := (1 << retryTimes) + sleepTime := time.Duration(secondCount) * time.Second + if sleepTime > maxSleepTime { + sleepTime = maxSleepTime + } + return sleepTime +} diff --git a/pkg/common/utils_test.go b/pkg/common/utils_test.go new file mode 100644 index 00000000..f159efb6 --- /dev/null +++ b/pkg/common/utils_test.go @@ -0,0 +1,37 @@ +/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common a series of common function +package common + +import ( + "testing" + "time" + + "github.com/smartystreets/goconvey/convey" +) + +// TestAtomicBool for test AtomicBool +func TestExpBackoffDuration(t *testing.T) { + convey.Convey("test ExpBackoffDuration", t, func() { + convey.Convey("bigger than max sleep duration", func() { + sleepTime := ExpBackoffDuration(10, time.Minute) + convey.So(sleepTime, convey.ShouldEqual, time.Second*60) + }) + convey.Convey("small than max sleep duration", func() { + sleepTime := ExpBackoffDuration(1, time.Minute) + convey.So(sleepTime, convey.ShouldEqual, time.Second*2) + }) + }) +} diff --git a/pkg/server/plugin.go b/pkg/server/plugin.go index 980158d2..3f35fd14 100644 --- a/pkg/server/plugin.go +++ b/pkg/server/plugin.go @@ -683,10 +683,10 @@ func (ps *PluginServer) doWithVolcanoSchedule(requestDevices []string) ([]string var allPods []v1.Pod resourceVersion := "0" for i := 0; i < common.GetPodFromInformerTime; i++ { - // if retry times bigger than GetPodFromInformerTime/2, get the pod from api server cache - if i > common.GetPodFromInformerTime/2 { - // in the last time of retry, get the pod from etcd instead of api server cache - if i == common.GetPodFromInformerTime-1 { + // if local cache not satisfy, get pod from api server + if i > 0 { + // if api server cache not satisfy, get pod from etcd instead of api server cache + if i > 1 { resourceVersion = "" } noneCachedPod, err := ps.manager.GetKubeClient().GetActivePodList(resourceVersion) @@ -703,7 +703,7 @@ func (ps *PluginServer) doWithVolcanoSchedule(requestDevices []string) ([]string break } hwlog.RunLog.Warnf("no pod passed the filter, request device: %v, retry: %d", requestDevices, i) - time.Sleep(time.Second) + time.Sleep(common.ExpBackoffDuration(i, time.Minute*2)) } oldestPod := ps.getOldestPod(filteredPods) if oldestPod == nil { -- Gitee From 7cc7197889dd6e27de3c581d9f43f27047cb3a9e Mon Sep 17 00:00:00 2001 From: tiankaijin Date: Wed, 31 Jul 2024 15:03:51 +0800 Subject: [PATCH 3/4] fix test name --- pkg/common/utils_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/common/utils_test.go b/pkg/common/utils_test.go index f159efb6..00d7b288 100644 --- a/pkg/common/utils_test.go +++ b/pkg/common/utils_test.go @@ -22,7 +22,7 @@ import ( "github.com/smartystreets/goconvey/convey" ) -// TestAtomicBool for test AtomicBool +// TestExpBackoffDuration for test AtomicBool func TestExpBackoffDuration(t *testing.T) { convey.Convey("test ExpBackoffDuration", t, func() { convey.Convey("bigger than max sleep duration", func() { -- Gitee From fe80e578008fa5ea10e6b912cf98efbf55fd2116 Mon Sep 17 00:00:00 2001 From: tiankaijin Date: Wed, 31 Jul 2024 15:04:58 +0800 Subject: [PATCH 4/4] fix test label --- pkg/common/utils_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/common/utils_test.go b/pkg/common/utils_test.go index 00d7b288..b9d31705 100644 --- a/pkg/common/utils_test.go +++ b/pkg/common/utils_test.go @@ -22,7 +22,7 @@ import ( "github.com/smartystreets/goconvey/convey" ) -// TestExpBackoffDuration for test AtomicBool +// TestExpBackoffDuration for test ExpBackoffDuration func TestExpBackoffDuration(t *testing.T) { convey.Convey("test ExpBackoffDuration", t, func() { convey.Convey("bigger than max sleep duration", func() { -- Gitee