diff --git a/pkg/common/constants.go b/pkg/common/constants.go index 05a4b86fced838f61995db4bee8c20e342c3bb5b..b8c96de9c8bbc02039795739a18cf77d795c0497 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -36,7 +36,7 @@ const ( // RetryUpdateCount is max number of retry resource update RetryUpdateCount = 3 // GetPodFromInformerTime is max number of get pod from informer - GetPodFromInformerTime = 3 + GetPodFromInformerTime = 8 // MaxDeviceNameLen max length of device name, like "Ascend310P-4c.3cpu-100-0" MaxDeviceNameLen = 50 // MaxGRPCRecvMsgSize 4MB diff --git a/pkg/common/utils.go b/pkg/common/utils.go new file mode 100644 index 0000000000000000000000000000000000000000..06035d90b15a2ed4ffd8093214a4ea5c2ad69678 --- /dev/null +++ b/pkg/common/utils.go @@ -0,0 +1,30 @@ +/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common a series of common function +package common + +import ( + "time" +) + +// exponent back off +func ExpBackoffDuration(retryTimes int, maxSleepTime time.Duration) time.Duration { + secondCount := (1 << retryTimes) + sleepTime := time.Duration(secondCount) * time.Second + if sleepTime > maxSleepTime { + sleepTime = maxSleepTime + } + return sleepTime +} \ No newline at end of file diff --git a/pkg/common/utils_test.go b/pkg/common/utils_test.go new file mode 100644 index 0000000000000000000000000000000000000000..b9d3170546a1be82d2301aef9d16f3c1f1bd14f3 --- /dev/null +++ b/pkg/common/utils_test.go @@ -0,0 +1,37 @@ +/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common a series of common function +package common + +import ( + "testing" + "time" + + "github.com/smartystreets/goconvey/convey" +) + +// TestExpBackoffDuration for test ExpBackoffDuration +func TestExpBackoffDuration(t *testing.T) { + convey.Convey("test ExpBackoffDuration", t, func() { + convey.Convey("bigger than max sleep duration", func() { + sleepTime := ExpBackoffDuration(10, time.Minute) + convey.So(sleepTime, convey.ShouldEqual, time.Second*60) + }) + convey.Convey("small than max sleep duration", func() { + sleepTime := ExpBackoffDuration(1, time.Minute) + convey.So(sleepTime, convey.ShouldEqual, time.Second*2) + }) + }) +} diff --git a/pkg/kubeclient/kubeclient.go b/pkg/kubeclient/kubeclient.go index 59b10886b56c919bd59734849492b6e3ddf6506d..c9deb61c9d3874548e21707f3535e43be361589a 100644 --- a/pkg/kubeclient/kubeclient.go +++ b/pkg/kubeclient/kubeclient.go @@ -117,13 +117,13 @@ func (ki *ClientK8s) PatchPod(pod *v1.Pod, data []byte) (*v1.Pod, error) { } // GetActivePodList is to get active pod list -func (ki *ClientK8s) GetActivePodList() ([]v1.Pod, error) { +func (ki *ClientK8s) GetActivePodList(resourceVersion string) ([]v1.Pod, error) { fieldSelector, err := fields.ParseSelector("spec.nodeName=" + ki.NodeName + "," + "status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed)) if err != nil { return nil, err } - podList, err := ki.getPodListByCondition(fieldSelector) + podList, err := ki.getPodListByCondition(fieldSelector, resourceVersion) if err != nil { return nil, err } @@ -133,7 +133,7 @@ func (ki *ClientK8s) GetActivePodList() ([]v1.Pod, error) { // GetAllPodList get pod list by field selector func (ki *ClientK8s) GetAllPodList() (*v1.PodList, error) { selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": ki.NodeName}) - v1PodList, err := ki.getPodListByCondition(selector) + v1PodList, err := ki.getPodListByCondition(selector, "0") if err != nil { hwlog.RunLog.Errorf("get pod list failed, err: %v", err) return nil, err @@ -146,10 +146,10 @@ func (ki *ClientK8s) GetAllPodList() (*v1.PodList, error) { } // getPodListByCondition get pod list by field selector -func (ki *ClientK8s) getPodListByCondition(selector fields.Selector) (*v1.PodList, error) { +func (ki *ClientK8s) getPodListByCondition(selector fields.Selector, resourceVersion string) (*v1.PodList, error) { newPodList, err := ki.Clientset.CoreV1().Pods(v1.NamespaceAll).List(context.Background(), metav1.ListOptions{ FieldSelector: selector.String(), - ResourceVersion: "0", + ResourceVersion: resourceVersion, }) if err != nil && strings.Contains(err.Error(), common.ApiServerPort) { ki.IsApiErr = true diff --git a/pkg/server/plugin.go b/pkg/server/plugin.go index ef5488b0a59f523f1276c01e040c90290280c356..fbbb5b9a95a083fac62c8810d37dd6afe8e297a1 100644 --- a/pkg/server/plugin.go +++ b/pkg/server/plugin.go @@ -680,10 +680,15 @@ func (ps *PluginServer) doWithVolcanoSchedule(requestDevices []string) ([]string } var filteredPods []v1.Pod var allPods []v1.Pod + resourceVersion := "0" for i := 0; i < common.GetPodFromInformerTime; i++ { - if i == common.GetPodFromInformerTime-1 { - // in the last time of retry, get the pod from api server instead of cache - noneCachedPod, err := ps.manager.GetKubeClient().GetActivePodList() + // if local cache not satisfy, get pod from api server + if i > 0 { + // if api server cache not satisfy, get pod from api server + if i > 1 { + resourceVersion = "" + } + noneCachedPod, err := ps.manager.GetKubeClient().GetActivePodList(resourceVersion) if err != nil { hwlog.RunLog.Errorf("get active pod from api server failed") return nil, err @@ -697,7 +702,7 @@ func (ps *PluginServer) doWithVolcanoSchedule(requestDevices []string) ([]string break } hwlog.RunLog.Warnf("no pod passed the filter, request device: %v, retry: %d", requestDevices, i) - time.Sleep(time.Second) + time.Sleep(common.ExpBackoffDuration(i, time.Minute*2)) } oldestPod := ps.getOldestPod(filteredPods) if oldestPod == nil {