diff --git a/pkg/common/constants.go b/pkg/common/constants.go index d580943b1d376321c3f94ee224408b9e15da2728..6f32126f3669848779a63665e906480b25dd87ae 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -36,7 +36,7 @@ const ( // RetryUpdateCount is max number of retry resource update RetryUpdateCount = 3 // GetPodFromInformerTime is max number of get pod from informer - GetPodFromInformerTime = 3 + GetPodFromInformerTime = 8 // MaxDeviceNameLen max length of device name, like "Ascend310P-4c.3cpu-100-0" MaxDeviceNameLen = 50 // MaxGRPCRecvMsgSize 4MB diff --git a/pkg/common/utils.go b/pkg/common/utils.go new file mode 100644 index 0000000000000000000000000000000000000000..30a01e8a1c1e3096bdff0e9be8e9b27dfefb9821 --- /dev/null +++ b/pkg/common/utils.go @@ -0,0 +1,30 @@ +/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common a series of common function +package common + +import ( + "time" +) + +// exponent back off +func ExpBackoffDuration(retryTimes int, maxSleepTime time.Duration) time.Duration { + secondCount := (1 << retryTimes) + sleepTime := time.Duration(secondCount) * time.Second + if sleepTime > maxSleepTime { + sleepTime = maxSleepTime + } + return sleepTime +} diff --git a/pkg/common/utils_test.go b/pkg/common/utils_test.go new file mode 100644 index 0000000000000000000000000000000000000000..b9d3170546a1be82d2301aef9d16f3c1f1bd14f3 --- /dev/null +++ b/pkg/common/utils_test.go @@ -0,0 +1,37 @@ +/* Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common a series of common function +package common + +import ( + "testing" + "time" + + "github.com/smartystreets/goconvey/convey" +) + +// TestExpBackoffDuration for test ExpBackoffDuration +func TestExpBackoffDuration(t *testing.T) { + convey.Convey("test ExpBackoffDuration", t, func() { + convey.Convey("bigger than max sleep duration", func() { + sleepTime := ExpBackoffDuration(10, time.Minute) + convey.So(sleepTime, convey.ShouldEqual, time.Second*60) + }) + convey.Convey("small than max sleep duration", func() { + sleepTime := ExpBackoffDuration(1, time.Minute) + convey.So(sleepTime, convey.ShouldEqual, time.Second*2) + }) + }) +} diff --git a/pkg/kubeclient/kubeclient.go b/pkg/kubeclient/kubeclient.go index a15eaac23e72ba606517bf72f975a1d24706f9b7..1c31089b4ec4b0afcc4cd65a7d30fbf5a889cc62 100644 --- a/pkg/kubeclient/kubeclient.go +++ b/pkg/kubeclient/kubeclient.go @@ -123,13 +123,13 @@ func (ki *ClientK8s) PatchPod(pod *v1.Pod, data []byte) (*v1.Pod, error) { } // GetActivePodList is to get active pod list -func (ki *ClientK8s) GetActivePodList() ([]v1.Pod, error) { +func (ki *ClientK8s) GetActivePodList(resourceVersion string) ([]v1.Pod, error) { fieldSelector, err := fields.ParseSelector("spec.nodeName=" + ki.NodeName + "," + "status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed)) if err != nil { return nil, err } - podList, err := ki.getPodListByCondition(fieldSelector) + podList, err := ki.getPodListByCondition(fieldSelector, resourceVersion) if err != nil { return nil, err } @@ -139,7 +139,7 @@ func (ki *ClientK8s) GetActivePodList() ([]v1.Pod, error) { // GetAllPodList get pod list by field selector func (ki *ClientK8s) GetAllPodList() (*v1.PodList, error) { selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": ki.NodeName}) - v1PodList, err := ki.getPodListByCondition(selector) + v1PodList, err := ki.getPodListByCondition(selector, "0") if err != nil { hwlog.RunLog.Errorf("get pod list failed, err: %v", err) return nil, err @@ -152,10 +152,10 @@ func (ki *ClientK8s) GetAllPodList() (*v1.PodList, error) { } // getPodListByCondition get pod list by field selector -func (ki *ClientK8s) getPodListByCondition(selector fields.Selector) (*v1.PodList, error) { +func (ki *ClientK8s) getPodListByCondition(selector fields.Selector, resourceVersion string) (*v1.PodList, error) { newPodList, err := ki.Clientset.CoreV1().Pods(v1.NamespaceAll).List(context.Background(), metav1.ListOptions{ FieldSelector: selector.String(), - ResourceVersion: "0", + ResourceVersion: resourceVersion, }) if err != nil && strings.Contains(err.Error(), common.ApiServerPort) { ki.IsApiErr = true diff --git a/pkg/server/plugin.go b/pkg/server/plugin.go index c3908bcbfdbac34883002965e57b44b12933ff8d..3f35fd147d98a4fe0e7a3d5600e774a4061094dd 100644 --- a/pkg/server/plugin.go +++ b/pkg/server/plugin.go @@ -681,10 +681,15 @@ func (ps *PluginServer) doWithVolcanoSchedule(requestDevices []string) ([]string } var filteredPods []v1.Pod var allPods []v1.Pod + resourceVersion := "0" for i := 0; i < common.GetPodFromInformerTime; i++ { - if i == common.GetPodFromInformerTime-1 { - // in the last time of retry, get the pod from api server instead of cache - noneCachedPod, err := ps.manager.GetKubeClient().GetActivePodList() + // if local cache not satisfy, get pod from api server + if i > 0 { + // if api server cache not satisfy, get pod from etcd instead of api server cache + if i > 1 { + resourceVersion = "" + } + noneCachedPod, err := ps.manager.GetKubeClient().GetActivePodList(resourceVersion) if err != nil { hwlog.RunLog.Errorf("get active pod from api server failed") return nil, err @@ -698,7 +703,7 @@ func (ps *PluginServer) doWithVolcanoSchedule(requestDevices []string) ([]string break } hwlog.RunLog.Warnf("no pod passed the filter, request device: %v, retry: %d", requestDevices, i) - time.Sleep(time.Second) + time.Sleep(common.ExpBackoffDuration(i, time.Minute*2)) } oldestPod := ps.getOldestPod(filteredPods) if oldestPod == nil {