FIX: repair the warm-pool feature of the AutoDL scheduler
commit bf4c66fc27
parent 1272a33718
@@ -33,15 +33,16 @@ class InstancePool:
         self.instances:List[Instance] = []
         self.executor = ThreadPoolExecutor(max_workers=os.cpu_count()*2)
         self.threads = []
+        self.intro_threads = []
 
-    def scale_instance(self, target_instance):
+    def scale_instance(self, target_instance, disable_shrink=True):
         if target_instance + self.buffer_instance < self.min_instance:
             return self._scale(self.min_instance)
         if target_instance + self.buffer_instance > self.max_instance:
             return self._scale(self.max_instance)
         if target_instance + self.buffer_instance == len(self.instances):
             return True
-        return self._scale(target_instance + self.buffer_instance)
+        return self._scale(target_instance + self.buffer_instance, disable_shrink)
 
     def remove_instance(self, instance:Instance):
         if instance_operate(instance.uuid, "power_off"):
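The warm pool keeps `buffer_instance` extra instances on top of the requested target and clamps the result to its configured bounds; with the new `disable_shrink` flag this path never drops below the current pool size. A minimal standalone sketch of that decision logic (the names `requested`, `current`, `buffer`, `min_instances` and `max_instances` are illustrative stand-ins for the pool's attributes, not the project's API):

def resolve_target(requested, current, buffer, min_instances, max_instances, disable_shrink=True):
    # Clamp the requested size plus the warm buffer into [min_instances, max_instances].
    target = max(min_instances, min(max_instances, requested + buffer))
    # With disable_shrink=True this path never goes below the current size;
    # idle instances are reclaimed by introspection() instead.
    if disable_shrink and target < current:
        return current
    return target

# Example: 3 pending tasks, a warm buffer of 1, pool currently at 6 instances.
print(resolve_target(3, current=6, buffer=1, min_instances=1, max_instances=8))  # -> 6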
@@ -73,21 +74,34 @@ class InstancePool:
 
     def introspection(self):
         # Stop timed-out instances (running timeout and idle timeout)
+        before = len(self.instances)
         instance_copy = copy.deepcopy(self.instances)
+        flag = False
         for instance in instance_copy:
             if instance.active:
                 if (time.time() - instance.last_active_time) > self.timeout:
-                    self.threads.append(self.executor.submit(self.remove_instance, instance=instance))
+                    flag = True
+                    self.intro_threads.append(self.executor.submit(self.remove_instance, instance=instance))
             else:
                 if (time.time() - instance.last_active_time) > self.scaledown_window:
-                    self.threads.append(self.executor.submit(self.remove_instance, instance=instance))
+                    flag = True
+                    self.intro_threads.append(self.executor.submit(self.remove_instance, instance=instance))
+        while len(self.intro_threads) > 0:
+            for t in self.intro_threads:
+                t.result(timeout=self.timeout//2)
+                self.intro_threads.remove(t)
+        after = len(self.instances)
+        if flag:
+            loguru.logger.info("Instance Num Before Introspecting %d After Introspecting %d" % (before, after))
 
-    def _scale(self, target_instance:int):
-        loguru.logger.info("Instance Num Before Scaling %d ; Target %d" % (len(self.instances), target_instance))
+    def _scale(self, target_instance:int, disable_shrink=True):
         self.introspection()
         # Adjust the number of instances
         instance_copy = copy.deepcopy(self.instances)
         dest = target_instance - len(instance_copy)
+        if (disable_shrink and dest < 0) or dest == 0:
+            return True
+        loguru.logger.info("Instance Num Before Scaling %d ; Target %d" % (len(self.instances), target_instance))
         if dest < 0:
             dest = abs(dest)
             for instance in instance_copy:
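The power-off jobs submitted during introspection now go into their own `intro_threads` list and are drained before the method returns, so a scaling call that follows sees the already-reduced pool. A self-contained sketch of that wait pattern (the executor, `power_off` and the instance names below are placeholders, not the project's code):

from concurrent.futures import ThreadPoolExecutor
import time

def power_off(name):
    time.sleep(0.1)              # stand-in for the real shutdown request
    return name

executor = ThreadPoolExecutor(max_workers=4)
futures = [executor.submit(power_off, n) for n in ("a", "b", "c")]

# Drain every future with a per-call timeout before moving on.
while futures:
    for f in list(futures):      # iterate over a copy so removal is safe
        f.result(timeout=5)
        futures.remove(f)
print("all idle instances powered off")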
@@ -1,4 +1,5 @@
 import time
+import traceback
 import uuid
 from concurrent.futures.thread import ThreadPoolExecutor
 from typing import Union
@@ -18,6 +19,8 @@ class RunningPool:
     def run(self, instance_id, uid, base_url:str, video_file:Union[UploadFile,str], audio_file:Union[UploadFile,str]) -> Union[bool, None]:
         try:
             code = self._req(base_url, video_file, audio_file)
+            if not code:
+                raise Exception("[Code] is None")
         except Exception as e:
             loguru.logger.error("%s Task Submit Failed: %s" % (uid, e))
             return None
@@ -41,6 +44,7 @@ class RunningPool:
                 return {"status": resp.json()['status'], "msg": resp.json()['msg'],
                         "instance_id": self.tasks[uid]["instance_id"]}
             except Exception as e:
+                traceback.print_exc()
                 loguru.logger.error("%s Get Result Failed: %s" % (uid, e))
                 return None
         else:
@@ -53,20 +53,17 @@ class Server:
         return {"status": "queuing", "msg":""}
 
     def introspect_instance(self):
-        loguru.logger.info("Introspecting worker started")
+        loguru.logger.info("Introspecting worker started || Scaledown Window: %ds" % self.instance_pool.scaledown_window)
         while True:
             self.instance_pool.introspection()
-            time.sleep(0.5)
+            time.sleep(1)
 
     def scaling_worker(self):
         loguru.logger.info("Scaling worker started")
         try:
-            last_target = 0
             while True:
                 # Submit tasks
-                if last_target != self.waiting_queue.get_size()+self.running_pool.get_running_size():
-                    last_target = self.waiting_queue.get_size()+self.running_pool.get_running_size()
-                    self.instance_pool.scale_instance(last_target)
+                self.instance_pool.scale_instance(self.waiting_queue.get_size()+self.running_pool.get_running_size(), disable_shrink=True)
                 for instance in self.instance_pool.instances:
                     if not instance.active:
                         # Take a task from the waiting queue
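With `last_target` gone, the scaling worker re-evaluates demand on every tick and only ever grows the pool (disable_shrink=True); shrinking is left entirely to the introspection worker. A hedged sketch of that split, where `pool`, `queue_depth` and `running` stand in for the server's instance pool and queue accessors:

import time

def scaling_loop(pool, queue_depth, running, interval=1.0):
    # Grow-only: demand is the number of queued plus running tasks.
    while True:
        pool.scale_instance(queue_depth() + running(), disable_shrink=True)
        time.sleep(interval)

def introspection_loop(pool, interval=1.0):
    # Shrink-only: idle or timed-out instances are powered off here.
    while True:
        pool.introspection()
        time.sleep(interval)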