FIX AutoDL调度修复暖池功能

This commit is contained in:
kyj@bowong.ai 2025-04-17 12:00:50 +08:00
parent 1272a33718
commit bf4c66fc27
3 changed files with 27 additions and 12 deletions

View File

@ -33,15 +33,16 @@ class InstancePool:
self.instances:List[Instance] = [] self.instances:List[Instance] = []
self.executor = ThreadPoolExecutor(max_workers=os.cpu_count()*2) self.executor = ThreadPoolExecutor(max_workers=os.cpu_count()*2)
self.threads = [] self.threads = []
self.intro_threads = []
def scale_instance(self, target_instance): def scale_instance(self, target_instance, disable_shrink=True):
if target_instance + self.buffer_instance < self.min_instance: if target_instance + self.buffer_instance < self.min_instance:
return self._scale(self.min_instance) return self._scale(self.min_instance)
if target_instance + self.buffer_instance > self.max_instance: if target_instance + self.buffer_instance > self.max_instance:
return self._scale(self.max_instance) return self._scale(self.max_instance)
if target_instance + self.buffer_instance == len(self.instances): if target_instance + self.buffer_instance == len(self.instances):
return True return True
return self._scale(target_instance + self.buffer_instance) return self._scale(target_instance + self.buffer_instance, disable_shrink)
def remove_instance(self, instance:Instance): def remove_instance(self, instance:Instance):
if instance_operate(instance.uuid, "power_off"): if instance_operate(instance.uuid, "power_off"):
@ -73,21 +74,34 @@ class InstancePool:
def introspection(self): def introspection(self):
# 停止超时实例(运行超时和无任务超时) # 停止超时实例(运行超时和无任务超时)
before=len(self.instances)
instance_copy = copy.deepcopy(self.instances) instance_copy = copy.deepcopy(self.instances)
flag = False
for instance in instance_copy: for instance in instance_copy:
if instance.active: if instance.active:
if (time.time() - instance.last_active_time) > self.timeout: if (time.time() - instance.last_active_time) > self.timeout:
self.threads.append(self.executor.submit(self.remove_instance, instance=instance)) flag = True
self.intro_threads.append(self.executor.submit(self.remove_instance, instance=instance))
else: else:
if (time.time() - instance.last_active_time) > self.scaledown_window: if (time.time() - instance.last_active_time) > self.scaledown_window:
self.threads.append(self.executor.submit(self.remove_instance, instance=instance)) flag = True
self.intro_threads.append(self.executor.submit(self.remove_instance, instance=instance))
while len(self.intro_threads) > 0:
for t in self.intro_threads:
t.result(timeout=self.timeout//2)
self.intro_threads.remove(t)
after = len(self.instances)
if flag:
loguru.logger.info("Instance Num Before Introspecting %d After Introspecting %d" % (before, after))
def _scale(self, target_instance:int): def _scale(self, target_instance:int, disable_shrink=True):
loguru.logger.info("Instance Num Before Scaling %d ; Target %d" % (len(self.instances), target_instance))
self.introspection() self.introspection()
# 调整实例数量 # 调整实例数量
instance_copy = copy.deepcopy(self.instances) instance_copy = copy.deepcopy(self.instances)
dest = target_instance - len(instance_copy) dest = target_instance - len(instance_copy)
if (disable_shrink and dest < 0) or dest == 0:
return True
loguru.logger.info("Instance Num Before Scaling %d ; Target %d" % (len(self.instances), target_instance))
if dest < 0: if dest < 0:
dest = abs(dest) dest = abs(dest)
for instance in instance_copy: for instance in instance_copy:

View File

@ -1,4 +1,5 @@
import time import time
import traceback
import uuid import uuid
from concurrent.futures.thread import ThreadPoolExecutor from concurrent.futures.thread import ThreadPoolExecutor
from typing import Union from typing import Union
@ -18,6 +19,8 @@ class RunningPool:
def run(self, instance_id, uid, base_url:str, video_file:Union[UploadFile,str], audio_file:Union[UploadFile,str]) -> Union[bool, None]: def run(self, instance_id, uid, base_url:str, video_file:Union[UploadFile,str], audio_file:Union[UploadFile,str]) -> Union[bool, None]:
try: try:
code = self._req(base_url, video_file, audio_file) code = self._req(base_url, video_file, audio_file)
if not code:
raise Exception("[Code] is None")
except Exception as e: except Exception as e:
loguru.logger.error("%s Task Submit Failed: %s" % (uid, e)) loguru.logger.error("%s Task Submit Failed: %s" % (uid, e))
return None return None
@ -41,6 +44,7 @@ class RunningPool:
return {"status": resp.json()['status'], "msg": resp.json()['msg'], return {"status": resp.json()['status'], "msg": resp.json()['msg'],
"instance_id": self.tasks[uid]["instance_id"]} "instance_id": self.tasks[uid]["instance_id"]}
except Exception as e: except Exception as e:
traceback.print_exc()
loguru.logger.error("%s Get Result Failed: %s" % (uid, e)) loguru.logger.error("%s Get Result Failed: %s" % (uid, e))
return None return None
else: else:

View File

@ -53,20 +53,17 @@ class Server:
return {"status": "queuing", "msg":""} return {"status": "queuing", "msg":""}
def introspect_instance(self): def introspect_instance(self):
loguru.logger.info("Introspecting worker started") loguru.logger.info("Introspecting worker started || Scaledown Window: %ds" % self.instance_pool.scaledown_window)
while True: while True:
self.instance_pool.introspection() self.instance_pool.introspection()
time.sleep(0.5) time.sleep(1)
def scaling_worker(self): def scaling_worker(self):
loguru.logger.info("Scaling worker started") loguru.logger.info("Scaling worker started")
try: try:
last_target = 0
while True: while True:
# 提交任务 # 提交任务
if last_target != self.waiting_queue.get_size()+self.running_pool.get_running_size(): self.instance_pool.scale_instance(self.waiting_queue.get_size()+self.running_pool.get_running_size(), disable_shrink=True)
last_target = self.waiting_queue.get_size()+self.running_pool.get_running_size()
self.instance_pool.scale_instance(last_target)
for instance in self.instance_pool.instances: for instance in self.instance_pool.instances:
if not instance.active: if not instance.active:
# 从等待队列取出任务 # 从等待队列取出任务