From 61106608e0b1144baba9689f304dbcc8beb86003 Mon Sep 17 00:00:00 2001 From: "kyj@bowong.ai" Date: Tue, 15 Apr 2025 11:11:34 +0800 Subject: [PATCH] =?UTF-8?q?FIX=20autodl=E8=87=AA=E5=8A=A8=E5=88=9B?= =?UTF-8?q?=E5=BB=BA=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AutoDL/audodl_sdk.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/AutoDL/audodl_sdk.py b/AutoDL/audodl_sdk.py index 64277c5..670b508 100644 --- a/AutoDL/audodl_sdk.py +++ b/AutoDL/audodl_sdk.py @@ -11,6 +11,7 @@ req_instance_page_size = 1500 chk_instance_page_size = 1500 instances = {} LIM = 30 # 等待状态时间LIM*5s +START_LIM = 20 #Heygem脚本启动等待超时时间-10 def ssh_try(host,port,pwd): # 建立连接 @@ -21,10 +22,22 @@ def ssh_try(host,port,pwd): ssh = paramiko.SSHClient() ssh._transport = trans - # 剩下的就和上面一样了 ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - ssh_stdin, ssh_stdout, ssh_stderr = ssh.exec_command("/bin/bash -lc \"source /root/.bashrc && nohup python /root/AutoDL_pure_heygem.py > log.txt 2>&1 &\"") - + ssh_stdin, ssh_stdout, ssh_stderr = ssh.exec_command("bash -ic \"source /root/.bashrc && sleep 10 && ps -ef|grep heygem|grep -v grep && lsof -i:6006|wc -l\"", get_pty=True) + out = ssh_stdout.read().decode() + start_lim = START_LIM + if len(out.split("\n")[-2]) > 2: + trans.close() + raise RuntimeError(out.split("\n")[-2]) + while int(out.split("\n")[-2]) <= 1 and start_lim > 0: + ssh_stdin, ssh_stdout, ssh_stderr = ssh.exec_command("bash -ic \"sleep 2 && lsof -i:6006|wc -l\"") + out = ssh_stdout.read().decode() + loguru.logger.info("waiting for HeyGem server ready...") + start_lim -= 1 + if start_lim <= 0: + loguru.logger.error("HeyGem Server Start Timeout, Please check!") + else: + loguru.logger.success("HeyGem Server Start Success") # 关闭连接 trans.close() @@ -239,7 +252,7 @@ if __name__=="__main__": machines = get_autodl_machines() for m in machines: instance_uuid, domain = payg(m["region_name"], m["machine_id"]) - print(instance_uuid, "https://"+domain) + loguru.logger.success("instance_id:%s server_domain: https://%s" % (instance_uuid, domain)) # if instance_uuid: # instance_operate(instance_uuid, "power_off") # instance_operate(instance_uuid, "release")