import os
import time
from typing import Literal, Union, Any

import logger
import loguru
import paramiko
import requests

# SECURITY NOTE(review): a bearer token is hard-coded in source. It is kept as
# the fallback so existing deployments keep working, but it should be supplied
# through the AUTODL_TOKEN environment variable and the literal rotated/removed.
token = os.environ.get(
    "AUTODL_TOKEN",
    "eyJhbGciOiJFUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1aWQiOjMzNjM5MSwidXVpZCI6ImU2MDU0ZGI4LTlhN2UtNDllNC1hNDQ1LWI4N2M1NGViMjU4ZCIsImlzX2FkbWluIjpmYWxzZSwiYmFja3N0YWdlX3JvbGUiOiIiLCJpc19zdXBlcl9hZG1pbiI6ZmFsc2UsInN1Yl9uYW1lIjoid3F5QGI4N2M1NGViMjU4ZCIsInRlbmFudCI6IiIsInVwayI6IiJ9.4MV4P1feiUmrrzFbtTQpNQjvYyezPdaLxRJ79y0VyRAxR0aS5NQJGJPxa-6wuqsgzY-E1rvf5S8FCY92ZnViFQ",
)
req_instance_page_size = 1500
chk_instance_page_size = 1500
instances = {}
LIM = 30  # Number of status polls; each poll sleeps 5 s, so the wait is LIM * 5 s.
START_LIM = 20  # Number of 2 s readiness polls for HeyGem, after the initial 10 s sleep.


def _auth_headers() -> dict:
    """Return the HTTP headers sent with every AutoDL API request."""
    return {
        "Authorization": token,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
        "Host": "www.autodl.com",
    }


def _second_to_last_line(out: str) -> str:
    """Return the second-to-last "\\n"-separated piece of *out*, or "" if absent."""
    lines = out.split("\n")
    return lines[-2] if len(lines) >= 2 else ""


def _listener_count(out: str) -> int:
    """Parse the `wc -l` count from command output; unparsable output counts as 0."""
    try:
        return int(_second_to_last_line(out))
    except ValueError:
        return 0


def ssh_try(host, port, pwd) -> None:
    """Log in as root over SSH and wait until something listens on port 6006.

    The first remote command sleeps 10 s, requires a running ``heygem``
    process and prints the line count of ``lsof -i:6006``. While that count
    is <= 1 the count is re-read every 2 s, at most ``START_LIM`` times.
    The outcome is logged; a startup timeout is logged, not raised.

    Args:
        host: SSH host name or address.
        port: SSH port (converted with ``int``).
        pwd: Password of the ``root`` user.

    Raises:
        RuntimeError: if the last output line of the first command is longer
            than two characters, i.e. it is not a small line count; the line
            itself is used as the error message.
    """
    # Establish the connection.
    trans = paramiko.Transport((host, int(port)))
    try:
        trans.connect(username="root", password=pwd)

        # Make the SSHClient use the transport created above.
        ssh = paramiko.SSHClient()
        ssh._transport = trans
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())

        ssh_stdin, ssh_stdout, ssh_stderr = ssh.exec_command(
            "bash -ic \"source /root/.bashrc && sleep 10 && ps -ef|grep heygem|grep -v grep && lsof -i:6006|wc -l\"",
            get_pty=True,
        )
        out = ssh_stdout.read().decode()
        start_lim = START_LIM

        tail = _second_to_last_line(out)
        if len(tail) > 2:
            raise RuntimeError(tail)

        while _listener_count(out) <= 1 and start_lim > 0:
            ssh_stdin, ssh_stdout, ssh_stderr = ssh.exec_command(
                "bash -ic \"sleep 2 && lsof -i:6006|wc -l\""
            )
            out = ssh_stdout.read().decode()
            loguru.logger.info("waiting for HeyGem server ready...")
            start_lim -= 1

        if start_lim <= 0:
            loguru.logger.error("HeyGem Server Start Timeout, Please check!")
        else:
            loguru.logger.success("HeyGem Server Start Success")
    finally:
        # Close the connection on every path, including errors.
        trans.close()


def get_autodl_machines() -> Union[list, None]:
    """Fetch the rentable pay-as-you-go machines from the AutoDL API.

    All result pages are requested, then machines are kept only when they
    are healthy (``health_status == 0``), have ``gpu_order_num > 0``, report
    a highest CUDA version above 12, are ``payg``, have an empty
    ``rent_mode`` and no ``user_visible_limit``.

    Returns:
        A list of machine summary dicts sorted by ``gpu_order_num``
        descending, or ``None`` if any request returns a non-200 status.
    """
    url = "https://www.autodl.com/api/v1/sub_user/user/machine/list"
    headers = _auth_headers()
    index = 1
    payload = {
        "charge_type": "payg",
        "region_sign": "",
        "gpu_type_name": ["RTX 4090", "RTX 4090D", "RTX 3090", "RTX 3080", "RTX 3080x2", "RTX 3080 Ti", "RTX 3060", "RTX A4000", "RTX 2080 Ti", "RTX 2080 Ti x2", "GTX 1080 Ti"],
        "machine_tag_name": [],
        "gpu_idle_num": 1,
        "mount_net_disk": False,
        "instance_disk_size_order": "",
        "date_range": "",
        "date_from": "",
        "date_to": "",
        "page_index": index,
        "page_size": req_instance_page_size,
        "pay_price_order": "",
        "gpu_idle_type": "",
        "default_order": False,
        "region_sign_list": ["nm-B1", "nm-B2", "west-B", "west-C", "west-X", "bj-B1", "beijing-A", "beijing-B", "beijing-D", "beijing-E"],
        "cpu_arch": ["x86"],
        "chip_corp": ["nvidia"],
        "machine_id": "",
    }

    loguru.logger.info("Req Machine index {}".format(index))
    rsp = requests.post(url, json=payload, headers=headers)
    if rsp.status_code != 200:
        loguru.logger.error("Get Machines Req Error")
        return None

    machine_list = rsp.json()
    loguru.logger.info("Machine Result Total {}".format(machine_list["data"]["result_total"]))

    # Collect the remaining pages into the first page's list.
    while index < machine_list["data"]["max_page"]:
        index += 1
        loguru.logger.info("Req Machine index {}/{}".format(index, machine_list["data"]["max_page"]))
        payload["page_index"] = index
        rsp = requests.post(url, json=payload, headers=headers)
        if rsp.status_code != 200:
            loguru.logger.error("Get Machines Req Error")
            return None
        machine_list["data"]["list"].extend(rsp.json()["data"]["list"])

    machines = [
        {
            "machine_id": machine["machine_id"],
            "region_name": machine["region_name"],
            "machine_alias": machine["machine_alias"],
            "gpu_name": machine["gpu_name"],
            "gpu_order_num": machine["gpu_order_num"],
            "gpu_number": machine["gpu_number"],
            "region_sign": machine["region_sign"],
        }
        for machine in machine_list["data"]["list"]
        if machine["health_status"] == 0
        and machine["gpu_order_num"] > 0
        and float(machine["highest_cuda_version"]) > 12.
        and machine["payg"] == True  # noqa: E712 - keep the original equality test
        and machine["rent_mode"] == ""
        and not machine["user_visible_limit"]
    ]
    return sorted(machines, key=lambda machine: machine["gpu_order_num"], reverse=True)


def payg(region_name: str, machine_id: str) -> tuple[Any, Any] | None:
    """Create a pay-as-you-go instance and wait until HeyGem is reachable.

    After a successful create request the instance status is polled every
    5 s, at most ``LIM`` times. Once it is ``running``, ``ssh_try`` is called
    with the instance's SSH details.

    Args:
        region_name: Region display name; its first two characters select
            the image to deploy.
        machine_id: Id of the machine to create the instance on.

    Returns:
        ``(instance_uuid, tensorboard_domain)`` on success, otherwise
        ``None`` (unknown region, request error, API error, or timeout).

    Raises:
        RuntimeError: propagated from ``ssh_try``.
    """
    region_image = {
        "西北": ["hub.kce.ksyun.com/autodl-image/miniconda:cuda11.8-cudnn8-devel-ubuntu20.04-py38", "image-232ac04d3b"],
        "内蒙": ["hub.kce.ksyun.com/autodl-image/miniconda:cuda11.8-cudnn8-devel-ubuntu20.04-py38", "image-06814c02d1"],
        "北京": ["hub.kce.ksyun.com/autodl-image/miniconda:cuda11.8-cudnn8-devel-ubuntu20.04-py38", "image-e5334cc4f3"],
    }
    image_info = region_image.get(region_name[:2])
    if image_info is None:
        # Previously an unknown region prefix raised KeyError.
        loguru.logger.error("Create Payg Instance Error: Unsupported Region[%s]" % region_name)
        return None

    headers = _auth_headers()
    payload = {
        "instance_info": {
            "machine_id": machine_id,
            "charge_type": "payg",
            "req_gpu_amount": 1,
            "image": image_info[0],
            "private_image_uuid": image_info[1],
            "reproduction_uuid": "",
            "instance_name": "",
            "expand_data_disk": 0,
            "reproduction_id": 0,
        },
        "price_info": {
            "coupon_id_list": [],
            "machine_id": machine_id,
            "charge_type": "payg",
            "duration": 1,
            "num": 1,
            "expand_data_disk": 0,
        },
    }

    loguru.logger.info("Try Create Payg Container on Machine {}/{}".format(region_name, machine_id))
    rsp = requests.post("https://www.autodl.com/api/v1/sub_user/order/instance/create/payg", json=payload, headers=headers)
    if rsp.status_code != 200:
        loguru.logger.error("Create Payg Instance Error: Status Code[%s]" % rsp.status_code)
        return None

    j = rsp.json()
    if j["code"] != "Success":
        loguru.logger.error("Create Payg Instance Error: %s" % j['msg'])
        return None

    domain = None
    lim = LIM
    while lim > 0:
        time.sleep(5)
        # check_status returns None while the instance is not listed yet or
        # when the request fails; treat that as "not running yet".
        info = check_status(j['data'])
        if info is not None and info[0] == "running":
            _, host, port, pwd, domain = info
            ssh_try(host, port, pwd)
            break
        lim = lim - 1

    if lim > 0:
        loguru.logger.success("Create Payg Instance Success: %s" % j['data'])
        return j['data'], domain

    loguru.logger.error("Create Payg Instance Error: Wait for Created Timeout, Please Check!!! instance_uuid(%s)" % j['data'])
    return None


def instance_operate(instance_uuid: str, operation: Literal["power_off", "power_on", "release"]) -> bool:
    """Run a power or release operation on an instance and wait for it.

    For ``power_off`` and ``power_on`` the instance status is polled every
    5 s, at most ``LIM`` times, until it reaches ``shutdown`` or ``running``
    respectively. ``release`` is not polled.

    Args:
        instance_uuid: Uuid of the instance to operate on.
        operation: Operation name; it is appended to the API URL.

    Returns:
        ``True`` if the API accepted the operation and the target status (if
        any) was reached in time, otherwise ``False``.
    """
    dest_dict = {
        "power_off": "shutdown",
        "power_on": "running",
    }
    headers = _auth_headers()
    payload = {"instance_uuid": instance_uuid}

    rsp = requests.post("https://www.autodl.com/api/v1/sub_user/instance/%s" % operation, json=payload, headers=headers)
    if rsp.status_code != 200:
        loguru.logger.error("Operate[%s] Instance Error: Status Code[%s]" % (operation, rsp.status_code))
        return False

    j = rsp.json()
    if j["code"] != "Success":
        loguru.logger.error("Operate[%s] Instance Error: %s" % (operation, j['msg']))
        return False

    lim = LIM
    if operation in dest_dict:
        while lim > 0:
            time.sleep(5)
            # A None result (lookup failed / not found) counts as "not there yet".
            info = check_status(instance_uuid)
            if info is not None and info[0] == dest_dict[operation]:
                break
            lim = lim - 1

    if lim > 0:
        loguru.logger.success("Operate[%s] Instance Success" % operation)
        return True

    loguru.logger.error("Operate[%s] Instance Error: Timeout, Please Check!!! instance_uuid(%s)" % (operation, instance_uuid))
    return False


def check_status(instance_uuid: str) -> tuple[Any, Any, Any, Any, Any] | None:
    """Look up one instance in the account's instance list.

    Args:
        instance_uuid: Uuid of the instance to find.

    Returns:
        ``(status, proxy_host, ssh_port, root_password, tensorboard_domain)``
        for the matching instance, or ``None`` if a request returns a non-200
        status or no instance has that uuid.
    """
    url = "https://www.autodl.com/api/v1/sub_user/instance"
    headers = _auth_headers()
    index = 1
    payload = {
        "date_from": "",
        "date_to": "",
        "page_index": index,
        "page_size": chk_instance_page_size,
        "status": [],
        "charge_type": [],
    }

    rsp = requests.post(url, json=payload, headers=headers)
    if rsp.status_code != 200:
        loguru.logger.error("Get Instance Req Error")
        return None

    instance_list = rsp.json()
    while index < instance_list["data"]["max_page"]:
        # Advance to the next page; without this the loop never terminates.
        index += 1
        payload["page_index"] = index
        rsp = requests.post(url, json=payload, headers=headers)
        if rsp.status_code != 200:
            loguru.logger.error("Get Instance Req Error")
            return None
        instance_list["data"]["list"].extend(rsp.json()["data"]["list"])

    for item in instance_list["data"]["list"]:
        if item["uuid"] == instance_uuid:
            loguru.logger.info("Instance {} Status {}".format(instance_uuid, item["status"]))
            return item["status"], item["proxy_host"], item["ssh_port"], item["root_password"], item["tensorboard_domain"]

    loguru.logger.warning("Instance {} Not Found".format(instance_uuid))
    return None


if __name__ == "__main__":
    machines = get_autodl_machines()
    if not machines:
        loguru.logger.error("No Available Machine")
    # Only the first (highest gpu_order_num) machine is attempted.
    for m in machines or []:
        result = payg(m["region_name"], m["machine_id"])
        if result is not None:
            instance_uuid, domain = result
            loguru.logger.success("instance_id:%s server_domain: https://%s" % (instance_uuid, domain))
            # To tear the instance down afterwards:
            # if instance_uuid:
            #     instance_operate(instance_uuid, "power_off")
            #     instance_operate(instance_uuid, "release")
        break