# File listing metadata: 259 lines, 11 KiB, Python.
import os
import time
from typing import Literal, Union, Any

import logger
import loguru
import paramiko
import requests


# Bearer token sent in the "Authorization" header of every AutoDL API call.
# NOTE(review): a credential is committed in source; prefer supplying it via
# the AUTODL_TOKEN environment variable (the literal below is only kept as a
# backward-compatible fallback) and consider rotating the embedded one.
token = os.environ.get(
    "AUTODL_TOKEN",
    "eyJhbGciOiJFUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1aWQiOjMzNjM5MSwidXVpZCI6ImU2MDU0ZGI4LTlhN2UtNDllNC1hNDQ1LWI4N2M1NGViMjU4ZCIsImlzX2FkbWluIjpmYWxzZSwiYmFja3N0YWdlX3JvbGUiOiIiLCJpc19zdXBlcl9hZG1pbiI6ZmFsc2UsInN1Yl9uYW1lIjoid3F5QGI4N2M1NGViMjU4ZCIsInRlbmFudCI6IiIsInVwayI6IiJ9.4MV4P1feiUmrrzFbtTQpNQjvYyezPdaLxRJ79y0VyRAxR0aS5NQJGJPxa-6wuqsgzY-E1rvf5S8FCY92ZnViFQ",
)

# "page_size" used when listing rentable machines.
req_instance_page_size = 1500

# "page_size" used when listing the account's instances.
chk_instance_page_size = 1500

instances = {}

# Number of status polls before giving up; polls are 5 s apart (LIM * 5 s).
LIM = 30

# Number of 2 s readiness polls for the HeyGem script performed by ssh_try
# (the initial 10 s sleep of the first probe is not counted).
START_LIM = 20


def _heygem_listener_count(out):
    """Return the integer printed on the last non-empty line of ``out``.

    ``out`` is the decoded stdout of a probe whose final command is
    ``lsof -i:6006|wc -l``.  Raises ``RuntimeError`` carrying that last line
    when it is not an integer (for example a shell error message), or a
    generic message when the probe printed nothing at all.
    """
    lines = out.strip().splitlines()
    last = lines[-1].strip() if lines else ""
    try:
        return int(last)
    except ValueError:
        raise RuntimeError(last or "HeyGem readiness probe produced no output") from None


def ssh_try(host, port, pwd):
    """Log in as root over SSH and wait until something listens on port 6006.

    The first probe sources ``/root/.bashrc``, sleeps 10 s, requires a
    ``heygem`` process to show up in ``ps`` and counts the lines printed by
    ``lsof -i:6006``.  While that count is <= 1 the count is re-read every 2 s,
    at most ``START_LIM`` times.  The outcome is only logged; nothing is
    returned.

    Args:
        host: SSH host name or address.
        port: SSH port (anything accepted by ``int()``).
        pwd: root password.

    Raises:
        RuntimeError: when a probe's last output line is not an integer; the
            line itself is used as the message.
    """
    first_probe = "bash -ic \"source /root/.bashrc && sleep 10 && ps -ef|grep heygem|grep -v grep && lsof -i:6006|wc -l\""
    poll_probe = "bash -ic \"sleep 2 && lsof -i:6006|wc -l\""

    # Establish the connection.
    trans = paramiko.Transport((host, int(port)))
    try:
        trans.connect(username="root", password=pwd)

        # Point the SSHClient at the already-authenticated transport.
        ssh = paramiko.SSHClient()
        ssh._transport = trans
        # NOTE(review): the policy is presumably only consulted by
        # SSHClient.connect(), which is not used here - confirm before removing.
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())

        _, stdout, _ = ssh.exec_command(first_probe, get_pty=True)
        listeners = _heygem_listener_count(stdout.read().decode())

        polls_left = START_LIM
        while listeners <= 1 and polls_left > 0:
            _, stdout, _ = ssh.exec_command(poll_probe)
            listeners = _heygem_listener_count(stdout.read().decode())
            loguru.logger.info("waiting for HeyGem server ready...")
            polls_left -= 1

        # Decide on the observed count, so a success on the very last poll is
        # not misreported as a timeout.
        if listeners > 1:
            loguru.logger.success("HeyGem Server Start Success")
        else:
            loguru.logger.error("HeyGem Server Start Timeout, Please check!")
    finally:
        # Close the connection on every path, including errors.
        trans.close()


def get_autodl_machines() -> Union[list, None]:
    """List pay-as-you-go machines that can currently be rented.

    Requests every page of the AutoDL machine list and keeps the machines
    whose ``health_status`` is 0, that have ``gpu_order_num`` > 0, a
    ``highest_cuda_version`` above 12, ``payg`` enabled, an empty
    ``rent_mode`` and no ``user_visible_limit``.

    Returns:
        A list of dicts (``machine_id``, ``region_name``, ``machine_alias``,
        ``gpu_name``, ``gpu_order_num``, ``gpu_number``, ``region_sign``)
        sorted by ``gpu_order_num`` in descending order, or ``None`` when a
        request does not return HTTP 200 or the response carries no data.
    """
    url = "https://www.autodl.com/api/v1/sub_user/user/machine/list"
    request_timeout = 30  # seconds; without it a stalled connection blocks forever
    headers = {
        "Authorization": token,
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
        "Host": "www.autodl.com"
    }
    index = 1
    payload = {
        "charge_type":"payg",
        "region_sign":"",
        "gpu_type_name":["RTX 4090", "RTX 4090D", "RTX 3090", "RTX 3080", "RTX 3080x2", "RTX 3080 Ti", "RTX 3060", "RTX A4000", "RTX 2080 Ti", "RTX 2080 Ti x2", "GTX 1080 Ti"],
        "machine_tag_name":[],
        "gpu_idle_num":1,
        "mount_net_disk":False,
        "instance_disk_size_order":"",
        "date_range":"",
        "date_from":"",
        "date_to":"",
        "page_index":index,
        "page_size":req_instance_page_size,
        "pay_price_order":"",
        "gpu_idle_type":"",
        "default_order":False,
        "region_sign_list":["nm-B1","nm-B2", "west-B", "west-C", "west-X", "bj-B1", "beijing-A", "beijing-B", "beijing-D", "beijing-E"],
        "cpu_arch":["x86"],
        "chip_corp":["nvidia"],
        "machine_id":""
    }

    loguru.logger.info("Req Machine index {}".format(index))
    rsp = requests.post(url, json=payload, headers=headers, timeout=request_timeout)
    if rsp.status_code != 200:
        loguru.logger.error("Get Machines Req Error")
        return None

    body = rsp.json()
    data = body.get("data") if isinstance(body, dict) else None
    if not data:
        # HTTP 200 without a usable "data" object: report it instead of
        # failing with a KeyError/TypeError further down.
        loguru.logger.error("Get Machines Req Error")
        return None

    loguru.logger.info("Machine Result Total {}".format(data["result_total"]))
    max_page = data["max_page"]
    listed = list(data.get("list") or [])

    # Fetch the remaining pages, if any.
    while index < max_page:
        index += 1
        loguru.logger.info("Req Machine index {}/{}".format(index, max_page))
        payload["page_index"] = index
        rsp = requests.post(url, json=payload, headers=headers, timeout=request_timeout)
        if rsp.status_code != 200:
            loguru.logger.error("Get Machines Req Error")
            return None
        listed.extend(rsp.json()["data"]["list"] or [])

    machines = [
        {
            "machine_id": machine["machine_id"],
            "region_name": machine["region_name"],
            "machine_alias": machine["machine_alias"],
            "gpu_name": machine["gpu_name"],
            "gpu_order_num": machine["gpu_order_num"],
            "gpu_number": machine["gpu_number"],
            "region_sign": machine["region_sign"],
        }
        for machine in listed
        if machine["health_status"] == 0
        and machine["gpu_order_num"] > 0
        and float(machine["highest_cuda_version"]) > 12.
        and machine["payg"] == True  # noqa: E712 - keep the original equality semantics
        and machine["rent_mode"] == ""
        and not machine["user_visible_limit"]
    ]
    return sorted(machines, key=lambda machine: machine["gpu_order_num"], reverse=True)


def payg(region_name:str, machine_id:str) -> tuple[Any, Any] | None:
    """Create a pay-as-you-go instance on a machine and wait until it runs.

    The image is chosen from the first two characters of ``region_name``.
    After the create call succeeds the instance status is polled every 5 s,
    at most ``LIM`` times; once it is ``"running"`` the HeyGem readiness probe
    (``ssh_try``) is executed.

    Args:
        region_name: region display name; its two-character prefix must be one
            of the keys of the image table below.
        machine_id: identifier of the machine to rent.

    Returns:
        ``(instance_uuid, tensorboard_domain)`` on success, ``None`` when the
        region is unknown, the API call fails, or the instance does not reach
        ``"running"`` in time.

    Raises:
        RuntimeError: propagated from ``ssh_try``.
    """
    region_image = {
        "西北": ["hub.kce.ksyun.com/autodl-image/miniconda:cuda11.8-cudnn8-devel-ubuntu20.04-py38","image-232ac04d3b"],
        "内蒙": ["hub.kce.ksyun.com/autodl-image/miniconda:cuda11.8-cudnn8-devel-ubuntu20.04-py38","image-06814c02d1"],
        "北京": ["hub.kce.ksyun.com/autodl-image/miniconda:cuda11.8-cudnn8-devel-ubuntu20.04-py38","image-e5334cc4f3"]
    }
    image_entry = region_image.get(region_name[:2])
    if image_entry is None:
        # Previously a bare KeyError; report it like every other failure here.
        loguru.logger.error("Create Payg Instance Error: %s" % ("no image configured for region %s" % region_name))
        return None
    image, private_image_uuid = image_entry

    headers = {
        "Authorization": token,
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
        "Host": "www.autodl.com"
    }
    payload = {
        "instance_info":{
            "machine_id":machine_id,
            "charge_type":"payg",
            "req_gpu_amount":1,
            "image":image,
            "private_image_uuid":private_image_uuid,
            "reproduction_uuid":"",
            "instance_name":"",
            "expand_data_disk":0,
            "reproduction_id":0
        },
        "price_info":{
            "coupon_id_list":[],
            "machine_id":machine_id,
            "charge_type":"payg",
            "duration":1,
            "num":1,
            "expand_data_disk":0
        }
    }

    loguru.logger.info("Try Create Payg Container on Machine {}/{}".format(region_name, machine_id))
    rsp = requests.post(
        "https://www.autodl.com/api/v1/sub_user/order/instance/create/payg",
        json=payload,
        headers=headers,
        timeout=30,  # seconds; without it a stalled connection blocks forever
    )
    if rsp.status_code != 200:
        loguru.logger.error("Create Payg Instance Error: Status Code[%s]" % rsp.status_code)
        return None

    j = rsp.json()
    if j["code"] != "Success":
        loguru.logger.error("Create Payg Instance Error: %s" % j['msg'])
        return None

    for _ in range(LIM):
        time.sleep(5)
        info = check_status(j['data'])
        if info is None:
            # Lookup failed or the instance is not listed yet: poll again
            # rather than crash while unpacking None.
            continue
        status, host, port, pwd, domain = info
        if status == "running":
            ssh_try(host, port, pwd)
            loguru.logger.success("Create Payg Instance Success: %s" % j['data'])
            return j['data'],domain

    loguru.logger.error("Create Payg Instance Error: Wait for Created Timeout, Please Check!!! instance_uuid(%s)" % j['data'])
    return None


def instance_operate(instance_uuid:str, operation: Literal["power_off","power_on","release"]) -> bool:
    """Run a lifecycle operation on an instance and wait for it to take effect.

    For ``"power_off"`` and ``"power_on"`` the instance status is polled every
    5 s, at most ``LIM`` times, until it becomes ``"shutdown"`` / ``"running"``
    respectively.  ``"release"`` is not polled: an accepted request counts as
    success.

    Args:
        instance_uuid: identifier of the instance to operate on.
        operation: last path segment of the API endpoint to call.

    Returns:
        ``True`` when the request is accepted (and, where polled, the target
        status is reached); ``False`` otherwise.
    """
    dest_dict={
        "power_off":"shutdown",
        "power_on":"running",
    }
    headers = {
        "Authorization": token,
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
        "Host": "www.autodl.com"
    }
    payload = {"instance_uuid":instance_uuid}
    rsp = requests.post(
        "https://www.autodl.com/api/v1/sub_user/instance/%s" % operation,
        json=payload,
        headers=headers,
        timeout=30,  # seconds; without it a stalled connection blocks forever
    )
    if rsp.status_code != 200:
        loguru.logger.error("Operate[%s] Instance Error: Status Code[%s]" % (operation, rsp.status_code))
        return False

    j = rsp.json()
    if j["code"] != "Success":
        loguru.logger.error("Operate[%s] Instance Error: %s" % (operation, j['msg']))
        return False

    target_status = dest_dict.get(operation)
    if target_status is not None:
        for _ in range(LIM):
            time.sleep(5)
            info = check_status(instance_uuid)
            # check_status returns None on a failed lookup; treat that as
            # "status unknown" and keep polling instead of indexing None.
            if info is not None and info[0] == target_status:
                break
        else:
            loguru.logger.error("Operate[%s] Instance Error: Timeout, Please Check!!! instance_uuid(%s)" % (operation, instance_uuid))
            return False

    loguru.logger.success("Operate[%s] Instance Success" % operation)
    return True


def check_status(instance_uuid:str) -> tuple[Any, Any, Any, Any, Any] | None:
    """Look up one instance in the account's instance list.

    Every page of the list is requested, then the entry whose ``uuid`` equals
    ``instance_uuid`` is searched for.

    Args:
        instance_uuid: identifier of the instance to look up.

    Returns:
        ``(status, proxy_host, ssh_port, root_password, tensorboard_domain)``
        of the matching entry, or ``None`` when a request does not return
        HTTP 200, the response carries no data, or no entry matches.
    """
    url = "https://www.autodl.com/api/v1/sub_user/instance"
    request_timeout = 30  # seconds; without it a stalled connection blocks forever
    headers = {
        "Authorization": token,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
        "Host": "www.autodl.com"
    }
    index = 1
    payload = {
        "date_from":"",
        "date_to":"",
        "page_index": index,
        "page_size":chk_instance_page_size,
        "status":[],
        "charge_type":[]
    }

    rsp = requests.post(url, json=payload, headers=headers, timeout=request_timeout)
    if rsp.status_code != 200:
        loguru.logger.error("Get Instance Req Error")
        return None

    body = rsp.json()
    data = body.get("data") if isinstance(body, dict) else None
    if not data:
        # HTTP 200 without a usable "data" object: report it instead of
        # failing with a KeyError/TypeError further down.
        loguru.logger.error("Get Instance Req Error")
        return None

    max_page = data["max_page"]
    entries = list(data.get("list") or [])

    while index < max_page:
        # Advance to the next page; without this increment the loop would
        # re-request page 1 forever whenever there is more than one page.
        index += 1
        payload["page_index"] = index
        rsp = requests.post(url, json=payload, headers=headers, timeout=request_timeout)
        if rsp.status_code != 200:
            loguru.logger.error("Get Instance Req Error")
            return None
        entries.extend(rsp.json()["data"]["list"] or [])

    for entry in entries:
        if entry["uuid"] == instance_uuid:
            loguru.logger.info("Instance {} Status {}".format(instance_uuid, entry["status"]))
            return entry["status"], entry["proxy_host"], entry["ssh_port"], entry["root_password"], entry["tensorboard_domain"]

    loguru.logger.warning("Instance {} Not Found".format(instance_uuid))
    return None


if __name__=="__main__":
    # Script entry point: rent the best-ranked available machine and print
    # where the resulting instance can be reached.
    machines = get_autodl_machines()
    if not machines:
        # None (request failure) or an empty list: nothing to rent.
        loguru.logger.error("No available machine, nothing created")
    else:
        # Only the first (highest gpu_order_num) machine is attempted.
        m = machines[0]
        created = payg(m["region_name"], m["machine_id"])
        if created is None:
            # payg already logged the reason; avoid unpacking None.
            loguru.logger.error("Create instance failed on machine %s" % m["machine_id"])
        else:
            instance_uuid, domain = created
            loguru.logger.success("instance_id:%s server_domain: https://%s" % (instance_uuid, domain))
            # To tear the instance down afterwards:
            #     instance_operate(instance_uuid, "power_off")
            #     instance_operate(instance_uuid, "release")