# modalDeploy/src/cluster/video.py

import modal
from dotenv import dotenv_values

# Container image used by this app (it is passed as `image=` to modal.App
# below): Debian slim with Python 3.11, dependencies installed from the
# project's pyproject.toml, environment variables loaded from ../.runtime.env,
# and the local `cluster` and `BowongModalFunctions` packages added as source.
# NOTE(review): both relative paths are presumably resolved against the working
# directory of the deploy command — confirm before running from elsewhere.
downloader_image = (
    modal.Image
    .debian_slim(python_version="3.11")
    .pip_install_from_pyproject("../pyproject.toml")
    .env(dotenv_values("../.runtime.env"))
    .add_local_python_source('cluster')
    .add_local_python_source('BowongModalFunctions')
)

# Modal application definition: named "media_app", built on `downloader_image`,
# with the "cf-kv-secret" secret attached at app level.
# NOTE(review): this secret is pinned to the 'dev' environment, whereas the
# per-function secrets further down use config.modal_environment — confirm
# that the hard-coded environment is intended.
app = modal.App(
    name="media_app",
    image=downloader_image,
    include_source=False,
    secrets=[
        modal.Secret.from_name("cf-kv-secret", environment_name='dev'),
    ])

with downloader_image.imports():
    import os, httpx, crcmod
    import sentry_sdk
    from sentry_sdk.integrations.loguru import LoguruIntegration
    from tqdm import tqdm
    from typing import Tuple, List
    from loguru import logger
    from datetime import datetime, UTC, timedelta
    from modal import current_function_call_id

    from tencentcloud.common.credential import Credential
    from tencentcloud.vod.v20180717.vod_client import VodClient
    from tencentcloud.vod.v20180717 import models as vod_request_models

    from BowongModalFunctions.config import WorkerConfig
    from BowongModalFunctions.utils.KVCache import KVCache
    from BowongModalFunctions.models.media_model import MediaSource, MediaCacheStatus, MediaProtocol
    from BowongModalFunctions.models.web_model import SentryTransactionInfo

    # Project configuration object. The attributes read in this file are
    # modal_environment, modal_kv_name, S3_bucket_name, S3_region, S3_mount_dir
    # and video_downloader_concurrency.
    config = WorkerConfig()

    # Initialise Sentry with tracing and profiling enabled and the loguru
    # integration registered (presumably so loguru records reach Sentry).
    # NOTE(review): the DSN is hard-coded and both sample rates are 1.0 —
    # consider sourcing the DSN from a secret and lowering the rates.
    sentry_sdk.init(dsn="https://85632fdcd62f699c2f88af6ca489e9ec@sentry.bowongai.com/3",
                    send_default_pii=True,
                    traces_sample_rate=1.0,
                    profiles_sample_rate=1.0,
                    add_full_stack=True,
                    shutdown_timeout=2,
                    integrations=[LoguruIntegration()],
                    environment=config.modal_environment,
                    )

    # Cloudflare KV credentials. They are read with .get(), so each one is None
    # when its environment variable is not set; the request helpers below
    # interpolate them into the URL / Authorization header without checking.
    cf_account_id = os.environ.get("CF_ACCOUNT_ID")
    cf_kv_api_token = os.environ.get("CF_KV_API_TOKEN")
    cf_kv_namespace_id = os.environ.get("CF_KV_NAMESPACE_ID")

    # KVCache handle that cache_submit uses (via set_cache) to persist
    # MediaSource state.
    modal_kv_cache = KVCache(kv_name=config.modal_kv_name, environment=config.modal_environment)


    @sentry_sdk.trace
    def batch_update_cloudflare_kv(caches: List[MediaSource]):
        """Write media cache records to Cloudflare KV with one bulk PUT request.

        Each record is stored under its ``urn``; the value is the record's
        JSON dump (``model_dump_json()``).

        Args:
            caches: Media records to write.

        Raises:
            httpx.RequestError: The request could not be sent or completed.
            httpx.HTTPStatusError: Cloudflare answered with an error status.
            Exception: Any other failure; it is logged and re-raised unchanged.
        """
        bulk_url = (
            f"https://api.cloudflare.com/client/v4/accounts/{cf_account_id}"
            f"/storage/kv/namespaces/{cf_kv_namespace_id}/bulk"
        )
        with httpx.Client() as client:
            try:
                response = client.put(
                    bulk_url,
                    headers={"Authorization": f"Bearer {cf_kv_api_token}"},
                    json=[
                        {
                            # The bulk-write schema names this flag "base64";
                            # values are sent as plain JSON text, not encoded.
                            "base64": False,
                            "key": cache.urn,
                            "value": cache.model_dump_json(),
                        }
                        for cache in caches
                    ],
                )
                response.raise_for_status()
            except httpx.RequestError as e:
                logger.error(f"An error occurred while putting kv to cloudflare: {str(e)}")
                raise
            except httpx.HTTPStatusError as e:
                logger.error(f"HTTP error occurred while putting kv to cloudflare: {str(e)}")
                raise
            except Exception as e:
                logger.error(f"An unexpected error occurred: {str(e)}")
                raise


    @sentry_sdk.trace
    def batch_remove_cloudflare_kv(caches: List[MediaSource]):
        """Delete media cache records from Cloudflare KV with one bulk request.

        The ``urn`` of every record is sent as the list of keys to delete.

        Args:
            caches: Media records whose keys should be removed.

        Raises:
            httpx.RequestError: The request could not be sent or completed.
            httpx.HTTPStatusError: Cloudflare answered with an error status.
            Exception: Any other failure; it is logged and re-raised unchanged.
        """
        bulk_delete_url = (
            f"https://api.cloudflare.com/client/v4/accounts/{cf_account_id}"
            f"/storage/kv/namespaces/{cf_kv_namespace_id}/bulk/delete"
        )
        with httpx.Client() as client:
            try:
                response = client.post(
                    bulk_delete_url,
                    headers={"Authorization": f"Bearer {cf_kv_api_token}"},
                    json=[cache.urn for cache in caches],
                )
                response.raise_for_status()
            except httpx.RequestError as e:
                logger.error(f"An error occurred while deleting kv from cloudflare: {str(e)}")
                raise
            except httpx.HTTPStatusError as e:
                logger.error(f"HTTP error occurred while deleting kv from cloudflare: {str(e)}")
                raise
            except Exception as e:
                logger.error(f"An unexpected error occurred: {str(e)}")
                raise


    @app.function(cpu=1, timeout=1800,
                  cloud="aws",
                  max_containers=config.video_downloader_concurrency,
                  volumes={
                      "/mntS3": modal.CloudBucketMount(
                          bucket_name=config.S3_bucket_name,
                          secret=modal.Secret.from_name("aws-s3-secret", environment_name=config.modal_environment),
                      ),
                  },
                  secrets=[modal.Secret.from_name("tencent-cloud-secret", environment_name=config.modal_environment)])
    @modal.concurrent(max_inputs=10)
    async def cache_submit(media: MediaSource, sentry_trace: SentryTransactionInfo) -> MediaSource:
        def vod_init():
            tencent_secret_id = os.environ["VOD_SECRET_ID"]
            tencent_secret_key = os.environ["VOD_SECRET_KEY"]
            cred = Credential(secret_id=tencent_secret_id, secret_key=tencent_secret_key)
            return VodClient(credential=cred, region='ap-shanghai')

        def vod_info(media: MediaSource) -> Tuple[str, str, str]:
            logger.info(f"Downloading media {media}")
            request = vod_request_models.DescribeMediaInfosRequest()
            request.SubAppId = int(media.bucket)
            # 兼容fileId带文件类型的格式和不带文件类型的格式
            request.FileIds = [media.path.split('.')[0] if '.' in media.path else media.path]
            response = vod_client.DescribeMediaInfos(request)
            if len(response.MediaInfoSet) > 0:
                media_info = response.MediaInfoSet[0].BasicInfo
                logger.info(f"VOD info = {media_info}")
                file_extension = media_info.Type
                cache_dir = f"/{config.S3_mount_dir}/{media.protocol.value}/{media.endpoint}/{media.bucket}"
                cache_file = media.path if '.' in media.path else f"{media.path}.{file_extension}"
                return (cache_dir, cache_file, media_info.MediaUrl)
            else:
                raise FileNotFoundError(
                    f"FileId : {media.path} not found in SubAppId: {media.bucket} at {media.endpoint}")

        def vod_download(media: MediaSource, on_progress_update: callable(float) = None) -> str:
            cache_dir, cache_file, url = vod_info(media)
            local_cache_filepath = os.path.join(cache_dir, cache_file)
            download_large_file(url=url, output_path=local_cache_filepath,
                                on_progress_callback=on_progress_update)
            return local_cache_filepath

        def download_large_file(url: str, output_path: str, protocol: MediaProtocol = MediaProtocol.vod,
                                on_progress_callback: callable(float) = None) -> None:
            # 配置日志
            logger.info(f"Starting download from {url}")
            try:
                # 使用 httpx 发送 HEAD 请求获取文件大小
                # 设置请求头，支持断点续传
                headers = {'Range': 'bytes=0-'}
                with httpx.Client() as client:
                    match protocol:
                        case MediaProtocol.vod:
                            head_response = client.head(url)
                            file_size = int(head_response.headers.get('content-length', 0))
                            remote_crc64 = int(head_response.headers.get('X-Cos-Hash-Crc64ecma', 0))
                            logger.info(f"File size: {file_size / (1024 * 1024 * 1024):.2f} GB")

                            if os.path.exists(output_path):
                                local_file_size = os.path.getsize(output_path)
                                logger.info(f"File size match check {local_file_size} = {file_size}")
                                if local_file_size == file_size:
                                    logger.info(f"Check file CRC64...")
                                    # CRC64使用ECMA-182标准校验 ref: https://cloud.tencent.com/document/product/436/40334#python-sdk
                                    c64 = crcmod.mkCrcFun(0x142F0E1EBA9EA3693, initCrc=0, xorOut=0xffffffffffffffff,
                                                          rev=True)
                                    with open(output_path, "rb") as local_file:
                                        local_crc64 = c64(local_file.read())
                                    logger.info(f"File crc64 check {local_crc64} = {remote_crc64}")
                                    if local_crc64 == remote_crc64:
                                        logger.success("File size verification passed!")
                                    return

                    logger.info(f"Downloading {url}...")
                    # 发起流式请求
                    with client.stream('GET', url, headers=headers) as response:
                        response.raise_for_status()
                        file_size = int(response.headers.get('content-length', 0))
                        # 设置进度条
                        progress_bar = tqdm(
                            total=file_size,
                            unit='iB',
                            unit_scale=True,
                            desc='Downloading'
                        )
                        # 以二进制写模式打开文件
                        with open(output_path, 'wb') as file:
                            # 分块下载，每次读取1MB
                            chunk_size = 1024 * 1024  # 1MB
                            downloaded_size = 0
                            for chunk in response.iter_bytes(chunk_size=chunk_size):
                                if chunk:
                                    file.write(chunk)
                                    downloaded_size += len(chunk)
                                    if on_progress_callback:
                                        on_progress_callback(downloaded_size / file_size)
                                    progress_bar.update(len(chunk))
                                    # 每下载100MB记录一次日志
                                    if downloaded_size % (100 * 1024 * 1024) == 0:
                                        logger.info(
                                            f"Downloaded: {downloaded_size / (1024 * 1024 * 1024):.2f} GB")

                        progress_bar.close()

                        # 验证下载是否完成
                        if os.path.exists(output_path):
                            final_size = os.path.getsize(output_path)
                        logger.info(f"Download completed successfully!")
                        logger.info(f"Final file size: {final_size / (1024 * 1024 * 1024):.2f} GB")
                        logger.info(f"File saved to: {os.path.abspath(output_path)}")
                        # 验证文件大小是否匹配
                        if final_size == file_size:
                            logger.info("File size verification passed!")
                        else:
                            logger.warning(f"File size mismatch! Expected: {file_size}, Got: {final_size}")
            except httpx.RequestError as e:
                logger.error(f"An error occurred while requesting {url}: {str(e)}")
                raise e
            except httpx.HTTPStatusError as e:
                logger.error(f"HTTP error occurred: {str(e)}")
                raise e
            except Exception as e:
                logger.error(f"An unexpected error occurred: {str(e)}")
                raise e
            finally:
                if 'progress_bar' in locals():
                    progress_bar.close()

        vod_client = vod_init()
        modal_kv = modal_kv_cache
        fn_id = current_function_call_id()

        with sentry_sdk.continue_trace(environ_or_headers={"sentry-trace": sentry_trace.x_trace_id,
                                                           "baggage": sentry_trace.x_baggage, }) as transaction:
            transaction.set_context("runtime_environment", {
                "MODAL_CLOUD_PROVIDER": os.environ.get('MODAL_CLOUD_PROVIDER', 'unknown'),
                "MODAL_ENVIRONMENT": config.modal_environment,
                "MODAL_IMAGE_ID": os.environ.get('MODAL_IMAGE_ID', 'unknown'),
                "MODAL_IS_REMOTE": os.environ.get('MODAL_IS_REMOTE', 'unknown'),
                "MODAL_REGION": os.environ.get('MODAL_REGION', 'unknown'),
                "MODAL_TASK_ID": os.environ.get('MODAL_TASK_ID', 'unknown'),
                "MODAL_IDENTITY_TOKEN": os.environ.get('MODAL_IDENTITY_TOKEN', 'unknown'),
            })
            with transaction.start_child(name="收到缓存视频任务", op="queue.receive") as receive_span:
                receive_span.set_data("messaging.message.id", fn_id)
                receive_span.set_data("messaging.destination.name", "video-downloader.cache_submit")
                receive_span.set_data("messaging.message.retry.count", 0)
                receive_span.set_data("cache.key", media.urn)
                with receive_span.start_child(name="处理缓存视频任务", op="queue.process") as process_span:
                    process_span.set_data("messaging.message.id", fn_id)
                    process_span.set_data("messaging.destination.name", "video-downloader.cache_submit")
                    process_span.set_data("messaging.message.retry.count", 0)
                    process_span.set_data("cache.key", media.urn)
                    volume_cache_path = None
                    match media.protocol:
                        case MediaProtocol.vod:
                            try:
                                volume_cache_path = vod_download(media)
                                process_span.set_status("success")
                            except Exception as e:
                                logger.exception(e)
                                media.status = MediaCacheStatus.failed
                                modal_kv.set_cache(media)
                                batch_update_cloudflare_kv([media])
                                process_span.set_status("failed")
                        case MediaProtocol.http:
                            try:
                                cache_filepath = f"{config.S3_mount_dir}/{media.cache_filepath}"
                                download_large_file(url=media.__str__(), output_path=cache_filepath)
                            except Exception as e:
                                logger.exception(e)
                                media.status = MediaCacheStatus.failed
                                modal_kv.set_cache(media)
                                batch_update_cloudflare_kv([media])
                                process_span.set_status("failed")
                        case MediaProtocol.s3:
                            # 本地挂载缓存
                            if media.protocol == MediaProtocol.s3 and media.endpoint == config.S3_region and media.bucket == config.S3_bucket_name:
                                volume_cache_path = f"{config.S3_mount_dir}/{media.cache_filepath}"
                            else:
                                logger.error("protocol not yet supported")
                        case _:
                            process_span.set_status("failed")
                            logger.error(f"protocol not yet supported")
                    media.downloader_id = fn_id
                    media.status = MediaCacheStatus.ready if volume_cache_path else MediaCacheStatus.failed
                    media.progress = 1 if volume_cache_path else 0
                    media.expired_at = datetime.now(UTC) + timedelta(days=7) if volume_cache_path else None
                    modal_kv.set_cache(media)
                    batch_update_cloudflare_kv([media])
        return media


    @app.function(cpu=1, timeout=300,
                  max_containers=config.video_downloader_concurrency,
                  volumes={
                      "/mntS3": modal.CloudBucketMount(
                          bucket_name=config.S3_bucket_name,
                          secret=modal.Secret.from_name("aws-s3-secret", environment_name=config.modal_environment),
                      ),
                  })
    @modal.concurrent(max_inputs=10)
    async def cache_delete(cache: MediaSource) -> MediaSource:
        """Remove a cached media file from the mounted bucket.

        Args:
            cache: The media record whose cached file should be removed; its
                ``status`` is updated in place.

        Returns:
            The same record with ``status`` set to ``deleted`` when the file
            was removed, or ``missing`` when no such file existed.
        """
        # Resolve the path the same way cache_submit does for its http/s3
        # branches, so the lookup happens under the bucket mount.
        # NOTE(review): assumes cache.cache_filepath is relative to the mount
        # directory - confirm against MediaSource.
        mounted_path = f"{config.S3_mount_dir}/{cache.cache_filepath}"
        try:
            # Remove directly instead of exists()+remove(): another concurrent
            # input may delete the same file between the two calls.
            os.remove(mounted_path)
        except FileNotFoundError:
            cache.status = MediaCacheStatus.missing
        else:
            cache.status = MediaCacheStatus.deleted
        return cache