From 3f495776263d8502d87a74562981585b9b08cad2 Mon Sep 17 00:00:00 2001
From: gexianmeng
Date: Fri, 23 May 2025 17:38:58 +0800
Subject: [PATCH] =?UTF-8?q?=E5=B0=81=E8=A3=85minmax=E8=AF=AD=E9=9F=B3?=
 =?UTF-8?q?=E5=85=8B=E9=9A=86=E6=8E=A5=E5=8F=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 build/lib/cluster/app.py                      |  18 +
 .../router/text2speech.py                     | 340 ++++++++++++++++++
 src/cluster/minmax_clone_audio_api.py         | 150 ++++++++
 3 files changed, 508 insertions(+)
 create mode 100644 build/lib/cluster/app.py
 create mode 100644 src/BowongModalFunctions/router/text2speech.py
 create mode 100644 src/cluster/minmax_clone_audio_api.py

diff --git a/build/lib/cluster/app.py b/build/lib/cluster/app.py
new file mode 100644
index 0000000..378487d
--- /dev/null
+++ b/build/lib/cluster/app.py
@@ -0,0 +1,18 @@
+import modal
+from src.cluster.config import config
+from src.cluster.video_downloader.worker import worker_app
+from src.cluster.web.worker import fastapi_app
+from src.cluster.ffmpeg_worker.worker import app as ffmpeg_app
+from src.cluster.comfyui.worker import app as comfyui_app
+from src.cluster.comfyui_latentsync15.worker import app as comfyui_latentsync15_app
+
+app = modal.App(config.app_name,
+                include_source=False,
+                secrets=[modal.Secret.from_name("cf-kv-secret",
+                                                environment_name=config.environment)])
+
+app.include(fastapi_app)
+app.include(worker_app)
+app.include(ffmpeg_app)
+app.include(comfyui_app)
+app.include(comfyui_latentsync15_app)
diff --git a/src/BowongModalFunctions/router/text2speech.py b/src/BowongModalFunctions/router/text2speech.py
new file mode 100644
index 0000000..868413f
--- /dev/null
+++ b/src/BowongModalFunctions/router/text2speech.py
@@ -0,0 +1,340 @@
+import asyncio
+import os
+import shutil
+import tempfile
+from pathlib import Path
+from typing import Optional
+
+import sentry_sdk
+from fastapi import APIRouter, Depends, HTTPException, File, UploadFile, Form
+from fastapi.responses import JSONResponse
+from loguru import logger
+from modal import current_function_call_id
+from pydantic import BaseModel, Field
+from starlette import status
+
+from ..middleware.authorization import verify_token
+from ..models.media_model import MediaSource, MediaCacheStatus
+from ..utils.KVCache import KVCache
+from ..utils.SentryUtils import SentryUtils
+# Blocking helper that talks to the MiniMax voice-clone HTTP API.
+from cluster.minmax_clone_audio_api import voice_clone_and_download
+
+
+# Router for this module; every route below lives under /text2speech.
+router = APIRouter(prefix="/text2speech", tags=["文本转语音"])
+
+# The config is built on demand by a function instead of at import time.
+def get_config():
+    """Load the config lazily so that Modal has already injected the env vars."""
+    from ..config import WorkerConfig
+    return WorkerConfig()
+
+
+def _ensure_within_size_limit(size, max_file_size):
+    """Raise HTTP 413 when ``size`` exceeds ``max_file_size`` (both in bytes)."""
+    if size > max_file_size:
+        logger.warning(f"文件大小超限: {size} > {max_file_size}")
+        raise HTTPException(status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
+                            detail=f"文件大小超过限制。最大允许: {max_file_size / 1024 / 1024}MB")
+
+
+# Request and response models. The class docstrings are kept verbatim because
+# pydantic copies a model's docstring into its generated schema description.
+class Text2SpeechRequest(BaseModel):
+    """文本转语音请求模型"""
+    text: str = Field(description="要合成的文本内容")
+    voice_id: Optional[str] = Field(default=None, description="声音ID,如果为None则自动生成")
+    model: str = Field(default="speech-02-hd", description="使用的模型")
+    accuracy: float = Field(default=0.8, description="精确度,范围0-1")
+
+
+class Text2SpeechResponse(BaseModel):
+    """文本转语音响应模型"""
+    success: bool = Field(description="是否成功")
+    media_source: Optional[MediaSource] = Field(default=None, description="生成的音频文件信息")
+    download_url: Optional[str] = Field(default=None, description="下载链接")
+    voice_id: Optional[str] = Field(default=None, description="声音ID")
+    error: Optional[str] = Field(default=None, description="错误信息")
+
+
+@router.post("/clone", summary="语音克隆合成", description="上传音频文件进行语音克隆,并使用克隆的声音合成新的文本音频",
+             dependencies=[Depends(verify_token)])
+async def voice_clone(file: UploadFile = File(description="上传的音频文件,用于克隆声音"),
+                      text: str = Form(description="要合成的文本内容"),
+                      model: str = Form(default="speech-02-hd", description="使用的模型"),
+                      accuracy: float = Form(default=0.8, description="精确度")) -> Text2SpeechResponse:
+    """
+    Clone the voice of an uploaded sample and synthesize ``text`` with it.
+
+    The upload is validated (extension and size), written to a private scratch
+    directory and handed to ``voice_clone_and_download``. On success the result
+    is recorded in the KV cache and returned together with a download URL.
+    Validation failures raise HTTPException (400 / 413); any other failure is
+    reported as ``Text2SpeechResponse(success=False, error=...)``.
+    """
+    fn_id = current_function_call_id()
+
+    # Resolve the config inside the handler (see get_config).
+    config = get_config()
+    modal_kv_cache = KVCache(kv_name=config.modal_kv_name, environment=config.modal_environment)
+
+    # Log whether the credentials are configured, never their values.
+    logger.info(f"开始语音克隆处理 - Function ID: {fn_id}")
+    logger.info(f"MiniMax Group ID 配置状态: {'已配置' if config.minimax_group_id else '未配置'}")
+    logger.info(f"MiniMax API Key 配置状态: {'已配置' if config.minimax_api_key else '未配置'}")
+
+    # Stop early when a required credential is missing.
+    if not config.minimax_group_id or not config.minimax_api_key:
+        error_msg = []
+        if not config.minimax_group_id:
+            error_msg.append("MINIMAX_GROUP_ID 未配置")
+        if not config.minimax_api_key:
+            error_msg.append("MINIMAX_API_KEY 未配置")
+
+        logger.error(f"MiniMax API 配置缺失: {', '.join(error_msg)}")
+        return Text2SpeechResponse(success=False, error=f"MiniMax API 配置错误: {', '.join(error_msg)}")
+
+    # Validate the file type; the client-supplied filename may be missing.
+    allowed_extensions = {'.mp3', '.wav', '.m4a', '.aac', '.ogg'}
+    file_extension = Path(file.filename or "").suffix.lower()
+
+    if file_extension not in allowed_extensions:
+        logger.warning(f"不支持的文件格式: {file_extension}")
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST,
+                            detail=f"不支持的文件格式。支持的格式: {', '.join(sorted(allowed_extensions))}")
+
+    # File size limit. UploadFile.size is optional, so this is only a cheap
+    # first check; the bytes actually read are checked again below.
+    max_file_size = config.voice_clone_max_file_size
+    if file.size is not None:
+        _ensure_within_size_limit(file.size, max_file_size)
+
+    temp_dir = None
+
+    try:
+        # Private scratch directory for this request. mkdtemp always creates a
+        # fresh path, so requests cannot collide even if fn_id is None.
+        temp_dir = tempfile.mkdtemp(prefix=f"text2speech_{fn_id}_")
+
+        # Store the upload under a fixed name: the client-supplied filename is
+        # untrusted and must not be joined into a filesystem path.
+        content = await file.read()
+        _ensure_within_size_limit(len(content), max_file_size)
+        temp_audio_path = os.path.join(temp_dir, f"source{file_extension}")
+        with open(temp_audio_path, 'wb') as f:
+            f.write(content)
+
+        logger.info(f"音频文件已保存到: {temp_audio_path}, 大小: {len(content)} bytes")
+
+        # Output directory on the S3 mount.
+        output_dir = os.path.join(config.S3_mount_dir, config.voice_clone_output_dir)
+        os.makedirs(output_dir, exist_ok=True)
+        logger.info(f"输出目录: {output_dir}")
+
+        # Run the clone inside a Sentry-tracked coroutine.
+        @SentryUtils.sentry_tracker(name="文本转语音处理", op="text2speech.clone", fn_id=fn_id, sentry_trace_id=None,
+                                    sentry_baggage=None)
+        async def process_voice_clone():
+            try:
+                # voice_clone_and_download blocks on HTTP calls, so it runs in
+                # the default executor instead of on the event loop.
+                loop = asyncio.get_running_loop()
+                logger.info(f"开始调用 MiniMax API - text: {text[:50]}...")
+
+                result = await loop.run_in_executor(None,
+                                                    voice_clone_and_download,
+                                                    temp_audio_path,
+                                                    config.minimax_group_id,
+                                                    config.minimax_api_key,
+                                                    text,
+                                                    output_dir,
+                                                    model,
+                                                    accuracy)
+
+                logger.info(f"MiniMax API 调用完成 - 结果: {result}")
+                return result
+
+            except Exception as api_error:
+                logger.error(f"MiniMax API 调用异常: {str(api_error)}")
+                raise
+
+        # Run the clone.
+        output_path, used_voice_id = await process_voice_clone()
+
+        logger.info(f"语音克隆结果 - output_path: {output_path}, voice_id: {used_voice_id}")
+
+        # The helper signals failure by returning None for the path.
+        if output_path is None:
+            logger.error("语音合成失败: output_path 为 None")
+            return Text2SpeechResponse(
+                success=False,
+                voice_id=used_voice_id or "",
+                error="语音合成失败:MiniMax API 返回空结果,请检查 API 配置和余额"
+            )
+
+        # Make sure the output file really exists.
+        if not os.path.exists(output_path):
+            logger.error(f"输出文件不存在: {output_path}")
+            return Text2SpeechResponse(
+                success=False,
+                voice_id=used_voice_id or "",
+                error=f"生成的音频文件不存在: {output_path}"
+            )
+
+        # Build the MediaSource; the S3 key is the path relative to the mount.
+        relative_path = os.path.relpath(output_path, config.S3_mount_dir)
+        s3_key = relative_path.replace("\\", "/")  # Windows compatibility
+
+        media_source = MediaSource(
+            path=s3_key,
+            protocol="s3",
+            endpoint=config.S3_region,
+            bucket=config.S3_bucket_name,
+            urn=f"s3://{config.S3_region}/{config.S3_bucket_name}/{s3_key}",
+            status=MediaCacheStatus.ready, downloader_id=fn_id
+        )
+
+        # Record it in the KV cache.
+        modal_kv_cache.set_cache(media_source)
+        logger.info(f"已保存到 KV 缓存: {media_source.urn}")
+
+        # Build the download link.
+        download_url = f"{config.S3_cdn_endpoint}/{s3_key}"
+
+        logger.success(f"文本转语音成功完成。Voice ID: {used_voice_id}, Download URL: {download_url}")
+
+        return Text2SpeechResponse(
+            success=True,
+            voice_id=used_voice_id,
+            media_source=media_source,
+            download_url=download_url
+        )
+
+    except HTTPException:
+        # Let HTTP errors propagate unchanged.
+        raise
+    except Exception as e:
+        # loguru has no ``exc_info`` flag: extra keyword arguments make it
+        # format the message, which fails when the error text contains braces.
+        # opt(exception=True) attaches the traceback instead.
+        logger.opt(exception=True).error("文本转语音处理失败 - 详细错误: {}", str(e))
+        sentry_sdk.capture_exception(e)
+
+        # Map well-known failure markers to a friendlier message.
+        error_detail = str(e)
+        if "401" in error_detail or "403" in error_detail:
+            error_msg = "MiniMax API 认证失败,请检查 API Key"
+        elif "429" in error_detail:
+            error_msg = "MiniMax API 请求过于频繁,请稍后重试"
+        elif "insufficient" in error_detail.lower() or "quota" in error_detail.lower():
+            error_msg = "MiniMax API 余额不足"
+        else:
+            error_msg = f"处理失败: {error_detail[:200]}"  # cap the message length
+
+        return Text2SpeechResponse(success=False, error=error_msg)
+
+    finally:
+        # Remove the scratch directory together with everything inside it.
+        try:
+            if temp_dir and os.path.isdir(temp_dir):
+                shutil.rmtree(temp_dir)
+                logger.info(f"已删除临时目录: {temp_dir}")
+        except OSError as cleanup_error:
+            logger.warning(f"清理临时文件失败: {cleanup_error}")
+
+
+# NOTE(review): unlike the other routes this one has no verify_token
+# dependency - confirm that unauthenticated access is intended.
+@router.get("/voice/{voice_id}", summary="查询声音状态", description="根据voice_id查询声音文件的状态")
+async def get_voice_status(voice_id: str):
+    """
+    Look up the cached media record that belongs to a voice_id.
+    """
+    # Resolve the config inside the handler (see get_config).
+    config = get_config()
+    modal_kv_cache = KVCache(kv_name=config.modal_kv_name, environment=config.modal_environment)
+
+    try:
+        # Rebuild the URN in the same format voice_clone uses when caching.
+        key_pattern = f"{config.voice_clone_output_dir}/{voice_id}.mp3"
+        urn = f"s3://{config.S3_region}/{config.S3_bucket_name}/{key_pattern}"
+
+        cached_media = modal_kv_cache.get_cache(urn)
+
+        if cached_media:
+            download_url = f"{config.S3_cdn_endpoint}/{cached_media.path}"
+            return JSONResponse(content={
+                "success": True,
+                "voice_id": voice_id,
+                "status": cached_media.status.value,
+                "download_url": download_url if cached_media.status == MediaCacheStatus.ready else None,
+                # JSONResponse only accepts JSON-native values; mode="json"
+                # converts enum / datetime style fields before serialization.
+                "media_source": cached_media.model_dump(mode="json")
+            })
+
+        else:
+            return JSONResponse(status_code=status.HTTP_404_NOT_FOUND,
+                                content={"success": False, "voice_id": voice_id, "error": "未找到该voice_id的记录"})
+    except Exception as e:
+        logger.error(f"查询声音状态失败: {str(e)}")
+        return JSONResponse(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            content={"success": False, "voice_id": voice_id, "error": f"查询失败: {str(e)}"}
+        )
+
+
+# Debug endpoint.
+@router.get("/debug/env",
+            summary="调试环境变量",
+            description="检查环境变量和配置状态",
+            dependencies=[Depends(verify_token)])
+async def debug_environment():
+    """Report whether the expected environment variables and config are present.
+
+    Only presence flags and value lengths are returned, never secret values.
+    """
+    # Snapshot of the process environment.
+    all_env_vars = dict(os.environ)
+
+    # Environment variables whose name mentions MINIMAX.
+    minimax_vars = {k: v for k, v in all_env_vars.items() if 'MINIMAX' in k.upper()}
+
+    # Try to load the config.
+    try:
+        config = get_config()
+        config_loaded = True
+        config_values = {
+            "minimax_group_id": bool(config.minimax_group_id),
+            "minimax_api_key": bool(config.minimax_api_key),
+            # ``or ""`` keeps len() from raising when a value is unset.
+            "minimax_group_id_length": len(config.minimax_group_id or ""),
+            "minimax_api_key_length": len(config.minimax_api_key or ""),
+        }
+    except Exception as e:
+        config_loaded = False
+        config_values = {"error": str(e)}
+
+    return {
+        "environment": {
+            "MODAL_ENVIRONMENT": os.getenv("MODAL_ENVIRONMENT", "NOT_SET"),
+            "total_env_vars": len(all_env_vars),
+            "minimax_env_vars_found": len(minimax_vars),
+        },
+        # Values may be credentials, so only their length is exposed.
+        "minimax_env_vars": {
+            k: f"<redacted, {len(v)} chars>"
+            for k, v in minimax_vars.items()
+        },
+        "direct_env_check": {
+            "MINIMAX_GROUP_ID": "FOUND" if os.getenv("MINIMAX_GROUP_ID") else "NOT_FOUND",
+            "MINIMAX_API_KEY": "FOUND" if os.getenv("MINIMAX_API_KEY") else "NOT_FOUND",
+        },
+        "config_loaded": config_loaded,
+        "config_values": config_values,
+        "secrets_in_env": {
+            "CF_KV_SECRET": "cf-kv" in str(all_env_vars),
+            "MINIMAX_SECRET": any("minimax" in k.lower() for k in all_env_vars.keys()),
+        }
+    }
\ No newline at end of file
diff --git a/src/cluster/minmax_clone_audio_api.py b/src/cluster/minmax_clone_audio_api.py
new file mode 100644
index 0000000..02b1b40
--- /dev/null
+++ b/src/cluster/minmax_clone_audio_api.py
@@ -0,0 +1,150 @@
+import json
+import os
+import uuid
+from datetime import datetime
+
+import requests
+from loguru import logger
+
+# Upper bound in seconds for every MiniMax HTTP request, so that a stalled
+# connection cannot block the calling thread forever.
+REQUEST_TIMEOUT_SECONDS = 120
+
+
+def generate_unique_voice_id(prefix="minmax_audio_clone"):
+    """
+    Generate a unique voice_id.
+
+    Args:
+        prefix: leading part of the voice_id, "minmax_audio_clone" by default.
+
+    Returns:
+        "<prefix>_<YYYYMMDD>_<8 hex chars>", at most 256 characters long,
+        e.g. minmax_audio_clone_20250803_547218f2.
+    """
+    # Today's date (YYYYMMDD) plus 8 random hex characters.
+    current_date = datetime.now().strftime("%Y%m%d")
+    unique_id = uuid.uuid4().hex[:8]
+    suffix = f"_{current_date}_{unique_id}"
+
+    # Keep the result within 256 characters by shortening the prefix, so the
+    # random part that makes the id unique is never cut off.
+    return f"{prefix[:256 - len(suffix)]}{suffix}"
+
+
+def voice_clone_and_download(audio_file_path, group_id, api_key, text, output_dir, model="speech-02-hd",
+                             accuracy=0.8):
+    """
+    Clone a voice from an audio sample and download the synthesized audio.
+
+    Args:
+        audio_file_path: path of the input audio sample
+        group_id: MiniMax API group id
+        api_key: MiniMax API key
+        text: text to synthesize
+        output_dir: directory the result is written to (created if missing)
+        model: model name, "speech-02-hd" by default
+        accuracy: accuracy setting, 0.8 by default
+
+    Returns:
+        Tuple (absolute path of the downloaded audio file, voice_id used);
+        (None, None) on any failure. Errors are logged, never raised.
+    """
+
+    try:
+        # A fresh voice_id is generated for every call.
+        voice_id = generate_unique_voice_id()
+
+        logger.info(f"使用voice_id: {voice_id}")
+
+        # Step 1: upload the audio sample.
+        logger.info("正在上传音频文件...")
+        upload_url = f'https://api.minimax.chat/v1/files/upload?GroupId={group_id}'
+        upload_headers = {'authority': 'api.minimax.chat', 'Authorization': f'Bearer {api_key}'}
+
+        upload_data = {'purpose': 'voice_clone'}
+
+        with open(audio_file_path, 'rb') as audio_file:
+            files = {'file': audio_file}
+            upload_response = requests.post(upload_url, headers=upload_headers, data=upload_data, files=files,
+                                            timeout=REQUEST_TIMEOUT_SECONDS)
+
+        if upload_response.status_code != 200:
+            logger.error(f"文件上传失败: {upload_response.status_code}")
+            return None, None
+
+        file_id = upload_response.json().get("file", {}).get("file_id")
+        if not file_id:
+            logger.error("无法获取文件ID")
+            return None, None
+
+        logger.success(f"文件上传成功,文件ID: {file_id}")
+
+        # Step 2: clone the voice and synthesize the text.
+        logger.info("正在进行音频复刻...")
+        clone_url = f'https://api.minimax.chat/v1/voice_clone?GroupId={group_id}'
+        clone_payload = json.dumps(
+            {"file_id": file_id, "voice_id": voice_id, "text": text, "model": model, "accuracy": accuracy, })
+
+        clone_headers = {'Authorization': f'Bearer {api_key}', 'content-type': 'application/json'}
+
+        clone_response = requests.post(clone_url, headers=clone_headers, data=clone_payload,
+                                       timeout=REQUEST_TIMEOUT_SECONDS)
+
+        if clone_response.status_code != 200:
+            logger.error(f"音频复刻失败: {clone_response.status_code}")
+            return None, None
+
+        response_data = clone_response.json()
+        logger.info(f"复刻响应: {response_data}")
+
+        # Download link of the synthesized audio.
+        demo_audio_url = response_data.get("demo_audio")
+        if not demo_audio_url:
+            logger.error("无法获取音频下载链接")
+            return None, None
+
+        # Step 3: download the audio file.
+        logger.info("正在下载生成的音频文件...")
+
+        # Create the output directory if needed.
+        os.makedirs(output_dir, exist_ok=True)
+
+        # The voice_id doubles as the file name.
+        filename = f"{voice_id}.mp3"
+        output_path = os.path.join(output_dir, filename)
+
+        # Download the file.
+        download_response = requests.get(demo_audio_url, timeout=REQUEST_TIMEOUT_SECONDS)
+        if download_response.status_code != 200:
+            logger.error(f"音频下载失败: {download_response.status_code}")
+            return None, None
+
+        with open(output_path, 'wb') as f:
+            f.write(download_response.content)
+
+        logger.success(f"音频文件下载成功: {output_path}")
+        logger.success(f"使用的voice_id: {voice_id}")
+        return os.path.abspath(output_path), voice_id
+
+    except Exception as e:
+        # Best-effort contract: report the failure (with traceback) and return
+        # the (None, None) sentinel instead of raising.
+        logger.opt(exception=True).error("处理过程中出现错误: {}", str(e))
+        return None, None
+
+
+# Usage example
+if __name__ == "__main__":
+    # Credentials come from the environment; they must never be committed to
+    # source control.
+    group_id = os.environ["MINIMAX_GROUP_ID"]
+    api_key = os.environ["MINIMAX_API_KEY"]
+    input_audio_path = r'D:\gxm\desktop\minmax_audio_clone_dawan.mp3'
+    text = "我有这么这么多新款我都卖不过来了,我今年我上新到现在,我春夏款,我光裤子类目,300个品光裤子单品有300条,你想一下开发能力这么强的,抖音上面有几家。"
+    output_dir = r'D:\gxm\desktop'
+
+    # Call the function.
+    result_path, used_voice_id = voice_clone_and_download(audio_file_path=input_audio_path, group_id=group_id,
+                                                          api_key=api_key, text=text, output_dir=output_dir)
+