封装 MiniMax 语音克隆接口

This commit is contained in:
gexianmeng 2025-05-23 17:38:58 +08:00
parent a4c4ee1f5b
commit 3f49577626
3 changed files with 485 additions and 0 deletions

18
build/lib/cluster/app.py Normal file
View File

@ -0,0 +1,18 @@
import modal
from src.cluster.config import config
from src.cluster.video_downloader.worker import worker_app
from src.cluster.web.worker import fastapi_app
from src.cluster.ffmpeg_worker.worker import app as ffmpeg_app
from src.cluster.comfyui.worker import app as comfyui_app
from src.cluster.comfyui_latentsync15.worker import app as comfyui_latentsync15_app
# Secret attached to the root application, looked up by name in the
# environment selected by the cluster config.
_kv_secret = modal.Secret.from_name("cf-kv-secret", environment_name=config.environment)

# Root Modal application for the cluster.
app = modal.App(config.app_name, include_source=False, secrets=[_kv_secret])

# Attach every worker sub-application to the root app, in the same order as
# the original explicit include() calls.
for _sub_app in (fastapi_app, worker_app, ffmpeg_app, comfyui_app, comfyui_latentsync15_app):
    app.include(_sub_app)

View File

@ -0,0 +1,321 @@
import asyncio
import os
from pathlib import Path
from typing import Optional
import sentry_sdk
from fastapi import APIRouter, Depends, HTTPException, File, UploadFile, Form
from fastapi.responses import JSONResponse
from loguru import logger
from modal import current_function_call_id
from pydantic import BaseModel, Field
from starlette import status
from ..middleware.authorization import verify_token
from ..models.media_model import MediaSource, MediaCacheStatus
from ..utils.KVCache import KVCache
from ..utils.SentryUtils import SentryUtils
# 导入语音克隆函数
from cluster.minmax_clone_audio_api import voice_clone_and_download
# Router for the endpoints defined in this module; every route is served under
# the /text2speech prefix.
router = APIRouter(prefix="/text2speech", tags=["文本转语音"])
# 使用函数来延迟获取配置
def get_config():
    """Build and return a fresh ``WorkerConfig`` instance.

    The config module is imported inside the function rather than at module
    import time; per the original author's note this is done so that the
    environment variables injected by Modal are already present when the
    configuration is constructed.
    """
    from ..config import WorkerConfig as _WorkerConfig

    worker_config = _WorkerConfig()
    return worker_config
# 定义请求和响应模型
class Text2SpeechRequest(BaseModel):
    """Request model for text-to-speech synthesis.

    NOTE(review): none of the endpoints visible in this file accept this
    model (``voice_clone`` takes multipart form fields instead) - confirm
    whether it is used elsewhere.
    """
    # Text content to synthesize.
    text: str = Field(description="要合成的文本内容")
    # Voice ID; per the field description, None means one is generated automatically.
    voice_id: Optional[str] = Field(default=None, description="声音ID如果为None则自动生成")
    # Name of the synthesis model; defaults to "speech-02-hd".
    model: str = Field(default="speech-02-hd", description="使用的模型")
    # Accuracy; the description states a 0-1 range, but no ge/le bound is declared here.
    accuracy: float = Field(default=0.8, description="精确度范围0-1")
class Text2SpeechResponse(BaseModel):
    """Response model for text-to-speech synthesis.

    Only ``success`` is required; every other field defaults to None, so a
    failure response carries ``error`` and a success response carries the
    generated media details.
    """
    # Whether the request succeeded.
    success: bool = Field(description="是否成功")
    # Metadata of the generated audio file.
    media_source: Optional[MediaSource] = Field(default=None, description="生成的音频文件信息")
    # Download link for the generated audio.
    download_url: Optional[str] = Field(default=None, description="下载链接")
    # Voice ID used for the synthesis.
    voice_id: Optional[str] = Field(default=None, description="声音ID")
    # Error message when the request failed.
    error: Optional[str] = Field(default=None, description="错误信息")
@router.post("/clone", summary="语音克隆合成", description="上传音频文件进行语音克隆,并使用克隆的声音合成新的文本音频",
             dependencies=[Depends(verify_token)])
async def voice_clone(file: UploadFile = File(description="上传的音频文件,用于克隆声音"),
                      text: str = Form(description="要合成的文本内容"),
                      model: str = Form(default="speech-02-hd", description="使用的模型"),
                      accuracy: float = Form(default=0.8, description="精确度")) -> Text2SpeechResponse:
    """Clone a voice from an uploaded audio file and synthesize ``text`` with it.

    The upload is validated (extension and size), written to a per-call
    temporary directory, handed to ``voice_clone_and_download`` in a worker
    thread, and the resulting file is registered in the KV cache.

    Args:
        file: Uploaded audio sample used as the cloning source.
        text: Text to synthesize with the cloned voice.
        model: Synthesis model name forwarded to the clone helper.
        accuracy: Accuracy value forwarded to the clone helper.

    Returns:
        A ``Text2SpeechResponse``. Configuration problems and processing
        failures are reported with ``success=False`` and an ``error`` message
        rather than an HTTP error status.

    Raises:
        HTTPException: 400 for a missing/unsupported file extension, 413 when
            the upload exceeds ``config.voice_clone_max_file_size``.
    """
    fn_id = current_function_call_id()
    # Resolve the configuration inside the handler (see get_config).
    config = get_config()
    modal_kv_cache = KVCache(kv_name=config.modal_kv_name, environment=config.modal_environment)

    # Log whether the credentials are present, never their values.
    logger.info(f"开始语音克隆处理 - Function ID: {fn_id}")
    logger.info(f"MiniMax Group ID 配置状态: {'已配置' if config.minimax_group_id else '未配置'}")
    logger.info(f"MiniMax API Key 配置状态: {'已配置' if config.minimax_api_key else '未配置'}")

    # Both credentials are required before anything else is attempted.
    error_msg = []
    if not config.minimax_group_id:
        error_msg.append("MINIMAX_GROUP_ID 未配置")
    if not config.minimax_api_key:
        error_msg.append("MINIMAX_API_KEY 未配置")
    if error_msg:
        logger.error(f"MiniMax API 配置缺失: {', '.join(error_msg)}")
        return Text2SpeechResponse(success=False, error=f"MiniMax API 配置错误: {', '.join(error_msg)}")

    # Validate the file type. The filename is optional on an upload, so fall
    # back to an empty name (which has no suffix and is rejected below)
    # instead of crashing on Path(None).
    allowed_extensions = {'.mp3', '.wav', '.m4a', '.aac', '.ogg'}
    original_name = file.filename or ""
    file_extension = Path(original_name).suffix.lower()
    if file_extension not in allowed_extensions:
        logger.warning(f"不支持的文件格式: {file_extension}")
        # sorted() keeps the message stable; set iteration order is arbitrary.
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST,
                            detail=f"不支持的文件格式。支持的格式: {', '.join(sorted(allowed_extensions))}")
    # The filename is client-controlled: keep only its final path component so
    # it cannot escape the temporary directory (e.g. "../../etc/x.mp3").
    safe_filename = Path(original_name).name

    # Enforce the size limit. The declared size may be unknown (None); in that
    # case the limit is enforced on the bytes actually read, further below.
    max_file_size = config.voice_clone_max_file_size
    too_large_detail = f"文件大小超过限制。最大允许: {max_file_size / 1024 / 1024}MB"
    if file.size is not None and file.size > max_file_size:
        logger.warning(f"文件大小超限: {file.size} > {max_file_size}")
        raise HTTPException(status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, detail=too_large_detail)

    temp_dir = None
    temp_audio_path = None
    try:
        content = await file.read()
        if len(content) > max_file_size:
            logger.warning(f"文件大小超限: {len(content)} > {max_file_size}")
            raise HTTPException(status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, detail=too_large_detail)

        # Temporary directory holding the uploaded sample for this call.
        temp_dir = f"/tmp/text2speech_{fn_id}"
        os.makedirs(temp_dir, exist_ok=True)

        # Persist the upload so the synchronous helper can read it from disk.
        temp_audio_path = os.path.join(temp_dir, safe_filename)
        with open(temp_audio_path, 'wb') as f:
            f.write(content)
        logger.info(f"音频文件已保存到: {temp_audio_path}, 大小: {len(content)} bytes")

        # Output directory under the S3 mount.
        output_dir = os.path.join(config.S3_mount_dir, config.voice_clone_output_dir)
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"输出目录: {output_dir}")

        @SentryUtils.sentry_tracker(name="文本转语音处理", op="text2speech.clone", fn_id=fn_id, sentry_trace_id=None,
                                    sentry_baggage=None)
        async def process_voice_clone():
            try:
                # The helper is synchronous (blocking HTTP calls), so run it in
                # the default executor to keep the event loop responsive.
                loop = asyncio.get_running_loop()
                logger.info(f"开始调用 MiniMax API - text: {text[:50]}...")
                result = await loop.run_in_executor(None,
                                                    voice_clone_and_download,
                                                    temp_audio_path,
                                                    config.minimax_group_id,
                                                    config.minimax_api_key,
                                                    text,
                                                    output_dir,
                                                    model,
                                                    accuracy)
                logger.info(f"MiniMax API 调用完成 - 结果: {result}")
                return result
            except Exception as api_error:
                logger.error(f"MiniMax API 调用异常: {str(api_error)}")
                raise

        # Run the clone + synthesis.
        output_path, used_voice_id = await process_voice_clone()
        logger.info(f"语音克隆结果 - output_path: {output_path}, voice_id: {used_voice_id}")

        # The helper signals failure by returning None for the path.
        if output_path is None:
            logger.error("语音合成失败: output_path 为 None")
            return Text2SpeechResponse(
                success=False,
                voice_id=used_voice_id or "",
                error="语音合成失败MiniMax API 返回空结果,请检查 API 配置和余额"
            )

        # Make sure the generated file really exists before publishing it.
        if not os.path.exists(output_path):
            logger.error(f"输出文件不存在: {output_path}")
            return Text2SpeechResponse(
                success=False,
                voice_id=used_voice_id or "",
                error=f"生成的音频文件不存在: {output_path}"
            )

        # Build the MediaSource; the key is the path relative to the S3 mount.
        relative_path = os.path.relpath(output_path, config.S3_mount_dir)
        s3_key = relative_path.replace("\\", "/")  # Windows compatibility
        media_source = MediaSource(
            path=s3_key,
            protocol="s3",
            endpoint=config.S3_region,
            bucket=config.S3_bucket_name,
            urn=f"s3://{config.S3_region}/{config.S3_bucket_name}/{s3_key}",
            status=MediaCacheStatus.ready, downloader_id=fn_id
        )

        # Register the result in the KV cache.
        modal_kv_cache.set_cache(media_source)
        logger.info(f"已保存到 KV 缓存: {media_source.urn}")

        # Public download link served through the CDN endpoint.
        download_url = f"{config.S3_cdn_endpoint}/{s3_key}"
        logger.success(f"文本转语音成功完成。Voice ID: {used_voice_id}, Download URL: {download_url}")
        return Text2SpeechResponse(
            success=True,
            voice_id=used_voice_id,
            media_source=media_source,
            download_url=download_url
        )
    except HTTPException:
        # Validation errors keep their HTTP status.
        raise
    except Exception as e:
        # loguru has no exc_info flag: extra kwargs trigger str.format() on the
        # message, which can itself raise when the error text contains braces.
        # opt(exception=...) attaches the traceback, and the error text is
        # passed as a format argument instead of being pre-interpolated.
        logger.opt(exception=e).error("文本转语音处理失败 - 详细错误: {}", str(e))
        sentry_sdk.capture_exception(e)
        # Map well-known failure signatures to a friendlier message.
        error_detail = str(e)
        if "401" in error_detail or "403" in error_detail:
            error_msg = "MiniMax API 认证失败,请检查 API Key"
        elif "429" in error_detail:
            error_msg = "MiniMax API 请求过于频繁,请稍后重试"
        elif "insufficient" in error_detail.lower() or "quota" in error_detail.lower():
            error_msg = "MiniMax API 余额不足"
        else:
            error_msg = f"处理失败: {error_detail[:200]}"  # cap the message length
        return Text2SpeechResponse(success=False, error=error_msg)
    finally:
        # Best-effort cleanup of the temporary upload; never mask the result.
        try:
            if temp_audio_path and os.path.exists(temp_audio_path):
                os.remove(temp_audio_path)
                logger.info(f"已删除临时文件: {temp_audio_path}")
            if temp_dir and os.path.exists(temp_dir):
                os.rmdir(temp_dir)
                logger.info(f"已删除临时目录: {temp_dir}")
        except Exception as cleanup_error:
            logger.warning(f"清理临时文件失败: {cleanup_error}")
# NOTE(review): unlike the other routes in this module, this one declares no
# Depends(verify_token) dependency even though it returns download links.
# Confirm whether unauthenticated access is intended.
@router.get("/voice/{voice_id}", summary="查询声音状态", description="根据voice_id查询声音文件的状态")
async def get_voice_status(voice_id: str):
    """Look up the cached media record for a given ``voice_id``.

    The cache key is the URN of ``<voice_clone_output_dir>/<voice_id>.mp3``
    in the configured bucket.

    Args:
        voice_id: Voice ID whose generated audio should be looked up.

    Returns:
        A ``JSONResponse``: 200 with status, media details and (only when the
        status is ready) a download URL; 404 when no record is cached; 500
        with the error text when the lookup itself fails.
    """
    # Resolve the configuration inside the handler (see get_config).
    config = get_config()
    modal_kv_cache = KVCache(kv_name=config.modal_kv_name, environment=config.modal_environment)
    try:
        # Rebuild the URN under which the generated file is expected to be cached.
        key_pattern = f"{config.voice_clone_output_dir}/{voice_id}.mp3"
        urn = f"s3://{config.S3_region}/{config.S3_bucket_name}/{key_pattern}"
        cached_media = modal_kv_cache.get_cache(urn)
        if not cached_media:
            return JSONResponse(status_code=status.HTTP_404_NOT_FOUND,
                                content={"success": False, "voice_id": voice_id, "error": "未找到该voice_id的记录"})

        download_url = f"{config.S3_cdn_endpoint}/{cached_media.path}"
        return JSONResponse(content={
            "success": True,
            "voice_id": voice_id,
            "status": cached_media.status.value,
            "download_url": download_url if cached_media.status == MediaCacheStatus.ready else None,
            # JSONResponse serializes with the plain json encoder, so dump in
            # JSON mode: enum members and other rich values become primitives
            # instead of raising "not JSON serializable".
            "media_source": cached_media.model_dump(mode="json")
        })
    except Exception as e:
        logger.error(f"查询声音状态失败: {str(e)}")
        return JSONResponse(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            content={"success": False, "voice_id": voice_id, "error": f"查询失败: {str(e)}"}
        )
# 调试端点
@router.get("/debug/env",
            summary="调试环境变量",
            description="检查环境变量和配置状态",
            dependencies=[Depends(verify_token)])
async def debug_environment():
    """Report whether the expected environment variables and config are loaded.

    The response only describes presence and length of MINIMAX_* values; the
    values themselves are never echoed back, so the endpoint cannot be used
    to read credentials.

    Returns:
        A dict with the keys ``environment``, ``minimax_env_vars``,
        ``direct_env_check``, ``config_loaded``, ``config_values`` and
        ``secrets_in_env``.
    """
    import os

    # Snapshot of the process environment.
    all_env_vars = dict(os.environ)
    # Variables whose name mentions MINIMAX (case-insensitive).
    minimax_vars = {k: v for k, v in all_env_vars.items() if 'MINIMAX' in k.upper()}

    # Try to build the config and summarize the MiniMax settings.
    try:
        config = get_config()
        config_loaded = True
        config_values = {
            "minimax_group_id": bool(config.minimax_group_id),
            "minimax_api_key": bool(config.minimax_api_key),
            # `or ""` keeps an unset (None) value from raising in len() and
            # being misreported as a configuration load failure.
            "minimax_group_id_length": len(config.minimax_group_id or ""),
            "minimax_api_key_length": len(config.minimax_api_key or ""),
        }
    except Exception as e:
        config_loaded = False
        config_values = {"error": str(e)}

    return {
        "environment": {
            "MODAL_ENVIRONMENT": os.getenv("MODAL_ENVIRONMENT", "NOT_SET"),
            "total_env_vars": len(all_env_vars),
            "minimax_env_vars_found": len(minimax_vars),
        },
        # Redacted on purpose: previously the first 20 characters (or the
        # whole value when shorter) of every MINIMAX_* variable were returned.
        "minimax_env_vars": {
            k: f"<redacted, {len(v)} chars>"
            for k, v in minimax_vars.items()
        },
        "direct_env_check": {
            # The group ID is an account identifier rather than a credential,
            # so it is still shown as before.
            "MINIMAX_GROUP_ID": os.getenv("MINIMAX_GROUP_ID", "NOT_FOUND"),
            "MINIMAX_API_KEY": "FOUND" if os.getenv("MINIMAX_API_KEY") else "NOT_FOUND",
        },
        "config_loaded": config_loaded,
        "config_values": config_values,
        "secrets_in_env": {
            "CF_KV_SECRET": "cf-kv" in str(all_env_vars),
            "MINIMAX_SECRET": any("minimax" in k.lower() for k in all_env_vars.keys()),
        }
    }

View File

@ -0,0 +1,146 @@
import json
import os
import uuid
from datetime import datetime
import requests
from loguru import logger
def generate_unique_voice_id(prefix="minmax_audio_clone"):
    """Generate a unique voice ID of the form ``<prefix>_<YYYYMMDD>_<8 hex chars>``.

    Args:
        prefix: Leading part of the ID; defaults to "minmax_audio_clone".

    Returns:
        The voice ID, e.g. ``minmax_audio_clone_20250803_547218f2``. The
        result never exceeds 256 characters: when the prefix is too long it
        is the prefix that gets shortened, so the date and the random suffix
        (the part that makes the ID unique) are always kept intact.
    """
    max_length = 256
    # Current local date as YYYYMMDD.
    current_date = datetime.now().strftime("%Y%m%d")
    # First 8 hex characters of a random UUID.
    unique_id = uuid.uuid4().hex[:8]
    suffix = f"_{current_date}_{unique_id}"
    # Truncating the assembled ID from the right would cut off the random
    # part and make every over-long ID identical; trim the prefix instead.
    return f"{prefix[:max_length - len(suffix)]}{suffix}"
def voice_clone_and_download(audio_file_path, group_id, api_key, text, output_dir, model="speech-02-hd",
                             accuracy=0.8, voice_id=None, timeout=(10, 300)):
    """Clone a voice from an audio file and download the synthesized sample.

    Three HTTP steps are performed: upload the source audio, request the
    voice clone (which also synthesizes ``text``), then download the
    resulting demo audio into ``output_dir`` as ``<voice_id>.mp3``.

    Args:
        audio_file_path: Path of the input audio file.
        group_id: MiniMax API group ID (sent as the ``GroupId`` query parameter).
        api_key: MiniMax API key (sent as a bearer token).
        text: Text to synthesize.
        output_dir: Directory the audio is written to; created if missing.
        model: Model name; defaults to "speech-02-hd".
        accuracy: Accuracy value; defaults to 0.8.
        voice_id: Voice ID to register; a unique one is generated when omitted.
        timeout: ``requests`` timeout applied to every HTTP call, as a
            ``(connect, read)`` tuple in seconds, so a stalled connection
            cannot block the calling thread forever.

    Returns:
        ``(absolute path of the downloaded audio, voice_id)`` on success, or
        ``(None, None)`` on any failure; errors are logged, never raised.
    """
    try:
        # Generate a unique voice ID when the caller did not provide one.
        voice_id = voice_id or generate_unique_voice_id()
        logger.info(f"使用voice_id: {voice_id}")

        # Step 1: upload the source audio.
        logger.info("正在上传音频文件...")
        upload_url = f'https://api.minimax.chat/v1/files/upload?GroupId={group_id}'
        upload_headers = {'authority': 'api.minimax.chat', 'Authorization': f'Bearer {api_key}'}
        upload_data = {'purpose': 'voice_clone'}
        with open(audio_file_path, 'rb') as audio_file:
            files = {'file': audio_file}
            upload_response = requests.post(upload_url, headers=upload_headers, data=upload_data, files=files,
                                            timeout=timeout)
        if upload_response.status_code != 200:
            logger.error(f"文件上传失败: {upload_response.status_code}")
            return None, None
        file_id = upload_response.json().get("file", {}).get("file_id")
        if not file_id:
            logger.error("无法获取文件ID")
            return None, None
        logger.success(f"文件上传成功文件ID: {file_id}")

        # Step 2: clone the voice and synthesize the text.
        logger.info("正在进行音频复刻...")
        clone_url = f'https://api.minimax.chat/v1/voice_clone?GroupId={group_id}'
        clone_payload = json.dumps(
            {"file_id": file_id, "voice_id": voice_id, "text": text, "model": model, "accuracy": accuracy, })
        clone_headers = {'Authorization': f'Bearer {api_key}', 'content-type': 'application/json'}
        clone_response = requests.post(clone_url, headers=clone_headers, data=clone_payload, timeout=timeout)
        if clone_response.status_code != 200:
            logger.error(f"音频复刻失败: {clone_response.status_code}")
            return None, None
        response_data = clone_response.json()
        logger.info(f"复刻响应: {response_data}")

        # The clone response carries the link to the synthesized audio.
        demo_audio_url = response_data.get("demo_audio")
        if not demo_audio_url:
            logger.error("无法获取音频下载链接")
            return None, None

        # Step 3: download the synthesized audio.
        logger.info("正在下载生成的音频文件...")
        # exist_ok avoids the check-then-create race between concurrent calls.
        os.makedirs(output_dir, exist_ok=True)
        # The voice ID doubles as the file name.
        filename = f"{voice_id}.mp3"
        output_path = os.path.join(output_dir, filename)
        download_response = requests.get(demo_audio_url, timeout=timeout)
        if download_response.status_code != 200:
            logger.error(f"音频下载失败: {download_response.status_code}")
            return None, None
        with open(output_path, 'wb') as f:
            f.write(download_response.content)
        logger.success(f"音频文件下载成功: {output_path}")
        logger.success(f"使用的voice_id: {voice_id}")
        return os.path.abspath(output_path), voice_id
    except Exception as e:
        # Boundary of the helper: callers rely on (None, None) instead of an
        # exception, so log and convert.
        logger.error(f"处理过程中出现错误: {str(e)}")
        return None, None
# 使用示例
if __name__ == "__main__":
    # Credentials come from the environment; they must never be committed to
    # source control. NOTE(review): the API key that used to be hard-coded
    # here is in the repository history and should be revoked.
    group_id = os.environ.get("MINIMAX_GROUP_ID")
    api_key = os.environ.get("MINIMAX_API_KEY")
    if not group_id or not api_key:
        raise SystemExit("Set MINIMAX_GROUP_ID and MINIMAX_API_KEY before running this example.")

    # Sample input, text and output location for a manual run.
    input_audio_path = r'D:\gxm\desktop\minmax_audio_clone_dawan.mp3'
    text = "我有这么这么多新款我都卖不过来了我今年我上新到现在我春夏款我光裤子类目300个品光裤子单品有300条你想一下开发能力这么强的抖音上面有几家。"
    output_dir = r'D:\gxm\desktop'

    # Run the clone + download flow.
    result_path, used_voice_id = voice_clone_and_download(audio_file_path=input_audio_path, group_id=group_id,
                                                          api_key=api_key, text=text, output_dir=output_dir)