mxivideo/python_core/audio_processing/core.py

#!/usr/bin/env python3
"""
Audio Processing Core Module
This module provides audio processing functionality using Librosa, Pydub, and other audio libraries.
"""
import argparse
import json
import sys
from pathlib import Path
from typing import Dict, Any, List, Tuple
import numpy as np
from pydub import AudioSegment
import librosa
import librosa.display
import matplotlib.pyplot as plt
from ..config import settings
from ..utils import setup_logger, validate_audio_file
logger = setup_logger(__name__)


class AudioProcessor:
    """Main audio processing class."""

    def __init__(self):
        self.temp_dir = settings.temp_dir
        self.cache_dir = settings.cache_dir
        self.sample_rate = settings.default_sample_rate

    def analyze_audio(self, file_path: str, analysis_type: str) -> Dict[str, Any]:
        """
        Analyze audio file with specified analysis type.

        Args:
            file_path: Path to audio file
            analysis_type: Type of analysis to perform

        Returns:
            Dictionary with analysis results
        """
        try:
            if not validate_audio_file(file_path):
                raise ValueError(f"Invalid audio file: {file_path}")
            logger.info(f"Analyzing audio: {analysis_type} on {file_path}")
            # Load audio
            y, sr = librosa.load(file_path, sr=self.sample_rate)
            if analysis_type == "rhythm":
                result = self._analyze_rhythm(y, sr)
            elif analysis_type == "spectral":
                result = self._analyze_spectral(y, sr)
            elif analysis_type == "tempo":
                result = self._analyze_tempo(y, sr)
            elif analysis_type == "pitch":
                result = self._analyze_pitch(y, sr)
            elif analysis_type == "energy":
                result = self._analyze_energy(y, sr)
            elif analysis_type == "mfcc":
                result = self._analyze_mfcc(y, sr)
            else:
                raise ValueError(f"Unknown analysis type: {analysis_type}")
            return {
                "status": "success",
                "file_path": file_path,
                "analysis_type": analysis_type,
                "sample_rate": sr,
                "duration": len(y) / sr,
                "result": result
            }
        except Exception as e:
            logger.error(f"Audio analysis failed: {str(e)}")
            return {
                "status": "error",
                "error": str(e)
            }

    def _analyze_rhythm(self, y: np.ndarray, sr: int) -> Dict[str, Any]:
        """Analyze rhythm and beat tracking."""
        # Beat tracking
        tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
        beat_times = librosa.frames_to_time(beats, sr=sr)
        # Onset detection
        onset_frames = librosa.onset.onset_detect(y=y, sr=sr)
        onset_times = librosa.frames_to_time(onset_frames, sr=sr)
        # Rhythm patterns (guard against clips with fewer than two beats)
        beat_intervals = np.diff(beat_times)
        if len(beat_intervals) > 0 and np.mean(beat_intervals) > 0:
            rhythm_stability = 1.0 - np.std(beat_intervals) / np.mean(beat_intervals)
            average_beat_interval = float(np.mean(beat_intervals))
        else:
            rhythm_stability = 0.0
            average_beat_interval = 0.0
        return {
            "tempo": float(tempo),
            "beat_count": len(beats),
            "beat_times": beat_times.tolist(),
            "onset_count": len(onset_frames),
            "onset_times": onset_times.tolist(),
            "rhythm_stability": float(rhythm_stability),
            "average_beat_interval": average_beat_interval
        }

    def _analyze_spectral(self, y: np.ndarray, sr: int) -> Dict[str, Any]:
        """Analyze spectral features."""
        # Spectral centroid
        spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        # Spectral rolloff
        spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
        # Spectral bandwidth
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
        # Zero crossing rate
        zcr = librosa.feature.zero_crossing_rate(y)[0]
        return {
            "spectral_centroid_mean": float(np.mean(spectral_centroids)),
            "spectral_centroid_std": float(np.std(spectral_centroids)),
            "spectral_rolloff_mean": float(np.mean(spectral_rolloff)),
            "spectral_rolloff_std": float(np.std(spectral_rolloff)),
            "spectral_bandwidth_mean": float(np.mean(spectral_bandwidth)),
            "spectral_bandwidth_std": float(np.std(spectral_bandwidth)),
            "zero_crossing_rate_mean": float(np.mean(zcr)),
            "zero_crossing_rate_std": float(np.std(zcr))
        }

    def _analyze_tempo(self, y: np.ndarray, sr: int) -> Dict[str, Any]:
        """Analyze tempo and timing."""
        # Tempo estimation
        tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
        # Dynamic tempo analysis (newer librosa releases also expose this
        # function as librosa.feature.rhythm.tempo)
        hop_length = 512
        tempo_dynamic = librosa.beat.tempo(
            onset_envelope=librosa.onset.onset_strength(y=y, sr=sr),
            sr=sr,
            hop_length=hop_length,
            aggregate=None  # keep per-frame estimates so the statistics below are meaningful
        )
        return {
            "tempo": float(tempo),
            "tempo_confidence": float(np.std(tempo_dynamic)),
            "tempo_stability": float(1.0 - np.std(tempo_dynamic) / np.mean(tempo_dynamic)) if np.mean(tempo_dynamic) > 0 else 0.0,
            "beat_count": len(beats)
        }

    def _analyze_pitch(self, y: np.ndarray, sr: int) -> Dict[str, Any]:
        """Analyze pitch and harmonic content."""
        # Pitch tracking using piptrack
        pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
        # Extract fundamental frequency
        pitch_values = []
        for t in range(pitches.shape[1]):
            index = magnitudes[:, t].argmax()
            pitch = pitches[index, t]
            if pitch > 0:
                pitch_values.append(pitch)
        if pitch_values:
            pitch_mean = np.mean(pitch_values)
            pitch_std = np.std(pitch_values)
            pitch_range = np.max(pitch_values) - np.min(pitch_values)
        else:
            pitch_mean = pitch_std = pitch_range = 0.0
        # Harmonic-percussive separation
        y_harmonic, y_percussive = librosa.effects.hpss(y)
        harmonic_ratio = np.mean(np.abs(y_harmonic)) / (np.mean(np.abs(y)) + 1e-8)
        return {
            "pitch_mean": float(pitch_mean),
            "pitch_std": float(pitch_std),
            "pitch_range": float(pitch_range),
            "pitch_count": len(pitch_values),
            "harmonic_ratio": float(harmonic_ratio)
        }

    def _analyze_energy(self, y: np.ndarray, sr: int) -> Dict[str, Any]:
        """Analyze energy and dynamics."""
        # RMS energy
        rms = librosa.feature.rms(y=y)[0]
        # Short-time energy
        frame_length = 2048
        hop_length = 512
        energy = []
        for i in range(0, len(y) - frame_length, hop_length):
            frame = y[i:i + frame_length]
            energy.append(np.sum(frame ** 2))
        # Guard against clips shorter than a single frame
        energy = np.array(energy) if energy else np.zeros(1)
        # Dynamic range
        dynamic_range = np.max(rms) - np.min(rms)
        return {
            "rms_mean": float(np.mean(rms)),
            "rms_std": float(np.std(rms)),
            "rms_max": float(np.max(rms)),
            "rms_min": float(np.min(rms)),
            "energy_mean": float(np.mean(energy)),
            "energy_std": float(np.std(energy)),
            "dynamic_range": float(dynamic_range)
        }

    def _analyze_mfcc(self, y: np.ndarray, sr: int) -> Dict[str, Any]:
        """Analyze MFCC (Mel-frequency cepstral coefficients)."""
        # Extract MFCC features
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        # Statistical features for each MFCC coefficient
        mfcc_features = {}
        for i in range(mfccs.shape[0]):
            mfcc_features[f"mfcc_{i+1}_mean"] = float(np.mean(mfccs[i]))
            mfcc_features[f"mfcc_{i+1}_std"] = float(np.std(mfccs[i]))
        return mfcc_features

    def process_audio(self, input_path: str, output_path: str, operation: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process audio with specified operation and parameters.

        Args:
            input_path: Path to input audio file
            output_path: Path to output audio file
            operation: Type of operation to perform
            parameters: Operation-specific parameters

        Returns:
            Dictionary with processing results
        """
        try:
            if not validate_audio_file(input_path):
                raise ValueError(f"Invalid audio file: {input_path}")
            logger.info(f"Processing audio: {operation} on {input_path}")
            # Load audio with pydub for processing
            audio = AudioSegment.from_file(input_path)
            if operation == "trim":
                result_audio = self._trim_audio(audio, parameters)
            elif operation == "volume":
                result_audio = self._adjust_volume(audio, parameters)
            elif operation == "fade":
                result_audio = self._apply_fade(audio, parameters)
            elif operation == "normalize":
                result_audio = self._normalize_audio(audio, parameters)
            elif operation == "merge":
                result_audio = self._merge_audio(parameters)
            else:
                raise ValueError(f"Unknown operation: {operation}")
            # Export result
            result_audio.export(output_path, format="wav")
            return {
                "status": "success",
                "output_path": output_path,
                "duration": len(result_audio) / 1000.0,
                "channels": result_audio.channels,
                "sample_rate": result_audio.frame_rate
            }
        except Exception as e:
            logger.error(f"Audio processing failed: {str(e)}")
            return {
                "status": "error",
                "error": str(e)
            }

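    # Illustrative parameter shapes for process_audio(), taken from the keys the
    # helper methods below read (the values shown are made-up examples):
    #   trim:      {"start_time": 1.0, "end_time": 5.0}        # seconds
    #   volume:    {"volume_db": -3.0}                          # gain change in dB
    #   fade:      {"fade_in": 0.5, "fade_out": 1.0}            # seconds
    #   normalize: {"target_db": -20.0}                         # target level in dBFS
    #   merge:     {"audio_paths": ["a.wav", "b.wav"]}          # two or more files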
    def _trim_audio(self, audio: AudioSegment, params: Dict[str, Any]) -> AudioSegment:
        """Trim audio to specified start and end times."""
        start_ms = int(params.get("start_time", 0) * 1000)
        end_ms = int(params.get("end_time", len(audio) / 1000) * 1000)
        return audio[start_ms:end_ms]

    def _adjust_volume(self, audio: AudioSegment, params: Dict[str, Any]) -> AudioSegment:
        """Adjust audio volume."""
        volume_change = params.get("volume_db", 0)
        return audio + volume_change

    def _apply_fade(self, audio: AudioSegment, params: Dict[str, Any]) -> AudioSegment:
        """Apply fade in/out effects."""
        fade_in_ms = int(params.get("fade_in", 0) * 1000)
        fade_out_ms = int(params.get("fade_out", 0) * 1000)
        result = audio
        if fade_in_ms > 0:
            result = result.fade_in(fade_in_ms)
        if fade_out_ms > 0:
            result = result.fade_out(fade_out_ms)
        return result

    def _normalize_audio(self, audio: AudioSegment, params: Dict[str, Any]) -> AudioSegment:
        """Normalize audio to the specified average level in dBFS."""
        target_dBFS = params.get("target_db", -20.0)
        # Apply exactly the gain needed to bring the clip's average level to the target.
        return audio.apply_gain(target_dBFS - audio.dBFS)

    def _merge_audio(self, params: Dict[str, Any]) -> AudioSegment:
        """Merge multiple audio files."""
        audio_paths = params.get("audio_paths", [])
        if len(audio_paths) < 2:
            raise ValueError("At least 2 audio files required for merge operation")
        result = AudioSegment.from_file(audio_paths[0])
        for path in audio_paths[1:]:
            audio = AudioSegment.from_file(path)
            result = result + audio
        return result


def main():
    """Command line interface for audio processing."""
    parser = argparse.ArgumentParser(description="Audio Processing Core")
    parser.add_argument("--file", required=True, help="Audio file path")
    parser.add_argument(
        "--analysis",
        required=True,
        help="Analysis type to perform: rhythm, spectral, tempo, pitch, energy, or mfcc"
    )
    args = parser.parse_args()
    try:
        processor = AudioProcessor()
        result = processor.analyze_audio(args.file, args.analysis)
        print(json.dumps(result))
    except Exception as e:
        error_result = {
            "status": "error",
            "error": str(e)
        }
        print(json.dumps(error_result))
        sys.exit(1)


if __name__ == "__main__":
    main()
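
# Programmatic usage sketch (illustrative; the file names are placeholders):
#     processor = AudioProcessor()
#     analysis = processor.analyze_audio("input.wav", "tempo")
#     processed = processor.process_audio(
#         "input.wav", "output.wav", "fade", {"fade_in": 0.5, "fade_out": 1.0}
#     )
# Both calls return a dict whose "status" key is "success" or "error".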