import os
import shutil
import warnings
from collections import Counter

import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import torch

from model import EvalNet
from utils import (
    get_modelist,
    find_files,
    embed_img,
    _L,
    SAMPLE_RATE,
    TEMP_DIR,
    TRANSLATE,
    CLASSES,
    EN_US,
)


def circular_padding(spec: np.ndarray, end: int):
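    """Repeat `spec` cyclically until its length reaches `end`.

    Returns `spec` unchanged when it is already long enough; e.g.
    circular_padding(np.array([1, 2, 3]), 7) gives array([1, 2, 3, 1, 2, 3, 1]).
    """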
    size = len(spec)
    if end <= size:
        return spec
    num_padding = end - size
    num_repeat = num_padding // size + int(num_padding % size != 0)
    padding = np.tile(spec, num_repeat)
    return np.concatenate((spec, padding))[:end]


def wav2mel(audio_path: str, width=3):
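    """Save a log-Mel spectrogram image for each `width`-second slice of the audio.

    The waveform is circularly padded to a whole number of slices; each slice
    is rendered with librosa.display.specshow and written to TEMP_DIR as a .jpg.
    """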
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    total_frames = len(y)
    # Pad the waveform so it divides evenly into width-second windows
    if total_frames % (width * sr) != 0:
        count = total_frames // (width * sr) + 1
        y = circular_padding(y, count * width * sr)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    dur = librosa.get_duration(y=y, sr=sr)
    total_frames = log_mel_spec.shape[1]
    step = int(width * total_frames / dur)
    count = int(total_frames / step)
    begin = int(0.5 * (total_frames - count * step))
    end = begin + step * count
    # Render each width-second slice as an image for the classifier
    for i in range(begin, end, step):
        librosa.display.specshow(log_mel_spec[:, i : i + step])
        plt.axis("off")
        plt.savefig(
            f"{TEMP_DIR}/{i}.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()


def wav2cqt(audio_path: str, width=3):
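    """Save a log-power CQT spectrogram image for each `width`-second slice of the audio."""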
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    total_frames = len(y)
    if total_frames % (width * sr) != 0:
        count = total_frames // (width * sr) + 1
        y = circular_padding(y, count * width * sr)
    cqt_spec = librosa.cqt(y=y, sr=sr)
    log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
    dur = librosa.get_duration(y=y, sr=sr)
    total_frames = log_cqt_spec.shape[1]
    step = int(width * total_frames / dur)
    count = int(total_frames / step)
    begin = int(0.5 * (total_frames - count * step))
    end = begin + step * count
    for i in range(begin, end, step):
        librosa.display.specshow(log_cqt_spec[:, i : i + step])
        plt.axis("off")
        plt.savefig(
            f"{TEMP_DIR}/{i}.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()


def wav2chroma(audio_path: str, width=3):
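    """Save a log-power chromagram image for each `width`-second slice of the audio."""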
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    total_frames = len(y)
    if total_frames % (width * sr) != 0:
        count = total_frames // (width * sr) + 1
        y = circular_padding(y, count * width * sr)
    chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
    log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
    dur = librosa.get_duration(y=y, sr=sr)
    total_frames = log_chroma_spec.shape[1]
    step = int(width * total_frames / dur)
    count = int(total_frames / step)
    begin = int(0.5 * (total_frames - count * step))
    end = begin + step * count
    for i in range(begin, end, step):
        librosa.display.specshow(log_chroma_spec[:, i : i + step])
        plt.axis("off")
        plt.savefig(
            f"{TEMP_DIR}/{i}.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()


def most_frequent_value(lst: list):
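    """Return the mode of `lst` (the first element reaching the highest count).

    For non-empty input this is equivalent to Counter(lst).most_common(1)[0][0].
    """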
    counter = Counter(lst)
    max_count = max(counter.values())
    for element, count in counter.items():
        if count == max_count:
            return element
    return None


def infer(wav_path: str, log_name: str, folder_path=TEMP_DIR):
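    """Classify the guzheng playing technique of `wav_path` with the model `log_name`.

    The audio is rendered into spectrogram slice images, each slice is
    classified, and the per-slice predictions are majority-voted.
    Returns a (status, filename, result) tuple.
    """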
    status = "Success"
    filename = result = None
    try:
        if os.path.exists(folder_path):
            shutil.rmtree(folder_path)
        if not wav_path:
            raise ValueError("请输入音频!")  # "Please upload an audio file!"
        # The spectrogram type (mel / cqt / chroma) is encoded in the log name
        spec = log_name.split("_")[-3]
        os.makedirs(folder_path, exist_ok=True)
        model = EvalNet(log_name, len(TRANSLATE)).model
        # Dispatch by spectrogram name instead of eval() for safety
        {"mel": wav2mel, "cqt": wav2cqt, "chroma": wav2chroma}[spec](wav_path)
        jpgs = find_files(folder_path, ".jpg")
        preds = []
        for jpg in jpgs:
            image = embed_img(jpg)  # renamed to avoid shadowing the built-in input()
            with torch.no_grad():
                output: torch.Tensor = model(image)
            # .item() yields plain ints, so equal predictions hash equally
            # in the Counter used for majority voting (tensors hash by identity)
            preds.append(torch.max(output.data, 1)[1].item())
        pred_id = most_frequent_value(preds)
        filename = os.path.basename(wav_path)
        result = (
            CLASSES[pred_id].capitalize()
            if EN_US
            else f"{TRANSLATE[CLASSES[pred_id]]} ({CLASSES[pred_id].capitalize()})"
        )
    except Exception as e:
        status = f"{e}"
    return status, filename, result
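

# A minimal sketch of calling infer() directly (outside the Gradio UI),
# assuming a local file demo.wav exists and at least one trained log is found:
#
#     logs = get_modelist(assign_model="vit_l_16_mel")
#     status, filename, result = infer("demo.wav", logs[0])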


if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    models = get_modelist(assign_model="vit_l_16_mel")
    examples = []
    example_wavs = find_files()
    for wav in example_wavs:
        examples.append([wav, models[0]])

    with gr.Blocks() as demo:
        gr.Interface(
            fn=infer,
            inputs=[
                gr.Audio(label=_L("上传录音"), type="filepath"),  # "Upload a recording"
                gr.Dropdown(
                    choices=models, label=_L("选择模型"), value=models[0]  # "Select a model"
                ),
            ],
            outputs=[
                gr.Textbox(label=_L("状态栏"), show_copy_button=True),  # "Status bar"
                gr.Textbox(label=_L("音频文件名"), show_copy_button=True),  # "Audio file name"
                gr.Textbox(
                    label=_L("古筝演奏技法识别"),  # "Guzheng playing technique recognition"
                    show_copy_button=True,
                ),
            ],
            examples=examples,
            cache_examples=False,
            flagging_mode="never",
            title=_L("建议录音时长保持在 3s 左右"),  # "Keep recordings around 3 s long"
        )
        gr.Markdown(
            f"# {_L('引用')}"  # "Citation"
            + """
```bibtex
@article{Zhou-2025,
    author  = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
    title   = {CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research},
    journal = {Transactions of the International Society for Music Information Retrieval},
    volume  = {8},
    number  = {1},
    pages   = {22--38},
    month   = {Mar},
    year    = {2025},
    url     = {https://doi.org/10.5334/tismir.194},
    doi     = {10.5334/tismir.194}
}
```"""
        )

    demo.launch()