Skip to content

Commit 6ec8fbb

Browse files
committed
feat: beta release
1 parent e64986b commit 6ec8fbb

8 files changed

Lines changed: 378 additions & 68 deletions

File tree

README.md

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
<div align="center">
2+
3+
# JSON RVC Inference
4+
5+
</div>
6+
7+
### Information
8+
JSON RVC Inference is the same as the [advanced version of RVC](https://github.com/ArkanDash/Advanced-RVC-Inference), but it uses a JSON file to select the desired model to download and load.
9+
**This inference is currently in an unstable state. Please use it at your own risk!**
10+
11+
Please support the original RVC. This inference would not have been possible without it.<br />
12+
[![Original RVC Repository](https://img.shields.io/badge/Github-Original%20RVC%20Repository-blue?style=for-the-badge&logo=github)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)
13+
14+
#### Features
15+
- Support V1 & V2 Model ✅
16+
- Model downloader using JSON file [Internet required for downloading voice model]
17+
- Youtube Audio Downloader ✅
18+
- Voice Splitter [Internet required for downloading splitter model]
19+
- Microphone Support ✅
20+
- TTS Support ✅
21+
22+
### Installation
23+
24+
1. Install Dependencies <br />
25+
```bash
26+
pip install torch torchvision torchaudio
27+
28+
pip install -r requirements.txt
29+
```
30+
2. Install [ffmpeg](https://ffmpeg.org/)
31+
32+
3. Download [Hubert Model](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/hubert_base.pt)
33+
34+
4. **[Optional]** To use rmvpe pitch extraction, download this [rmvpe.pt](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/rmvpe.pt)
35+
36+
### Run WebUI <br />
37+
38+
For Windows:
39+
```bash
40+
Open run.bat
41+
```
42+
For other platforms:
43+
```bash
44+
python infer.py
45+
```

configs/config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ def __init__(self):
5656
self.noparallel,
5757
self.noautoopen,
5858
self.dml,
59+
self.force_support,
5960
) = self.arg_parse()
6061
self.instead = ""
6162
self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
@@ -75,6 +76,7 @@ def arg_parse() -> tuple:
7576
parser.add_argument("--port", type=int, default=7865, help="Listen port")
7677
parser.add_argument("--pycmd", type=str, default=exe, help="Python command")
7778
parser.add_argument("--colab", action="store_true", help="Launch in colab")
79+
parser.add_argument("--force_support", action="store_true", help="Force unsupported feature such as crepe")
7880
parser.add_argument(
7981
"--noparallel", action="store_true", help="Disable parallel processing"
8082
)
@@ -99,6 +101,7 @@ def arg_parse() -> tuple:
99101
cmd_opts.noparallel,
100102
cmd_opts.noautoopen,
101103
cmd_opts.dml,
104+
cmd_opts.force_support,
102105
)
103106

104107
# has_mps is only available in nightly pytorch (for now) and macOS 12.3+.

inference.py renamed to infer.py

Lines changed: 149 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
now_dir = os.getcwd()
44
sys.path.append(now_dir)
55
from infer.modules.vc.modules import VC
6+
from infer.modules.vc.utils import download_and_split_audio, combine_audio
7+
from infer.lib.setting import change_audio_mode, show_description, use_microphone
68
from configs.config import Config
79
import numpy as np
810
import gradio as gr
@@ -12,12 +14,17 @@
1214
import logging
1315
import zipfile
1416
import glob
17+
import asyncio
18+
import edge_tts
1519

1620
logging.getLogger("numba").setLevel(logging.WARNING)
1721
logger = logging.getLogger(__name__)
1822
config = Config()
1923
vc = VC(config)
2024

25+
tts_voice_list = asyncio.new_event_loop().run_until_complete(edge_tts.list_voices())
26+
tts_voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
27+
2128
os.makedirs("models", exist_ok=True)
2229
os.makedirs(os.path.join("models", "weights"), exist_ok=True)
2330
os.makedirs(os.path.join("models", "indexs"), exist_ok=True)
@@ -28,6 +35,26 @@
2835
indexs_path = os.path.join("models", "indexs")
2936
covers_path = os.path.join("models", "covers")
3037

38+
force_support = None
39+
if config.force_support is False:
40+
if config.device == "mps" or config.device == "cpu":
41+
force_support = False
42+
else:
43+
print("\033[93mWARNING: Unsupported feature is enabled.\033[0m")
44+
print("\033[93mWARNING: It may not work properly.\033[0m")
45+
force_support = True
46+
47+
audio_mode = []
48+
f0method_mode = []
49+
f0method_info = ""
50+
51+
if force_support is False:
52+
audio_mode = ["Upload audio", "Input path", "TTS Audio"]
53+
f0method_mode = ["pm", "rmvpe", "harvest"]
54+
else:
55+
audio_mode = ["Upload audio", "Input path", "Youtube", "TTS Audio"]
56+
f0method_mode = ["pm", "rmvpe", "harvest", "crepe"]
57+
3158
json_files = []
3259
for root, dirs, files in os.walk(models_path):
3360
for file in files:
@@ -105,21 +132,22 @@ def clean():
105132
return {"value": "", "__type__": "update"}
106133

107134
with gr.Blocks(title="RVC WebUI", theme=gr.themes.Base()) as app:
108-
gr.Markdown("## RVC WebUI")
109-
gr.Markdown(
110-
value="Alpha testing."
111-
)
135+
gr.Markdown("<center> # RVC WebUI")
136+
gr.Markdown("v1.0.0Beta")
112137
with gr.Tabs():
113138
with gr.TabItem("Inference"):
114139
with gr.Row():
115140
modelSelect = gr.Dropdown(label="Model", choices=sorted(modelList))
116141
downloadModel = gr.Button("Download Model", variant="primary")
117142
downloadModel.click(fn=download_model, inputs=[modelSelect], outputs=[])
118143
with gr.Row():
119-
sid0 = gr.Dropdown(label="Selected Model", choices=sorted(names))
120-
with gr.Column():
121-
refresh_button = gr.Button("Refresh model", variant="primary")
122-
clean_button = gr.Button("Clean memory", variant="primary")
144+
sid0 = gr.Dropdown(label="Selected Model", choices=sorted(names), allow_custom_value=False)
145+
file_index = gr.Dropdown(
146+
label="Index file dropdown",
147+
choices=sorted(indexs),
148+
allow_custom_value=False,
149+
interactive=True,
150+
)
123151
spk_item = gr.Slider(
124152
minimum=0,
125153
maximum=2333,
@@ -129,33 +157,44 @@ def clean():
129157
visible=True,
130158
interactive=True,
131159
)
132-
clean_button.click(
133-
fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean"
134-
)
160+
with gr.Column():
161+
refresh_button = gr.Button("Refresh model", variant="primary")
162+
clean_button = gr.Button("Clean memory", variant="primary")
163+
clean_button.click(fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean")
135164
with gr.TabItem("Inference Setting"):
136165
with gr.Row():
137166
with gr.Column():
138-
vc_transform0 = gr.Number(
139-
label="Transpose", value=0
140-
)
141-
input_audio0 = gr.Textbox(
142-
label="Audio Input Path",
143-
placeholder="C:\\Users\\Desktop\\audio_example.wav",
144-
)
145-
file_index = gr.Dropdown(
146-
label="Index file dropdown",
147-
choices=sorted(indexs),
148-
interactive=True,
149-
)
167+
vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio", visible=True, interactive=True)
168+
# Upload Audio
169+
vc_upload = gr.Audio(label="Upload audio file", sources="upload", visible=True, interactive=True)
170+
vc_microphone_mode = gr.Checkbox(label="Use Microphone", value=False, visible=True, interactive=True)
171+
# Audio Path
172+
vc_audio_input = gr.Textbox(label="Audio Input Path", placeholder="C:\\Users\\Desktop\\audio_example.wav", visible=False, interactive=True)
173+
# Youtube Audio
174+
vc_link = gr.Textbox(label="Youtube URL", visible=False, placeholder="https://www.youtube.com/watch?v=...", interactive=True)
175+
vc_split_model = gr.Dropdown(label="Splitter Model", choices=["hdemucs_mmi", "htdemucs", "htdemucs_ft", "mdx", "mdx_extra", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", interactive=True)
176+
vc_download_button = gr.Button("Download Audio", variant="primary", visible=False)
177+
vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False)
178+
# TTS Audio
179+
vc_tts_text = gr.Textbox(label="TTS text", placeholder="Hello world", visible=False, interactive=True)
180+
vc_tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=tts_voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female", interactive=True)
181+
with gr.Column():
150182
f0method0 = gr.Radio(
151183
label="Pitch extraction algorithm",
152-
choices=["pm", "harvest", "crepe", "rmvpe"]
184+
choices=f0method_mode
153185
if config.dml == False
154186
else ["pm", "harvest", "rmvpe"],
155187
value="rmvpe",
156188
interactive=True,
157189
)
158-
with gr.Column():
190+
vc_transform0 = gr.Slider(
191+
label="Transpose",
192+
minimum=-256,
193+
maximum=256,
194+
step=0.01,
195+
value=0,
196+
interactive=True,
197+
)
159198
index_rate = gr.Slider(
160199
minimum=0,
161200
maximum=1,
@@ -194,10 +233,6 @@ def clean():
194233
step=1,
195234
interactive=True,
196235
)
197-
f0_file = gr.File(
198-
label="F0 curve file (Optional)",
199-
visible=False,
200-
)
201236
refresh_button.click(
202237
fn=change_choices,
203238
inputs=[],
@@ -206,32 +241,91 @@ def clean():
206241
)
207242
with gr.Column():
208243
but0 = gr.Button("Run", variant="primary")
209-
vc_output1 = gr.Textbox(label="Output Log")
210-
vc_output2 = gr.Audio(label="Output Audio")
211-
but0.click(
212-
vc.vc_single,
213-
[
214-
spk_item,
215-
input_audio0,
216-
vc_transform0,
217-
f0_file,
218-
f0method0,
219-
file_index,
220-
index_rate,
221-
filter_radius0,
222-
resample_sr0,
223-
rms_mix_rate0,
224-
protect0,
225-
],
226-
[vc_output1, vc_output2],
227-
api_name="infer_convert",
228-
)
229-
sid0.change(
230-
fn=vc.get_vc,
231-
inputs=[sid0, protect0],
232-
outputs=[spk_item, protect0, file_index],
233-
api_name="infer_change_voice",
234-
)
244+
vc_output = gr.Audio(label="Output Audio")
245+
vc_combined = gr.Button("Combine", variant="primary")
246+
vc_combined_output = gr.Audio(label="Combined Audio")
247+
with gr.TabItem("Log"):
248+
gr.Markdown("## Log")
249+
vc_log = gr.Textbox(label="Output Log")
250+
with gr.TabItem("Settings"):
251+
gr.Markdown("## Setting")
252+
description_mode = gr.Checkbox(label="Show description", value=False)
253+
description_mode.change(
254+
fn=show_description,
255+
inputs=description_mode,
256+
outputs=[
257+
vc_audio_input,
258+
vc_link,
259+
vc_split_model,
260+
vc_tts_text,
261+
vc_tts_voice,
262+
f0method0,
263+
vc_transform0,
264+
index_rate,
265+
resample_sr0,
266+
rms_mix_rate0,
267+
protect0,
268+
filter_radius0
269+
]
270+
)
271+
but0.click(
272+
vc.vc_single,
273+
[
274+
spk_item,
275+
vc_audio_input,
276+
vc_upload,
277+
vc_tts_text,
278+
vc_tts_voice,
279+
vc_transform0,
280+
f0method0,
281+
file_index,
282+
index_rate,
283+
filter_radius0,
284+
resample_sr0,
285+
rms_mix_rate0,
286+
protect0,
287+
],
288+
[vc_log, vc_output],
289+
api_name="infer_convert",
290+
)
291+
sid0.change(
292+
fn=vc.get_vc,
293+
inputs=[sid0, protect0],
294+
outputs=[spk_item, protect0, file_index],
295+
api_name="infer_change_voice",
296+
)
297+
vc_microphone_mode.change(
298+
fn=use_microphone,
299+
inputs=vc_microphone_mode,
300+
outputs=vc_upload
301+
)
302+
vc_download_button.click(
303+
fn=download_and_split_audio,
304+
inputs=[vc_link, vc_split_model],
305+
outputs=[vc_vocal_preview, vc_log]
306+
)
307+
vc_combined.click(
308+
fn=combine_audio,
309+
inputs=[vc_split_model],
310+
outputs=[vc_combined_output, vc_log]
311+
)
312+
vc_audio_mode.change(
313+
fn=change_audio_mode,
314+
inputs=[vc_audio_mode],
315+
outputs=[
316+
vc_upload,
317+
vc_microphone_mode,
318+
vc_audio_input,
319+
vc_link,
320+
vc_split_model,
321+
vc_download_button,
322+
vc_vocal_preview,
323+
vc_tts_text,
324+
vc_tts_voice,
325+
vc_combined,
326+
vc_combined_output,
327+
]
328+
)
235329
if config.iscolab:
236330
app.queue(max_size=1022).launch(share=True)
237331
else:

0 commit comments

Comments
 (0)