ArkanDash
diff --git a/‎README.md‎
Lines changed: 45 additions & 0 deletions b/‎README.md‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎configs/config.py‎
Lines changed: 3 additions & 0 deletions b/‎configs/config.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎inference.py‎ ‎infer.py‎inference.py renamed to infer.py
Lines changed: 149 additions & 55 deletions b/‎inference.py‎ ‎infer.py‎inference.py renamed to infer.py
Lines changed: 149 additions & 55 deletions
@@ -0,0 +1,45 @@
+<div align="center">
+
+# JSON RVC Inference
+
+</div>
+
+### Information
+JSON RVC Inference is the same as the [advanced version of RVC](https://github.com/ArkanDash/Advanced-RVC-Inference), but it uses a JSON file to select which model to download and load.
+**This inference is currently in an unstable state. Please use it at your own risk!**
+
+Please support the original RVC. This inference would not have been possible without it.<br />
+[![Original RVC Repository](https://img.shields.io/badge/Github-Original%20RVC%20Repository-blue?style=for-the-badge&logo=github)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)
+
+#### Features
+- Support V1 & V2 Model ✅
+- Model downloader using JSON file [Internet required for downloading voice model] ✅
+- Youtube Audio Downloader ✅
+- Voice Splitter [Internet required for downloading splitter model] ✅
+- Microphone Support ✅
+- TTS Support ✅
+
+### Installation
+
+1. Install Dependencies <br />
+```bash
+pip install torch torchvision torchaudio
+
+pip install -r requirements.txt
+```
+2. Install [ffmpeg](https://ffmpeg.org/)
+
+3. Download [Hubert Model](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/hubert_base.pt)
+
+4. **[Optional]** To use rmvpe pitch extraction, download this [rmvpe.pt](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/rmvpe.pt)
+
+### Run WebUI <br />
+
+For Windows:
+```bash
+Open run.bat
+```
+For other platforms:
+```bash
+python infer.py
+```
@@ -56,6 +56,7 @@ def __init__(self):
             self.noparallel,
             self.noautoopen,
             self.dml,
+            self.force_support,
         ) = self.arg_parse()
         self.instead = ""
         self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
@@ -75,6 +76,7 @@ def arg_parse() -> tuple:
         parser.add_argument("--port", type=int, default=7865, help="Listen port")
         parser.add_argument("--pycmd", type=str, default=exe, help="Python command")
         parser.add_argument("--colab", action="store_true", help="Launch in colab")
+        parser.add_argument("--force_support", action="store_true", help="Force unsupported feature such as crepe")
         parser.add_argument(
             "--noparallel", action="store_true", help="Disable parallel processing"
         )
@@ -99,6 +101,7 @@ def arg_parse() -> tuple:
             cmd_opts.noparallel,
             cmd_opts.noautoopen,
             cmd_opts.dml,
+            cmd_opts.force_support,
         )
 
     # has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
 
@@ -3,6 +3,8 @@
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 from infer.modules.vc.modules import VC
+from infer.modules.vc.utils import download_and_split_audio, combine_audio
+from infer.lib.setting import change_audio_mode, show_description, use_microphone
 from configs.config import Config
 import numpy as np
 import gradio as gr
@@ -12,12 +14,17 @@
 import logging
 import zipfile
 import glob
+import asyncio
+import edge_tts
 
 logging.getLogger("numba").setLevel(logging.WARNING)
 logger = logging.getLogger(__name__)
 config = Config()
 vc = VC(config)
 
+tts_voice_list = asyncio.new_event_loop().run_until_complete(edge_tts.list_voices())
+tts_voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
+
 os.makedirs("models", exist_ok=True)
 os.makedirs(os.path.join("models", "weights"), exist_ok=True)
 os.makedirs(os.path.join("models", "indexs"), exist_ok=True)
@@ -28,6 +35,26 @@
 indexs_path = os.path.join("models", "indexs")
 covers_path = os.path.join("models", "covers")
 
+force_support = None
+if config.force_support is False:
+    if config.device == "mps" or config.device == "cpu":
+        force_support = False
+else:
+    print("\033[93mWARNING: Unsupported feature is enabled.\033[0m")
+    print("\033[93mWARNING: It may not work properly.\033[0m")
+    force_support = True
+
+audio_mode = []
+f0method_mode = []
+f0method_info = ""
+
+if force_support is False:
+    audio_mode = ["Upload audio", "Input path", "TTS Audio"]
+    f0method_mode = ["pm", "rmvpe", "harvest"]
+else:
+    audio_mode = ["Upload audio", "Input path", "Youtube", "TTS Audio"]
+    f0method_mode = ["pm", "rmvpe", "harvest", "crepe"]
+
 json_files = []
 for root, dirs, files in os.walk(models_path):
     for file in files:
@@ -105,21 +132,22 @@ def clean():
     return {"value": "", "__type__": "update"}
 
 with gr.Blocks(title="RVC WebUI", theme=gr.themes.Base()) as app:
-    gr.Markdown("## RVC WebUI")
-    gr.Markdown(
-        value="Alpha testing."
-    )
+    gr.Markdown("<center> # RVC WebUI")
+    gr.Markdown("v1.0.0Beta")
     with gr.Tabs():
         with gr.TabItem("Inference"):
             with gr.Row():
                 modelSelect = gr.Dropdown(label="Model", choices=sorted(modelList))
                 downloadModel = gr.Button("Download Model", variant="primary")
                 downloadModel.click(fn=download_model, inputs=[modelSelect], outputs=[])
             with gr.Row():
-                sid0 = gr.Dropdown(label="Selected Model", choices=sorted(names))
-                with gr.Column():
-                    refresh_button = gr.Button("Refresh model", variant="primary")
-                    clean_button = gr.Button("Clean memory", variant="primary")
+                sid0 = gr.Dropdown(label="Selected Model", choices=sorted(names), allow_custom_value=False)
+                file_index = gr.Dropdown(
+                    label="Index file dropdown",
+                    choices=sorted(indexs),
+                    allow_custom_value=False,
+                    interactive=True,
+                )
                 spk_item = gr.Slider(
                     minimum=0,
                     maximum=2333,
@@ -129,33 +157,44 @@ def clean():
                     visible=True,
                     interactive=True,
                 )
-                clean_button.click(
-                    fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean"
-                )
+                with gr.Column():
+                    refresh_button = gr.Button("Refresh model", variant="primary")
+                    clean_button = gr.Button("Clean memory", variant="primary")
+                clean_button.click(fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean")
             with gr.TabItem("Inference Setting"):
                 with gr.Row():
                     with gr.Column():
-                        vc_transform0 = gr.Number(
-                            label="Transpose", value=0
-                        )
-                        input_audio0 = gr.Textbox(
-                            label="Audio Input Path",
-                            placeholder="C:\\Users\\Desktop\\audio_example.wav",
-                        )
-                        file_index = gr.Dropdown(
-                            label="Index file dropdown",
-                            choices=sorted(indexs),
-                            interactive=True,
-                        )
+                        vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio", visible=True, interactive=True)
+                        # Upload Audio
+                        vc_upload = gr.Audio(label="Upload audio file", sources="upload", visible=True, interactive=True)
+                        vc_microphone_mode = gr.Checkbox(label="Use Microphone", value=False, visible=True, interactive=True)
+                        # Audio Path
+                        vc_audio_input = gr.Textbox(label="Audio Input Path", placeholder="C:\\Users\\Desktop\\audio_example.wav", visible=False, interactive=True)
+                        # Youtube Audio
+                        vc_link = gr.Textbox(label="Youtube URL", visible=False, placeholder="https://www.youtube.com/watch?v=...", interactive=True)
+                        vc_split_model = gr.Dropdown(label="Splitter Model", choices=["hdemucs_mmi", "htdemucs", "htdemucs_ft", "mdx", "mdx_extra", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", interactive=True)
+                        vc_download_button = gr.Button("Download Audio", variant="primary", visible=False)
+                        vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False)
+                        # TTS Audio
+                        vc_tts_text = gr.Textbox(label="TTS text", placeholder="Hello world", visible=False, interactive=True)
+                        vc_tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=tts_voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female", interactive=True)
+                    with gr.Column():
                         f0method0 = gr.Radio(
                             label="Pitch extraction algorithm",
-                            choices=["pm", "harvest", "crepe", "rmvpe"]
+                            choices=f0method_mode
                             if config.dml == False
                             else ["pm", "harvest", "rmvpe"],
                             value="rmvpe",
                             interactive=True,
                         )
-                    with gr.Column():
+                        vc_transform0 = gr.Slider(
+                            label="Transpose",
+                            minimum=-256,
+                            maximum=256,
+                            step=0.01,
+                            value=0,
+                            interactive=True,
+                        )
                         index_rate = gr.Slider(
                             minimum=0,
                             maximum=1,
@@ -194,10 +233,6 @@ def clean():
                             step=1,
                             interactive=True,
                         )
-                        f0_file = gr.File(
-                            label="F0 curve file (Optional)",
-                            visible=False,
-                        )
                         refresh_button.click(
                             fn=change_choices,
                             inputs=[],
@@ -206,32 +241,91 @@ def clean():
                         )
                     with gr.Column():
                         but0 = gr.Button("Run", variant="primary")
-                        vc_output1 = gr.Textbox(label="Output Log")
-                        vc_output2 = gr.Audio(label="Output Audio")
-                        but0.click(
-                            vc.vc_single,
-                            [
-                                spk_item,
-                                input_audio0,
-                                vc_transform0,
-                                f0_file,
-                                f0method0,
-                                file_index,
-                                index_rate,
-                                filter_radius0,
-                                resample_sr0,
-                                rms_mix_rate0,
-                                protect0,
-                            ],
-                            [vc_output1, vc_output2],
-                            api_name="infer_convert",
-                        )
-                sid0.change(
-                    fn=vc.get_vc,
-                    inputs=[sid0, protect0],
-                    outputs=[spk_item, protect0, file_index],
-                    api_name="infer_change_voice",
-                )
+                        vc_output = gr.Audio(label="Output Audio")
+                        vc_combined = gr.Button("Combine", variant="primary")
+                        vc_combined_output = gr.Audio(label="Combined Audio")
+        with gr.TabItem("Log"):
+            gr.Markdown("## Log")
+            vc_log = gr.Textbox(label="Output Log")
+        with gr.TabItem("Settings"):
+            gr.Markdown("## Setting")
+            description_mode = gr.Checkbox(label="Show description", value=False)
+            description_mode.change(
+                fn=show_description,
+                inputs=description_mode,
+                outputs=[
+                    vc_audio_input,
+                    vc_link,
+                    vc_split_model,
+                    vc_tts_text,
+                    vc_tts_voice,
+                    f0method0,
+                    vc_transform0,
+                    index_rate,
+                    resample_sr0,
+                    rms_mix_rate0,
+                    protect0,
+                    filter_radius0
+                ]
+            )
+        but0.click(
+            vc.vc_single,
+            [
+                spk_item,
+                vc_audio_input,
+                vc_upload,
+                vc_tts_text,
+                vc_tts_voice,
+                vc_transform0,
+                f0method0,
+                file_index,
+                index_rate,
+                filter_radius0,
+                resample_sr0,
+                rms_mix_rate0,
+                protect0,
+            ],
+            [vc_log, vc_output],
+            api_name="infer_convert",
+        )
+        sid0.change(
+            fn=vc.get_vc,
+            inputs=[sid0, protect0],
+            outputs=[spk_item, protect0, file_index],
+            api_name="infer_change_voice",
+        )
+        vc_microphone_mode.change(
+            fn=use_microphone,
+            inputs=vc_microphone_mode,
+            outputs=vc_upload
+        )
+        vc_download_button.click(
+            fn=download_and_split_audio,
+            inputs=[vc_link, vc_split_model],
+            outputs=[vc_vocal_preview, vc_log]
+        )
+        vc_combined.click(
+            fn=combine_audio,
+            inputs=[vc_split_model],
+            outputs=[vc_combined_output, vc_log]
+        )
+        vc_audio_mode.change(
+            fn=change_audio_mode,
+            inputs=[vc_audio_mode],
+            outputs=[
+                vc_upload,
+                vc_microphone_mode,
+                vc_audio_input,
+                vc_link,
+                vc_split_model,
+                vc_download_button,
+                vc_vocal_preview,
+                vc_tts_text,
+                vc_tts_voice,
+                vc_combined,
+                vc_combined_output,
+            ]
+        )
     if config.iscolab:
         app.queue(max_size=1022).launch(share=True)
     else: