Add files via upload

2026-01-19 18:41:52 +00:00 · 2023-05-28 22:58:33 +08:00
parent 7789c46ded
commit f1730d42d4
4 changed files with 455 additions and 56 deletions
--- a/vc_infer_pipeline.py
+++ b/vc_infer_pipeline.py
@@ -2,7 +2,7 @@ import numpy as np, parselmouth, torch, pdb
 from time import time as ttime
 import torch.nn.functional as F
 import scipy.signal as signal
-import pyworld, os, traceback, faiss, librosa
+import pyworld, os, traceback, faiss, librosa,torchcrepe
 from scipy import signal
 from functools import lru_cache

@@ -103,6 +103,27 @@ class VC(object):
            f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
            if filter_radius > 2:
                f0 = signal.medfilt(f0, 3)
+        elif f0_method == "crepe":
+            model = "full"
+            # Pick a batch size that doesn't cause memory errors on your gpu
+            batch_size = 512
+            # Compute pitch on the device configured for this pipeline (self.device)
+            audio = torch.tensor(np.copy(x))[None].float()
+            f0, pd = torchcrepe.predict(
+                audio,
+                self.sr,
+                self.window,
+                f0_min,
+                f0_max,
+                model,
+                batch_size=batch_size,
+                device=self.device,
+                return_periodicity=True,
+            )
+            pd = torchcrepe.filter.median(pd, 3)
+            f0 = torchcrepe.filter.mean(f0, 3)
+            f0[pd < 0.1] = 0
+            f0 = f0[0].cpu().numpy()
        f0 *= pow(2, f0_up_key / 12)
        # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        tf0 = self.sr // self.window  # 每秒f0点数
@@ -141,6 +162,7 @@ class VC(object):
        big_npy,
        index_rate,
        version,
+        protect
    ):  # ,file_index,file_big_npy
        feats = torch.from_numpy(audio0)
        if self.is_half:
@@ -162,7 +184,8 @@ class VC(object):
        with torch.no_grad():
            logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
-
+        if(protect<0.5):
+            feats0=feats.clone()
        if (
            isinstance(index, type(None)) == False
            and isinstance(big_npy, type(None)) == False
@@ -188,6 +211,8 @@ class VC(object):
            )

        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+        if(protect<0.5):
+            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        t1 = ttime()
        p_len = audio0.shape[0] // self.window
        if feats.shape[1] < p_len:
@@ -195,6 +220,14 @@ class VC(object):
            if pitch != None and pitchf != None:
                pitch = pitch[:, :p_len]
                pitchf = pitchf[:, :p_len]
+
+        if(protect<0.5):
+            pitchff = pitchf.clone()
+            pitchff[pitchf > 0] = 1
+            pitchff[pitchf < 1] = protect
+            pitchff = pitchff.unsqueeze(-1)
+            feats = feats * pitchff + feats0 * (1 - pitchff)
+            feats=feats.to(feats0.dtype)
        p_len = torch.tensor([p_len], device=self.device).long()
        with torch.no_grad():
            if pitch != None and pitchf != None:
@@ -235,6 +268,7 @@ class VC(object):
        resample_sr,
        rms_mix_rate,
        version,
+        protect,
        f0_file=None,
    ):
        if (
@@ -322,6 +356,7 @@ class VC(object):
                        big_npy,
                        index_rate,
                        version,
+                        protect
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            else:
@@ -338,6 +373,7 @@ class VC(object):
                        big_npy,
                        index_rate,
                        version,
+                        protect
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            s = t
@@ -355,6 +391,7 @@ class VC(object):
                    big_npy,
                    index_rate,
                    version,
+                    protect
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        else:
@@ -371,6 +408,7 @@ class VC(object):
                    big_npy,
                    index_rate,
                    version,
+                    protect
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        audio_opt = np.concatenate(audio_opt)