# Modified get_pitch and auto_tune methods in extract.py v2

    def get_pitch(self, wav_path, legacy=True):
        """Return the pair of pitch tracks computed by ``self._torchcrepe_f0``.

        Args:
            wav_path: Passed through unchanged to ``self._torchcrepe_f0``.
            legacy: Accepted for call compatibility; this method does not
                consult it.

        Returns:
            A 2-tuple in the order the helper yields it. NOTE(review): going
            by the original local names, presumably the contour with and
            without zero-valued frames -- confirm against ``_torchcrepe_f0``.
        """
        # Unpacking (rather than returning the helper's result directly)
        # enforces that exactly two values come back.
        with_zeroes, without_zeroes = self._torchcrepe_f0(wav_path)
        pitch_tracks = (with_zeroes, without_zeroes)
        return pitch_tracks

    def auto_tune(self, audio_np, audio_torch, f0s_wo_silence):
        """Pitch-correct ``audio_np`` toward ``f0s_wo_silence``; return int16 PCM.

        The pitch of ``audio_torch`` is estimated with ``torchcrepe.predict``.
        The target contour is then transposed by whichever power-of-two factor
        (1/8 .. 8) is closest to the ratio of the mean estimated pitch to the
        mean target pitch, and the audio is resynthesized with
        ``psola.vocode``. The result is peak-normalized and converted to
        16-bit samples.

        Args:
            audio_np: Waveform handed to ``psola.vocode`` at 22050 Hz.
                NOTE(review): presumably a 1-D float array -- confirm.
            audio_torch: Tensor handed to ``torchcrepe.predict`` at 22050 Hz.
                NOTE(review): presumably shaped (1, samples) on an int16
                scale -- confirm against the caller.
            f0s_wo_silence: Sequence of target pitch values, one per frame.

        Returns:
            ``numpy.ndarray`` of dtype ``int16`` with the tuned audio.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        output_freq = torchcrepe.predict(
            # The int16 round-trip truncates samples to whole numbers before
            # they reach the model.
            audio_torch.type(torch.int16).type(torch.float32),
            22050,
            hop_length=256,
            fmin=50,
            fmax=800,
            model="full",
            decoder=torchcrepe.decode.viterbi,
            batch_size=128,
            device=device,
        )

        # The slice caps the estimate at the target's length, so from here on
        # the estimate is never longer than the target.
        output_freq = output_freq.squeeze(0).cpu().numpy()[: len(f0s_wo_silence)]
        output_pitch = torch.from_numpy(output_freq.astype(np.float32))
        target_pitch = torch.FloatTensor(f0s_wo_silence)

        # Move the target into the register of the recorded audio. A
        # non-finite ratio (empty input, zero or NaN mean) carries no register
        # information, so the target is then left untransposed instead of
        # silently falling back to the first candidate.
        factor = torch.mean(output_pitch) / torch.mean(target_pitch)
        if bool(torch.isfinite(factor)):
            ratio = float(factor)
            octaves = [0.125, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0]
            nearest_octave = min(octaves, key=lambda octave: abs(octave - ratio))
            # Out-of-place multiply so the caller's data is never modified.
            target_pitch = target_pitch * nearest_octave

        # Only truncation can be needed (see the slice above); this is a no-op
        # when both contours already have the same length.
        target_pitch = target_pitch[: output_pitch.shape[0]]

        audio_np = psola.vocode(audio_np, 22050, target_pitch=target_pitch).astype(
            np.float32
        )

        # Soft peak normalization: the 0.9 exponent leaves the peak at
        # peak ** 0.1 rather than exactly 1.0. Silent (or empty) audio has no
        # peak to normalize against, so it is passed through with unit gain
        # instead of dividing by zero.
        peak = np.max(np.abs(audio_np), initial=0.0)
        gain = (1.0 / peak) ** 0.9 if peak > 0 else 1.0
        scaled = audio_np * gain * 32768.0

        # +32768 does not fit in int16; clip so full-scale positive samples
        # saturate instead of wrapping around to negative full-scale.
        return np.clip(scaled, -32768.0, 32767.0).astype(np.int16)