```
import numpy as np
import psola
import torch
import torchcrepe


# Methods excerpted from the pitch-processing class; the module imports above
# are added here so the snippet is self-contained.

def get_pitch(self, wav_path, legacy=True):
    # `legacy` is accepted for interface compatibility but unused in this path.
    torchcrepe_zeroes, torchcrepe_nozeroes = self._torchcrepe_f0(wav_path)
    return torchcrepe_zeroes, torchcrepe_nozeroes


def auto_tune(self, audio_np, audio_torch, f0s_wo_silence):
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Estimate the F0 contour of the audio with CREPE (Viterbi decoding).
    output_freq = torchcrepe.predict(
        audio_torch.type(torch.int16).type(torch.float32),
        22050,
        hop_length=256,
        fmin=50,
        fmax=800,
        model="full",
        decoder=torchcrepe.decode.viterbi,
        # return_periodicity=True,
        batch_size=128,
        device=device,
    )
    output_freq = output_freq.squeeze(0).cpu().numpy()[: len(f0s_wo_silence)]
    output_pitch = torch.from_numpy(output_freq.astype(np.float32))
    target_pitch = torch.FloatTensor(f0s_wo_silence)

    # Scale the target contour by the octave whose ratio is closest to the
    # ratio of mean pitches, so the correction stays in the same register.
    factor = torch.mean(output_pitch) / torch.mean(target_pitch)
    octaves = [0.125, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0]
    nearest_octave = min(octaves, key=lambda x: abs(x - factor))
    target_pitch *= nearest_octave

    # Pad or truncate the target contour to match the CREPE output length.
    if len(target_pitch) < len(output_pitch):
        target_pitch = torch.nn.functional.pad(
            target_pitch,
            (0, len(output_pitch) - len(target_pitch)),
            "constant",
            0,
        )
    if len(target_pitch) > len(output_pitch):
        target_pitch = target_pitch[: len(output_pitch)]

    # Pitch-shift the audio toward the target contour with PSOLA, then
    # normalize and convert to 16-bit PCM.
    audio_np = psola.vocode(audio_np, 22050, target_pitch=target_pitch).astype(
        np.float32
    )
    normalize = (1.0 / np.max(np.abs(audio_np))) ** 0.9
    audio_np = audio_np * normalize * 32768.0
    audio_np = audio_np.astype(np.int16)
    return audio_np
```
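
To make the octave-matching step concrete, here is a minimal self-contained sketch of that logic in isolation. The pitch values and variable names are made up for illustration and are not taken from the code above:

```
import torch

# Assumed example contours (Hz); in the real method these come from CREPE
# and from the silence-filtered reference F0.
output_pitch = torch.tensor([220.0, 230.0, 225.0])
target_pitch = torch.tensor([110.0, 112.0, 111.0])

# Ratio of mean pitches, then snap it to the nearest power-of-two factor.
factor = torch.mean(output_pitch) / torch.mean(target_pitch)  # ~2.03 here
octaves = [0.125, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0]
nearest_octave = min(octaves, key=lambda x: abs(x - factor))

print(nearest_octave)                  # 2.0: shift the target up one octave
print(target_pitch * nearest_octave)   # contour moved into the same register
```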