TransferRapid/CommonVoices20_ro
Viewer • Updated • 41.4k • 81 • 4
The SpeechT5Processor is then used to tokenize the text input and compute mel-spectrogram targets from the waveform. The resulting inputs are the tokenized text and the corresponding mel-spectrogram targets.
Since mel-spectrogram sequences can vary in length, all outputs are padded to a consistent shape during collation. Spectrograms are padded or trimmed to ensure dimensionality consistency (80 bins per frame), and a reduction factor is applied when necessary to adjust sequence lengths.

import os
import torch
import torchaudio
import numpy as np
import soundfile as sf
import shutil
from huggingface_hub import hf_hub_download
from transformers import (
SpeechT5Processor,
SpeechT5ForTextToSpeech,
SpeechT5HifiGan
)
from speechbrain.inference import EncoderClassifier
# ====== CONFIG ======
# Hub repository that provides both the fine-tuned checkpoint and the
# reference voice recording named by WAV_NAME.
REPO_ID = "ionut-visan/SpeechT5_ro"
WAV_NAME = "common_voice_ro_20349005.wav"

# Sentence to synthesise and the file the result is written to.
TEXT_INPUT = "Salut! Acesta este un test."
OUTPUT_WAV_PATH = "generated_speech.wav"

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
# ====== DOWNLOAD VOICE SAMPLE IF NEEDED ======
# Fetch the reference recording from the Hub, then make sure a copy sits in
# the current working directory under its original file name. An existing
# local copy is left untouched.
hub_cached_wav = hf_hub_download(repo_id=REPO_ID, filename=WAV_NAME)
target_path = os.path.join(os.getcwd(), WAV_NAME)
local_copy_present = os.path.exists(target_path)
if not local_copy_present:
    shutil.copy(hub_cached_wav, target_path)
print(f"Voice sample available at: {target_path}")
# ====== LOAD MODELS ======
print("Loading models...")

# Text/audio processor and the TTS model both come from REPO_ID.
processor = SpeechT5Processor.from_pretrained(REPO_ID)
model = SpeechT5ForTextToSpeech.from_pretrained(REPO_ID)
model = model.to(DEVICE)
model.eval()

# Vocoder passed to generate_speech() further down in this script.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(DEVICE)

# Speaker encoder used to embed the reference recording; its files are
# stored under the savedir given below.
xvector_source = "speechbrain/spkrec-xvect-voxceleb"
xvector_savedir = "/tmp/speechbrain/spkrec-xvect-voxceleb"
speaker_encoder = EncoderClassifier.from_hparams(
    source=xvector_source,
    run_opts={"device": DEVICE},
    savedir=xvector_savedir,
)
# ====== PREPROCESS VOICE SAMPLE ======
# Load the reference recording and bring it to the form used below:
# 16 kHz sample rate, a single channel, peak-normalised amplitude.
waveform, sr = torchaudio.load(target_path)
if sr != 16000:
    waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
if waveform.shape[0] > 1:
    # Down-mix multi-channel audio by averaging the channels.
    waveform = waveform.mean(dim=0, keepdim=True)

# Peak-normalise, but only when there is a non-zero peak: dividing an
# all-zero (silent) recording by its peak would be 0/0 and fill the waveform
# with NaNs, which would then propagate into the speaker embedding.
peak = waveform.abs().max()
if peak > 0:
    waveform = waveform / peak

with torch.no_grad():
    speaker_embedding = speaker_encoder.encode_batch(waveform)
    # L2-normalise along dim 2.
    speaker_embedding = torch.nn.functional.normalize(speaker_embedding, dim=2)
    # Drop the two leading axes and re-add one leading axis
    # (presumably (1, 1, D) -> (1, D); confirm against the encoder output).
    speaker_embedding = speaker_embedding.squeeze(0).squeeze(0).unsqueeze(0)
# ====== TEXT TO SPEECH INFERENCE ======
# Tokenise the sentence, then synthesise audio with gradients disabled,
# conditioning on the speaker embedding computed from the voice sample.
encoded_text = processor(text=TEXT_INPUT, return_tensors="pt").to(DEVICE)
with torch.no_grad():
    token_ids = encoded_text["input_ids"]
    voice_vector = speaker_embedding.to(DEVICE)
    generated_waveform = model.generate_speech(
        input_ids=token_ids,
        speaker_embeddings=voice_vector,
        vocoder=vocoder,
    )
# ====== SAVE TO FILE ======
# Move the samples to host memory and write them as 16-bit PCM at 16 kHz.
audio_samples = generated_waveform.cpu().numpy()
sf.write(OUTPUT_WAV_PATH, audio_samples, 16000, subtype="PCM_16")
print(f"Speech generated and saved to '{OUTPUT_WAV_PATH}'")
Base model
microsoft/speecht5_tts