Code for How to Convert Text to Speech in Python Tutorial

tts_google.py

import gtts
from playsound import playsound

# make request to google to get synthesis
tts = gtts.gTTS("Hello world")
# save the audio file
tts.save("hello.mp3")
# play the audio file
playsound("hello.mp3")

# in spanish
tts = gtts.gTTS("Hola Mundo", lang="es")
tts.save("hola.mp3")
playsound("hola.mp3")

# all available languages along with their IETF tag
print(gtts.lang.tts_langs())

tts_pyttsx3.py

import pyttsx3

# initialize Text-to-speech engine
engine = pyttsx3.init()

# convert this text to speech
text = "Python is a great programming language"
engine.say(text)
# play the speech
engine.runAndWait()

# get details of speaking rate
rate = engine.getProperty("rate")
print(rate)

# setting new voice rate (faster)
engine.setProperty("rate", 300)
engine.say(text)
engine.runAndWait()

# slower
engine.setProperty("rate", 100)
engine.say(text)
engine.runAndWait()

# get details of all voices available
voices = engine.getProperty("voices")
print(voices)
# set another voice
engine.setProperty("voice", voices[1].id)
engine.say(text)
engine.runAndWait()

# saving speech audio into a file
engine.save_to_file(text, "python.mp3")
engine.runAndWait()

tts_openai.py

from openai import OpenAI

# initialize the OpenAI API client
api_key = "YOUR_OPENAI_API_KEY"
client = OpenAI(api_key=api_key)

# sample text to generate speech from
text = """In his miracle year, he published four groundbreaking papers. 
These outlined the theory of the photoelectric effect, explained Brownian motion, 
introduced special relativity, and demonstrated mass-energy equivalence."""

# generate speech from the text
response = client.audio.speech.create(
    model="tts-1", # the model to use, there is tts-1 and tts-1-hd
    voice="nova", # the voice to use, there is alloy, echo, fable, onyx, nova, and shimmer
    input=text, # the text to generate speech from
    speed=1.0, # the speed of the generated speech, ranging from 0.25 to 4.0
)
# save the generated speech to a file
response.stream_to_file("openai-output.mp3")

tts_transformers.py

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import random
import string
import soundfile as sf

device = "cuda" if torch.cuda.is_available() else "cpu"
# load the processor
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# load the model
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
# load the vocoder, that is the voice encoder
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
# we load this dataset to get the speaker embeddings
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# speaker ids from the embeddings dataset
speakers = {
    'awb': 0,     # Scottish male
    'bdl': 1138,  # US male
    'clb': 2271,  # US female
    'jmk': 3403,  # Canadian male
    'ksp': 4535,  # Indian male
    'rms': 5667,  # US male
    'slt': 6799   # US female
}

def save_text_to_speech(text, speaker=None):
    # preprocess text
    inputs = processor(text=text, return_tensors="pt").to(device)
    if speaker is not None:
        # load xvector containing speaker's voice characteristics from a dataset
        speaker_embeddings = torch.tensor(embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
    else:
        # random vector, meaning a random voice
        speaker_embeddings = torch.randn((1, 512)).to(device)
    # generate speech with the models
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    if speaker is not None:
        # if we have a speaker, we use the speaker's ID in the filename
        output_filename = f"{speaker}-{'-'.join(text.split()[:6])}.mp3"
    else:
        # if we don't have a speaker, we use a random string in the filename
        random_str = ''.join(random.sample(string.ascii_letters+string.digits, k=5))
        output_filename = f"{random_str}-{'-'.join(text.split()[:6])}.mp3"
    # save the generated speech to a file with 16KHz sampling rate
    sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
    # return the filename for reference
    return output_filename

# generate speech with a US female voice
save_text_to_speech("Python is my favorite programming language", speaker=speakers["slt"])
# generate speech with a random voice
save_text_to_speech("Python is my favorite programming language")

# a challenging text with all speakers
text = """In his miracle year, he published four groundbreaking papers. 
These outlined the theory of the photoelectric effect, explained Brownian motion, 
introduced special relativity, and demonstrated mass-energy equivalence."""

for speaker_name, speaker in speakers.items():
    output_filename = save_text_to_speech(text, speaker)
    print(f"Saved {output_filename}")
# random speaker
output_filename = save_text_to_speech(text)
print(f"Saved {output_filename}")

Ethical Hacking with Python EBook - About - Top

New Tutorials

How to Track Your AI Visibility with Python

How to Automate Excel Reports in Python using Openpyxl

How to Build a Website Blocker in Python

How to Analyze the Most In-Demand Tech Skills from Job Postings in Python

How to Generate and Visualize Text Embeddings in Python

Code for How to Convert Text to Speech in Python Tutorial

Tags

New Tutorials

Popular Tutorials