Code for Speech Recognition using Transformers in Python Tutorial


AutomaticSpeechRecognition_PythonCodeTutorial.py

# %%
!pip install transformers==4.28.1 soundfile sentencepiece torchaudio pydub

# %%
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, WhisperProcessor, WhisperForConditionalGeneration, pipeline
import torch
import soundfile as sf
# import librosa
import os
import torchaudio

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# %% [markdown]
# # Wav2Vec2.0 Models
# 

# %%
# wav2vec2_model_name = "facebook/wav2vec2-base-960h" # 360MB
wav2vec2_model_name = "facebook/wav2vec2-large-960h-lv60-self" # pretrained 1.26GB
# wav2vec2_model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-english" # English-only, 1.26GB
# wav2vec2_model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic" # Arabic-only, 1.26GB
# wav2vec2_model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-spanish" # Spanish-only, 1.26GB

wav2vec2_processor = Wav2Vec2Processor.from_pretrained(wav2vec2_model_name)
wav2vec2_model = Wav2Vec2ForCTC.from_pretrained(wav2vec2_model_name).to(device)
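
# %% [markdown]
# As a quick sanity check (not part of the original steps), printing the parameter count gives a rough feel for the size of the model you just downloaded.

# %%
# rough size check: total number of parameters in the loaded Wav2Vec2 model
print(f"{sum(p.numel() for p in wav2vec2_model.parameters()):,} parameters")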

# %%
# audio_url = "http://www.fit.vutbr.cz/~motlicek/sympatex/f2bjrop1.0.wav"
# audio_url = "http://www.fit.vutbr.cz/~motlicek/sympatex/f2bjrop1.1.wav"
# audio_url = "http://www.fit.vutbr.cz/~motlicek/sympatex/f2btrop6.0.wav"
# audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/16-122828-0002.wav"
audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/30-4447-0004.wav"
# audio_url = "https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0060_8k.wav"
# audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/7601-291468-0006.wav"
# audio_url = "http://www0.cs.ucl.ac.uk/teaching/GZ05/samples/lathe.wav"

# %%
# load our wav file
speech, sr = torchaudio.load(audio_url)
speech = speech.squeeze()
# or using librosa
# speech, sr = librosa.load(audio_file, sr=16000)
sr, speech.shape

# %%
# resample the audio from its original sampling rate to 16,000 Hz
resampler = torchaudio.transforms.Resample(sr, 16000)
speech = resampler(speech)
speech.shape

# %%
# preprocess the waveform into the model's expected input values
input_values = wav2vec2_processor(speech, return_tensors="pt", sampling_rate=16000)["input_values"].to(device)
input_values.shape

# %%
# perform inference
logits = wav2vec2_model(input_values)["logits"]
logits.shape
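
# %% [markdown]
# Optional sketch (not in the original flow): wrapping the forward pass in `torch.no_grad()` skips building the autograd graph, which lowers memory use during pure inference.

# %%
# same forward pass, but without gradient tracking
with torch.no_grad():
  logits = wav2vec2_model(input_values)["logits"]
logits.shape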

# %%
# use argmax to get the predicted IDs
predicted_ids = torch.argmax(logits, dim=-1)
predicted_ids.shape

# %%
# decode the IDs to text
transcription = wav2vec2_processor.decode(predicted_ids[0])
transcription.lower()

# %%
def load_audio(audio_path):
  """Load the audio file & convert to 16,000 sampling rate"""
  # load our wav file
  speech, sr = torchaudio.load(audio_path)
  resampler = torchaudio.transforms.Resample(sr, 16000)
  speech = resampler(speech)
  return speech.squeeze()
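
# %% [markdown]
# Note that `squeeze()` only drops the channel dimension for mono files. If you might pass stereo audio, a minimal variant (a hypothetical helper, not from the tutorial) is to average the channels down to mono before resampling:

# %%
def load_audio_mono(audio_path):
  """Load an audio file, downmix multi-channel audio to mono & resample to 16,000 Hz"""
  speech, sr = torchaudio.load(audio_path)
  # torchaudio returns a [channels, frames] tensor; average the channels to get mono
  speech = speech.mean(dim=0)
  resampler = torchaudio.transforms.Resample(sr, 16000)
  return resampler(speech)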

# %%
def get_transcription_wav2vec2(audio_path, model, processor):
  speech = load_audio(audio_path)
  input_features = processor(speech, return_tensors="pt", sampling_rate=16000)["input_values"].to(device)
  # perform inference
  logits = model(input_features)["logits"]
  # use argmax to get the predicted IDs
  predicted_ids = torch.argmax(logits, dim=-1)
  transcription = processor.batch_decode(predicted_ids)[0]
  return transcription.lower()

# %%
get_transcription_wav2vec2("http://www0.cs.ucl.ac.uk/teaching/GZ05/samples/lathe.wav", 
                           wav2vec2_model, 
                           wav2vec2_processor)

# %% [markdown]
# # Whisper Models

# %%
# whisper_model_name = "openai/whisper-tiny.en" # English-only, ~ 151 MB
# whisper_model_name = "openai/whisper-base.en" # English-only, ~ 290 MB
# whisper_model_name = "openai/whisper-small.en" # English-only, ~ 967 MB
# whisper_model_name = "openai/whisper-medium.en" # English-only, ~ 3.06 GB
# whisper_model_name = "openai/whisper-tiny" # multilingual, ~ 151 MB
# whisper_model_name = "openai/whisper-base" # multilingual, ~ 290 MB
# whisper_model_name = "openai/whisper-small" # multilingual, ~ 967 MB
whisper_model_name = "openai/whisper-medium" # multilingual, ~ 3.06 GB
# whisper_model_name = "openai/whisper-large-v2" # multilingual, ~ 6.17 GB

whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name).to(device)

# %%
input_features = whisper_processor(load_audio(audio_url), sampling_rate=16000, return_tensors="pt").input_features.to(device)

# %%
forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language="english", task="transcribe")

# %%
forced_decoder_ids
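
# %% [markdown]
# To inspect what these forced decoder IDs stand for (an optional check), you can map them back to token strings; they should correspond to Whisper's language/task control tokens such as `<|en|>` and `<|transcribe|>`.

# %%
# map the forced (position, token_id) pairs back to their token strings
whisper_processor.tokenizer.convert_ids_to_tokens([token_id for _, token_id in forced_decoder_ids])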

# %%
input_features.shape

# %%
predicted_ids = whisper_model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
predicted_ids.shape

# %%
transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
transcription

# %%
transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=False)
transcription

# %%
def get_transcription_whisper(audio_path, model, processor, language="english", skip_special_tokens=True):
  # load the audio and resample it to 16,000 Hz
  speech = load_audio(audio_path)
  input_features = processor(speech, return_tensors="pt", sampling_rate=16000).input_features.to(device)
  forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
  # print(forced_decoder_ids)
  predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=skip_special_tokens)[0]
  return transcription

# %%
arabic_transcription = get_transcription_whisper("https://datasets-server.huggingface.co/assets/arabic_speech_corpus/--/clean/train/0/audio/audio.wav",
                          whisper_model,
                          whisper_processor,
                          language="arabic",
                          skip_special_tokens=True)
arabic_transcription

# %%
spanish_transcription = get_transcription_whisper("https://www.lightbulblanguages.co.uk/resources/sp-audio/cual-es-la-fecha-cumple.mp3",
                          whisper_model,
                          whisper_processor,
                          language="spanish",
                          skip_special_tokens=True)
spanish_transcription

# %%
from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE 
# supported languages
TO_LANGUAGE_CODE 
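
# %% [markdown]
# As a quick illustration of the mapping, you can count the supported languages and look up the codes for the ones used above:

# %%
len(TO_LANGUAGE_CODE), TO_LANGUAGE_CODE["arabic"], TO_LANGUAGE_CODE["spanish"]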

# %% [markdown]
# # Transcribe your Voice

# %%
!git clone -q --depth 1 https://github.com/snakers4/silero-models

%cd silero-models

# %%
from IPython.display import Audio, display, clear_output
from colab_utils import record_audio
import ipywidgets as widgets
from scipy.io import wavfile
import numpy as np


record_seconds = 20 #@param {type:"number", min:1, max:10, step:1}
sample_rate = 16000

def _record_audio(b):
  clear_output()
  audio = record_audio(record_seconds)
  display(Audio(audio, rate=sample_rate, autoplay=True))
  wavfile.write('recorded.wav', sample_rate, (32767*audio).numpy().astype(np.int16))

button = widgets.Button(description="Record Speech")
button.on_click(_record_audio)
display(button)

# %%
print("Whisper:", get_transcription_whisper("recorded.wav", whisper_model, whisper_processor))
print("Wav2vec2:", get_transcription_wav2vec2("recorded.wav", wav2vec2_model, wav2vec2_processor))

# %% [markdown]
# # Transcribing Long Audio Samples

# %%
def get_long_transcription_whisper(audio_path, pipe, return_timestamps=True, 
                                   chunk_length_s=10, stride_length_s=2):
    """Get the transcription of a long audio file using the Whisper model"""
    return pipe(load_audio(audio_path).numpy(), return_timestamps=return_timestamps,
                  chunk_length_s=chunk_length_s, stride_length_s=stride_length_s)

# %%
# initialize the pipeline
pipe = pipeline("automatic-speech-recognition", 
                model=whisper_model_name, device=device)

# %%
# get the transcription of a sample long audio file
output = get_long_transcription_whisper(
    "https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0060_8k.wav", 
    pipe, chunk_length_s=10, stride_length_s=1)

# %%
output["text"]

# %%
for chunk in output["chunks"]:
  # print the timestamp and the text
  print(chunk["timestamp"], ":", chunk["text"])
