ESPnet 2-Pass SLU Demonstration

This notebook demonstrates the two-pass end-to-end spoken language understanding (SLU) model: a first pass predicts an intent token and an ASR hypothesis directly from speech, and a second pass combines the speech with the first-pass hypothesis to refine the prediction.

Paper Link: https://arxiv.org/abs/2207.06670

ESPnet2-SLU: https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/slu1

Author: Siddhant Arora

[ ]:
# Install dependencies: transformers, ESPnet (from source), the ESPnet model zoo,
# and a pinned fairseq commit.
!python -m pip install transformers
!git clone https://github.com/espnet/espnet /espnet
!pip install /espnet
%pip install -q espnet_model_zoo
%pip install fairseq@git+https://github.com/pytorch/fairseq.git@f2146bdc7abf293186de9449bfa2272775e39e1d#egg=fairseq

Download Audio File

[ ]:
# Download a sample audio file from Google Drive.
# !gdown --id 1LxoxCoFgx3u8CvKb1loybGFtArKKPcAH -O /content/audio_file.wav
!gdown --id 18ANT62ittt7Ai2E8bQRlvT0ZVXXsf1eE -O /content/audio_file.wav
[ ]:
import soundfile
from IPython.display import display, Audio

# Load the waveform and play it back in the notebook.
mixwav_mc, sr = soundfile.read("/content/audio_file.wav")
display(Audio(mixwav_mc.T, rate=sr))

Download and Load the Pretrained First-Pass Model

[ ]:
# Download the pretrained first-pass SLURP model from Hugging Face (requires git-lfs).
!git lfs clone https://huggingface.co/espnet/siddhana_slurp_new_asr_train_asr_conformer_raw_en_word_valid.acc.ave_10best /content/slurp_first_pass_model
[ ]:
from espnet2.bin.asr_inference import Speech2Text

# Build the first-pass model, which maps speech directly to an intent token
# followed by the ASR transcript.
speech2text_slurp = Speech2Text.from_pretrained(
    asr_train_config="/content/slurp_first_pass_model/exp/asr_train_asr_conformer_raw_en_word/config.yaml",
    asr_model_file="/content/slurp_first_pass_model/exp/asr_train_asr_conformer_raw_en_word/valid.acc.ave_10best.pth",
    nbest=1,
)
[ ]:
# Run the first pass on the recording.
nbests_orig = speech2text_slurp(mixwav_mc)
text, *_ = nbests_orig[0]


def text_normalizer(sub_word_transcript):
    """Merge SentencePiece-style subwords ("▁" marks a word start) into plain text."""
    transcript = sub_word_transcript[0].replace("▁", "")
    for sub_word in sub_word_transcript[1:]:
        if "▁" in sub_word:
            transcript = transcript + " " + sub_word.replace("▁", "")
        else:
            transcript = transcript + sub_word
    return transcript


# The first token of the hypothesis encodes the intent as "scenario_action";
# the remaining tokens are the subword transcript.
scenario, _, action = text.split()[0].partition("_")
intent_text = f"{{scenario: {scenario}, action: {action}}}"
print(f"INTENT: {intent_text}")
transcript = text_normalizer(text.split()[1:])
print(f"ASR hypothesis: {transcript}")
print("First pass SLU model fails to predict the correct action.")

Download and Load the Pretrained Second-Pass Model

[ ]:
# Download the pretrained second-pass SLURP model from Hugging Face (requires git-lfs).
!git lfs clone https://huggingface.co/espnet/slurp_slu_2pass /content/slurp_second_pass_model
[ ]:
from espnet2.bin.slu_inference import Speech2Understand
from transformers import AutoModel, AutoTokenizer

# Build the second-pass (deliberation) model, which attends over both the speech
# and a BERT encoding of the first-pass transcript.
speech2text_second_pass_slurp = Speech2Understand.from_pretrained(
    slu_train_config="/content/slurp_second_pass_model/exp/slu_train_asr_bert_conformer_deliberation_raw_en_word/config.yaml",
    slu_model_file="/content/slurp_second_pass_model/exp/slu_train_asr_bert_conformer_deliberation_raw_en_word/valid.acc.ave_10best.pth",
    nbest=1,
)
[ ]:
from espnet2.tasks.slu import SLUTask

# Build the preprocessing function used at training time, so the first-pass
# transcript can be tokenized consistently for the second pass.
preprocess_fn = SLUTask.build_preprocess_fn(
    speech2text_second_pass_slurp.asr_train_args, False
)

[ ]:
import numpy as np

# Clean and tokenize the first-pass ASR hypothesis, then map the tokens to the
# integer IDs the second-pass model expects as its transcript input.
transcript = preprocess_fn.text_cleaner(transcript)
tokens = preprocess_fn.transcript_tokenizer.text2tokens(transcript)
text_ints = np.array(
    preprocess_fn.transcript_token_id_converter.tokens2ids(tokens), dtype=np.int64
)
[ ]:
import torch

# Second pass: condition on both the speech and the first-pass transcript token IDs.
nbests = speech2text_second_pass_slurp(mixwav_mc, torch.tensor(text_ints))
text1, *_ = nbests[0]
scenario, _, action = text1.split()[0].partition("_")
intent_text = f"{{scenario: {scenario}, action: {action}}}"
print(f"INTENT: {intent_text}")
transcript = text_normalizer(text1.split()[1:])
print(f"ASR hypothesis: {transcript}")
print("Second pass SLU model successfully recognizes the correct action.")