I need to do speech recognition over a audio stream Alaw "on the fly", I
receive buffers of 2048 every 250 milliseconds (from a audio source -
telephony board). I made a prototype, see below but fails to behave as I
need, it moves right away into audiostate.stopped and I do not know how to
resume it .... documentation (for system.speech) seems very weak.
Main question is can I do continuous recognition over a stream of alaw
chunks arriving at 250 ms, is about a live person so it must be live?
If yes how can I do it?
Below is my test code:
class Program
{
static void Main(string[] args)
{
SRSAPI sr = new SRSAPI(new CultureInfo("en-US"), "MSASREnglish");
while (Console.ReadKey().KeyChar == 'q')
Thread.Sleep(300);
}
class SRSAPI
{
public SRSAPI(CultureInfo requiredCulture, string requiredId)
{
foreach (RecognizerInfo config in
SpeechRecognitionEngine.InstalledRecognizers())
{
Console.WriteLine("Configured engine: " + config.Id);
if (config.Culture.Equals(requiredCulture) && config.Id == requiredId)
{
theEngine = new SpeechRecognitionEngine(config);
break;
}
}
if (theEngine == null)
throw new ApplicationException("no engine");
//theEngine.SetInputToWaveFile("c:\\srtest.wav");
SpeechAudioFormatInfo alawformat = new
SpeechAudioFormatInfo(EncodingFormat.ALaw, 8000, 8, 1, 8000, 1, null);
WaveReader wr = new WaveReader();
wr.Open("c:\\srtest.wav");
byte [] buffer = wr.ReadAudio();
wr.Close();
MemoryStream ms = new MemoryStream();
theEngine.SetInputToAudioStream(ms, alawformat);
string[] values = { "one", "two", "three", "four", "five", "six", "seven",
"eight", "nine", "ten" };
Choices choices = new Choices(values);
GrammarBuilder grb = new GrammarBuilder(choices);
Grammar gr = new Grammar(grb);
theEngine.LoadGrammar(gr);
theEngine.RecognizeCompleted += new
EventHandler<RecognizeCompletedEventArgs>(theEngine_RecognizeCompleted);
theEngine.RecognizerUpdateReached += new
EventHandler<RecognizerUpdateReachedEventArgs>(theEngine_RecognizerUpdateReached);
theEngine.SpeechDetected += new
EventHandler<SpeechDetectedEventArgs>(theEngine_SpeechDetected);
theEngine.SpeechHypothesized += new
EventHandler<SpeechHypothesizedEventArgs>(theEngine_SpeechHypothesized);
theEngine.SpeechRecognitionRejected += new
EventHandler<SpeechRecognitionRejectedEventArgs>(theEngine_SpeechRecognitionRejected);
theEngine.SpeechRecognized += new
EventHandler<SpeechRecognizedEventArgs>(theEngine_SpeechRecognized);
theEngine.AudioStateChanged += new
EventHandler<AudioStateChangedEventArgs>(theEngine_AudioStateChanged);
theEngine.AudioSignalProblemOccurred += new
EventHandler<AudioSignalProblemOccurredEventArgs>(theEngine_AudioSignalProblemOccurred);
theEngine.BabbleTimeout = new TimeSpan( 0, 1, 0);
theEngine.InitialSilenceTimeout = new TimeSpan( 0, 1, 0);
//theEngine.EndSilenceTimeout = new TimeSpan( 0, 0, 0, 10);
//theEngine.EndSilenceTimeoutAmbiguous = new TimeSpan( 0, 1, 0);
for (int i = 0; i < buffer.Length; i += 2048)
{
Console.Write("buffer: " + i.ToString());
ms.Write(buffer, i, Math.Min(buffer.Length - i, 2048));
Console.WriteLine("\tbuffer: " + i.ToString());
if (!isrecognizing)
{
isrecognizing = true;
theEngine.RecognizeAsync(RecognizeMode.Multiple);
}
// Thread.Sleep(5);
}
waitobj.WaitOne();
}
void theEngine_AudioSignalProblemOccurred(object sender,
AudioSignalProblemOccurredEventArgs e)
{
Console.WriteLine("AudioSignalProblemOccurred: " +
e.AudioSignalProblem.ToString());
}
void theEngine_AudioStateChanged(object sender, AudioStateChangedEventArgs e)
{
Console.WriteLine("AudioStateChanged." + e.AudioState);
//switch (e.AudioState)
//{
// case AudioState.Stopped:
// ;
// break;
//}
}
void theEngine_SpeechRecognized(object sender, SpeechRecognizedEventArgs e)
{
Console.WriteLine("SpeechRecognized: " + e.Result.Text);
}
void theEngine_SpeechRecognitionRejected(object sender,
SpeechRecognitionRejectedEventArgs e)
{
Console.WriteLine("SpeechRecognitionRejected: " + e.Result.Words);
}
void theEngine_SpeechHypothesized(object sender, SpeechHypothesizedEventArgs
e)
{
Console.WriteLine("SpeechHypothesized: " + e.Result.Text);
}
void theEngine_SpeechDetected(object sender, SpeechDetectedEventArgs e)
{
Console.WriteLine("SpeechDetected: " + e.AudioPosition);
}
void theEngine_RecognizerUpdateReached(object sender,
RecognizerUpdateReachedEventArgs e)
{
Console.WriteLine("RecognizerUpdateReached.");
}
void theEngine_RecognizeCompleted(object sender, RecognizeCompletedEventArgs
e)
{
Console.WriteLine("RecognizeCompleted: " + e.Result.Text);
// waitobj.Set();
}
AutoResetEvent waitobj = new AutoResetEvent(false);
SpeechRecognitionEngine theEngine;
bool isrecognizing = false;
}
}