WasapiLoopbackCapture internal audio recognition produces gibberish text even when no audio is playing



I finally built a program that uses NAudio to listen to the internal audio loopback and output the recognized text. The problem is that while it is listening it constantly prints things like:

Recognized text: had
Recognized text: had
Recognized text: had
Recognized text: had
Recognized text: had had phone Le K add phone Laton
Recognized text: had phone looked had phone looked had phone looked had phone looked zone
Recognized text: had phone lines to had, had phone looked had phone looked had phone line had phone
Recognized text: had phone line had phone looked had phone
Recognized text: had phone looked had phone looked had phone line had phone
Recognized text: had phone looked had phone look to had pot they had phone lit only had phone
Recognized text: had phone line had phone looked had phone line to had to had phone
Recognized text: had phone line had phone looked had phone looked had phone
Recognized text: had phone line had phone looked had phone looked had phone line 10 only T had phone
Recognized text: had phone line had
Recognized text: had phone line had phone looked had phone line had
Recognized text: had phone Le tone looked had
Recognized text: had phone looked had phone looked had phone
Recognized text: had phone line had phone line had phone licked had phone
Recognized text: had phone lines to had popped the own

and similar nonsense, but even when I pause the audio it just prints "Recognized text: had" or "an" again and again. When audio is playing it never correctly recognizes the internal audio. Is there a way to fix this, or at least to dump a WAV of whatever it is trying to send to the Microsoft speech recognizer?

using System;
using System.Speech.Recognition;
using NAudio.Wave;
using NAudio.CoreAudioApi.Interfaces;
using NAudio.CoreAudioApi;
using System.IO;
using System.Speech.AudioFormat;
using NAudio.Wave.SampleProviders;
using NAudio.Utils;
using System.Threading;
using System.Collections.Generic;

namespace SpeechRecognitionApp
{
    class SpeechStreamer : Stream
    {
        private AutoResetEvent _writeEvent;
        private List<byte> _buffer;
        private int _buffersize;
        private int _readposition;
        private int _writeposition;
        private bool _reset;

        public SpeechStreamer(int bufferSize)
        {
            _writeEvent = new AutoResetEvent(false);
            _buffersize = bufferSize;
            _buffer = new List<byte>(_buffersize);
            for (int i = 0; i < _buffersize; i++)
                _buffer.Add(new byte());
            _readposition = 0;
            _writeposition = 0;
        }

        public override bool CanRead
        {
            get { return true; }
        }

        public override bool CanSeek
        {
            get { return false; }
        }

        public override bool CanWrite
        {
            get { return true; }
        }

        public override long Length
        {
            get { return -1L; }
        }

        public override long Position
        {
            get { return 0L; }
            set { }
        }

        public override long Seek(long offset, SeekOrigin origin)
        {
            return 0L;
        }

        public override void SetLength(long value)
        {
        }

        public override int Read(byte[] buffer, int offset, int count)
        {
            int i = 0;
            while (i < count && _writeEvent != null)
            {
                if (!_reset && _readposition >= _writeposition)
                {
                    _writeEvent.WaitOne(100, true);
                    continue;
                }
                buffer[i] = _buffer[_readposition + offset];
                _readposition++;
                if (_readposition == _buffersize)
                {
                    _readposition = 0;
                    _reset = false;
                }
                i++;
            }
            return count;
        }

        public override void Write(byte[] buffer, int offset, int count)
        {
            for (int i = offset; i < offset + count; i++)
            {
                _buffer[_writeposition] = buffer[i];
                _writeposition++;
                if (_writeposition == _buffersize)
                {
                    _writeposition = 0;
                    _reset = true;
                }
            }
            _writeEvent.Set();
        }

        public override void Close()
        {
            _writeEvent.Close();
            _writeEvent = null;
            base.Close();
        }

        public override void Flush()
        {
        }
    }

    class FakeStreamer : Stream
    {
        public bool bExit = false;
        Stream stream;
        Stream client;

        public FakeStreamer(Stream client)
        {
            this.client = client;
            this.stream = client;
        }

        public override bool CanRead
        {
            get { return stream.CanRead; }
        }

        public override bool CanSeek
        {
            get { return false; }
        }

        public override bool CanWrite
        {
            get { return stream.CanWrite; }
        }

        public override long Length
        {
            get { return -1L; }
        }

        public override long Position
        {
            get { return 0L; }
            set { }
        }

        public override long Seek(long offset, SeekOrigin origin)
        {
            return 0L;
        }

        public override void SetLength(long value)
        {
            stream.SetLength(value);
        }

        public override int Read(byte[] buffer, int offset, int count)
        {
            int len = 0, c = count;
            while (c > 0 && !bExit)
            {
                //try {
                len = stream.Read(buffer, offset, c);
                /*}
                catch (Exception e)
                {
                    Console.WriteLine("ouch");
                }
                if (!client.Connected || len == 0)
                {
                    //Exit read loop
                    return 0;
                }*/
                offset += len;
                c -= len;
            }
            return count;
        }

        public override void Write(byte[] buffer, int offset, int count)
        {
            stream.Write(buffer, offset, count);
        }

        public override void Close()
        {
            stream.Close();
            base.Close();
        }

        public override void Flush()
        {
            stream.Flush();
        }
    }

    class Program
    {
        static void Main(string[] args)
        {
            // Create an in-process speech recognizer for the en-US locale.
            using (SpeechRecognitionEngine recognizer =
                new SpeechRecognitionEngine(
                    new System.Globalization.CultureInfo("en-US")))
            {
                // Create and load a dictation grammar.
                recognizer.LoadGrammar(new DictationGrammar());

                // Add a handler for the speech recognized event.
                recognizer.SpeechRecognized +=
                    new EventHandler<SpeechRecognizedEventArgs>(recognizer_SpeechRecognized);

                // Configure input to the speech recognizer.
                //recognizer.SetInputToDefaultAudioDevice();
                WasapiLoopbackCapture capture = new WasapiLoopbackCapture();
                BufferedWaveProvider WaveBuffer = new BufferedWaveProvider(capture.WaveFormat);
                WaveBuffer.DiscardOnBufferOverflow = true;
                //WaveBuffer.ReadFully = false;
                WaveToSampleProvider sampleStream = new WaveToSampleProvider(WaveBuffer);
                StereoToMonoSampleProvider monoStream = new StereoToMonoSampleProvider(sampleStream)
                {
                    LeftVolume = 1f,
                    RightVolume = 1f
                };
                //Downsample to 8000 https://stackoverflow.com/questions/48233099/capture-audio-from-wasapiloopbackcapture-and-convert-to-mulaw
                WdlResamplingSampleProvider resamplingProvider = new WdlResamplingSampleProvider(monoStream, 16000);
                SampleToWaveProvider16 ieeeToPcm = new SampleToWaveProvider16(resamplingProvider);
                var arr = new byte[128];
                Stream captureConvertStream = new System.IO.MemoryStream();
                capture.StartRecording();
                //outputStream = new MuLawConversionProvider(ieeeToPcm);
                Stream captureStream = new System.IO.MemoryStream();
                //Stream buffStream = new FakeStreamer(captureStream);
                capture.DataAvailable += (s, a) =>
                {
                    //It is getting here.
                    //captureStream.Write(a.Buffer, 0, a.BytesRecorded);
                    WaveBuffer.AddSamples(a.Buffer, 0, a.BytesRecorded);
                };
                Console.WriteLine(capture.WaveFormat.AverageBytesPerSecond);
                Console.WriteLine(capture.WaveFormat.BitsPerSample);
                //var newFormat = new WaveFormat(8000, 16, 1);
                //using (var conversionStream = new WaveFormatConversionStream(newFormat, capture)
                //capture.StartRecording();
                //using (var resampler = new MediaFoundationResampler(new NAudio.Wave.RawSourceWaveStream(captureStream, capture.WaveFormat), newFormat))
                //{
                //resampler.ResamplerQuality = 60;
                //WaveFileWriter.WriteWavFileToStream(captureConvertStream, resampler);
                //recognizer.SetInputToDefaultAudioDevice();
                //Stream buffStream = new FakeStreamer(captureConvertStream);
                Stream buffStream = new SpeechStreamer(2048);
                //recognizer.SetInputToWaveStream(buffStream);
                recognizer.SetInputToAudioStream(buffStream, new SpeechAudioFormatInfo(
                    16000, AudioBitsPerSample.Eight, AudioChannel.Mono));

                // Start asynchronous, continuous speech recognition.
                recognizer.RecognizeAsync(RecognizeMode.Multiple);

                /*System.Threading.Thread.Sleep(5000);
                works when playing anything
                var floata = new float[128];
                while(monoStream.Read(floata, 0, floata.Length) > 0 )
                {
                    Console.WriteLine(arr.Length);
                }*/
                while (ieeeToPcm.Read(arr, 0, arr.Length) > 0)
                {
                    //Console.Write("Writing PCM ");
                    //Console.WriteLine(arr.Length);
                    //captureConvertStream.Write(arr, 0, arr.Length);
                    buffStream.Write(arr, 0, arr.Length);
                }
                Console.WriteLine("end");

                /*capture.StartRecording();
                //Never getting to the resampler, the read is always zero!? even if waiting 5s for the audio to buffer.
                System.Threading.Thread.Sleep(5000);
                var arr = new byte[128];
                while (resampler.Read(arr, 0, arr.Length) > 0)
                {
                    captureConvertStream.Write(arr, 0, arr.Length);
                    Console.WriteLine("Never getting here");
                }
                // Keep the console window open.
                while (true)
                {
                    Console.ReadLine();
                }*/
                //}
            }
        }

        // Handle the SpeechRecognized event.
        static void recognizer_SpeechRecognized(object sender, SpeechRecognizedEventArgs e)
        {
            Console.WriteLine("Recognized text: " + e.Result.Text);
        }
    }
}

The SpeechStreamer class has some issues, and I really can't see what its purpose is supposed to be. I tried it. Also, looking at a wave-file dump from your implementation, the audio is really choppy, with long pauses between the samples. That is probably what is tripping up the speech recognizer. Here is an example: Windows Volume Adjustment Sound From Your Code

As you can probably hear, it is quite choppy, with a lot of silence in between. The speech recognition part picks it up as: "ta ta ta ta…".
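One thing worth checking, though this is my guess rather than something the dump proves: BufferedWaveProvider has ReadFully = true by default, so once its buffer runs dry every Read is padded with zeroes instead of returning 0. The tight read loop in Main then interleaves the real capture blocks with generated silence, which sounds exactly like the choppiness above. A minimal sketch of a pump loop that forwards only bytes that were actually captured (reusing capture, WaveBuffer, ieeeToPcm and buffStream from the question):

WaveBuffer.ReadFully = false;          // return 0 when empty instead of padding with silence

bool stopped = false;
capture.RecordingStopped += (s, a) => stopped = true;

var pump = new byte[4096];
while (!stopped)
{
    int n = ieeeToPcm.Read(pump, 0, pump.Length);
    if (n > 0)
        buffStream.Write(pump, 0, n);  // forward only real audio
    else
        Thread.Sleep(10);              // wait for the next DataAvailable instead of spinning
}

This assumes the intermediate sample providers propagate short reads, which is worth verifying against your NAudio version.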

I had to rewrite your code a bit to dump a wave file, because SpeechStreamer's Read method causes an eternal loop when it is used to read the stream's contents.
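My rewrite was along these lines; this is a reconstruction, not the one true fix. Read reports how many bytes it actually copied and gives up when the writer stays silent, so a loop that drains the stream can terminate:

public override int Read(byte[] buffer, int offset, int count)
{
    int i = 0;
    while (i < count && _writeEvent != null)
    {
        if (!_reset && _readposition >= _writeposition)
        {
            // No new data within the timeout: stop instead of looping forever.
            if (!_writeEvent.WaitOne(100))
                break;
            continue;
        }
        // The offset belongs on the destination index, not on _readposition.
        buffer[offset + i] = _buffer[_readposition];
        _readposition++;
        if (_readposition == _buffersize)
        {
            _readposition = 0;
            _reset = false;
        }
        i++;
    }
    return i; // report what was actually read
}

Note that a Read returning 0 also signals end-of-stream to the recognizer, so this variant suits dumping and debugging more than continuous recognition.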

To dump a wave file, you can do the following:

var buffer = new byte[2048];
using (var writer = new WaveFileWriter("tmp.wav", ieeeToPcm.WaveFormat))
{
    //buffStream is changed to a MemoryStream for this to work.
    buffStream.Seek(0, SeekOrigin.Begin);
    while (buffStream.Read(buffer, 0, buffer.Length) > 0)
    {
        writer.Write(buffer, 0, buffer.Length);
    }
}

Or you can do it while reading from the SampleToWaveProvider16:

var writer = new WaveFileWriter("dump.wav", ieeeToPcm.WaveFormat);
while (ieeeToPcm.Read(arr, 0, arr.Length) > 0)
{
    if (Console.KeyAvailable && Console.ReadKey().Key == ConsoleKey.Escape)
        break;
    buffStream.Write(arr, 0, arr.Length);
    writer.Write(arr, 0, arr.Length);
}

I just added the option of hitting Escape to exit the loop.
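One more thing I noticed while dumping: you declare the stream to the recognizer as 8-bit, but SampleToWaveProvider16 emits 16-bit samples. That mismatch alone is enough to turn the input into noise, so the declared format has to match what you actually write:

// The stream carries 16-bit mono PCM at 16 kHz, so declare it as such.
recognizer.SetInputToAudioStream(buffStream, new SpeechAudioFormatInfo(
    16000, AudioBitsPerSample.Sixteen, AudioChannel.Mono));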

Now, I am wondering why you are using NAudio at all. Why not use the native methods of the System.Speech API?

using System;
using System.Globalization;
using System.Speech.Recognition;
using System.Threading;

class Program
{
    private static ManualResetEvent _done;

    static void Main(string[] args)
    {
        _done = new ManualResetEvent(false);
        using (SpeechRecognitionEngine recognizer = new SpeechRecognitionEngine(new CultureInfo("en-US")))
        {
            recognizer.LoadGrammar(new DictationGrammar());
            recognizer.SpeechRecognized += RecognizedSpeech;
            recognizer.SetInputToDefaultAudioDevice();
            recognizer.RecognizeAsync(RecognizeMode.Multiple);
            _done.WaitOne();
        }
    }

    private static void RecognizedSpeech(object sender, SpeechRecognizedEventArgs e)
    {
        if (e.Result.Text.Contains("exit"))
        {
            _done.Set();
        }
        Console.WriteLine(e.Result.Text);
    }
}
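Note that SetInputToDefaultAudioDevice() listens to the default recording device, so to recognize internal audio this way you would first have to route playback into a recording device, for example by enabling "Stereo Mix" where the sound driver offers it.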
