What I want to do is record video in the browser in real time with WebRTC. The use case is essentially live captioning, like Google Hangouts does.
So I have a WebRTC app running in the browser. It sends webm objects back to the server, and the audio in them is linear32 encoded. Google Speech-to-Text only accepts LINEAR16 or FLAC.
Is there a way to convert linear32 to linear16 in real time?
Otherwise, has anyone managed to hook WebRTC up to Google Speech and get live transcription working?
Any advice on where to attack this problem would be great.
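In case it helps frame the question: as I understand it, "linear32" is just 32-bit float PCM, so the conversion itself amounts to scaling each float sample in [-1, 1] to a signed 16-bit integer, roughly like this (a rough sketch in plain JavaScript; it assumes the float samples have already been extracted from the webm container, which is the part I'm unsure about):

// Convert 32-bit float PCM samples (range -1..1) to 16-bit signed PCM (LINEAR16).
// `float32Samples` is assumed to be a Float32Array of already-decoded audio.
function floatTo16BitPCM(float32Samples) {
  const int16 = new Int16Array(float32Samples.length);
  for (let i = 0; i < float32Samples.length; i++) {
    // Clamp to [-1, 1], then scale to the signed 16-bit range
    const s = Math.max(-1, Math.min(1, float32Samples[i]));
    int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
  }
  return int16;
}

The real question is how to do this (plus the container demuxing) in real time as the chunks arrive.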
Check out this repository, it might help you: https://github.com/muaz-khan/Translator
Translator.js is a JavaScript library built on top of the Google Speech Recognition & Translation APIs to transcribe and translate voice and text. It supports many locales and brings globalization to WebRTC!
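If you want to see what that boils down to without pulling in the whole library, here is a minimal sketch of the browser speech recognition API that libraries like this wrap (Chrome exposes it as webkitSpeechRecognition; this is a generic illustration, not Translator.js's own API):

// Live captions using the browser's built-in speech recognition (Chrome/Edge).
const SpeechRecognition =
  window.SpeechRecognition || window.webkitSpeechRecognition;
const recognition = new SpeechRecognition();
recognition.continuous = true;      // keep listening instead of stopping after one phrase
recognition.interimResults = true;  // emit partial transcripts for live captions
recognition.lang = "en-US";
recognition.onresult = (event) => {
  const result = event.results[event.results.length - 1];
  console.log(result[0].transcript, result.isFinal ? "(final)" : "(interim)");
};
recognition.start();

Roughly speaking, Translator.js layers the translation step and locale handling on top of this.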
I ran into the same problem and had no luck with WebRTC. If you are only interested in transcribing the audio from the video, I suggest using the Web Audio API instead.
Here is how I implemented it with a Node.js server and a React client app. It is uploaded to GitHub here.
- You need an audio worklet script. (Put it in the public folder, because that is where the API expects to find it.)
recorderWorkletProcessor.js (saved at public/src/worklets/recorderWorkletProcessor.js)
/**
 * An in-place replacement for ScriptProcessorNode using AudioWorklet
 */
class RecorderProcessor extends AudioWorkletProcessor {
  // 0. Determine the buffer size (this is the same as the 1st argument of ScriptProcessor)
  bufferSize = 2048;
  // 1. Track the current buffer fill level
  _bytesWritten = 0;
  // 2. Create a buffer of fixed size
  _buffer = new Float32Array(this.bufferSize);

  constructor() {
    super();
    this.initBuffer();
  }

  initBuffer() {
    this._bytesWritten = 0;
  }

  isBufferEmpty() {
    return this._bytesWritten === 0;
  }

  isBufferFull() {
    return this._bytesWritten === this.bufferSize;
  }

  /**
   * @param {Float32Array[][]} inputs
   * @returns {boolean}
   */
  process(inputs) {
    // Grab the 1st channel, similar to ScriptProcessorNode
    this.append(inputs[0][0]);
    return true;
  }

  /**
   * @param {Float32Array} channelData
   */
  append(channelData) {
    if (this.isBufferFull()) {
      this.flush();
    }
    if (!channelData) return;
    for (let i = 0; i < channelData.length; i++) {
      this._buffer[this._bytesWritten++] = channelData[i];
    }
  }

  flush() {
    // Trim the buffer if it ended prematurely
    const buffer =
      this._bytesWritten < this.bufferSize
        ? this._buffer.slice(0, this._bytesWritten)
        : this._buffer;
    // `sampleRate` is a global in AudioWorkletGlobalScope, so this uses the
    // actual context rate instead of assuming 44.1 kHz
    const result = this.downsampleBuffer(buffer, sampleRate, 16000);
    this.port.postMessage(result);
    this.initBuffer();
  }

  // Downsample the Float32 buffer to 16 kHz and convert it to 16-bit PCM (LINEAR16)
  downsampleBuffer(buffer, sampleRate, outSampleRate) {
    if (outSampleRate === sampleRate) {
      return buffer;
    }
    if (outSampleRate > sampleRate) {
      throw new Error("downsampling rate should be smaller than original sample rate");
    }
    const sampleRateRatio = sampleRate / outSampleRate;
    const newLength = Math.round(buffer.length / sampleRateRatio);
    const result = new Int16Array(newLength);
    let offsetResult = 0;
    let offsetBuffer = 0;
    while (offsetResult < result.length) {
      const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
      // Average the source samples that fall into this output slot
      let accum = 0,
        count = 0;
      for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
        accum += buffer[i];
        count++;
      }
      result[offsetResult] = Math.min(1, accum / count) * 0x7fff;
      offsetResult++;
      offsetBuffer = nextOffsetBuffer;
    }
    return result.buffer;
  }
}

registerProcessor("recorder.worklet", RecorderProcessor);
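Note that the worklet posts 16 kHz, 16-bit PCM chunks over its MessagePort. That matches the LINEAR16 / sampleRateHertz: 16000 streaming config at the bottom of server.js below, so the server can write each chunk straight into the Google recognize stream without any further conversion.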
Install socket.io-client on the frontend:
npm i socket.io-client
React component code:
/* eslint-disable react-hooks/exhaustive-deps */
import { default as React, useEffect, useState, useRef } from "react";
import { Button } from "react-bootstrap";
import Container from "react-bootstrap/Container";
import * as io from "socket.io-client";
const sampleRate = 16000;
const getMediaStream = () =>
  navigator.mediaDevices.getUserMedia({
    audio: {
      deviceId: "default",
      sampleRate: sampleRate,
      sampleSize: 16,
      channelCount: 1,
    },
    video: false,
  });

interface WordRecognized {
  final: boolean;
  text: string;
}
const AudioToText: React.FC = () => {
  const [connection, setConnection] = useState<io.Socket>();
  const [currentRecognition, setCurrentRecognition] = useState<string>();
  const [recognitionHistory, setRecognitionHistory] = useState<string[]>([]);
  const [isRecording, setIsRecording] = useState<boolean>(false);
  const [recorder, setRecorder] = useState<any>();
  const processorRef = useRef<any>();
  const audioContextRef = useRef<any>();
  const audioInputRef = useRef<any>();

  const speechRecognized = (data: WordRecognized) => {
    if (data.final) {
      setCurrentRecognition("...");
      setRecognitionHistory((old) => [data.text, ...old]);
    } else setCurrentRecognition(data.text + "...");
  };

  const connect = () => {
    connection?.disconnect();
    const socket = io.connect("http://localhost:8081");
    socket.on("connect", () => {
      console.log("connected", socket.id);
      setConnection(socket);
    });

    socket.emit("send_message", "hello world");
    socket.emit("startGoogleCloudStream");

    socket.on("receive_message", (data) => {
      console.log("received message", data);
    });

    socket.on("receive_audio_text", (data) => {
      speechRecognized(data);
      console.log("received audio text", data);
    });

    socket.on("disconnect", () => {
      console.log("disconnected", socket.id);
    });
  };

  const disconnect = () => {
    if (!connection) return;
    connection?.emit("endGoogleCloudStream");
    connection?.disconnect();
    processorRef.current?.disconnect();
    audioInputRef.current?.disconnect();
    audioContextRef.current?.close();
    setConnection(undefined);
    setRecorder(undefined);
    setIsRecording(false);
  };
  useEffect(() => {
    (async () => {
      if (connection) {
        if (isRecording) {
          return;
        }
        const stream = await getMediaStream();
        audioContextRef.current = new window.AudioContext();

        // The worklet file lives in the public folder, so it is served from /src/worklets/...
        await audioContextRef.current.audioWorklet.addModule(
          "/src/worklets/recorderWorkletProcessor.js"
        );
        audioContextRef.current.resume();

        audioInputRef.current =
          audioContextRef.current.createMediaStreamSource(stream);
        processorRef.current = new AudioWorkletNode(
          audioContextRef.current,
          "recorder.worklet"
        );
        processorRef.current.connect(audioContextRef.current.destination);
        audioInputRef.current.connect(processorRef.current);

        // Forward each downsampled LINEAR16 chunk from the worklet to the server
        processorRef.current.port.onmessage = (event: any) => {
          const audioData = event.data;
          connection.emit("send_audio_data", { audio: audioData });
        };
        setIsRecording(true);
      } else {
        console.error("No connection");
      }
    })();
    return () => {
      if (isRecording) {
        processorRef.current?.disconnect();
        audioInputRef.current?.disconnect();
        if (audioContextRef.current?.state !== "closed") {
          audioContextRef.current?.close();
        }
      }
    };
  }, [connection, isRecording, recorder]);
  return (
    <React.Fragment>
      <Container className="py-5 text-center">
        <Container fluid className="py-5 bg-primary text-light text-center">
          <Container>
            <Button
              className={isRecording ? "btn-danger" : "btn-outline-light"}
              onClick={connect}
              disabled={isRecording}
            >
              Start
            </Button>
            <Button
              className="btn-outline-light"
              onClick={disconnect}
              disabled={!isRecording}
            >
              Stop
            </Button>
          </Container>
        </Container>
        <Container className="py-5 text-center">
          {recognitionHistory.map((tx, idx) => (
            <p key={idx}>{tx}</p>
          ))}
          <p>{currentRecognition}</p>
        </Container>
      </Container>
    </React.Fragment>
  );
};

export default AudioToText;
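For the server below you also need the packages it requires (this just mirrors the require calls in server.js):
npm i express socket.io @google-cloud/speech cors morgan body-parser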
server.js
const express = require("express");
const speech = require("@google-cloud/speech");
//use logger
const logger = require("morgan");
//use body parser
const bodyParser = require("body-parser");
//use corrs
const cors = require("cors");
const http = require("http");
const { Server } = require("socket.io");
const app = express();
app.use(cors());
app.use(logger("dev"));
app.use(bodyParser.json());
const server = http.createServer(app);
const io = new Server(server, {
  cors: {
    origin: "http://localhost:3000",
    methods: ["GET", "POST"],
  },
});
//TODO: run in terminal first to setup credentials export GOOGLE_APPLICATION_CREDENTIALS="./speech-to-text-key.json"
const speechClient = new speech.SpeechClient();
io.on("connection", (socket) => {
  let recognizeStream = null;
  console.log("** a user connected - " + socket.id + " **\n");

  socket.on("disconnect", () => {
    console.log("** user disconnected **\n");
  });

  socket.on("send_message", (message) => {
    console.log("message: " + message);
    setTimeout(() => {
      io.emit("receive_message", "got this message: " + message);
    }, 1000);
  });

  socket.on("startGoogleCloudStream", function (data) {
    startRecognitionStream(this, data);
  });

  socket.on("endGoogleCloudStream", function () {
    console.log("** ending google cloud stream **\n");
    stopRecognitionStream();
  });

  socket.on("send_audio_data", async (audioData) => {
    io.emit("receive_message", "Got audio data");
    if (recognizeStream !== null) {
      try {
        recognizeStream.write(audioData.audio);
      } catch (err) {
        console.log("Error calling google api " + err);
      }
    } else {
      console.log("RecognizeStream is null");
    }
  });

  function startRecognitionStream(client) {
    console.log("* StartRecognitionStream\n");
    try {
      recognizeStream = speechClient
        .streamingRecognize(request)
        .on("error", console.error)
        .on("data", (data) => {
          const result = data.results[0];
          const isFinal = result.isFinal;
          const transcription = data.results
            .map((result) => result.alternatives[0].transcript)
            .join("\n");
          console.log(`Transcription: `, transcription);

          client.emit("receive_audio_text", {
            text: transcription,
            final: isFinal,
          });
        });
    } catch (err) {
      console.error("Error streaming google api " + err);
    }
  }

  function stopRecognitionStream() {
    if (recognizeStream) {
      console.log("* StopRecognitionStream\n");
      recognizeStream.end();
    }
    recognizeStream = null;
  }
});

server.listen(8081, () => {
  console.log("WebSocket server listening on port 8081.");
});
// =========================== GOOGLE CLOUD SETTINGS ================================ //
// The encoding of the audio file, e.g. 'LINEAR16'
// The sample rate of the audio file in hertz, e.g. 16000
// The BCP-47 language code to use, e.g. 'en-US'
const encoding = "LINEAR16";
const sampleRateHertz = 16000;
const languageCode = "en-US"; //en-US
const alternativeLanguageCodes = ["en-US", "ko-KR"];
const request = {
  config: {
    encoding: encoding,
    sampleRateHertz: sampleRateHertz,
    languageCode: languageCode,
    //alternativeLanguageCodes: alternativeLanguageCodes,
    enableWordTimeOffsets: true,
    enableAutomaticPunctuation: true,
    enableWordConfidence: true,
    enableSpeakerDiarization: true,
    diarizationSpeakerCount: 2,
    model: "video",
    //model: "command_and_search",
    useEnhanced: true,
    speechContexts: [
      {
        phrases: ["hello", "안녕하세요"],
      },
    ],
  },
  interimResults: true,
};
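To try it end to end: set the credentials as in the TODO comment above (export GOOGLE_APPLICATION_CREDENTIALS="./speech-to-text-key.json"), start the server with node server.js (it listens on port 8081), and run the React app on http://localhost:3000 so it matches the CORS origin configured on the socket.io server.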