Speech-to-Text - Text-to-Speech
This audio engine listens to the microphone input and transcribes the audio to text using the Whisper extension's Speech-to-Text (STT) node. Transcription is triggered when the Silero voice activity detection (VAD) node emits its `end` event. The transcribed text is then sent to a Text-to-Speech (TTS) node from the Sherpa extension. As a result, the engine echoes back the user's speech in real time.
- JSON
{
"type": "RealTimeGraphRenderer",
"config": {
"microphoneEnabled": true,
"graph": {
"config": {
"sampleRate": 16000,
"bufferSize": 512
},
"nodes": [
{
"id": "multiChannelToMonoNode",
"type": "MultiChannelToMono"
},
{
"id": "busSplitterNode",
"type": "BusSplitter"
},
{
"id": "vadNode",
"type": "SileroVAD.SileroVAD"
},
{
"id": "sttNode",
"type": "Whisper.WhisperSTT",
"config": {
"initializeModel": true,
"useGPU": true
}
},
{
"id": "ttsNode",
"type": "Sherpa.SherpaTTSNode"
},
{
"id": "monoToMultiChannelNode",
"type": "MonoToMultiChannel"
}
],
"connections": [
{
"sourceNode": "inputNode",
"destinationNode": "multiChannelToMonoNode"
},
{
"sourceNode": "multiChannelToMonoNode",
"destinationNode": "busSplitterNode"
},
{
"sourceNode": "busSplitterNode",
"destinationNode": "vadNode"
},
{
"sourceNode": "busSplitterNode",
"destinationNode": "sttNode"
},
{
"sourceNode": "ttsNode",
"destinationNode": "monoToMultiChannelNode"
},
{
"sourceNode": "monoToMultiChannelNode",
"destinationNode": "outputNode"
},
{
"sourceNode": "vadNode.end",
"destinationNode": "sttNode.transcribe"
},
{
"sourceNode": "sttNode.transcription",
"destinationNode": "ttsNode.synthesize"
}
]
}
}
}