Speech-to-Text - Text-to-Speech

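This audio engine listens to the microphone input and transcribes the audio to text using the Whisper extension's Speech-to-Text (STT) node. The microphone signal is also fed to a voice activity detection (VAD) node from the SileroVAD extension, whose `end` event triggers the STT node's `transcribe` action. The transcribed text is then sent to a Text-to-Speech (TTS) node from the Sherpa extension. As a result, the engine echoes back the user's speech in real time.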

JSON

{
  "type": "RealTimeGraphRenderer",
  "config": {
    "microphoneEnabled": true,
    "graph": {
      "config": {
        "sampleRate": 16000,
        "bufferSize": 512
      },
      "nodes": [
        {
          "id": "multiChannelToMonoNode",
          "type": "MultiChannelToMono"
        },
        {
          "id": "busSplitterNode",
          "type": "BusSplitter"
        },
        {
          "id": "vadNode",
          "type": "SileroVAD.SileroVAD"
        },
        {
          "id": "sttNode",
          "type": "Whisper.WhisperSTT",
          "config": {
            "initializeModel": true,
            "useGPU": true
          }
        },
        {
          "id": "ttsNode",
          "type": "Sherpa.SherpaTTSNode"
        },
        {
          "id": "monoToMultiChannelNode",
          "type": "MonoToMultiChannel"
        }
      ],
      "connections": [
        {
          "sourceNode": "inputNode",
          "destinationNode": "multiChannelToMonoNode"
        },
        {
          "sourceNode": "multiChannelToMonoNode",
          "destinationNode": "busSplitterNode"
        },
        {
          "sourceNode": "busSplitterNode",
          "destinationNode": "vadNode"
        },
        {
          "sourceNode": "busSplitterNode",
          "destinationNode": "sttNode"
        },
        {
          "sourceNode": "ttsNode",
          "destinationNode": "monoToMultiChannelNode"
        },
        {
          "sourceNode": "monoToMultiChannelNode",
          "destinationNode": "outputNode"
        },
        {
          "sourceNode": "vadNode.end",
          "destinationNode": "sttNode.transcribe"
        },
        {
          "sourceNode": "sttNode.transcription",
          "destinationNode": "ttsNode.synthesize"
        }
      ]
    }
  }
}