From 3306edc514a996a7c61986d4851c9e6cfa323fca Mon Sep 17 00:00:00 2001 From: ouwou <26526779+ouwou@users.noreply.github.com> Date: Mon, 25 Mar 2024 00:38:31 -0400 Subject: [PATCH] add jitter buffer for voice --- README.md | 8 ++-- src/audio/jitterbuffer.hpp | 82 ++++++++++++++++++++++++++++++++++++++ src/audio/manager.cpp | 22 +++++++--- src/audio/manager.hpp | 3 +- src/settings.cpp | 2 + src/settings.hpp | 2 + 6 files changed, 109 insertions(+), 10 deletions(-) create mode 100644 src/audio/jitterbuffer.hpp diff --git a/README.md b/README.md index cbbe341..a36d5c6 100644 --- a/README.md +++ b/README.md @@ -330,9 +330,11 @@ For example, memory_db would be set by adding `memory_db = true` under the line #### voice -| Setting | Type | Default | Description | -|---------|--------|------------------------------------|------------------------------------------------------------| -| `vad` | string | rnnoise if enabled, gate otherwise | Method used for voice activity detection. Changeable in UI | +| Setting | Type | Default | Description | +|--------------------------|--------|------------------------------------|---------------------------------------------------------------------------------| +| `vad` | string | rnnoise if enabled, gate otherwise | Method used for voice activity detection. Changeable in UI | +| `jitter_latency_desired` | int | 50 | Desired/Minimum latency for jitter buffer (in milliseconds) | +| `jitter_latency_maximum` | int | 200 | Maximum latency for jitter buffer before frames are discarded (in milliseconds) | #### windows diff --git a/src/audio/jitterbuffer.hpp b/src/audio/jitterbuffer.hpp new file mode 100644 index 0000000..3da3594 --- /dev/null +++ b/src/audio/jitterbuffer.hpp @@ -0,0 +1,82 @@ +#pragma once +#include +#include +#include + +// very simple non-RTP-based jitter buffer. does not handle out-of-order +template +class JitterBuffer { +public: + /* + * desired_latency: how many milliseconds before audio can be drawn from buffer + * maximum_latency: how many milliseconds before old audio starts to be discarded + */ + JitterBuffer(int desired_latency, int maximum_latency, int channels, int sample_rate) + : m_desired_latency(desired_latency) + , m_maximum_latency(maximum_latency) + , m_channels(channels) + , m_sample_rate(sample_rate) + , m_last_push(std::chrono::steady_clock::now()) { + } + + [[nodiscard]] size_t Available() const noexcept { + return m_samples.size(); + } + + bool PopSamples(SampleFormat *ptr, size_t amount) { + CheckBuffering(); + if (m_buffering || Available() < amount) return false; + std::copy(m_samples.begin(), m_samples.begin() + amount, ptr); + m_samples.erase(m_samples.begin(), m_samples.begin() + amount); + return true; + } + + void PushSamples(SampleFormat *ptr, size_t amount) { + m_samples.insert(m_samples.end(), ptr, ptr + amount); + m_last_push = std::chrono::steady_clock::now(); + const auto buffered = MillisBuffered(); + if (buffered > m_maximum_latency) { + const auto overflow_ms = MillisBuffered() - m_maximum_latency; + const auto overflow_samples = overflow_ms * m_channels * m_sample_rate / 1000; + m_samples.erase(m_samples.begin(), m_samples.begin() + overflow_samples); + } + } + +private: + [[nodiscard]] size_t MillisBuffered() const { + return m_samples.size() * 1000 / m_channels / m_sample_rate; + } + + void CheckBuffering() { + // if we arent buffering but the buffer is empty then we should be + if (m_samples.empty()) { + if (!m_buffering) { + m_buffering = true; + } + return; + } + + if (!m_buffering) return; + + // if we reached desired latency, we are sufficiently buffered + const auto millis_buffered = MillisBuffered(); + if (millis_buffered >= m_desired_latency) { + m_buffering = false; + } + // if we havent buffered to desired latency but max latency has elapsed, exit buffering so it doesnt get stuck + const auto now = std::chrono::steady_clock::now(); + const auto millis = std::chrono::duration_cast(now - m_last_push).count(); + if (millis >= m_maximum_latency) { + m_buffering = false; + } + } + + int m_desired_latency; + int m_maximum_latency; + int m_channels; + int m_sample_rate; + bool m_buffering = true; + std::chrono::time_point m_last_push; + + std::deque m_samples; +}; diff --git a/src/audio/manager.cpp b/src/audio/manager.cpp index eaac3bf..bb12f23 100644 --- a/src/audio/manager.cpp +++ b/src/audio/manager.cpp @@ -25,6 +25,7 @@ const uint8_t *StripRTPExtensionHeader(const uint8_t *buf, int num_bytes, size_t return buf; } +// frameCount is configured to be 480 samples per channel void data_callback(ma_device *pDevice, void *pOutput, const void *pInput, ma_uint32 frameCount) { AudioManager *mgr = reinterpret_cast(pDevice->pUserData); if (mgr == nullptr) return; @@ -36,12 +37,14 @@ void data_callback(ma_device *pDevice, void *pOutput, const void *pInput, ma_uin if (const auto vol_it = mgr->m_volume_ssrc.find(ssrc); vol_it != mgr->m_volume_ssrc.end()) { volume = vol_it->second; } - auto &buf = pair.first; - const size_t n = std::min(static_cast(buf.size()), static_cast(frameCount * 2ULL)); - for (size_t i = 0; i < n; i++) { + + static std::array buf; + + if (!pair.first.PopSamples(buf.data(), 480 * 2)) continue; + + for (size_t i = 0; i < 480 * 2; i++) { pOutputF32[i] += volume * buf[i] / 32768.F; } - buf.erase(buf.begin(), buf.begin() + n); } } @@ -201,7 +204,14 @@ void AudioManager::AddSSRC(uint32_t ssrc) { int error; if (m_sources.find(ssrc) == m_sources.end()) { auto *decoder = opus_decoder_create(48000, 2, &error); - m_sources.insert(std::make_pair(ssrc, std::make_pair(std::deque {}, decoder))); + auto &s = Abaddon::Get().GetSettings(); + m_sources.insert(std::make_pair(ssrc, std::make_pair( + JitterBuffer( + s.JitterDesiredLatency, + s.JitterMaximumLatency, + 2, + 48000), + decoder))); } } @@ -241,7 +251,7 @@ void AudioManager::FeedMeOpus(uint32_t ssrc, const std::vector &data) { } else { UpdateReceiveVolume(ssrc, pcm.data(), decoded); auto &buf = it->second.first; - buf.insert(buf.end(), pcm.begin(), pcm.begin() + decoded * 2); + buf.PushSamples(pcm.data(), decoded * 2); } } } diff --git a/src/audio/manager.hpp b/src/audio/manager.hpp index 5716fc5..56882fd 100644 --- a/src/audio/manager.hpp +++ b/src/audio/manager.hpp @@ -21,6 +21,7 @@ #endif #include "devices.hpp" +#include "jitterbuffer.hpp" // clang-format on class AudioManager { @@ -136,7 +137,7 @@ private: mutable std::mutex m_rnn_mutex; #endif - std::unordered_map, OpusDecoder *>> m_sources; + std::unordered_map, OpusDecoder *>> m_sources; OpusEncoder *m_encoder; diff --git a/src/settings.cpp b/src/settings.cpp index fc76ddb..6dab229 100644 --- a/src/settings.cpp +++ b/src/settings.cpp @@ -130,6 +130,8 @@ void SettingsManager::DefineSettings() { AddSetting("voice", "vad", "gate"s, &Settings::VAD); #endif AddSetting("voice", "backends", ""s, &Settings::Backends); + AddSetting("voice", "jitter_latency_desired", 50, &Settings::JitterDesiredLatency); + AddSetting("voice", "jitter_latency_maximum", 200, &Settings::JitterMaximumLatency); } void SettingsManager::ReadSettings() { diff --git a/src/settings.hpp b/src/settings.hpp index 5805452..0b0f6e2 100644 --- a/src/settings.hpp +++ b/src/settings.hpp @@ -52,6 +52,8 @@ public: // [voice] std::string VAD; std::string Backends; + int JitterDesiredLatency; + int JitterMaximumLatency; // [windows] bool HideConsole;