You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

VADTalkMutedDetection.js 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298
  1. import { EventEmitter } from 'events';
  2. import { VAD_SCORE_PUBLISHED, VAD_TALK_WHILE_MUTED } from './DetectionEvents';
  3. import { getLogger } from 'jitsi-meet-logger';
  4. import TrackVADEmitter from '../detection/TrackVADEmitter';
  5. import * as JitsiConferenceEvents from '../../JitsiConferenceEvents';
  // Module-level logger, obtained from getLogger with this module's filename.
  const logger = getLogger(__filename);
  /**
   * Threshold that the average of the VAD scores gathered over one processing window must exceed
   * (strictly greater than) for the talk-while-muted event to be triggered.
   * @type {number}
   */
  const VAD_AVG_THRESHOLD = 0.6;

  /**
   * VAD score that opens a processing window: when a single sample scores strictly above this level
   * (and no window is already open) all subsequent scores are gathered for PROCESS_TIME_FRAME_SPAN_MS.
   * @type {number}
   */
  const VAD_VOICE_LEVEL = 0.9;

  /**
   * Value handed to TrackVADEmitter.create as its second argument. Per the original note it defines how
   * many audio samples are processed at a time.
   * NOTE(review): the name says "sample rate" but the description reads like a buffer size - confirm
   * against TrackVADEmitter.
   * @type {number}
   */
  const VAD_EMITTER_SAMPLE_RATE = 4096;

  /**
   * Length, in milliseconds, of the window over which scores are gathered before their average is
   * compared against VAD_AVG_THRESHOLD.
   * @type {number}
   */
  const PROCESS_TIME_FRAME_SPAN_MS = 1500;
  /**
   * Detects a user trying to speak while locally muted and fires VAD_TALK_WHILE_MUTED, based on the voice
   * activity scores published by a {@link TrackVADEmitter}.
   *
   * Emitter creation is asynchronous, so every create / mute-toggle / destroy operation is appended to a
   * single promise chain (<tt>_vadInitTracker</tt>) and therefore runs strictly in the order in which the
   * corresponding conference events arrived.
   */
  export default class VADTalkMutedDetection extends EventEmitter {
      /**
       * Creates <tt>VADTalkMutedDetection</tt>
       * @param {JitsiConference} conference - JitsiConference instance that created us.
       * @param {Function} createVADProcessor - Function that creates a Voice activity detection processor. Its
       * result is consumed through a promise chain, so it may return the processor or a promise for it.
       * The processor needs to implement the following functions:
       * - <tt>getSampleLength()</tt> - Returns the sample size accepted by calculateAudioFrameVAD.
       * - <tt>getRequiredPCMFrequency()</tt> - Returns the PCM frequency at which the processor operates.
       * - <tt>calculateAudioFrameVAD(pcmSample)</tt> - Process a 32 float pcm sample of getSampleLength size.
       * @constructor
       */
      constructor(conference, createVADProcessor) {
          super();

          /**
           * Member function that instantiates a VAD processor.
           */
          this._createVADProcessor = createVADProcessor;

          /**
           * Current {@link TrackVADEmitter}. VAD Emitter uses a {@link JitsiLocalTrack} and VAD processor to
           * generate periodic voice probability scores. Null while no emitter exists.
           */
          this._vadEmitter = null;

          /**
           * Flag which denotes the current state of the detection service i.e. if there is already a processing
           * operation ongoing.
           */
          this._processing = false;

          /**
           * Buffer that keeps the VAD scores for a period of time.
           */
          this._scoreArray = [];

          /**
           * Handle of the timeout that closes the current processing window, null when none was scheduled yet.
           */
          this._processTimeout = null;

          /**
           * Tail of the promise chain used to serialize the operations triggered by TRACK_ADDED,
           * TRACK_MUTE_CHANGED and TRACK_REMOVED. Because the emitter is created asynchronously we need to make
           * sure it is initialized before it is used or destroyed (when changing the device for instance).
           */
          this._vadInitTracker = Promise.resolve();

          /**
           * {@link JitsiConference} bindings.
           */
          conference.on(JitsiConferenceEvents.TRACK_MUTE_CHANGED, this._trackMuteChanged.bind(this));
          conference.on(JitsiConferenceEvents.TRACK_ADDED, this._trackAdded.bind(this));
          conference.on(JitsiConferenceEvents.TRACK_REMOVED, this._trackRemoved.bind(this));

          // TODO do we need to handle the case where tracks are removed, make sure this cleans up properly so
          // we don't have any leaks i.e. stale JitsiLocalTracks. Note that the listeners bound above are never
          // detached from the conference by this class.
      }

      /**
       * Appends an operation to the serialized promise chain. The chain tail is stored back so that the next
       * operation waits for this one, and failures are logged instead of leaving the chain rejected (which
       * would prevent every later operation from running).
       *
       * @param {Function} operation - Callback to run once all previously queued operations have settled.
       * @returns {void}
       */
      _chainOperation(operation) {
          this._vadInitTracker = this._vadInitTracker
              .then(operation)
              .catch(error => {
                  logger.error('VAD talk while muted detection operation failed: ', error);
              });
      }

      /**
       * Determine if the current score is high enough that we should start the final score processing, and make
       * sure there isn't already a process operation ongoing.
       *
       * @param {number} vadScore - PCM sample VAD score.
       * @return {boolean} - true when the score is strictly above VAD_VOICE_LEVEL and no window is open.
       */
      _shouldStartVADCompute(vadScore) {
          return vadScore > VAD_VOICE_LEVEL && !this._processing;
      }

      /**
       * Determine if the computed score over the configured time span should trigger an event.
       *
       * @param {number} computedScore - Computed VAD score.
       * @returns {boolean} - Should or shouldn't trigger.
       */
      _shouldTriggerNotification(computedScore) {
          return computedScore > VAD_AVG_THRESHOLD;
      }

      /**
       * Start the {@link TrackVADEmitter} and attach the event listener. Does nothing when there is no emitter.
       * @returns {void}
       */
      _startVADEmitter() {
          if (!this._vadEmitter) {
              return;
          }

          this._vadEmitter.on(VAD_SCORE_PUBLISHED, this._processVADScore.bind(this));
          this._vadEmitter.start();
      }

      /**
       * Stop the {@link TrackVADEmitter} and detach the event listener. Does nothing when there is no emitter.
       * @returns {void}
       */
      _stopVADEmitter() {
          if (!this._vadEmitter) {
              return;
          }

          this._vadEmitter.removeAllListeners(VAD_SCORE_PUBLISHED);
          this._vadEmitter.stop();
      }

      /**
       * Calculates the average value of an array of scores.
       *
       * @param {Array<number>|Float32Array} scoreArray - Array of vad scores.
       * @returns {number} - Score average, 0 for an empty array.
       */
      _calculateAverage(scoreArray) {
          if (!scoreArray.length) {
              return 0;
          }

          const sum = scoreArray.reduce((total, score) => total + score, 0);

          return sum / scoreArray.length;
      }

      /**
       * Compute cumulative VAD score function called once the PROCESS_TIME_FRAME_SPAN_MS timeout has elapsed.
       * @returns {void}
       * @fires VAD_TALK_WHILE_MUTED
       */
      _calculateVADScore() {
          const score = this._calculateAverage(this._scoreArray);

          if (this._shouldTriggerNotification(score)) {
              /**
               * User is talking while the mic is muted, generate event.
               *
               * @event VAD_TALK_WHILE_MUTED.
               * @type {Object}
               */
              this.emit(VAD_TALK_WHILE_MUTED, {});

              // Event was fired. Stop event emitter and remove listeners so no residue events kick off after
              // this point and a single VAD_TALK_WHILE_MUTED is generated per mic muted state.
              this._stopVADEmitter();
          }

          // We reset the context in case a new process phase needs to be triggered.
          this._reset();
      }

      /**
       * Listens for {@link TrackVADEmitter} events and processes them.
       *
       * @param {Object} vadScore -VAD score emitted by {@link TrackVADEmitter}
       * @param {Date} vadScore.timestamp - Exact time at which processed PCM sample was generated.
       * @param {number} vadScore.score - VAD score on a scale from 0 to 1 (i.e. 0.7)
       * @param {string} vadScore.deviceId - Device id of the associated track.
       * @returns {void}
       * @listens VAD_SCORE_PUBLISHED
       */
      _processVADScore(vadScore) {
          // Because we remove all listeners on the vadEmitter once the main event is triggered,
          // there is no need to check for rogue events.
          if (this._shouldStartVADCompute(vadScore.score)) {
              this._processing = true;

              // Start gathering VAD scores for the configured period of time.
              this._processTimeout = setTimeout(this._calculateVADScore.bind(this), PROCESS_TIME_FRAME_SPAN_MS);
          }

          // There is a processing phase on going, add score to buffer array. The score that opened the window
          // is included as well.
          if (this._processing) {
              this._scoreArray.push(vadScore.score);
          }
      }

      /**
       * Determines whether a specific {@link JitsiTrack} represents a local audio track.
       *
       * @param {JitsiTrack} track - The track to be checked whether it represents a local audio track.
       * @return {boolean} - true if the specified track represents a local audio track; otherwise, false.
       */
      _isLocalAudioTrack(track) {
          return track.isAudioTrack() && track.isLocal();
      }

      /**
       * Reset the processing context, clear buffer, cancel the timeout trigger.
       *
       * @returns {void}
       */
      _reset() {
          this._processing = false;
          this._scoreArray = [];
          clearTimeout(this._processTimeout);
      }

      /**
       * Notifies the detector that a track was added to the associated {@link JitsiConference}.
       * Only take into account local audio tracks.
       * @param {JitsiTrack} track - The added track.
       * @returns {void}
       * @listens TRACK_ADDED
       */
      _trackAdded(track) {
          if (!this._isLocalAudioTrack(track)) {
              return;
          }

          // Queue the creation so that successive TRACK_ADDED / TRACK_REMOVED events create and destroy the
          // processing context in the order they were received.
          this._chainOperation(() =>
              Promise.resolve(this._createVADProcessor())
                  .then(vadProcessor =>
                      TrackVADEmitter.create(track.getDeviceId(), VAD_EMITTER_SAMPLE_RATE, vadProcessor)
                  )
                  .then(vadEmitter => {
                      logger.debug('Created VAD emitter for track: ', track.getTrackLabel());
                      this._vadEmitter = vadEmitter;

                      // Detection is only relevant while the track is muted.
                      if (track.isMuted()) {
                          this._startVADEmitter();
                      }
                  })
          );
      }

      /**
       * Notifies the detector that the mute state of a {@link JitsiConference} track has changed. Only takes into
       * account local audio tracks. In case the track was muted the detector starts the {@link TrackVADEmitter}
       * otherwise it's stopped.
       * @param {JitsiTrack} track - The track whose mute state has changed.
       * @returns {void}
       * @listens TRACK_MUTE_CHANGED
       */
      _trackMuteChanged(track) {
          if (!this._isLocalAudioTrack(track)) {
              return;
          }

          this._chainOperation(() => {
              // Reset the processing context in between muted states so that each individual mute phase can
              // generate its own event.
              this._reset();

              // The start/stop helpers are no-ops when no emitter exists (creation failed, not yet added or
              // already removed).
              if (track.isMuted()) {
                  this._startVADEmitter();
              } else {
                  this._stopVADEmitter();
              }
          });
      }

      /**
       * Notifies the detector that a track associated with the {@link JitsiConference} was removed. Only takes
       * into account local audio tracks. Cleans up resources associated with the track and resets the processing
       * context.
       *
       * @param {JitsiTrack} track - The removed track.
       * @returns {void}
       * @listens TRACK_REMOVED
       */
      _trackRemoved(track) {
          if (!this._isLocalAudioTrack(track)) {
              return;
          }

          // Queued behind any pending creation so the emitter being destroyed is the one that was created.
          this._chainOperation(() => {
              logger.debug('Removing track from VAD detection - ', track.getTrackLabel());

              if (this._vadEmitter) {
                  this._stopVADEmitter();
                  this._reset();
                  this._vadEmitter.destroy();
                  this._vadEmitter = null;
              }
          });
      }
  }