/* global TransformStream */ // Worker for E2EE/Insertable streams. // /** * Polyfill RTCEncoded(Audio|Video)Frame.getMetadata() (not available in M83, available M84+). * The polyfill can not be done on the prototype since its not exposed in workers. Instead, * it is done as another transformation to keep it separate. */ function polyFillEncodedFrameMetadata(encodedFrame, controller) { if (!encodedFrame.getMetadata) { encodedFrame.getMetadata = function() { return { // TODO: provide a more complete polyfill based on additionalData for video. synchronizationSource: this.synchronizationSource, contributingSources: this.contributingSources }; }; } controller.enqueue(encodedFrame); } // We use a ringbuffer of keys so we can change them and still decode packets that were // encrypted with an old key. const keyRingSize = 3; // We use a 96 bit IV for AES GCM. This is signalled in plain together with the // packet. See https://developer.mozilla.org/en-US/docs/Web/API/AesGcmParams const ivLength = 12; // We use a 128 bit key for AES GCM. const keyGenParameters = { name: 'AES-GCM', length: 128 }; // We copy the first bytes of the VP8 payload unencrypted. // For keyframes this is 10 bytes, for non-keyframes (delta) 3. See // https://tools.ietf.org/html/rfc6386#section-9.1 // This allows the bridge to continue detecting keyframes (only one byte needed in the JVB) // and is also a bit easier for the VP8 decoder (i.e. it generates funny garbage pictures // instead of being unable to decode). // This is a bit for show and we might want to reduce to 1 unconditionally in the final version. // // For audio (where frame.type is not set) we do not encrypt the opus TOC byte: // https://tools.ietf.org/html/rfc6716#section-3.1 const unencryptedBytes = { key: 10, delta: 3, undefined: 1 // frame.type is not set on audio }; // Salt used in key derivation // FIXME: We currently use the MUC room name for this which has the same lifetime // as this worker. While not (pseudo)random as recommended in // https://developer.mozilla.org/en-US/docs/Web/API/Pbkdf2Params // this is easily available and the same for all participants. // We currently do not enforce a minimum length of 16 bytes either. let _keySalt; /** * Derives a AES-GCM key from the input using PBKDF2 * The key length can be configured above and should be either 128 or 256 bits. * @param {Uint8Array} keyBytes - Value to derive key from * @param {Uint8Array} salt - Salt used in key derivation */ async function deriveKey(keyBytes, salt) { // https://developer.mozilla.org/en-US/docs/Web/API/SubtleCrypto/importKey const material = await crypto.subtle.importKey('raw', keyBytes, 'PBKDF2', false, [ 'deriveBits', 'deriveKey' ]); // https://developer.mozilla.org/en-US/docs/Web/API/SubtleCrypto/deriveKey#PBKDF2 return crypto.subtle.deriveKey({ name: 'PBKDF2', salt, iterations: 100000, hash: 'SHA-256' }, material, keyGenParameters, false, [ 'encrypt', 'decrypt' ]); } /** * Per-participant context holding the cryptographic keys and * encode/decode functions */ class Context { /** * @param {string} id - local muc resourcepart */ constructor(id) { // An array (ring) of keys that we use for sending and receiving. this._cryptoKeyRing = new Array(keyRingSize); // A pointer to the currently used key. this._currentKeyIndex = -1; // We keep track of how many frames we have sent per ssrc. // Starts with a random offset similar to the RTP sequence number. this._sendCounts = new Map(); this._id = id; } /** * Derives a per-participant key. * @param {Uint8Array} keyBytes - Value to derive key from * @param {Uint8Array} salt - Salt used in key derivation */ async deriveKey(keyBytes, salt) { const encoder = new TextEncoder(); const idBytes = encoder.encode(this._id); // Separate both parts by a null byte to avoid ambiguity attacks. const participantSalt = new Uint8Array(salt.byteLength + idBytes.byteLength + 1); participantSalt.set(salt); participantSalt.set(idBytes, salt.byteLength + 1); return deriveKey(keyBytes, participantSalt); } /** * Sets a key and starts using it for encrypting. * @param {CryptoKey} key * @param {Number} keyIndex */ setKey(key, keyIndex) { this._currentKeyIndex = keyIndex % this._cryptoKeyRing.length; this._cryptoKeyRing[this._currentKeyIndex] = key; } /** * Construct the IV used for AES-GCM and sent (in plain) with the packet similar to * https://tools.ietf.org/html/rfc7714#section-8.1 * It concatenates * - the 32 bit synchronization source (SSRC) given on the encoded frame, * - the 32 bit rtp timestamp given on the encoded frame, * - a send counter that is specific to the SSRC. Starts at a random number. * The send counter is essentially the pictureId but we currently have to implement this ourselves. * There is no XOR with a salt. Note that this IV leaks the SSRC to the receiver but since this is * randomly generated and SFUs may not rewrite this is considered acceptable. * The SSRC is used to allow demultiplexing multiple streams with the same key, as described in * https://tools.ietf.org/html/rfc3711#section-4.1.1 * The RTP timestamp is 32 bits and advances by the codec clock rate (90khz for video, 48khz for * opus audio) every second. For video it rolls over roughly every 13 hours. * The send counter will advance at the frame rate (30fps for video, 50fps for 20ms opus audio) * every second. It will take a long time to roll over. * * See also https://developer.mozilla.org/en-US/docs/Web/API/AesGcmParams */ makeIV(synchronizationSource, timestamp) { const iv = new ArrayBuffer(ivLength); const ivView = new DataView(iv); // having to keep our own send count (similar to a picture id) is not ideal. if (!this._sendCounts.has(synchronizationSource)) { // Initialize with a random offset, similar to the RTP sequence number. this._sendCounts.set(synchronizationSource, Math.floor(Math.random() * 0xFFFF)); } const sendCount = this._sendCounts.get(synchronizationSource); ivView.setUint32(0, synchronizationSource); ivView.setUint32(4, timestamp); ivView.setUint32(8, sendCount % 0xFFFF); this._sendCounts.set(synchronizationSource, sendCount + 1); return iv; } /** * Function that will be injected in a stream and will encrypt the given encoded frames. * * @param {RTCEncodedVideoFrame|RTCEncodedAudioFrame} encodedFrame - Encoded video frame. * @param {TransformStreamDefaultController} controller - TransportStreamController. * * The packet format is described below. One of the design goals was to not require * changes to the SFU which for video requires not encrypting the keyframe bit of VP8 * as SFUs need to detect a keyframe (framemarking or the generic frame descriptor will * solve this eventually). This also "hides" that a client is using E2EE a bit. * * Note that this operates on the full frame, i.e. for VP8 the data described in * https://tools.ietf.org/html/rfc6386#section-9.1 * * The VP8 payload descriptor described in * https://tools.ietf.org/html/rfc7741#section-4.2 * is part of the RTP packet and not part of the frame and is not controllable by us. * This is fine as the SFU keeps having access to it for routing. * * The encrypted frame is formed as follows: * 1) Leave the first (10, 3, 1) bytes unencrypted, depending on the frame type and kind. * 2) Form the GCM IV for the frame as described above. * 3) Encrypt the rest of the frame using AES-GCM. * 4) Allocate space for the encrypted frame. * 5) Copy the unencrypted bytes to the start of the encrypted frame. * 6) Append the ciphertext to the encrypted frame. * 7) Append the IV. * 8) Append a single byte for the key identifier. TODO: we don't need all the bits. * 9) Enqueue the encrypted frame for sending. */ encodeFunction(encodedFrame, controller) { const keyIndex = this._currentKeyIndex; if (this._cryptoKeyRing[keyIndex]) { const iv = this.makeIV(encodedFrame.getMetadata().synchronizationSource, encodedFrame.timestamp); return crypto.subtle.encrypt({ name: 'AES-GCM', iv, additionalData: new Uint8Array(encodedFrame.data, 0, unencryptedBytes[encodedFrame.type]) }, this._cryptoKeyRing[keyIndex], new Uint8Array(encodedFrame.data, unencryptedBytes[encodedFrame.type])) .then(cipherText => { const newData = new ArrayBuffer(unencryptedBytes[encodedFrame.type] + cipherText.byteLength + iv.byteLength + 1); const newUint8 = new Uint8Array(newData); newUint8.set( new Uint8Array(encodedFrame.data, 0, unencryptedBytes[encodedFrame.type])); // copy first bytes. newUint8.set( new Uint8Array(cipherText), unencryptedBytes[encodedFrame.type]); // add ciphertext. newUint8.set( new Uint8Array(iv), unencryptedBytes[encodedFrame.type] + cipherText.byteLength); // append IV. newUint8[unencryptedBytes[encodedFrame.type] + cipherText.byteLength + ivLength] = keyIndex; // set key index. encodedFrame.data = newData; return controller.enqueue(encodedFrame); }, e => { console.error(e); // We are not enqueuing the frame here on purpose. }); } /* NOTE WELL: * This will send unencrypted data (only protected by DTLS transport encryption) when no key is configured. * This is ok for demo purposes but should not be done once this becomes more relied upon. */ controller.enqueue(encodedFrame); } /** * Function that will be injected in a stream and will decrypt the given encoded frames. * * @param {RTCEncodedVideoFrame|RTCEncodedAudioFrame} encodedFrame - Encoded video frame. * @param {TransformStreamDefaultController} controller - TransportStreamController. * * The decrypted frame is formed as follows: * 1) Extract the key index from the last byte of the encrypted frame. * If there is no key associated with the key index, the frame is enqueued for decoding * and these steps terminate. * 2) Determine the frame type in order to look up the number of unencrypted header bytes. * 2) Extract the 12-byte IV from its position near the end of the packet. * Note: the IV is treated as opaque and not reconstructed from the input. * 3) Decrypt the encrypted frame content after the unencrypted bytes using AES-GCM. * 4) Allocate space for the decrypted frame. * 5) Copy the unencrypted bytes from the start of the encrypted frame. * 6) Append the plaintext to the decrypted frame. * 7) Enqueue the decrypted frame for decoding. */ decodeFunction(encodedFrame, controller) { const data = new Uint8Array(encodedFrame.data); const keyIndex = data[encodedFrame.data.byteLength - 1]; if (this._cryptoKeyRing[keyIndex]) { const iv = new Uint8Array(encodedFrame.data, encodedFrame.data.byteLength - ivLength - 1, ivLength); const cipherTextStart = unencryptedBytes[encodedFrame.type]; const cipherTextLength = encodedFrame.data.byteLength - (unencryptedBytes[encodedFrame.type] + ivLength + 1); return crypto.subtle.decrypt({ name: 'AES-GCM', iv, additionalData: new Uint8Array(encodedFrame.data, 0, unencryptedBytes[encodedFrame.type]) }, this._cryptoKeyRing[keyIndex], new Uint8Array(encodedFrame.data, cipherTextStart, cipherTextLength)) .then(plainText => { const newData = new ArrayBuffer(unencryptedBytes[encodedFrame.type] + plainText.byteLength); const newUint8 = new Uint8Array(newData); newUint8.set(new Uint8Array(encodedFrame.data, 0, unencryptedBytes[encodedFrame.type])); newUint8.set(new Uint8Array(plainText), unencryptedBytes[encodedFrame.type]); encodedFrame.data = newData; return controller.enqueue(encodedFrame); }, e => { console.error(e); // TODO: notify the application about error status. // TODO: For video we need a better strategy since we do not want to based any // non-error frames on a garbage keyframe. if (encodedFrame.type === undefined) { // audio, replace with silence. // audio, replace with silence. const newData = new ArrayBuffer(3); const newUint8 = new Uint8Array(newData); newUint8.set([ 0xd8, 0xff, 0xfe ]); // opus silence frame. encodedFrame.data = newData; controller.enqueue(encodedFrame); } }); } else if (keyIndex >= this._cryptoKeyRing.length && this._cryptoKeyRing[this._currentKeyIndex]) { // If we are encrypting but don't have a key for the remote drop the frame. // This is a heuristic since we don't know whether a packet is encrypted, // do not have a checksum and do not have signaling for whether a remote participant does // encrypt or not. return; } // TODO: this just passes through to the decoder. Is that ok? If we don't know the key yet // we might want to buffer a bit but it is still unclear how to do that (and for how long etc). controller.enqueue(encodedFrame); } } const contexts = new Map(); // Map participant id => context onmessage = async event => { const { operation } = event.data; if (operation === 'initialize') { _keySalt = event.data.salt; } else if (operation === 'encode') { const { readableStream, writableStream, participantId } = event.data; if (!contexts.has(participantId)) { contexts.set(participantId, new Context(participantId)); } const context = contexts.get(participantId); const transformStream = new TransformStream({ transform: context.encodeFunction.bind(context) }); readableStream .pipeThrough(new TransformStream({ transform: polyFillEncodedFrameMetadata // M83 polyfill. })) .pipeThrough(transformStream) .pipeTo(writableStream); } else if (operation === 'decode') { const { readableStream, writableStream, participantId } = event.data; if (!contexts.has(participantId)) { contexts.set(participantId, new Context(participantId)); } const context = contexts.get(participantId); const transformStream = new TransformStream({ transform: context.decodeFunction.bind(context) }); readableStream .pipeThrough(new TransformStream({ transform: polyFillEncodedFrameMetadata // M83 polyfill. })) .pipeThrough(transformStream) .pipeTo(writableStream); } else if (operation === 'setKey') { const { participantId, key, keyIndex } = event.data; if (!contexts.has(participantId)) { contexts.set(participantId, new Context(participantId)); } const context = contexts.get(participantId); if (key) { context.setKey(await context.deriveKey(key, _keySalt), keyIndex); } else { context.setKey(false, keyIndex); } } else if (operation === 'cleanup') { const { participantId } = event.data; contexts.delete(participantId); } else { console.error('e2ee worker', operation); } };