/*
 *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "rtp_receiver_audio.h"

#include <cassert>  // assert
#include <cstring>  // memcpy()
#include <math.h>   // pow()

#include "critical_section_wrapper.h"

namespace webrtc {
RTPReceiverAudio::RTPReceiverAudio(const WebRtc_Word32 id):
    _id(id),
    _lastReceivedFrequency(8000),
    _telephoneEvent(false),
    _telephoneEventForwardToDecoder(false),
    _telephoneEventDetectEndOfTone(false),
    _telephoneEventPayloadType(-1),
    _telephoneEventReported(),
    _cngNBPayloadType(-1),
    _cngWBPayloadType(-1),
    _cngSWBPayloadType(-1),
    _cngPayloadType(-1),
    _G722PayloadType(-1),
    _lastReceivedG722(false),
    _criticalSectionFeedback(*CriticalSectionWrapper::CreateCriticalSection()),
    _cbAudioFeedback(NULL)
{
}

RTPReceiverAudio::~RTPReceiverAudio()
{
    delete &_criticalSectionFeedback;
}

WebRtc_Word32
RTPReceiverAudio::Init()
{
    _lastReceivedFrequency = 8000;
    _telephoneEvent = false;
    _telephoneEventForwardToDecoder = false;
    _telephoneEventDetectEndOfTone = false;
    _telephoneEventPayloadType = -1;

    while(_telephoneEventReported.Size() > 0)
    {
        _telephoneEventReported.Erase(_telephoneEventReported.First());
    }
    _cngNBPayloadType = -1;
    _cngWBPayloadType = -1;
    _cngSWBPayloadType = -1;
    _cngPayloadType = -1;
    _G722PayloadType = -1;
    _lastReceivedG722 = false;
    return 0;
}

void
RTPReceiverAudio::ChangeUniqueId(const WebRtc_Word32 id)
{
    _id = id;
}

WebRtc_Word32
RTPReceiverAudio::RegisterIncomingAudioCallback(RtpAudioFeedback* incomingMessagesCallback)
{
    CriticalSectionScoped lock(_criticalSectionFeedback);
    _cbAudioFeedback = incomingMessagesCallback;
    return 0;
}
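
// RFC 3551 assigns G.722 an RTP clock rate of 8,000 Hz even though the codec
// samples audio at 16 kHz, so we report 8,000 Hz whenever the last received
// codec was G.722.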
WebRtc_UWord32
RTPReceiverAudio::AudioFrequency() const
{
    if(_lastReceivedG722)
    {
        return 8000;
    }
    return _lastReceivedFrequency;
}

// Out-of-band TelephoneEvent (DTMF) detection
WebRtc_Word32
RTPReceiverAudio::SetTelephoneEventStatus(const bool enable,
                                          const bool forwardToDecoder,
                                          const bool detectEndOfTone)
{
    _telephoneEvent = enable;
    _telephoneEventDetectEndOfTone = detectEndOfTone;
    _telephoneEventForwardToDecoder = forwardToDecoder;
    return 0;
}

// Is out-of-band TelephoneEvent (DTMF) detection turned on/off?
bool
RTPReceiverAudio::TelephoneEvent() const
{
    return _telephoneEvent;
}

// Is forwarding of out-of-band telephone events turned on/off?
bool
RTPReceiverAudio::TelephoneEventForwardToDecoder() const
{
    return _telephoneEventForwardToDecoder;
}

bool
RTPReceiverAudio::TelephoneEventPayloadType(const WebRtc_Word8 payloadType) const
{
    return (_telephoneEventPayloadType == payloadType);
}
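
// Comfort noise (CN, RFC 3389) can be registered at 8000, 16000 or 32000 Hz.
// Narrowband CN normally uses the static payload type 13, while the wideband
// and super-wideband variants are registered with dynamic payload types.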
bool
RTPReceiverAudio::CNGPayloadType(const WebRtc_Word8 payloadType,
                                 WebRtc_UWord32& frequency)
{
    // we can have three CNG on 8000Hz, 16000Hz and 32000Hz
    if(_cngNBPayloadType == payloadType)
    {
        frequency = 8000;
        if((_cngPayloadType != -1) && (_cngPayloadType != _cngNBPayloadType))
        {
            ResetStatistics();
        }
        _cngPayloadType = _cngNBPayloadType;
        return true;
    } else if(_cngWBPayloadType == payloadType)
    {
        // if last received codec is G.722 we must use frequency 8000
        if(_lastReceivedG722)
        {
            frequency = 8000;
        } else
        {
            frequency = 16000;
        }
        if((_cngPayloadType != -1) && (_cngPayloadType != _cngWBPayloadType))
        {
            ResetStatistics();
        }
        _cngPayloadType = _cngWBPayloadType;
        return true;
    } else if(_cngSWBPayloadType == payloadType)
    {
        frequency = 32000;
        if((_cngPayloadType != -1) && (_cngPayloadType != _cngSWBPayloadType))
        {
            ResetStatistics();
        }
        _cngPayloadType = _cngSWBPayloadType;
        return true;
    } else
    {
        // not CNG
        if(_G722PayloadType == payloadType)
        {
            _lastReceivedG722 = true;
        } else
        {
            _lastReceivedG722 = false;
        }
    }
    return false;
}

/*
    Sample based or frame based codecs based on RFC 3551

    NOTE! There is one error in the RFC, stating G.722 uses 8 bits/sample.
    The correct rate is 4 bits/sample.

    name of                              sampling              default
    encoding  sample/frame  bits/sample      rate  ms/frame  ms/packet

    Sample based audio codecs
    DVI4      sample        4                var.                   20
    G722      sample        4              16,000                   20
    G726-40   sample        5               8,000                   20
    G726-32   sample        4               8,000                   20
    G726-24   sample        3               8,000                   20
    G726-16   sample        2               8,000                   20
    L8        sample        8                var.                   20
    L16       sample        16               var.                   20
    PCMA      sample        8                var.                   20
    PCMU      sample        8                var.                   20

    Frame based audio codecs
    G723      frame         N/A             8,000        30         30
    G728      frame         N/A             8,000       2.5         20
    G729      frame         N/A             8,000        10         20
    G729D     frame         N/A             8,000        10         20
    G729E     frame         N/A             8,000        10         20
    GSM       frame         N/A             8,000        20         20
    GSM-EFR   frame         N/A             8,000        20         20
    LPC       frame         N/A             8,000        20         20
    MPA       frame         N/A              var.      var.

    G7221     frame         N/A
*/

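// The bitsPerSample value assigned below is what ParseAudioCodecSpecific()
// later uses to split multi-channel payloads. Illustrative example (not from
// the source): for G.722 (4 bits/sample) carried in a 160-byte payload with
// 2 channels, channelLength = 160 / 2 = 80 bytes and
// samplesPerChannel = 160 * 8 / 4 / 2 = 160 samples per channel.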
ModuleRTPUtility::Payload*
RTPReceiverAudio::RegisterReceiveAudioPayload(const WebRtc_Word8 payloadName[RTP_PAYLOAD_NAME_SIZE],
                                              const WebRtc_Word8 payloadType,
                                              const WebRtc_UWord32 frequency,
                                              const WebRtc_UWord8 channels,
                                              const WebRtc_UWord32 rate)
{
    WebRtc_Word32 length = (WebRtc_Word32)strlen(payloadName);
    if(length >= RTP_PAYLOAD_NAME_SIZE)
    {
        // the name and its terminating null must fit in RTP_PAYLOAD_NAME_SIZE bytes
        assert(false);
        return NULL;
    }

    if (ModuleRTPUtility::StringCompare(payloadName,"telephone-event",15))
    {
        _telephoneEventPayloadType = payloadType;
    }
    if (ModuleRTPUtility::StringCompare(payloadName,"cn",2))
    {
        // we can have three CNG on 8000Hz, 16000Hz and 32000Hz
        if(frequency == 8000)
        {
            _cngNBPayloadType = payloadType;

        } else if(frequency == 16000)
        {
            _cngWBPayloadType = payloadType;

        } else if(frequency == 32000)
        {
            _cngSWBPayloadType = payloadType;
        } else
        {
            assert(false);
            return NULL;
        }
    }
    WebRtc_UWord8 bitsPerSample = 0; // zero implies frame based
    if (ModuleRTPUtility::StringCompare(payloadName,"DVI4",4))
    {
        bitsPerSample = 4;
    } else if(ModuleRTPUtility::StringCompare(payloadName,"G722",4))
    {
        if(ModuleRTPUtility::StringCompare(payloadName,"G7221",5))
        {
            // frame based
        } else
        {
            _G722PayloadType = payloadType;
            bitsPerSample = 4;
        }
    } else if(ModuleRTPUtility::StringCompare(payloadName,"G726-40",7))
    {
        bitsPerSample = 5;
    } else if(ModuleRTPUtility::StringCompare(payloadName,"G726-32",7))
    {
        bitsPerSample = 4;
    } else if(ModuleRTPUtility::StringCompare(payloadName,"G726-24",7))
    {
        bitsPerSample = 3;
    } else if(ModuleRTPUtility::StringCompare(payloadName,"G726-16",7))
    {
        bitsPerSample = 2;
    } else if(ModuleRTPUtility::StringCompare(payloadName,"L8",2))
    {
        bitsPerSample = 8;
    } else if(ModuleRTPUtility::StringCompare(payloadName,"L16",3))
    {
        bitsPerSample = 16;
    } else if(ModuleRTPUtility::StringCompare(payloadName,"PCMU",4))
    {
        bitsPerSample = 8;
    } else if(ModuleRTPUtility::StringCompare(payloadName,"PCMA",4))
    {
        bitsPerSample = 8;
    }

    ModuleRTPUtility::Payload* payload = new ModuleRTPUtility::Payload;
    memcpy(payload->name, payloadName, length+1);
    payload->typeSpecific.Audio.frequency = frequency;
    payload->typeSpecific.Audio.channels = channels;
    payload->typeSpecific.Audio.bitsPerSample = bitsPerSample;
    payload->typeSpecific.Audio.rate = rate;
    payload->audio = true;
    return payload;
}

// we are not allowed to have any critsects when calling CallbackOfReceivedPayloadData
WebRtc_Word32
RTPReceiverAudio::ParseAudioCodecSpecific(WebRtcRTPHeader* rtpHeader,
                                          const WebRtc_UWord8* payloadData,
                                          const WebRtc_UWord16 payloadLength,
                                          const ModuleRTPUtility::AudioPayload& audioSpecific,
                                          const bool isRED)
{
    WebRtc_UWord8 newEvents[MAX_NUMBER_OF_PARALLEL_TELEPHONE_EVENTS];
    WebRtc_UWord8 removedEvents[MAX_NUMBER_OF_PARALLEL_TELEPHONE_EVENTS];
    WebRtc_UWord8 numberOfNewEvents = 0;
    WebRtc_UWord8 numberOfRemovedEvents = 0;
    bool telephoneEventPacket = TelephoneEventPayloadType(rtpHeader->header.payloadType);

    if(payloadLength == 0)
    {
        return 0;
    }

    {
        CriticalSectionScoped lock(_criticalSectionFeedback);

        if(telephoneEventPacket)
        {
            // RFC 4733 2.3
            /*
             0                   1                   2                   3
             0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
            |     event     |E|R| volume    |          duration             |
            +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
            */
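            // Each event occupies 4 bytes. Illustrative example (not from the
            // source): the entry {0x05, 0x8A, 0x03, 0x20} decodes to event 5
            // (DTMF digit "5"), E=1 (end of tone), volume 10 (-10 dBm0) and a
            // duration of 0x0320 = 800 timestamp units.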
            if(payloadLength % 4 != 0)
            {
                return -1;
            }
            WebRtc_UWord8 numberOfEvents = payloadLength / 4;

            // sanity
            if(numberOfEvents >= MAX_NUMBER_OF_PARALLEL_TELEPHONE_EVENTS)
            {
                numberOfEvents = MAX_NUMBER_OF_PARALLEL_TELEPHONE_EVENTS;
            }
            for (int n = 0; n < numberOfEvents; n++)
            {
                bool end = (payloadData[(4*n)+1] & 0x80) ? true : false;

                if(_telephoneEventReported.Find(payloadData[4*n]) != NULL)
                {
                    // we have already seen this event
                    if(end)
                    {
                        removedEvents[numberOfRemovedEvents] = payloadData[4*n];
                        numberOfRemovedEvents++;
                        _telephoneEventReported.Erase(payloadData[4*n]);
                    }
                } else
                {
                    if(end)
                    {
                        // don't add if it's the end of a tone
                    } else
                    {
                        newEvents[numberOfNewEvents] = payloadData[4*n];
                        numberOfNewEvents++;
                        _telephoneEventReported.Insert(payloadData[4*n], NULL);
                    }
                }
            }

            // RFC 4733 2.5.1.3 & 2.5.2.3 Long-Duration Events
            // should not be a problem since we don't care about the duration

            // RFC 4733 2.5.1.5 & 2.5.2.4 Multiple Events in a Packet
        }

        if(_telephoneEvent && _cbAudioFeedback)
        {
            for (int n = 0; n < numberOfNewEvents; n++)
            {
                _cbAudioFeedback->OnReceivedTelephoneEvent(_id, newEvents[n], false);
            }
            if(_telephoneEventDetectEndOfTone)
            {
                for (int n = 0; n < numberOfRemovedEvents; n++)
                {
                    _cbAudioFeedback->OnReceivedTelephoneEvent(_id, removedEvents[n], true);
                }
            }
        }
    }
    if(!telephoneEventPacket)
    {
        _lastReceivedFrequency = audioSpecific.frequency;
    }

    // Check if this is a CNG packet, the receiver might want to know
    WebRtc_UWord32 dummy;
    if(CNGPayloadType(rtpHeader->header.payloadType, dummy))
    {
        rtpHeader->type.Audio.isCNG = true;
        rtpHeader->frameType = kAudioFrameCN;
    } else
    {
        rtpHeader->frameType = kAudioFrameSpeech;
        rtpHeader->type.Audio.isCNG = false;
    }

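    // RFC 4733 reserves event codes 0-15 for the DTMF digits 0-9, *, # and
    // A-D; higher codes denote other tones and events, which are never
    // forwarded to the decoder below.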
    // check if it's a DTMF event, hence something we can play out
    if(telephoneEventPacket)
    {
        if(!_telephoneEventForwardToDecoder)
        {
            // don't forward the event to the decoder
            return 0;
        }
        MapItem* first = _telephoneEventReported.First();
        if(first && first->GetId() > 15)
        {
            // don't forward non-DTMF events
            return 0;
        }
    }
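    // RED (RFC 2198): the first payload byte is a block header whose high bit
    // (F) is set when more block headers follow. F == 0 therefore means the
    // packet carries a single block, and the one-byte header holds just the
    // primary payload type.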
    if(isRED && !(payloadData[0] & 0x80))
    {
        // we received only one frame packed in a RED packet; remove the RED wrapper
        rtpHeader->header.payloadType = payloadData[0];

        // only one frame in the RED packet; strip the one-byte header to help NetEQ
        return CallbackOfReceivedPayloadData(payloadData+1,
                                             payloadLength-1,
                                             rtpHeader);
    }
    if(audioSpecific.channels > 1)
    {
        WebRtc_Word32 retVal = 0;
        WebRtc_UWord16 channelLength = payloadLength/audioSpecific.channels;

        if(audioSpecific.bitsPerSample > 0)
        {
            // sanity
            assert((payloadLength*8)%audioSpecific.bitsPerSample == 0);

            // sample based codec

            // build matrix
            WebRtc_UWord8 matrix[IP_PACKET_SIZE];
            WebRtc_UWord32 offsetBytes = 0;
            WebRtc_UWord32 offsetBytesInsert = 0;
            // initialize matrix to 0
            memset(matrix, 0, audioSpecific.channels*channelLength);

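            // The payload interleaves the channels sample by sample; "matrix"
            // de-interleaves it so that channel j occupies the contiguous byte
            // range [j*channelLength, (j+1)*channelLength). For sub-byte codecs
            // (1-7 bits/sample) each sample is read and rewritten using 16-bit
            // wide shifts; 8- and 16-bit samples are copied byte by byte.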
            switch(audioSpecific.bitsPerSample)
            {
            case 1:
            case 2:
            case 3:
            case 4:
            case 5:
            case 6:
            case 7:
                {
                    WebRtc_UWord32 offsetSamples = 0;
                    WebRtc_UWord32 offsetSamplesInsert = 0;
                    WebRtc_UWord16 bitMask = (WebRtc_UWord16)ModuleRTPUtility::pow2(audioSpecific.bitsPerSample)-1;
                    WebRtc_UWord16 samplesPerChannel = payloadLength*8/audioSpecific.bitsPerSample/audioSpecific.channels;

                    for(WebRtc_UWord32 i = 0; i < samplesPerChannel; i++)
                    {
                        WebRtc_UWord8 insertShift = (WebRtc_UWord8)((offsetSamplesInsert+audioSpecific.bitsPerSample)%16);
                        insertShift = 16 - insertShift; // invert the calculation

                        for(WebRtc_UWord32 j = 0; j < audioSpecific.channels; j++)
                        {
                            // get sample
                            WebRtc_UWord16 s = payloadData[offsetBytes] << 8;

                            // check that we don't read outside the memory
                            if(offsetBytes < (WebRtc_UWord32)payloadLength - 2)
                            {
                                s += payloadData[offsetBytes+1];
                            }

                            WebRtc_UWord8 readShift = (WebRtc_UWord8)((offsetSamples+audioSpecific.bitsPerSample)%16);
                            readShift = 16 - readShift; // invert the calculation
                            s >>= readShift;
                            s &= bitMask;

                            // prepare for reading the next sample
                            offsetSamples += audioSpecific.bitsPerSample;
                            if(readShift <= audioSpecific.bitsPerSample)
                            {
                                // the next sample does not fit, or fits exactly
                                offsetSamples -= 8;
                                offsetBytes++;
                            }

                            // insert the sample into the matrix
                            WebRtc_UWord32 columnOffset = j*channelLength;

                            WebRtc_UWord16 insert = s << insertShift;
#if defined(WEBRTC_LITTLE_ENDIAN)
                            matrix[columnOffset+offsetBytesInsert] |= static_cast<WebRtc_UWord8>(insert>>8);
                            matrix[columnOffset+offsetBytesInsert+1] |= static_cast<WebRtc_UWord8>(insert);
#else
                            WebRtc_UWord16* matrixU16 = (WebRtc_UWord16*)&(matrix[columnOffset+offsetBytesInsert]);
                            matrixU16[0] |= (s << insertShift);
#endif
                        }
                        // prepare for writing the next sample
                        offsetSamplesInsert += audioSpecific.bitsPerSample;
                        if(insertShift <= audioSpecific.bitsPerSample)
                        {
                            // the next sample does not fit, or fits exactly
                            offsetSamplesInsert -= 8;
                            offsetBytesInsert++;
                        }
                    }
                }
                break;
            case 8:
                {
                    WebRtc_UWord32 sample = 0;
                    for(WebRtc_UWord32 i = 0; i < channelLength; i++)
                    {
                        for(WebRtc_UWord32 j = 0; j < audioSpecific.channels; j++)
                        {
                            WebRtc_UWord32 columnOffset = j*channelLength;
                            matrix[columnOffset + i] = payloadData[sample++];
                        }
                    }
                }
                break;
            case 16:
                {
                    WebRtc_UWord32 sample = 0;
                    for(WebRtc_UWord32 i = 0; i < channelLength; i += 2)
                    {
                        for(WebRtc_UWord32 j = 0; j < audioSpecific.channels; j++)
                        {
                            WebRtc_UWord32 columnOffset = j*channelLength;
                            matrix[columnOffset + i] = payloadData[sample++];
                            matrix[columnOffset + i + 1] = payloadData[sample++];
                        }
                    }
                }
                break;
            default:
                assert(false);
                return -1;
            }
            // we support at most 16-bit samples
            // callback for all channels
            for(int channel = 0; channel < audioSpecific.channels && retVal == 0; channel++)
            {
                // one callback per channel
                rtpHeader->type.Audio.channel = channel+1;

                if(channel == 0)
                {
                    // include the original packet only in the first callback
                    retVal = CallbackOfReceivedPayloadData(&matrix[channel*channelLength],
                                                           channelLength,
                                                           rtpHeader);
                } else
                {
                    retVal = CallbackOfReceivedPayloadData(&matrix[channel*channelLength],
                                                           channelLength,
                                                           rtpHeader);
                }
            }
        } else
        {
            for(int channel = 1; channel <= audioSpecific.channels && retVal == 0; channel++)
            {
                // one callback per channel
                rtpHeader->type.Audio.channel = channel;

                if(channel == 1)
                {
                    // include the original packet only in the first callback
                    retVal = CallbackOfReceivedPayloadData(payloadData,
                                                           channelLength,
                                                           rtpHeader);
                } else
                {
                    retVal = CallbackOfReceivedPayloadData(payloadData,
                                                           channelLength,
                                                           rtpHeader);
                }
                payloadData += channelLength;
            }
        }
        return retVal;
    } else
    {
        rtpHeader->type.Audio.channel = 1;
        return CallbackOfReceivedPayloadData(payloadData,
                                             payloadLength,
                                             rtpHeader);
    }
}
} // namespace webrtc