Implementing parallel mjpeg encoder.

Fixed errors in parallel_for based on pthreads. Fixed compiler errors and removed whitespace. Fixed parallel_for_pthreads errors and warnings on Windows.
2015-06-29 20:08:08 +03:00
parent eb4bd6b4fb
commit 65e0387aa5
3 changed files with 666 additions and 237 deletions
--- a/modules/core/src/parallel_pthreads.cpp
+++ b/modules/core/src/parallel_pthreads.cpp
@@ -80,25 +80,31 @@ struct work_load
        set(range, body, nstripes);
    }
-    void set(const cv::Range& range, const cv::ParallelLoopBody& body, int nstripes)
+    void set(const cv::Range& range, const cv::ParallelLoopBody& body, unsigned int nstripes)
    {
        m_body = &body;
        m_range = &range;
-        m_nstripes = nstripes;
+
-        m_blocks_count = ((m_range->end - m_range->start - 1)/m_nstripes) + 1;
+        //ensure that nstripes not larger than range length
        m_nstripes = std::min( unsigned(m_range->end - m_range->start) , nstripes);
        m_block_size = ((m_range->end - m_range->start - 1)/m_nstripes) + 1;
        //ensure that nstripes not larger than blocks count, so we would never go out of range
        m_nstripes = std::min(m_nstripes, unsigned(((m_range->end - m_range->start - 1)/m_block_size) + 1) );
    }
    const cv::ParallelLoopBody* m_body;
    const cv::Range*            m_range;
-    int                         m_nstripes;
+    unsigned int                         m_nstripes;
-    unsigned int                m_blocks_count;
+    int                m_block_size;
    void clear()
    {
        m_body = 0;
        m_range = 0;
        m_nstripes = 0;
-        m_blocks_count = 0;
+        m_block_size = 0;
    }
 };
@@ -331,10 +337,10 @@ void ForThread::execute()
    work_load& load = m_parent->m_work_load;
-    while(m_current_pos < load.m_blocks_count)
+    while(m_current_pos < load.m_nstripes)
    {
-        int start = load.m_range->start + m_current_pos*load.m_nstripes;
+        int start = load.m_range->start + m_current_pos*load.m_block_size;
-        int end = std::min(start + load.m_nstripes, load.m_range->end);
+        int end = std::min(start + load.m_block_size, load.m_range->end);
        load.m_body->operator()(cv::Range(start, end));
@@ -417,9 +423,11 @@ void ThreadManager::run(const cv::Range& range, const cv::ParallelLoopBody& body
        {
            if(initPool())
            {
-                double min_stripes = double(range.end - range.start)/(4*m_threads.size());
+                if(nstripes < 1) nstripes = 4*m_threads.size();
-                nstripes = std::max(nstripes, min_stripes);
+                double max_stripes = 4*m_threads.size();
                nstripes = std::min(nstripes, max_stripes);
                pthread_mutex_lock(&m_manager_task_mutex);
@@ -429,7 +437,7 @@ void ThreadManager::run(const cv::Range& range, const cv::ParallelLoopBody& body
                m_task_complete = false;
-                m_work_load.set(range, body, std::ceil(nstripes));
+                m_work_load.set(range, body, cvCeil(nstripes));
                for(size_t i = 0; i < m_threads.size(); ++i)
                {
--- a/modules/videoio/include/opencv2/videoio.hpp
+++ b/modules/videoio/include/opencv2/videoio.hpp
@@ -315,6 +315,7 @@ enum { CAP_INTELPERC_DEPTH_MAP              = 0, // Each pixel is a 16-bit integ
 enum { VIDEOWRITER_PROP_QUALITY = 1,    // Quality (0..100%) of the videostream encoded
       VIDEOWRITER_PROP_FRAMEBYTES = 2, // (Read-only): Size of just encoded video frame
       VIDEOWRITER_PROP_NSTRIPES = 3    // Number of stripes for parallel encoding. -1 for auto detection
     };
 // gPhoto2 properties, if propertyId is less than 0 then work on widget with that __additive inversed__ camera setting ID
@@ -610,6 +611,7 @@ public:
     @param propId Property identifier. It can be one of the following:
     -   **VIDEOWRITER_PROP_QUALITY** Quality (0..100%) of the videostream encoded. Can be adjusted dynamically in some codecs.
     -   **VIDEOWRITER_PROP_NSTRIPES** Number of stripes for parallel encoding. -1 for auto detection.
     @param value Value of the property.
     */
    CV_WRAP virtual bool set(int propId, double value);
@@ -619,6 +621,7 @@ public:
     @param propId Property identifier. It can be one of the following:
     -   **VIDEOWRITER_PROP_QUALITY** Current quality of the encoded videostream.
     -   **VIDEOWRITER_PROP_FRAMEBYTES** (Read-only) Size of just encoded video frame; note that the encoding order may be different from representation order.
     -   **VIDEOWRITER_PROP_NSTRIPES** Number of stripes for parallel encoding. -1 for auto detection.
     @note When querying a property that is not supported by the backend used by the VideoWriter
     class, value 0 is returned.
--- a/modules/videoio/src/cap_mjpeg_encoder.cpp
+++ b/modules/videoio/src/cap_mjpeg_encoder.cpp
@@ -41,6 +41,7 @@
 #include "precomp.hpp"
 #include <vector>
 #include <deque>
 #if CV_NEON
 #define WITH_NEON
@@ -350,14 +351,261 @@ protected:
 };
 // Growable buffer that packs variable-length bit codes into 32-bit words,
 // filling every word from its most significant bit downwards.
 //
 // Expected cycle: reset(), clear(), any number of put() calls, finish(), and
 // then the packed words are read through get_data()/get_len().
 class mjpeg_buffer
 {
 public:
    mjpeg_buffer()
    {
        reset();
    }
    // Sets the storage to exactly `size` words; words added by growth are zero.
    void resize(int size)
    {
        data.resize(size);
    }
    // Appends the low `len` bits of `bits` (masked with bit_mask[len]) to the stream.
    // The storage grows on demand, so the caller never has to pre-size it exactly.
    void put(unsigned bits, int len)
    {
        // A code that consumes every free bit of the current word (len >= bits_free)
        // also opens the following word, so that word must exist before it is written.
        // The equality case used to be missed, which wrote one word past the end.
        const size_t required = size_t(m_pos) + (len >= bits_free ? 2 : 1);
        if(data.size() < required)
        {
            // keep the doubling policy, but never end up below what is needed
            // (doubling an empty vector would leave it empty)
            const size_t doubled = 2*data.size();
            resize(int(doubled < required ? required : doubled));
        }
        bits_free -= (len);
        unsigned int tempval = (bits) & bit_mask[(len)];
        if( bits_free <= 0 )
        {
            //the code crosses (or exactly reaches) the word boundary:
            //finish the current word and start the next one with the leftover bits
            data[m_pos] |= ((unsigned)tempval >> -bits_free);
            bits_free += 32;
            ++m_pos;
            data[m_pos] = bits_free < 32 ? (tempval << bits_free) : 0;
        }
        else
        {
            data[m_pos] |= (tempval << bits_free);
        }
    }
    // Freezes the number of used words. If the current word is still untouched
    // (bits_free == 32) it is not counted and bits_free is reported as 0 afterwards.
    // Not idempotent: call it once per reset().
    void finish()
    {
        if(bits_free == 32)
        {
            bits_free = 0;
            m_data_len = m_pos;
        }
        else
        {
            m_data_len = m_pos + 1;
        }
    }
    // Rewinds the write position; the stored words are left as they are.
    void reset()
    {
        bits_free = 32;
        m_pos = 0;
        m_data_len = 0;
    }
    // Prepares the storage for a new sequence of put() calls.
    void clear()
    {
        //a buffer that was never sized still needs a first word to OR bits into
        if(data.empty())
        {
            data.resize(1);
        }
        //we need to clear only first element, the rest would be overwritten
        data[0] = 0;
    }
    // Free bits in the current word (after finish(): in the last counted word).
    int get_bits_free()
    {
        return bits_free;
    }
    // Pointer to the first packed word, or 0 when no storage has been allocated.
    unsigned* get_data()
    {
        return data.empty() ? 0 : &data[0];
    }
    // Number of words counted by the last finish().
    unsigned get_len()
    {
        return m_data_len;
    }
 private:
    std::vector<unsigned> data;
    int bits_free;
    unsigned m_pos;
    unsigned m_data_len;
 };
 // Owns the per-stripe mjpeg_buffer objects and concatenates their bit streams
 // into a single word array once all stripes have been encoded.
 class mjpeg_buffer_keeper
 {
 public:
    mjpeg_buffer_keeper()
    {
        //both counters are read by the getters, so neither may start indeterminate
        m_data_len = 0;
        m_last_bit_len = 0;
    }
    // Access to the buffer of stripe `i` (no bounds check).
    mjpeg_buffer& operator[](int i)
    {
        return m_buffer_list[i];
    }
    // Makes sure at least `count` buffers exist. Only newly created buffers are
    // sized to `size` words; already existing buffers keep their current size.
    void allocate_buffers(int count, int size)
    {
        for(int i = (int)m_buffer_list.size(); i < count; ++i)
        {
            m_buffer_list.push_back(mjpeg_buffer());
            m_buffer_list.back().resize(size);
        }
    }
    // Finishes every buffer and returns the concatenated bit stream.
    // Afterwards get_data_size() is the number of returned words and
    // get_last_bit_len() the number of valid bits in the last of them.
    unsigned* get_data()
    {
        //if there is only one buffer (single thread) there is no need to stack buffers
        if(m_buffer_list.size() == 1)
        {
            m_buffer_list[0].finish();
            m_data_len = m_buffer_list[0].get_len();
            //After finish() a zero bits_free means the last counted word is completely
            //filled, so it carries 32 valid bits. Reporting 0 here would mark that word
            //as unused (see the convention at the end of this function) and drop it.
            m_last_bit_len = m_data_len ? 32 - m_buffer_list[0].get_bits_free() : 0;
            return m_buffer_list[0].get_data();
        }
        allocate_output_buffer();
        //`bits` is zero or negative: -bits is the number of bits already occupied in `currval`
        int bits = 0;
        unsigned currval = 0;
        m_data_len = 0;
        for(unsigned j = 0; j < m_buffer_list.size(); ++j)
        {
            mjpeg_buffer& buffer = m_buffer_list[j];
            //if no bit shift required we could use memcpy
            if(bits == 0)
            {
                size_t current_pos = m_data_len;
                if(buffer.get_bits_free() == 0)
                {
                    memcpy(&m_output_buffer[current_pos], buffer.get_data(), sizeof(buffer.get_data()[0])*buffer.get_len());
                    m_data_len += buffer.get_len();
                    currval = 0;
                }
                else
                {
                    //the last word is only partially filled: keep it pending in currval
                    memcpy(&m_output_buffer[current_pos], buffer.get_data(), sizeof(buffer.get_data()[0])*(buffer.get_len() -1 ));
                    m_data_len += buffer.get_len() - 1;
                    currval = buffer.get_data()[buffer.get_len() - 1];
                }
            }
            else
            {
                //every word of this buffer is split between the pending word and the next one
                for(unsigned i = 0; i < buffer.get_len() - 1; ++i)
                {
                    if( bits <= 0 )
                    {
                        currval |= ((unsigned)buffer.get_data()[i] >> -bits);
                        m_output_buffer[m_data_len++] = currval;
                        currval = (bits < 0) ? (buffer.get_data()[i] << (bits + 32)) : 0;
                    }
                    else
                    {
                        currval |= (buffer.get_data()[i] << bits);
                    }
                }
                currval |= ((unsigned)buffer.get_data()[buffer.get_len() - 1] >> -bits);
                //flush the pending word if the tail of this buffer fills it up
                if( (buffer.get_bits_free() == 32 ? 0 : buffer.get_bits_free()) <= -bits)
                {
                    m_output_buffer[m_data_len++] = currval;
                    currval = (bits < 0) ? (buffer.get_data()[buffer.get_len() - 1] << (bits + 32)) : 0;
                }
            }
            bits += buffer.get_bits_free();
            if(bits > 0)
            {
                bits -= 32;
            }
        }
        //bits == 0 means that last element shouldn't be used.
        m_output_buffer[m_data_len++] = currval;
        m_last_bit_len = -bits;
        return &m_output_buffer[0];
    }
    // Number of valid bits in the last word returned by get_data().
    int get_last_bit_len()
    {
        return m_last_bit_len;
    }
    // Number of words returned by the last get_data() call.
    int get_data_size()
    {
        return m_data_len;
    }
    // Rewinds every stripe buffer and the merged-output counters.
    void reset()
    {
        m_last_bit_len = 0;
        for(unsigned i = 0; i < m_buffer_list.size(); ++i)
        {
            m_buffer_list[i].reset();
        }
        //there is no need to erase output buffer since it would be overwritten
        m_data_len = 0;
    }
 private:
    // Finishes all stripe buffers and makes the merged buffer large enough.
    void allocate_output_buffer()
    {
        unsigned total_size = 0;
        for(unsigned i = 0; i < m_buffer_list.size(); ++i)
        {
            m_buffer_list[i].finish();
            total_size += m_buffer_list[i].get_len();
        }
        //get_data() always stores one trailing (possibly empty) word after the merged
        //data. When every stripe ends exactly on a word boundary that word is one more
        //than the sum of the stripe lengths, so reserve room for it.
        const unsigned required_size = total_size + 1;
        if(required_size > m_output_buffer.size())
        {
            m_output_buffer.clear();
            m_output_buffer.resize(required_size);
        }
    }
    std::deque<mjpeg_buffer> m_buffer_list;
    std::vector<unsigned> m_output_buffer;
    int m_data_len;
    int m_last_bit_len;
 };
 class MotionJpegWriter : public IVideoWriter
 {
 public:
-    MotionJpegWriter() { rawstream = false; }
+    MotionJpegWriter()
    {
        rawstream = false;
        nstripes = -1;
    }
    MotionJpegWriter(const String& filename, double fps, Size size, bool iscolor)
    {
        rawstream = false;
        open(filename, fps, size, iscolor);
        nstripes = -1;
    }
    ~MotionJpegWriter() { close(); }
@@ -616,6 +864,8 @@ public:
            return quality;
        if( propId == VIDEOWRITER_PROP_FRAMEBYTES )
            return frameSize.empty() ? 0. : (double)frameSize.back();
        if( propId == VIDEOWRITER_PROP_NSTRIPES )
            return nstripes;
        return 0.;
    }
@@ -626,6 +876,13 @@ public:
            quality = value;
            return true;
        }
        if( propId == VIDEOWRITER_PROP_NSTRIPES)
        {
            nstripes = value;
            return true;
        }
        return false;
    }
@@ -638,6 +895,8 @@ protected:
    size_t moviPointer;
    std::vector<size_t> frameOffset, frameSize, AVIChunkSizeIndex, frameNumIndexes;
    bool rawstream;
    mjpeg_buffer_keeper buffers_list;
    double nstripes;
    BitStream strm;
 };
@@ -1107,6 +1366,377 @@ static void aan_fdct8x8( const short *src, short *dst,
 }
 #endif
 // Converts one block of source pixels into the sample blocks consumed by the DCT.
 //
 //  colorspace      one of the COLORSPACE_* values, selects how pix_data is interpreted
 //  channels        > 1: luma and two chroma outputs are produced and the luma block is
 //                  16 samples wide (x_scale = y_scale = 2); otherwise only Y_data is written
 //  input_channels  distance, in bytes, between horizontally adjacent source pixels
 //  UV_data         chroma output, rows of 16 shorts: entries 0..7 and 8..15 hold the two
 //                  chroma components. In the generic branch values are ADDED to what is
 //                  already stored, so the block has to be zeroed beforehand.
 //  Y_data          luma output, rows of x_scale*8 shorts
 //  pix_data        first source pixel of the block
 //  y_limit/x_limit number of source rows/columns to convert
 //  step            distance, in bytes, between source rows
 //  u_plane_ofs,
 //  v_plane_ofs     byte offsets added to a pixel address to read the two extra planes
 //                  (only used for the planar, non BGR/RGBA input)
 inline void convertToYUV(int colorspace, int channels, int input_channels, short* UV_data, short* Y_data, const uchar* pix_data, int y_limit, int x_limit, int step, int u_plane_ofs, int v_plane_ofs)
 {
    int i, j;
    const int UV_step = 16;
    int  x_scale = channels > 1 ? 2 : 1, y_scale = x_scale;
    int  Y_step = x_scale*8;
    if( channels > 1 )
    {
        // Fast path: a complete 16x16 block of planar input. Two source rows are handled
        // per iteration, producing two luma rows and one chroma row.
        if( colorspace == COLORSPACE_YUV444P && y_limit == 16 && x_limit == 16 )
        {
            for( i = 0; i < y_limit; i += 2, pix_data += step*2, Y_data += Y_step*2, UV_data += UV_step )
            {
 #ifdef WITH_NEON
                {
                    // Chroma: add horizontally adjacent bytes (high + low byte of every
                    // 16-bit lane) of two consecutive rows, i.e. sum each 2x2 group,
                    // then subtract 128*4.
                    uint16x8_t masklo = vdupq_n_u16(255);
                    uint16x8_t lane = vld1q_u16((unsigned short*)(pix_data+v_plane_ofs));
                    uint16x8_t t1 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
                    lane = vld1q_u16((unsigned short*)(pix_data + v_plane_ofs + step));
                    uint16x8_t t2 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
                    t1 = vaddq_u16(t1, t2);
                    vst1q_s16(UV_data, vsubq_s16(vreinterpretq_s16_u16(t1), vdupq_n_s16(128*4)));
                    lane = vld1q_u16((unsigned short*)(pix_data+u_plane_ofs));
                    t1 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
                    lane = vld1q_u16((unsigned short*)(pix_data + u_plane_ofs + step));
                    t2 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
                    t1 = vaddq_u16(t1, t2);
                    vst1q_s16(UV_data + 8, vsubq_s16(vreinterpretq_s16_u16(t1), vdupq_n_s16(128*4)));
                }
                {
                    // Luma: widen 8 bytes at a time to 16 bit and subtract 128,
                    // for both halves of both rows.
                    int16x8_t lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data)));
                    int16x8_t delta = vdupq_n_s16(128);
                    lane = vsubq_s16(lane, delta);
                    vst1q_s16(Y_data, lane);
                    lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data+8)));
                    lane = vsubq_s16(lane, delta);
                    vst1q_s16(Y_data + 8, lane);
                    lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data+step)));
                    lane = vsubq_s16(lane, delta);
                    vst1q_s16(Y_data+Y_step, lane);
                    lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data + step + 8)));
                    lane = vsubq_s16(lane, delta);
                    vst1q_s16(Y_data+Y_step + 8, lane);
                }
 #else
                // Scalar version: every iteration converts a 2x2 group of source samples
                // into four luma values and one summed value per chroma component.
                for( j = 0; j < x_limit; j += 2, pix_data += 2 )
                {
                    Y_data[j] = pix_data[0] - 128;
                    Y_data[j+1] = pix_data[1] - 128;
                    Y_data[j+Y_step] = pix_data[step] - 128;
                    Y_data[j+Y_step+1] = pix_data[step+1] - 128;
                    UV_data[j>>1] = pix_data[v_plane_ofs] + pix_data[v_plane_ofs+1] +
                        pix_data[v_plane_ofs+step] + pix_data[v_plane_ofs+step+1] - 128*4;
                    UV_data[(j>>1)+8] = pix_data[u_plane_ofs] + pix_data[u_plane_ofs+1] +
                        pix_data[u_plane_ofs+step] + pix_data[u_plane_ofs+step+1] - 128*4;
                }
                // Rewind to the start of the row.
                // NOTE(review): the loop above advanced pix_data by x_limit bytes, so this
                // only balances when input_channels == 1 - confirm that callers guarantee
                // this for COLORSPACE_YUV444P.
                pix_data -= x_limit*input_channels;
 #endif
            }
        }
        else
        {
            // Generic path: one source pixel at a time; also used for partial blocks.
            for( i = 0; i < y_limit; i++, pix_data += step, Y_data += Y_step )
            {
                for( j = 0; j < x_limit; j++, pix_data += input_channels )
                {
                    int Y, U, V;
                    if( colorspace == COLORSPACE_BGR )
                    {
                        // channel order in memory: b, g, r
                        int r = pix_data[2];
                        int g = pix_data[1];
                        int b = pix_data[0];
                        Y = DCT_DESCALE( r*y_r + g*y_g + b*y_b, fixc) - 128;
                        U = DCT_DESCALE( r*cb_r + g*cb_g + b*cb_b, fixc );
                        V = DCT_DESCALE( r*cr_r + g*cr_g + b*cr_b, fixc );
                    }
                    else if( colorspace == COLORSPACE_RGBA )
                    {
                        // channel order in memory: r, g, b (the fourth byte is not read)
                        int r = pix_data[0];
                        int g = pix_data[1];
                        int b = pix_data[2];
                        Y = DCT_DESCALE( r*y_r + g*y_g + b*y_b, fixc) - 128;
                        U = DCT_DESCALE( r*cb_r + g*cb_g + b*cb_b, fixc );
                        V = DCT_DESCALE( r*cr_r + g*cr_g + b*cr_b, fixc );
                    }
                    else
                    {
                        // planar input: the samples are used as they are, minus 128.
                        // U is read through v_plane_ofs and V through u_plane_ofs, which
                        // gives the same output layout as the 16x16 fast path above.
                        Y = pix_data[0] - 128;
                        U = pix_data[v_plane_ofs] - 128;
                        V = pix_data[u_plane_ofs] - 128;
                    }
                    // two horizontally adjacent pixels share one chroma column
                    int j2 = j >> (x_scale - 1);
                    Y_data[j] = (short)Y;
                    // accumulate instead of assign: several pixels add up into one chroma sample
                    UV_data[j2] = (short)(UV_data[j2] + U);
                    UV_data[j2 + 8] = (short)(UV_data[j2 + 8] + V);
                }
                // back to the first pixel of the row; the outer loop then adds `step`
                pix_data -= x_limit*input_channels;
                // move to the next chroma row only after y_scale source rows
                if( ((i+1) & (y_scale - 1)) == 0 )
                {
                    UV_data += UV_step;
                }
            }
        }
    }
    else
    {
        // Single channel input: only luma is produced, stored as (pixel - 128)*4.
        for( i = 0; i < y_limit; i++, pix_data += step, Y_data += Y_step )
        {
            for( j = 0; j < x_limit; j++ )
                Y_data[j] = (short)(pix_data[j]*4 - 128*4);
        }
    }
 }
 // Parallel loop body that entropy-encodes one frame split into horizontal stripes.
 // Stripe k writes its bit stream into buffer k of the shared mjpeg_buffer_keeper;
 // the caller merges the buffers after the parallel loop has finished.
 class MjpegEncoder : public ParallelLoopBody
 {
 public:
    // _height/_width/_step describe the frame referenced by _data, _input_channels is
    // the distance between horizontally adjacent source pixels, _channels the number of
    // encoded components. The Huffman, quantization and category tables are referenced,
    // not copied, so they have to outlive the encoder.
    // nstripes: requested number of stripes; a negative value selects it automatically.
    MjpegEncoder(int _height,
        int _width,
        int _step,
        const uchar* _data,
        int _input_channels,
        int _channels,
        int _colorspace,
        unsigned (&_huff_dc_tab)[2][16],
        unsigned (&_huff_ac_tab)[2][256],
        short (&_fdct_qtab)[2][64],
        uchar* _cat_table,
        mjpeg_buffer_keeper& _buffer_list,
        double nstripes
    ) :
        m_buffer_list(_buffer_list),
        height(_height),
        width(_width),
        step(_step),
        in_data(_data),
        input_channels(_input_channels),
        channels(_channels),
        colorspace(_colorspace),
        huff_dc_tab(_huff_dc_tab),
        huff_ac_tab(_huff_ac_tab),
        fdct_qtab(_fdct_qtab),
        cat_table(_cat_table)
    {
        //empirically found value. if number of pixels is less than that value there is no sense to parallelize it.
        const int min_pixels_count = 96*96;
        stripes_count = 1;
        if(nstripes < 0)
        {
            if(height*width > min_pixels_count)
            {
                stripes_count = 4;
            }
        }
        else
        {
            stripes_count = cvCeil(nstripes);
        }
        int y_scale = channels > 1 ? 2 : 1;
        int y_step = y_scale * 8;
        //a stripe has to contain at least one row of blocks
        int max_stripes = (height - 1)/y_step + 1;
        stripes_count = std::min(stripes_count, max_stripes);
        //A request for 0 stripes ends up here as 0. The count is used as a divisor
        //right below and in operator(), so always keep at least one stripe.
        if(stripes_count < 1)
        {
            stripes_count = 1;
        }
        m_buffer_list.allocate_buffers(stripes_count, (height*width*2)/stripes_count);
    }
    // Encodes the stripes with indices [range.start, range.end).
    void operator()( const cv::Range& range ) const
    {
        const int CAT_TAB_SIZE = 4096;
        unsigned code = 0;
        //Both macros write into the `output_buffer` of the stripe being encoded.
        //A table entry holds the code in its upper bits and the code length in its low 8 bits.
 #define JPUT_BITS(val, bits) output_buffer.put(val, bits)
 #define JPUT_HUFF(val, table) \
    code = table[(val) + 2]; \
    JPUT_BITS(code >> 8, (int)(code & 255))
        int x, y;
        int i, j;
        short  buffer[4096];
        int  x_scale = channels > 1 ? 2 : 1, y_scale = x_scale;
        int  dc_pred[] = { 0, 0, 0 };
        int  x_step = x_scale * 8;
        int  y_step = y_scale * 8;
        short  block[6][64];
        int  luma_count = x_scale*y_scale;
        int  block_count = luma_count + channels - 1;
        int u_plane_ofs = step*height;
        int v_plane_ofs = u_plane_ofs + step*height;
        const uchar* data = in_data;
        const uchar* init_data = data;
        int num_steps = (height - 1)/y_step + 1;
        //if this is not first stripe we need to calculate dc_pred from previous step
        if(range.start > 0)
        {
            //DC coefficients are coded as differences to the previous block, so the last
            //block row of the preceding stripe is transformed again (nothing is written);
            //after the loop dc_pred holds the values of the last block in that row
            y = y_step*int(num_steps*range.start/stripes_count - 1);
            data = init_data + y*step;
            for( x = 0; x < width; x += x_step )
            {
                int x_limit = x_step;
                int y_limit = y_step;
                const uchar* pix_data = data + x*input_channels;
                short* Y_data = block[0];
                short* UV_data = block[luma_count];
                if( x + x_limit > width ) x_limit = width - x;
                if( y + y_limit > height ) y_limit = height - y;
                memset( block, 0, block_count*64*sizeof(block[0][0]));
                convertToYUV(colorspace, channels, input_channels, UV_data, Y_data, pix_data, y_limit, x_limit, step, u_plane_ofs, v_plane_ofs);
                for( i = 0; i < block_count; i++ )
                {
                    int is_chroma = i >= luma_count;
                    int src_step = x_scale * 8;
                    const short* src_ptr = block[i & -2] + (i & 1)*8;
                    aan_fdct8x8( src_ptr, buffer, src_step, fdct_qtab[is_chroma] );
                    j = is_chroma + (i > luma_count);
                    dc_pred[j] = buffer[0];
                }
            }
        }
        for(int k = range.start; k < range.end; ++k)
        {
            mjpeg_buffer& output_buffer = m_buffer_list[k];
            output_buffer.clear();
            //rows [y_min, y_max) belong to stripe k; the last stripe runs to the frame bottom
            int y_min = y_step*int(num_steps*k/stripes_count);
            int y_max = y_step*int(num_steps*(k+1)/stripes_count);
            if(k == stripes_count - 1)
            {
                y_max = height;
            }
            data = init_data + y_min*step;
            for( y = y_min; y < y_max; y += y_step, data += y_step*step )
            {
                for( x = 0; x < width; x += x_step )
                {
                    int x_limit = x_step;
                    int y_limit = y_step;
                    const uchar* pix_data = data + x*input_channels;
                    short* Y_data = block[0];
                    short* UV_data = block[luma_count];
                    //blocks at the right/bottom border may be only partially covered
                    if( x + x_limit > width ) x_limit = width - x;
                    if( y + y_limit > height ) y_limit = height - y;
                    //zero first: partial blocks are padded and chroma is accumulated
                    memset( block, 0, block_count*64*sizeof(block[0][0]));
                    convertToYUV(colorspace, channels, input_channels, UV_data, Y_data, pix_data, y_limit, x_limit, step, u_plane_ofs, v_plane_ofs);
                    for( i = 0; i < block_count; i++ )
                    {
                        int is_chroma = i >= luma_count;
                        int src_step = x_scale * 8;
                        int run = 0, val;
                        const short* src_ptr = block[i & -2] + (i & 1)*8;
                        const unsigned* htable = huff_ac_tab[is_chroma];
                        aan_fdct8x8( src_ptr, buffer, src_step, fdct_qtab[is_chroma] );
                        //index of the DC predictor: 0 for luma, 1 and 2 for the chroma blocks
                        j = is_chroma + (i > luma_count);
                        val = buffer[0] - dc_pred[j];
                        dc_pred[j] = buffer[0];
                        {
                            int cat = cat_table[val + CAT_TAB_SIZE];
                            //CV_Assert( cat <= 11 );
                            JPUT_HUFF( cat, huff_dc_tab[is_chroma] );
                            JPUT_BITS( val - (val < 0 ? 1 : 0), cat );
                        }
                        //AC coefficients in zigzag order, coded as (zero run, category) pairs
                        for( j = 1; j < 64; j++ )
                        {
                            val = buffer[zigzag[j]];
                            if( val == 0 )
                            {
                                run++;
                            }
                            else
                            {
                                while( run >= 16 )
                                {
                                    JPUT_HUFF( 0xF0, htable ); // encode 16 zeros
                                    run -= 16;
                                }
                                {
                                    int cat = cat_table[val + CAT_TAB_SIZE];
                                    //CV_Assert( cat <= 10 );
                                    JPUT_HUFF( cat + run*16, htable );
                                    JPUT_BITS( val - (val < 0 ? 1 : 0), cat );
                                }
                                run = 0;
                            }
                        }
                        if( run )
                        {
                            JPUT_HUFF( 0x00, htable ); // encode EOB
                        }
                    }
                }
            }
        }
    }
    // Range of stripe indices to hand to the parallel loop.
    cv::Range getRange()
    {
        return cv::Range(0, stripes_count);
    }
    // Number of stripes actually used (always at least 1).
    double getNStripes()
    {
        return stripes_count;
    }
    mjpeg_buffer_keeper& m_buffer_list;
 private:
    //assignment is not supported (const and reference members); kept private as a no-op
    MjpegEncoder& operator=( const MjpegEncoder & ) { return *this; }
    const int height;
    const int width;
    const int step;
    const uchar* in_data;
    const int input_channels;
    const int channels;
    const int colorspace;
    const unsigned (&huff_dc_tab)[2][16];
    const unsigned (&huff_ac_tab)[2][256];
    const short (&fdct_qtab)[2][64];
    const uchar* cat_table;
    int stripes_count;
 };
 void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspace, int input_channels )
 {
    //double total_cvt = 0, total_dct = 0;
@@ -1133,7 +1763,6 @@ void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspa
    //   for every block:
    //     calc dct and quantize
    //     encode block.
    int x, y;
    int i, j;
    const int max_quality = 12;
    short fdct_qtab[2][64];
@@ -1141,18 +1770,9 @@ void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspa
    unsigned huff_ac_tab[2][256];
    int  x_scale = channels > 1 ? 2 : 1, y_scale = x_scale;
    int  dc_pred[] = { 0, 0, 0 };
    int  x_step = x_scale * 8;
    int  y_step = y_scale * 8;
    short  block[6][64];
    short  buffer[4096];
    int*   hbuffer = (int*)buffer;
    int  luma_count = x_scale*y_scale;
    int  block_count = luma_count + channels - 1;
    int  Y_step = x_scale*8;
    const int UV_step = 16;
    int u_plane_ofs = step*height;
    int v_plane_ofs = u_plane_ofs + step*height;
    double _quality = quality*0.01*max_quality;
    if( _quality < 1. ) _quality = 1.;
@@ -1241,229 +1861,27 @@ void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspa
    strm.putByte( 0 );  // successive approximation bit position
    // high & low - (0,0) for sequential DCT
    unsigned currval = 0, code = 0, tempval = 0;
    int bit_idx = 32;
-#define JPUT_BITS(val, bits) \
+    buffers_list.reset();
    bit_idx -= (bits); \
    tempval = (val) & bit_mask[(bits)]; \
    if( bit_idx <= 0 ) \
    {  \
        strm.jput(currval | ((unsigned)tempval >> -bit_idx)); \
        bit_idx += 32; \
        currval = bit_idx < 32 ? (tempval << bit_idx) : 0; \
    } \
    else \
        currval |= (tempval << bit_idx)
-#define JPUT_HUFF(val, table) \
+    MjpegEncoder parallel_encoder(height, width, step, data, input_channels, channels, colorspace, huff_dc_tab, huff_ac_tab, fdct_qtab, cat_table, buffers_list, nstripes);
    code = table[(val) + 2]; \
    JPUT_BITS(code >> 8, (int)(code & 255))
-    // encode data
+    cv::parallel_for_(parallel_encoder.getRange(), parallel_encoder, parallel_encoder.getNStripes());
-    for( y = 0; y < height; y += y_step, data += y_step*step )
+
    //std::vector<unsigned>& v = parallel_encoder.m_buffer_list.get_data();
    unsigned* v = buffers_list.get_data();
    unsigned last_data_elem = buffers_list.get_data_size() - 1;
    for(unsigned k = 0; k < last_data_elem; ++k)
    {
-        for( x = 0; x < width; x += x_step )
+        strm.jput(v[k]);
        {
            int x_limit = x_step;
            int y_limit = y_step;
            const uchar* pix_data = data + x*input_channels;
            short* Y_data = block[0];
            if( x + x_limit > width ) x_limit = width - x;
            if( y + y_limit > height ) y_limit = height - y;
            memset( block, 0, block_count*64*sizeof(block[0][0]));
            if( channels > 1 )
            {
                short* UV_data = block[luma_count];
                // double t = (double)cv::getTickCount();
                if( colorspace == COLORSPACE_YUV444P && y_limit == 16 && x_limit == 16 )
                {
                    for( i = 0; i < y_limit; i += 2, pix_data += step*2, Y_data += Y_step*2, UV_data += UV_step )
                    {
 #ifdef WITH_NEON
                        {
                            uint16x8_t masklo = vdupq_n_u16(255);
                            uint16x8_t lane = vld1q_u16((unsigned short*)(pix_data+v_plane_ofs));
                            uint16x8_t t1 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
                            lane = vld1q_u16((unsigned short*)(pix_data + v_plane_ofs + step));
                            uint16x8_t t2 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
                            t1 = vaddq_u16(t1, t2);
                            vst1q_s16(UV_data, vsubq_s16(vreinterpretq_s16_u16(t1), vdupq_n_s16(128*4)));
                            lane = vld1q_u16((unsigned short*)(pix_data+u_plane_ofs));
                            t1 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
                            lane = vld1q_u16((unsigned short*)(pix_data + u_plane_ofs + step));
                            t2 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
                            t1 = vaddq_u16(t1, t2);
                            vst1q_s16(UV_data + 8, vsubq_s16(vreinterpretq_s16_u16(t1), vdupq_n_s16(128*4)));
                        }
                        {
                            int16x8_t lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data)));
                            int16x8_t delta = vdupq_n_s16(128);
                            lane = vsubq_s16(lane, delta);
                            vst1q_s16(Y_data, lane);
                            lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data+8)));
                            lane = vsubq_s16(lane, delta);
                            vst1q_s16(Y_data + 8, lane);
                            lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data+step)));
                            lane = vsubq_s16(lane, delta);
                            vst1q_s16(Y_data+Y_step, lane);
                            lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data + step + 8)));
                            lane = vsubq_s16(lane, delta);
                            vst1q_s16(Y_data+Y_step + 8, lane);
                        }
 #else
                        for( j = 0; j < x_limit; j += 2, pix_data += 2 )
                        {
                            Y_data[j] = pix_data[0] - 128;
                            Y_data[j+1] = pix_data[1] - 128;
                            Y_data[j+Y_step] = pix_data[step] - 128;
                            Y_data[j+Y_step+1] = pix_data[step+1] - 128;
                            UV_data[j>>1] = pix_data[v_plane_ofs] + pix_data[v_plane_ofs+1] +
                                pix_data[v_plane_ofs+step] + pix_data[v_plane_ofs+step+1] - 128*4;
                            UV_data[(j>>1)+8] = pix_data[u_plane_ofs] + pix_data[u_plane_ofs+1] +
                                pix_data[u_plane_ofs+step] + pix_data[u_plane_ofs+step+1] - 128*4;
                        }
                        pix_data -= x_limit*input_channels;
 #endif
                    }
                }
                else
                {
                    for( i = 0; i < y_limit; i++, pix_data += step, Y_data += Y_step )
                    {
                        for( j = 0; j < x_limit; j++, pix_data += input_channels )
                        {
                            int Y, U, V;
                            if( colorspace == COLORSPACE_BGR )
                            {
                                int r = pix_data[2];
                                int g = pix_data[1];
                                int b = pix_data[0];
                                Y = DCT_DESCALE( r*y_r + g*y_g + b*y_b, fixc) - 128;
                                U = DCT_DESCALE( r*cb_r + g*cb_g + b*cb_b, fixc );
                                V = DCT_DESCALE( r*cr_r + g*cr_g + b*cr_b, fixc );
                            }
                            else if( colorspace == COLORSPACE_RGBA )
                            {
                                int r = pix_data[0];
                                int g = pix_data[1];
                                int b = pix_data[2];
                                Y = DCT_DESCALE( r*y_r + g*y_g + b*y_b, fixc) - 128;
                                U = DCT_DESCALE( r*cb_r + g*cb_g + b*cb_b, fixc );
                                V = DCT_DESCALE( r*cr_r + g*cr_g + b*cr_b, fixc );
                            }
                            else
                            {
                                Y = pix_data[0] - 128;
                                U = pix_data[v_plane_ofs] - 128;
                                V = pix_data[u_plane_ofs] - 128;
                            }
                            int j2 = j >> (x_scale - 1);
                            Y_data[j] = (short)Y;
                            UV_data[j2] = (short)(UV_data[j2] + U);
                            UV_data[j2 + 8] = (short)(UV_data[j2 + 8] + V);
                        }
                        pix_data -= x_limit*input_channels;
                        if( ((i+1) & (y_scale - 1)) == 0 )
                        {
                            UV_data += UV_step;
                        }
                    }
                }
                // total_cvt += (double)cv::getTickCount() - t;
            }
            else
            {
                for( i = 0; i < y_limit; i++, pix_data += step, Y_data += Y_step )
                {
                    for( j = 0; j < x_limit; j++ )
                        Y_data[j] = (short)(pix_data[j]*4 - 128*4);
                }
            }
            for( i = 0; i < block_count; i++ )
            {
                int is_chroma = i >= luma_count;
                int src_step = x_scale * 8;
                int run = 0, val;
                const short* src_ptr = block[i & -2] + (i & 1)*8;
                const unsigned* htable = huff_ac_tab[is_chroma];
                //double t = (double)cv::getTickCount();
                aan_fdct8x8( src_ptr, buffer, src_step, fdct_qtab[is_chroma] );
                //total_dct += (double)cv::getTickCount() - t;
                j = is_chroma + (i > luma_count);
                val = buffer[0] - dc_pred[j];
                dc_pred[j] = buffer[0];
                {
                    int cat = cat_table[val + CAT_TAB_SIZE];
                    //CV_Assert( cat <= 11 );
                    JPUT_HUFF( cat, huff_dc_tab[is_chroma] );
                    JPUT_BITS( val - (val < 0 ? 1 : 0), cat );
                }
                for( j = 1; j < 64; j++ )
                {
                    val = buffer[zigzag[j]];
                    if( val == 0 )
                    {
                        run++;
                    }
                    else
                    {
                        while( run >= 16 )
                        {
                            JPUT_HUFF( 0xF0, htable ); // encode 16 zeros
                            run -= 16;
                        }
                        {
                            int cat = cat_table[val + CAT_TAB_SIZE];
                            //CV_Assert( cat <= 10 );
                            JPUT_HUFF( cat + run*16, htable );
                            JPUT_BITS( val - (val < 0 ? 1 : 0), cat );
                        }
                        run = 0;
                    }
                }
                if( run )
                {
                    JPUT_HUFF( 0x00, htable ); // encode EOB
                }
            }
        }
    }
-
+    strm.jflush(v[last_data_elem], 32 - buffers_list.get_last_bit_len());
    // Flush
    strm.jflush(currval, bit_idx);
    strm.jputShort( 0xFFD9 ); // EOI marker
    /*printf("total dct = %.1fms, total cvt = %.1fms\n",
     total_dct*1000./cv::getTickFrequency(),
     total_cvt*1000./cv::getTickFrequency());*/
    size_t pos = strm.getPos();
    size_t pos1 = (pos + 3) & ~3;
    for( ; pos < pos1; pos++ )