Merge pull request #4164 from kalistratovag:parallel_mjpeg

commit c81b064727
Author: Vadim Pisarevsky
Date:   2015-07-01 14:15:51 +00:00

3 changed files with 666 additions and 237 deletions

modules/core/src/parallel_pthreads.cpp

@@ -80,25 +80,31 @@ struct work_load
set(range, body, nstripes);
}
void set(const cv::Range& range, const cv::ParallelLoopBody& body, int nstripes)
void set(const cv::Range& range, const cv::ParallelLoopBody& body, unsigned int nstripes)
{
m_body = &body;
m_range = &range;
m_nstripes = nstripes;
m_blocks_count = ((m_range->end - m_range->start - 1)/m_nstripes) + 1;
//ensure that nstripes is not larger than the range length
m_nstripes = std::min( unsigned(m_range->end - m_range->start) , nstripes);
m_block_size = ((m_range->end - m_range->start - 1)/m_nstripes) + 1;
//ensure that nstripes is not larger than the block count, so we never go out of range
m_nstripes = std::min(m_nstripes, unsigned(((m_range->end - m_range->start - 1)/m_block_size) + 1) );
}
const cv::ParallelLoopBody* m_body;
const cv::Range* m_range;
int m_nstripes;
unsigned int m_blocks_count;
unsigned int m_nstripes;
int m_block_size;
void clear()
{
m_body = 0;
m_range = 0;
m_nstripes = 0;
m_blocks_count = 0;
m_block_size = 0;
}
};
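The arithmetic above is worth spelling out: m_block_size is the ceiling of range_length/nstripes, and m_nstripes is then re-clamped to the resulting block count so the last stripe can never start past the end of the range. A minimal standalone sketch with illustrative values (not OpenCV code):

#include <algorithm>
#include <cstdio>

int main()
{
    const int start = 0, end = 10;   // a cv::Range of length 10
    unsigned nstripes = 6;           // requested stripe count

    //ensure that nstripes is not larger than the range length
    nstripes = std::min(unsigned(end - start), nstripes);                // 6

    //ceiling division: every stripe covers at most block_size indices
    const int block_size = ((end - start - 1) / nstripes) + 1;           // 2

    //clamp nstripes to the block count so no stripe starts out of range
    nstripes = std::min(nstripes,
                        unsigned(((end - start - 1) / block_size) + 1)); // 5

    //the same traversal ForThread::execute performs (next hunk)
    for (unsigned pos = 0; pos < nstripes; ++pos)
    {
        int s = start + int(pos) * block_size;
        int e = std::min(s + block_size, end);
        std::printf("stripe %u: [%d, %d)\n", pos, s, e);  // [0,2) [2,4) ... [8,10)
    }
    return 0;
}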
@@ -331,10 +337,10 @@ void ForThread::execute()
work_load& load = m_parent->m_work_load;
while(m_current_pos < load.m_blocks_count)
while(m_current_pos < load.m_nstripes)
{
int start = load.m_range->start + m_current_pos*load.m_nstripes;
int end = std::min(start + load.m_nstripes, load.m_range->end);
int start = load.m_range->start + m_current_pos*load.m_block_size;
int end = std::min(start + load.m_block_size, load.m_range->end);
load.m_body->operator()(cv::Range(start, end));
@@ -417,9 +423,11 @@ void ThreadManager::run(const cv::Range& range, const cv::ParallelLoopBody& body
{
if(initPool())
{
double min_stripes = double(range.end - range.start)/(4*m_threads.size());
if(nstripes < 1) nstripes = 4*m_threads.size();
nstripes = std::max(nstripes, min_stripes);
double max_stripes = 4*m_threads.size();
nstripes = std::min(nstripes, max_stripes);
pthread_mutex_lock(&m_manager_task_mutex);
@@ -429,7 +437,7 @@ void ThreadManager::run(const cv::Range& range, const cv::ParallelLoopBody& body
m_task_complete = false;
m_work_load.set(range, body, std::ceil(nstripes));
m_work_load.set(range, body, cvCeil(nstripes));
for(size_t i = 0; i < m_threads.size(); ++i)
{

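For context, this is the consumer-facing side of the change: any cv::ParallelLoopBody run through cv::parallel_for_ is partitioned by the work_load logic above when the pthreads backend is active, and nstripes is only a hint, now capped at 4 * thread count. A hedged usage sketch:

#include <opencv2/core.hpp>
#include <vector>
#include <cstdio>

struct Doubler : public cv::ParallelLoopBody
{
    std::vector<int>& v;
    Doubler(std::vector<int>& v_) : v(v_) {}
    virtual void operator()(const cv::Range& r) const
    {
        for (int i = r.start; i < r.end; ++i)
            v[i] *= 2;   // each stripe receives a disjoint [start, end)
    }
};

int main()
{
    std::vector<int> v(1000, 1);
    Doubler body(v);
    cv::parallel_for_(cv::Range(0, (int)v.size()), body, 8. /*nstripes hint*/);
    std::printf("v[0] = %d\n", v[0]);   // 2
    return 0;
}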
modules/videoio/include/opencv2/videoio.hpp

@@ -315,6 +315,7 @@ enum { CAP_INTELPERC_DEPTH_MAP = 0, // Each pixel is a 16-bit integ
enum { VIDEOWRITER_PROP_QUALITY = 1, // Quality (0..100%) of the encoded videostream
VIDEOWRITER_PROP_FRAMEBYTES = 2, // (Read-only): Size of just encoded video frame
VIDEOWRITER_PROP_NSTRIPES = 3 // Number of stripes for parallel encoding. -1 for auto detection
};
// gPhoto2 properties; if propertyId is less than 0, then work on the widget whose camera setting ID is the additive inverse of propertyId
@@ -610,6 +611,7 @@ public:
@param propId Property identifier. It can be one of the following:
- **VIDEOWRITER_PROP_QUALITY** Quality (0..100%) of the encoded videostream. Can be adjusted dynamically in some codecs.
- **VIDEOWRITER_PROP_NSTRIPES** Number of stripes for parallel encoding
@param value Value of the property.
*/
CV_WRAP virtual bool set(int propId, double value);
@@ -619,6 +621,7 @@ public:
@param propId Property identifier. It can be one of the following:
- **VIDEOWRITER_PROP_QUALITY** Current quality of the encoded videostream.
- **VIDEOWRITER_PROP_FRAMEBYTES** (Read-only) Size of just encoded video frame; note that the encoding order may be different from representation order.
- **VIDEOWRITER_PROP_NSTRIPES** Number of stripes for parallel encoding
@note When querying a property that is not supported by the backend used by the VideoWriter
class, value 0 is returned.
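A minimal usage sketch of the new property from the VideoWriter side (the file name, codec choice and frame content are illustrative only, not part of the patch):

#include <opencv2/videoio.hpp>
#include <opencv2/core.hpp>

int main()
{
    cv::VideoWriter writer("out.avi",
                           cv::VideoWriter::fourcc('M', 'J', 'P', 'G'),
                           30.0, cv::Size(640, 480), true);
    if (!writer.isOpened())
        return 1;

    writer.set(cv::VIDEOWRITER_PROP_NSTRIPES, 4);   // encode each frame in 4 parallel stripes
    writer.set(cv::VIDEOWRITER_PROP_QUALITY, 90);   // 0..100%

    cv::Mat frame(480, 640, CV_8UC3, cv::Scalar::all(128));
    for (int i = 0; i < 30; ++i)
        writer.write(frame);
    return 0;   // the writer flushes and closes in its destructor
}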

modules/videoio/src/cap_mjpeg_encoder.cpp

@@ -41,6 +41,7 @@
#include "precomp.hpp"
#include <vector>
#include <deque>
#if CV_NEON
#define WITH_NEON
@@ -350,14 +351,261 @@ protected:
};
class mjpeg_buffer
{
public:
mjpeg_buffer()
{
reset();
}
void resize(int size)
{
data.resize(size);
}
void put(unsigned bits, int len)
{
if((m_pos == (data.size() - 1) && len > bits_free) || m_pos == data.size())
{
resize(int(2*data.size()));
}
bits_free -= (len);
unsigned int tempval = (bits) & bit_mask[(len)];
if( bits_free <= 0 )
{
data[m_pos] |= ((unsigned)tempval >> -bits_free);
bits_free += 32;
++m_pos;
data[m_pos] = bits_free < 32 ? (tempval << bits_free) : 0;
}
else
{
data[m_pos] |= (tempval << bits_free);
}
}
void finish()
{
if(bits_free == 32)
{
bits_free = 0;
m_data_len = m_pos;
}
else
{
m_data_len = m_pos + 1;
}
}
void reset()
{
bits_free = 32;
m_pos = 0;
m_data_len = 0;
}
void clear()
{
//we only need to clear the first element; the rest will be overwritten
data[0] = 0;
}
int get_bits_free()
{
return bits_free;
}
unsigned* get_data()
{
return &data[0];
}
unsigned get_len()
{
return m_data_len;
}
private:
std::vector<unsigned> data;
int bits_free;
unsigned m_pos;
bool m_is_full;
unsigned m_data_len;
};
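A standalone sketch of the put() logic above, which packs codes MSB-first into 32-bit words (assumption: bit_mask[len] in the encoder is a lookup table with the len low bits set; the (1u << len) - 1 expression stands in for it here and is only valid for len < 32):

#include <vector>
#include <cstdio>

int main()
{
    std::vector<unsigned> data(4, 0);
    int bits_free = 32;   // free bits remaining in data[pos]
    unsigned pos = 0;

    // append the 'len' low bits of 'bits', MSB-first
    auto put = [&](unsigned bits, int len)
    {
        bits_free -= len;
        unsigned tempval = bits & ((1u << len) - 1);
        if (bits_free <= 0)
        {
            data[pos] |= tempval >> -bits_free;   // high part completes this word
            bits_free += 32;
            ++pos;
            data[pos] = bits_free < 32 ? (tempval << bits_free) : 0; // low part opens the next
        }
        else
            data[pos] |= tempval << bits_free;
    };

    put(0x5, 3);   // appends 101
    put(0x3, 2);   // appends 11
    std::printf("%08x\n", data[0]);   // b8000000: 10111 followed by zeros
    return 0;
}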
class mjpeg_buffer_keeper
{
public:
mjpeg_buffer_keeper()
{
m_last_bit_len = 0;
}
mjpeg_buffer& operator[](int i)
{
return m_buffer_list[i];
}
void allocate_buffers(int count, int size)
{
for(int i = (int)m_buffer_list.size(); i < count; ++i)
{
m_buffer_list.push_back(mjpeg_buffer());
m_buffer_list.back().resize(size);
}
}
unsigned* get_data()
{
//if there is only one buffer (single thread) there is no need to stack buffers
if(m_buffer_list.size() == 1)
{
m_buffer_list[0].finish();
m_data_len = m_buffer_list[0].get_len();
m_last_bit_len = m_buffer_list[0].get_bits_free() ? 32 - m_buffer_list[0].get_bits_free() : 0;
return m_buffer_list[0].get_data();
}
allocate_output_buffer();
int bits = 0;
unsigned currval = 0;
m_data_len = 0;
for(unsigned j = 0; j < m_buffer_list.size(); ++j)
{
mjpeg_buffer& buffer = m_buffer_list[j];
//if no bit shift is required we can use memcpy
if(bits == 0)
{
size_t current_pos = m_data_len;
if(buffer.get_bits_free() == 0)
{
memcpy(&m_output_buffer[current_pos], buffer.get_data(), sizeof(buffer.get_data()[0])*buffer.get_len());
m_data_len += buffer.get_len();
currval = 0;
}
else
{
memcpy(&m_output_buffer[current_pos], buffer.get_data(), sizeof(buffer.get_data()[0])*(buffer.get_len() -1 ));
m_data_len += buffer.get_len() - 1;
currval = buffer.get_data()[buffer.get_len() - 1];
}
}
else
{
for(unsigned i = 0; i < buffer.get_len() - 1; ++i)
{
if( bits <= 0 )
{
currval |= ((unsigned)buffer.get_data()[i] >> -bits);
m_output_buffer[m_data_len++] = currval;
currval = (bits < 0) ? (buffer.get_data()[i] << (bits + 32)) : 0;
}
else
{
currval |= (buffer.get_data()[i] << bits);
}
}
currval |= ((unsigned)buffer.get_data()[buffer.get_len() - 1] >> -bits);
if( (buffer.get_bits_free() == 32 ? 0 : buffer.get_bits_free()) <= -bits)
{
m_output_buffer[m_data_len++] = currval;
currval = (bits < 0) ? (buffer.get_data()[buffer.get_len() - 1] << (bits + 32)) : 0;
}
}
bits += buffer.get_bits_free();
if(bits > 0)
{
bits -= 32;
}
}
//bits == 0 means that the last element shouldn't be used.
m_output_buffer[m_data_len++] = currval;
m_last_bit_len = -bits;
return &m_output_buffer[0];
}
int get_last_bit_len()
{
return m_last_bit_len;
}
int get_data_size()
{
return m_data_len;
}
void reset()
{
m_last_bit_len = 0;
for(unsigned i = 0; i < m_buffer_list.size(); ++i)
{
m_buffer_list[i].reset();
}
//there is no need to erase the output buffer since it will be overwritten
m_data_len = 0;
}
private:
void allocate_output_buffer()
{
unsigned total_size = 0;
for(unsigned i = 0; i < m_buffer_list.size(); ++i)
{
m_buffer_list[i].finish();
total_size += m_buffer_list[i].get_len();
}
if(total_size > m_output_buffer.size())
{
m_output_buffer.clear();
m_output_buffer.resize(total_size);
}
}
std::deque<mjpeg_buffer> m_buffer_list;
std::vector<unsigned> m_output_buffer;
int m_data_len;
int m_last_bit_len;
};
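get_data() above splices the per-stripe bitstreams at bit granularity. A reduced sketch of the splice under the same MSB-first convention: stream A ends with a_bits valid bits in its last word, so every word of stream B is split across two output words (streams and bit count are illustrative):

#include <cstdio>

int main()
{
    unsigned a_last = 0xABC00000;   // last, partially filled word of stream A
    int a_bits = 12;                // number of valid (high) bits in a_last
    unsigned b[] = { 0x12345678, 0x9ABCDEF0 };   // stream B, fully packed

    unsigned out[3];
    int n = 0;
    unsigned curr = a_last;
    int shift = 32 - a_bits;        // free bits at the end of curr

    for (int i = 0; i < 2; ++i)
    {
        curr |= b[i] >> a_bits;     // high part of b[i] completes curr
        out[n++] = curr;
        curr = b[i] << shift;       // low part opens the next output word
    }
    out[n++] = curr;                // tail word: again a_bits valid bits

    for (int i = 0; i < n; ++i)
        std::printf("%08x\n", out[i]);   // abc12345 6789abcd ef000000
    return 0;
}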
class MotionJpegWriter : public IVideoWriter
{
public:
MotionJpegWriter() { rawstream = false; }
MotionJpegWriter()
{
rawstream = false;
nstripes = -1;
}
MotionJpegWriter(const String& filename, double fps, Size size, bool iscolor)
{
rawstream = false;
open(filename, fps, size, iscolor);
nstripes = -1;
}
~MotionJpegWriter() { close(); }
@@ -616,6 +864,8 @@ public:
return quality;
if( propId == VIDEOWRITER_PROP_FRAMEBYTES )
return frameSize.empty() ? 0. : (double)frameSize.back();
if( propId == VIDEOWRITER_PROP_NSTRIPES )
return nstripes;
return 0.;
}
@@ -626,6 +876,13 @@ public:
quality = value;
return true;
}
if( propId == VIDEOWRITER_PROP_NSTRIPES)
{
nstripes = value;
return true;
}
return false;
}
@@ -638,6 +895,8 @@ protected:
size_t moviPointer;
std::vector<size_t> frameOffset, frameSize, AVIChunkSizeIndex, frameNumIndexes;
bool rawstream;
mjpeg_buffer_keeper buffers_list;
double nstripes;
BitStream strm;
};
@@ -1107,179 +1366,16 @@ static void aan_fdct8x8( const short *src, short *dst,
}
#endif
void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspace, int input_channels )
{
//double total_cvt = 0, total_dct = 0;
static bool init_cat_table = false;
const int CAT_TAB_SIZE = 4096;
static uchar cat_table[CAT_TAB_SIZE*2+1];
if( !init_cat_table )
{
for( int i = -CAT_TAB_SIZE; i <= CAT_TAB_SIZE; i++ )
{
Cv32suf a;
a.f = (float)i;
cat_table[i+CAT_TAB_SIZE] = ((a.i >> 23) & 255) - (126 & (i ? -1 : 0));
}
init_cat_table = true;
}
//double total_dct = 0, total_cvt = 0;
CV_Assert( data && width > 0 && height > 0 );
// encode the header and tables
// for each mcu:
// convert rgb to yuv with downsampling (if color).
// for every block:
// calc dct and quantize
// encode block.
int x, y;
inline void convertToYUV(int colorspace, int channels, int input_channels, short* UV_data, short* Y_data, const uchar* pix_data, int y_limit, int x_limit, int step, int u_plane_ofs, int v_plane_ofs)
{
int i, j;
const int max_quality = 12;
short fdct_qtab[2][64];
unsigned huff_dc_tab[2][16];
unsigned huff_ac_tab[2][256];
int x_scale = channels > 1 ? 2 : 1, y_scale = x_scale;
int dc_pred[] = { 0, 0, 0 };
int x_step = x_scale * 8;
int y_step = y_scale * 8;
short block[6][64];
short buffer[4096];
int* hbuffer = (int*)buffer;
int luma_count = x_scale*y_scale;
int block_count = luma_count + channels - 1;
int Y_step = x_scale*8;
const int UV_step = 16;
int u_plane_ofs = step*height;
int v_plane_ofs = u_plane_ofs + step*height;
double _quality = quality*0.01*max_quality;
if( _quality < 1. ) _quality = 1.;
if( _quality > max_quality ) _quality = max_quality;
double inv_quality = 1./_quality;
// Encode header
strm.putBytes( (const uchar*)jpegHeader, sizeof(jpegHeader) - 1 );
// Encode quantization tables
for( i = 0; i < (channels > 1 ? 2 : 1); i++ )
{
const uchar* qtable = i == 0 ? jpegTableK1_T : jpegTableK2_T;
int chroma_scale = i > 0 ? luma_count : 1;
strm.jputShort( 0xffdb ); // DQT marker
strm.jputShort( 2 + 65*1 ); // put single qtable
strm.putByte( 0*16 + i ); // 8-bit table
// put coefficients
for( j = 0; j < 64; j++ )
{
int idx = zigzag[j];
int qval = cvRound(qtable[idx]*inv_quality);
if( qval < 1 )
qval = 1;
if( qval > 255 )
qval = 255;
fdct_qtab[i][idx] = (short)(cvRound((1 << (postshift + 11)))/
(qval*chroma_scale*idct_prescale[idx]));
strm.putByte( qval );
}
}
// Encode huffman tables
for( i = 0; i < (channels > 1 ? 4 : 2); i++ )
{
const uchar* htable = i == 0 ? jpegTableK3 : i == 1 ? jpegTableK5 :
i == 2 ? jpegTableK4 : jpegTableK6;
int is_ac_tab = i & 1;
int idx = i >= 2;
int tableSize = 16 + (is_ac_tab ? 162 : 12);
strm.jputShort( 0xFFC4 ); // DHT marker
strm.jputShort( 3 + tableSize ); // define one huffman table
strm.putByte( is_ac_tab*16 + idx ); // put DC/AC flag and table index
strm.putBytes( htable, tableSize ); // put table
BitStream::createEncodeHuffmanTable( BitStream::createSourceHuffmanTable(
htable, hbuffer, 16, 9 ), is_ac_tab ? huff_ac_tab[idx] :
huff_dc_tab[idx], is_ac_tab ? 256 : 16 );
}
// put frame header
strm.jputShort( 0xFFC0 ); // SOF0 marker
strm.jputShort( 8 + 3*channels ); // length of frame header
strm.putByte( 8 ); // sample precision
strm.jputShort( height );
strm.jputShort( width );
strm.putByte( channels ); // number of components
for( i = 0; i < channels; i++ )
{
strm.putByte( i + 1 ); // (i+1)-th component id (Y,U or V)
if( i == 0 )
strm.putByte(x_scale*16 + y_scale); // chroma scale factors
else
strm.putByte(1*16 + 1);
strm.putByte( i > 0 ); // quantization table idx
}
// put scan header
strm.jputShort( 0xFFDA ); // SOS marker
strm.jputShort( 6 + 2*channels ); // length of scan header
strm.putByte( channels ); // number of components in the scan
for( i = 0; i < channels; i++ )
{
strm.putByte( i+1 ); // component id
strm.putByte( (i>0)*16 + (i>0) );// selection of DC & AC tables
}
strm.jputShort(0*256 + 63); // start and end of spectral selection - for
// sequential DCT start is 0 and end is 63
strm.putByte( 0 ); // successive approximation bit position
// high & low - (0,0) for sequential DCT
unsigned currval = 0, code = 0, tempval = 0;
int bit_idx = 32;
#define JPUT_BITS(val, bits) \
bit_idx -= (bits); \
tempval = (val) & bit_mask[(bits)]; \
if( bit_idx <= 0 ) \
{ \
strm.jput(currval | ((unsigned)tempval >> -bit_idx)); \
bit_idx += 32; \
currval = bit_idx < 32 ? (tempval << bit_idx) : 0; \
} \
else \
currval |= (tempval << bit_idx)
#define JPUT_HUFF(val, table) \
code = table[(val) + 2]; \
JPUT_BITS(code >> 8, (int)(code & 255))
// encode data
for( y = 0; y < height; y += y_step, data += y_step*step )
{
for( x = 0; x < width; x += x_step )
{
int x_limit = x_step;
int y_limit = y_step;
const uchar* pix_data = data + x*input_channels;
short* Y_data = block[0];
if( x + x_limit > width ) x_limit = width - x;
if( y + y_limit > height ) y_limit = height - y;
memset( block, 0, block_count*64*sizeof(block[0][0]));
int x_scale = channels > 1 ? 2 : 1, y_scale = x_scale;
int Y_step = x_scale*8;
if( channels > 1 )
{
short* UV_data = block[luma_count];
// double t = (double)cv::getTickCount();
if( colorspace == COLORSPACE_YUV444P && y_limit == 16 && x_limit == 16 )
{
for( i = 0; i < y_limit; i += 2, pix_data += step*2, Y_data += Y_step*2, UV_data += UV_step )
@@ -1388,7 +1484,6 @@ void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspa
}
}
// total_cvt += (double)cv::getTickCount() - t;
}
else
{
@@ -1398,6 +1493,161 @@ void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspa
Y_data[j] = (short)(pix_data[j]*4 - 128*4);
}
}
}
class MjpegEncoder : public ParallelLoopBody
{
public:
MjpegEncoder(int _height,
int _width,
int _step,
const uchar* _data,
int _input_channels,
int _channels,
int _colorspace,
unsigned (&_huff_dc_tab)[2][16],
unsigned (&_huff_ac_tab)[2][256],
short (&_fdct_qtab)[2][64],
uchar* _cat_table,
mjpeg_buffer_keeper& _buffer_list,
double nstripes
) :
m_buffer_list(_buffer_list),
height(_height),
width(_width),
step(_step),
in_data(_data),
input_channels(_input_channels),
channels(_channels),
colorspace(_colorspace),
huff_dc_tab(_huff_dc_tab),
huff_ac_tab(_huff_ac_tab),
fdct_qtab(_fdct_qtab),
cat_table(_cat_table)
{
//empirically found value: if the number of pixels is less than this, there is no point in parallelizing
const int min_pixels_count = 96*96;
stripes_count = 1;
if(nstripes < 0)
{
if(height*width > min_pixels_count)
{
stripes_count = 4;
}
}
else
{
stripes_count = cvCeil(nstripes);
}
int y_scale = channels > 1 ? 2 : 1;
int y_step = y_scale * 8;
int max_stripes = (height - 1)/y_step + 1;
stripes_count = std::min(stripes_count, max_stripes);
m_buffer_list.allocate_buffers(stripes_count, (height*width*2)/stripes_count);
}
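A worked numeric pass through the constructor logic above, for a color 640x480 frame with nstripes = -1 (auto): the frame exceeds 96*96 = 9216 pixels, so stripes_count becomes 4; a color MCU row is y_step = 16 lines, so at most (480 - 1)/16 + 1 = 30 stripes are usable, and min(4, 30) leaves 4. Each stripe then gets a scratch buffer of (480*640*2)/4 = 153600 words, a generous upper bound on its share of the encoded frame.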
void operator()( const cv::Range& range ) const
{
const int CAT_TAB_SIZE = 4096;
unsigned code = 0;
#define JPUT_BITS(val, bits) output_buffer.put(val, bits)
#define JPUT_HUFF(val, table) \
code = table[(val) + 2]; \
JPUT_BITS(code >> 8, (int)(code & 255))
int x, y;
int i, j;
short buffer[4096];
int x_scale = channels > 1 ? 2 : 1, y_scale = x_scale;
int dc_pred[] = { 0, 0, 0 };
int x_step = x_scale * 8;
int y_step = y_scale * 8;
short block[6][64];
int luma_count = x_scale*y_scale;
int block_count = luma_count + channels - 1;
int u_plane_ofs = step*height;
int v_plane_ofs = u_plane_ofs + step*height;
const uchar* data = in_data;
const uchar* init_data = data;
int num_steps = (height - 1)/y_step + 1;
//if this is not the first stripe we need to recompute dc_pred from the previous stripe's last MCU row
if(range.start > 0)
{
y = y_step*int(num_steps*range.start/stripes_count - 1);
data = init_data + y*step;
for( x = 0; x < width; x += x_step )
{
int x_limit = x_step;
int y_limit = y_step;
const uchar* pix_data = data + x*input_channels;
short* Y_data = block[0];
short* UV_data = block[luma_count];
if( x + x_limit > width ) x_limit = width - x;
if( y + y_limit > height ) y_limit = height - y;
memset( block, 0, block_count*64*sizeof(block[0][0]));
convertToYUV(colorspace, channels, input_channels, UV_data, Y_data, pix_data, y_limit, x_limit, step, u_plane_ofs, v_plane_ofs);
for( i = 0; i < block_count; i++ )
{
int is_chroma = i >= luma_count;
int src_step = x_scale * 8;
const short* src_ptr = block[i & -2] + (i & 1)*8;
aan_fdct8x8( src_ptr, buffer, src_step, fdct_qtab[is_chroma] );
j = is_chroma + (i > luma_count);
dc_pred[j] = buffer[0];
}
}
}
for(int k = range.start; k < range.end; ++k)
{
mjpeg_buffer& output_buffer = m_buffer_list[k];
output_buffer.clear();
int y_min = y_step*int(num_steps*k/stripes_count);
int y_max = y_step*int(num_steps*(k+1)/stripes_count);
if(k == stripes_count - 1)
{
y_max = height;
}
data = init_data + y_min*step;
for( y = y_min; y < y_max; y += y_step, data += y_step*step )
{
for( x = 0; x < width; x += x_step )
{
int x_limit = x_step;
int y_limit = y_step;
const uchar* pix_data = data + x*input_channels;
short* Y_data = block[0];
short* UV_data = block[luma_count];
if( x + x_limit > width ) x_limit = width - x;
if( y + y_limit > height ) y_limit = height - y;
memset( block, 0, block_count*64*sizeof(block[0][0]));
convertToYUV(colorspace, channels, input_channels, UV_data, Y_data, pix_data, y_limit, x_limit, step, u_plane_ofs, v_plane_ofs);
for( i = 0; i < block_count; i++ )
{
@@ -1407,9 +1657,7 @@ void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspa
const short* src_ptr = block[i & -2] + (i & 1)*8;
const unsigned* htable = huff_ac_tab[is_chroma];
//double t = (double)cv::getTickCount();
aan_fdct8x8( src_ptr, buffer, src_step, fdct_qtab[is_chroma] );
//total_dct += (double)cv::getTickCount() - t;
j = is_chroma + (i > luma_count);
val = buffer[0] - dc_pred[j];
@@ -1457,13 +1705,183 @@ void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspa
}
}
}
}
}
// Flush
strm.jflush(currval, bit_idx);
cv::Range getRange()
{
return cv::Range(0, stripes_count);
}
double getNStripes()
{
return stripes_count;
}
mjpeg_buffer_keeper& m_buffer_list;
private:
MjpegEncoder& operator=( const MjpegEncoder & ) { return *this; }
const int height;
const int width;
const int step;
const uchar* in_data;
const int input_channels;
const int channels;
const int colorspace;
const unsigned (&huff_dc_tab)[2][16];
const unsigned (&huff_ac_tab)[2][256];
const short (&fdct_qtab)[2][64];
const uchar* cat_table;
int stripes_count;
};
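One subtlety in this class deserves a note: JPEG DC coefficients are DPCM-coded, i.e. each block's Huffman code encodes the difference between its DC value and the previous block's. That is why operator() re-runs the color conversion and forward DCT over the MCU row just before its stripe when range.start > 0: it is the only way to reconstruct dc_pred without a serial dependency between stripes. A toy illustration of the differencing (DC values are made up):

#include <cstdio>

int main()
{
    short dc[] = { 100, 104, 104, 98 };   // quantized DC values of consecutive blocks
    int dc_pred = 0;                      // starts at 0 at the beginning of a scan
    for (int i = 0; i < 4; ++i)
    {
        int diff = dc[i] - dc_pred;       // this difference is what gets Huffman-coded
        dc_pred = dc[i];
        std::printf("diff = %d\n", diff); // 100, 4, 0, -6
    }
    return 0;
}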
void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspace, int input_channels )
{
//double total_cvt = 0, total_dct = 0;
static bool init_cat_table = false;
const int CAT_TAB_SIZE = 4096;
static uchar cat_table[CAT_TAB_SIZE*2+1];
if( !init_cat_table )
{
for( int i = -CAT_TAB_SIZE; i <= CAT_TAB_SIZE; i++ )
{
Cv32suf a;
a.f = (float)i;
cat_table[i+CAT_TAB_SIZE] = ((a.i >> 23) & 255) - (126 & (i ? -1 : 0));
}
init_cat_table = true;
}
//double total_dct = 0, total_cvt = 0;
CV_Assert( data && width > 0 && height > 0 );
// encode the header and tables
// for each mcu:
// convert rgb to yuv with downsampling (if color).
// for every block:
// calc dct and quantize
// encode block.
int i, j;
const int max_quality = 12;
short fdct_qtab[2][64];
unsigned huff_dc_tab[2][16];
unsigned huff_ac_tab[2][256];
int x_scale = channels > 1 ? 2 : 1, y_scale = x_scale;
short buffer[4096];
int* hbuffer = (int*)buffer;
int luma_count = x_scale*y_scale;
double _quality = quality*0.01*max_quality;
if( _quality < 1. ) _quality = 1.;
if( _quality > max_quality ) _quality = max_quality;
double inv_quality = 1./_quality;
// Encode header
strm.putBytes( (const uchar*)jpegHeader, sizeof(jpegHeader) - 1 );
// Encode quantization tables
for( i = 0; i < (channels > 1 ? 2 : 1); i++ )
{
const uchar* qtable = i == 0 ? jpegTableK1_T : jpegTableK2_T;
int chroma_scale = i > 0 ? luma_count : 1;
strm.jputShort( 0xffdb ); // DQT marker
strm.jputShort( 2 + 65*1 ); // put single qtable
strm.putByte( 0*16 + i ); // 8-bit table
// put coefficients
for( j = 0; j < 64; j++ )
{
int idx = zigzag[j];
int qval = cvRound(qtable[idx]*inv_quality);
if( qval < 1 )
qval = 1;
if( qval > 255 )
qval = 255;
fdct_qtab[i][idx] = (short)(cvRound((1 << (postshift + 11)))/
(qval*chroma_scale*idct_prescale[idx]));
strm.putByte( qval );
}
}
// Encode huffman tables
for( i = 0; i < (channels > 1 ? 4 : 2); i++ )
{
const uchar* htable = i == 0 ? jpegTableK3 : i == 1 ? jpegTableK5 :
i == 2 ? jpegTableK4 : jpegTableK6;
int is_ac_tab = i & 1;
int idx = i >= 2;
int tableSize = 16 + (is_ac_tab ? 162 : 12);
strm.jputShort( 0xFFC4 ); // DHT marker
strm.jputShort( 3 + tableSize ); // define one huffman table
strm.putByte( is_ac_tab*16 + idx ); // put DC/AC flag and table index
strm.putBytes( htable, tableSize ); // put table
BitStream::createEncodeHuffmanTable( BitStream::createSourceHuffmanTable(
htable, hbuffer, 16, 9 ), is_ac_tab ? huff_ac_tab[idx] :
huff_dc_tab[idx], is_ac_tab ? 256 : 16 );
}
// put frame header
strm.jputShort( 0xFFC0 ); // SOF0 marker
strm.jputShort( 8 + 3*channels ); // length of frame header
strm.putByte( 8 ); // sample precision
strm.jputShort( height );
strm.jputShort( width );
strm.putByte( channels ); // number of components
for( i = 0; i < channels; i++ )
{
strm.putByte( i + 1 ); // (i+1)-th component id (Y,U or V)
if( i == 0 )
strm.putByte(x_scale*16 + y_scale); // chroma scale factors
else
strm.putByte(1*16 + 1);
strm.putByte( i > 0 ); // quantization table idx
}
// put scan header
strm.jputShort( 0xFFDA ); // SOS marker
strm.jputShort( 6 + 2*channels ); // length of scan header
strm.putByte( channels ); // number of components in the scan
for( i = 0; i < channels; i++ )
{
strm.putByte( i+1 ); // component id
strm.putByte( (i>0)*16 + (i>0) );// selection of DC & AC tables
}
strm.jputShort(0*256 + 63); // start and end of spectral selection - for
// sequential DCT start is 0 and end is 63
strm.putByte( 0 ); // successive approximation bit position
// high & low - (0,0) for sequential DCT
buffers_list.reset();
MjpegEncoder parallel_encoder(height, width, step, data, input_channels, channels, colorspace, huff_dc_tab, huff_ac_tab, fdct_qtab, cat_table, buffers_list, nstripes);
cv::parallel_for_(parallel_encoder.getRange(), parallel_encoder, parallel_encoder.getNStripes());
//std::vector<unsigned>& v = parallel_encoder.m_buffer_list.get_data();
unsigned* v = buffers_list.get_data();
unsigned last_data_elem = buffers_list.get_data_size() - 1;
for(unsigned k = 0; k < last_data_elem; ++k)
{
strm.jput(v[k]);
}
strm.jflush(v[last_data_elem], 32 - buffers_list.get_last_bit_len());
strm.jputShort( 0xFFD9 ); // EOI marker
/*printf("total dct = %.1fms, total cvt = %.1fms\n",
total_dct*1000./cv::getTickFrequency(),
total_cvt*1000./cv::getTickFrequency());*/
size_t pos = strm.getPos();
size_t pos1 = (pos + 3) & ~3;
for( ; pos < pos1; pos++ )