Merge pull request #4164 from kalistratovag:parallel_mjpeg

commit c81b064727
Author: Vadim Pisarevsky
Date:   2015-07-01 14:15:51 +00:00

3 changed files with 666 additions and 237 deletions

modules/core/src/parallel_pthreads.cpp

@@ -80,25 +80,31 @@ struct work_load
set(range, body, nstripes);
}
void set(const cv::Range& range, const cv::ParallelLoopBody& body, int nstripes)
void set(const cv::Range& range, const cv::ParallelLoopBody& body, unsigned int nstripes)
{
m_body = &body;
m_range = &range;
m_nstripes = nstripes;
m_blocks_count = ((m_range->end - m_range->start - 1)/m_nstripes) + 1;
//ensure that nstripes is not larger than the range length
m_nstripes = std::min( unsigned(m_range->end - m_range->start) , nstripes);
m_block_size = ((m_range->end - m_range->start - 1)/m_nstripes) + 1;
//ensure that nstripes is not larger than the block count, so we never go out of range
m_nstripes = std::min(m_nstripes, unsigned(((m_range->end - m_range->start - 1)/m_block_size) + 1) );
}
const cv::ParallelLoopBody* m_body;
const cv::Range* m_range;
int m_nstripes;
unsigned int m_blocks_count;
unsigned int m_nstripes;
int m_block_size;
void clear()
{
m_body = 0;
m_range = 0;
m_nstripes = 0;
m_blocks_count = 0;
m_block_size = 0;
}
};
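The arithmetic above is worth spelling out: m_block_size is the ceiling of range_length/nstripes, and m_nstripes is then re-clamped to the resulting block count so the last stripe can never start past the end of the range. A minimal standalone sketch with illustrative values (not OpenCV code):

#include <algorithm>
#include <cstdio>

int main()
{
    const int start = 0, end = 10;   // a cv::Range of length 10
    unsigned nstripes = 6;           // requested stripe count

    //ensure that nstripes is not larger than the range length
    nstripes = std::min(unsigned(end - start), nstripes);                // 6

    //ceiling division: every stripe covers at most block_size indices
    const int block_size = ((end - start - 1) / nstripes) + 1;           // 2

    //clamp nstripes to the block count so no stripe starts out of range
    nstripes = std::min(nstripes,
                        unsigned(((end - start - 1) / block_size) + 1)); // 5

    //the same traversal ForThread::execute performs (next hunk)
    for (unsigned pos = 0; pos < nstripes; ++pos)
    {
        int s = start + int(pos) * block_size;
        int e = std::min(s + block_size, end);
        std::printf("stripe %u: [%d, %d)\n", pos, s, e);  // [0,2) [2,4) ... [8,10)
    }
    return 0;
}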
@@ -331,10 +337,10 @@ void ForThread::execute()
work_load& load = m_parent->m_work_load;
while(m_current_pos < load.m_blocks_count)
while(m_current_pos < load.m_nstripes)
{
int start = load.m_range->start + m_current_pos*load.m_nstripes;
int end = std::min(start + load.m_nstripes, load.m_range->end);
int start = load.m_range->start + m_current_pos*load.m_block_size;
int end = std::min(start + load.m_block_size, load.m_range->end);
load.m_body->operator()(cv::Range(start, end));
@@ -417,9 +423,11 @@ void ThreadManager::run(const cv::Range& range, const cv::ParallelLoopBody& body
{
if(initPool())
{
double min_stripes = double(range.end - range.start)/(4*m_threads.size());
if(nstripes < 1) nstripes = 4*m_threads.size();
nstripes = std::max(nstripes, min_stripes);
double max_stripes = 4*m_threads.size();
nstripes = std::min(nstripes, max_stripes);
pthread_mutex_lock(&m_manager_task_mutex);
@@ -429,7 +437,7 @@ void ThreadManager::run(const cv::Range& range, const cv::ParallelLoopBody& body
m_task_complete = false;
m_work_load.set(range, body, std::ceil(nstripes));
m_work_load.set(range, body, cvCeil(nstripes));
for(size_t i = 0; i < m_threads.size(); ++i)
{

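For context, this is the consumer-facing side of the change: any cv::ParallelLoopBody run through cv::parallel_for_ is partitioned by the work_load logic above when the pthreads backend is active, and nstripes is only a hint, now capped at 4 * thread count. A hedged usage sketch:

#include <opencv2/core.hpp>
#include <vector>
#include <cstdio>

struct Doubler : public cv::ParallelLoopBody
{
    std::vector<int>& v;
    Doubler(std::vector<int>& v_) : v(v_) {}
    virtual void operator()(const cv::Range& r) const
    {
        for (int i = r.start; i < r.end; ++i)
            v[i] *= 2;   // each stripe receives a disjoint [start, end)
    }
};

int main()
{
    std::vector<int> v(1000, 1);
    Doubler body(v);
    cv::parallel_for_(cv::Range(0, (int)v.size()), body, 8. /*nstripes hint*/);
    std::printf("v[0] = %d\n", v[0]);   // 2
    return 0;
}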
modules/videoio/include/opencv2/videoio.hpp

@@ -315,6 +315,7 @@ enum { CAP_INTELPERC_DEPTH_MAP = 0, // Each pixel is a 16-bit integ
enum { VIDEOWRITER_PROP_QUALITY = 1, // Quality (0..100%) of the encoded videostream
VIDEOWRITER_PROP_FRAMEBYTES = 2, // (Read-only): Size of just encoded video frame
VIDEOWRITER_PROP_NSTRIPES = 3 // Number of stripes for parallel encoding. -1 for auto detection
};
// gPhoto2 properties; if propertyId is less than 0, then work on the widget whose camera setting ID is the additive inverse of propertyId
@@ -610,6 +611,7 @@ public:
@param propId Property identifier. It can be one of the following:
- **VIDEOWRITER_PROP_QUALITY** Quality (0..100%) of the encoded videostream. Can be adjusted dynamically in some codecs.
- **VIDEOWRITER_PROP_NSTRIPES** Number of stripes for parallel encoding
@param value Value of the property.
*/
CV_WRAP virtual bool set(int propId, double value);
@@ -619,6 +621,7 @@ public:
@param propId Property identifier. It can be one of the following:
- **VIDEOWRITER_PROP_QUALITY** Current quality of the encoded videostream.
- **VIDEOWRITER_PROP_FRAMEBYTES** (Read-only) Size of just encoded video frame; note that the encoding order may be different from representation order.
- **VIDEOWRITER_PROP_NSTRIPES** Number of stripes for parallel encoding
@note When querying a property that is not supported by the backend used by the VideoWriter
class, value 0 is returned.
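A minimal usage sketch of the new property from the VideoWriter side (the file name, codec choice and frame content are illustrative only, not part of the patch):

#include <opencv2/videoio.hpp>
#include <opencv2/core.hpp>

int main()
{
    cv::VideoWriter writer("out.avi",
                           cv::VideoWriter::fourcc('M', 'J', 'P', 'G'),
                           30.0, cv::Size(640, 480), true);
    if (!writer.isOpened())
        return 1;

    writer.set(cv::VIDEOWRITER_PROP_NSTRIPES, 4);   // encode each frame in 4 parallel stripes
    writer.set(cv::VIDEOWRITER_PROP_QUALITY, 90);   // 0..100%

    cv::Mat frame(480, 640, CV_8UC3, cv::Scalar::all(128));
    for (int i = 0; i < 30; ++i)
        writer.write(frame);
    return 0;   // the writer flushes and closes in its destructor
}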

modules/videoio/src/cap_mjpeg_encoder.cpp

@@ -41,6 +41,7 @@
#include "precomp.hpp"
#include <vector>
#include <deque>
#if CV_NEON
#define WITH_NEON
@@ -350,14 +351,261 @@ protected:
};
class mjpeg_buffer
{
public:
mjpeg_buffer()
{
reset();
}
void resize(int size)
{
data.resize(size);
}
void put(unsigned bits, int len)
{
if((m_pos == (data.size() - 1) && len > bits_free) || m_pos == data.size())
{
resize(int(2*data.size()));
}
bits_free -= (len);
unsigned int tempval = (bits) & bit_mask[(len)];
if( bits_free <= 0 )
{
data[m_pos] |= ((unsigned)tempval >> -bits_free);
bits_free += 32;
++m_pos;
data[m_pos] = bits_free < 32 ? (tempval << bits_free) : 0;
}
else
{
data[m_pos] |= (tempval << bits_free);
}
}
void finish()
{
if(bits_free == 32)
{
bits_free = 0;
m_data_len = m_pos;
}
else
{
m_data_len = m_pos + 1;
}
}
void reset()
{
bits_free = 32;
m_pos = 0;
m_data_len = 0;
}
void clear()
{
//we only need to clear the first element; the rest will be overwritten
data[0] = 0;
}
int get_bits_free()
{
return bits_free;
}
unsigned* get_data()
{
return &data[0];
}
unsigned get_len()
{
return m_data_len;
}
private:
std::vector<unsigned> data;
int bits_free;
unsigned m_pos;
bool m_is_full;
unsigned m_data_len;
};
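A standalone sketch of the put() logic above, which packs codes MSB-first into 32-bit words (assumption: bit_mask[len] in the encoder is a lookup table with the len low bits set; the (1u << len) - 1 expression stands in for it here and is only valid for len < 32):

#include <vector>
#include <cstdio>

int main()
{
    std::vector<unsigned> data(4, 0);
    int bits_free = 32;   // free bits remaining in data[pos]
    unsigned pos = 0;

    // append the 'len' low bits of 'bits', MSB-first
    auto put = [&](unsigned bits, int len)
    {
        bits_free -= len;
        unsigned tempval = bits & ((1u << len) - 1);
        if (bits_free <= 0)
        {
            data[pos] |= tempval >> -bits_free;   // high part completes this word
            bits_free += 32;
            ++pos;
            data[pos] = bits_free < 32 ? (tempval << bits_free) : 0; // low part opens the next
        }
        else
            data[pos] |= tempval << bits_free;
    };

    put(0x5, 3);   // appends 101
    put(0x3, 2);   // appends 11
    std::printf("%08x\n", data[0]);   // b8000000: 10111 followed by zeros
    return 0;
}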
class mjpeg_buffer_keeper
{
public:
mjpeg_buffer_keeper()
{
m_last_bit_len = 0;
}
mjpeg_buffer& operator[](int i)
{
return m_buffer_list[i];
}
void allocate_buffers(int count, int size)
{
for(int i = (int)m_buffer_list.size(); i < count; ++i)
{
m_buffer_list.push_back(mjpeg_buffer());
m_buffer_list.back().resize(size);
}
}
unsigned* get_data()
{
//if there is only one buffer (single thread) there is no need to stack buffers
if(m_buffer_list.size() == 1)
{
m_buffer_list[0].finish();
m_data_len = m_buffer_list[0].get_len();
m_last_bit_len = m_buffer_list[0].get_bits_free() ? 32 - m_buffer_list[0].get_bits_free() : 0;
return m_buffer_list[0].get_data();
}
allocate_output_buffer();
int bits = 0;
unsigned currval = 0;
m_data_len = 0;
for(unsigned j = 0; j < m_buffer_list.size(); ++j)
{
mjpeg_buffer& buffer = m_buffer_list[j];
//if no bit shift is required we can use memcpy
if(bits == 0)
{
size_t current_pos = m_data_len;
if(buffer.get_bits_free() == 0)
{
memcpy(&m_output_buffer[current_pos], buffer.get_data(), sizeof(buffer.get_data()[0])*buffer.get_len());
m_data_len += buffer.get_len();
currval = 0;
}
else
{
memcpy(&m_output_buffer[current_pos], buffer.get_data(), sizeof(buffer.get_data()[0])*(buffer.get_len() -1 ));
m_data_len += buffer.get_len() - 1;
currval = buffer.get_data()[buffer.get_len() - 1];
}
}
else
{
for(unsigned i = 0; i < buffer.get_len() - 1; ++i)
{
if( bits <= 0 )
{
currval |= ((unsigned)buffer.get_data()[i] >> -bits);
m_output_buffer[m_data_len++] = currval;
currval = (bits < 0) ? (buffer.get_data()[i] << (bits + 32)) : 0;
}
else
{
currval |= (buffer.get_data()[i] << bits);
}
}
currval |= ((unsigned)buffer.get_data()[buffer.get_len() - 1] >> -bits);
if( (buffer.get_bits_free() == 32 ? 0 : buffer.get_bits_free()) <= -bits)
{
m_output_buffer[m_data_len++] = currval;
currval = (bits < 0) ? (buffer.get_data()[buffer.get_len() - 1] << (bits + 32)) : 0;
}
}
bits += buffer.get_bits_free();
if(bits > 0)
{
bits -= 32;
}
}
//bits == 0 means that the last element shouldn't be used.
m_output_buffer[m_data_len++] = currval;
m_last_bit_len = -bits;
return &m_output_buffer[0];
}
int get_last_bit_len()
{
return m_last_bit_len;
}
int get_data_size()
{
return m_data_len;
}
void reset()
{
m_last_bit_len = 0;
for(unsigned i = 0; i < m_buffer_list.size(); ++i)
{
m_buffer_list[i].reset();
}
//there is no need to erase the output buffer since it will be overwritten
m_data_len = 0;
}
private:
void allocate_output_buffer()
{
unsigned total_size = 0;
for(unsigned i = 0; i < m_buffer_list.size(); ++i)
{
m_buffer_list[i].finish();
total_size += m_buffer_list[i].get_len();
}
if(total_size > m_output_buffer.size())
{
m_output_buffer.clear();
m_output_buffer.resize(total_size);
}
}
std::deque<mjpeg_buffer> m_buffer_list;
std::vector<unsigned> m_output_buffer;
int m_data_len;
int m_last_bit_len;
};
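get_data() above splices the per-stripe bitstreams at bit granularity. A reduced sketch of the splice under the same MSB-first convention: stream A ends with a_bits valid bits in its last word, so every word of stream B is split across two output words (streams and bit count are illustrative):

#include <cstdio>

int main()
{
    unsigned a_last = 0xABC00000;   // last, partially filled word of stream A
    int a_bits = 12;                // number of valid (high) bits in a_last
    unsigned b[] = { 0x12345678, 0x9ABCDEF0 };   // stream B, fully packed

    unsigned out[3];
    int n = 0;
    unsigned curr = a_last;
    int shift = 32 - a_bits;        // free bits at the end of curr

    for (int i = 0; i < 2; ++i)
    {
        curr |= b[i] >> a_bits;     // high part of b[i] completes curr
        out[n++] = curr;
        curr = b[i] << shift;       // low part opens the next output word
    }
    out[n++] = curr;                // tail word: again a_bits valid bits

    for (int i = 0; i < n; ++i)
        std::printf("%08x\n", out[i]);   // abc12345 6789abcd ef000000
    return 0;
}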
class MotionJpegWriter : public IVideoWriter
{
public:
MotionJpegWriter() { rawstream = false; }
MotionJpegWriter()
{
rawstream = false;
nstripes = -1;
}
MotionJpegWriter(const String& filename, double fps, Size size, bool iscolor)
{
rawstream = false;
open(filename, fps, size, iscolor);
nstripes = -1;
}
~MotionJpegWriter() { close(); }
@@ -616,6 +864,8 @@ public:
return quality;
if( propId == VIDEOWRITER_PROP_FRAMEBYTES )
return frameSize.empty() ? 0. : (double)frameSize.back();
if( propId == VIDEOWRITER_PROP_NSTRIPES )
return nstripes;
return 0.;
}
@@ -626,6 +876,13 @@ public:
quality = value;
return true;
}
if( propId == VIDEOWRITER_PROP_NSTRIPES)
{
nstripes = value;
return true;
}
return false;
}
@@ -638,6 +895,8 @@ protected:
size_t moviPointer;
std::vector<size_t> frameOffset, frameSize, AVIChunkSizeIndex, frameNumIndexes;
bool rawstream;
mjpeg_buffer_keeper buffers_list;
double nstripes;
BitStream strm;
};
@@ -1107,179 +1366,16 @@ static void aan_fdct8x8( const short *src, short *dst,
}
#endif
void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspace, int input_channels )
{
//double total_cvt = 0, total_dct = 0;
static bool init_cat_table = false;
const int CAT_TAB_SIZE = 4096;
static uchar cat_table[CAT_TAB_SIZE*2+1];
if( !init_cat_table )
{
for( int i = -CAT_TAB_SIZE; i <= CAT_TAB_SIZE; i++ )
{
Cv32suf a;
a.f = (float)i;
cat_table[i+CAT_TAB_SIZE] = ((a.i >> 23) & 255) - (126 & (i ? -1 : 0));
}
init_cat_table = true;
}
//double total_dct = 0, total_cvt = 0;
CV_Assert( data && width > 0 && height > 0 );
// encode the header and tables
// for each mcu:
// convert rgb to yuv with downsampling (if color).
// for every block:
// calc dct and quantize
// encode block.
int x, y;
inline void convertToYUV(int colorspace, int channels, int input_channels, short* UV_data, short* Y_data, const uchar* pix_data, int y_limit, int x_limit, int step, int u_plane_ofs, int v_plane_ofs)
{
int i, j;
const int max_quality = 12;
short fdct_qtab[2][64];
unsigned huff_dc_tab[2][16];
unsigned huff_ac_tab[2][256];
int x_scale = channels > 1 ? 2 : 1, y_scale = x_scale;
int dc_pred[] = { 0, 0, 0 };
int x_step = x_scale * 8;
int y_step = y_scale * 8;
short block[6][64];
short buffer[4096];
int* hbuffer = (int*)buffer;
int luma_count = x_scale*y_scale;
int block_count = luma_count + channels - 1;
int Y_step = x_scale*8;
const int UV_step = 16;
int u_plane_ofs = step*height;
int v_plane_ofs = u_plane_ofs + step*height;
double _quality = quality*0.01*max_quality;
if( _quality < 1. ) _quality = 1.;
if( _quality > max_quality ) _quality = max_quality;
double inv_quality = 1./_quality;
// Encode header
strm.putBytes( (const uchar*)jpegHeader, sizeof(jpegHeader) - 1 );
// Encode quantization tables
for( i = 0; i < (channels > 1 ? 2 : 1); i++ )
{
const uchar* qtable = i == 0 ? jpegTableK1_T : jpegTableK2_T;
int chroma_scale = i > 0 ? luma_count : 1;
strm.jputShort( 0xffdb ); // DQT marker
strm.jputShort( 2 + 65*1 ); // put single qtable
strm.putByte( 0*16 + i ); // 8-bit table
// put coefficients
for( j = 0; j < 64; j++ )
{
int idx = zigzag[j];
int qval = cvRound(qtable[idx]*inv_quality);
if( qval < 1 )
qval = 1;
if( qval > 255 )
qval = 255;
fdct_qtab[i][idx] = (short)(cvRound((1 << (postshift + 11)))/
(qval*chroma_scale*idct_prescale[idx]));
strm.putByte( qval );
}
}
// Encode huffman tables
for( i = 0; i < (channels > 1 ? 4 : 2); i++ )
{
const uchar* htable = i == 0 ? jpegTableK3 : i == 1 ? jpegTableK5 :
i == 2 ? jpegTableK4 : jpegTableK6;
int is_ac_tab = i & 1;
int idx = i >= 2;
int tableSize = 16 + (is_ac_tab ? 162 : 12);
strm.jputShort( 0xFFC4 ); // DHT marker
strm.jputShort( 3 + tableSize ); // define one huffman table
strm.putByte( is_ac_tab*16 + idx ); // put DC/AC flag and table index
strm.putBytes( htable, tableSize ); // put table
BitStream::createEncodeHuffmanTable( BitStream::createSourceHuffmanTable(
htable, hbuffer, 16, 9 ), is_ac_tab ? huff_ac_tab[idx] :
huff_dc_tab[idx], is_ac_tab ? 256 : 16 );
}
// put frame header
strm.jputShort( 0xFFC0 ); // SOF0 marker
strm.jputShort( 8 + 3*channels ); // length of frame header
strm.putByte( 8 ); // sample precision
strm.jputShort( height );
strm.jputShort( width );
strm.putByte( channels ); // number of components
for( i = 0; i < channels; i++ )
{
strm.putByte( i + 1 ); // (i+1)-th component id (Y,U or V)
if( i == 0 )
strm.putByte(x_scale*16 + y_scale); // chroma scale factors
else
strm.putByte(1*16 + 1);
strm.putByte( i > 0 ); // quantization table idx
}
// put scan header
strm.jputShort( 0xFFDA ); // SOS marker
strm.jputShort( 6 + 2*channels ); // length of scan header
strm.putByte( channels ); // number of components in the scan
for( i = 0; i < channels; i++ )
{
strm.putByte( i+1 ); // component id
strm.putByte( (i>0)*16 + (i>0) );// selection of DC & AC tables
}
strm.jputShort(0*256 + 63); // start and end of spectral selection - for
// sequential DCT start is 0 and end is 63
strm.putByte( 0 ); // successive approximation bit position
// high & low - (0,0) for sequential DCT
unsigned currval = 0, code = 0, tempval = 0;
int bit_idx = 32;
#define JPUT_BITS(val, bits) \
bit_idx -= (bits); \
tempval = (val) & bit_mask[(bits)]; \
if( bit_idx <= 0 ) \
{ \
strm.jput(currval | ((unsigned)tempval >> -bit_idx)); \
bit_idx += 32; \
currval = bit_idx < 32 ? (tempval << bit_idx) : 0; \
} \
else \
currval |= (tempval << bit_idx)
#define JPUT_HUFF(val, table) \
code = table[(val) + 2]; \
JPUT_BITS(code >> 8, (int)(code & 255))
// encode data
for( y = 0; y < height; y += y_step, data += y_step*step )
{
for( x = 0; x < width; x += x_step )
{
int x_limit = x_step;
int y_limit = y_step;
const uchar* pix_data = data + x*input_channels;
short* Y_data = block[0];
if( x + x_limit > width ) x_limit = width - x;
if( y + y_limit > height ) y_limit = height - y;
memset( block, 0, block_count*64*sizeof(block[0][0]));
int x_scale = channels > 1 ? 2 : 1, y_scale = x_scale;
int Y_step = x_scale*8;
if( channels > 1 )
{
short* UV_data = block[luma_count];
// double t = (double)cv::getTickCount();
if( colorspace == COLORSPACE_YUV444P && y_limit == 16 && x_limit == 16 )
{
for( i = 0; i < y_limit; i += 2, pix_data += step*2, Y_data += Y_step*2, UV_data += UV_step )
@@ -1388,7 +1484,6 @@ void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspa
}
}
// total_cvt += (double)cv::getTickCount() - t;
}
else
{
@@ -1398,6 +1493,161 @@ void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspa
Y_data[j] = (short)(pix_data[j]*4 - 128*4);
}
}
}
class MjpegEncoder : public ParallelLoopBody
{
public:
MjpegEncoder(int _height,
int _width,
int _step,
const uchar* _data,
int _input_channels,
int _channels,
int _colorspace,
unsigned (&_huff_dc_tab)[2][16],
unsigned (&_huff_ac_tab)[2][256],
short (&_fdct_qtab)[2][64],
uchar* _cat_table,
mjpeg_buffer_keeper& _buffer_list,
double nstripes
) :
m_buffer_list(_buffer_list),
height(_height),
width(_width),
step(_step),
in_data(_data),
input_channels(_input_channels),
channels(_channels),
colorspace(_colorspace),
huff_dc_tab(_huff_dc_tab),
huff_ac_tab(_huff_ac_tab),
fdct_qtab(_fdct_qtab),
cat_table(_cat_table)
{
//empirically found value: if the number of pixels is less than this, there is no point in parallelizing
const int min_pixels_count = 96*96;
stripes_count = 1;
if(nstripes < 0)
{
if(height*width > min_pixels_count)
{
stripes_count = 4;
}
}
else
{
stripes_count = cvCeil(nstripes);
}
int y_scale = channels > 1 ? 2 : 1;
int y_step = y_scale * 8;
int max_stripes = (height - 1)/y_step + 1;
stripes_count = std::min(stripes_count, max_stripes);
m_buffer_list.allocate_buffers(stripes_count, (height*width*2)/stripes_count);
}
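A worked numeric pass through the constructor logic above, for a color 640x480 frame with nstripes = -1 (auto): the frame exceeds 96*96 = 9216 pixels, so stripes_count becomes 4; a color MCU row is y_step = 16 lines, so at most (480 - 1)/16 + 1 = 30 stripes are usable, and min(4, 30) leaves 4. Each stripe then gets a scratch buffer of (480*640*2)/4 = 153600 words, a generous upper bound on its share of the encoded frame.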
void operator()( const cv::Range& range ) const
{
const int CAT_TAB_SIZE = 4096;
unsigned code = 0;
#define JPUT_BITS(val, bits) output_buffer.put(val, bits)
#define JPUT_HUFF(val, table) \
code = table[(val) + 2]; \
JPUT_BITS(code >> 8, (int)(code & 255))
int x, y;
int i, j;
short buffer[4096];
int x_scale = channels > 1 ? 2 : 1, y_scale = x_scale;
int dc_pred[] = { 0, 0, 0 };
int x_step = x_scale * 8;
int y_step = y_scale * 8;
short block[6][64];
int luma_count = x_scale*y_scale;
int block_count = luma_count + channels - 1;
int u_plane_ofs = step*height;
int v_plane_ofs = u_plane_ofs + step*height;
const uchar* data = in_data;
const uchar* init_data = data;
int num_steps = (height - 1)/y_step + 1;
//if this is not the first stripe we need to recompute dc_pred from the previous stripe's last MCU row
if(range.start > 0)
{
y = y_step*int(num_steps*range.start/stripes_count - 1);
data = init_data + y*step;
for( x = 0; x < width; x += x_step )
{
int x_limit = x_step;
int y_limit = y_step;
const uchar* pix_data = data + x*input_channels;
short* Y_data = block[0];
short* UV_data = block[luma_count];
if( x + x_limit > width ) x_limit = width - x;
if( y + y_limit > height ) y_limit = height - y;
memset( block, 0, block_count*64*sizeof(block[0][0]));
convertToYUV(colorspace, channels, input_channels, UV_data, Y_data, pix_data, y_limit, x_limit, step, u_plane_ofs, v_plane_ofs);
for( i = 0; i < block_count; i++ )
{
int is_chroma = i >= luma_count;
int src_step = x_scale * 8;
const short* src_ptr = block[i & -2] + (i & 1)*8;
aan_fdct8x8( src_ptr, buffer, src_step, fdct_qtab[is_chroma] );
j = is_chroma + (i > luma_count);
dc_pred[j] = buffer[0];
}
}
}
for(int k = range.start; k < range.end; ++k)
{
mjpeg_buffer& output_buffer = m_buffer_list[k];
output_buffer.clear();
int y_min = y_step*int(num_steps*k/stripes_count);
int y_max = y_step*int(num_steps*(k+1)/stripes_count);
if(k == stripes_count - 1)
{
y_max = height;
}
data = init_data + y_min*step;
for( y = y_min; y < y_max; y += y_step, data += y_step*step )
{
for( x = 0; x < width; x += x_step )
{
int x_limit = x_step;
int y_limit = y_step;
const uchar* pix_data = data + x*input_channels;
short* Y_data = block[0];
short* UV_data = block[luma_count];
if( x + x_limit > width ) x_limit = width - x;
if( y + y_limit > height ) y_limit = height - y;
memset( block, 0, block_count*64*sizeof(block[0][0]));
convertToYUV(colorspace, channels, input_channels, UV_data, Y_data, pix_data, y_limit, x_limit, step, u_plane_ofs, v_plane_ofs);
for( i = 0; i < block_count; i++ )
{
@@ -1407,9 +1657,7 @@ void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspa
const short* src_ptr = block[i & -2] + (i & 1)*8;
const unsigned* htable = huff_ac_tab[is_chroma];
//double t = (double)cv::getTickCount();
aan_fdct8x8( src_ptr, buffer, src_step, fdct_qtab[is_chroma] );
//total_dct += (double)cv::getTickCount() - t;
j = is_chroma + (i > luma_count);
val = buffer[0] - dc_pred[j];
@@ -1457,13 +1705,183 @@ void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspa
}
}
}
}
}
// Flush
strm.jflush(currval, bit_idx);
cv::Range getRange()
{
return cv::Range(0, stripes_count);
}
double getNStripes()
{
return stripes_count;
}
mjpeg_buffer_keeper& m_buffer_list;
private:
MjpegEncoder& operator=( const MjpegEncoder & ) { return *this; }
const int height;
const int width;
const int step;
const uchar* in_data;
const int input_channels;
const int channels;
const int colorspace;
const unsigned (&huff_dc_tab)[2][16];
const unsigned (&huff_ac_tab)[2][256];
const short (&fdct_qtab)[2][64];
const uchar* cat_table;
int stripes_count;
};
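One subtlety in this class deserves a note: JPEG DC coefficients are DPCM-coded, i.e. each block's Huffman code encodes the difference between its DC value and the previous block's. That is why operator() re-runs the color conversion and forward DCT over the MCU row just before its stripe when range.start > 0: it is the only way to reconstruct dc_pred without a serial dependency between stripes. A toy illustration of the differencing (DC values are made up):

#include <cstdio>

int main()
{
    short dc[] = { 100, 104, 104, 98 };   // quantized DC values of consecutive blocks
    int dc_pred = 0;                      // starts at 0 at the beginning of a scan
    for (int i = 0; i < 4; ++i)
    {
        int diff = dc[i] - dc_pred;       // this difference is what gets Huffman-coded
        dc_pred = dc[i];
        std::printf("diff = %d\n", diff); // 100, 4, 0, -6
    }
    return 0;
}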
void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspace, int input_channels )
{
//double total_cvt = 0, total_dct = 0;
static bool init_cat_table = false;
const int CAT_TAB_SIZE = 4096;
static uchar cat_table[CAT_TAB_SIZE*2+1];
if( !init_cat_table )
{
for( int i = -CAT_TAB_SIZE; i <= CAT_TAB_SIZE; i++ )
{
Cv32suf a;
a.f = (float)i;
cat_table[i+CAT_TAB_SIZE] = ((a.i >> 23) & 255) - (126 & (i ? -1 : 0));
}
init_cat_table = true;
}
//double total_dct = 0, total_cvt = 0;
CV_Assert( data && width > 0 && height > 0 );
// encode the header and tables
// for each mcu:
// convert rgb to yuv with downsampling (if color).
// for every block:
// calc dct and quantize
// encode block.
int i, j;
const int max_quality = 12;
short fdct_qtab[2][64];
unsigned huff_dc_tab[2][16];
unsigned huff_ac_tab[2][256];
int x_scale = channels > 1 ? 2 : 1, y_scale = x_scale;
short buffer[4096];
int* hbuffer = (int*)buffer;
int luma_count = x_scale*y_scale;
double _quality = quality*0.01*max_quality;
if( _quality < 1. ) _quality = 1.;
if( _quality > max_quality ) _quality = max_quality;
double inv_quality = 1./_quality;
// Encode header
strm.putBytes( (const uchar*)jpegHeader, sizeof(jpegHeader) - 1 );
// Encode quantization tables
for( i = 0; i < (channels > 1 ? 2 : 1); i++ )
{
const uchar* qtable = i == 0 ? jpegTableK1_T : jpegTableK2_T;
int chroma_scale = i > 0 ? luma_count : 1;
strm.jputShort( 0xffdb ); // DQT marker
strm.jputShort( 2 + 65*1 ); // put single qtable
strm.putByte( 0*16 + i ); // 8-bit table
// put coefficients
for( j = 0; j < 64; j++ )
{
int idx = zigzag[j];
int qval = cvRound(qtable[idx]*inv_quality);
if( qval < 1 )
qval = 1;
if( qval > 255 )
qval = 255;
fdct_qtab[i][idx] = (short)(cvRound((1 << (postshift + 11)))/
(qval*chroma_scale*idct_prescale[idx]));
strm.putByte( qval );
}
}
// Encode huffman tables
for( i = 0; i < (channels > 1 ? 4 : 2); i++ )
{
const uchar* htable = i == 0 ? jpegTableK3 : i == 1 ? jpegTableK5 :
i == 2 ? jpegTableK4 : jpegTableK6;
int is_ac_tab = i & 1;
int idx = i >= 2;
int tableSize = 16 + (is_ac_tab ? 162 : 12);
strm.jputShort( 0xFFC4 ); // DHT marker
strm.jputShort( 3 + tableSize ); // define one huffman table
strm.putByte( is_ac_tab*16 + idx ); // put DC/AC flag and table index
strm.putBytes( htable, tableSize ); // put table
BitStream::createEncodeHuffmanTable( BitStream::createSourceHuffmanTable(
htable, hbuffer, 16, 9 ), is_ac_tab ? huff_ac_tab[idx] :
huff_dc_tab[idx], is_ac_tab ? 256 : 16 );
}
// put frame header
strm.jputShort( 0xFFC0 ); // SOF0 marker
strm.jputShort( 8 + 3*channels ); // length of frame header
strm.putByte( 8 ); // sample precision
strm.jputShort( height );
strm.jputShort( width );
strm.putByte( channels ); // number of components
for( i = 0; i < channels; i++ )
{
strm.putByte( i + 1 ); // (i+1)-th component id (Y,U or V)
if( i == 0 )
strm.putByte(x_scale*16 + y_scale); // chroma scale factors
else
strm.putByte(1*16 + 1);
strm.putByte( i > 0 ); // quantization table idx
}
// put scan header
strm.jputShort( 0xFFDA ); // SOS marker
strm.jputShort( 6 + 2*channels ); // length of scan header
strm.putByte( channels ); // number of components in the scan
for( i = 0; i < channels; i++ )
{
strm.putByte( i+1 ); // component id
strm.putByte( (i>0)*16 + (i>0) );// selection of DC & AC tables
}
strm.jputShort(0*256 + 63); // start and end of spectral selection - for
// sequential DCT start is 0 and end is 63
strm.putByte( 0 ); // successive approximation bit position
// high & low - (0,0) for sequential DCT
buffers_list.reset();
MjpegEncoder parallel_encoder(height, width, step, data, input_channels, channels, colorspace, huff_dc_tab, huff_ac_tab, fdct_qtab, cat_table, buffers_list, nstripes);
cv::parallel_for_(parallel_encoder.getRange(), parallel_encoder, parallel_encoder.getNStripes());
//std::vector<unsigned>& v = parallel_encoder.m_buffer_list.get_data();
unsigned* v = buffers_list.get_data();
unsigned last_data_elem = buffers_list.get_data_size() - 1;
for(unsigned k = 0; k < last_data_elem; ++k)
{
strm.jput(v[k]);
}
strm.jflush(v[last_data_elem], 32 - buffers_list.get_last_bit_len());
strm.jputShort( 0xFFD9 ); // EOI marker
/*printf("total dct = %.1fms, total cvt = %.1fms\n",
total_dct*1000./cv::getTickFrequency(),
total_cvt*1000./cv::getTickFrequency());*/
size_t pos = strm.getPos();
size_t pos1 = (pos + 3) & ~3;
for( ; pos < pos1; pos++ )