Merge pull request #4164 from kalistratovag:parallel_mjpeg
This commit is contained in:
commit
c81b064727
@ -80,25 +80,31 @@ struct work_load
|
||||
set(range, body, nstripes);
|
||||
}
|
||||
|
||||
void set(const cv::Range& range, const cv::ParallelLoopBody& body, int nstripes)
|
||||
void set(const cv::Range& range, const cv::ParallelLoopBody& body, unsigned int nstripes)
|
||||
{
|
||||
m_body = &body;
|
||||
m_range = ⦥
|
||||
m_nstripes = nstripes;
|
||||
m_blocks_count = ((m_range->end - m_range->start - 1)/m_nstripes) + 1;
|
||||
|
||||
//ensure that nstripes not larger than range length
|
||||
m_nstripes = std::min( unsigned(m_range->end - m_range->start) , nstripes);
|
||||
|
||||
m_block_size = ((m_range->end - m_range->start - 1)/m_nstripes) + 1;
|
||||
|
||||
//ensure that nstripes not larger than blocks count, so we would never go out of range
|
||||
m_nstripes = std::min(m_nstripes, unsigned(((m_range->end - m_range->start - 1)/m_block_size) + 1) );
|
||||
}
|
||||
|
||||
const cv::ParallelLoopBody* m_body;
|
||||
const cv::Range* m_range;
|
||||
int m_nstripes;
|
||||
unsigned int m_blocks_count;
|
||||
unsigned int m_nstripes;
|
||||
int m_block_size;
|
||||
|
||||
void clear()
|
||||
{
|
||||
m_body = 0;
|
||||
m_range = 0;
|
||||
m_nstripes = 0;
|
||||
m_blocks_count = 0;
|
||||
m_block_size = 0;
|
||||
}
|
||||
};
|
||||
|
||||
@ -331,10 +337,10 @@ void ForThread::execute()
|
||||
|
||||
work_load& load = m_parent->m_work_load;
|
||||
|
||||
while(m_current_pos < load.m_blocks_count)
|
||||
while(m_current_pos < load.m_nstripes)
|
||||
{
|
||||
int start = load.m_range->start + m_current_pos*load.m_nstripes;
|
||||
int end = std::min(start + load.m_nstripes, load.m_range->end);
|
||||
int start = load.m_range->start + m_current_pos*load.m_block_size;
|
||||
int end = std::min(start + load.m_block_size, load.m_range->end);
|
||||
|
||||
load.m_body->operator()(cv::Range(start, end));
|
||||
|
||||
@ -417,9 +423,11 @@ void ThreadManager::run(const cv::Range& range, const cv::ParallelLoopBody& body
|
||||
{
|
||||
if(initPool())
|
||||
{
|
||||
double min_stripes = double(range.end - range.start)/(4*m_threads.size());
|
||||
if(nstripes < 1) nstripes = 4*m_threads.size();
|
||||
|
||||
nstripes = std::max(nstripes, min_stripes);
|
||||
double max_stripes = 4*m_threads.size();
|
||||
|
||||
nstripes = std::min(nstripes, max_stripes);
|
||||
|
||||
pthread_mutex_lock(&m_manager_task_mutex);
|
||||
|
||||
@ -429,7 +437,7 @@ void ThreadManager::run(const cv::Range& range, const cv::ParallelLoopBody& body
|
||||
|
||||
m_task_complete = false;
|
||||
|
||||
m_work_load.set(range, body, std::ceil(nstripes));
|
||||
m_work_load.set(range, body, cvCeil(nstripes));
|
||||
|
||||
for(size_t i = 0; i < m_threads.size(); ++i)
|
||||
{
|
||||
|
@ -315,6 +315,7 @@ enum { CAP_INTELPERC_DEPTH_MAP = 0, // Each pixel is a 16-bit integ
|
||||
|
||||
enum { VIDEOWRITER_PROP_QUALITY = 1, // Quality (0..100%) of the videostream encoded
|
||||
VIDEOWRITER_PROP_FRAMEBYTES = 2, // (Read-only): Size of just encoded video frame
|
||||
VIDEOWRITER_PROP_NSTRIPES = 3 // Number of stripes for parallel encoding. -1 for auto detection
|
||||
};
|
||||
|
||||
// gPhoto2 properties, if propertyId is less than 0 then work on widget with that __additive inversed__ camera setting ID
|
||||
@ -610,6 +611,7 @@ public:
|
||||
|
||||
@param propId Property identifier. It can be one of the following:
|
||||
- **VIDEOWRITER_PROP_QUALITY** Quality (0..100%) of the videostream encoded. Can be adjusted dynamically in some codecs.
|
||||
- **VIDEOWRITER_PROP_NSTRIPES** Number of stripes for parallel encoding
|
||||
@param value Value of the property.
|
||||
*/
|
||||
CV_WRAP virtual bool set(int propId, double value);
|
||||
@ -619,6 +621,7 @@ public:
|
||||
@param propId Property identifier. It can be one of the following:
|
||||
- **VIDEOWRITER_PROP_QUALITY** Current quality of the encoded videostream.
|
||||
- **VIDEOWRITER_PROP_FRAMEBYTES** (Read-only) Size of just encoded video frame; note that the encoding order may be different from representation order.
|
||||
- **VIDEOWRITER_PROP_NSTRIPES** Number of stripes for parallel encoding
|
||||
|
||||
@note When querying a property that is not supported by the backend used by the VideoWriter
|
||||
class, value 0 is returned.
|
||||
|
@ -41,6 +41,7 @@
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include <vector>
|
||||
#include <deque>
|
||||
|
||||
#if CV_NEON
|
||||
#define WITH_NEON
|
||||
@ -350,14 +351,261 @@ protected:
|
||||
};
|
||||
|
||||
|
||||
class mjpeg_buffer
|
||||
{
|
||||
public:
|
||||
mjpeg_buffer()
|
||||
{
|
||||
reset();
|
||||
}
|
||||
|
||||
void resize(int size)
|
||||
{
|
||||
data.resize(size);
|
||||
}
|
||||
|
||||
void put(unsigned bits, int len)
|
||||
{
|
||||
if((m_pos == (data.size() - 1) && len > bits_free) || m_pos == data.size())
|
||||
{
|
||||
resize(int(2*data.size()));
|
||||
}
|
||||
|
||||
bits_free -= (len);
|
||||
unsigned int tempval = (bits) & bit_mask[(len)];
|
||||
|
||||
if( bits_free <= 0 )
|
||||
{
|
||||
data[m_pos] |= ((unsigned)tempval >> -bits_free);
|
||||
|
||||
bits_free += 32;
|
||||
++m_pos;
|
||||
data[m_pos] = bits_free < 32 ? (tempval << bits_free) : 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
data[m_pos] |= (tempval << bits_free);
|
||||
}
|
||||
}
|
||||
|
||||
void finish()
|
||||
{
|
||||
if(bits_free == 32)
|
||||
{
|
||||
bits_free = 0;
|
||||
m_data_len = m_pos;
|
||||
}
|
||||
else
|
||||
{
|
||||
m_data_len = m_pos + 1;
|
||||
}
|
||||
}
|
||||
|
||||
void reset()
|
||||
{
|
||||
bits_free = 32;
|
||||
m_pos = 0;
|
||||
m_data_len = 0;
|
||||
}
|
||||
|
||||
void clear()
|
||||
{
|
||||
//we need to clear only first element, the rest would be overwritten
|
||||
data[0] = 0;
|
||||
}
|
||||
|
||||
int get_bits_free()
|
||||
{
|
||||
return bits_free;
|
||||
}
|
||||
|
||||
unsigned* get_data()
|
||||
{
|
||||
return &data[0];
|
||||
}
|
||||
|
||||
unsigned get_len()
|
||||
{
|
||||
return m_data_len;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<unsigned> data;
|
||||
int bits_free;
|
||||
unsigned m_pos;
|
||||
bool m_is_full;
|
||||
unsigned m_data_len;
|
||||
};
|
||||
|
||||
|
||||
class mjpeg_buffer_keeper
|
||||
{
|
||||
public:
|
||||
mjpeg_buffer_keeper()
|
||||
{
|
||||
m_last_bit_len = 0;
|
||||
}
|
||||
|
||||
mjpeg_buffer& operator[](int i)
|
||||
{
|
||||
return m_buffer_list[i];
|
||||
}
|
||||
|
||||
void allocate_buffers(int count, int size)
|
||||
{
|
||||
for(int i = (int)m_buffer_list.size(); i < count; ++i)
|
||||
{
|
||||
m_buffer_list.push_back(mjpeg_buffer());
|
||||
m_buffer_list.back().resize(size);
|
||||
}
|
||||
}
|
||||
|
||||
unsigned* get_data()
|
||||
{
|
||||
//if there is only one buffer (single thread) there is no need to stack buffers
|
||||
if(m_buffer_list.size() == 1)
|
||||
{
|
||||
m_buffer_list[0].finish();
|
||||
|
||||
m_data_len = m_buffer_list[0].get_len();
|
||||
m_last_bit_len = m_buffer_list[0].get_bits_free() ? 32 - m_buffer_list[0].get_bits_free() : 0;
|
||||
|
||||
return m_buffer_list[0].get_data();
|
||||
}
|
||||
|
||||
allocate_output_buffer();
|
||||
|
||||
int bits = 0;
|
||||
unsigned currval = 0;
|
||||
m_data_len = 0;
|
||||
|
||||
for(unsigned j = 0; j < m_buffer_list.size(); ++j)
|
||||
{
|
||||
mjpeg_buffer& buffer = m_buffer_list[j];
|
||||
|
||||
//if no bit shift required we could use memcpy
|
||||
if(bits == 0)
|
||||
{
|
||||
size_t current_pos = m_data_len;
|
||||
|
||||
if(buffer.get_bits_free() == 0)
|
||||
{
|
||||
memcpy(&m_output_buffer[current_pos], buffer.get_data(), sizeof(buffer.get_data()[0])*buffer.get_len());
|
||||
m_data_len += buffer.get_len();
|
||||
currval = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(&m_output_buffer[current_pos], buffer.get_data(), sizeof(buffer.get_data()[0])*(buffer.get_len() -1 ));
|
||||
m_data_len += buffer.get_len() - 1;
|
||||
currval = buffer.get_data()[buffer.get_len() - 1];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(unsigned i = 0; i < buffer.get_len() - 1; ++i)
|
||||
{
|
||||
if( bits <= 0 )
|
||||
{
|
||||
currval |= ((unsigned)buffer.get_data()[i] >> -bits);
|
||||
|
||||
m_output_buffer[m_data_len++] = currval;
|
||||
|
||||
currval = (bits < 0) ? (buffer.get_data()[i] << (bits + 32)) : 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
currval |= (buffer.get_data()[i] << bits);
|
||||
}
|
||||
}
|
||||
|
||||
currval |= ((unsigned)buffer.get_data()[buffer.get_len() - 1] >> -bits);
|
||||
|
||||
if( (buffer.get_bits_free() == 32 ? 0 : buffer.get_bits_free()) <= -bits)
|
||||
{
|
||||
m_output_buffer[m_data_len++] = currval;
|
||||
|
||||
currval = (bits < 0) ? (buffer.get_data()[buffer.get_len() - 1] << (bits + 32)) : 0;
|
||||
}
|
||||
}
|
||||
|
||||
bits += buffer.get_bits_free();
|
||||
|
||||
if(bits > 0)
|
||||
{
|
||||
bits -= 32;
|
||||
}
|
||||
}
|
||||
|
||||
//bits == 0 means that last element shouldn't be used.
|
||||
m_output_buffer[m_data_len++] = currval;
|
||||
|
||||
m_last_bit_len = -bits;
|
||||
|
||||
return &m_output_buffer[0];
|
||||
}
|
||||
|
||||
int get_last_bit_len()
|
||||
{
|
||||
return m_last_bit_len;
|
||||
}
|
||||
|
||||
int get_data_size()
|
||||
{
|
||||
return m_data_len;
|
||||
}
|
||||
|
||||
void reset()
|
||||
{
|
||||
m_last_bit_len = 0;
|
||||
for(unsigned i = 0; i < m_buffer_list.size(); ++i)
|
||||
{
|
||||
m_buffer_list[i].reset();
|
||||
}
|
||||
|
||||
//there is no need to erase output buffer since it would be overwritten
|
||||
m_data_len = 0;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
void allocate_output_buffer()
|
||||
{
|
||||
unsigned total_size = 0;
|
||||
|
||||
for(unsigned i = 0; i < m_buffer_list.size(); ++i)
|
||||
{
|
||||
m_buffer_list[i].finish();
|
||||
total_size += m_buffer_list[i].get_len();
|
||||
}
|
||||
|
||||
if(total_size > m_output_buffer.size())
|
||||
{
|
||||
m_output_buffer.clear();
|
||||
m_output_buffer.resize(total_size);
|
||||
}
|
||||
}
|
||||
|
||||
std::deque<mjpeg_buffer> m_buffer_list;
|
||||
std::vector<unsigned> m_output_buffer;
|
||||
int m_data_len;
|
||||
int m_last_bit_len;
|
||||
};
|
||||
|
||||
class MotionJpegWriter : public IVideoWriter
|
||||
{
|
||||
public:
|
||||
MotionJpegWriter() { rawstream = false; }
|
||||
MotionJpegWriter()
|
||||
{
|
||||
rawstream = false;
|
||||
nstripes = -1;
|
||||
}
|
||||
|
||||
MotionJpegWriter(const String& filename, double fps, Size size, bool iscolor)
|
||||
{
|
||||
rawstream = false;
|
||||
open(filename, fps, size, iscolor);
|
||||
nstripes = -1;
|
||||
}
|
||||
~MotionJpegWriter() { close(); }
|
||||
|
||||
@ -616,6 +864,8 @@ public:
|
||||
return quality;
|
||||
if( propId == VIDEOWRITER_PROP_FRAMEBYTES )
|
||||
return frameSize.empty() ? 0. : (double)frameSize.back();
|
||||
if( propId == VIDEOWRITER_PROP_NSTRIPES )
|
||||
return nstripes;
|
||||
return 0.;
|
||||
}
|
||||
|
||||
@ -626,6 +876,13 @@ public:
|
||||
quality = value;
|
||||
return true;
|
||||
}
|
||||
|
||||
if( propId == VIDEOWRITER_PROP_NSTRIPES)
|
||||
{
|
||||
nstripes = value;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -638,6 +895,8 @@ protected:
|
||||
size_t moviPointer;
|
||||
std::vector<size_t> frameOffset, frameSize, AVIChunkSizeIndex, frameNumIndexes;
|
||||
bool rawstream;
|
||||
mjpeg_buffer_keeper buffers_list;
|
||||
double nstripes;
|
||||
|
||||
BitStream strm;
|
||||
};
|
||||
@ -1107,6 +1366,377 @@ static void aan_fdct8x8( const short *src, short *dst,
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
inline void convertToYUV(int colorspace, int channels, int input_channels, short* UV_data, short* Y_data, const uchar* pix_data, int y_limit, int x_limit, int step, int u_plane_ofs, int v_plane_ofs)
|
||||
{
|
||||
int i, j;
|
||||
const int UV_step = 16;
|
||||
int x_scale = channels > 1 ? 2 : 1, y_scale = x_scale;
|
||||
int Y_step = x_scale*8;
|
||||
|
||||
if( channels > 1 )
|
||||
{
|
||||
if( colorspace == COLORSPACE_YUV444P && y_limit == 16 && x_limit == 16 )
|
||||
{
|
||||
for( i = 0; i < y_limit; i += 2, pix_data += step*2, Y_data += Y_step*2, UV_data += UV_step )
|
||||
{
|
||||
#ifdef WITH_NEON
|
||||
{
|
||||
uint16x8_t masklo = vdupq_n_u16(255);
|
||||
uint16x8_t lane = vld1q_u16((unsigned short*)(pix_data+v_plane_ofs));
|
||||
uint16x8_t t1 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
|
||||
lane = vld1q_u16((unsigned short*)(pix_data + v_plane_ofs + step));
|
||||
uint16x8_t t2 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
|
||||
t1 = vaddq_u16(t1, t2);
|
||||
vst1q_s16(UV_data, vsubq_s16(vreinterpretq_s16_u16(t1), vdupq_n_s16(128*4)));
|
||||
|
||||
lane = vld1q_u16((unsigned short*)(pix_data+u_plane_ofs));
|
||||
t1 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
|
||||
lane = vld1q_u16((unsigned short*)(pix_data + u_plane_ofs + step));
|
||||
t2 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
|
||||
t1 = vaddq_u16(t1, t2);
|
||||
vst1q_s16(UV_data + 8, vsubq_s16(vreinterpretq_s16_u16(t1), vdupq_n_s16(128*4)));
|
||||
}
|
||||
|
||||
{
|
||||
int16x8_t lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data)));
|
||||
int16x8_t delta = vdupq_n_s16(128);
|
||||
lane = vsubq_s16(lane, delta);
|
||||
vst1q_s16(Y_data, lane);
|
||||
|
||||
lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data+8)));
|
||||
lane = vsubq_s16(lane, delta);
|
||||
vst1q_s16(Y_data + 8, lane);
|
||||
|
||||
lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data+step)));
|
||||
lane = vsubq_s16(lane, delta);
|
||||
vst1q_s16(Y_data+Y_step, lane);
|
||||
|
||||
lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data + step + 8)));
|
||||
lane = vsubq_s16(lane, delta);
|
||||
vst1q_s16(Y_data+Y_step + 8, lane);
|
||||
}
|
||||
#else
|
||||
for( j = 0; j < x_limit; j += 2, pix_data += 2 )
|
||||
{
|
||||
Y_data[j] = pix_data[0] - 128;
|
||||
Y_data[j+1] = pix_data[1] - 128;
|
||||
Y_data[j+Y_step] = pix_data[step] - 128;
|
||||
Y_data[j+Y_step+1] = pix_data[step+1] - 128;
|
||||
|
||||
UV_data[j>>1] = pix_data[v_plane_ofs] + pix_data[v_plane_ofs+1] +
|
||||
pix_data[v_plane_ofs+step] + pix_data[v_plane_ofs+step+1] - 128*4;
|
||||
UV_data[(j>>1)+8] = pix_data[u_plane_ofs] + pix_data[u_plane_ofs+1] +
|
||||
pix_data[u_plane_ofs+step] + pix_data[u_plane_ofs+step+1] - 128*4;
|
||||
|
||||
}
|
||||
|
||||
pix_data -= x_limit*input_channels;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for( i = 0; i < y_limit; i++, pix_data += step, Y_data += Y_step )
|
||||
{
|
||||
for( j = 0; j < x_limit; j++, pix_data += input_channels )
|
||||
{
|
||||
int Y, U, V;
|
||||
|
||||
if( colorspace == COLORSPACE_BGR )
|
||||
{
|
||||
int r = pix_data[2];
|
||||
int g = pix_data[1];
|
||||
int b = pix_data[0];
|
||||
|
||||
Y = DCT_DESCALE( r*y_r + g*y_g + b*y_b, fixc) - 128;
|
||||
U = DCT_DESCALE( r*cb_r + g*cb_g + b*cb_b, fixc );
|
||||
V = DCT_DESCALE( r*cr_r + g*cr_g + b*cr_b, fixc );
|
||||
}
|
||||
else if( colorspace == COLORSPACE_RGBA )
|
||||
{
|
||||
int r = pix_data[0];
|
||||
int g = pix_data[1];
|
||||
int b = pix_data[2];
|
||||
|
||||
Y = DCT_DESCALE( r*y_r + g*y_g + b*y_b, fixc) - 128;
|
||||
U = DCT_DESCALE( r*cb_r + g*cb_g + b*cb_b, fixc );
|
||||
V = DCT_DESCALE( r*cr_r + g*cr_g + b*cr_b, fixc );
|
||||
}
|
||||
else
|
||||
{
|
||||
Y = pix_data[0] - 128;
|
||||
U = pix_data[v_plane_ofs] - 128;
|
||||
V = pix_data[u_plane_ofs] - 128;
|
||||
}
|
||||
|
||||
int j2 = j >> (x_scale - 1);
|
||||
Y_data[j] = (short)Y;
|
||||
UV_data[j2] = (short)(UV_data[j2] + U);
|
||||
UV_data[j2 + 8] = (short)(UV_data[j2 + 8] + V);
|
||||
}
|
||||
|
||||
pix_data -= x_limit*input_channels;
|
||||
if( ((i+1) & (y_scale - 1)) == 0 )
|
||||
{
|
||||
UV_data += UV_step;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
for( i = 0; i < y_limit; i++, pix_data += step, Y_data += Y_step )
|
||||
{
|
||||
for( j = 0; j < x_limit; j++ )
|
||||
Y_data[j] = (short)(pix_data[j]*4 - 128*4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class MjpegEncoder : public ParallelLoopBody
|
||||
{
|
||||
public:
|
||||
MjpegEncoder(int _height,
|
||||
int _width,
|
||||
int _step,
|
||||
const uchar* _data,
|
||||
int _input_channels,
|
||||
int _channels,
|
||||
int _colorspace,
|
||||
unsigned (&_huff_dc_tab)[2][16],
|
||||
unsigned (&_huff_ac_tab)[2][256],
|
||||
short (&_fdct_qtab)[2][64],
|
||||
uchar* _cat_table,
|
||||
mjpeg_buffer_keeper& _buffer_list,
|
||||
double nstripes
|
||||
) :
|
||||
m_buffer_list(_buffer_list),
|
||||
height(_height),
|
||||
width(_width),
|
||||
step(_step),
|
||||
in_data(_data),
|
||||
input_channels(_input_channels),
|
||||
channels(_channels),
|
||||
colorspace(_colorspace),
|
||||
huff_dc_tab(_huff_dc_tab),
|
||||
huff_ac_tab(_huff_ac_tab),
|
||||
fdct_qtab(_fdct_qtab),
|
||||
cat_table(_cat_table)
|
||||
{
|
||||
//empirically found value. if number of pixels is less than that value there is no sense to parallelize it.
|
||||
const int min_pixels_count = 96*96;
|
||||
|
||||
stripes_count = 1;
|
||||
|
||||
if(nstripes < 0)
|
||||
{
|
||||
if(height*width > min_pixels_count)
|
||||
{
|
||||
stripes_count = 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
stripes_count = cvCeil(nstripes);
|
||||
}
|
||||
|
||||
int y_scale = channels > 1 ? 2 : 1;
|
||||
int y_step = y_scale * 8;
|
||||
|
||||
int max_stripes = (height - 1)/y_step + 1;
|
||||
|
||||
stripes_count = std::min(stripes_count, max_stripes);
|
||||
|
||||
m_buffer_list.allocate_buffers(stripes_count, (height*width*2)/stripes_count);
|
||||
}
|
||||
|
||||
void operator()( const cv::Range& range ) const
|
||||
{
|
||||
const int CAT_TAB_SIZE = 4096;
|
||||
unsigned code = 0;
|
||||
|
||||
#define JPUT_BITS(val, bits) output_buffer.put(val, bits)
|
||||
|
||||
#define JPUT_HUFF(val, table) \
|
||||
code = table[(val) + 2]; \
|
||||
JPUT_BITS(code >> 8, (int)(code & 255))
|
||||
|
||||
int x, y;
|
||||
int i, j;
|
||||
|
||||
short buffer[4096];
|
||||
int x_scale = channels > 1 ? 2 : 1, y_scale = x_scale;
|
||||
int dc_pred[] = { 0, 0, 0 };
|
||||
int x_step = x_scale * 8;
|
||||
int y_step = y_scale * 8;
|
||||
short block[6][64];
|
||||
int luma_count = x_scale*y_scale;
|
||||
int block_count = luma_count + channels - 1;
|
||||
int u_plane_ofs = step*height;
|
||||
int v_plane_ofs = u_plane_ofs + step*height;
|
||||
const uchar* data = in_data;
|
||||
const uchar* init_data = data;
|
||||
|
||||
int num_steps = (height - 1)/y_step + 1;
|
||||
|
||||
//if this is not first stripe we need to calculate dc_pred from previous step
|
||||
if(range.start > 0)
|
||||
{
|
||||
y = y_step*int(num_steps*range.start/stripes_count - 1);
|
||||
data = init_data + y*step;
|
||||
|
||||
for( x = 0; x < width; x += x_step )
|
||||
{
|
||||
int x_limit = x_step;
|
||||
int y_limit = y_step;
|
||||
const uchar* pix_data = data + x*input_channels;
|
||||
short* Y_data = block[0];
|
||||
short* UV_data = block[luma_count];
|
||||
|
||||
if( x + x_limit > width ) x_limit = width - x;
|
||||
if( y + y_limit > height ) y_limit = height - y;
|
||||
|
||||
memset( block, 0, block_count*64*sizeof(block[0][0]));
|
||||
|
||||
convertToYUV(colorspace, channels, input_channels, UV_data, Y_data, pix_data, y_limit, x_limit, step, u_plane_ofs, v_plane_ofs);
|
||||
|
||||
for( i = 0; i < block_count; i++ )
|
||||
{
|
||||
int is_chroma = i >= luma_count;
|
||||
int src_step = x_scale * 8;
|
||||
const short* src_ptr = block[i & -2] + (i & 1)*8;
|
||||
|
||||
aan_fdct8x8( src_ptr, buffer, src_step, fdct_qtab[is_chroma] );
|
||||
|
||||
j = is_chroma + (i > luma_count);
|
||||
dc_pred[j] = buffer[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(int k = range.start; k < range.end; ++k)
|
||||
{
|
||||
mjpeg_buffer& output_buffer = m_buffer_list[k];
|
||||
output_buffer.clear();
|
||||
|
||||
int y_min = y_step*int(num_steps*k/stripes_count);
|
||||
int y_max = y_step*int(num_steps*(k+1)/stripes_count);
|
||||
|
||||
if(k == stripes_count - 1)
|
||||
{
|
||||
y_max = height;
|
||||
}
|
||||
|
||||
|
||||
data = init_data + y_min*step;
|
||||
|
||||
for( y = y_min; y < y_max; y += y_step, data += y_step*step )
|
||||
{
|
||||
for( x = 0; x < width; x += x_step )
|
||||
{
|
||||
int x_limit = x_step;
|
||||
int y_limit = y_step;
|
||||
const uchar* pix_data = data + x*input_channels;
|
||||
short* Y_data = block[0];
|
||||
short* UV_data = block[luma_count];
|
||||
|
||||
if( x + x_limit > width ) x_limit = width - x;
|
||||
if( y + y_limit > height ) y_limit = height - y;
|
||||
|
||||
memset( block, 0, block_count*64*sizeof(block[0][0]));
|
||||
|
||||
convertToYUV(colorspace, channels, input_channels, UV_data, Y_data, pix_data, y_limit, x_limit, step, u_plane_ofs, v_plane_ofs);
|
||||
|
||||
for( i = 0; i < block_count; i++ )
|
||||
{
|
||||
int is_chroma = i >= luma_count;
|
||||
int src_step = x_scale * 8;
|
||||
int run = 0, val;
|
||||
const short* src_ptr = block[i & -2] + (i & 1)*8;
|
||||
const unsigned* htable = huff_ac_tab[is_chroma];
|
||||
|
||||
aan_fdct8x8( src_ptr, buffer, src_step, fdct_qtab[is_chroma] );
|
||||
|
||||
j = is_chroma + (i > luma_count);
|
||||
val = buffer[0] - dc_pred[j];
|
||||
dc_pred[j] = buffer[0];
|
||||
|
||||
{
|
||||
int cat = cat_table[val + CAT_TAB_SIZE];
|
||||
|
||||
//CV_Assert( cat <= 11 );
|
||||
JPUT_HUFF( cat, huff_dc_tab[is_chroma] );
|
||||
JPUT_BITS( val - (val < 0 ? 1 : 0), cat );
|
||||
}
|
||||
|
||||
for( j = 1; j < 64; j++ )
|
||||
{
|
||||
val = buffer[zigzag[j]];
|
||||
|
||||
if( val == 0 )
|
||||
{
|
||||
run++;
|
||||
}
|
||||
else
|
||||
{
|
||||
while( run >= 16 )
|
||||
{
|
||||
JPUT_HUFF( 0xF0, htable ); // encode 16 zeros
|
||||
run -= 16;
|
||||
}
|
||||
|
||||
{
|
||||
int cat = cat_table[val + CAT_TAB_SIZE];
|
||||
//CV_Assert( cat <= 10 );
|
||||
JPUT_HUFF( cat + run*16, htable );
|
||||
JPUT_BITS( val - (val < 0 ? 1 : 0), cat );
|
||||
}
|
||||
|
||||
run = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if( run )
|
||||
{
|
||||
JPUT_HUFF( 0x00, htable ); // encode EOB
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cv::Range getRange()
|
||||
{
|
||||
return cv::Range(0, stripes_count);
|
||||
}
|
||||
|
||||
double getNStripes()
|
||||
{
|
||||
return stripes_count;
|
||||
}
|
||||
|
||||
mjpeg_buffer_keeper& m_buffer_list;
|
||||
private:
|
||||
|
||||
MjpegEncoder& operator=( const MjpegEncoder & ) { return *this; }
|
||||
|
||||
const int height;
|
||||
const int width;
|
||||
const int step;
|
||||
const uchar* in_data;
|
||||
const int input_channels;
|
||||
const int channels;
|
||||
const int colorspace;
|
||||
const unsigned (&huff_dc_tab)[2][16];
|
||||
const unsigned (&huff_ac_tab)[2][256];
|
||||
const short (&fdct_qtab)[2][64];
|
||||
const uchar* cat_table;
|
||||
int stripes_count;
|
||||
};
|
||||
|
||||
void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspace, int input_channels )
|
||||
{
|
||||
//double total_cvt = 0, total_dct = 0;
|
||||
@ -1133,7 +1763,6 @@ void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspa
|
||||
// for every block:
|
||||
// calc dct and quantize
|
||||
// encode block.
|
||||
int x, y;
|
||||
int i, j;
|
||||
const int max_quality = 12;
|
||||
short fdct_qtab[2][64];
|
||||
@ -1141,18 +1770,9 @@ void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspa
|
||||
unsigned huff_ac_tab[2][256];
|
||||
|
||||
int x_scale = channels > 1 ? 2 : 1, y_scale = x_scale;
|
||||
int dc_pred[] = { 0, 0, 0 };
|
||||
int x_step = x_scale * 8;
|
||||
int y_step = y_scale * 8;
|
||||
short block[6][64];
|
||||
short buffer[4096];
|
||||
int* hbuffer = (int*)buffer;
|
||||
int luma_count = x_scale*y_scale;
|
||||
int block_count = luma_count + channels - 1;
|
||||
int Y_step = x_scale*8;
|
||||
const int UV_step = 16;
|
||||
int u_plane_ofs = step*height;
|
||||
int v_plane_ofs = u_plane_ofs + step*height;
|
||||
double _quality = quality*0.01*max_quality;
|
||||
|
||||
if( _quality < 1. ) _quality = 1.;
|
||||
@ -1241,229 +1861,27 @@ void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspa
|
||||
|
||||
strm.putByte( 0 ); // successive approximation bit position
|
||||
// high & low - (0,0) for sequential DCT
|
||||
unsigned currval = 0, code = 0, tempval = 0;
|
||||
int bit_idx = 32;
|
||||
|
||||
#define JPUT_BITS(val, bits) \
|
||||
bit_idx -= (bits); \
|
||||
tempval = (val) & bit_mask[(bits)]; \
|
||||
if( bit_idx <= 0 ) \
|
||||
{ \
|
||||
strm.jput(currval | ((unsigned)tempval >> -bit_idx)); \
|
||||
bit_idx += 32; \
|
||||
currval = bit_idx < 32 ? (tempval << bit_idx) : 0; \
|
||||
} \
|
||||
else \
|
||||
currval |= (tempval << bit_idx)
|
||||
buffers_list.reset();
|
||||
|
||||
#define JPUT_HUFF(val, table) \
|
||||
code = table[(val) + 2]; \
|
||||
JPUT_BITS(code >> 8, (int)(code & 255))
|
||||
MjpegEncoder parallel_encoder(height, width, step, data, input_channels, channels, colorspace, huff_dc_tab, huff_ac_tab, fdct_qtab, cat_table, buffers_list, nstripes);
|
||||
|
||||
// encode data
|
||||
for( y = 0; y < height; y += y_step, data += y_step*step )
|
||||
cv::parallel_for_(parallel_encoder.getRange(), parallel_encoder, parallel_encoder.getNStripes());
|
||||
|
||||
//std::vector<unsigned>& v = parallel_encoder.m_buffer_list.get_data();
|
||||
unsigned* v = buffers_list.get_data();
|
||||
unsigned last_data_elem = buffers_list.get_data_size() - 1;
|
||||
|
||||
for(unsigned k = 0; k < last_data_elem; ++k)
|
||||
{
|
||||
for( x = 0; x < width; x += x_step )
|
||||
{
|
||||
int x_limit = x_step;
|
||||
int y_limit = y_step;
|
||||
const uchar* pix_data = data + x*input_channels;
|
||||
short* Y_data = block[0];
|
||||
|
||||
if( x + x_limit > width ) x_limit = width - x;
|
||||
if( y + y_limit > height ) y_limit = height - y;
|
||||
|
||||
memset( block, 0, block_count*64*sizeof(block[0][0]));
|
||||
|
||||
if( channels > 1 )
|
||||
{
|
||||
short* UV_data = block[luma_count];
|
||||
// double t = (double)cv::getTickCount();
|
||||
|
||||
if( colorspace == COLORSPACE_YUV444P && y_limit == 16 && x_limit == 16 )
|
||||
{
|
||||
for( i = 0; i < y_limit; i += 2, pix_data += step*2, Y_data += Y_step*2, UV_data += UV_step )
|
||||
{
|
||||
#ifdef WITH_NEON
|
||||
{
|
||||
uint16x8_t masklo = vdupq_n_u16(255);
|
||||
uint16x8_t lane = vld1q_u16((unsigned short*)(pix_data+v_plane_ofs));
|
||||
uint16x8_t t1 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
|
||||
lane = vld1q_u16((unsigned short*)(pix_data + v_plane_ofs + step));
|
||||
uint16x8_t t2 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
|
||||
t1 = vaddq_u16(t1, t2);
|
||||
vst1q_s16(UV_data, vsubq_s16(vreinterpretq_s16_u16(t1), vdupq_n_s16(128*4)));
|
||||
|
||||
lane = vld1q_u16((unsigned short*)(pix_data+u_plane_ofs));
|
||||
t1 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
|
||||
lane = vld1q_u16((unsigned short*)(pix_data + u_plane_ofs + step));
|
||||
t2 = vaddq_u16(vshrq_n_u16(lane, 8), vandq_u16(lane, masklo));
|
||||
t1 = vaddq_u16(t1, t2);
|
||||
vst1q_s16(UV_data + 8, vsubq_s16(vreinterpretq_s16_u16(t1), vdupq_n_s16(128*4)));
|
||||
}
|
||||
|
||||
{
|
||||
int16x8_t lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data)));
|
||||
int16x8_t delta = vdupq_n_s16(128);
|
||||
lane = vsubq_s16(lane, delta);
|
||||
vst1q_s16(Y_data, lane);
|
||||
|
||||
lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data+8)));
|
||||
lane = vsubq_s16(lane, delta);
|
||||
vst1q_s16(Y_data + 8, lane);
|
||||
|
||||
lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data+step)));
|
||||
lane = vsubq_s16(lane, delta);
|
||||
vst1q_s16(Y_data+Y_step, lane);
|
||||
|
||||
lane = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pix_data + step + 8)));
|
||||
lane = vsubq_s16(lane, delta);
|
||||
vst1q_s16(Y_data+Y_step + 8, lane);
|
||||
}
|
||||
#else
|
||||
for( j = 0; j < x_limit; j += 2, pix_data += 2 )
|
||||
{
|
||||
Y_data[j] = pix_data[0] - 128;
|
||||
Y_data[j+1] = pix_data[1] - 128;
|
||||
Y_data[j+Y_step] = pix_data[step] - 128;
|
||||
Y_data[j+Y_step+1] = pix_data[step+1] - 128;
|
||||
|
||||
UV_data[j>>1] = pix_data[v_plane_ofs] + pix_data[v_plane_ofs+1] +
|
||||
pix_data[v_plane_ofs+step] + pix_data[v_plane_ofs+step+1] - 128*4;
|
||||
UV_data[(j>>1)+8] = pix_data[u_plane_ofs] + pix_data[u_plane_ofs+1] +
|
||||
pix_data[u_plane_ofs+step] + pix_data[u_plane_ofs+step+1] - 128*4;
|
||||
|
||||
}
|
||||
|
||||
pix_data -= x_limit*input_channels;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for( i = 0; i < y_limit; i++, pix_data += step, Y_data += Y_step )
|
||||
{
|
||||
for( j = 0; j < x_limit; j++, pix_data += input_channels )
|
||||
{
|
||||
int Y, U, V;
|
||||
|
||||
if( colorspace == COLORSPACE_BGR )
|
||||
{
|
||||
int r = pix_data[2];
|
||||
int g = pix_data[1];
|
||||
int b = pix_data[0];
|
||||
|
||||
Y = DCT_DESCALE( r*y_r + g*y_g + b*y_b, fixc) - 128;
|
||||
U = DCT_DESCALE( r*cb_r + g*cb_g + b*cb_b, fixc );
|
||||
V = DCT_DESCALE( r*cr_r + g*cr_g + b*cr_b, fixc );
|
||||
}
|
||||
else if( colorspace == COLORSPACE_RGBA )
|
||||
{
|
||||
int r = pix_data[0];
|
||||
int g = pix_data[1];
|
||||
int b = pix_data[2];
|
||||
|
||||
Y = DCT_DESCALE( r*y_r + g*y_g + b*y_b, fixc) - 128;
|
||||
U = DCT_DESCALE( r*cb_r + g*cb_g + b*cb_b, fixc );
|
||||
V = DCT_DESCALE( r*cr_r + g*cr_g + b*cr_b, fixc );
|
||||
}
|
||||
else
|
||||
{
|
||||
Y = pix_data[0] - 128;
|
||||
U = pix_data[v_plane_ofs] - 128;
|
||||
V = pix_data[u_plane_ofs] - 128;
|
||||
}
|
||||
|
||||
int j2 = j >> (x_scale - 1);
|
||||
Y_data[j] = (short)Y;
|
||||
UV_data[j2] = (short)(UV_data[j2] + U);
|
||||
UV_data[j2 + 8] = (short)(UV_data[j2 + 8] + V);
|
||||
}
|
||||
|
||||
pix_data -= x_limit*input_channels;
|
||||
if( ((i+1) & (y_scale - 1)) == 0 )
|
||||
{
|
||||
UV_data += UV_step;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// total_cvt += (double)cv::getTickCount() - t;
|
||||
}
|
||||
else
|
||||
{
|
||||
for( i = 0; i < y_limit; i++, pix_data += step, Y_data += Y_step )
|
||||
{
|
||||
for( j = 0; j < x_limit; j++ )
|
||||
Y_data[j] = (short)(pix_data[j]*4 - 128*4);
|
||||
}
|
||||
}
|
||||
|
||||
for( i = 0; i < block_count; i++ )
|
||||
{
|
||||
int is_chroma = i >= luma_count;
|
||||
int src_step = x_scale * 8;
|
||||
int run = 0, val;
|
||||
const short* src_ptr = block[i & -2] + (i & 1)*8;
|
||||
const unsigned* htable = huff_ac_tab[is_chroma];
|
||||
|
||||
//double t = (double)cv::getTickCount();
|
||||
aan_fdct8x8( src_ptr, buffer, src_step, fdct_qtab[is_chroma] );
|
||||
//total_dct += (double)cv::getTickCount() - t;
|
||||
|
||||
j = is_chroma + (i > luma_count);
|
||||
val = buffer[0] - dc_pred[j];
|
||||
dc_pred[j] = buffer[0];
|
||||
|
||||
{
|
||||
int cat = cat_table[val + CAT_TAB_SIZE];
|
||||
|
||||
//CV_Assert( cat <= 11 );
|
||||
JPUT_HUFF( cat, huff_dc_tab[is_chroma] );
|
||||
JPUT_BITS( val - (val < 0 ? 1 : 0), cat );
|
||||
}
|
||||
|
||||
for( j = 1; j < 64; j++ )
|
||||
{
|
||||
val = buffer[zigzag[j]];
|
||||
|
||||
if( val == 0 )
|
||||
{
|
||||
run++;
|
||||
}
|
||||
else
|
||||
{
|
||||
while( run >= 16 )
|
||||
{
|
||||
JPUT_HUFF( 0xF0, htable ); // encode 16 zeros
|
||||
run -= 16;
|
||||
}
|
||||
|
||||
{
|
||||
int cat = cat_table[val + CAT_TAB_SIZE];
|
||||
//CV_Assert( cat <= 10 );
|
||||
JPUT_HUFF( cat + run*16, htable );
|
||||
JPUT_BITS( val - (val < 0 ? 1 : 0), cat );
|
||||
}
|
||||
|
||||
run = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if( run )
|
||||
{
|
||||
JPUT_HUFF( 0x00, htable ); // encode EOB
|
||||
}
|
||||
}
|
||||
}
|
||||
strm.jput(v[k]);
|
||||
}
|
||||
|
||||
// Flush
|
||||
strm.jflush(currval, bit_idx);
|
||||
strm.jflush(v[last_data_elem], 32 - buffers_list.get_last_bit_len());
|
||||
strm.jputShort( 0xFFD9 ); // EOI marker
|
||||
/*printf("total dct = %.1fms, total cvt = %.1fms\n",
|
||||
total_dct*1000./cv::getTickFrequency(),
|
||||
total_cvt*1000./cv::getTickFrequency());*/
|
||||
|
||||
size_t pos = strm.getPos();
|
||||
size_t pos1 = (pos + 3) & ~3;
|
||||
for( ; pos < pos1; pos++ )
|
||||
|
Loading…
Reference in New Issue
Block a user