avfilter/af_silenceremove: add peak detector

Signed-off-by: Paul B Mahol <onemda@gmail.com>
This commit is contained in:
Paul B Mahol 2015-12-27 20:45:46 +01:00
parent 3215342121
commit 1809894b2a
2 changed files with 77 additions and 33 deletions

View File

@ -2969,6 +2969,10 @@ at the beginning of each period of silence.
For example, if you want to remove long pauses between words but do not want For example, if you want to remove long pauses between words but do not want
to remove the pauses completely. Default value is @code{0}. to remove the pauses completely. Default value is @code{0}.
@item detection
Set how silence is detected. Can be @code{rms} or @code{peak}. The latter is
faster and works better with digital silence, which is exactly 0.
Default value is @code{rms}.
@end table @end table
@subsection Examples @subsection Examples

View File

@ -65,11 +65,15 @@ typedef struct SilenceRemoveContext {
double *window_current; double *window_current;
double *window_end; double *window_end;
int window_size; int window_size;
double rms_sum; double sum;
int leave_silence; int leave_silence;
int restart; int restart;
int64_t next_pts; int64_t next_pts;
int detection;
void (*update)(struct SilenceRemoveContext *s, double sample);
double(*compute)(struct SilenceRemoveContext *s, double sample);
} SilenceRemoveContext; } SilenceRemoveContext;
#define OFFSET(x) offsetof(SilenceRemoveContext, x) #define OFFSET(x) offsetof(SilenceRemoveContext, x)
@ -82,11 +86,58 @@ static const AVOption silenceremove_options[] = {
{ "stop_duration", NULL, OFFSET(stop_duration), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 9000, FLAGS }, { "stop_duration", NULL, OFFSET(stop_duration), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 9000, FLAGS },
{ "stop_threshold", NULL, OFFSET(stop_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS }, { "stop_threshold", NULL, OFFSET(stop_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS },
{ "leave_silence", NULL, OFFSET(leave_silence), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS }, { "leave_silence", NULL, OFFSET(leave_silence), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
{ "detection", NULL, OFFSET(detection), AV_OPT_TYPE_INT, {.i64=1}, 0, 1, FLAGS, "detection" },
{ "peak", 0, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "detection" },
{ "rms", 0, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "detection" },
{ NULL } { NULL }
}; };
AVFILTER_DEFINE_CLASS(silenceremove); AVFILTER_DEFINE_CLASS(silenceremove);
/**
 * Preview the detector level that admitting one more sample would produce.
 *
 * Operates on a local copy of the running sum: the value held in the window
 * slot that would be overwritten is subtracted, the magnitude of the new
 * sample is added, and the total is divided by the window size. The context
 * is only read here, never modified.
 *
 * @param s      filter context providing the running sum, the current window
 *               slot and the window size
 * @param sample candidate input sample
 * @return the adjusted sum divided by the window size (a mean of magnitudes,
 *         provided the window holds the magnitudes stored by update_peak())
 */
static double compute_peak(SilenceRemoveContext *s, double sample)
{
    const double evicted  = *s->window_current;
    const double incoming = fabs(sample);
    double level = s->sum - evicted;

    level += incoming;
    return level / s->window_size;
}
/**
 * Commit one sample to the sliding window of the peak detector.
 *
 * The magnitude of the sample replaces the value in the current window slot,
 * the running sum is adjusted by the difference, and the cursor moves to the
 * next slot, wrapping back to the start of the window once it reaches the end.
 *
 * @param s      filter context whose sum, window slot and cursor are updated
 * @param sample input sample to record
 */
static void update_peak(SilenceRemoveContext *s, double sample)
{
    double *slot = s->window_current;
    const double magnitude = fabs(sample);

    /* Retire the old contribution before installing the new one. */
    s->sum -= *slot;
    *slot   = magnitude;
    s->sum += magnitude;

    slot++;
    s->window_current = (slot >= s->window_end) ? s->window : slot;
}
/**
 * Preview the RMS level that admitting one more sample would produce.
 *
 * Operates on a local copy of the running sum: the value held in the window
 * slot that would be overwritten is subtracted and the square of the new
 * sample is added. The context is only read here, never modified.
 *
 * @param s      filter context providing the running sum, the current window
 *               slot and the window size
 * @param sample candidate input sample
 * @return square root of the adjusted sum divided by the window size
 */
static double compute_rms(SilenceRemoveContext *s, double sample)
{
    const double evicted = *s->window_current;
    const double squared = sample * sample;
    double total = s->sum - evicted;

    total += squared;
    return sqrt(total / s->window_size);
}
/**
 * Commit one sample to the sliding window of the RMS detector.
 *
 * The squared sample replaces the value in the current window slot, the
 * running sum is adjusted by the difference, and the cursor moves to the next
 * slot, wrapping back to the start of the window once it reaches the end.
 *
 * @param s      filter context whose sum, window slot and cursor are updated
 * @param sample input sample to record
 */
static void update_rms(SilenceRemoveContext *s, double sample)
{
    double *slot = s->window_current;
    const double squared = sample * sample;

    /* Retire the old contribution before installing the new one. */
    s->sum -= *slot;
    *slot   = squared;
    s->sum += squared;

    slot++;
    s->window_current = (slot >= s->window_end) ? s->window : slot;
}
static av_cold int init(AVFilterContext *ctx) static av_cold int init(AVFilterContext *ctx)
{ {
SilenceRemoveContext *s = ctx->priv; SilenceRemoveContext *s = ctx->priv;
@ -96,16 +147,27 @@ static av_cold int init(AVFilterContext *ctx)
s->restart = 1; s->restart = 1;
} }
switch (s->detection) {
case 0:
s->update = update_peak;
s->compute = compute_peak;
break;
case 1:
s->update = update_rms;
s->compute = compute_rms;
break;
};
return 0; return 0;
} }
static void clear_rms(SilenceRemoveContext *s) static void clear_window(SilenceRemoveContext *s)
{ {
memset(s->window, 0, s->window_size * sizeof(*s->window)); memset(s->window, 0, s->window_size * sizeof(*s->window));
s->window_current = s->window; s->window_current = s->window;
s->window_end = s->window + s->window_size; s->window_end = s->window + s->window_size;
s->rms_sum = 0; s->sum = 0;
} }
static int config_input(AVFilterLink *inlink) static int config_input(AVFilterLink *inlink)
@ -118,7 +180,7 @@ static int config_input(AVFilterLink *inlink)
if (!s->window) if (!s->window)
return AVERROR(ENOMEM); return AVERROR(ENOMEM);
clear_rms(s); clear_window(s);
s->start_duration = av_rescale(s->start_duration, inlink->sample_rate, s->start_duration = av_rescale(s->start_duration, inlink->sample_rate,
AV_TIME_BASE); AV_TIME_BASE);
@ -153,28 +215,6 @@ static int config_input(AVFilterLink *inlink)
return 0; return 0;
} }
/**
 * Preview the RMS level that admitting one more sample would produce.
 *
 * Operates on a local copy of the running sum of squares: the value held in
 * the window slot that would be overwritten is subtracted and the square of
 * the new sample is added. The context is only read here, never modified.
 *
 * @param s      filter context providing rms_sum, the current window slot and
 *               the window size
 * @param sample candidate input sample
 * @return square root of the adjusted sum divided by the window size
 */
static double compute_rms(SilenceRemoveContext *s, double sample)
{
    const double evicted = *s->window_current;
    const double squared = sample * sample;
    double total = s->rms_sum - evicted;

    total += squared;
    return sqrt(total / s->window_size);
}
/**
 * Commit one sample to the sliding window of the RMS detector.
 *
 * The squared sample replaces the value in the current window slot, rms_sum
 * is adjusted by the difference, and the cursor moves to the next slot,
 * wrapping back to the start of the window once it reaches the end.
 *
 * @param s      filter context whose rms_sum, window slot and cursor are
 *               updated
 * @param sample input sample to record
 */
static void update_rms(SilenceRemoveContext *s, double sample)
{
    double *slot = s->window_current;
    const double squared = sample * sample;

    /* Retire the old contribution before installing the new one. */
    s->rms_sum -= *slot;
    *slot       = squared;
    s->rms_sum += squared;

    slot++;
    s->window_current = (slot >= s->window_end) ? s->window : slot;
}
static void flush(AVFrame *out, AVFilterLink *outlink, static void flush(AVFrame *out, AVFilterLink *outlink,
int *nb_samples_written, int *ret) int *nb_samples_written, int *ret)
{ {
@ -209,12 +249,12 @@ silence_trim:
for (i = 0; i < nbs; i++) { for (i = 0; i < nbs; i++) {
threshold = 0; threshold = 0;
for (j = 0; j < inlink->channels; j++) { for (j = 0; j < inlink->channels; j++) {
threshold |= compute_rms(s, ibuf[j]) > s->start_threshold; threshold |= s->compute(s, ibuf[j]) > s->start_threshold;
} }
if (threshold) { if (threshold) {
for (j = 0; j < inlink->channels; j++) { for (j = 0; j < inlink->channels; j++) {
update_rms(s, *ibuf); s->update(s, *ibuf);
s->start_holdoff[s->start_holdoff_end++] = *ibuf++; s->start_holdoff[s->start_holdoff_end++] = *ibuf++;
nb_samples_read++; nb_samples_read++;
} }
@ -232,7 +272,7 @@ silence_trim:
s->start_holdoff_end = 0; s->start_holdoff_end = 0;
for (j = 0; j < inlink->channels; j++) for (j = 0; j < inlink->channels; j++)
update_rms(s, ibuf[j]); s->update(s, ibuf[j]);
ibuf += inlink->channels; ibuf += inlink->channels;
nb_samples_read += inlink->channels; nb_samples_read += inlink->channels;
@ -284,7 +324,7 @@ silence_copy:
for (i = 0; i < nbs; i++) { for (i = 0; i < nbs; i++) {
threshold = 1; threshold = 1;
for (j = 0; j < inlink->channels; j++) for (j = 0; j < inlink->channels; j++)
threshold &= compute_rms(s, ibuf[j]) > s->stop_threshold; threshold &= s->compute(s, ibuf[j]) > s->stop_threshold;
if (threshold && s->stop_holdoff_end && !s->leave_silence) { if (threshold && s->stop_holdoff_end && !s->leave_silence) {
s->mode = SILENCE_COPY_FLUSH; s->mode = SILENCE_COPY_FLUSH;
@ -292,14 +332,14 @@ silence_copy:
goto silence_copy_flush; goto silence_copy_flush;
} else if (threshold) { } else if (threshold) {
for (j = 0; j < inlink->channels; j++) { for (j = 0; j < inlink->channels; j++) {
update_rms(s, *ibuf); s->update(s, *ibuf);
*obuf++ = *ibuf++; *obuf++ = *ibuf++;
nb_samples_read++; nb_samples_read++;
nb_samples_written++; nb_samples_written++;
} }
} else if (!threshold) { } else if (!threshold) {
for (j = 0; j < inlink->channels; j++) { for (j = 0; j < inlink->channels; j++) {
update_rms(s, *ibuf); s->update(s, *ibuf);
if (s->leave_silence) { if (s->leave_silence) {
*obuf++ = *ibuf; *obuf++ = *ibuf;
nb_samples_written++; nb_samples_written++;
@ -323,7 +363,7 @@ silence_copy:
s->start_found_periods = 0; s->start_found_periods = 0;
s->start_holdoff_offset = 0; s->start_holdoff_offset = 0;
s->start_holdoff_end = 0; s->start_holdoff_end = 0;
clear_rms(s); clear_window(s);
s->mode = SILENCE_TRIM; s->mode = SILENCE_TRIM;
flush(out, outlink, &nb_samples_written, &ret); flush(out, outlink, &nb_samples_written, &ret);
goto silence_trim; goto silence_trim;