FFmpeg filters 分析:af_volumedetect http://blog.tubumu.com/2021/12/03/ffmpeg-filters-af-volumedetect/

发表于 2021-12-03 · 更新于 2021-12-12 · 分类于 FFmpeg

一、概述

本文分析 FFmpeg af_volumedetect 的实现及其输出的含义。

二、af_volumedetect 的作用及基本原理

af_volumedetect 的作用是获取音频的最大音量、平均音量以及音量直方图。
它只支持 AV_SAMPLE_FMT_S16 和 AV_SAMPLE_FMT_S16P 这两种格式——如果输入不是这两种格式,FFmpeg 会自动进行转换。
如果只是获取最大音量,只需要返回音频采样绝对值最大的即可,如果需要返回分贝,则计算:

| -log10(pow(d, 2)) * 10 // 得到分贝的绝对值(即源码中 logdb 的返回值),取负后才是输出的 dB 值。d 为归一化后的峰值(采样绝对值 / 32768)。

| | —- |

如果要计算平均音量,则先求所有采样值平方的平均值(均方能量),再按同样的公式换算为分贝。

三、在调用 ffmpeg 程序时使用 af_volumedetect

| ffmpeg -i input.mp3 -af "volumedetect" -vn -sn -dn -f null /dev/null

| | —- |

在 Windows 中使用需将 /dev/null 替换为 NUL 或者 dummy
-vn、-sn 和 -dn 分别告知 FFmpeg 忽略视频流、字幕流和数据流,即只处理音频流,从而在分析时避免不必要的操作、加快速度。
输出类似于:

| [Parsed_volumedetect_0 @ 0x1328042c0] n_samples: 16815744 // 音频包含的采样数
[Parsed_volumedetect_0 @ 0x1328042c0] mean_volume: -25.4 dB // 平均音量
[Parsed_volumedetect_0 @ 0x1328042c0] max_volume: -6.6 dB // 最大音量
[Parsed_volumedetect_0 @ 0x1328042c0] histogram_6db: 35 // 大于 -7dB 并且小于或等于 -6dB 的采样数是 35
[Parsed_volumedetect_0 @ 0x1328042c0] histogram_7db: 2354 // 大于 -8dB 并且小于或等于 -7dB 的采样数是 2354
[Parsed_volumedetect_0 @ 0x1328042c0] histogram_8db: 4969
[Parsed_volumedetect_0 @ 0x1328042c0] histogram_9db: 8978
[Parsed_volumedetect_0 @ 0x1328042c0] histogram_10db: 35545

| | —- |

四、源码分析

af_volumedetect 源码位于 ffmpeg/libavfilter/af_volumedetect.c 中。
分析 filter 一般从 static int filter_frame(AVFilterLink *inlink, AVFrame *in) 函数入手。

| // 0x10001 65537
// 0x8000 32768

typedef struct VolDetectContext {
    /**
     * Per-value sample counters.
     *
     * Signed 16-bit samples span -32768..32767 (65536 distinct values).
     * Each sample is biased by 0x8000 so it can be used directly as an
     * array index: histogram[0x8000 + i] holds how many samples had the
     * PCM value i. The array has one element more than strictly needed
     * so that it is symmetric around index 0x8000.
     */
    uint64_t histogram[0x10001];
} VolDetectContext;

/**
 * Accumulate the samples of one audio frame into the value histogram,
 * then pass the frame on unchanged to the filter's first output.
 *
 * @param inlink   input link; its destination filter context owns the
 *                 VolDetectContext that is updated
 * @param samples  audio frame whose data is read as int16_t samples
 * @return the value returned by ff_filter_frame()
 */
static int filter_frame(AVFilterLink *inlink, AVFrame *samples)
{
    AVFilterContext *ctx = inlink->dst;
    VolDetectContext *vd = ctx->priv;
    int nb_samples  = samples->nb_samples;
    int nb_channels = samples->channels;
    int nb_planes   = nb_channels;
    int plane, i;
    int16_t *pcm;

    /* Non-planar data is walked as a single plane holding
     * nb_samples * nb_channels values. */
    if (!av_sample_fmt_is_planar(samples->format)) {
        nb_samples *= nb_channels;
        nb_planes = 1;
    }

    /* Count how often each sample value occurs; the 0x8000 bias maps
     * -32768..32767 onto indices 0..0xFFFF. */
    for (plane = 0; plane < nb_planes; plane++) {
        pcm = (int16_t *)samples->extended_data[plane];
        for (i = 0; i < nb_samples; i++)
            vd->histogram[pcm[i] + 0x8000]++;
    }

    return ff_filter_frame(inlink->dst->outputs[0], samples);
}

| | —- |

print_stats 函数用于计算并打印。


// 最小分贝 -91dB

|

static inline double logdb(uint64_t v)
{
// 由于传入的 v 是 Amplitude 值加了 0x8000 再进行了平方,这里做相关逆运算。
double d = v / (double)(0x8000 0x8000);
if (!v)
return MAX_DB;
return -log10(d)
10;
}

/**
 * Compute and log the statistics gathered in the histogram: total
 * sample count, mean volume, maximum volume and the loudest part of the
 * per-dB histogram.
 *
 * Nothing beyond the sample count is printed when no samples were seen.
 *
 * @param ctx  filter context whose private data is a VolDetectContext
 */
static void print_stats(AVFilterContext *ctx)
{
    VolDetectContext *vd = ctx->priv;
    int i, max_volume, shift;
    uint64_t nb_samples = 0, power = 0, nb_samples_shift = 0, sum = 0;
    uint64_t histdb[MAX_DB + 1] = { 0 };

    /* The total could also be tracked in VolDetectContext while frames
     * are processed, which would make this pass unnecessary. */
    for (i = 0; i < 0x10000; i++)
        nb_samples += vd->histogram[i];
    /* NOTE(review): nb_samples is unsigned; PRIu64 would match its type exactly. */
    av_log(ctx, AV_LOG_INFO, "n_samples: %"PRId64"\n", nb_samples);
    if (!nb_samples)
        return;

    /* If nb_samples > 1<<34, there is a risk of overflow in the
       multiplication or the sum: shift all histogram values to avoid that.
       The total number of samples must be recomputed to avoid rounding
       errors. */
    shift = av_log2(nb_samples >> 33);
    for (i = 0; i < 0x10000; i++) {
        nb_samples_shift += vd->histogram[i] >> shift;
        power += (i - 0x8000) * (i - 0x8000) * (vd->histogram[i] >> shift);
    }
    if (!nb_samples_shift)
        return;
    /* Rounded mean of the squared sample values. */
    power = (power + nb_samples_shift / 2) / nb_samples_shift;
    av_assert0(power <= 0x8000 * 0x8000);
    av_log(ctx, AV_LOG_INFO, "mean_volume: %.1f dB\n", -logdb(power));

    /* Scan from the largest magnitude downwards: the first magnitude
     * with a non-zero count on either side is the peak. */
    max_volume = 0x8000;
    while (max_volume > 0 && !vd->histogram[0x8000 + max_volume] &&
                             !vd->histogram[0x8000 - max_volume])
        max_volume--;
    av_log(ctx, AV_LOG_INFO, "max_volume: %.1f dB\n", -logdb(max_volume * max_volume));

    /* histdb: sample counts per whole dB, indices 0..MAX_DB. */
    for (i = 0; i < 0x10000; i++)
        histdb[(int)logdb((i - 0x8000) * (i - 0x8000))] += vd->histogram[i];
    /* Skip the empty leading buckets, then print buckets until the
     * printed ones add up to at least 1/1000 of all samples. */
    for (i = 0; i <= MAX_DB && !histdb[i]; i++);
    for (; i <= MAX_DB && sum < nb_samples / 1000; i++) {
        av_log(ctx, AV_LOG_INFO, "histogram_%ddb: %"PRId64"\n", i, histdb[i]);
        sum += histdb[i];
    }
}

| | —- |

五、C# 简单实现

/// <summary>
/// Volume statistics for 16-bit PCM audio: peak level, mean level and a
/// per-dB histogram of the loudest samples.
/// </summary>
public class VolumeUtils
{
    // Value reported for silence; also the last index of the dB histogram.
    private const int MAX_DB = 91;

    /// <summary>
    /// Converts a squared sample value into a positive dB distance below
    /// full scale (0x8000 * 0x8000). Returns MAX_DB for 0.
    /// </summary>
    private static double LogdB(ulong v)
    {
        if (v == 0)
        {
            return MAX_DB;
        }
        double d = v / (double)(0x8000 * 0x8000);
        // 20 * log10(sqrt(x)) == 10 * log10(x)
        return -Math.Log10(d) * 10;
    }

    /// <summary>
    /// Volume detection.
    /// </summary>
    /// <param name="raw">PCM data in S16LE format (-32768 ~ 32767).</param>
    /// <param name="offset">Byte offset of the first sample in <paramref name="raw"/>.</param>
    /// <param name="length">Number of bytes to analyse. Must be even.</param>
    /// <param name="maxVolume">Peak volume in dB; 0 when there are no samples.</param>
    /// <param name="meanVolume">Mean volume in dB; 0 when there are no samples.</param>
    /// <returns>Leading part of the dB histogram, ordered from loudest to quietest.</returns>
    public static List<KeyValuePair<int, ulong>> VolumeDetect(byte[] raw, int offset, int length, out double maxVolume, out double meanVolume)
    {
        // S16 spans -32768 ~ 32767 (65536 values). Every sample is biased by
        // 0x8000 so that it can index the array: histogram[0x8000 + s] is the
        // number of samples with value s.
        var histogram = new ulong[0x10001];

        // A non-positive length yields zero samples and falls through to the
        // "no samples" result below.
        int sampleCount = length > 0 ? length / sizeof(short) : 0;
        for (var i = 0; i < sampleCount; i++)
        {
            // offset is in bytes; i counts samples from the start of the window.
            var sample = BitConverter.ToInt16(raw, offset + i * sizeof(short));
            histogram[sample + 0x8000]++;
        }
        ulong nb_samples = (ulong)sampleCount;

        ulong power = 0, nb_samples_shift = 0;

        /* If nb_samples > 1<<34, there is a risk of overflow in the
           multiplication or the sum: shift all histogram values to avoid that.
           The total number of samples must be recomputed to avoid rounding
           errors. */
        // Integer floor(log2(x)), defined as 0 for x == 0.
        int shift = 0;
        for (ulong high = nb_samples >> 33; high > 1; high >>= 1)
        {
            shift++;
        }
        for (var i = 0; i < 0x10000; i++)
        {
            long amplitude = i - 0x8000;
            nb_samples_shift += histogram[i] >> shift;
            power += (ulong)(amplitude * amplitude) * (histogram[i] >> shift);
        }
        if (nb_samples_shift == 0)
        {
            maxVolume = 0;
            meanVolume = 0;
            return new List<KeyValuePair<int, ulong>>(0);
        }

        // Rounded mean of the squared sample values.
        power = (power + nb_samples_shift / 2) / nb_samples_shift;

        // mean volume
        meanVolume = -LogdB(power);

        // Scan from the largest magnitude downwards: the first magnitude with
        // a non-zero count on either side is the peak.
        int max_volume = 0x8000;
        while (max_volume > 0 && histogram[0x8000 + max_volume] == 0 && histogram[0x8000 - max_volume] == 0)
        {
            max_volume--;
        }

        // max volume
        maxVolume = -LogdB((ulong)(max_volume * max_volume));

        // histdb: sample counts per whole dB, indices 0 ~ MAX_DB.
        var histdb = new ulong[MAX_DB + 1];
        for (var i = 0; i < 0x10000; i++)
        {
            long amplitude = i - 0x8000;
            histdb[(int)LogdB((ulong)(amplitude * amplitude))] += histogram[i];
        }

        // Skip the empty leading buckets, then return buckets until the
        // returned ones add up to at least 1/1000 of all samples.
        var histdBResult = new List<KeyValuePair<int, ulong>>();
        var idx = 0;
        ulong sum = 0;
        for (idx = 0; idx <= MAX_DB && histdb[idx] == 0; idx++) ;
        for (; idx <= MAX_DB && sum < nb_samples / 1000; idx++)
        {
            histdBResult.Add(new KeyValuePair<int, ulong>(idx, histdb[idx]));
            sum += histdb[idx];
        }

        return histdBResult;
    }
}

| | —- |

参考资料

  • FFmpeg filters 官网文档: volumedetect

    8.115 volumedetect

    Detect the volume of the input video.
    The filter has no parameters. It supports only 16-bit signed integer samples, so the input will be converted when needed. Statistics about the volume will be printed in the log when the input stream end is reached.
    In particular it will show the mean volume (root mean square), maximum volume (on a per-sample basis), and the beginning of a histogram of the registered volume values (from the maximum value to a cumulated 1/1000 of the samples).
    All volumes are in decibels relative to the maximum PCM value.

    8.115.1 Examples

    Here is an excerpt of the output:
    [Parsed_volumedetect_0 0xa23120] mean_volume: -27 dB
    [Parsed_volumedetect_0 0xa23120] max_volume: -4 dB
    [Parsed_volumedetect_0 0xa23120] histogram_4db: 6
    [Parsed_volumedetect_0 0xa23120] histogram_5db: 62
    [Parsed_volumedetect_0 0xa23120] histogram_6db: 286
    [Parsed_volumedetect_0 0xa23120] histogram_7db: 1042
    [Parsed_volumedetect_0 0xa23120] histogram_8db: 2551
    [Parsed_volumedetect_0 0xa23120] histogram_9db: 4609
    [Parsed_volumedetect_0 0xa23120] histogram_10db: 8409
    It means that:

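  • The mean square energy is approximately -27 dB, or 10^-2.7. (Since dB = 10·log10(energy), -27 dB corresponds to an energy of 10^(-27/10) = 10^-2.7 relative to full scale.)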

  • The largest sample is at -4 dB, or more precisely between -4 dB and -5 dB.
  • There are 6 samples at -4 dB, 62 at -5 dB, 286 at -6 dB, etc. (…)

In other words, raising the volume by +4 dB does not cause any clipping, raising it by +5 dB causes clipping for 6 samples, etc. (So raising the volume too far loses information, because clipped samples cannot be recovered.)