Recording System Audio with FFmpeg

Published 2023-11-29 17:13:30, by 阿风小子
I previously wrote a post on recording system audio with ffmpeg, but it relied on a capture device named virtual-audio-capturer, which requires installing an extra piece of software before ffmpeg can find the device; that is quite inconvenient.
This time the sound card output is captured directly with the Windows API (WASAPI loopback), and the raw data is then handed to ffmpeg for encoding.
For the sound card capture API itself, see the following post:
Sound card data capture
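
At a high level the capture side only needs a handful of WASAPI calls. The following is a minimal sketch of that flow (error handling omitted, function and variable names are my own), not the exact code from this post; the full version appears further below:

#include <combaseapi.h>
#include <mmdeviceapi.h>
#include <audioclient.h>

// Minimal WASAPI loopback setup: open the default render (speaker) endpoint in
// shared mode with AUDCLNT_STREAMFLAGS_LOOPBACK so we can read what is being played.
IAudioCaptureClient *OpenLoopbackCapture(IAudioClient **ppClient)
{
    IMMDeviceEnumerator *pEnumerator = nullptr;
    IMMDevice *pDevice = nullptr;
    IAudioCaptureClient *pCapture = nullptr;
    WAVEFORMATEX *pMixFormat = nullptr;

    CoInitializeEx(nullptr, COINIT_MULTITHREADED);
    CoCreateInstance(__uuidof(MMDeviceEnumerator), nullptr, CLSCTX_ALL,
        __uuidof(IMMDeviceEnumerator), (void **)&pEnumerator);
    pEnumerator->GetDefaultAudioEndpoint(eRender, eConsole, &pDevice);
    pDevice->Activate(__uuidof(IAudioClient), CLSCTX_ALL, nullptr, (void **)ppClient);

    (*ppClient)->GetMixFormat(&pMixFormat);            // capture in the engine's mix format
    (*ppClient)->Initialize(AUDCLNT_SHAREMODE_SHARED,
        AUDCLNT_STREAMFLAGS_LOOPBACK, 0, 0, pMixFormat, nullptr);
    (*ppClient)->GetService(IID_PPV_ARGS(&pCapture));  // packets are then read with GetBuffer/ReleaseBuffer
    CoTaskMemFree(pMixFormat);

    pDevice->Release();
    pEnumerator->Release();
    return pCapture;
}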
 
The format I get from the sound card is:
Sample rate: 48000
Bits per sample: 32
Channels: 2 (stereo)
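
These numbers are simply what the shared-mode audio engine reports on my machine; you can check your own values with IAudioClient::GetMixFormat. A small sketch, with pAudioClient assumed to be an already activated IAudioClient:

#include <cstdio>

// Ask the audio engine which shared-mode mix format it will deliver.
WAVEFORMATEX *pMix = nullptr;
if (SUCCEEDED(pAudioClient->GetMixFormat(&pMix)))
{
    printf("sample rate: %lu, bits per sample: %u, channels: %u\n",
        pMix->nSamplesPerSec, pMix->wBitsPerSample, pMix->nChannels);
    CoTaskMemFree(pMix);   // the format block is allocated by the engine
}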
 
The encoder's output sample format is AV_SAMPLE_FMT_FLTP (planar). The conversion context is configured as follows:
 
av_opt_set_channel_layout(m_pAudioConvertCtx, "in_channel_layout", AV_CH_LAYOUT_STEREO, 0);
av_opt_set_channel_layout(m_pAudioConvertCtx, "out_channel_layout", AV_CH_LAYOUT_STEREO, 0);
av_opt_set_int(m_pAudioConvertCtx, "in_sample_rate", m_formatex.Format.nSamplesPerSec, 0);
av_opt_set_int(m_pAudioConvertCtx, "out_sample_rate", 48000, 0);
av_opt_set_sample_fmt(m_pAudioConvertCtx, "in_sample_fmt", AV_SAMPLE_FMT_S32, 0);
av_opt_set_sample_fmt(m_pAudioConvertCtx, "out_sample_fmt", AV_SAMPLE_FMT_FLTP, 0);
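
On the FFmpeg versions this code targets (the old int64 channel-layout API), the same resampler can also be set up in a single call. This is only an equivalent sketch; the code in this post keeps the av_opt_set_* form shown above:

// One-call equivalent: 32-bit interleaved stereo in, planar float stereo at 48 kHz out.
SwrContext *swr = swr_alloc_set_opts(nullptr,
    AV_CH_LAYOUT_STEREO, AV_SAMPLE_FMT_FLTP, 48000,                              // output
    AV_CH_LAYOUT_STEREO, AV_SAMPLE_FMT_S32, m_formatex.Format.nSamplesPerSec,    // input
    0, nullptr);
if (swr == nullptr || swr_init(swr) < 0)
{
    // handle allocation or initialization failure
}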
 
The corresponding conversion call looks like this:
 
uint8_t *audio_buf[2] = { 0 };
audio_buf[0] = (uint8_t *)frame_mic_encode->data[0];
audio_buf[1] = (uint8_t *)frame_mic_encode->data[1];
 
int nb = swr_convert(m_pAudioConvertCtx, audio_buf, num_frames_to_read, (const uint8_t**)&p_audio_data, num_frames_to_read);
 
Here p_audio_data is the buffer fetched from the sound card and num_frames_to_read is its length in samples (audio frames).
Because the encoder format is planar, audio_buf is declared as a two-element array, one pointer per channel plane.
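
A slightly safer variant of the same idea (a sketch under the same assumptions, not the code used below) passes the frame's own plane pointers and keeps the number of samples swr_convert actually produced:

// Convert into the planar frame and remember how many samples were written;
// frame->extended_data is the per-plane pointer array maintained by FFmpeg.
int out_samples = swr_convert(m_pAudioConvertCtx,
    frame_mic_encode->extended_data,
    frame_mic_encode->nb_samples,                 // output capacity in samples per channel
    (const uint8_t **)&p_audio_data,              // one interleaved input plane
    num_frames_to_read);
if (out_samples > 0)
{
    // only the first out_samples samples of each plane are valid
}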
 
If the system is not playing any sound, num_frames_to_read comes back as 0; I have not handled that case yet. The example here assumes some audio is playing on the system while recording.
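
If you want the recording to stay continuous during quiet periods, part of a possible approach (not implemented in this post) is to honour the AUDCLNT_BUFFERFLAGS_SILENT flag returned by GetBuffer and feed zeroed PCM for such packets; fully idle stretches, where no packets arrive at all, would additionally need frames synthesized on a timer. A rough sketch of the flag handling:

#include <vector>

// Inside the capture loop, before the swr_convert call: when WASAPI marks the
// packet as silent, substitute all-zero samples instead of skipping the packet.
std::vector<uint8_t> silent_buf;
if (dw_flag & AUDCLNT_BUFFERFLAGS_SILENT)
{
    silent_buf.assign((size_t)num_frames_to_read * m_formatex.Format.nBlockAlign, 0);
    p_audio_data = silent_buf.data();
}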
 
The main function is shown below:
 
#include <iostream>
#include "GetSystemAudio.h"
 
int main()
{
    CGetSystemAudio cCGetSystemAudio;
    cCGetSystemAudio.SetSavePath("E:\\learn\\ffmpeg\\FfmpegTest\\x64\\Release");
    cCGetSystemAudio.StartCapture();
    Sleep(30000);
    cCGetSystemAudio.StopCapture();
    return 0;
}
 
As you can see, this records for 30 seconds.
 
GetSystemAudio.h contains the following:
 
#pragma once
#include <string>
#include <combaseapi.h>
#include <mmdeviceapi.h>
#include <audioclient.h>
 
 
#ifdef __cplusplus
extern "C"
{
#endif
#include "libavcodec/avcodec.h"
#include "libavformat/avformat.h"
#include "libswscale/swscale.h"
#include "libswresample/swresample.h"
#include "libavdevice/avdevice.h"
#include "libavutil/audio_fifo.h"
#include "libavutil/avutil.h"
#include "libavutil/fifo.h"
#include "libavutil/frame.h"
#include "libavutil/imgutils.h"
 
#include "libavfilter/avfilter.h"
#include "libavfilter/buffersink.h"
#include "libavfilter/buffersrc.h"
 
 
#pragma comment(lib, "avcodec.lib")
#pragma comment(lib, "avformat.lib")
#pragma comment(lib, "avutil.lib")
#pragma comment(lib, "avdevice.lib")
#pragma comment(lib, "avfilter.lib")
#pragma comment(lib, "postproc.lib")
#pragma comment(lib, "swresample.lib")
#pragma comment(lib, "swscale.lib")
 
 
#ifdef __cplusplus
};
#endif
 
class CGetSystemAudio
{
public:
CGetSystemAudio();
~CGetSystemAudio();
public:
void SetSavePath(std::string strPath);
int StartCapture();
void StopCapture();
int OpenOutPut();
private:
static DWORD WINAPI AudioSystemCaptureProc(LPVOID lpParam);
void AudioSystemCapture();
 
static DWORD WINAPI AudioSystemWriteProc(LPVOID lpParam);
void AudioSystemWrite();
 
HRESULT IsFormatSupported(IAudioClient *audioClient);
private:
std::string m_strRecordPath;
bool m_bRecord;
IAudioClient *pAudioClient = nullptr;
IAudioCaptureClient *pAudioCaptureClient = nullptr;
WAVEFORMATEXTENSIBLE m_formatex;
HANDLE m_hAudioSystemCapture = NULL;
HANDLE m_hAudioSystemWrite = NULL;
 
AVFormatContext *m_pFormatCtx_Out = NULL;
AVFormatContext *m_pFormatCtx_AudioSystem = NULL;
 
AVCodecContext *m_pCodecEncodeCtx_Audio = NULL;
AVCodec *m_pCodecEncode_Audio = NULL;
SwrContext *m_pAudioConvertCtx = NULL;
AVAudioFifo *m_pAudioFifo = NULL;
CRITICAL_SECTION m_csAudioSystemSection;
};
 
 
GetSystemAudio.cpp contains the following:
 
#include "GetSystemAudio.h"
#include <iostream>
#include <fstream>
#include <thread>
 
#define DEFAULT_SAMPLE_RATE 48000 // default sample rate: 48 kHz
#define DEFAULT_BITS_PER_SAMPLE 16 // default bit depth: 16 bit
#define DEFAULT_CHANNELS 1 // default channel count: 1
#define DEFAULT_AUDIO_PACKET_INTERVAL 10 // default audio packet interval: 10 ms
 
HRESULT CreateDeviceEnumerator(IMMDeviceEnumerator **enumerator)
{
CoInitializeEx(nullptr, COINIT_MULTITHREADED);
 
return CoCreateInstance(__uuidof(MMDeviceEnumerator), NULL, CLSCTX_ALL,
__uuidof(IMMDeviceEnumerator),
reinterpret_cast<void **>(enumerator));
}
HRESULT CreateDevice(IMMDeviceEnumerator *enumerator, IMMDevice **device)
{
EDataFlow enDataFlow = eRender; // eRender selects the speaker (render) audio endpoint
ERole enRole = eConsole;
return enumerator->GetDefaultAudioEndpoint(enDataFlow, enRole, device);
}
 
HRESULT CreateAudioClient(IMMDevice *device, IAudioClient **audioClient)
{
return device->Activate(__uuidof(IAudioClient), CLSCTX_ALL, NULL,
(void **)audioClient);
}
 
HRESULT CGetSystemAudio::IsFormatSupported(IAudioClient *audioClient)
{
WAVEFORMATEX *format = &m_formatex.Format;
format->nSamplesPerSec = DEFAULT_SAMPLE_RATE;
format->wBitsPerSample = DEFAULT_BITS_PER_SAMPLE;
format->nChannels = DEFAULT_CHANNELS;
 
WAVEFORMATEX *closestMatch = nullptr;
 
HRESULT hr = audioClient->IsFormatSupported(AUDCLNT_SHAREMODE_SHARED, format, &closestMatch);
if (hr == AUDCLNT_E_UNSUPPORTED_FORMAT) // 0x88890008
{
if (closestMatch != nullptr) // closestMatch may be nullptr when no close match exists
{
format->nSamplesPerSec = closestMatch->nSamplesPerSec;
format->wBitsPerSample = closestMatch->wBitsPerSample;
format->nChannels = closestMatch->nChannels;
 
return S_OK;
}
}
 
return hr;
}
HRESULT GetPreferFormat(IAudioClient *audioClient, WAVEFORMATEXTENSIBLE *formatex)
{
WAVEFORMATEX *format = nullptr;
HRESULT hr = audioClient->GetMixFormat(&format);
if (FAILED(hr))
{
return hr;
}
 
formatex->Format.nSamplesPerSec = format->nSamplesPerSec;
formatex->Format.wBitsPerSample = format->wBitsPerSample;
formatex->Format.nChannels = format->nChannels;
 
return hr;
}
 
HRESULT InitAudioClient(IAudioClient *audioClient, WAVEFORMATEXTENSIBLE *formatex)
{
AUDCLNT_SHAREMODE shareMode =
AUDCLNT_SHAREMODE_SHARED;  // share Audio Engine with other applications
DWORD streamFlags = AUDCLNT_STREAMFLAGS_LOOPBACK; // loopback speaker
streamFlags |=
AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM; // A channel matrixer and a sample
// rate converter are inserted
streamFlags |=
AUDCLNT_STREAMFLAGS_SRC_DEFAULT_QUALITY; // a sample rate converter
// with better quality than
// the default conversion but
// with a higher performance
// cost is used
REFERENCE_TIME hnsBufferDuration = 0;
WAVEFORMATEX *format = &formatex->Format;
format->wFormatTag = WAVE_FORMAT_EXTENSIBLE;
format->nBlockAlign = (format->wBitsPerSample >> 3) * format->nChannels;
format->nAvgBytesPerSec = format->nBlockAlign * format->nSamplesPerSec;
format->cbSize = sizeof(WAVEFORMATEXTENSIBLE) - sizeof(WAVEFORMATEX);
formatex->Samples.wValidBitsPerSample = format->wBitsPerSample;
formatex->dwChannelMask =
format->nChannels == 1 ? KSAUDIO_SPEAKER_MONO : KSAUDIO_SPEAKER_STEREO;
formatex->SubFormat = KSDATAFORMAT_SUBTYPE_PCM;
 
return audioClient->Initialize(shareMode, streamFlags, hnsBufferDuration, 0,
format, nullptr);
}
 
HRESULT CreateAudioCaptureClient(IAudioClient *audioClient, IAudioCaptureClient **audioCaptureClient)
{
HRESULT hr = audioClient->GetService(IID_PPV_ARGS(audioCaptureClient));
if (FAILED(hr))
{
*audioCaptureClient = nullptr;
}
return hr;
}
 
DWORD WINAPI CGetSystemAudio::AudioSystemCaptureProc(LPVOID lpParam)
{
CGetSystemAudio *pCGetSystemAudio = (CGetSystemAudio *)lpParam;
if (pCGetSystemAudio != NULL)
{
pCGetSystemAudio->AudioSystemCapture();
}
return 0;
}
 
void CGetSystemAudio::AudioSystemCapture()
{
HRESULT hr = S_OK;
UINT32 num_success = 0;
 
 
BYTE *p_audio_data = nullptr;
UINT32 num_frames_to_read = 0;
DWORD dw_flag = 0;
 
UINT32 num_frames_in_next_packet = 0;
 
UINT32 num_loop = 0;
 
pAudioClient->Start();
int ret = 0;
int AudioFrameIndex_mic = 1;
while (m_bRecord)
{
std::this_thread::sleep_for(std::chrono::milliseconds(0));
 
while (true)
{
hr = pAudioCaptureClient->GetNextPacketSize(&num_frames_in_next_packet);
if (FAILED(hr))
{
throw std::exception();
}
if (num_frames_in_next_packet == 0)
{
break;
}
 
hr = pAudioCaptureClient->GetBuffer(&p_audio_data, &num_frames_to_read, &dw_flag, nullptr, nullptr);
if (FAILED(hr))
{
throw std::exception();
}
 
AVFrame *frame_mic_encode = av_frame_alloc();
 
// allocate the frame in the encoder's planar format; the capture packet is
// assumed to be no larger than one encoder frame (frame_size samples)
frame_mic_encode->nb_samples = m_pCodecEncodeCtx_Audio->frame_size;
frame_mic_encode->channel_layout = m_pCodecEncodeCtx_Audio->channel_layout;
frame_mic_encode->format = m_pCodecEncodeCtx_Audio->sample_fmt;
frame_mic_encode->sample_rate = m_pCodecEncodeCtx_Audio->sample_rate;
av_frame_get_buffer(frame_mic_encode, 0);
 
// one output pointer per channel plane, because the encoder format is planar
uint8_t *audio_buf[2] = { 0 };
audio_buf[0] = (uint8_t *)frame_mic_encode->data[0];
audio_buf[1] = (uint8_t *)frame_mic_encode->data[1];
 
// convert the interleaved capture buffer into the planar frame
int nb = swr_convert(m_pAudioConvertCtx, audio_buf, num_frames_to_read, (const uint8_t**)&p_audio_data, num_frames_to_read);
 
// buffer the converted samples; the write thread drains the FIFO and encodes them
if (nb > 0 && av_audio_fifo_space(m_pAudioFifo) >= nb)
{
EnterCriticalSection(&m_csAudioSystemSection);
ret = av_audio_fifo_write(m_pAudioFifo, (void **)frame_mic_encode->data, nb);
LeaveCriticalSection(&m_csAudioSystemSection);
}
 
// av_audio_fifo_write copies the data, so the frame can be released here
av_frame_free(&frame_mic_encode);
 
hr = pAudioCaptureClient->ReleaseBuffer(num_frames_to_read);
if (FAILED(hr))
{
throw std::exception();
}
 
num_loop++;
}
}
 
pAudioClient->Stop();
}
 
DWORD WINAPI CGetSystemAudio::AudioSystemWriteProc(LPVOID lpParam)
{
CGetSystemAudio *pCGetSystemAudio = (CGetSystemAudio *)lpParam;
if (pCGetSystemAudio != NULL)
{
pCGetSystemAudio->AudioSystemWrite();
}
return 0;
}
 
void CGetSystemAudio::AudioSystemWrite()
{
int ret = 0;
 
int AudioFrameIndex_mic = 0;
AVFrame *frame_audio_system = av_frame_alloc();
 
while (m_bRecord)
{
int frame_size = m_pFormatCtx_Out->streams[0]->codecpar->frame_size > 0 ? m_pFormatCtx_Out->streams[0]->codecpar->frame_size : 1024;
if (av_audio_fifo_size(m_pAudioFifo) >= frame_size)
{
// release buffers from the previous iteration before re-allocating
av_frame_unref(frame_audio_system);
frame_audio_system->nb_samples = frame_size;
frame_audio_system->channel_layout = m_pFormatCtx_Out->streams[0]->codecpar->channel_layout;
frame_audio_system->format = m_pFormatCtx_Out->streams[0]->codecpar->format;
frame_audio_system->sample_rate = m_pFormatCtx_Out->streams[0]->codecpar->sample_rate;
av_frame_get_buffer(frame_audio_system, 0);
 
EnterCriticalSection(&m_csAudioSystemSection);
int readcount = av_audio_fifo_read(m_pAudioFifo, (void **)frame_audio_system->data, frame_size);
LeaveCriticalSection(&m_csAudioSystemSection);
 
AVPacket pkt_out_mic = { 0 };
pkt_out_mic.data = NULL;
pkt_out_mic.size = 0;
 
ret = avcodec_send_frame(m_pCodecEncodeCtx_Audio, frame_audio_system);
ret = avcodec_receive_packet(m_pCodecEncodeCtx_Audio, &pkt_out_mic);
if (ret == 0) // the encoder may need more input before a packet is available
{
// timestamps are in samples, matching the stream time base of 1/sample_rate
pkt_out_mic.stream_index = 0;
pkt_out_mic.pts = (int64_t)AudioFrameIndex_mic * readcount;
pkt_out_mic.dts = pkt_out_mic.pts;
pkt_out_mic.duration = readcount;
 
av_write_frame(m_pFormatCtx_Out, &pkt_out_mic);
av_packet_unref(&pkt_out_mic);
}
AudioFrameIndex_mic++;
}
else
{
Sleep(1);
if (!m_bRecord)
{
break;
}
}
}
Sleep(100);
av_frame_free(&frame_audio_system);
av_write_trailer(m_pFormatCtx_Out);
 
avio_close(m_pFormatCtx_Out->pb);
}
 
CGetSystemAudio::CGetSystemAudio()
{
m_bRecord = false;
m_hAudioSystemCapture = NULL;
InitializeCriticalSection(&m_csAudioSystemSection);
}
 
 
CGetSystemAudio::~CGetSystemAudio()
{
DeleteCriticalSection(&m_csAudioSystemSection);
}
 
int CGetSystemAudio::OpenOutPut()
{
int iRet = -1;
 
AVStream *pAudioStream = NULL;
 
do
{
std::string strFileName = m_strRecordPath;
strFileName += "system_audio";
strFileName += ".mp4";
 
const char *outFileName = strFileName.c_str();
avformat_alloc_output_context2(&m_pFormatCtx_Out, NULL, NULL, outFileName);
 
{
pAudioStream = avformat_new_stream(m_pFormatCtx_Out, NULL);
 
m_pCodecEncode_Audio = (AVCodec *)avcodec_find_encoder(m_pFormatCtx_Out->oformat->audio_codec);
 
m_pCodecEncodeCtx_Audio = avcodec_alloc_context3(m_pCodecEncode_Audio);
if (!m_pCodecEncodeCtx_Audio)
{
break;
}
 
 
//pCodecEncodeCtx_Audio->codec_id = pFormatCtx_Out->oformat->audio_codec;
m_pCodecEncodeCtx_Audio->sample_fmt = m_pCodecEncode_Audio->sample_fmts ? m_pCodecEncode_Audio->sample_fmts[0] : AV_SAMPLE_FMT_FLTP;
m_pCodecEncodeCtx_Audio->bit_rate = 64000;
m_pCodecEncodeCtx_Audio->sample_rate = 48000;
m_pCodecEncodeCtx_Audio->channel_layout = AV_CH_LAYOUT_STEREO;
m_pCodecEncodeCtx_Audio->channels = av_get_channel_layout_nb_channels(m_pCodecEncodeCtx_Audio->channel_layout);
 
 
AVRational timeBase;
timeBase.den = m_pCodecEncodeCtx_Audio->sample_rate;
timeBase.num = 1;
pAudioStream->time_base = timeBase;
 
if (avcodec_open2(m_pCodecEncodeCtx_Audio, m_pCodecEncode_Audio, 0) < 0)
{
// failed to open the encoder, bail out
break;
}
}
 
 
if (!(m_pFormatCtx_Out->oformat->flags & AVFMT_NOFILE))
{
if (avio_open(&m_pFormatCtx_Out->pb, outFileName, AVIO_FLAG_WRITE) < 0)
{
break;
}
}
 
avcodec_parameters_from_context(pAudioStream->codecpar, m_pCodecEncodeCtx_Audio);
 
if (avformat_write_header(m_pFormatCtx_Out, NULL) < 0)
{
break;
}
 
iRet = 0;
} while (0);
 
 
if (iRet != 0)
{
if (m_pCodecEncodeCtx_Audio != NULL)
{
avcodec_free_context(&m_pCodecEncodeCtx_Audio);
m_pCodecEncodeCtx_Audio = NULL;
}
 
if (m_pFormatCtx_Out != NULL)
{
avformat_free_context(m_pFormatCtx_Out);
m_pFormatCtx_Out = NULL;
}
}
 
return iRet;
}
 
 
void CGetSystemAudio::SetSavePath(std::string strPath)
{
m_strRecordPath = strPath;
if (!m_strRecordPath.empty())
{
if (m_strRecordPath[m_strRecordPath.length() - 1] != '\\')
{
m_strRecordPath = m_strRecordPath + "\\";
}
}
}
 
int CGetSystemAudio::StartCapture()
{
int iRet = -1;
do 
{
iRet = OpenOutPut();
if (iRet < 0)
{
break;
}
 
IMMDeviceEnumerator *pDeviceEnumerator = nullptr;
IMMDevice *pDevice = nullptr;
 
HRESULT hr;
 
hr = CreateDeviceEnumerator(&pDeviceEnumerator);
if (FAILED(hr))
{
break;
}
hr = CreateDevice(pDeviceEnumerator, &pDevice);
if (FAILED(hr))
{
break;
}
 
hr = CreateAudioClient(pDevice, &pAudioClient);
if (FAILED(hr))
{
break;
}
 
hr = IsFormatSupported(pAudioClient);
if (FAILED(hr))
{
hr = GetPreferFormat(pAudioClient, &m_formatex);
if (FAILED(hr))
{
break;
}
}
 
hr = InitAudioClient(pAudioClient, &m_formatex);
if (FAILED(hr))
{
break;
}
 
hr = CreateAudioCaptureClient(pAudioClient, &pAudioCaptureClient);
if (FAILED(hr))
{
break;
}
 
m_pAudioConvertCtx = swr_alloc();
av_opt_set_channel_layout(m_pAudioConvertCtx, "in_channel_layout", AV_CH_LAYOUT_STEREO, 0);
av_opt_set_channel_layout(m_pAudioConvertCtx, "out_channel_layout", AV_CH_LAYOUT_STEREO, 0);
av_opt_set_int(m_pAudioConvertCtx, "in_sample_rate", m_formatex.Format.nSamplesPerSec, 0);
av_opt_set_int(m_pAudioConvertCtx, "out_sample_rate", 48000, 0);
av_opt_set_sample_fmt(m_pAudioConvertCtx, "in_sample_fmt", AV_SAMPLE_FMT_S32, 0);
av_opt_set_sample_fmt(m_pAudioConvertCtx, "out_sample_fmt", AV_SAMPLE_FMT_FLTP, 0);
 
iRet = swr_init(m_pAudioConvertCtx);
 
if (NULL == m_pAudioFifo)
{
m_pAudioFifo = av_audio_fifo_alloc((AVSampleFormat)m_pFormatCtx_Out->streams[0]->codecpar->format,
m_pFormatCtx_Out->streams[0]->codecpar->channels, 3000 * 1024);
}
 
m_bRecord = true;
m_hAudioSystemCapture = CreateThread(NULL, 0, AudioSystemCaptureProc, this, 0, NULL);
m_hAudioSystemWrite = CreateThread(NULL, 0, AudioSystemWriteProc, this, 0, NULL);
 
iRet = 0;
} while (0);
 
return iRet;
}
 
void CGetSystemAudio::StopCapture()
{
m_bRecord = false;
 
Sleep(1000);
WaitForSingleObject(m_hAudioSystemCapture, INFINITE);
CloseHandle(m_hAudioSystemCapture);
m_hAudioSystemCapture = NULL;
 
// also wait for the write thread: it drains the FIFO and writes the trailer
WaitForSingleObject(m_hAudioSystemWrite, INFINITE);
CloseHandle(m_hAudioSystemWrite);
m_hAudioSystemWrite = NULL;
}