From c33dbfdda6ab4d790706a0b12ff52434d23fe112 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=C5=9Aled=C5=BA?= Date: Mon, 5 Aug 2024 11:24:53 +0200 Subject: [PATCH] Move audio/video converter out of reader/decoder --- Makefile | 8 +- c_src/xav/{converter.c => audio_converter.c} | 7 +- c_src/xav/{converter.h => audio_converter.h} | 0 c_src/xav/decoder.c | 86 +----------------- c_src/xav/decoder.h | 20 +---- c_src/xav/reader.c | 87 +----------------- c_src/xav/reader.h | 24 +---- c_src/xav/utils.c | 4 +- c_src/xav/utils.h | 4 +- c_src/xav/video_converter.c | 28 ++++++ c_src/xav/video_converter.h | 7 ++ c_src/xav/xav_decoder.c | 78 +++++++++++++--- c_src/xav/xav_decoder.h | 2 +- c_src/xav/xav_reader.c | 95 +++++++++++++++++--- c_src/xav/xav_reader.h | 2 +- test/decoder_test.exs | 1 + test/reader_test.exs | 1 - 17 files changed, 208 insertions(+), 246 deletions(-) rename c_src/xav/{converter.c => audio_converter.c} (98%) rename c_src/xav/{converter.h => audio_converter.h} (100%) create mode 100644 c_src/xav/video_converter.c create mode 100644 c_src/xav/video_converter.h diff --git a/Makefile b/Makefile index ebdc8be..09432b7 100644 --- a/Makefile +++ b/Makefile @@ -11,11 +11,11 @@ XAV_READER_SO = $(PRIV_DIR)/libxavreader.so # uncomment to compile with debug logs XAV_DEBUG_LOGS = -DXAV_DEBUG=1 -DECODER_HEADERS = $(XAV_DIR)/xav_decoder.h $(XAV_DIR)/decoder.h $(XAV_DIR)/converter.h $(XAV_DIR)/utils.h $(XAV_DIR)/channel_layout.h -DECODER_SOURCES = $(XAV_DIR)/xav_decoder.c $(XAV_DIR)/decoder.c $(XAV_DIR)/converter.c $(XAV_DIR)/utils.c +DECODER_HEADERS = $(XAV_DIR)/xav_decoder.h $(XAV_DIR)/decoder.h $(XAV_DIR)/video_converter.h $(XAV_DIR)/audio_converter.h $(XAV_DIR)/utils.h $(XAV_DIR)/channel_layout.h +DECODER_SOURCES = $(XAV_DIR)/xav_decoder.c $(XAV_DIR)/decoder.c $(XAV_DIR)/video_converter.c $(XAV_DIR)/audio_converter.c $(XAV_DIR)/utils.c -READER_HEADERS = $(XAV_DIR)/xav_reader.h $(XAV_DIR)/reader.h $(XAV_DIR)/converter.h $(XAV_DIR)/utils.h $(XAV_DIR)/channel_layout.h -READER_SOURCES = $(XAV_DIR)/xav_reader.c $(XAV_DIR)/reader.c $(XAV_DIR)/converter.c $(XAV_DIR)/utils.c +READER_HEADERS = $(XAV_DIR)/xav_reader.h $(XAV_DIR)/reader.h $(XAV_DIR)/video_converter.h $(XAV_DIR)/audio_converter.h $(XAV_DIR)/utils.h $(XAV_DIR)/channel_layout.h +READER_SOURCES = $(XAV_DIR)/xav_reader.c $(XAV_DIR)/reader.c $(XAV_DIR)/video_converter.c $(XAV_DIR)/audio_converter.c $(XAV_DIR)/utils.c CFLAGS = $(XAV_DEBUG_LOGS) -fPIC -shared IFLAGS = -I$(ERTS_INCLUDE_DIR) -I$(XAV_DIR) diff --git a/c_src/xav/converter.c b/c_src/xav/audio_converter.c similarity index 98% rename from c_src/xav/converter.c rename to c_src/xav/audio_converter.c index cbcf45b..14490ad 100644 --- a/c_src/xav/converter.c +++ b/c_src/xav/audio_converter.c @@ -4,8 +4,8 @@ #include #include +#include "audio_converter.h" #include "channel_layout.h" -#include "converter.h" #include "utils.h" struct Converter *converter_alloc() { @@ -65,13 +65,12 @@ int converter_convert(struct Converter *c, AVFrame *src_frame, uint8_t ***out_da return ret; } - *out_data = out_data_tmp; - *out_samples = swr_convert(c->swr_ctx, out_data_tmp, max_out_nb_samples, (const uint8_t **)src_frame->data, src_frame->nb_samples); if (*out_samples < 0) { XAV_LOG_DEBUG("Couldn't convert samples: %d", *out_samples); + av_freep(&out_data_tmp[0]); return -1; } @@ -79,6 +78,8 @@ int converter_convert(struct Converter *c, AVFrame *src_frame, uint8_t ***out_da *out_size = *out_samples * out_bytes_per_sample * out_nb_channels; + *out_data = out_data_tmp; + return 0; } diff --git a/c_src/xav/converter.h b/c_src/xav/audio_converter.h similarity index 100% rename from c_src/xav/converter.h rename to c_src/xav/audio_converter.h diff --git a/c_src/xav/decoder.c b/c_src/xav/decoder.c index 1f62090..9aa05c8 100644 --- a/c_src/xav/decoder.c +++ b/c_src/xav/decoder.c @@ -1,5 +1,6 @@ #include "decoder.h" #include "utils.h" +#include "video_converter.h" static int init_converter(struct Decoder *decoder); @@ -8,16 +9,6 @@ struct Decoder *decoder_alloc() { decoder->codec = NULL; decoder->c = NULL; - decoder->out_format_name = NULL; - - for (int i = 0; i < 4; i++) { - decoder->rgb_dst_data[i] = NULL; - } - - decoder->frame_data = NULL; - decoder->frame_linesize = NULL; - decoder->converter = NULL; - decoder->out_data = NULL; return decoder; } @@ -26,12 +17,9 @@ int decoder_init(struct Decoder *decoder, const char *codec) { if (strcmp(codec, "opus") == 0) { decoder->media_type = AVMEDIA_TYPE_AUDIO; decoder->codec = avcodec_find_decoder(AV_CODEC_ID_OPUS); - // we will initialize out_format_name with the first frame - decoder->out_format_name = NULL; } else if (strcmp(codec, "vp8") == 0) { decoder->media_type = AVMEDIA_TYPE_VIDEO; decoder->codec = avcodec_find_decoder(AV_CODEC_ID_VP8); - decoder->out_format_name = "rgb"; } else { return -1; } @@ -69,57 +57,13 @@ int decoder_decode(struct Decoder *decoder, AVPacket *pkt, AVFrame *frame) { return -2; } - ret = avcodec_receive_frame(decoder->c, frame); - if (ret != 0) { - return -1; - } - - if (decoder->media_type == AVMEDIA_TYPE_AUDIO && decoder->out_format_name == NULL) { - enum AVSampleFormat out_sample_fmt = av_get_alt_sample_fmt(frame->format, 0); - decoder->out_format_name = av_get_sample_fmt_name(out_sample_fmt); - } - - if (decoder->media_type == AVMEDIA_TYPE_VIDEO) { - if (frame->format != AV_PIX_FMT_RGB24) { - convert_to_rgb(frame, decoder->rgb_dst_data, decoder->rgb_dst_linesize); - decoder->frame_data = decoder->rgb_dst_data; - decoder->frame_linesize = decoder->rgb_dst_linesize; - } else { - decoder->frame_data = frame->data; - decoder->frame_linesize = frame->linesize; - } - } else if (decoder->media_type == AVMEDIA_TYPE_AUDIO) { - - if (decoder->converter == NULL) { - ret = init_converter(decoder); - if (ret < 0) { - return ret; - } - } - - return converter_convert(decoder->converter, frame, &decoder->out_data, &decoder->out_samples, - &decoder->out_size); - } - - return 0; + return avcodec_receive_frame(decoder->c, frame); } void decoder_free_frame(struct Decoder *decoder) { // TODO revisit this av_frame_unref(decoder->frame); av_packet_unref(decoder->pkt); - - if (decoder->media_type == AVMEDIA_TYPE_AUDIO && decoder->frame_data == decoder->rgb_dst_data) { - av_freep(&decoder->frame_data[0]); - } else if (decoder->media_type == AVMEDIA_TYPE_VIDEO && - decoder->frame_data == decoder->rgb_dst_data) { - av_freep(&decoder->frame_data[0]); - } - - if (decoder->out_data != NULL) { - // av_freep sets pointer to NULL - av_freep(&decoder->out_data); - } } void decoder_free(struct Decoder **decoder) { @@ -143,29 +87,3 @@ void decoder_free(struct Decoder **decoder) { *decoder = NULL; } } - -static int init_converter(struct Decoder *decoder) { - decoder->converter = converter_alloc(); - - if (decoder->converter == NULL) { - XAV_LOG_DEBUG("Couldn't allocate converter"); - return -1; - } - - int out_sample_rate = decoder->c->sample_rate; - enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT; - - struct ChannelLayout in_chlayout, out_chlayout; -#if LIBAVUTIL_VERSION_MAJOR >= 58 - in_chlayout.layout = decoder->c->ch_layout; - out_chlayout.layout = decoder->c->ch_layout; -#else - in_chlayout.layout = decoder->c->channel_layout; - out_chlayout.layout = decoder->c->channel_layout; - XAV_LOG_DEBUG("in_chlayout %ld", in_chlayout.layout); - XAV_LOG_DEBUG("in nb_channels %d", av_get_channel_layout_nb_channels(in_chlayout.layout)); -#endif - - return converter_init(decoder->converter, in_chlayout, decoder->c->sample_rate, - decoder->c->sample_fmt, out_chlayout, out_sample_rate, out_sample_fmt); -} diff --git a/c_src/xav/decoder.h b/c_src/xav/decoder.h index 13d38ac..364e718 100644 --- a/c_src/xav/decoder.h +++ b/c_src/xav/decoder.h @@ -1,7 +1,7 @@ #include #include -#include "converter.h" +#include "audio_converter.h" #include "utils.h" struct Decoder { @@ -10,24 +10,6 @@ struct Decoder { AVPacket *pkt; const AVCodec *codec; AVCodecContext *c; - - const char *out_format_name; - - uint8_t *rgb_dst_data[4]; - int rgb_dst_linesize[4]; - - uint8_t **frame_data; - int *frame_linesize; - - struct Converter *converter; - // Buffer where audio samples are written after conversion. - // We always convet to packed format, so only out_data[0] is set. - uint8_t **out_data; - // Number of samples in out_data buffer - int out_samples; - // Size of out_data buffer. - // This is the same as out_samples * bytes_per_sample(out_format) * out_channels. - int out_size; }; struct Decoder *decoder_alloc(); diff --git a/c_src/xav/reader.c b/c_src/xav/reader.c index 580c666..62d7a7b 100644 --- a/c_src/xav/reader.c +++ b/c_src/xav/reader.c @@ -21,13 +21,6 @@ struct Reader *reader_alloc() { reader->options = NULL; reader->in_format_name = NULL; reader->out_format_name = NULL; - reader->frame_data = NULL; - reader->frame_linesize = NULL; - for (int i = 0; i < 4; i++) { - reader->rgb_dst_data[i] = NULL; - } - reader->converter = NULL; - reader->out_data = NULL; return reader; } @@ -105,7 +98,7 @@ int reader_next_frame(struct Reader *reader) { if (ret == 0) { XAV_LOG_DEBUG("Received frame"); - goto fin; + return 0; } else if (ret == AVERROR_EOF) { XAV_LOG_DEBUG("EOF"); return ret; @@ -188,47 +181,10 @@ int reader_next_frame(struct Reader *reader) { } } -fin: - if (reader->media_type == AVMEDIA_TYPE_VIDEO && reader->frame->format != AV_PIX_FMT_RGB24) { - XAV_LOG_DEBUG("Converting video to RGB"); - convert_to_rgb(reader->frame, reader->rgb_dst_data, reader->rgb_dst_linesize); - reader->frame_data = reader->rgb_dst_data; - reader->frame_linesize = reader->rgb_dst_linesize; - } else if (reader->media_type == AVMEDIA_TYPE_VIDEO) { - reader->frame_data = reader->frame->data; - reader->frame_linesize = reader->frame->linesize; - } else if (reader->media_type == AVMEDIA_TYPE_AUDIO) { - XAV_LOG_DEBUG("Converting audio to desired out format"); - - if (reader->converter == NULL) { - XAV_LOG_DEBUG("Converter not initialized. Initializing."); - ret = init_converter(reader); - if (ret < 0) { - return ret; - } - } - - return converter_convert(reader->converter, reader->frame, &reader->out_data, - &reader->out_samples, &reader->out_size); - } - return 0; } -void reader_free_frame(struct Reader *reader) { - av_frame_unref(reader->frame); - - if (reader->media_type == AVMEDIA_TYPE_AUDIO && reader->frame_data == reader->rgb_dst_data) { - av_freep(&reader->frame_data[0]); - } else if (reader->media_type == AVMEDIA_TYPE_VIDEO && - reader->frame_data == reader->rgb_dst_data) { - av_freep(&reader->frame_data[0]); - } - - if (reader->out_data != NULL) { - av_freep(&reader->out_data); - } -} +void reader_free_frame(struct Reader *reader) { av_frame_unref(reader->frame); } void reader_free(struct Reader **reader) { XAV_LOG_DEBUG("Freeing Reader object"); @@ -259,42 +215,3 @@ void reader_free(struct Reader **reader) { *reader = NULL; } } - -static int init_converter(struct Reader *reader) { - reader->converter = converter_alloc(); - - if (reader->converter == NULL) { - XAV_LOG_DEBUG("Couldn't allocate converter"); - return -1; - } - - int out_sample_rate = 16000; - enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT; - - struct ChannelLayout in_chlayout, out_chlayout; -#if LIBAVUTIL_VERSION_MAJOR >= 58 - in_chlayout.layout = reader->c->ch_layout; - av_channel_layout_from_mask(&out_chlayout.layout, AV_CH_LAYOUT_MONO); -#else - in_chlayout.layout = reader->frame->channel_layout; - out_chlayout.layout = AV_CH_LAYOUT_MONO; - - if (reader->frame->channel_layout == 0 && reader->frame->channels > 0) { - // In newer FFmpeg versions, 0 means that the order of channels is - // unspecified but there still might be information about channels number. - // Let's check againts it and take default channel order for the given channels number. - // This is also what newer FFmpeg versions do under the hood when passing - // unspecified channel order. - XAV_LOG_DEBUG("Channel layout unset. Setting to default for channels number: %d", - reader->frame->channels); - in_chlayout.layout = av_get_default_channel_layout(reader->frame->channels); - } else if (reader->frame->channel_layout == 0) { - XAV_LOG_DEBUG("Both channel layout and channels are unset. Cannot init converter."); - return -1; - } - -#endif - - return converter_init(reader->converter, in_chlayout, reader->c->sample_rate, - reader->c->sample_fmt, out_chlayout, out_sample_rate, out_sample_fmt); -} \ No newline at end of file diff --git a/c_src/xav/reader.h b/c_src/xav/reader.h index c89a9be..d10e4fe 100644 --- a/c_src/xav/reader.h +++ b/c_src/xav/reader.h @@ -9,7 +9,7 @@ #include #include -#include "converter.h" +#include "audio_converter.h" #include "utils.h" struct Reader { @@ -26,28 +26,6 @@ struct Reader { const char *in_format_name; const char *out_format_name; - - // used for converting decoded frame - // to rgb pixel format - uint8_t *rgb_dst_data[4]; - int rgb_dst_linesize[4]; - - // points either to frame->data - // frame->linesize or rgb_dst_data - // rgb_dst_linesize depending on - // whether convertion to rgb was needed - uint8_t **frame_data; - int *frame_linesize; - - struct Converter *converter; - // Buffer where audio samples are written after conversion. - // We always convet to packed format, so only out_data[0] is set. - uint8_t **out_data; - // Number of samples in out_data buffer - int out_samples; - // Size of out_data buffer. - // This is the same as out_samples * bytes_per_sample(out_format) * out_channels. - int out_size; }; struct Reader *reader_alloc(); diff --git a/c_src/xav/utils.c b/c_src/xav/utils.c index 87772cd..424eb64 100644 --- a/c_src/xav/utils.c +++ b/c_src/xav/utils.c @@ -55,8 +55,8 @@ ERL_NIF_TERM xav_nif_audio_frame_to_term(ErlNifEnv *env, uint8_t **out_data, int return enif_make_tuple(env, 4, data_term, format_term, samples_term, pts_term); } -ERL_NIF_TERM xav_nif_video_frame_to_term(ErlNifEnv *env, AVFrame *frame, unsigned char *data[], - int *linesize, const char *format_name) { +ERL_NIF_TERM xav_nif_video_frame_to_term(ErlNifEnv *env, AVFrame *frame, uint8_t *data[4], + int linesize[4], const char *format_name) { ERL_NIF_TERM data_term; unsigned char *ptr = enif_make_new_binary(env, linesize[0] * frame->height, &data_term); memcpy(ptr, data[0], linesize[0] * frame->height); diff --git a/c_src/xav/utils.h b/c_src/xav/utils.h index 72e21dd..dc26494 100644 --- a/c_src/xav/utils.h +++ b/c_src/xav/utils.h @@ -23,7 +23,7 @@ void convert_to_rgb(AVFrame *src_frame, uint8_t *dst_data[], int dst_linesize[]) ERL_NIF_TERM xav_nif_ok(ErlNifEnv *env, ERL_NIF_TERM data_term); ERL_NIF_TERM xav_nif_error(ErlNifEnv *env, char *reason); ERL_NIF_TERM xav_nif_raise(ErlNifEnv *env, char *msg); -ERL_NIF_TERM xav_nif_video_frame_to_term(ErlNifEnv *env, AVFrame *frame, unsigned char *data[], - int *linesize, const char *out_format_name); +ERL_NIF_TERM xav_nif_video_frame_to_term(ErlNifEnv *env, AVFrame *frame, uint8_t *out_data[4], + int out_linesize[4], const char *out_format); ERL_NIF_TERM xav_nif_audio_frame_to_term(ErlNifEnv *env, uint8_t **out_data, int out_samples, int out_size, const char *out_format, int pts); diff --git a/c_src/xav/video_converter.c b/c_src/xav/video_converter.c new file mode 100644 index 0000000..f048bc4 --- /dev/null +++ b/c_src/xav/video_converter.c @@ -0,0 +1,28 @@ +#include "video_converter.h" +#include "utils.h" + +int video_converter_convert(AVFrame *src_frame, uint8_t *out_data[], int out_linesize[]) { + int ret; + + struct SwsContext *sws_ctx = + sws_getContext(src_frame->width, src_frame->height, src_frame->format, src_frame->width, + src_frame->height, AV_PIX_FMT_RGB24, SWS_BILINEAR, NULL, NULL, NULL); + + ret = av_image_alloc(out_data, out_linesize, src_frame->width, src_frame->height, + AV_PIX_FMT_RGB24, 1); + + if (ret < 0) { + return ret; + } + + // is this (const uint8_t * const*) cast really correct? + ret = sws_scale(sws_ctx, (const uint8_t *const *)src_frame->data, src_frame->linesize, 0, + src_frame->height, out_data, out_linesize); + + if (ret < 0) { + av_freep(&out_data[0]); + return ret; + } + + return ret; +} diff --git a/c_src/xav/video_converter.h b/c_src/xav/video_converter.h new file mode 100644 index 0000000..28eec1d --- /dev/null +++ b/c_src/xav/video_converter.h @@ -0,0 +1,7 @@ + +#include +#include +#include +#include + +int video_converter_convert(AVFrame *src_frame, uint8_t *out_data[4], int out_linesize[4]); diff --git a/c_src/xav/xav_decoder.c b/c_src/xav/xav_decoder.c index e4a481f..99fc373 100644 --- a/c_src/xav/xav_decoder.c +++ b/c_src/xav/xav_decoder.c @@ -1,7 +1,11 @@ #include "xav_decoder.h" +#include "audio_converter.h" +#include "video_converter.h" ErlNifResourceType *xav_decoder_resource_type; +static int init_converter(struct XavDecoder *xav_decoder); + ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { if (argc != 1) { return xav_nif_raise(env, "invalid_arg_count"); @@ -39,6 +43,8 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { } ERL_NIF_TERM decode(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { + ERL_NIF_TERM frame_term; + if (argc != 4) { return xav_nif_raise(env, "invalid_arg_count"); } @@ -76,26 +82,78 @@ ERL_NIF_TERM decode(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { return xav_nif_raise(env, "failed_to_decode"); } - ERL_NIF_TERM frame_term; + // convert if (xav_decoder->decoder->media_type == AVMEDIA_TYPE_VIDEO) { + XAV_LOG_DEBUG("Converting video to RGB"); - frame_term = xav_nif_video_frame_to_term(env, xav_decoder->decoder->frame, - xav_decoder->decoder->frame_data, - xav_decoder->decoder->frame_linesize, "rgb"); + uint8_t *out_data[4]; + int out_linesize[4]; + ret = video_converter_convert(xav_decoder->decoder->frame, out_data, out_linesize); + if (ret <= 0) { + return xav_nif_raise(env, "failed_to_decode"); + } + + frame_term = xav_nif_video_frame_to_term(env, xav_decoder->decoder->frame, out_data, + out_linesize, "rgb"); + + av_freep(&out_data[0]); } else if (xav_decoder->decoder->media_type == AVMEDIA_TYPE_AUDIO) { - const char *out_format = - av_get_sample_fmt_name(xav_decoder->decoder->converter->out_sample_fmt); + XAV_LOG_DEBUG("Converting audio to desired out format"); + + uint8_t **out_data; + int out_samples; + int out_size; + + if (xav_decoder->converter == NULL) { + ret = init_converter(xav_decoder); + if (ret < 0) { + return ret; + } + } - frame_term = xav_nif_audio_frame_to_term( - env, xav_decoder->decoder->out_data, xav_decoder->decoder->out_samples, - xav_decoder->decoder->out_size, out_format, xav_decoder->decoder->frame->pts); + ret = converter_convert(xav_decoder->converter, xav_decoder->decoder->frame, &out_data, + &out_samples, &out_size); + if (ret < 0) { + return xav_nif_raise(env, "failed_to_decode"); + } + + const char *out_format = av_get_sample_fmt_name(xav_decoder->converter->out_sample_fmt); + + frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format, + xav_decoder->decoder->frame->pts); + + av_freep(&out_data[0]); } decoder_free_frame(xav_decoder->decoder); return xav_nif_ok(env, frame_term); - ; +} + +static int init_converter(struct XavDecoder *xav_decoder) { + xav_decoder->converter = converter_alloc(); + + if (xav_decoder->converter == NULL) { + XAV_LOG_DEBUG("Couldn't allocate converter"); + return -1; + } + + int out_sample_rate = xav_decoder->decoder->c->sample_rate; + enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT; + + struct ChannelLayout in_chlayout, out_chlayout; +#if LIBAVUTIL_VERSION_MAJOR >= 58 + in_chlayout.layout = xav_decoder->decoder->c->ch_layout; + out_chlayout.layout = xav_decoder->decoder->c->ch_layout; +#else + in_chlayout.layout = xav_decoder->decoder->c->channel_layout; + out_chlayout.layout = xav_decoder->decoder->c->channel_layout; +#endif + + return converter_init(xav_decoder->converter, in_chlayout, xav_decoder->decoder->c->sample_rate, + xav_decoder->decoder->c->sample_fmt, out_chlayout, out_sample_rate, + out_sample_fmt); } void free_xav_decoder(ErlNifEnv *env, void *obj) { diff --git a/c_src/xav/xav_decoder.h b/c_src/xav/xav_decoder.h index 82df1bc..956a8e3 100644 --- a/c_src/xav/xav_decoder.h +++ b/c_src/xav/xav_decoder.h @@ -1,4 +1,4 @@ -#include "converter.h" +#include "audio_converter.h" #include "decoder.h" struct XavDecoder { diff --git a/c_src/xav/xav_reader.c b/c_src/xav/xav_reader.c index 56f47df..5c0dc16 100644 --- a/c_src/xav/xav_reader.c +++ b/c_src/xav/xav_reader.c @@ -1,5 +1,7 @@ #include "xav_reader.h" +#include "video_converter.h" +static int init_converter(struct XavReader *xav_reader); ErlNifResourceType *xav_reader_resource_type; ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { @@ -75,6 +77,8 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { } ERL_NIF_TERM next_frame(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { + ERL_NIF_TERM frame_term; + if (argc != 1) { return xav_nif_raise(env, "invalid_arg_count"); } @@ -92,19 +96,48 @@ ERL_NIF_TERM next_frame(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { return xav_nif_raise(env, "receive_frame"); } - XAV_LOG_DEBUG("Returning to Erlang"); - - ERL_NIF_TERM frame_term; + // convert if (xav_reader->reader->media_type == AVMEDIA_TYPE_VIDEO) { - frame_term = xav_nif_video_frame_to_term( - env, xav_reader->reader->frame, xav_reader->reader->frame_data, - xav_reader->reader->frame_linesize, xav_reader->reader->out_format_name); - } else if (xav_reader->reader->media_type == AVMEDIA_TYPE_AUDIO) { - const char *out_format = av_get_sample_fmt_name(xav_reader->reader->converter->out_sample_fmt); + XAV_LOG_DEBUG("Converting video to RGB"); - frame_term = xav_nif_audio_frame_to_term( - env, xav_reader->reader->out_data, xav_reader->reader->out_samples, - xav_reader->reader->out_size, out_format, xav_reader->reader->frame->pts); + uint8_t *out_data[4]; + int out_linesize[4]; + + ret = video_converter_convert(xav_reader->reader->frame, out_data, out_linesize); + if (ret <= 0) { + return xav_nif_raise(env, "failed_to_read"); + } + + frame_term = + xav_nif_video_frame_to_term(env, xav_reader->reader->frame, out_data, out_linesize, "rgb"); + + av_freep(&out_data[0]); + } else if (xav_reader->reader->media_type == AVMEDIA_TYPE_AUDIO) { + XAV_LOG_DEBUG("Converting audio to desired out format"); + + uint8_t **out_data; + int out_samples; + int out_size; + + if (xav_reader->converter == NULL) { + XAV_LOG_DEBUG("Converter not initialized. Initializing."); + ret = init_converter(xav_reader); + if (ret < 0) { + return ret; + } + } + + ret = converter_convert(xav_reader->converter, xav_reader->reader->frame, &out_data, + &out_samples, &out_size); + if (ret < 0) { + return xav_nif_raise(env, "failed_to_read"); + } + + const char *out_format = av_get_sample_fmt_name(xav_reader->converter->out_sample_fmt); + + frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format, + xav_reader->reader->frame->pts); + av_freep(&out_data[0]); } reader_free_frame(xav_reader->reader); @@ -112,6 +145,46 @@ ERL_NIF_TERM next_frame(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) { return xav_nif_ok(env, frame_term); } +static int init_converter(struct XavReader *xav_reader) { + xav_reader->converter = converter_alloc(); + + if (xav_reader->converter == NULL) { + XAV_LOG_DEBUG("Couldn't allocate converter"); + return -1; + } + + int out_sample_rate = 16000; + enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT; + + struct ChannelLayout in_chlayout, out_chlayout; +#if LIBAVUTIL_VERSION_MAJOR >= 58 + in_chlayout.layout = xav_reader->reader->c->ch_layout; + av_channel_layout_from_mask(&out_chlayout.layout, AV_CH_LAYOUT_MONO); +#else + in_chlayout.layout = xav_reader->reader->frame->channel_layout; + out_chlayout.layout = AV_CH_LAYOUT_MONO; + + if (xav_reader->reader->frame->channel_layout == 0 && xav_reader->reader->frame->channels > 0) { + // In newer FFmpeg versions, 0 means that the order of channels is + // unspecified but there still might be information about channels number. + // Let's check againts it and take default channel order for the given channels number. + // This is also what newer FFmpeg versions do under the hood when passing + // unspecified channel order. + XAV_LOG_DEBUG("Channel layout unset. Setting to default for channels number: %d", + xav_reader->reader->frame->channels); + in_chlayout.layout = av_get_default_channel_layout(xav_reader->reader->frame->channels); + } else if (xav_reader->reader->frame->channel_layout == 0) { + XAV_LOG_DEBUG("Both channel layout and channels are unset. Cannot init converter."); + return -1; + } + +#endif + + return converter_init(xav_reader->converter, in_chlayout, xav_reader->reader->c->sample_rate, + xav_reader->reader->c->sample_fmt, out_chlayout, out_sample_rate, + out_sample_fmt); +} + void free_xav_reader(ErlNifEnv *env, void *obj) { XAV_LOG_DEBUG("Freeing XavReader object"); struct XavReader *xav_reader = (struct XavReader *)obj; diff --git a/c_src/xav/xav_reader.h b/c_src/xav/xav_reader.h index 081bc37..d1162d9 100644 --- a/c_src/xav/xav_reader.h +++ b/c_src/xav/xav_reader.h @@ -1,4 +1,4 @@ -#include "converter.h" +#include "audio_converter.h" #include "reader.h" struct XavReader { diff --git a/test/decoder_test.exs b/test/decoder_test.exs index df8fc4d..bc756a7 100644 --- a/test/decoder_test.exs +++ b/test/decoder_test.exs @@ -304,6 +304,7 @@ defmodule Xav.DecoderTest do Xav.Decoder.decode(decoder, @opus_frame) end + @tag :debug test "video keyframe" do decoder = Xav.Decoder.new(:vp8) diff --git a/test/reader_test.exs b/test/reader_test.exs index 1baeedc..943edb9 100644 --- a/test/reader_test.exs +++ b/test/reader_test.exs @@ -11,7 +11,6 @@ defmodule Xav.ReaderTest do assert_raise RuntimeError, fn -> Xav.Reader.new!("non_existing_input") end end - @tag :debug test "next_frame/1" do {:ok, r} = Xav.Reader.new("./test/fixtures/sample_h264.mp4") # the file has 30fps, try to read 5 seconds