// BumbleCee/tests/ffmpeg_any_to_opus.cc

#include "precomp.h"

// Number of samples per channel handed to the Opus encoder in each full frame.
#define OPUS_FRAME_SIZE 960  // 20 ms at 48 kHz

int main() {
  const char* input_filename = "golden.webm";
  const char* output_filename = "output.opus";

  AVFormatContext* fmt_ctx = NULL;
  AVCodecContext* dec_ctx = NULL;
  AVCodecContext* enc_ctx = NULL;
  const AVCodec* decoder = NULL;
  const AVCodec* encoder = NULL;
  AVPacket* packet = NULL;
  AVFrame* frame = NULL;
  AVFrame* enc_frame = NULL;
  SwrContext* swr_ctx = NULL;
  FILE* outfile = NULL;

  av_log_set_level(AV_LOG_ERROR);

  if (avformat_open_input(&fmt_ctx, input_filename, NULL, NULL) < 0) {
    fprintf(stderr, "Could not open input file\n");
    return -1;
  }
  if (avformat_find_stream_info(fmt_ctx, NULL) < 0) {
    fprintf(stderr, "Could not find stream info\n");
    return -1;
  }

  int stream_index = -1;
  for (unsigned i = 0; i < fmt_ctx->nb_streams; i++) {
    if (fmt_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
      stream_index = i;
      break;
    }
  }
  if (stream_index == -1) {
    fprintf(stderr, "No audio stream found\n");
    return -1;
  }

  decoder =
      avcodec_find_decoder(fmt_ctx->streams[stream_index]->codecpar->codec_id);
  if (!decoder) {
    fprintf(stderr, "Decoder not found\n");
    return -1;
  }
  dec_ctx = avcodec_alloc_context3(decoder);
  avcodec_parameters_to_context(dec_ctx,
                                fmt_ctx->streams[stream_index]->codecpar);
  avcodec_open2(dec_ctx, decoder, NULL);

  encoder = avcodec_find_encoder(AV_CODEC_ID_OPUS);
  if (!encoder) {
    fprintf(stderr, "Opus encoder not found\n");
    return -1;
  }
  enc_ctx = avcodec_alloc_context3(encoder);

  AVChannelLayout enc_layout;
  av_channel_layout_default(&enc_layout, 2);  // 스테레오
  av_channel_layout_copy(&enc_ctx->ch_layout, &enc_layout);

  enc_ctx->sample_rate = 48000;
  enc_ctx->sample_fmt = AV_SAMPLE_FMT_FLT;
  enc_ctx->bit_rate = 128000;

  avcodec_open2(enc_ctx, encoder, NULL);

  swr_ctx = NULL;
  if (swr_alloc_set_opts2(&swr_ctx, &enc_ctx->ch_layout, enc_ctx->sample_fmt,
                          enc_ctx->sample_rate, &dec_ctx->ch_layout,
                          dec_ctx->sample_fmt, dec_ctx->sample_rate, 0,
                          NULL) < 0) {
    fprintf(stderr, "Failed to allocate SwrContext\n");
    return -1;
  }
  swr_init(swr_ctx);

  packet = av_packet_alloc();
  frame = av_frame_alloc();
  enc_frame = av_frame_alloc();

  outfile = fopen(output_filename, "wb");
  if (!outfile) {
    fprintf(stderr, "Could not open output file\n");
    return -1;
  }

  // 임시 PCM 버퍼 (float, 스테레오)
  float* pcm_buffer = (float*)malloc(sizeof(float) * 2 * OPUS_FRAME_SIZE *
                                     4);  // 충분히 큰 버퍼
  int buffered_samples = 0;

  while (av_read_frame(fmt_ctx, packet) >= 0) {
    if (packet->stream_index != stream_index) {
      av_packet_unref(packet);
      continue;
    }

    avcodec_send_packet(dec_ctx, packet);
    while (avcodec_receive_frame(dec_ctx, frame) == 0) {
      int max_out = av_rescale_rnd(
          swr_get_delay(swr_ctx, dec_ctx->sample_rate) + frame->nb_samples,
          enc_ctx->sample_rate, dec_ctx->sample_rate, AV_ROUND_UP);

      uint8_t** out_data = NULL;
      int out_linesize = 0;
      av_samples_alloc_array_and_samples(&out_data, &out_linesize, 2, max_out,
                                         enc_ctx->sample_fmt, 0);

      int converted =
          swr_convert(swr_ctx, out_data, max_out, (const uint8_t**)frame->data,
                      frame->nb_samples);

      // float PCM으로 임시 버퍼에 추가
      memcpy(pcm_buffer + buffered_samples * 2, out_data[0],
             converted * 2 * sizeof(float));
      buffered_samples += converted;

      av_freep(&out_data[0]);
      free(out_data);

      // OPUS_FRAME_SIZE 단위로 인코딩
      while (buffered_samples >= OPUS_FRAME_SIZE) {
        enc_frame->nb_samples = OPUS_FRAME_SIZE;
        enc_frame->format = enc_ctx->sample_fmt;
        enc_frame->sample_rate = enc_ctx->sample_rate;
        av_channel_layout_copy(&enc_frame->ch_layout, &enc_ctx->ch_layout);
        enc_frame->data[0] = (uint8_t*)pcm_buffer;

        AVPacket* out_pkt = av_packet_alloc();
        avcodec_send_frame(enc_ctx, enc_frame);
        while (avcodec_receive_packet(enc_ctx, out_pkt) == 0) {
          fwrite(out_pkt->data, 1, out_pkt->size, outfile);
          av_packet_unref(out_pkt);
        }
        av_packet_free(&out_pkt);

        // 버퍼 이동
        memmove(pcm_buffer, pcm_buffer + OPUS_FRAME_SIZE * 2,
                (buffered_samples - OPUS_FRAME_SIZE) * 2 * sizeof(float));
        buffered_samples -= OPUS_FRAME_SIZE;
      }
    }
    av_packet_unref(packet);
  }

  // 디코더 플러시
  avcodec_send_packet(dec_ctx, NULL);
  while (avcodec_receive_frame(dec_ctx, frame) == 0) {
    int max_out = av_rescale_rnd(
        swr_get_delay(swr_ctx, dec_ctx->sample_rate) + frame->nb_samples,
        enc_ctx->sample_rate, dec_ctx->sample_rate, AV_ROUND_UP);

    uint8_t** out_data = NULL;
    int out_linesize = 0;
    av_samples_alloc_array_and_samples(&out_data, &out_linesize, 2, max_out,
                                       enc_ctx->sample_fmt, 0);

    int converted =
        swr_convert(swr_ctx, out_data, max_out, (const uint8_t**)frame->data,
                    frame->nb_samples);

    memcpy(pcm_buffer + buffered_samples * 2, out_data[0],
           converted * 2 * sizeof(float));
    buffered_samples += converted;

    av_freep(&out_data[0]);
    free(out_data);

    while (buffered_samples >= OPUS_FRAME_SIZE) {
      enc_frame->nb_samples = OPUS_FRAME_SIZE;
      enc_frame->format = enc_ctx->sample_fmt;
      enc_frame->sample_rate = enc_ctx->sample_rate;
      av_channel_layout_copy(&enc_frame->ch_layout, &enc_ctx->ch_layout);
      enc_frame->data[0] = (uint8_t*)pcm_buffer;

      AVPacket* out_pkt = av_packet_alloc();
      avcodec_send_frame(enc_ctx, enc_frame);
      while (avcodec_receive_packet(enc_ctx, out_pkt) == 0) {
        fwrite(out_pkt->data, 1, out_pkt->size, outfile);
        av_packet_unref(out_pkt);
      }
      av_packet_free(&out_pkt);

      memmove(pcm_buffer, pcm_buffer + OPUS_FRAME_SIZE * 2,
              (buffered_samples - OPUS_FRAME_SIZE) * 2 * sizeof(float));
      buffered_samples -= OPUS_FRAME_SIZE;
    }
  }

  // 마지막 남은 샘플 인코딩
  if (buffered_samples > 0) {
    enc_frame->nb_samples = buffered_samples;
    enc_frame->format = enc_ctx->sample_fmt;
    enc_frame->sample_rate = enc_ctx->sample_rate;
    av_channel_layout_copy(&enc_frame->ch_layout, &enc_ctx->ch_layout);
    enc_frame->data[0] = (uint8_t*)pcm_buffer;

    AVPacket* out_pkt = av_packet_alloc();
    avcodec_send_frame(enc_ctx, enc_frame);
    while (avcodec_receive_packet(enc_ctx, out_pkt) == 0) {
      fwrite(out_pkt->data, 1, out_pkt->size, outfile);
      av_packet_unref(out_pkt);
    }
    av_packet_free(&out_pkt);
  }

  // 인코더 플러시
  avcodec_send_frame(enc_ctx, NULL);
  AVPacket* out_pkt = av_packet_alloc();
  while (avcodec_receive_packet(enc_ctx, out_pkt) == 0) {
    fwrite(out_pkt->data, 1, out_pkt->size, outfile);
    av_packet_unref(out_pkt);
  }
  av_packet_free(&out_pkt);

  fclose(outfile);
  free(pcm_buffer);
  swr_free(&swr_ctx);
  av_frame_free(&frame);
  av_frame_free(&enc_frame);
  av_packet_free(&packet);
  avcodec_free_context(&dec_ctx);
  avcodec_free_context(&enc_ctx);
  avformat_close_input(&fmt_ctx);

  printf("Encoding finished: %s\n", output_filename);
  return 0;
}