Skip to content

Set AAC codec for audio in mp4 files, add transcoding utility #3956

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 52 additions & 36 deletions manim/scene/scene_file_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import json
import shutil
from fractions import Fraction
from pathlib import Path
from queue import Queue
from tempfile import NamedTemporaryFile
Expand Down Expand Up @@ -40,6 +41,38 @@
from manim.renderer.opengl_renderer import OpenGLRenderer


def to_av_frame_rate(fps):
epsilon1 = 1e-4
epsilon2 = 0.02
Comment on lines +45 to +46
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of curiosity, where did these numbers come from?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I should have added a comment about these epsilons.
pyav wants fractions but at this point in the code all we have is a float. Though, strictly speaking, every finite IEEE 754 float is a fraction, converting from floats to meaningful fractions is not obvious. I say meaningful because a binary float can only represent exactly those values whose denominator is a power of two, so something like (50 * 2 / 3) does not have an exact representation in binary. For (50 * 2 / 3), Fraction.from_float does not return Fraction(100, 3), as one would naïvely expect:

>>> from fractions import Fraction
>>> Fraction.from_float(50 * 2 / 3)
Fraction(4691249611844267, 140737488355328)

If a frame rate is obtained via some calculation, implicit float rounding rules may result in frame rate values like 49.999... instead of 50. Fraction.from_float returns:

>>> Fraction(50.0 - 2**-47)
Fraction(7036874417766399, 140737488355328)

Using this value rather than Fraction(50, 1) could lead to subtle issues.

I don't think this problem can be solved in general. I made the assumption that frame rates would be either whole values or whole-number multiples of 1000 / 1001 (because of NTSC) specified with two decimals.
I chose epsilon1 and epsilon2 such that:

  • the epsilon1 check accepts 59.99999... as 60 but rejects 59.94
  • the epsilon2 check accepts common multiples of (1000/1001)

If the need to support other fractions arises, this function will have to be revisited.


if isinstance(fps, int):
(num, denom) = (fps, 1)
elif abs(fps - round(fps)) < epsilon1:
(num, denom) = (round(fps), 1)
else:
denom = 1001
num = round(fps * denom / 1000) * 1000
if abs(fps - num / denom) >= epsilon2:
raise ValueError("invalid frame rate")

return Fraction(num, denom)


def convert_audio(input_path: Path, output_path: Path, codec_name: str):
    """Transcode the first audio stream of one file into another file.

    Parameters
    ----------
    input_path
        Location of the audio file to read.
    output_path
        Location the re-encoded audio is written to; it is opened in
        write mode.
    codec_name
        Name of the codec handed to ``add_stream`` for the output stream,
        e.g. ``"pcm_s16le"``, ``"libvorbis"`` or ``"aac"``.
    """
    with av.open(input_path) as source:
        with av.open(output_path, "w") as target:
            # Only the first audio stream of the input is converted.
            decoded_stream = source.streams.audio[0]
            encoder = target.add_stream(codec_name)

            for audio_frame in source.decode(decoded_stream):
                target.mux(encoder.encode(audio_frame))

            # Calling encode() without a frame drains the packets the
            # encoder is still holding back.
            target.mux(encoder.encode())


class SceneFileWriter:
"""
SceneFileWriter is the object that actually writes the animations
Expand Down Expand Up @@ -333,19 +366,7 @@ def add_sound(
# we need to pass delete=False to work on Windows
# TODO: figure out a way to cache the wav file generated (benchmark needed)
wav_file_path = NamedTemporaryFile(suffix=".wav", delete=False)
with (
av.open(file_path) as input_container,
av.open(wav_file_path, "w", format="wav") as output_container,
):
for audio_stream in input_container.streams.audio:
output_stream = output_container.add_stream("pcm_s16le")
for frame in input_container.decode(audio_stream):
for packet in output_stream.encode(frame):
output_container.mux(packet)

for packet in output_stream.encode():
output_container.mux(packet)

convert_audio(file_path, wav_file_path, "pcm_s16le")
new_segment = AudioSegment.from_file(wav_file_path.name)
logger.info(f"Automatically converted {file_path} to .wav")
wav_file_path.close()
Expand Down Expand Up @@ -506,9 +527,7 @@ def open_partial_movie_stream(self, file_path=None) -> None:
file_path = self.partial_movie_files[self.renderer.num_plays]
self.partial_movie_file_path = file_path

fps = config["frame_rate"]
if fps == int(fps): # fps is integer
fps = int(fps)
fps = to_av_frame_rate(config.frame_rate)

partial_movie_file_codec = "libx264"
partial_movie_file_pix_fmt = "yuv420p"
Expand All @@ -517,7 +536,7 @@ def open_partial_movie_stream(self, file_path=None) -> None:
"crf": "23", # ffmpeg: -crf, constant rate factor (improved bitrate)
}

if config.format == "webm":
if config.movie_file_extension == ".webm":
partial_movie_file_codec = "libvpx-vp9"
av_options["-auto-alt-ref"] = "1"
if config.transparent:
Expand All @@ -530,7 +549,7 @@ def open_partial_movie_stream(self, file_path=None) -> None:
with av.open(file_path, mode="w") as video_container:
stream = video_container.add_stream(
partial_movie_file_codec,
rate=config.frame_rate,
rate=fps,
options=av_options,
)
stream.pix_fmt = partial_movie_file_pix_fmt
Expand Down Expand Up @@ -622,7 +641,7 @@ def combine_files(
codec_name="gif" if create_gif else None,
template=partial_movies_stream if not create_gif else None,
)
if config.transparent and config.format == "webm":
if config.transparent and config.movie_file_extension == ".webm":
output_stream.pix_fmt = "yuva420p"
if create_gif:
"""
Expand All @@ -636,7 +655,7 @@ def combine_files(
output_stream.pix_fmt = "pal8"
output_stream.width = config.pixel_width
output_stream.height = config.pixel_height
output_stream.rate = config.frame_rate
output_stream.rate = to_av_frame_rate(config.frame_rate)
graph = av.filter.Graph()
input_buffer = graph.add_buffer(template=partial_movies_stream)
split = graph.add("split")
Expand All @@ -663,7 +682,8 @@ def combine_files(
while True:
try:
frame = graph.pull()
frame.time_base = output_stream.codec_context.time_base
if output_stream.codec_context.time_base is not None:
frame.time_base = output_stream.codec_context.time_base
frame.pts = frames_written
frames_written += 1
output_container.mux(output_stream.encode(frame))
Expand Down Expand Up @@ -704,6 +724,7 @@ def combine_to_movie(self):
movie_file_path = self.movie_file_path
if is_gif_format():
movie_file_path = self.gif_file_path

if len(partial_movie_files) == 0: # Prevent calling concat on empty list
logger.info("No animations are contained in this scene.")
return
Expand Down Expand Up @@ -732,21 +753,16 @@ def combine_to_movie(self):
# but tries to call ffmpeg via its CLI -- which we want
# to avoid. This is why we need to do the conversion
# manually.
if config.format == "webm":
with (
av.open(sound_file_path) as wav_audio,
av.open(sound_file_path.with_suffix(".ogg"), "w") as opus_audio,
):
wav_audio_stream = wav_audio.streams.audio[0]
opus_audio_stream = opus_audio.add_stream("libvorbis")
for frame in wav_audio.decode(wav_audio_stream):
for packet in opus_audio_stream.encode(frame):
opus_audio.mux(packet)

for packet in opus_audio_stream.encode():
opus_audio.mux(packet)

sound_file_path = sound_file_path.with_suffix(".ogg")
if config.movie_file_extension == ".webm":
ogg_sound_file_path = sound_file_path.with_suffix(".ogg")
convert_audio(sound_file_path, ogg_sound_file_path, "libvorbis")
sound_file_path = ogg_sound_file_path
elif config.movie_file_extension == ".mp4":
# Similarly, pyav may reject wav audio in an .mp4 file;
# convert to AAC.
aac_sound_file_path = sound_file_path.with_suffix(".aac")
convert_audio(sound_file_path, aac_sound_file_path, "aac")
sound_file_path = aac_sound_file_path

temp_file_path = movie_file_path.with_name(
f"{movie_file_path.stem}_temp{movie_file_path.suffix}"
Expand Down
10 changes: 10 additions & 0 deletions tests/test_scene_rendering/test_file_writer.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import sys
from fractions import Fraction
from pathlib import Path

import av
import numpy as np
import pytest

from manim import DR, Circle, Create, Scene, Star, tempconfig
from manim.scene.scene_file_writer import to_av_frame_rate
from manim.utils.commands import capture, get_video_metadata


Expand Down Expand Up @@ -175,3 +177,11 @@ def test_unicode_partial_movie(tmpdir, simple_scenes_path):

_, err, exit_code = capture(command)
assert exit_code == 0, err


def test_frame_rates():
    """Check that ``to_av_frame_rate`` maps sample rates to the expected fractions."""
    # (input frame rate, fraction the conversion is expected to produce)
    expected_rates = [
        (25, Fraction(25, 1)),
        (24.0, Fraction(24, 1)),
        (23.976, Fraction(24 * 1000, 1001)),
        (23.98, Fraction(24 * 1000, 1001)),
        (59.94, Fraction(60 * 1000, 1001)),
    ]
    for fps, expected in expected_rates:
        assert to_av_frame_rate(fps) == expected
Loading