Add spatial audio support for more sound sample descriptions

kodawah · Feb 8, 2016 · d328cfa · d328cfa
1 parent fdb852a
commit d328cfa
Show file tree

Hide file tree

Showing 4 changed files with 122 additions and 39 deletions.
diff --git a/spatialmedia/gui.py b/spatialmedia/gui.py
@@ -85,8 +85,7 @@ def action_open(self):
 
         infile = os.path.abspath(self.in_file)
         file_extension = os.path.splitext(infile)[1].lower()
-        self.enable_spatial_audio =\
-            True if (file_extension == ".mp4") else False
+        self.enable_spatial_audio = parsed_metadata.num_audio_channels == 4
 
         if not metadata:
             self.var_spherical.set(0)
@@ -226,21 +225,6 @@ def create_widgets(self):
         self.checkbox_spherical["command"] = self.action_set_spherical
         self.checkbox_spherical.grid(row=row, column=column, padx=14, pady=2)
 
-        # Spatial Audio Checkbox
-        row += 1
-        column = 0
-        self.label_spatial_audio = Label(self)
-        self.label_spatial_audio["text"] = "Spatial Audio"
-        self.label_spatial_audio.grid(row=row, column=column)
-
-        column += 1
-        self.var_spatial_audio = IntVar()
-        self.checkbox_spatial_audio = \
-            Checkbutton(self, variable=self.var_spatial_audio)
-        self.checkbox_spatial_audio["command"] = self.action_set_spatial_audio
-        self.checkbox_spatial_audio.grid(
-            row=row, column=column, padx=0, pady=0)
-
         # 3D
         column = 0
         row = row + 1
@@ -266,6 +250,26 @@ def create_widgets(self):
         self.options_projection["text"] = "Equirectangular"
         self.options_projection.grid(row=row, column=column, padx=14, pady=2)
 
+        # Spherical / Spatial Audio Separator
+        row += 1
+        separator = Frame(self, relief=GROOVE, bd=1, height=2, bg="white")
+        separator.grid(columnspan=row, padx=14, pady=4, sticky=N+E+S+W)
+
+        # Spatial Audio Checkbox
+        row += 1
+        column = 0
+        self.label_spatial_audio = Label(self)
+        self.label_spatial_audio["text"] = "Spatial Audio"
+        self.label_spatial_audio.grid(row=row, column=column)
+
+        column += 1
+        self.var_spatial_audio = IntVar()
+        self.checkbox_spatial_audio = \
+            Checkbutton(self, variable=self.var_spatial_audio)
+        self.checkbox_spatial_audio["command"] = self.action_set_spatial_audio
+        self.checkbox_spatial_audio.grid(
+            row=row, column=column, padx=0, pady=0)
+
         # Ambisonics Type
         column = 0
         row = row + 1

diff --git a/spatialmedia/metadata_utils.py b/spatialmedia/metadata_utils.py
@@ -102,6 +102,7 @@ class ParsedMetadata(object):
     def __init__(self):
         self.video = dict()
         self.audio = None
+        self.num_audio_channels = 0
 
 SPHERICAL_PREFIX = "{http://ns.google.com/videos/1.0/spherical/}"
 SPHERICAL_TAGS = dict()
@@ -211,19 +212,20 @@ def inject_spatial_audio_atom(
                 if sub_element.name != mpeg.constants.TAG_STSD:
                     continue
                 for sample_description in sub_element.contents:
-                    if sample_description.name == mpeg.constants.TAG_MP4A:
+                    if sample_description.name in\
+                            mpeg.constants.SOUND_SAMPLE_DESCRIPTIONS:
                         in_fh.seek(sample_description.position +
                                    sample_description.header_size + 16)
                         num_channels = get_num_audio_channels(
-                            sample_description, in_fh)
+                            sub_element, in_fh)
                         num_ambisonic_components = \
                             get_expected_num_audio_components(
                                 audio_metadata["ambisonic_type"],
                                 audio_metadata["ambisonic_order"])
                         if num_channels != num_ambisonic_components:
-                            err_msg =  "Error: Found %d audio channel(s). "\
+                            err_msg = "Error: Found %d audio channel(s). "\
                                   "Expected %d channel(s) for %s ambisonics "\
-                                  "of orded %d."\
+                                  "of order %d."\
                                 % (num_channels,
                                    num_ambisonic_components,
                                    audio_metadata["ambisonic_type"],
@@ -318,10 +320,13 @@ def parse_spherical_mpeg4(mpeg4_file, fh, console):
                         for stsd_elem in stbl_elem.contents:
                             if stsd_elem.name != mpeg.constants.TAG_STSD:
                                 continue
-                            for mp4a_elem in stsd_elem.contents:
-                                if mp4a_elem.name != mpeg.constants.TAG_MP4A:
+                            metadata.num_audio_channels = get_num_audio_channels(
+                                    stsd_elem, fh)
+                            for sa3d_container_elem in stsd_elem.contents:
+                                if sa3d_container_elem.name not in\
+                                        mpeg.constants.SOUND_SAMPLE_DESCRIPTIONS:
                                     continue
-                                for sa3d_elem in mp4a_elem.contents:
+                                for sa3d_elem in sa3d_container_elem.contents:
                                     if sa3d_elem.name == mpeg.constants.TAG_SA3D:
                                         sa3d_elem.print_box(console)
                                         metadata.audio = sa3d_elem
@@ -406,9 +411,6 @@ def inject_metadata(src, dest, metadata, console):
     extension = os.path.splitext(infile)[1].lower()
 
     if (extension in MPEG_FILE_EXTENSIONS):
-        if (metadata.audio and extension != ".mp4"):
-            error("Error: Spatial audio current not supported for %s ." %
-                  extension)
         inject_mpeg4(infile, outfile, metadata, console)
         return
 
@@ -510,8 +512,53 @@ def get_expected_num_audio_components(ambisonics_type, ambisonics_order):
     else:
         return -1
 
+def get_num_audio_channels(stsd, in_fh):
+    if stsd.name != mpeg.constants.TAG_STSD:
+        print "get_num_audio_channels should be given a STSD box"
+        return -1
+    for sample_description in stsd.contents:
+        if sample_description.name == mpeg.constants.TAG_MP4A:
+            return get_aac_num_channels(sample_description, in_fh)
+        elif sample_description.name in mpeg.constants.SOUND_SAMPLE_DESCRIPTIONS:
+            return get_sample_description_num_channels(sample_description, in_fh)
+    return -1
+
+def get_sample_description_num_channels(sample_description, in_fh):
+    """Reads the number of audio channels from a sound sample description.
+    """
+    p = in_fh.tell()
+    in_fh.seek(sample_description.content_start() + 8)
+
+    version = struct.unpack(">h", in_fh.read(2))[0]
+    revision_level = struct.unpack(">h", in_fh.read(2))[0]
+    vendor = struct.unpack(">i", in_fh.read(4))[0]
+    if version == 0:
+        num_audio_channels = struct.unpack(">h", in_fh.read(2))[0]
+        sample_size_bytes = struct.unpack(">h", in_fh.read(2))[0]
+    elif version == 1:
+        num_audio_channels = struct.unpack(">h", in_fh.read(2))[0]
+        sample_size_bytes = struct.unpack(">h", in_fh.read(2))[0]
+        samples_per_packet = struct.unpack(">i", in_fh.read(4))[0]
+        bytes_per_packet = struct.unpack(">i", in_fh.read(4))[0]
+        bytes_per_frame = struct.unpack(">i", in_fh.read(4))[0]
+        bytes_per_sample = struct.unpack(">i", in_fh.read(4))[0]
+    elif version == 2:
+        always_3 = struct.unpack(">h", in_fh.read(2))[0]
+        always_16 = struct.unpack(">h", in_fh.read(2))[0]
+        always_minus_2 = struct.unpack(">h", in_fh.read(2))[0]
+        always_0 = struct.unpack(">h", in_fh.read(2))[0]
+        always_65536 = struct.unpack(">i", in_fh.read(4))[0]
+        size_of_struct_only = struct.unpack(">i", in_fh.read(4))[0]
+        audio_sample_rate = struct.unpack(">d", in_fh.read(8))[0]
+        num_audio_channels = struct.unpack(">i", in_fh.read(4))[0]
+    else:
+        print "Unsupported version for " + sample_description.name + " box"
+        return -1
+
+    in_fh.seek(p)
+    return num_audio_channels
 
-def get_num_audio_channels(mp4a_atom, in_fh):
+def get_aac_num_channels(mp4a_atom, in_fh):
     """Reads the number of audio channels from AAC's AudioSpecificConfig
        descriptor within the esds child atom of the input mp4a atom.
     """

diff --git a/spatialmedia/mpeg/constants.py b/spatialmedia/mpeg/constants.py
@@ -29,27 +29,55 @@
 TAG_FTYP = "ftyp"
 TAG_ESDS = "esds"
 TAG_SOUN = "soun"
+TAG_SA3D = "SA3D"
 
 # Container types.
 TAG_MOOV = "moov"
 TAG_UDTA = "udta"
 TAG_META = "meta"
 TAG_TRAK = "trak"
 TAG_MDIA = "mdia"
-TAG_MP4A = "mp4a"
 TAG_MINF = "minf"
 TAG_STBL = "stbl"
 TAG_STSD = "stsd"
 TAG_UUID = "uuid"
-TAG_SA3D = "SA3D"
 
-CONTAINERS_LIST = [
+# Sound sample descriptions.
+TAG_NONE = "NONE"
+TAG_RAW_ = "raw "
+TAG_TWOS = "twos"
+TAG_SOWT = "sowt"
+TAG_FL32 = "fl32"
+TAG_FL64 = "fl64"
+TAG_IN24 = "in24"
+TAG_IN32 = "in32"
+TAG_ULAW = "ulaw"
+TAG_ALAW = "alaw"
+TAG_LPCM = "lpcm"
+TAG_MP4A = "mp4a"
+
+SOUND_SAMPLE_DESCRIPTIONS = frozenset([
+    TAG_NONE,
+    TAG_RAW_,
+    TAG_TWOS,
+    TAG_SOWT,
+    TAG_FL32,
+    TAG_FL64,
+    TAG_IN24,
+    TAG_IN32,
+    TAG_ULAW,
+    TAG_ALAW,
+    TAG_LPCM,
+    TAG_MP4A,
+    ])
+
+CONTAINERS_LIST = frozenset([
     TAG_MDIA,
     TAG_MINF,
-    TAG_MP4A,
     TAG_MOOV,
     TAG_STBL,
     TAG_STSD,
     TAG_TRAK,
     TAG_UDTA,
-    ]
+    ]).union(SOUND_SAMPLE_DESCRIPTIONS)
+
diff --git a/spatialmedia/mpeg/container.py b/spatialmedia/mpeg/container.py
@@ -55,20 +55,24 @@ def load(fh, position, end):
         return None
 
     padding = 0
-    stsd_version = 0
-    if (name == constants.TAG_STSD):
+    if name == constants.TAG_STSD:
         padding = 8
 
-    if (name == constants.TAG_MP4A):
+    if name in constants.SOUND_SAMPLE_DESCRIPTIONS:
         current_pos = fh.tell()
         fh.seek(current_pos + 8)
         sample_description_version = struct.unpack(">h", fh.read(2))[0]
         fh.seek(current_pos)
 
-        if sample_description_version == 1:
-          padding = 28+16 # Mov
+        if sample_description_version == 0:
+            padding = 28
+        elif sample_description_version == 1:
+            padding = 28 + 16
+        elif sample_description_version == 2:
+            padding = 56
         else:
-          padding = 28 # Mp4
+            print("Unsupported sample description version:",
+                  sample_description_version)
 
     new_box = Container()
     new_box.name = name