wenet-e2e · robin1001 · May 8, 2024 · May 8, 2024
diff --git a/wenet/transformer/encoder.py b/wenet/transformer/encoder.py
@@ -157,12 +157,17 @@ def forward(
             xs = self.global_cmvn(xs)
         xs, pos_emb, masks = self.embed(xs, masks)
         mask_pad = masks  # (B, 1, T/subsample_rate)
-        chunk_masks = add_optional_chunk_mask(xs, masks,
-                                              self.use_dynamic_chunk,
-                                              self.use_dynamic_left_chunk,
-                                              decoding_chunk_size,
-                                              self.static_chunk_size,
-                                              num_decoding_left_chunks)
+        chunk_masks = add_optional_chunk_mask(
+            xs,
+            masks,
+            self.use_dynamic_chunk,
+            self.use_dynamic_left_chunk,
+            decoding_chunk_size,
+            self.static_chunk_size,
+            num_decoding_left_chunks,
+            # Since we allow up to 1s(100 frames) delay, the maximum
+            # chunk_size is 100 / subsampling_rate (e.g. 100 / 4 = 25).
+            max_chunk_size=int(100.0 / self.embed.subsampling_rate))
         if self.use_sdpa:
             chunk_masks = mask_to_bias(chunk_masks, xs.dtype)
         if self.gradient_checkpointing and self.training:

diff --git a/wenet/utils/mask.py b/wenet/utils/mask.py
@@ -130,7 +130,8 @@ def add_optional_chunk_mask(xs: torch.Tensor,
                             decoding_chunk_size: int,
                             static_chunk_size: int,
                             num_decoding_left_chunks: int,
-                            enable_full_context: bool = True):
+                            enable_full_context: bool = True,
+                            max_chunk_size: int = 25):
     """ Apply optional mask for encoder.
 
     Args:
@@ -151,8 +152,8 @@ def add_optional_chunk_mask(xs: torch.Tensor,
             >=0: use num_decoding_left_chunks
             <0: use all left chunks
         enable_full_context (bool):
-            True: chunk size is either [1, 25] or full context(max_len)
-            False: chunk size ~ U[1, 25]
+            True: chunk size is either [1, max_chunk_size] or full context(max_len)
+            False: chunk size ~ U[1, max_chunk_size]
 
     Returns:
         torch.Tensor: chunk mask of the input xs.
@@ -167,15 +168,15 @@ def add_optional_chunk_mask(xs: torch.Tensor,
             chunk_size = decoding_chunk_size
             num_left_chunks = num_decoding_left_chunks
         else:
-            # chunk size is either [1, 25] or full context(max_len).
+            # chunk size is either [1, max_chunk_size] or full context(max_len).
             # Since we use 4 times subsampling and allow up to 1s(100 frames)
             # delay, the maximum frame is 100 / 4 = 25.
             chunk_size = torch.randint(1, max_len, (1, )).item()
             num_left_chunks = -1
             if chunk_size > max_len // 2 and enable_full_context:
                 chunk_size = max_len
             else:
-                chunk_size = chunk_size % 25 + 1
+                chunk_size = chunk_size % max_chunk_size + 1
                 if use_dynamic_left_chunk:
                     max_left_chunks = (max_len - 1) // chunk_size
                     num_left_chunks = torch.randint(0, max_left_chunks,