diff --git a/configs/yolov10/hyp.scratch.high.yaml b/configs/yolov10/hyp.scratch.high.yaml
index 46b1bd61..616cb079 100644
--- a/configs/yolov10/hyp.scratch.high.yaml
+++ b/configs/yolov10/hyp.scratch.high.yaml
@@ -21,7 +21,7 @@ loss:
   reg_max: 16
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   # multi-stage data augment
   train_transforms: {
diff --git a/configs/yolov10/hyp.scratch.low.yaml b/configs/yolov10/hyp.scratch.low.yaml
index 13d55edd..66b6b767 100644
--- a/configs/yolov10/hyp.scratch.low.yaml
+++ b/configs/yolov10/hyp.scratch.low.yaml
@@ -21,7 +21,7 @@ loss:
   reg_max: 16
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   # multi-stage data augment
   train_transforms: {
diff --git a/configs/yolov10/hyp.scratch.med.yaml b/configs/yolov10/hyp.scratch.med.yaml
index 24a89eaf..aeed2f70 100644
--- a/configs/yolov10/hyp.scratch.med.yaml
+++ b/configs/yolov10/hyp.scratch.med.yaml
@@ -21,7 +21,7 @@ loss:
   reg_max: 16
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   # multi-stage data augment
   train_transforms: {
diff --git a/configs/yolov3/hyp.scratch.yaml b/configs/yolov3/hyp.scratch.yaml
index c9ce0077..27d01be9 100644
--- a/configs/yolov3/hyp.scratch.yaml
+++ b/configs/yolov3/hyp.scratch.yaml
@@ -26,7 +26,7 @@ loss:
   label_smoothing: 0.0 # label smoothing epsilon
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   train_transforms:
     - { func_name: mosaic, prob: 1.0 }
diff --git a/configs/yolov4/hyp.scratch.yaml b/configs/yolov4/hyp.scratch.yaml
index 0e24de88..76cd0d11 100644
--- a/configs/yolov4/hyp.scratch.yaml
+++ b/configs/yolov4/hyp.scratch.yaml
@@ -23,7 +23,7 @@ loss:
   label_smoothing: 0.0 # label smoothing epsilon
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   train_transforms:
     - { func_name: mosaic, prob: 1.0 }
diff --git a/configs/yolov5/hyp.scratch-high.yaml b/configs/yolov5/hyp.scratch-high.yaml
index ef7de65c..9641321d 100644
--- a/configs/yolov5/hyp.scratch-high.yaml
+++ b/configs/yolov5/hyp.scratch-high.yaml
@@ -30,7 +30,7 @@ loss:
   label_smoothing: 0.0 # label smoothing epsilon
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   train_transforms:
     - { func_name: mosaic, prob: 1.0 }
diff --git a/configs/yolov5/hyp.scratch-low.yaml b/configs/yolov5/hyp.scratch-low.yaml
index ac0758e0..f6e5916f 100644
--- a/configs/yolov5/hyp.scratch-low.yaml
+++ b/configs/yolov5/hyp.scratch-low.yaml
@@ -26,7 +26,7 @@ loss:
   label_smoothing: 0.0 # label smoothing epsilon
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   train_transforms:
     - { func_name: mosaic, prob: 1.0 }
diff --git a/configs/yolov7/hyp.scratch.p5.yaml b/configs/yolov7/hyp.scratch.p5.yaml
index 54dec978..2e607671 100644
--- a/configs/yolov7/hyp.scratch.p5.yaml
+++ b/configs/yolov7/hyp.scratch.p5.yaml
@@ -25,7 +25,7 @@ loss:
   label_smoothing: 0.0 # label smoothing epsilon
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   train_transforms:
     - { func_name: mosaic, prob: 1.0, mosaic9_prob: 0.2 }
diff --git a/configs/yolov7/hyp.scratch.p6.yaml b/configs/yolov7/hyp.scratch.p6.yaml
index 2ab423da..80f98f2e 100644
--- a/configs/yolov7/hyp.scratch.p6.yaml
+++ b/configs/yolov7/hyp.scratch.p6.yaml
@@ -25,7 +25,7 @@ loss:
   label_smoothing: 0.0 # label smoothing epsilon
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   train_transforms:
     - { func_name: mosaic, prob: 1.0, mosaic9_prob: 0.2 }
diff --git a/configs/yolov7/hyp.scratch.tiny.yaml b/configs/yolov7/hyp.scratch.tiny.yaml
index b77ba4d4..ba5fa9b1 100644
--- a/configs/yolov7/hyp.scratch.tiny.yaml
+++ b/configs/yolov7/hyp.scratch.tiny.yaml
@@ -25,7 +25,7 @@ loss:
   label_smoothing: 0.0 # label smoothing epsilon
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   train_transforms:
     - { func_name: mosaic, prob: 1.0, mosaic9_prob: 0.2 }
diff --git a/configs/yolov8/hyp.scratch.high.yaml b/configs/yolov8/hyp.scratch.high.yaml
index 1d66833b..cb75544c 100644
--- a/configs/yolov8/hyp.scratch.high.yaml
+++ b/configs/yolov8/hyp.scratch.high.yaml
@@ -21,7 +21,7 @@ loss:
   reg_max: 16
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   # multi-stage data augment
   train_transforms: {
diff --git a/configs/yolov8/hyp.scratch.low.yaml b/configs/yolov8/hyp.scratch.low.yaml
index ba309b96..db63bb6e 100644
--- a/configs/yolov8/hyp.scratch.low.yaml
+++ b/configs/yolov8/hyp.scratch.low.yaml
@@ -23,7 +23,7 @@ loss:
   reg_max: 16
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   # multi-stage data augment
   train_transforms: {
diff --git a/configs/yolov8/hyp.scratch.med.yaml b/configs/yolov8/hyp.scratch.med.yaml
index e810f8b9..be5f5d8c 100644
--- a/configs/yolov8/hyp.scratch.med.yaml
+++ b/configs/yolov8/hyp.scratch.med.yaml
@@ -21,7 +21,7 @@ loss:
   reg_max: 16
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   # multi-stage data augment
   train_transforms: {
diff --git a/configs/yolov8/seg/hyp.scratch.high.seg.yaml b/configs/yolov8/seg/hyp.scratch.high.seg.yaml
index 09a3e8c1..bf18f747 100644
--- a/configs/yolov8/seg/hyp.scratch.high.seg.yaml
+++ b/configs/yolov8/seg/hyp.scratch.high.seg.yaml
@@ -26,7 +26,7 @@ loss:
   max_object_num: 600
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   train_transforms: {
     stage_epochs: [ 290, 10 ],
diff --git a/configs/yolov9/hyp.scratch.high.yaml b/configs/yolov9/hyp.scratch.high.yaml
index 682afac0..cb17c06b 100644
--- a/configs/yolov9/hyp.scratch.high.yaml
+++ b/configs/yolov9/hyp.scratch.high.yaml
@@ -21,7 +21,7 @@ loss:
   reg_max: 16
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   # multi-stage data augment
   train_transforms: {
diff --git a/configs/yolox/hyp.scratch.yaml b/configs/yolox/hyp.scratch.yaml
index d60ac33d..d67d7a13 100644
--- a/configs/yolox/hyp.scratch.yaml
+++ b/configs/yolox/hyp.scratch.yaml
@@ -33,7 +33,7 @@ img_size: 640
 sync_bn: False
 
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   train_transforms: {
     stage_epochs: [ 285, 15 ],
diff --git a/docs/en/tutorials/configuration.md b/docs/en/tutorials/configuration.md
index 2c087002..de1ca755 100644
--- a/docs/en/tutorials/configuration.md
+++ b/docs/en/tutorials/configuration.md
@@ -95,7 +95,7 @@ This part of the parameters is defined in [configs/yolov3/hyp.scratch.yaml](http
 
 ```yaml
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   train_transforms:
     - { func_name: mosaic, prob: 1.0, mosaic9_prob: 0.0, translate: 0.1, scale: 0.9 }
diff --git a/docs/zh/tutorials/configuration.md b/docs/zh/tutorials/configuration.md
index 42f53206..fa376b77 100644
--- a/docs/zh/tutorials/configuration.md
+++ b/docs/zh/tutorials/configuration.md
@@ -92,7 +92,7 @@ data:
 
 ```yaml
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   train_transforms:
     - { func_name: mosaic, prob: 1.0, mosaic9_prob: 0.0, translate: 0.1, scale: 0.9 }
diff --git a/mindyolo/data/dataset.py b/mindyolo/data/dataset.py
index d869a78f..dd8d547d 100644
--- a/mindyolo/data/dataset.py
+++ b/mindyolo/data/dataset.py
@@ -86,7 +86,8 @@ def __init__(
         self.is_training = is_training
 
         # set column names
-        self.column_names_getitem = ['samples']
+        self.column_names_getitem = ['im_file', 'cls', 'bboxes', 'segments', 'keypoints', 'bbox_format', 'segment_format', 
+                                     'img', 'ori_shape', 'hw_scale', 'hw_pad'] if self.is_training else ['samples']
         if self.is_training:
             self.column_names_collate = ['images', 'labels']
             if self.return_segments:
@@ -169,7 +170,10 @@ def __init__(
         self.batch = bi  # batch index of image
 
         # Cache images into memory for faster training (WARNING: large datasets may exceed system RAM)
-        self.imgs, self.img_hw_ori, self.indices = None, None, range(n)
+        self.imgs, self.img_hw_ori, self.indices = [None] * n, [None] * n, range(n)
+        # Buffer thread for mosaic images
+        self.buffer = []
+        self.max_buffer_length = min((n, batch_size * 8, 1000)) if self.augment else 0
 
         # Rectangular Train/Test
         if self.rect:
@@ -313,6 +317,14 @@ def __getitem__(self, index):
                     sample = getattr(self, func_name)(sample, **_trans)
 
         sample['img'] = np.ascontiguousarray(sample['img'])
+        if self.is_training:
+            train_sample = []
+            for col_name in self.column_names_getitem:
+                if sample.get(col_name) is None:
+                    train_sample.append(np.nan)
+                else:
+                    train_sample.append(sample.get(col_name, np.nan))
+            return tuple(train_sample)
         return sample
 
     def __len__(self):
@@ -321,7 +333,8 @@ def __len__(self):
     def get_sample(self, index):
         """Get and return label information from the dataset."""
         sample = deepcopy(self.labels[index])
-        if self.imgs is None:
+        img = self.imgs[index]
+        if img is None:
             path = self.img_files[index]
             img = cv2.imread(path)  # BGR
             assert img is not None, "Image Not Found " + path
@@ -331,8 +344,13 @@ def get_sample(self, index):
                 interp = cv2.INTER_AREA if r < 1 and not self.augment else cv2.INTER_LINEAR
                 img = cv2.resize(img, (int(w_ori * r), int(h_ori * r)), interpolation=interp)
 
+            if self.augment:
+                self.imgs[index], self.img_hw_ori[index] = img, np.array([h_ori, w_ori]) # img, hw_original
+                self.buffer.append(index)
+                if 1 < len(self.buffer) >= self.max_buffer_length:
+                    j = self.buffer.pop(0)
+                    self.imgs[j], self.img_hw_ori[j] = None, np.array([None, None])
             sample['img'], sample['ori_shape'] = img, np.array([h_ori, w_ori])  # img, hw_original
-
         else:
             sample['img'], sample['ori_shape'] = self.imgs[index], self.img_hw_ori[index]  # img, hw_original
 
@@ -367,7 +385,7 @@ def _mosaic4(self, sample):
         # loads images in a 4-mosaic
         classes4, bboxes4, segments4 = [], [], []
         mosaic_samples = [sample, ]
-        indices = random.choices(self.indices, k=3)  # 3 additional image indices
+        indices = random.choices(self.buffer, k=3)  # 3 additional image indices
 
         segments_is_list = isinstance(sample['segments'], list)
         if segments_is_list:
@@ -444,7 +462,7 @@ def _mosaic9(self, sample):
         # loads images in a 9-mosaic
         classes9, bboxes9, segments9 = [], [], []
         mosaic_samples = [sample, ]
-        indices = random.choices(self.indices, k=8)  # 8 additional image indices
+        indices = random.choices(self.buffer, k=8)  # 8 additional image indices
 
         segments_is_list = isinstance(sample['segments'], list)
         if segments_is_list:
@@ -1156,21 +1174,17 @@ def _exif_size(self, img):
 
         return s
 
-    def train_collate_fn(self, batch_samples, batch_info):
-        imgs = [sample.pop('img') for sample in batch_samples]
+    def train_collate_fn(self, im_file, cls, bboxes, segments, keypoints, bbox_format, 
+                         segment_format, img, ori_shape, hw_scale, hw_pad, batch_info):
         labels = []
-        for i, sample in enumerate(batch_samples):
-            cls, bboxes = sample.pop('cls'), sample.pop('bboxes')
-            labels.append(np.concatenate((np.full_like(cls, i), cls, bboxes), axis=-1))
-        return_items = [np.stack(imgs, 0), np.stack(labels, 0)]
-
+        for i, (c, b) in enumerate(zip(cls, bboxes)):
+            labels.append(np.concatenate((np.full_like(c, i), c, b), axis=-1))
+        return_items = [np.stack(img, 0), np.stack(labels, 0)]
         if self.return_segments:
-            masks = [sample.pop('segments', None) for sample in batch_samples]
-            return_items.append(np.stack(masks, 0))
+            return_items.append(np.stack(segments, 0))
         if self.return_keypoints:
-            keypoints = [sample.pop('keypoints', None) for sample in batch_samples]
             return_items.append(np.stack(keypoints, 0))
-
+        
         return tuple(return_items)
 
     def test_collate_fn(self, batch_samples, batch_info):
diff --git a/mindyolo/utils/trainer_factory.py b/mindyolo/utils/trainer_factory.py
index b7e1f221..3b569914 100644
--- a/mindyolo/utils/trainer_factory.py
+++ b/mindyolo/utils/trainer_factory.py
@@ -132,7 +132,7 @@ def train(
         manager = CheckpointManager(ckpt_save_policy="latest_k")
         manager_ema = CheckpointManager(ckpt_save_policy="latest_k") if self.ema else None
 
-        loader = self.dataloader.create_dict_iterator(output_numpy=False, num_epochs=1)
+        loader = self.dataloader.create_dict_iterator(output_numpy=False, num_epochs=1, do_cpoy=False)
         s_step_time = time.time()
         s_epoch_time = time.time()
         run_context = RunContext(
diff --git a/tutorials/configuration_CN.md b/tutorials/configuration_CN.md
index 6a1caa4c..f9f4996d 100644
--- a/tutorials/configuration_CN.md
+++ b/tutorials/configuration_CN.md
@@ -90,7 +90,7 @@ data:
 
  ```yaml
 data:
-  num_parallel_workers: 4
+  num_parallel_workers: 8
 
   train_transforms:
     - { func_name: mosaic, prob: 1.0, mosaic9_prob: 0.0, translate: 0.1, scale: 0.9 }