🐛 [Bug] #2881 regression #2991
Comments
@HolyWu what driver version are you using?

I'm using GeForce Game Ready driver 555.99 (I run Ubuntu under WSL 2). Note that the CI (https://github.com/pytorch/TensorRT/actions/runs/9848088305/job/27193531360) also has a failure, but I'm not sure whether it's related to my issue.
I couldn't reproduce this issue on my RTX 3080 Ti with driver 545.29.06. I have a hunch though.

Just as an additional datapoint: do let us know if the Python runtime patch works, but I'm inclined to think it's related to the WSL driver.
I had run my test on both Windows and Ubuntu-in-WSL, and both had the same issue. Hence it's probably a quirk on Windows and has nothing to do with the WSL driver. Using

```cpp
auto current_stream = c10::cuda::getCurrentCUDAStream(inputs[0].device().index());
if (compiled_engine->active_stream != current_stream) {
  compiled_engine->active_stream = current_stream;
}
```
Hmm... I managed to reproduce this issue on Google Colab. But I had to use a tensor with a big enough size, like (1, 3, 2160, 3840), and also a few more iterations to make it happen.
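The reproduction pattern discussed in this thread (a dedicated side stream, a large half-precision input, several iterations) can be sketched roughly as below. This is a hypothetical illustration, not code from the issue: `mod` stands in for the compiled Torch-TensorRT module, `x` for an input such as `torch.rand(1, 3, 2160, 3840, dtype=torch.half, device="cuda")`, and the helper name `run_iters` is invented here.

```python
import torch

def run_iters(mod, x, iterations=5, stream=None):
    """Run `mod` repeatedly, optionally on a dedicated CUDA stream.

    Hypothetical repro sketch: `mod` and `x` are assumed placeholders for the
    compiled module and a large half-precision CUDA input from the thread.
    Returns how many iterations produced an all-zeros output, which is the
    failure mode this issue reports.
    """
    zero_outputs = 0
    for _ in range(iterations):
        if stream is not None:
            # Launch the forward pass on the side stream.
            with torch.cuda.stream(stream):
                y = mod(x)
            # Make the result visible to the default stream before reading it;
            # skipping this kind of synchronization is the sort of race the
            # thread is probing for.
            torch.cuda.current_stream().wait_stream(stream)
        else:
            y = mod(x)
        if y.abs().sum().item() == 0:
            zero_outputs += 1
    return zero_outputs
```

On a CUDA machine one would pass `stream=torch.cuda.Stream()` and check that `run_iters` returns 0; the CPU fallback path exists only to keep the sketch self-contained.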
@HolyWu @narendasan @peri044 Here is the workflow run with use_python_runtime=False for Linux: Let me also try with the big enough size then.
@narendasan @peri044 @lanluo-nvidia |
Bug Description
Since #2881, if the inference is performed in its own stream, the output randomly becomes all zeros.
cc: @gs-olive
To Reproduce

Environment

- How you installed PyTorch (conda, pip, libtorch, source): pip

Additional context
Interestingly, it's only reproducible when using `dtype=torch.half`, but not with `dtype=torch.float`.