-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbenchmark.py
160 lines (140 loc) · 6.46 KB
/
benchmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from collections import defaultdict
import time
from typing import Any, Counter, DefaultDict, Tuple, Dict, Optional
import warnings
import numpy as np
import torch
from torch import nn
import tqdm
from fvcore.nn import FlopCountAnalysis
from fvcore.nn.jit_handles import Handle
from data.utils.types import DataType
from utils.padding import InputPadderFromShape
import pandas as pd
@torch.no_grad()
def measure_average_inference_time_iter(model, inputs, num_iters=100, warm_iters=5):
    """Return the mean seconds per ``model(inputs)`` call, timed per iteration.

    Runs ``warm_iters`` warm-up calls followed by ``num_iters`` measured
    calls; warm-up timings are discarded. Each call is bracketed by
    ``torch.cuda.synchronize()`` so the wall-clock delta covers the full GPU
    execution (requires a CUDA device).
    """
    timings = []
    total_iters = warm_iters + num_iters
    for step in tqdm.tqdm(range(total_iters)):
        torch.cuda.synchronize()
        start = time.perf_counter()
        model(inputs)
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start
        # Only steady-state iterations count toward the average.
        if step >= warm_iters:
            timings.append(elapsed)
    return sum(timings) / len(timings)
@torch.no_grad()
def measure_average_inference_time(model, inputs, num_iters=100, warm_iters=10):
    """Return the mean seconds per ``model(inputs)`` call, timed in aggregate.

    Runs ``warm_iters`` warm-up calls, then starts a single clock and runs
    ``num_iters`` timed calls back-to-back; the total elapsed time divided by
    ``num_iters`` is returned. Requires a CUDA device: synchronization
    ensures queued GPU work has finished before the clock is read.

    Fix over the original: the elapsed time was recomputed inside the loop on
    every iteration, which read ``t_`` before assignment whenever
    ``warm_iters > 0`` (NameError) and forced a GPU sync per iteration. The
    final measurement now happens once, after the loop.
    """
    for iter_ in tqdm.tqdm(range(warm_iters + num_iters)):
        if iter_ == warm_iters:
            # Warm-up finished: flush pending GPU work, then start the clock.
            torch.cuda.synchronize()
            t_ = time.perf_counter()
        model(inputs)
    # Wait for the last forward pass to complete before reading the clock.
    torch.cuda.synchronize()
    t = time.perf_counter() - t_
    return t / num_iters
def python_ops_mode_for_deform_attn(model, ops_mode):
    """Set ``python_ops_for_test`` on every submodule of ``model`` that has it.

    Submodules exposing a ``python_ops_for_test`` attribute (the deformable
    attention layers, per the function name) get it overwritten with
    ``ops_mode``; all other submodules are left untouched.
    """
    def _set_flag(submodule):
        if hasattr(submodule, "python_ops_for_test"):
            submodule.python_ops_for_test = ops_mode

    model.apply(_set_flag)
@torch.no_grad()
def compute_fps(model, dataset, num_iters=300, warm_iters=50, batch_size=4, sparsity=0.0):
    """Measure the model's throughput in frames (samples) per second.

    A random binary input of shape (batch_size, 20, 384, 640) is generated,
    thresholded so roughly ``sparsity`` of its elements are zero, and fed to
    the model on the GPU.

    Args:
        model: module to benchmark; moved to CUDA and set to ``eval()`` for
            the measurement, then switched back to ``train()``.
        dataset: unused here; kept for interface compatibility with callers.
        num_iters: timed iterations (must be > 0).
        warm_iters: warm-up iterations excluded from timing (must be >= 0).
        batch_size: samples per forward pass (must be > 0).
        sparsity: probability that an input element is zeroed out.

    Returns:
        float: ``batch_size`` divided by the average seconds per forward pass.
    """
    # Fixed message: the original printed batch_size twice, omitted sparsity,
    # and had mismatched "(...]" brackets.
    print(f"computing fps.. (num_iters={num_iters}, warm_iters={warm_iters}, "
          f"batch_size={batch_size}, sparsity={sparsity})")
    assert num_iters > 0 and warm_iters >= 0 and batch_size > 0
    model.cuda()
    model.eval()
    inputs = torch.rand((batch_size, 20, 384, 640)).cuda()
    # Binarize in place: an element becomes 1 where rand > sparsity.
    inputs[:] = (inputs > sparsity)
    t = measure_average_inference_time(model, inputs.int(), num_iters, warm_iters)
    model.train()
    fps = 1.0 / t * batch_size
    print(f"FPS: {fps}")
    return fps
@torch.no_grad()
def compute_gflops(model, dataset, approximated=True, sparsity=0.0):
    """Estimate the model's average GFLOPs per forward pass via fvcore tracing.

    Args:
        model: module to trace. Its deformable-attention submodules are
            switched to their Python ops for the trace (see
            ``python_ops_mode_for_deform_attn``) and restored afterwards;
            the module is set back to ``train()`` before returning.
        dataset: only used when ``approximated`` is False; assumed to expose
            ``datapipe_list`` whose items index to dicts keyed by
            ``DataType.EV_REPR`` -- TODO confirm structure against callers.
        approximated: if True, trace a single random (1, 20, 384, 640) input
            instead of iterating the whole dataset.
        sparsity: fraction of input elements zeroed out (approximated mode).

    Returns:
        numpy scalar: mean GFLOPs over all traced inputs. Note that the
        'conv' entry is subtracted from each per-input total -- presumably
        conv FLOPs are accounted for separately; verify intent.
    """
    print(f"computing flops.. (approximated={approximated})")
    model.eval()
    # Use Python implementations of deformable attention so fvcore can trace them.
    python_ops_mode_for_deform_attn(model, True)
    gflops_list = []
    imsize_list = []
    if approximated:
        # use just a single image to approximate the full compuation
        # the size of the image was found heuristically
        images = [torch.rand((1, 20, 384, 640)).cuda().float()]
        # Binarize so the input matches the target sparsity level.
        images[0] = (images[0] > sparsity).float()
        for img in tqdm.tqdm(images):
            inputs = img.cuda()
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=RuntimeWarning)
                res = flop_count_without_warnings(model, (inputs,), )[0]
            # Total GFLOPs minus the 'conv' contribution (see docstring note).
            gflops = sum(res.values()) - res['conv']
            gflops_list.append(gflops)
            imsize_list.append(list(img.shape))
    else:
        # Full computation over the dataset. NOTE(review): the comment/print
        # below mention COCO val2017, but the inputs are event representations
        # padded to 384x640 -- the COCO wording looks like a leftover.
        input_padder = InputPadderFromShape(desired_hw=(384, 640))
        r_list = []
        for i in tqdm.tqdm(range(len(dataset.datapipe_list))):
            kk = len(dataset.datapipe_list[i])
            for j in tqdm.tqdm(range(kk)):
                # Last event representation of sample (i, j), padded and batched.
                inputs = input_padder.pad_tensor_ev_repr(dataset.datapipe_list[i][j][DataType.EV_REPR][-1].unsqueeze(0).float()).cuda()
                # Per-sample fraction of non-zero input elements.
                r = non_zero_ratio(inputs)
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", category=RuntimeWarning)
                    res = flop_count_without_warnings(model.cuda(), (inputs,), )[0]
                gflops = sum(res.values()) - res['conv']
                gflops_list.append(gflops)
                imsize_list.append(list(inputs.shape))
                r_list.append((r.item(), gflops))
        # Dump (non-zero ratio, GFLOPs) pairs for offline sparsity analysis.
        df = pd.DataFrame(r_list, columns=['r', 'gflops'])
        excel_file_path = 'gflops2.xlsx'
        df.to_excel(excel_file_path, index=False, engine='openpyxl')
    if approximated:
        print(f"The image size used for approximation: [1, 20, 384, 640]")
    else:
        print("Average image size of first 100 image of COCO val2017 : "
              f"{np.array(imsize_list).mean(0)}")
    print(f"GFLOPs : {np.array(gflops_list).mean()}")
    model.train()
    # Restore the fast (CUDA) deformable-attention ops.
    python_ops_mode_for_deform_attn(model, False)
    return np.array(gflops_list).mean()
def flop_count_without_warnings(
    model: nn.Module,
    inputs: Tuple[Any, ...],
    supported_ops: Optional[Dict[str, Handle]] = None,
) -> Tuple[DefaultDict[str, float], Counter[str]]:
    """copied and modified from fvcore.nn.flop_count.py
    Compute per-operator Gflops for ``model`` on ``inputs``, with fvcore's
    console warnings suppressed.
    Args:
        model (nn.Module): The model to compute flop counts.
        inputs (tuple): Inputs that are passed to `model` to count flops.
            Inputs need to be in a tuple.
        supported_ops (dict(str,Callable) or None) : provide additional
            handlers for extra ops, or overwrite the existing handlers for
            convolution and matmul and einsum. The key is operator name and the
            value is a function that takes (inputs, outputs) of the op. We
            count one Multiply-Add as one FLOP.
    Returns:
        tuple[defaultdict, Counter]: A dictionary that records the number of
            gflops for each operation and a Counter that records the number of
            unsupported operations.
    """
    handlers = supported_ops if supported_ops is not None else {}
    analysis = FlopCountAnalysis(model, inputs).set_op_handle(**handlers)
    # Silence fvcore's console noise; callers inspect the returned Counter
    # of unsupported ops instead.
    analysis.unsupported_ops_warnings(False)
    analysis.uncalled_modules_warnings(False)
    analysis.tracer_warnings("no_tracer_warning")
    per_op_gflops: DefaultDict[str, float] = defaultdict(float)
    for op_name, flops in analysis.by_operator().items():
        per_op_gflops[op_name] = flops / 1e9
    return per_op_gflops, analysis.unsupported_ops()
@torch.no_grad()
def non_zero_ratio(x: torch.Tensor) -> torch.Tensor:
    """Return, per batch element, the fraction of non-zero entries of ``x``.

    The nested reduction collapses dims 1 and 2, then the last dim, so ``x``
    is expected to be 4-D (batch first) -- TODO confirm against callers.
    Intermediate counts use int16/int32 accumulators for speed; NOTE(review):
    this could overflow for very large planes, though it is fine for the
    (20, 384, 640) inputs used elsewhere in this file.
    """
    per_sample_count = torch.sum(
        torch.sum(x != 0, dtype=torch.int16, dim=[1, 2]),
        dtype=torch.int32,
        dim=-1,
    )
    # Multiplying by the batch size converts the whole-tensor numel in the
    # denominator into a per-sample numel.
    return x.shape[0] * per_sample_count.float() / x.numel()