Fix prefast bugs: 1944959 1997925 1997926 1997927 1997928 #13203

Merged: 1 commit, Oct 5, 2022.

(prefast is the MSVC /analyze static code analyzer; the five IDs in the title are presumably internal bug-tracker numbers.)
@@ -244,7 +244,7 @@ Status QOrderedAttention::ComputeInternal(OpKernelContext* context) const {
int64_t size_of_attention_scores = ((int64_t)batch_size) * num_heads_ * sequence_length * sequence_length;

// transposed qkv_layer, union(stacked, attention probs + attention scores)
-auto gemm_buffer_quantized = GetScratchBuffer<int8_t>(m * n + std::max((int64_t)m * n, 2 * size_of_attention_scores));
+auto gemm_buffer_quantized = GetScratchBuffer<int8_t>((int64_t)m * n + std::max((int64_t)m * n, 2 * size_of_attention_scores));

int8_t* stacked_qkv_layers = gemm_buffer_quantized.get() + ((int64_t)m * n);
int8_t* tranposed_qkv_layers = gemm_buffer_quantized.get();
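
The fix widens the leading m * n term before the multiplication happens: the std::max arguments were already computed in 64 bits, but the first term was multiplied in the operands' narrower type and only then widened, which is exactly the arithmetic-overflow pattern prefast flags. A standalone sketch of the failure mode, with illustrative values only (not from the PR):

    #include <cstdint>
    #include <iostream>

    int main() {
      // Illustrative sizes, chosen so the 32-bit product wraps.
      uint32_t m = 70000, n = 70000;

      // Multiply in 32 bits, then widen: the wrap has already happened.
      uint64_t truncated = m * n;                       // 605032704

      // Widen an operand first, as the fixed line does with (int64_t)m * n.
      uint64_t widened = static_cast<uint64_t>(m) * n;  // 4900000000

      std::cout << truncated << " vs " << widened << '\n';
    }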
@@ -119,7 +119,7 @@ QOrderedLongformerAttention::ComputeInternal(OpKernelContext* context) const {

// TODO: only calculate once per model.
// Build Global Index
-auto global_index_buffer = GetScratchBuffer<int>(batch_size * sequence_length);
+auto global_index_buffer = GetScratchBuffer<int>(static_cast<size_t>(batch_size) * static_cast<size_t>(sequence_length));
Review comment (Member) on the changed line:
SafeInt is recommended for those changes.

auto batch_global_num_buffer = GetScratchBuffer<int>(batch_size);

size_t global_scratch_bytes = GetGlobalScratchSize(sequence_length);
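
For illustration, the reviewer's SafeInt suggestion might look like the sketch below applied to the changed line. This is an assumption about what was meant, not part of the PR; ONNX Runtime wraps the SafeInt library in core/common/safeint.h, and SafeInt<T>'s overloaded operators detect overflow and fail loudly instead of wrapping:

    #include "core/common/safeint.h"

    // Hypothetical alternative to the explicit static_casts: SafeInt
    // widens the operands and checks the product against size_t's range,
    // so an allocation size that cannot fit fails at runtime rather than
    // silently truncating.
    auto global_index_buffer =
        GetScratchBuffer<int>(SafeInt<size_t>(batch_size) * sequence_length);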
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/cuda/tensor/pad.cc
@@ -104,7 +104,7 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
ORT_ENFORCE(pads_size == 2 * static_cast<size_t>(dimension_count),
"Pads tensor size should be equal to twice the input dimension count ");

-pads.reserve(2 * dimension_count);
+pads.reserve(2LL * dimension_count);
for (size_t i = 0; i < pads_size; ++i) {
pads.push_back(pads_tensor_raw_data[i]);
}
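
Here the only change is the literal's suffix: in 2 * dimension_count the product is evaluated in the operands' narrower type and only then converted for reserve(), whereas 2LL promotes the whole expression to long long before multiplying. A standalone sketch with deliberately oversized values:

    #include <cstdint>
    #include <iostream>

    int main() {
      // Illustrative only; no real tensor has 2^31 dimensions.
      uint32_t dimension_count = 0x80000000u;  // 2^31

      uint64_t wrapped = 2u * dimension_count;   // 32-bit math: wraps to 0
      uint64_t widened = 2LL * dimension_count;  // 64-bit math: 4294967296

      std::cout << wrapped << " vs " << widened << '\n';
    }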
26 changes: 13 additions & 13 deletions onnxruntime/test/contrib_ops/qordered_attention_test.cc
@@ -12,17 +12,17 @@
namespace onnxruntime {
namespace test {

-static const int64_t batch_size = 1;
-static const int64_t sequence_len = 16;
-static const int64_t input_hidden_size = 32;
-static const int64_t num_heads = 2;
-static const int64_t head_size = 16;
-static const int64_t hidden_size = num_heads * head_size;
+static constexpr int64_t batch_size = 1;
+static constexpr int64_t sequence_len = 16;
+static constexpr int64_t input_hidden_size = 32;
+static constexpr int64_t num_heads = 2;
+static constexpr int64_t head_size = 16;
+static constexpr int64_t hidden_size = num_heads * head_size;

static std::vector<int32_t> input_mask = { // [1, 16]
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0};

-static float input_scale = 0.025f;
+static constexpr float input_scale = 0.025f;

static std::vector<int8_t> inputq = { // [1, 16, 32]
-33, 7, -54, 29, 14, 6, 14, 16, 1, 16, 22, 0, 16, 49, -14, -15, 68, 11, -18, -9, -42, 6, 6, 58, 22, 31, 0, -13, 42, 40, 4, 0,
@@ -180,13 +180,13 @@ static std::vector<float> v_bias = {
-1.5637541858090884f, 0.053171526292804416f, -1.5821961194911058f, -1.2062417346542489f, 0.23029741928149683f, -0.8920457050782132f, -0.06220760650838387f, 0.2942590084687021f,
-0.4362228349183151f, -0.2344379226413643f, -0.586149329261036f, -1.5243876669794532f, 0.22378084867382358f, -1.715499198175354f, -1.3795418183607775f, -1.2237706022285266f};

-static float qlayer_scale = 0.250f;
-static float klayer_scale = 0.250f;
-static float vlayer_scale = 0.125f;
+static constexpr float qlayer_scale = 0.250f;
+static constexpr float klayer_scale = 0.250f;
+static constexpr float vlayer_scale = 0.125f;

-static float qk_scale = 0.5f;
-static float probs_scale = 0.0078125f;
-static float attn_out_scale = 0.05f;
+static constexpr float qk_scale = 0.5f;
+static constexpr float probs_scale = 0.0078125f;
+static constexpr float attn_out_scale = 0.05f;
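
A note on the const-to-constexpr change here and in the longformer test below: for the int64_t constants it is behavior-preserving, since a const integral with a constant initializer is already usable at compile time, but for the float scales constexpr is what makes them genuine compile-time constants, and in every case a non-constant initializer now becomes a compile error instead of silent runtime initialization. A minimal sketch of the distinction:

    #include <cstdint>

    static const int64_t n1 = 2;        // integral: already a compile-time constant
    static const float f1 = 0.025f;     // float: const alone is runtime-only
    static constexpr float f2 = 0.025f; // float: a true constant expression

    static_assert(n1 == 2, "const integral with constant initializer works");
    static_assert(f2 == 0.025f, "constexpr float works");
    // static_assert(f1 == 0.025f, "");  // would not compile: f1 is not a
    //                                   // constant expression

    int main() { return 0; }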

static std::vector<int8_t> attn_out_q8 = {
-39, 8, -75, 2, -69, -31, -42, -29, 44, 6, 0, -61, -102, 61, 28, 76,
@@ -79,19 +79,19 @@ static void run_qordered_longformer_attention_op_test(
}

TEST(QOrderedTest, LongformerAttention_1x128x2x16_window_32) {
-const float scale_input = 1.0f / 32.0f;
-const float scale_weight = 1.0f / 64.0f;
-const float scale_bias = 1.0f / 8.0f;
-const float scale_qkv_gemm = 1.0f / 4.0f;
-const float scale_global_weight = 1.0f / 64.0f;
-const float scale_global_gemm = 1.0f / 4.0f;
-const float scale_output = 1.0f / 8.0f;
-const int64_t batch_size = 1;
-const int64_t sequence_len = 128;
-const int64_t num_heads = 2;
-const int64_t head_size = 16;
-const int64_t window = 32;
-const int64_t input_hidden_size = 0; // same as hidden_size
+constexpr float scale_input = 1.0f / 32.0f;
+constexpr float scale_weight = 1.0f / 64.0f;
+constexpr float scale_bias = 1.0f / 8.0f;
+constexpr float scale_qkv_gemm = 1.0f / 4.0f;
+constexpr float scale_global_weight = 1.0f / 64.0f;
+constexpr float scale_global_gemm = 1.0f / 4.0f;
+constexpr float scale_output = 1.0f / 8.0f;
+constexpr int64_t batch_size = 1;
+constexpr int64_t sequence_len = 128;
+constexpr int64_t num_heads = 2;
+constexpr int64_t head_size = 16;
+constexpr int64_t window = 32;
+constexpr int64_t input_hidden_size = 0; // same as hidden_size

// Following code generate the input data vectors: (Keep it here in case)
// #include <iostream>
@@ -154,7 +154,7 @@ TEST(QOrderedTest, LongformerAttention_1x128x2x16_window_32) {
// debug_print(global_attention_mask.data(), batch_size, sequence_len, "global_attention_mask");
// float scale_output = 1.0f / 8.0f;

(whitespace-only change to the blank line here)
//========inputq : 128x32 ============
std::vector<int8_t> inputq = {