Skip to content

Transformation of Worksharing constructs using LLVM runtime

Anjia Wang edited this page Oct 26, 2020 · 4 revisions

1. Introduction

2. for directive

2.1. No schedule clause

If there's no schedule clause specified, by default the block distribution will be used for worksharing. Each thread takes a group of contiguous iterations. For example, given 4 threads and 12 iterations, thread 0 takes the iterations 0, 1, 2, thread 1 takes the iterations 3, 4, 5, and so on.

#pragma omp parallel for num_threads(4)
  for (int i = 0; i < 12; i++)
    printf("Thread ID: %d, Iteration: %d\n", omp_get_thread_num(), i);

To transform this OpenMP code, we use __kmpc_fork_call as usual to call the outlined function. Inside the outlined function, __kmpc_for_static_init_4 will be used for scheduling. There are also several other similar function calls for different cases, such as __kmpc_for_static_init_4u. Since it's static worksharing, we know exactly how the iterations will be distributed from the very beginning. __kmpc_for_static_init_4 only needs to be called once instead of multiple times for finding the next available iterations.

/* Outlined body for "#pragma omp parallel for num_threads(4)" with no
 * schedule clause: the runtime hands each thread one contiguous chunk. */
static void outlined_function(int *__global_tid,int *__bound_tid,void *__out_argv) {
  int iter;        /* loop index */
  int lower = 0;   /* first iteration assigned to this thread */
  int upper = 11;  /* last iteration assigned to this thread */
  int last = 0;    /* set by the runtime if this thread runs the last chunk */
  int stride = 1;

  /* Schedule enum 34 = default block (static) distribution; one call
   * computes this thread's whole chunk. */
  __kmpc_for_static_init_4(0, *__global_tid, 34, &last, &lower, &upper, &stride, 1, 1);
  if (upper > 11)
    upper = 11;    /* clamp to the original loop's upper bound */

  /* the original loop, restricted to this thread's chunk */
  for (iter = lower; iter <= upper; iter += 1)
    printf("Thread ID: %d, Iteration: %d\n",(omp_get_thread_num()),iter);

  __kmpc_for_static_fini(0, *__global_tid);
}

// Tell the runtime the next parallel region uses 4 threads, then fork the
// team; each thread runs outlined_function (no shared arguments, hence 0).
__kmpc_push_num_threads(0,(__kmpc_global_thread_num(0)),4); // set up 4 omp threads
__kmpc_fork_call(0,1,outlined_function,0);

2.2. schedule clause with static modifier

When the schedule clause with the static modifier is used, each time a thread will fetch the specified number of iterations instead of all the assigned iterations. We use the cyclic distribution as an example. Given 4 threads and 12 iterations, thread 0 takes the iterations 0, 4, 8, thread 1 takes the iterations 1, 5, 9, and so on.

#pragma omp parallel for num_threads(4) schedule(static, 1)
  for (int i = 0; i < 12; i++)
    printf("Thread ID: %d, Iteration: %d\n", omp_get_thread_num(), i);

In this case, __kmpc_for_static_init_4 is still called once, but the sub-loop will be called multiple times since the assigned iterations are not fetched in one round.

/* Outlined body for schedule(static, 1): iterations are dealt out cyclically,
 * so each thread walks through a sequence of one-iteration chunks. */
static void outlined_function(int *__global_tid,int *__bound_tid,void *__out_argv) {
  int iter;
  int lower = 0;   /* first iteration of the current chunk */
  int upper = 11;  /* last iteration of the current chunk */
  int last = 0;
  int stride = 1;  /* distance to this thread's next chunk, set by the runtime */

  /* Schedule enum 33 = static chunked; with chunk size 1 this yields the
   * cyclic distribution. Called once: lower/upper/stride describe the first
   * chunk and how to step to the following ones. */
  __kmpc_for_static_init_4(0, *__global_tid, 33, &last, &lower, &upper, &stride, 1, 1);

  /* Keep executing chunks until this thread's next chunk starts past its end. */
  while (lower <= upper) {
    if (upper > 11)
      upper = 11;  /* clamp to the original loop's upper bound */
    /* the original loop, restricted to the current chunk */
    for (iter = lower; iter <= upper; iter += 1)
      printf("Thread ID: %d, Iteration: %d\n",(omp_get_thread_num()),iter);
    /* advance both bounds to the next chunk */
    lower += stride;
    upper += stride;
  }
  __kmpc_for_static_fini(0, *__global_tid);
}

// Request 4 threads for the upcoming region and fork the team; each thread
// runs outlined_function (no shared arguments, hence 0).
__kmpc_push_num_threads(0,(__kmpc_global_thread_num(0)),4); // set up 4 omp threads
__kmpc_fork_call(0,1,outlined_function,0);

An additional while loop is used to control the iteration fetching until all the assigned iterations are executed.

2.3. schedule clause with dynamic modifier

When the dynamic modifier is used in a schedule clause, the number of iterations assigned to a thread is unknown. In a similar example as above, we never know the output until runtime and it varies all the time. For example, in the first run, thread 0 could take the iteration 0 while thread 1 takes the iterations 1, 2, 3, 4, 5. In another run, thread 0 could take the iterations 0, 1, 4 while thread 1 only takes the iteration 2.

#pragma omp parallel for num_threads(4) schedule(dynamic)
  for (int i = 0; i < 12; i++)
    printf("Thread ID: %d, Iteration: %d\n", omp_get_thread_num(), i);

Therefore, after the initialization by __kmpc_dispatch_init_4, we need to keep calling __kmpc_dispatch_next_4 to fetch the next available iteration until they are all executed.

/* Outlined body for schedule(dynamic): chunks are handed out at run time,
 * so the per-thread iteration assignment is not known statically. */
static void outlined_function(int *__global_tid,int *__bound_tid,void *__out_argv) {
  int p_index_;
  int p_lower_ = 0; // lower bound
  int p_upper_ = 11; // upper bound
  int __last_iter = 0;
  int __p_stride = 1;

  // 35 indicates the schedule enum for the case of dynamic scheduling
  __kmpc_dispatch_init_4(0, *__global_tid, 35, p_lower_, p_upper_, __p_stride, 1);
  // Each successful call to __kmpc_dispatch_next_4 stores the next available
  // chunk's bounds into p_lower_/p_upper_; the loop ends when the runtime
  // reports that all iterations have been handed out.
  while (__kmpc_dispatch_next_4(0, *__global_tid, &__last_iter, &p_lower_, &p_upper_, &__p_stride)) {
    // the original loop, restricted to the chunk just fetched
    for (p_index_ = p_lower_; p_index_ <= p_upper_; p_index_ += 1) {
      printf("Thread ID: %d, Iteration: %d\n",(omp_get_thread_num()),p_index_);
    }
  }
  // No __kmpc_for_static_fini here: dynamic dispatch finishes when
  // __kmpc_dispatch_next_4 returns false.
}

// Request 4 threads for the upcoming region and fork the team; each thread
// runs outlined_function (no shared arguments, hence 0).
__kmpc_push_num_threads(0,(__kmpc_global_thread_num(0)),4); // set up 4 omp threads
__kmpc_fork_call(0,1,outlined_function,0);

2.4. collapse clause

To collapse the nested loop i and j, the total number of iterations is calculated by n_total = n_i * n_j. Then in the flattened loop, they will be recovered by division and mod operations using the inner trip count n_j. For a flattened iteration t, the original indices are __i = t / n_j and __j = t % n_j.

#pragma omp parallel for num_threads(16)
  for (i = 0; i < 6; i++)
    for (j = 0; j < 2; j++)
      printf("Thread ID: %d, i: %d, j: %d\n", omp_get_thread_num(), i, j);

In this example, although the number of omp threads is set to 16, only the outer loop i will be parallelized, so at most 6 threads are used. The inner loop j is executed in serial. The thread id would be 0-5.

#pragma omp parallel for num_threads(16) collapse(2)
  for (i = 0; i < 6; i++)
    for (j = 0; j < 2; j++)
      printf("Thread ID: %d, i: %d, j: %d\n", omp_get_thread_num(), i, j);

With collapse clause, the nested loop is flattened to a single loop with 12 iterations. Then they can be executed in parallel so that the thread id would be 0-11.

Then the code using collapse clause can be transformed as follows.

/* Outlined body for the collapse(2) loop nest: the 6x2 nest over (i, j) is
 * flattened into a single 12-iteration loop, which is then block-scheduled
 * across the team exactly like the non-collapsed case. */
static void outlined_function(int *__global_tid,int *__bound_tid,void *__out_argv) {
  /* Fixed: these were declared "int *" but initialized with plain integers;
   * they are scalar trip counts, not pointers. */
  int __final_total_iters = 12; /* n_i * n_j = 6 * 2 collapsed iterations */
  int __i_interval = 6;         /* trip count of the outer loop i */
  int __j_count = __final_total_iters / __i_interval; /* inner trip count n_j = 2 */
  int _p_i;
  int _p_j;
  int p_index_;
  int p_lower_ = 0;                       /* lower bound of the flattened loop */
  int p_upper_ = __final_total_iters - 1; /* upper bound of the flattened loop */
  int __last_iter = 0;
  int __p_stride = 1;

  /* 34 indicates the schedule enum for the case of default block distribution */
  __kmpc_for_static_init_4(0, *__global_tid, 34, &__last_iter, &p_lower_, &p_upper_, &__p_stride, 1, 1);
  if (p_upper_ > __final_total_iters - 1) p_upper_ = __final_total_iters - 1;
  for (p_index_ = p_lower_; p_index_ <= p_upper_; p_index_ += 1) {
    /* Row-major flattening gives t = i * n_j + j, so the indices are
     * recovered with the INNER trip count n_j (fixed: the original divided
     * and modded by 6, producing i in 0..1 and j in 0..5 — wrong ranges). */
    _p_i = p_index_ / __j_count; // recover the original index i (0..5)
    _p_j = p_index_ % __j_count; // recover the original index j (0..1)
    printf("Thread ID: %d, i: %d, j: %d\n",(omp_get_thread_num()),_p_i,_p_j);
  }
  __kmpc_for_static_fini(0, *__global_tid);
}

__kmpc_push_num_threads(0,(__kmpc_global_thread_num(0)),16); // set up 16 omp threads
// Fixed: pass 0 for the shared-argument pointer. The outlined function does
// not read __out_argv, and "&__out_argv" referenced a variable that is never
// defined in this example (every other example here passes 0).
__kmpc_fork_call(0,1,outlined_function,0);

Other than handling the index, collapse clause does not bring much difference to the transformation.