gpt-2 : better check for CPU backend when setting n_threads
slaren committed Oct 3, 2023
1 parent 45d13b1 commit d7d2a6b
Showing 2 changed files with 8 additions and 9 deletions.
examples/gpt-2/main.cpp: 8 changes (3 additions, 5 deletions)
@@ -760,11 +760,9 @@ bool gpt2_eval(
     ggml_allocr_alloc_graph(allocr, gf);
 
     // run the computation
-#ifndef GGML_USE_CUBLAS
-    // FIXME: the backend may be CPU even if CUDA is enabled
-    // if (model.backend.id == GGML_BACKEND_ID_CPU)
-    ggml_backend_cpu_set_n_threads(model.backend, n_threads);
-#endif
+    if (strcmp(ggml_backend_name(model.backend), "CPU") == 0) {
+        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
+    }
     ggml_backend_graph_compute(model.backend, gf);
 
     //if (n_past%100 == 0) {
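Note: the removed block gated the thread setting on the GGML_USE_CUBLAS compile flag, which is wrong when CUDA support is compiled in but the model still runs on the CPU backend (the old FIXME). The new code asks the backend for its name at run time instead. A minimal sketch of the same pattern as a standalone helper (the helper name and header path are assumptions, not part of this commit):

#include <cstring>              // strcmp
#include "ggml/ggml-backend.h"  // assumed header for ggml_backend_name() / ggml_backend_cpu_set_n_threads()

// Apply the requested thread count only when the active backend really is
// the CPU backend, regardless of which backends were compiled in.
static void set_threads_if_cpu(ggml_backend_t backend, int n_threads) {
    if (strcmp(ggml_backend_name(backend), "CPU") == 0) {
        ggml_backend_cpu_set_n_threads(backend, n_threads);
    }
}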
src/ggml-cuda.cu: 9 changes (5 additions, 4 deletions)
@@ -62,6 +62,7 @@
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
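The new define follows the existing pattern in this block: when the file is built for HIP (GGML_USE_HIPBLAS), every CUDA symbol used in ggml-cuda.cu is redirected to its hip counterpart, so call sites stay unchanged. A hedged illustration (pointer, size, and stream names are placeholders):

// Compiles as-is for the CUDA build; under the HIP build the macro above
// turns this into hipMemsetAsync(dst_ptr, 0, nbytes, stream).
cudaMemsetAsync(dst_ptr, 0, nbytes, stream);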
@@ -1576,7 +1577,7 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
 }
 
 template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void k_get_rows(const void * x, const int * y, dst_t * dst, const int ncols) {
+static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
     const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
 
@@ -4586,7 +4587,7 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
 
 
 template<int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(const void * x, const int * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
+static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
     const dim3 block_nums(block_num_x, nrows, 1);
@@ -5810,7 +5811,7 @@ static void ggml_cuda_op_repeat(
     GGML_ASSERT(nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
 
-    // TODO: very inefficient, implement in a kernel
+    // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
     for (int i3 = 0; i3 < nr3; i3++) {
         for (int k3 = 0; k3 < ne03; k3++) {
             for (int i2 = 0; i2 < nr2; i2++) {
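Note: the extended TODO names two options, a dedicated kernel or batching the copies. A hedged sketch of the second idea for the simplest case, a fully contiguous source repeated only along the outermost dimension (n_repeats, src0_d, dst_d, and stream are placeholders, not from this file):

// One device-to-device copy per repetition of the whole contiguous source
// block, instead of one cudaMemcpyAsync per row.
const size_t src_bytes = (size_t) ggml_nelements(src0) * sizeof(float);
for (int r = 0; r < n_repeats; r++) {
    cudaMemcpyAsync((char *) dst_d + (size_t) r * src_bytes, src0_d, src_bytes,
                    cudaMemcpyDeviceToDevice, stream);
}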
@@ -5847,7 +5848,7 @@ static void ggml_cuda_op_get_rows(
     const int ncols = src0->ne[0];
     const int nrows = ggml_nelements(src1);
 
-    const int * src1_i32 = (const int *) src1_d;
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
 
     switch (src0->type) {
         case GGML_TYPE_F16:
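The kernel and its launcher now take the row indices as int32_t to match this cast: src1 is a GGML_TYPE_I32 tensor, so its buffer holds 32-bit integers, and the explicit type removes any reliance on the host compiler's int. A hedged host-side sketch of how such an index tensor is typically filled (ctx, n_rows, and embeddings are placeholders):

// Row indices for get_rows are stored in an I32 tensor, i.e. int32_t elements.
struct ggml_tensor * rows = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_rows);
for (int64_t i = 0; i < n_rows; i++) {
    ((int32_t *) rows->data)[i] = (int32_t) i;   // placeholder index values
}
// the gather itself: struct ggml_tensor * out = ggml_get_rows(ctx, embeddings, rows);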
