Skip to content

Commit fd60a08

Browse files
authored
add a compile-time option to enable 4k page sizes (#52229) (#111)
We're suffering from heavy fragmentation in some of our workloads. Add a build-time option to enable 4k pages (instead of 16k) in the GC, since that improves memory utilization considerably for us. The drawback is that this may increase the number of `madvise` system calls in the sweeping phase by a factor of 4, but concurrent page sweeping should help with some of that.
1 parent 3bc0b8d commit fd60a08

File tree

4 files changed

+68
-18
lines changed

4 files changed

+68
-18
lines changed

src/gc.h

+23-1
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,12 @@
3131
extern "C" {
3232
#endif
3333

34+
#ifdef GC_SMALL_PAGE
35+
#define GC_PAGE_LG2 12 // log2(size of a page)
36+
#else
3437
#define GC_PAGE_LG2 14 // log2(size of a page)
35-
#define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k
38+
#endif
39+
#define GC_PAGE_SZ (1 << GC_PAGE_LG2)
3640
#define GC_PAGE_OFFSET (JL_HEAP_ALIGNMENT - (sizeof(jl_taggedvalue_t) % JL_HEAP_ALIGNMENT))
3741

3842
#define jl_malloc_tag ((void*)0xdeadaa01)
@@ -241,6 +245,23 @@ typedef struct {
241245
_Atomic(size_t) n_pages_allocd;
242246
} gc_fragmentation_stat_t;
243247

248+
#ifdef GC_SMALL_PAGE
249+
#ifdef _P64
250+
#define REGION0_PG_COUNT (1 << 16)
251+
#define REGION1_PG_COUNT (1 << 18)
252+
#define REGION2_PG_COUNT (1 << 18)
253+
#define REGION0_INDEX(p) (((uintptr_t)(p) >> 12) & 0xFFFF) // shift by GC_PAGE_LG2
254+
#define REGION1_INDEX(p) (((uintptr_t)(p) >> 28) & 0x3FFFF)
255+
#define REGION_INDEX(p) (((uintptr_t)(p) >> 46) & 0x3FFFF)
256+
#else
257+
#define REGION0_PG_COUNT (1 << 10)
258+
#define REGION1_PG_COUNT (1 << 10)
259+
#define REGION2_PG_COUNT (1 << 0)
260+
#define REGION0_INDEX(p) (((uintptr_t)(p) >> 12) & 0x3FF) // shift by GC_PAGE_LG2
261+
#define REGION1_INDEX(p) (((uintptr_t)(p) >> 22) & 0x3FF)
262+
#define REGION_INDEX(p) (0)
263+
#endif
264+
#else
244265
#ifdef _P64
245266
#define REGION0_PG_COUNT (1 << 16)
246267
#define REGION1_PG_COUNT (1 << 16)
@@ -256,6 +277,7 @@ typedef struct {
256277
#define REGION1_INDEX(p) (((uintptr_t)(p) >> 22) & 0x3FF)
257278
#define REGION_INDEX(p) (0)
258279
#endif
280+
#endif
259281

260282
// define the representation of the levels of the page-table (0 to 2)
261283
typedef struct {

src/julia_internal.h

+37-8
Original file line numberDiff line numberDiff line change
@@ -359,24 +359,48 @@ static const int jl_gc_sizeclasses[] = {
359359
144, 160, 176, 192, 208, 224, 240, 256,
360360

361361
// the following tables are computed for maximum packing efficiency via the formula:
362-
// pg = 2^14
362+
// pg = GC_SMALL_PAGE ? 2^12 : 2^14
363363
// sz = (div.(pg-8, rng).÷16)*16; hcat(sz, (pg-8).÷sz, pg .- (pg-8).÷sz.*sz)'
364364

365+
#ifdef GC_SMALL_PAGE
366+
// rng = 15:-1:2 (14 pools)
367+
272, 288, 304, 336, 368, 400, 448, 496, 576, 672, 816, 1008, 1360, 2032
368+
// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, /pool
369+
// 16, 64, 144, 64, 48, 96, 64, 128, 64, 64, 16, 64, 16, 32, bytes lost
370+
#else
365371
// rng = 60:-4:32 (8 pools)
366372
272, 288, 304, 336, 368, 400, 448, 496,
367-
// 60, 56, 53, 48, 44, 40, 36, 33, /pool
368-
// 64, 256, 272, 256, 192, 384, 256, 16, bytes lost
373+
// 60, 56, 53, 48, 44, 40, 36, 33, /pool
374+
// 64, 256, 272, 256, 192, 384, 256, 16, bytes lost
369375

370376
// rng = 30:-2:16 (8 pools)
371377
544, 576, 624, 672, 736, 816, 896, 1008,
372-
// 30, 28, 26, 24, 22, 20, 18, 16, /pool
373-
// 64, 256, 160, 256, 192, 64, 256, 256, bytes lost
378+
// 30, 28, 26, 24, 22, 20, 18, 16, /pool
379+
// 64, 256, 160, 256, 192, 64, 256, 256, bytes lost
374380

375381
// rng = 15:-1:8 (8 pools)
376382
1088, 1168, 1248, 1360, 1488, 1632, 1808, 2032
377-
// 15, 14, 13, 12, 11, 10, 9, 8, /pool
378-
// 64, 32, 160, 64, 16, 64, 112, 128, bytes lost
383+
// 15, 14, 13, 12, 11, 10, 9, 8, /pool
384+
// 64, 32, 160, 64, 16, 64, 112, 128, bytes lost
385+
#endif
379386
};
387+
#ifdef GC_SMALL_PAGE
388+
#ifdef _P64
389+
# define JL_GC_N_POOLS 39
390+
#elif MAX_ALIGN == 8
391+
# define JL_GC_N_POOLS 40
392+
#else
393+
# define JL_GC_N_POOLS 41
394+
#endif
395+
#else
396+
#ifdef _P64
397+
# define JL_GC_N_POOLS 49
398+
#elif MAX_ALIGN == 8
399+
# define JL_GC_N_POOLS 50
400+
#else
401+
# define JL_GC_N_POOLS 51
402+
#endif
403+
#endif
380404
static_assert(sizeof(jl_gc_sizeclasses) / sizeof(jl_gc_sizeclasses[0]) == JL_GC_N_POOLS, "");
381405

382406
STATIC_INLINE int jl_gc_alignment(size_t sz)
@@ -403,7 +427,12 @@ JL_DLLEXPORT int jl_alignment(size_t sz);
403427

404428
// the following table is computed as:
405429
// [searchsortedfirst(jl_gc_sizeclasses, i) - 1 for i = 0:16:jl_gc_sizeclasses[end]]
406-
static const uint8_t szclass_table[] = {0, 1, 3, 5, 7, 9, 11, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 44, 44, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48};
430+
static const uint8_t szclass_table[] =
431+
#ifdef GC_SMALL_PAGE
432+
{0,1,3,5,7,9,11,13,15,17,18,19,20,21,22,23,24,25,26,27,28,28,29,29,30,30,31,31,31,32,32,32,33,33,33,33,33,34,34,34,34,34,34,35,35,35,35,35,35,35,35,35,36,36,36,36,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38};
433+
#else
434+
{0,1,3,5,7,9,11,13,15,17,18,19,20,21,22,23,24,25,26,27,28,28,29,29,30,30,31,31,31,32,32,32,33,33,33,34,34,35,35,35,36,36,36,37,37,37,37,38,38,38,38,38,39,39,39,39,39,40,40,40,40,40,40,40,41,41,41,41,41,42,42,42,42,42,43,43,43,43,43,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,46,46,46,46,46,46,46,46,46,47,47,47,47,47,47,47,47,47,47,47,48,48,48,48,48,48,48,48,48,48,48,48,48,48};
435+
#endif
407436
static_assert(sizeof(szclass_table) == 128, "");
408437

409438
STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass(unsigned sz)

src/julia_threads.h

+3-9
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
#ifndef JL_THREADS_H
55
#define JL_THREADS_H
66

7-
#include "work-stealing-queue.h"
87
#include "julia_atomics.h"
8+
#include "work-stealing-queue.h"
99
#ifndef _OS_WINDOWS_
1010
#include "pthread.h"
1111
#endif
@@ -160,14 +160,8 @@ typedef struct {
160160
arraylist_t *last_remset;
161161

162162
// variables for allocating objects from pools
163-
#ifdef _P64
164-
# define JL_GC_N_POOLS 49
165-
#elif MAX_ALIGN == 8
166-
# define JL_GC_N_POOLS 50
167-
#else
168-
# define JL_GC_N_POOLS 51
169-
#endif
170-
jl_gc_pool_t norm_pools[JL_GC_N_POOLS];
163+
#define JL_GC_N_MAX_POOLS 51 // conservative. must be kept in sync with `src/julia_internal.h`
164+
jl_gc_pool_t norm_pools[JL_GC_N_MAX_POOLS];
171165

172166
#define JL_N_STACK_POOLS 16
173167
small_arraylist_t free_stacks[JL_N_STACK_POOLS];

src/options.h

+5
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,11 @@
8181
// Automatic Instrumenting Profiler
8282
//#define ENABLE_TIMINGS
8383

84+
// pool allocator configuration options
85+
86+
// GC_SMALL_PAGE allocates objects in 4k pages (instead of the default 16k)
87+
// #define GC_SMALL_PAGE
88+
8489

8590
// method dispatch profiling --------------------------------------------------
8691

0 commit comments

Comments
 (0)