Skip to content

Commit 85e3da0

Browse files
authored
add a compile-time option to enable 4k page sizes (#52229)
We're suffering from heavy fragmentation in some of our workloads. Add a build-time option to enable 4k pages (instead of 16k) in the GC, since that improves memory utilization considerably for us. The drawback is that this may increase the number of `madvise` system calls in the sweeping phase by a factor of 4 (each 16k page becomes four 4k pages), but concurrent page sweeping should help with some of that.
1 parent 8c9ac8d commit 85e3da0

File tree

4 files changed

+68
-18
lines changed

4 files changed

+68
-18
lines changed

src/gc.h

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,12 @@
3333
extern "C" {
3434
#endif
3535

36+
#ifdef GC_SMALL_PAGE
37+
#define GC_PAGE_LG2 12 // log2(size of a page)
38+
#else
3639
#define GC_PAGE_LG2 14 // log2(size of a page)
37-
#define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k
40+
#endif
41+
#define GC_PAGE_SZ (1 << GC_PAGE_LG2)
3842
#define GC_PAGE_OFFSET (JL_HEAP_ALIGNMENT - (sizeof(jl_taggedvalue_t) % JL_HEAP_ALIGNMENT))
3943

4044
#define jl_malloc_tag ((void*)0xdeadaa01)
@@ -242,6 +246,23 @@ typedef struct {
242246
_Atomic(size_t) n_pages_allocd;
243247
} gc_fragmentation_stat_t;
244248

249+
#ifdef GC_SMALL_PAGE
250+
#ifdef _P64
251+
#define REGION0_PG_COUNT (1 << 16)
252+
#define REGION1_PG_COUNT (1 << 18)
253+
#define REGION2_PG_COUNT (1 << 18)
254+
#define REGION0_INDEX(p) (((uintptr_t)(p) >> 12) & 0xFFFF) // shift by GC_PAGE_LG2
255+
#define REGION1_INDEX(p) (((uintptr_t)(p) >> 28) & 0x3FFFF)
256+
#define REGION_INDEX(p) (((uintptr_t)(p) >> 46) & 0x3FFFF)
257+
#else
258+
#define REGION0_PG_COUNT (1 << 10)
259+
#define REGION1_PG_COUNT (1 << 10)
260+
#define REGION2_PG_COUNT (1 << 0)
261+
#define REGION0_INDEX(p) (((uintptr_t)(p) >> 12) & 0x3FF) // shift by GC_PAGE_LG2
262+
#define REGION1_INDEX(p) (((uintptr_t)(p) >> 22) & 0x3FF)
263+
#define REGION_INDEX(p) (0)
264+
#endif
265+
#else
245266
#ifdef _P64
246267
#define REGION0_PG_COUNT (1 << 16)
247268
#define REGION1_PG_COUNT (1 << 16)
@@ -257,6 +278,7 @@ typedef struct {
257278
#define REGION1_INDEX(p) (((uintptr_t)(p) >> 22) & 0x3FF)
258279
#define REGION_INDEX(p) (0)
259280
#endif
281+
#endif
260282

261283
// define the representation of the levels of the page-table (0 to 2)
262284
typedef struct {

src/julia_internal.h

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -373,24 +373,48 @@ static const int jl_gc_sizeclasses[] = {
373373
144, 160, 176, 192, 208, 224, 240, 256,
374374

375375
// the following tables are computed for maximum packing efficiency via the formula:
376-
// pg = 2^14
376+
// pg = GC_SMALL_PAGE ? 2^12 : 2^14
377377
// sz = (div.(pg-8, rng).÷16)*16; hcat(sz, (pg-8).÷sz, pg .- (pg-8).÷sz.*sz)'
378378

379+
#ifdef GC_SMALL_PAGE
380+
// rng = 15:-1:2 (14 pools)
381+
272, 288, 304, 336, 368, 400, 448, 496, 576, 672, 816, 1008, 1360, 2032
382+
// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, /pool
383+
// 16, 64, 144, 64, 48, 96, 64, 128, 64, 64, 16, 64, 16, 32, bytes lost
384+
#else
379385
// rng = 60:-4:32 (8 pools)
380386
272, 288, 304, 336, 368, 400, 448, 496,
381-
// 60, 56, 53, 48, 44, 40, 36, 33, /pool
382-
// 64, 256, 272, 256, 192, 384, 256, 16, bytes lost
387+
// 60, 56, 53, 48, 44, 40, 36, 33, /pool
388+
// 64, 256, 272, 256, 192, 384, 256, 16, bytes lost
383389

384390
// rng = 30:-2:16 (8 pools)
385391
544, 576, 624, 672, 736, 816, 896, 1008,
386-
// 30, 28, 26, 24, 22, 20, 18, 16, /pool
387-
// 64, 256, 160, 256, 192, 64, 256, 256, bytes lost
392+
// 30, 28, 26, 24, 22, 20, 18, 16, /pool
393+
// 64, 256, 160, 256, 192, 64, 256, 256, bytes lost
388394

389395
// rng = 15:-1:8 (8 pools)
390396
1088, 1168, 1248, 1360, 1488, 1632, 1808, 2032
391-
// 15, 14, 13, 12, 11, 10, 9, 8, /pool
392-
// 64, 32, 160, 64, 16, 64, 112, 128, bytes lost
397+
// 15, 14, 13, 12, 11, 10, 9, 8, /pool
398+
// 64, 32, 160, 64, 16, 64, 112, 128, bytes lost
399+
#endif
393400
};
401+
#ifdef GC_SMALL_PAGE
402+
#ifdef _P64
403+
# define JL_GC_N_POOLS 39
404+
#elif MAX_ALIGN == 8
405+
# define JL_GC_N_POOLS 40
406+
#else
407+
# define JL_GC_N_POOLS 41
408+
#endif
409+
#else
410+
#ifdef _P64
411+
# define JL_GC_N_POOLS 49
412+
#elif MAX_ALIGN == 8
413+
# define JL_GC_N_POOLS 50
414+
#else
415+
# define JL_GC_N_POOLS 51
416+
#endif
417+
#endif
394418
static_assert(sizeof(jl_gc_sizeclasses) / sizeof(jl_gc_sizeclasses[0]) == JL_GC_N_POOLS, "");
395419

396420
STATIC_INLINE int jl_gc_alignment(size_t sz) JL_NOTSAFEPOINT
@@ -417,7 +441,12 @@ JL_DLLEXPORT int jl_alignment(size_t sz) JL_NOTSAFEPOINT;
417441

418442
// the following table is computed as:
419443
// [searchsortedfirst(jl_gc_sizeclasses, i) - 1 for i = 0:16:jl_gc_sizeclasses[end]]
420-
static const uint8_t szclass_table[] = {0, 1, 3, 5, 7, 9, 11, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 44, 44, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48};
444+
static const uint8_t szclass_table[] =
445+
#ifdef GC_SMALL_PAGE
446+
{0,1,3,5,7,9,11,13,15,17,18,19,20,21,22,23,24,25,26,27,28,28,29,29,30,30,31,31,31,32,32,32,33,33,33,33,33,34,34,34,34,34,34,35,35,35,35,35,35,35,35,35,36,36,36,36,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38};
447+
#else
448+
{0,1,3,5,7,9,11,13,15,17,18,19,20,21,22,23,24,25,26,27,28,28,29,29,30,30,31,31,31,32,32,32,33,33,33,34,34,35,35,35,36,36,36,37,37,37,37,38,38,38,38,38,39,39,39,39,39,40,40,40,40,40,40,40,41,41,41,41,41,42,42,42,42,42,43,43,43,43,43,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,46,46,46,46,46,46,46,46,46,47,47,47,47,47,47,47,47,47,47,47,48,48,48,48,48,48,48,48,48,48,48,48,48,48};
449+
#endif
421450
static_assert(sizeof(szclass_table) == 128, "");
422451

423452
STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass(unsigned sz) JL_NOTSAFEPOINT

src/julia_threads.h

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
#ifndef JL_THREADS_H
55
#define JL_THREADS_H
66

7-
#include "work-stealing-queue.h"
87
#include "julia_atomics.h"
8+
#include "work-stealing-queue.h"
99
#ifndef _OS_WINDOWS_
1010
#include "pthread.h"
1111
#endif
@@ -161,14 +161,8 @@ typedef struct {
161161
arraylist_t *last_remset;
162162

163163
// variables for allocating objects from pools
164-
#ifdef _P64
165-
# define JL_GC_N_POOLS 49
166-
#elif MAX_ALIGN == 8
167-
# define JL_GC_N_POOLS 50
168-
#else
169-
# define JL_GC_N_POOLS 51
170-
#endif
171-
jl_gc_pool_t norm_pools[JL_GC_N_POOLS];
164+
#define JL_GC_N_MAX_POOLS 51 // conservative. must be kept in sync with `src/julia_internal.h`
165+
jl_gc_pool_t norm_pools[JL_GC_N_MAX_POOLS];
172166

173167
#define JL_N_STACK_POOLS 16
174168
small_arraylist_t free_stacks[JL_N_STACK_POOLS];

src/options.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,11 @@
7878
// OBJPROFILE counts objects by type
7979
// #define OBJPROFILE
8080

81+
// pool allocator configuration options
82+
83+
// GC_SMALL_PAGE allocates objects in 4k pages instead of the default 16k pages
84+
// #define GC_SMALL_PAGE
85+
8186

8287
// method dispatch profiling --------------------------------------------------
8388

0 commit comments

Comments
 (0)