Skip to content

Commit e702f2f

Browse files
slaren authored and arthw committed
ggml : reduce hash table reset cost (ggml-org#8698)
* ggml : reduce hash table reset cost
* fix unreachable code warnings after GGML_ASSERT(false)
* GGML_ASSERT(false) -> GGML_ABORT("fatal error")
* GGML_ABORT use format string
1 parent a1cf044 commit e702f2f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+851
-754
lines changed

Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -325,9 +325,9 @@ ifdef LLAMA_DEBUG
325325
endif
326326
else
327327
MK_CPPFLAGS += -DNDEBUG
328-
MK_CFLAGS += -O3
329-
MK_CXXFLAGS += -O3
330-
MK_NVCCFLAGS += -O3
328+
MK_CFLAGS += -O3 -g
329+
MK_CXXFLAGS += -O3 -g
330+
MK_NVCCFLAGS += -O3 -g
331331
endif
332332

333333
ifdef LLAMA_SANITIZE_THREAD

examples/eval-callback/eval-callback.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
6262
} else if (type == GGML_TYPE_I8) {
6363
v = (float) *(int8_t *) &data[i];
6464
} else {
65-
GGML_ASSERT(false);
65+
GGML_ABORT("fatal error");
6666
}
6767
printf("%12.4f", v);
6868
sum += v;

examples/imatrix/imatrix.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
127127
}
128128
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
129129
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
130-
exit(1); //GGML_ASSERT(false);
130+
exit(1); //GGML_ABORT("fatal error");
131131
}
132132
if (m_params.verbosity > 1) {
133133
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
@@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
176176
}
177177
else if (e.values.size() != (size_t)src1->ne[0]) {
178178
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
179-
exit(1); //GGML_ASSERT(false);
179+
exit(1); //GGML_ABORT("fatal error");
180180
}
181181
++e.ncall;
182182
if (m_params.verbosity > 1) {

examples/llama-bench/llama-bench.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ static const char * output_format_str(output_formats format) {
150150
case JSON: return "json";
151151
case MARKDOWN: return "md";
152152
case SQL: return "sql";
153-
default: GGML_ASSERT(!"invalid output format");
153+
default: GGML_ABORT("invalid output format");
154154
}
155155
}
156156

@@ -176,7 +176,7 @@ static const char * split_mode_str(llama_split_mode mode) {
176176
case LLAMA_SPLIT_MODE_NONE: return "none";
177177
case LLAMA_SPLIT_MODE_LAYER: return "layer";
178178
case LLAMA_SPLIT_MODE_ROW: return "row";
179-
default: GGML_ASSERT(!"invalid split mode");
179+
default: GGML_ABORT("invalid split mode");
180180
}
181181
}
182182

@@ -1326,7 +1326,7 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
13261326
case SQL:
13271327
return std::unique_ptr<printer>(new sql_printer());
13281328
}
1329-
GGML_ASSERT(false);
1329+
GGML_ABORT("fatal error");
13301330
}
13311331

13321332
int main(int argc, char ** argv) {

examples/llava/clip.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -869,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
869869
embeddings = peg_0;
870870
}
871871
else {
872-
GGML_ASSERT(false);
872+
GGML_ABORT("fatal error");
873873
}
874874
}
875875

examples/tokenize/tokenize.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
163163
printf(">");
164164
return;
165165
}
166-
GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
166+
GGML_ABORT("MultiByteToWideChar() failed in an unexpected way.");
167167
}
168168

169169
LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));

ggml/include/ggml.h

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -254,18 +254,8 @@
254254

255255
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
256256

257-
#define GGML_ASSERT(x) \
258-
do { \
259-
if (!(x)) { \
260-
fflush(stdout); \
261-
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
262-
ggml_print_backtrace(); \
263-
abort(); \
264-
} \
265-
} while (0)
266-
267257
#ifndef NDEBUG
268-
#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
258+
#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
269259
#elif defined(__GNUC__)
270260
#define GGML_UNREACHABLE() __builtin_unreachable()
271261
#elif defined(_MSC_VER)
@@ -274,6 +264,17 @@
274264
#define GGML_UNREACHABLE() ((void) 0)
275265
#endif
276266

267+
#ifdef __cplusplus
268+
#define GGML_NORETURN [[noreturn]]
269+
#elif defined(_MSC_VER)
270+
#define GGML_NORETURN __declspec(noreturn)
271+
#else
272+
#define GGML_NORETURN _Noreturn
273+
#endif
274+
275+
#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
276+
#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
277+
277278
// used to copy the number of elements and stride in bytes of tensors into local variables.
278279
// main purpose is to reduce code duplication and improve readability.
279280
//
@@ -322,6 +323,9 @@
322323
extern "C" {
323324
#endif
324325

326+
GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
327+
GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
328+
325329
enum ggml_status {
326330
GGML_STATUS_ALLOC_FAILED = -2,
327331
GGML_STATUS_FAILED = -1,
@@ -636,8 +640,11 @@ extern "C" {
636640
GGML_CGRAPH_EVAL_ORDER_COUNT
637641
};
638642

643+
typedef uint32_t ggml_bitset_t;
644+
639645
struct ggml_hash_set {
640646
size_t size;
647+
ggml_bitset_t * used;
641648
struct ggml_tensor ** keys;
642649
};
643650

@@ -651,7 +658,7 @@ extern "C" {
651658
struct ggml_tensor ** grads;
652659
struct ggml_tensor ** leafs;
653660

654-
struct ggml_hash_set visited_hash_table;
661+
struct ggml_hash_set visited_hash_set;
655662

656663
enum ggml_cgraph_eval_order order;
657664
};
@@ -698,8 +705,6 @@ extern "C" {
698705
GGML_API int64_t ggml_cycles(void);
699706
GGML_API int64_t ggml_cycles_per_ms(void);
700707

701-
GGML_API void ggml_print_backtrace(void);
702-
703708
// accepts a UTF-8 path, even on Windows
704709
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
705710

@@ -2005,8 +2010,8 @@ extern "C" {
20052010

20062011
// ggml_graph_plan() has to be called before ggml_graph_compute()
20072012
// when plan.work_size > 0, caller must allocate memory for plan.work_data
2008-
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
2009-
GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
2013+
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
2014+
GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
20102015
// same as ggml_graph_compute() but the work data is allocated as a part of the context
20112016
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
20122017
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

ggml/src/ggml-alloc.c

Lines changed: 18 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
9191
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
9292
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
9393
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
94-
GGML_ASSERT(!"not enough space in the buffer");
95-
return;
94+
GGML_ABORT("not enough space in the buffer");
9695
}
9796

9897
void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
133132
return;
134133
}
135134
}
136-
GGML_ASSERT(!"out of allocated_tensors");
135+
GGML_ABORT("out of allocated_tensors");
137136
}
138137
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
139138
for (int i = 0; i < 1024; i++) {
@@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
142141
return;
143142
}
144143
}
145-
fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
146-
GGML_ASSERT(!"tensor not found");
144+
GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
147145
}
148146
#endif
149147

@@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
176174
// this should never happen
177175
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
178176
__func__, size, max_avail);
179-
GGML_ASSERT(!"not enough space in the buffer");
180-
GGML_UNREACHABLE();
177+
GGML_ABORT("not enough space in the buffer");
181178
}
182179
}
183180

@@ -443,7 +440,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
443440
}
444441
}
445442

446-
free(galloc->hash_set.keys);
443+
ggml_hash_set_free(&galloc->hash_set);
447444
free(galloc->hash_values);
448445
free(galloc->bufts);
449446
free(galloc->buffers);
@@ -456,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
456453
typedef struct ggml_gallocr * ggml_gallocr_t;
457454

458455
static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
459-
size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
456+
size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
460457
return &galloc->hash_values[i];
461458
}
462459

@@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
565562

566563
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
567564
// clear hash tables
568-
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
569-
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
565+
ggml_hash_set_reset(&galloc->hash_set);
566+
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
570567

571568
// allocate leafs
572569
// these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@@ -671,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
671668
}
672669

673670
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
674-
size_t hash_size = graph->visited_hash_table.size;
671+
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
672+
// add 25% margin to avoid hash collisions
673+
min_hash_size += min_hash_size / 4;
675674

676675
// initialize hash table
677-
if (galloc->hash_set.size < hash_size) {
678-
free(galloc->hash_set.keys);
679-
free(galloc->hash_values);
680-
galloc->hash_set.size = hash_size;
681-
galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
682-
galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
676+
if (galloc->hash_set.size < min_hash_size) {
677+
ggml_hash_set_free(&galloc->hash_set);
678+
galloc->hash_set = ggml_hash_set_new(min_hash_size);
683679
GGML_ASSERT(galloc->hash_set.keys != NULL);
680+
681+
free(galloc->hash_values);
682+
galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
684683
GGML_ASSERT(galloc->hash_values != NULL);
685-
} else {
686-
// reset hash table
687-
memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
688-
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
689684
}
690685

691686
// reset allocators
@@ -817,8 +812,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
817812
}
818813

819814
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
820-
ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
821-
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
815+
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
822816
return talloc->size_max >= node_size;
823817
}
824818

0 commit comments

Comments (0)