Skip to content

Commit f5a77a6

Browse files
authored
Introduce C-style API (#370)
* Major refactoring - introduce C-style API
* Clean up
* Add <cassert>
* Add <iterator>
* Add <algorithm> ....
* Fix timing reporting and accumulation
* Measure eval time only for single-token calls
* Change llama_tokenize return meaning
1 parent da0e9fe commit f5a77a6

File tree

14 files changed

+1949
-1747
lines changed

14 files changed

+1949
-1747
lines changed

CMakeLists.txt

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -207,15 +207,10 @@ else()
207207
message(STATUS "Unknown architecture")
208208
endif()
209209

210-
211210
#
212-
# Build library
211+
# Build libraries
213212
#
214213

215-
add_executable(llama main.cpp)
216-
217-
add_executable(quantize quantize.cpp)
218-
219214
add_library(utils OBJECT
220215
utils.cpp
221216
utils.h)
@@ -229,14 +224,24 @@ add_library(ggml OBJECT
229224

230225
target_include_directories(ggml PUBLIC .)
231226
target_compile_features(ggml PUBLIC c_std_11) # don't bump
227+
target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})
228+
229+
add_library(llama OBJECT
230+
llama.cpp
231+
llama.h)
232+
233+
target_include_directories(llama PUBLIC .)
234+
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
232235

233236
#
234-
# Linking
237+
# Executables
235238
#
236239

237-
target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})
238-
target_link_libraries(llama PRIVATE ggml utils)
239-
target_link_libraries(quantize PRIVATE ggml utils)
240+
add_executable(main main.cpp)
241+
target_link_libraries(main PRIVATE llama ggml utils)
242+
243+
add_executable(quantize quantize.cpp)
244+
target_link_libraries(quantize PRIVATE llama ggml utils)
240245

241246
#
242247
# programs, examples and tests

Makefile

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -220,18 +220,21 @@ default: main quantize
220220
ggml.o: ggml.c ggml.h
221221
$(CC) $(CFLAGS) -c ggml.c -o ggml.o
222222

223+
llama.o: llama.cpp llama.h
224+
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
225+
223226
utils.o: utils.cpp utils.h
224227
$(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o
225228

226229
clean:
227230
rm -f *.o main quantize
228231

229-
main: main.cpp ggml.o utils.o
230-
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
232+
main: main.cpp ggml.o llama.o utils.o
233+
$(CXX) $(CXXFLAGS) main.cpp ggml.o llama.o utils.o -o main $(LDFLAGS)
231234
@echo "\x1b[36mrun ./main -h for help\x1b[0m"
232235

233-
quantize: quantize.cpp ggml.o utils.o
234-
$(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS)
236+
quantize: quantize.cpp ggml.o llama.o utils.o
237+
$(CXX) $(CXXFLAGS) quantize.cpp ggml.o llama.o utils.o -o quantize $(LDFLAGS)
235238

236239
#
237240
# Tests

convert-pth-to-ggml.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ def main():
148148
model = torch.load(fname_model, map_location="cpu")
149149

150150
with open(fname_out, "wb") as fout:
151-
fout.write(struct.pack("i", hparams["vocab_size"]))
151+
write_header(fout, hparams, ftype)
152152
write_tokens(fout, tokenizer)
153153

154154
del model

ggml.c

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10702,6 +10702,127 @@ enum ggml_opt_result ggml_opt(
1070210702

1070310703
////////////////////////////////////////////////////////////////////////////////
1070410704

10705+
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
10706+
const int nb = k / qk;
10707+
const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
10708+
const size_t row_size = nb*bs;
10709+
10710+
assert(k % qk == 0);
10711+
10712+
const size_t pp_size = qk / 2;
10713+
uint8_t * pp = (uint8_t *) alloca(pp_size);
10714+
10715+
char * pdst = (char *) dst;
10716+
10717+
for (int j = 0; j < n; j += k) {
10718+
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
10719+
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
10720+
10721+
for (int i = 0; i < nb; i++) {
10722+
float amax = 0.0f; // absolute max
10723+
10724+
{
10725+
for (int l = 0; l < qk; l++) {
10726+
const float v = src[j + i*qk + l];
10727+
amax = MAX(amax, fabsf(v));
10728+
}
10729+
10730+
const float d = amax / ((1 << 3) - 1);
10731+
const float id = d ? 1.0f/d : 0.0f;
10732+
10733+
*(float *) pd = d;
10734+
pd += bs;
10735+
10736+
for (int l = 0; l < qk; l += 2) {
10737+
const float v0 = (src[j + i*qk + l + 0])*id;
10738+
const float v1 = (src[j + i*qk + l + 1])*id;
10739+
10740+
const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
10741+
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
10742+
10743+
assert(vi0 >= 0 && vi0 < 16);
10744+
assert(vi1 >= 0 && vi1 < 16);
10745+
10746+
hist[vi0]++;
10747+
hist[vi1]++;
10748+
10749+
pp[l/2] = vi0 | (vi1 << 4);
10750+
}
10751+
10752+
memcpy(pb, pp, pp_size);
10753+
pb += bs;
10754+
}
10755+
}
10756+
}
10757+
10758+
return (n/k)*row_size;
10759+
}
10760+
10761+
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
10762+
const int nb = k / qk;
10763+
const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
10764+
const size_t row_size = nb*bs;
10765+
10766+
assert(k % qk == 0);
10767+
10768+
const size_t pp_size = qk / 2;
10769+
uint8_t * pp = (uint8_t *) alloca(pp_size);
10770+
10771+
char * pdst = (char *) dst;
10772+
10773+
for (int j = 0; j < n; j += k) {
10774+
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
10775+
uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
10776+
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));
10777+
10778+
//printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
10779+
10780+
for (int i = 0; i < nb; i++) {
10781+
float min = FLT_MAX;
10782+
float max = -FLT_MAX;
10783+
10784+
{
10785+
for (int l = 0; l < qk; l++) {
10786+
const float v = src[j + i*qk + l];
10787+
if (v < min) min = v;
10788+
if (v > max) max = v;
10789+
}
10790+
10791+
const float d = (max - min) / ((1 << 4) - 1);
10792+
const float id = d ? 1.0f/d : 0.0f;
10793+
10794+
*(float *) pd = d;
10795+
*(float *) pm = min;
10796+
pd += bs;
10797+
pm += bs;
10798+
10799+
for (int l = 0; l < qk; l += 2) {
10800+
const float v0 = (src[j + i*qk + l + 0] - min)*id;
10801+
const float v1 = (src[j + i*qk + l + 1] - min)*id;
10802+
10803+
const uint8_t vi0 = round(v0);
10804+
const uint8_t vi1 = round(v1);
10805+
10806+
assert(vi0 >= 0 && vi0 < 16);
10807+
assert(vi1 >= 0 && vi1 < 16);
10808+
10809+
hist[vi0]++;
10810+
hist[vi1]++;
10811+
10812+
pp[l/2] = vi0 | (vi1 << 4);
10813+
}
10814+
10815+
memcpy(pb, pp, pp_size);
10816+
pb += bs;
10817+
}
10818+
}
10819+
}
10820+
10821+
return (n/k)*row_size;
10822+
}
10823+
10824+
////////////////////////////////////////////////////////////////////////////////
10825+
1070510826
int ggml_cpu_has_avx(void) {
1070610827
#if defined(__AVX__)
1070710828
return 1;

ggml.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,13 @@ enum ggml_opt_result ggml_opt(
741741
struct ggml_opt_params params,
742742
struct ggml_tensor * f);
743743

744+
//
745+
// quantization
746+
//
747+
748+
// Quantize n floats from src (rows of k values, split into blocks of qk values)
// into dst. Each block is written as a float scale followed by qk/2 bytes that
// pack two 4-bit values each. hist is incremented once per quantized value,
// indexed by that value (0..15). k must be a multiple of qk.
// Returns (n/k) * (k/qk) * (sizeof(float) + qk/2), the byte size of the output.
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
749+
// Quantize n floats from src (rows of k values, split into blocks of qk values)
// into dst. Each block is written as a float scale, a float minimum, and then
// qk/2 bytes that pack two 4-bit values each. hist is incremented once per
// quantized value, indexed by that value (0..15). k must be a multiple of qk.
// Returns (n/k) * (k/qk) * (2*sizeof(float) + qk/2), the byte size of the output.
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
750+
744751
//
745752
// system info
746753
//

0 commit comments

Comments
 (0)