
Commit d38d59c

Merge branch 'master' of https://github.com/ggerganov/llama.cpp into clang-warnings

2 parents: a6b7476 + 65c2c1c

11 files changed: +33 −23 lines

.github/workflows/build.yml

Lines changed: 1 addition & 0 deletions
@@ -468,6 +468,7 @@ jobs:
       with:
         operating_system: freebsd
         version: '13.2'
+        hypervisor: 'qemu'
         run: |
           sudo pkg update
           sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas

Makefile

Lines changed: 7 additions & 7 deletions
@@ -542,22 +542,22 @@ main: examples/main/main.cpp build-info.h ggml.
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
-simple: examples/simple/simple.cpp ggml.o llama.o common.o $(OBJS)
+simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
+embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o common.o $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)

@@ -610,7 +610,7 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 
 tests: $(TEST_TARGETS)
 
-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS)
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 run-benchmark-matmult: benchmark-matmult
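The added build-info.h prerequisite works together with the existing $(filter-out %.h,$^) idiom: listing the header makes each example relink when the build metadata changes, while the filter keeps the header itself off the compiler command line. As a rough illustration, assuming the generated header defines BUILD_NUMBER and BUILD_COMMIT macros (a sketch, not the actual output of scripts/build-info.sh):

// Hypothetical stand-in for the generated build-info.h:
//
//     #pragma once
//     #define BUILD_NUMBER 1234
//     #define BUILD_COMMIT "d38d59c"
//
// A tool that wants to report its version now includes the header directly
// (as the example sources below do) instead of picking it up via common.h:
#include "build-info.h"
#include <cstdio>

int main() {
    fprintf(stderr, "build = %d (%s)\n", BUILD_NUMBER, BUILD_COMMIT);
    return 0;
}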

common/common.h

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@
 #pragma once
 
 #include "llama.h"
-#include "build-info.h"
 
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"

examples/benchmark/benchmark-matmult.cpp

Lines changed: 17 additions & 14 deletions
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "ggml.h"
 

@@ -32,11 +33,11 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 }
 
 static float tensor_sum_elements(const ggml_tensor * tensor) {
-    float sum = 0;
-    if (tensor->type==GGML_TYPE_F32) {
+    double sum = 0;
+    if (tensor->type == GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
             for (int k = 0; k < tensor->ne[0]; k++) {
-                sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
+                sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
             }
         }
     }

@@ -125,12 +126,15 @@ int main(int argc, char ** argv) {
 
     //printf("Memsize required = %i\n", sizex*sizex);
 
+    // TODO: perform the bench for all types or for a user specified type
+    const ggml_type qtype = GGML_TYPE_Q4_1;
+
     size_t ctx_size = 0;
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += 1024*1024*16;

@@ -163,7 +167,7 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
     ggml_set_f32(m2, 2.0f);
 
-    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 1 - Matrix Mult via F32 code\n");
     // printf("Creating new tensor m11xm2\n");
     struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
 

@@ -181,17 +185,16 @@ int main(int argc, char ** argv) {
 
     TENSOR_DUMP(gf.nodes[0]);
 
-    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
 
     int32_t nelements = sizex*sizey;
-    int32_t ne[2] = { sizex, sizey };
 
     std::vector<int64_t> hist_cur(1 << 4, 0);
 
     // Set up a the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
-    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
 
     // Set up a the compute graph
     // printf("Creating new tensor q31\n");

@@ -202,8 +205,8 @@ int main(int argc, char ** argv) {
 
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
-    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
 
     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);

@@ -220,7 +223,7 @@ int main(int argc, char ** argv) {
     printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
 
 
-    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    // Let's use the F32 result from above as a reference for the quantized multiplication
     float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
 
     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");

@@ -250,7 +253,7 @@ int main(int argc, char ** argv) {
         // Check that the matrix multiplication result is in the right ballpark
         // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
         float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
-        float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
+        float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
         float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
 
         if (delta > allowed_delta) {
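A side note on the tensor_sum_elements change above: summing a large F32 matrix into a float accumulator can stall once the running total exceeds what a single-precision mantissa can resolve, which would make the delta check against the F32 reference unreliable; widening the accumulator to double avoids that. A minimal standalone sketch of the effect (hypothetical sizes, unrelated to the benchmark's matrices):

#include <cstdio>
#include <vector>

int main() {
    // 20 million ones: the float accumulator stops growing at 2^24 (16777216),
    // because adding 1.0f no longer changes the stored value, while the
    // double accumulator absorbs every increment.
    std::vector<float> data(20 * 1000 * 1000, 1.0f);
    float  sum_f = 0.0f;
    double sum_d = 0.0;
    for (float x : data) {
        sum_f += x;
        sum_d += x;
    }
    printf("float  accumulator: %.1f\n", sum_f); // 16777216.0
    printf("double accumulator: %.1f\n", sum_d); // 20000000.0
    return 0;
}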

examples/embd-input/embd-input-lib.cpp

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "embd-input.h"
 

examples/embedding/embedding.cpp

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "llama.h"
 

examples/perplexity/perplexity.cpp

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "llama.h"
 

examples/quantize-stats/quantize-stats.cpp

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 #define LLAMA_API_INTERNAL
+#include "build-info.h"
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"

examples/quantize/quantize.cpp

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "llama.h"
 

examples/save-load-state/save-load-state.cpp

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "llama.h"
 

flake.nix

Lines changed: 2 additions & 1 deletion
@@ -52,7 +52,8 @@
         in
         {
           packages.default = pkgs.stdenv.mkDerivation {
-            inherit name src meta postPatch nativeBuildInputs buildInputs postInstall;
+            inherit name src meta postPatch nativeBuildInputs postInstall;
+            buildInputs = osSpecific;
             cmakeFlags = cmakeFlags
               ++ (if isAarch64 && isDarwin then [
                 "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
