
Commit 5d259d3

Merge branch 'master' of github.com:ggerganov/llama.cpp into phillip-kravtsov/support-adept-persimmon-8b. ggml-ci

2 parents: c90ed9f + e2583cb

39 files changed: +3370 -783 lines

.github/workflows/build.yml

Lines changed: 31 additions & 8 deletions
@@ -188,7 +188,7 @@ jobs:
         sysctl -a
         mkdir build
         cd build
-        cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF ..
+        cmake ..
         cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
     - name: Test
@@ -253,6 +253,29 @@ jobs:
             -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
         cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
+  macOS-latest-swift:
+    runs-on: macos-latest
+
+    strategy:
+      matrix:
+        destination: ['platform=macOS,name=Any Mac', 'platform=iOS,name=Any iOS Device', 'platform=tvOS,name=Any tvOS Device']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: xcodebuild for swift package
+        id: xcodebuild
+        run: |
+          xcodebuild -scheme llama -destination "${{ matrix.destination }}"
+
   windows-latest-cmake:
     runs-on: windows-latest
 
@@ -265,17 +288,17 @@ jobs:
       matrix:
         include:
           - build: 'noavx'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
           - build: 'avx2'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'avx'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
           - build: 'avx512'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'clblast'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
 
     steps:
       - name: Clone
@@ -414,7 +437,7 @@ jobs:
       run: |
         mkdir build
         cd build
-        cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
+        cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
         cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
 
     - name: Determine tag name

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -91,4 +91,5 @@ tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0-llama
 tests/test-tokenizer-0-falcon
-tests/test-tokenizer-1
+tests/test-tokenizer-1-llama
+tests/test-tokenizer-1-bpe

CMakeLists.txt

Lines changed: 17 additions & 11 deletions
@@ -44,7 +44,7 @@ endif()
 
 # general
 option(LLAMA_STATIC "llama: static link libraries" OFF)
-option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
+option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
 option(LLAMA_LTO "llama: enable link time optimization" OFF)
 
 # debug
@@ -58,15 +58,21 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer"
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 
 # instruction set specific
-option(LLAMA_AVX "llama: enable AVX" ON)
-option(LLAMA_AVX2 "llama: enable AVX2" ON)
-option(LLAMA_AVX512 "llama: enable AVX512" OFF)
-option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
-option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
-option(LLAMA_FMA "llama: enable FMA" ON)
+if (LLAMA_NATIVE)
+    set(INS_ENB OFF)
+else()
+    set(INS_ENB ON)
+endif()
+
+option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
+option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
+option(LLAMA_AVX512 "llama: enable AVX512" OFF)
+option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
+option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
+option(LLAMA_FMA "llama: enable FMA" ${INS_ENB})
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)
-    option(LLAMA_F16C "llama: enable F16C" ON)
+    option(LLAMA_F16C "llama: enable F16C" ${INS_ENB})
 endif()
 
 # 3rd party libs
@@ -504,9 +510,6 @@ if (NOT MSVC)
     if (LLAMA_GPROF)
         add_compile_options(-pg)
     endif()
-    if (LLAMA_NATIVE)
-        add_compile_options(-march=native)
-    endif()
 endif()
 
 if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
@@ -561,6 +564,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
         add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
     endif()
 else()
+    if (LLAMA_NATIVE)
+        add_compile_options(-march=native)
+    endif()
     if (LLAMA_F16C)
         add_compile_options(-mf16c)
     endif()

Makefile

Lines changed: 7 additions & 2 deletions
@@ -2,7 +2,7 @@
 BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
 
 # Binaries only useful for tests
-TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -62,9 +62,11 @@ test: $(TEST_TARGETS)
 		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
 			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
-			continue; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
 			continue; \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
+			continue; \
 		else \
 			echo "Running test $$test_target..."; \
 			./$$test_target; \
@@ -670,6 +672,9 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg
 tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
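Editor's note: the new tests/test-tokenizer-1-bpe binary is built like the other tokenizer tests and presumably takes a ggml-vocab-*.gguf vocabulary as argument like the tokenizer-0 tests do, though the `test` rule above still skips it for now. A rough Python sketch of the kind of round-trip property such tokenizer tests check — illustrative only, with a toy whitespace tokenizer standing in for the real C++ BPE implementation:

```python
# Illustrative sketch only -- the real test-tokenizer-1-* programs are C++ and
# load a ggml-vocab-*.gguf file; here a toy whitespace tokenizer stands in.
def round_trips(tokenize, detokenize, text: str) -> bool:
    """A tokenizer is consistent if detokenizing its output recovers the input."""
    return detokenize(tokenize(text)) == text

vocab = {"hello": 0, "world": 1}
inv_vocab = {i: tok for tok, i in vocab.items()}

tokenize   = lambda s: [vocab[w] for w in s.split()]
detokenize = lambda ids: " ".join(inv_vocab[i] for i in ids)

assert round_trips(tokenize, detokenize, "hello world")
```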

Package.swift

Lines changed: 6 additions & 3 deletions
@@ -44,9 +44,12 @@ let package = Package(
         cSettings: [
             .unsafeFlags(["-Wno-shorten-64-to-32"]),
             .define("GGML_USE_K_QUANTS"),
-            .define("GGML_USE_ACCELERATE"),
-            .define("ACCELERATE_NEW_LAPACK"),
-            .define("ACCELERATE_LAPACK_ILP64")
+            .define("GGML_USE_ACCELERATE")
+            // NOTE: NEW_LAPACK requires iOS 16.4+
+            // We should consider adding this in the future when we drop support for iOS 14
+            // (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
+            // .define("ACCELERATE_NEW_LAPACK"),
+            // .define("ACCELERATE_LAPACK_ILP64")
         ] + additionalSettings,
         linkerSettings: [
             .linkedFramework("Accelerate")

README.md

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 [![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 
-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
 
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 

common/common.cpp

Lines changed: 1 addition & 0 deletions
@@ -923,6 +923,7 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
         result += piece;
     }
 
+    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
     return result;
 }
 
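Editor's note: the added comment is about byte-level BPE. Reference GPT-2-style tokenizers first join the unicode-mapped pieces and only then convert that string back to raw bytes, whereas llama_detokenize_bpe above appends piece by piece. A minimal Python sketch of that collect-then-decode step, reusing the same byte/unicode table as the bytes_to_unicode helper removed from convert-falcon-hf-to-gguf.py further down — illustrative only, not llama.cpp's implementation:

```python
# Illustrative sketch only (not llama.cpp code).
def bytes_to_unicode() -> dict[int, str]:
    # Same GPT-2 mapping as the helper removed from convert-falcon-hf-to-gguf.py:
    # printable bytes map to themselves, the rest to code points >= 256.
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, map(chr, cs)))

def detokenize_gpt2(pieces: list[str]) -> str:
    byte_decoder = {ch: b for b, ch in bytes_to_unicode().items()}
    text = "".join(pieces)                        # 1) collect the pieces first ...
    raw = bytes(byte_decoder[ch] for ch in text)  # 2) ... then decode back to raw bytes
    return raw.decode("utf-8", errors="replace")

print(detokenize_gpt2(["Hello", "Ġworld"]))  # -> "Hello world" ('Ġ' encodes the space byte)
```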

convert-baichuan-hf-to-gguf.py

Lines changed: 8 additions & 2 deletions
@@ -11,11 +11,14 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 import itertools
-import gguf
 import numpy as np
 import torch
 from sentencepiece import SentencePieceProcessor # type: ignore[import]
 
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
 
 if TYPE_CHECKING:
     from typing import TypeAlias
@@ -174,8 +177,11 @@ def parse_args() -> argparse.Namespace:
 print("gguf: get sentencepiece tokenizer vocab, scores and token types")
 
 tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
+vocab_size = hparams.get('vocab_size')
+if vocab_size is None:
+    vocab_size = tokenizer.vocab_size()
 
-for i in range(tokenizer.vocab_size()):
+for i in range(vocab_size):
     text: bytes
     score: float
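Editor's note: the second hunk lets a vocab_size declared in the model's config.json take precedence over tokenizer.vocab_size(), falling back to the SentencePiece count when the key is absent — presumably for the same tensor-size-mismatch reason spelled out in the falcon converter's comment below. A tiny self-contained sketch of the fallback; the hparams dict and the stand-in tokenizer class are hypothetical, for illustration only:

```python
# Hypothetical stand-ins, for illustration only.
class FakeSentencePiece:
    """Mimics the vocab_size() accessor of sentencepiece.SentencePieceProcessor."""
    def vocab_size(self) -> int:
        return 64000

hparams = {"hidden_size": 4096}          # a config.json without a "vocab_size" key
tokenizer = FakeSentencePiece()

# Same fallback as the diff: prefer the config value, else ask the tokenizer.
vocab_size = hparams.get("vocab_size")
if vocab_size is None:
    vocab_size = tokenizer.vocab_size()

assert vocab_size == 64000               # falls back to the SentencePiece count
```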

convert-falcon-hf-to-gguf.py

Lines changed: 12 additions & 52 deletions
@@ -20,28 +20,6 @@
 import gguf
 
 
-def bytes_to_unicode():
-    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    return dict(zip(bs, (chr(n) for n in cs)))
-
-
 def count_model_parts(dir_model: Path) -> int:
     num_parts = 0
     for filename in os.listdir(dir_model):
@@ -133,50 +111,32 @@ def parse_args() -> argparse.Namespace:
 print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
-
-tokenizer_json_file = dir_model / 'tokenizer.json'
-if not tokenizer_json_file.is_file():
-    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
-    sys.exit(1)
+scores: list[float] = []
+toktypes: list[int] = []
 
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 
-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
 print("gguf: get gpt2 tokenizer vocab")
 
-# The number of tokens in tokenizer.json can differ from the expected vocab size.
-# This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
-
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v: k for k, v in byte_encoder.items()}
 
 for i in range(vocab_size):
-    if i in reverse_vocab:
-        try:
-            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
-        except KeyError:
-            text = bytearray()
-            for c in reverse_vocab[i]:
-                if ord(c) < 256: # single byte character
-                    text.append(byte_decoder[ord(c)])
-                else: # multibyte special token character
-                    text.extend(c.encode('utf-8'))
-    else:
-        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
-        pad_token = f"[PAD{i}]".encode("utf8")
-        text = bytearray(pad_token)
-
-    tokens.append(text)
+    tokens.append(reverse_vocab[i])
+    scores.append(0.0) # dummy
+    toktypes.append(gguf.TokenType.NORMAL)
 
 gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
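Editor's note: net effect of this file's changes — the converter now takes the byte-level BPE vocab straight from transformers' AutoTokenizer, stores each encoded token string verbatim, and writes dummy scores plus NORMAL token types alongside the token list; the manual bytes_to_unicode re-decoding and the [PAD{i}] padding branch are gone. A self-contained sketch of the simplified extraction, with a small hypothetical vocab dict standing in for tokenizer.vocab (illustrative only):

```python
# Illustrative sketch; `vocab` stands in for tokenizer.vocab (token string -> id)
# and NORMAL for gguf.TokenType.NORMAL.
NORMAL = 1

vocab = {"Hello": 0, "Ġworld": 1, "!": 2}     # byte-level BPE tokens, already encoded
hparams = {"vocab_size": 3}                   # assumed to cover ids 0 .. vocab_size-1

vocab_size = hparams.get("vocab_size", len(vocab))
assert max(vocab.values()) < vocab_size       # same sanity check as the diff

reverse_vocab = {tok_id: tok for tok, tok_id in vocab.items()}

tokens:   list[str]   = []
scores:   list[float] = []
toktypes: list[int]   = []
for i in range(vocab_size):
    tokens.append(reverse_vocab[i])   # stored verbatim; no byte_decoder pass
    scores.append(0.0)                # dummy score
    toktypes.append(NORMAL)

print(tokens)  # ['Hello', 'Ġworld', '!']
```

Note that the loop assumes every id below vocab_size is present in reverse_vocab; a padded config with gaps would raise a KeyError here, unlike the removed [PAD{i}] branch.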
