Skip to content

Commit 794db3e

Browse files
digiwombat, anon998, SlyEcho, cirk2, Johannes Gaessler
authored
Server Example Refactor and Improvements (#1570)
A major rewrite for the server example. Note that if you have built something on the previous server API, it will probably be incompatible. Check out the examples for how a typical chat app could work. This took a lot of effort, there are 24 PR's closed in the submitter's repo alone, over 160 commits and a lot of comments and testing. Summary of the changes: - adds missing generation parameters: tfs_z, typical_p, repeat_last_n, repeat_penalty, presence_penalty, frequency_penalty, mirostat, penalize_nl, seed, ignore_eos - applies missing top k sampler - removes interactive mode/terminal-like behavior, removes exclude parameter - moves threads and batch size to server command-line parameters - adds LoRA loading and matches command line parameters with main example - fixes stopping on EOS token and with the specified token amount with n_predict - adds server timeouts, host, and port settings - adds expanded generation complete response; adds generation settings, stop reason, prompt truncated, model used, and final text - sets defaults for unspecified parameters between requests - removes /next-token endpoint and as_loop parameter, adds stream parameter and server-sent events for streaming - adds CORS headers to responses - adds request logging, exception printing and optional verbose logging - adds better stopping words handling when matching multiple tokens and while streaming, or when it finishes on a partial stop string - adds printing an error when it can't bind to the host/port specified - fixes multi-byte character handling and replaces invalid UTF-8 characters on responses - prints timing and build info on startup - adds logit bias to request parameters - removes embedding mode - updates documentation; adds streaming Node.js and Bash examples - fixes code formatting - sets server threads to 1 since the current global state doesn't work well with simultaneous requests - adds truncation of the input prompt and better context reset - removes token limit from the 
input prompt - significantly simplified the logic and removed a lot of variables --------- Co-authored-by: anon998 <[email protected]> Co-authored-by: Henri Vasserman <[email protected]> Co-authored-by: Felix Hellmann <[email protected]> Co-authored-by: Johannes Gäßler <[email protected]> Co-authored-by: Lesaun Harvey <[email protected]>
1 parent 5ddf7ea commit 794db3e

File tree

7 files changed

+1119
-953
lines changed

7 files changed

+1119
-953
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ models/*
3636
/train-text-from-scratch
3737
/benchmark-matmult
3838
/vdot
39+
/server
3940
/Pipfile
4041
/libllama.so
4142

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-tex
33

44
ifdef LLAMA_BUILD_SERVER
55
BUILD_TARGETS += server
6+
LLAMA_SERVER_VERBOSE ?= 1
7+
server: private CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
68
endif
79

810
default: $(BUILD_TARGETS)

examples/server/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
set(TARGET server)
2+
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
23
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
34
add_executable(${TARGET} server.cpp json.hpp httplib.h)
5+
target_compile_definitions(${TARGET} PRIVATE
6+
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
7+
)
48
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
59
target_compile_features(${TARGET} PRIVATE cxx_std_11)
610
if(TARGET BUILD_INFO)

examples/server/README.md

Lines changed: 91 additions & 227 deletions
Large diffs are not rendered by default.

examples/server/chat.mjs

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import * as readline from 'node:readline'
import { stdin, stdout } from 'node:process'

const API_URL = 'http://127.0.0.1:8080'

// Seed conversation: example turns shown to the model before the
// user's first real question.
const chat = [
    {
        human: "Hello, Assistant.",
        assistant: "Hello. How may I help you today?"
    },
    {
        human: "Please tell me the largest city in Europe.",
        assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia."
    },
]

// System instruction prepended to every prompt.
const instruction = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.`
18+
19+
// Render the instruction, the prior chat turns, and the new question
// into the "### Human / ### Assistant" prompt layout the model expects;
// the trailing "### Assistant:" leaves room for the model's reply.
function format_prompt(question) {
    const history = chat
        .map(m => `### Human: ${m.human}\n### Assistant: ${m.assistant}`)
        .join("\n")
    return `${instruction}\n${history}\n### Human: ${question}\n### Assistant:`
}
24+
25+
// POST `content` to the server's /tokenize endpoint and return the
// array of token ids; returns an empty array on any HTTP error.
async function tokenize(content) {
    const result = await fetch(`${API_URL}/tokenize`, {
        method: 'POST',
        body: JSON.stringify({ content })
    })

    if (!result.ok) {
        return []
    }

    // BUG FIX: the original was `await result.json().tokens`, which reads
    // `.tokens` off the still-pending Promise (always undefined) and then
    // awaits that. The json body must be awaited first.
    const { tokens } = await result.json()
    return tokens
}
37+
38+
// Number of instruction tokens to always keep when the server truncates
// the context.
// BUG FIX: the original `await tokenize(instruction).length` took `.length`
// of the Promise (undefined) before awaiting; the token array must be
// awaited first, then measured.
const n_keep = (await tokenize(instruction)).length
39+
40+
// Stream a completion for `question` from the server, echoing content to
// stdout as it arrives, and append the finished turn to `chat`.
async function chat_completion(question) {
    const result = await fetch(`${API_URL}/completion`, {
        method: 'POST',
        body: JSON.stringify({
            prompt: format_prompt(question),
            temperature: 0.2,
            top_k: 40,
            top_p: 0.9,
            n_keep: n_keep,
            n_predict: 256,
            stop: ["\n### Human:"], // stop completion after generating this
            stream: true,
        })
    })

    if (!result.ok) {
        return
    }

    let answer = ''

    // Server-sent events: each chunk carrying a payload starts with "data: ".
    for await (const chunk of result.body) {
        const t = Buffer.from(chunk).toString('utf8')
        if (!t.startsWith('data: ')) {
            continue
        }
        const message = JSON.parse(t.substring(6))
        answer += message.content
        process.stdout.write(message.content)
        if (message.stop) {
            // The server truncated the prompt: drop the oldest exchange so
            // the history keeps fitting into the context window next time.
            if (message.truncated) {
                chat.shift()
            }
            break
        }
    }

    process.stdout.write('\n')
    chat.push({ human: question, assistant: answer.trimStart() })
}
79+
80+
const rl = readline.createInterface({ input: stdin, output: stdout });

// Promise wrapper around rl.question so the loop below can await answers.
const readlineQuestion = (rl, query, options) =>
    new Promise((resolve) => rl.question(query, options, resolve));

// Simple REPL: read a question, stream the model's answer, repeat forever.
while (true) {
    const question = await readlineQuestion(rl, '> ')
    await chat_completion(question)
}

examples/server/chat.sh

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
#!/bin/bash

# Server endpoint; override with the API_URL environment variable.
API_URL="${API_URL:-http://127.0.0.1:8080}"

# Seed conversation: alternating human/assistant turns.
CHAT=(
    "Hello, Assistant."
    "Hello. How may I help you today?"
    "Please tell me the largest city in Europe."
    "Sure. The largest city in Europe is Moscow, the capital of Russia."
)

# System instruction prepended to every prompt.
INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
13+
14+
# Strip leading and trailing whitespace from $1 and print the result.
trim() {
    shopt -s extglob
    local s="${1##+([[:space:]])}"       # drop leading whitespace
    printf "%s" "${s%%+([[:space:]])}"   # drop trailing whitespace
}
19+
20+
# Strip trailing whitespace from $1 (leading whitespace kept) and print it.
trim_trailing() {
    shopt -s extglob
    local s="$1"
    printf "%s" "${s%%+([[:space:]])}"
}
24+
25+
# Print the full prompt: instruction, chat history, then the new question $1.
format_prompt() {
    printf "%s" "${INSTRUCTION}"
    # printf reuses its format for each pair of arguments: the CHAT entries
    # pair up as human/assistant turns, and "$1" becomes the final human
    # turn with an empty assistant slot for the model to complete.
    printf "\n### Human: %s\n### Assistant: %s" "${CHAT[@]}" "$1"
}
29+
30+
# Ask the server to tokenize $1; prints one token id per line.
tokenize() {
    curl --silent --request POST \
        --url "${API_URL}/tokenize" \
        --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
        | jq '.tokens[]'
}

# Token count of the instruction; these tokens are always kept when the
# server truncates the context.
N_KEEP=$(tokenize "${INSTRUCTION}" | wc -l)
40+
41+
# Stream a completion for question $1 from the server, echoing content as it
# arrives, and append the finished turn to CHAT.
chat_completion() {
    PROMPT="$(trim_trailing "$(format_prompt "$1")")"

    # Build the JSON request body; -Rs slurps the raw prompt into `.`.
    DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
        prompt: .,
        temperature: 0.2,
        top_k: 40,
        top_p: 0.9,
        n_keep: $n_keep,
        n_predict: 256,
        stop: ["\n### Human:"],
        stream: true
    }')"

    ANSWER=''

    # Read the server-sent event stream line by line; payload lines start
    # with "data:".
    while IFS= read -r LINE; do
        if [[ "$LINE" == data:* ]]; then
            CONTENT="$(jq -r '.content' <<<"${LINE:5}")"
            printf "%s" "${CONTENT}"
            ANSWER+="${CONTENT}"
        fi
    done < <(curl --silent --no-buffer --request POST \
        --url "${API_URL}/completion" \
        --data-raw "${DATA}")

    printf "\n"

    # Record the finished exchange: the question and the trimmed answer.
    CHAT+=("$1" "$(trim "$ANSWER")")
}
73+
74+
# Interactive loop: prompt the user (with line editing), then stream the
# model's reply. Exit with Ctrl-C/Ctrl-D.
while :; do
    read -r -e -p "> " QUESTION
    chat_completion "${QUESTION}"
done

0 commit comments

Comments
 (0)