
Commit 9da8bab

server: add test for token probs
1 parent 9afdffe commit 9da8bab

3 files changed: +74 -8 lines changed

examples/server/README.md

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
 - `--api-key`: Set an API key for request authorization. By default, the server responds to every request. With an API key set, requests must have the Authorization header set with the API key as a Bearer token. May be used multiple times to enable multiple valid keys.
 - `--api-key-file`: Path to a file containing API keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
 - `--embeddings`: Enable embedding vector output and the OAI-compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
-- `-np N`, `--parallel N`: Set the number of slots for processing requests. Default: `1`
+- `-np N`, `--parallel N`: Set the number of slots for processing requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests, but the results will **not** be deterministic due to differences in rounding error.
 - `-cb`, `--cont-batching`: Enable continuous batching (a.k.a. dynamic batching). Default: disabled
 - `-spf FNAME`, `--system-prompt-file FNAME`: Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
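
A rough sketch (not part of this diff) of how a client might authorize against a server started with `--api-key`; the URL, the key value, and the prompt below are placeholder assumptions:

import requests  # assumed HTTP client; any client that can set headers works

# With --api-key set, every request must carry one of the configured keys as a Bearer token.
# "secret-key" and the local URL are illustrative placeholders, not values from this commit.
response = requests.post(
    "http://localhost:8080/completion",
    headers={"Authorization": "Bearer secret-key"},
    json={"prompt": "The meaning of life is", "n_predict": 8},
    timeout=60,
)
response.raise_for_status()
print(response.json()["content"])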

examples/server/tests/features/results.feature

Lines changed: 37 additions & 6 deletions
@@ -70,12 +70,43 @@ Feature: Results
     Then all predictions are equal
     Examples:
       | n_parallel | temp |
-      | 1          | 0.0  |
-      | 2          | 0.0  |
-      | 4          | 0.0  |
-      | 1          | 1.0  |
+      | 1          | 0.0  |
+      | 2          | 0.0  |
+      | 4          | 0.0  |
+      | 1          | 1.0  |
       # FIXME: These tests fail on master. The problem seems to be the unified KV cache.
       # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
       # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 .
-      # | 2          | 1.0  |
-      # | 4          | 1.0  |
+      # | 2          | 1.0  |
+      # | 4          | 1.0  |
+
+  Scenario Outline: consistent token probs with same seed and prompt
+    Given <n_slots> slots
+    And 1 threads
+    And 1.0 temperature
+    And <n_predict> max tokens to predict
+    Then the server is starting
+    Then the server is healthy
+
+    Given 1 prompts "The meaning of life is" with seed 42
+    And concurrent completion requests
+    # Then the server is busy # Not all slots will be utilized.
+    Then the server is idle
+    And all slots are idle
+
+    Given <n_parallel> prompts "The meaning of life is" with seed 42
+    And concurrent completion requests
+    # Then the server is busy # Not all slots will be utilized.
+    Then the server is idle
+    And all slots are idle
+
+    Then all token probabilities are equal
+    Examples:
+      | n_slots | n_parallel | n_predict |
+      | 4       | 1          | 1         |
+      | 4       | 1          | 10        |
+      | 4       | 4          | 1         |
+      # FIXME: These tests fail on master. The problem seems to be the unified KV cache.
+      # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
+      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 .
+      # | 4       | 4          | 10        |
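
The property this new scenario checks can be sketched outside the Gherkin harness roughly as follows (illustrative only; it assumes a server already running locally with a model loaded, and uses the same `n_probs` request field and `completion_probabilities` response field that the step definitions below rely on):

import requests  # the test suite itself uses aiohttp; requests is used here only for brevity

payload = {
    "prompt": "The meaning of life is",
    "seed": 42,
    "temperature": 1.0,
    "n_predict": 10,
    "n_probs": 2,  # ask for the top-2 token probabilities at each position
}

# Two requests with an identical prompt and seed ...
first = requests.post("http://localhost:8080/completion", json=payload, timeout=3600).json()
second = requests.post("http://localhost:8080/completion", json=payload, timeout=3600).json()

# ... should report identical token probabilities at every generated position.
for pos, (a, b) in enumerate(zip(first["completion_probabilities"],
                                 second["completion_probabilities"])):
    assert a["probs"] == b["probs"], f"token probabilities differ at position {pos}"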

examples/server/tests/features/steps/steps.py

Lines changed: 36 additions & 1 deletion
@@ -23,6 +23,7 @@
 def step_server_config(context, server_fqdn, server_port):
     context.server_fqdn = server_fqdn
     context.server_port = int(server_port)
+    context.n_threads = None
     context.n_gpu_layer = None
     if 'PORT' in os.environ:
         context.server_port = int(os.environ['PORT'])
@@ -109,6 +110,11 @@ def step_n_gpu_layer(context, ngl):
     context.n_gpu_layer = ngl
 
 
+@step('{n_threads:d} threads')
+def step_n_threads(context, n_threads):
+    context.n_threads = n_threads
+
+
 @step('{draft:d} as draft')
 def step_draft(context, draft):
     context.draft = draft
@@ -274,13 +280,22 @@ async def step_predictions_equal(context):
 
 @step('all predictions are different')
 @async_run_until_complete
-async def step_predictions_equal(context):
+async def step_predictions_different(context):
     n_completions = await gather_tasks_results(context)
     assert n_completions >= 2, "need at least 2 completions"
     assert_all_predictions_different(context.tasks_result)
     context.tasks_result = []
 
 
+@step('all token probabilities are equal')
+@async_run_until_complete
+async def step_token_probabilities_equal(context):
+    n_completions = await gather_tasks_results(context)
+    assert n_completions >= 2, "need at least 2 completions"
+    assert_all_token_probabilities_equal(context.tasks_result)
+    context.tasks_result = []
+
+
 @step('the completion is truncated')
 def step_assert_completion_truncated(context):
     step_assert_completion_truncated(context, '')
@@ -869,6 +884,7 @@ async def request_completion(prompt,
             "id_slot": id_slot,
             "seed": seed if seed is not None else 42,
             "temperature": temperature if temperature is not None else "0.8f",
+            "n_probs": 2,
         },
         headers=headers,
         timeout=3600) as response:
@@ -1123,6 +1139,23 @@ def assert_all_predictions_different(completion_responses):
         assert content_i != content_j, "contents not different"
 
 
+def assert_all_token_probabilities_equal(completion_responses):
+    n_predict = len(completion_responses[0]['completion_probabilities'])
+    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+        for pos in range(n_predict):
+            for i, response_i in enumerate(completion_responses):
+                probs_i = response_i['completion_probabilities'][pos]['probs']
+                print(f"pos {pos}, probs {i}: {probs_i}")
+    for pos in range(n_predict):
+        for i, response_i in enumerate(completion_responses):
+            probs_i = response_i['completion_probabilities'][pos]['probs']
+            for j, response_j in enumerate(completion_responses):
+                if i == j:
+                    continue
+                probs_j = response_j['completion_probabilities'][pos]['probs']
+                assert probs_i == probs_j, "token probabilities not equal"
+
+
 async def gather_tasks_results(context):
     n_tasks = len(context.concurrent_tasks)
     if context.debug:
@@ -1261,6 +1294,8 @@ def start_server_background(context):
         server_args.extend(['--batch-size', context.n_batch])
     if context.n_ubatch:
         server_args.extend(['--ubatch-size', context.n_ubatch])
+    if context.n_threads:
+        server_args.extend(['--threads', context.n_threads])
     if context.n_gpu_layer:
         server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
     if context.draft is not None:
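
For orientation, `assert_all_token_probabilities_equal` above indexes into the per-token data the server returns when `n_probs` is set. A sketch of the shape being compared follows; only the outer `completion_probabilities` list and the `probs` key per position are relied on by the test, and the inner field names shown are assumptions rather than something this commit guarantees:

# Illustrative only: approximate shape of one completion response when "n_probs": 2 is sent.
# The assertion compares response['completion_probabilities'][pos]['probs'] across responses.
example_response = {
    "content": " to be happy",
    "completion_probabilities": [
        {"probs": [{"tok_str": " to", "prob": 0.61}, {"tok_str": " a", "prob": 0.12}]},    # position 0
        {"probs": [{"tok_str": " be", "prob": 0.48}, {"tok_str": " find", "prob": 0.09}]},  # position 1
    ],
}

probs_at_first_position = example_response["completion_probabilities"][0]["probs"]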
