@@ -23,6 +23,7 @@
 def step_server_config(context, server_fqdn, server_port):
     context.server_fqdn = server_fqdn
     context.server_port = int(server_port)
+    context.n_threads = None
     context.n_gpu_layer = None
     if 'PORT' in os.environ:
         context.server_port = int(os.environ['PORT'])
|
@@ -109,6 +110,11 @@ def step_n_gpu_layer(context, ngl):
     context.n_gpu_layer = ngl


+@step('{n_threads:d} threads')
+def step_n_threads(context, n_threads):
+    context.n_threads = n_threads
+
+
 @step('{draft:d} as draft')
 def step_draft(context, draft):
     context.draft = draft
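With this step a scenario can set the server's thread count from a feature line such as "Given 4 threads" (example line hypothetical). The '{n_threads:d}' pattern is a behave parse type, so the value reaches the step function as an int. A minimal standalone sketch of that conversion, using the parse library that behave's default step matcher builds on:

    import parse

    # '{n_threads:d}' matches an integer and converts it before the step runs
    result = parse.parse('{n_threads:d} threads', '4 threads')
    assert result['n_threads'] == 4
    assert isinstance(result['n_threads'], int)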
|
@@ -274,13 +280,22 @@ async def step_predictions_equal(context):

 @step('all predictions are different')
 @async_run_until_complete
-async def step_predictions_equal(context):
+async def step_predictions_different(context):
     n_completions = await gather_tasks_results(context)
     assert n_completions >= 2, "need at least 2 completions"
     assert_all_predictions_different(context.tasks_result)
     context.tasks_result = []


+@step('all token probabilities are equal')
+@async_run_until_complete
+async def step_token_probabilities_equal(context):
+    n_completions = await gather_tasks_results(context)
+    assert n_completions >= 2, "need at least 2 completions"
+    assert_all_token_probabilities_equal(context.tasks_result)
+    context.tasks_result = []
+
+
 @step('the completion is truncated')
 def step_assert_completion_truncated(context):
     step_assert_completion_truncated(context, '')
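Two changes in this hunk: the handler for 'all predictions are different' is renamed so it no longer shares a name with the 'all predictions are equal' handler defined above it, and a new step reuses the same gather/assert/reset pattern for token probabilities, so a scenario that issues at least two concurrent completions can end with "Then all token probabilities are equal".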
|
@@ -869,6 +884,7 @@ async def request_completion(prompt,
                                     "id_slot": id_slot,
                                     "seed": seed if seed is not None else 42,
                                     "temperature": temperature if temperature is not None else "0.8f",
+                                    "n_probs": 2,
                                 },
                                 headers=headers,
                                 timeout=3600) as response:
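Hard-coding "n_probs": 2 asks the server to attach the top-2 token probabilities to every completion, which is what the new step reads back. A hedged sketch of the response fragment the assertion below relies on; the completion_probabilities and probs field names come from this diff, while the inner tok_str/prob keys and all values are assumptions about the server payload:

    response = {
        "content": " Paris",
        "completion_probabilities": [
            {   # one entry per generated token position
                "probs": [
                    {"tok_str": " Paris", "prob": 0.97},
                    {"tok_str": " Lyon", "prob": 0.01},
                ],
            },
            # ... one entry per remaining predicted token
        ],
    }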
|
@@ -1123,6 +1139,23 @@ def assert_all_predictions_different(completion_responses):
             assert content_i != content_j, "contents not different"


+def assert_all_token_probabilities_equal(completion_responses):
+    n_predict = len(completion_responses[0]['completion_probabilities'])
+    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+        for pos in range(n_predict):
+            for i, response_i in enumerate(completion_responses):
+                probs_i = response_i['completion_probabilities'][pos]['probs']
+                print(f"pos {pos}, probs {i}: {probs_i}")
+    for pos in range(n_predict):
+        for i, response_i in enumerate(completion_responses):
+            probs_i = response_i['completion_probabilities'][pos]['probs']
+            for j, response_j in enumerate(completion_responses):
+                if i == j:
+                    continue
+                probs_j = response_j['completion_probabilities'][pos]['probs']
+                assert probs_i == probs_j, "token probabilities not equal"
+
+
 async def gather_tasks_results(context):
     n_tasks = len(context.concurrent_tasks)
     if context.debug:
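A minimal self-contained exercise of the new helper, assuming the response shape sketched above (names and values hypothetical):

    def fake_response(prob):
        # single-token response carrying one top probability
        return {"completion_probabilities": [
            {"probs": [{"tok_str": " Paris", "prob": prob}]},
        ]}

    # identical probabilities across responses: passes silently
    assert_all_token_probabilities_equal([fake_response(0.97), fake_response(0.97)])
    # diverging probabilities would raise "token probabilities not equal":
    # assert_all_token_probabilities_equal([fake_response(0.97), fake_response(0.42)])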
|
@@ -1261,6 +1294,8 @@ def start_server_background(context):
         server_args.extend(['--batch-size', context.n_batch])
     if context.n_ubatch:
         server_args.extend(['--ubatch-size', context.n_ubatch])
+    if context.n_threads:
+        server_args.extend(['--threads', context.n_threads])
     if context.n_gpu_layer:
         server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
     if context.draft is not None:
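This guard pairs with the context.n_threads = None default added in the first hunk: the server's existing --threads flag is only appended when a scenario actually set a thread count, following the same pattern as the surrounding --batch-size and --ubatch-size flags.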