Skip to content

Commit 0483493

Browse files
author
Sergei Voronezhskii
committed
List task for hung workers
If HangWatcher finds hung tests, it prints the following information for each worker that is not reporting its status: worker id; test name; test params; output from the .result file. Part of #107
1 parent ed45e1d commit 0483493

File tree

3 files changed

+52
-11
lines changed

3 files changed

+52
-11
lines changed

dispatcher.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,14 @@ def __init__(self, task_groups, max_workers_cnt, randomize):
7878
self.worker_next_id = 1
7979

8080
tasks_cnt = 0
81+
self.current_task_queue = SimpleQueue()
8182
self.task_queue_disps = dict()
8283
for key, task_group in task_groups.items():
8384
tasks_cnt += len(task_group['task_ids'])
84-
task_queue_disp = TaskQueueDispatcher(key, task_group, randomize)
85+
task_queue_disp = TaskQueueDispatcher(key,
86+
task_group,
87+
randomize,
88+
self.current_task_queue)
8589
self.task_queue_disps[key] = task_queue_disp
8690
self.result_queues.append(task_queue_disp.result_queue)
8791
self.task_queues.append(task_queue_disp.task_queue)
@@ -97,6 +101,7 @@ def __init__(self, task_groups, max_workers_cnt, randomize):
97101

98102
self.pid_to_worker_id = dict()
99103
self.worker_id_to_pid = dict()
104+
self.worker_id_to_task = dict()
100105

101106
self.randomize = randomize
102107
self.tcp_port_dispatcher = TcpPortDispatcher()
@@ -137,9 +142,19 @@ def init_listeners(self):
137142
no_output_timeout = float(args.no_output_timeout or 120)
138143
hang_watcher = listeners.HangWatcher(
139144
output_watcher.not_done_worker_ids, self.kill_all_workers,
140-
warn_timeout, no_output_timeout)
145+
warn_timeout, no_output_timeout,
146+
self.get_task_by_worker_id, self.set_task_for_worker_id
147+
)
141148
self.listeners.append(hang_watcher)
142149

150+
def set_task_for_worker_id(self):
    """Drain the current-task queue into ``self.worker_id_to_task``.

    Every item taken from ``self.current_task_queue`` is unpacked as a
    ``(worker_id, task_id)`` pair and stored in the mapping. A later pair
    for the same worker overwrites the earlier one, so after the call the
    mapping holds the most recently queued task for each worker seen.
    """
    pending = self.current_task_queue
    latest_task = self.worker_id_to_task
    while True:
        if pending.empty():
            break
        worker_id, task_id = pending.get()
        latest_task[worker_id] = task_id
154+
155+
def get_task_by_worker_id(self, worker_id):
    """Return the task recorded for ``worker_id``.

    The value is whatever was last stored for that worker in
    ``self.worker_id_to_task``.

    Raises:
        KeyError: if no task has been recorded for ``worker_id``.
    """
    recorded_task = self.worker_id_to_task[worker_id]
    return recorded_task
157+
143158
def run_max_workers(self):
144159
ok = True
145160
new_workers_cnt = self.max_workers_cnt - self.workers_cnt
@@ -340,7 +355,7 @@ class TaskQueueDispatcher:
340355
"""Incapsulate data structures necessary for dispatching workers working on
341356
the one task queue.
342357
"""
343-
def __init__(self, key, task_group, randomize):
358+
def __init__(self, key, task_group, randomize, current_task_queue):
344359
self.key = key
345360
self.gen_worker = task_group['gen_worker']
346361
self.task_ids = task_group['task_ids']
@@ -353,6 +368,7 @@ def __init__(self, key, task_group, randomize):
353368
self.randomize = False
354369
self.result_queue = SimpleQueue()
355370
self.task_queue = SimpleQueue()
371+
self.current_task_queue = current_task_queue
356372
for task_id in self.task_ids:
357373
self.task_queue.put(task_id)
358374
self.worker_ids = set()
@@ -366,7 +382,9 @@ def _run_worker(self, worker_id, tcp_port_range):
366382
os.environ['TEST_RUN_TCP_PORT_END'] = str(tcp_port_range[1])
367383
color_stdout.queue = self.result_queue
368384
worker = self.gen_worker(worker_id)
369-
worker.run_all(self.task_queue, self.result_queue)
385+
worker.run_all(self.task_queue,
386+
self.result_queue,
387+
self.current_task_queue)
370388

371389
def add_worker(self, worker_id, tcp_port_range):
372390
# Note: each of our workers should consume only one None, but for the

lib/worker.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,8 @@ def run_task(self, task_id):
274274
raise
275275
return short_status
276276

277-
def run_loop(self, task_queue, result_queue):
277+
# Note: it's not exception safe
278+
def run_loop(self, task_queue, result_queue, current_task_queue):
278279
""" called from 'run_all' """
279280
while True:
280281
task_id = self.task_get(task_queue)
@@ -285,6 +286,8 @@ def run_loop(self, task_queue, result_queue):
285286
schema='test_var')
286287
self.stop_worker(task_queue, result_queue)
287288
break
289+
290+
current_task_queue.put((self.id, task_id))
288291
short_status = self.run_task(task_id)
289292
result_queue.put(self.wrap_result(task_id, short_status))
290293
if not lib.Options().args.is_force and short_status == 'fail':
@@ -299,14 +302,14 @@ def run_loop(self, task_queue, result_queue):
299302
raise VoluntaryStopException()
300303
self.task_done(task_queue)
301304

302-
def run_all(self, task_queue, result_queue):
305+
def run_all(self, task_queue, result_queue, current_task_queue):
303306
if not self.initialized:
304307
self.flush_all_tasks(task_queue, result_queue)
305308
result_queue.put(self.done_marker())
306309
return
307310

308311
try:
309-
self.run_loop(task_queue, result_queue)
312+
self.run_loop(task_queue, result_queue, current_task_queue)
310313
except (KeyboardInterrupt, Exception):
311314
self.stop_worker(task_queue, result_queue, cleanup=False)
312315

listeners.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -170,9 +170,13 @@ class HangError(Exception):
170170

171171
class HangWatcher(BaseWatcher):
172172
"""Terminate all workers if no output received 'no_output_times' time."""
173+
# Matches the '.test' suffix of a task name together with everything that
# follows it; substituted with '.result' to build the result file name.
# Raw string: '\.' in a plain literal is an invalid escape sequence.
rg = re.compile(r'\.test.*$')
174+
173175
def __init__(self, get_not_done_worker_ids, kill_all_workers, warn_timeout,
174-
kill_timeout):
176+
kill_timeout, get_task_by_worker_id, set_task_for_worker_id):
175177
self.get_not_done_worker_ids = get_not_done_worker_ids
178+
self.get_task_by_worker_id = get_task_by_worker_id
179+
self.set_task_for_worker_id = set_task_for_worker_id
176180
self.kill_all_workers = kill_all_workers
177181
self.warn_timeout = warn_timeout
178182
self.kill_timeout = kill_timeout
@@ -186,14 +190,30 @@ def process_result(self, obj):
186190
def process_timeout(self, delta_seconds):
187191
self.warned_seconds_ago += delta_seconds
188192
self.inactivity += delta_seconds
193+
self.set_task_for_worker_id()
189194
worker_ids = self.get_not_done_worker_ids()
190195
if self.warned_seconds_ago < self.warn_timeout:
191196
return
192197
color_stdout("No output during %d seconds. "
193-
"List of workers not reporting the status: %s; "
194-
"Will abort after %d seconds without output.\n" % (
195-
self.inactivity, worker_ids, self.kill_timeout),
198+
"Will abort after %d seconds without output. "
199+
"List of workers not reporting the status:\n" % (
200+
self.inactivity, self.kill_timeout),
196201
schema='test_var')
202+
for worker_id in worker_ids:
203+
task_name, task_param = self.get_task_by_worker_id(worker_id)
204+
color_stdout("[{0:03d}] {1} {2}\n".format(worker_id,
205+
task_name,
206+
task_param or ''),
207+
schema='test_var')
208+
task_path = "{0:03d}_{1}".format(worker_id,
209+
self.rg.sub('.result', task_name))
210+
main_vardir = os.path.realpath(lib.Options().args.vardir)
211+
rf = os.path.join(main_vardir, task_path)
212+
if os.path.exists(rf):
213+
color_stdout("Last 15 lines of result file [{0}]\n".format(rf),
214+
schema='error')
215+
lib.utils.print_tail_n(rf, num_lines=15)
216+
197217
self.warned_seconds_ago = 0.0
198218
if self.inactivity < self.kill_timeout:
199219
return

0 commit comments

Comments
 (0)