From dd53ee57999510bf69ea6fa7767defbe95cbfdd6 Mon Sep 17 00:00:00 2001 From: Snorlax Date: Sat, 6 May 2023 18:09:30 +0800 Subject: [PATCH 01/10] refresh.template.py: fix timeout not work --- refresh.template.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/refresh.template.py b/refresh.template.py index 5d5f12d..309d156 100644 --- a/refresh.template.py +++ b/refresh.template.py @@ -825,6 +825,7 @@ def _convert_compile_commands(aquery_output, focused_on_file: str = None): assert not target.startswith('//external'), f"Expecting external targets will start with @. Found //external for action {action}, target {target}" action.is_external = target.startswith('@') and not target.startswith('@//') + outputs = [] # Process each action from Bazelisms -> file paths and their clang commands # Threads instead of processes because most of the execution time is farmed out to subprocesses. No need to sidestep the GIL. Might change after https://github.com/clangd/clangd/issues/123 resolved with concurrent.futures.ThreadPoolExecutor( @@ -836,17 +837,16 @@ def _convert_compile_commands(aquery_output, focused_on_file: str = None): aquery_output.actions, timeout=3 if focused_on_file else None # If running in fast, interactive mode with --file, we need to cap latency. #TODO test timeout--if it doesn't work, cap input length here, before passing in array. Might also want to divide timeout/cap by #targets ) - # Collect outputs, tolerating any timeouts - outputs = [] - try: - for output in output_iterator: - outputs.append(output) - except concurrent.futures.TimeoutError: - assert focused_on_file, "Timeout should only have been set in the fast, interactive --file mode, focused on a single file." - if not found_header_focused_upon.is_set(): - log_warning(f""">>> Timed out looking for a command for {focused_on_file} - If that's a source file, please report this. We should work to improve the performance. - If that's a header file, we should probably do the same, but it may be unavoidable.""") + # Collect outputs, tolerating any timeouts + try: + for output in output_iterator: + outputs.append(output) + except concurrent.futures.TimeoutError: + assert focused_on_file, "Timeout should only have been set in the fast, interactive --file mode, focused on a single file." + if not found_header_focused_upon.is_set(): + log_warning(f""">>> Timed out looking for a command for {focused_on_file} + If that's a source file, please report this. We should work to improve the performance. + If that's a header file, we should probably do the same, but it may be unavoidable.""") # Yield as compile_commands.json entries header_files_already_written = set() From 912dec41357fcf70b4ef0ea0c17c2ffc2fb38727 Mon Sep 17 00:00:00 2001 From: Snorlax Date: Sat, 20 May 2023 11:33:34 +0800 Subject: [PATCH 02/10] refresh.template.py: quoting target_statement --- refresh.template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/refresh.template.py b/refresh.template.py index 309d156..fd1bcf1 100644 --- a/refresh.template.py +++ b/refresh.template.py @@ -979,7 +979,7 @@ def _get_commands(target: str, flags: str): # Then, actually query Bazel's compile actions for that configured target - target_statement = f'deps({target})' # TODO we should always be quoting targets when we splice them in. Let's use single quotes like with mnemonic, above. See slightly down in https://bazel.build/query/language#tokens + target_statement = f"deps('{target}')" # TODO we should always be quoting targets when we splice them in. Let's use single quotes like with mnemonic, above. See slightly down in https://bazel.build/query/language#tokens compile_commands = [] # TODO simplify loop, especially if we can reduce it to one command per case (see below)? Move warning messages outside? if file_flags: file_path = file_flags[0] From 3c26cac3c3759528eb2a9c456d9e308b4012ff27 Mon Sep 17 00:00:00 2001 From: Snorlax Date: Sat, 20 May 2023 15:28:37 +0800 Subject: [PATCH 03/10] Normalize incoming file paths Handling cases where input files are intermediates --- refresh.template.py | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/refresh.template.py b/refresh.template.py index fd1bcf1..8b6a31c 100644 --- a/refresh.template.py +++ b/refresh.template.py @@ -983,16 +983,48 @@ def _get_commands(target: str, flags: str): compile_commands = [] # TODO simplify loop, especially if we can reduce it to one command per case (see below)? Move warning messages outside? if file_flags: file_path = file_flags[0] + # Normalize file_path into 2 types of relative paths. + # Source file. 1. /srcs/file 2. /external//srcs/file + # Genfiles or output file. /bazel-out//bin/srcs/file + + pure_path = pathlib.PurePath(file_path) + + if pure_path.is_absolute(): + workspace_root = subprocess.check_output(["bazel", "info", "workspace"], stderr=subprocess. DEVNULL).decode("utf-8").strip() + output_base = subprocess.check_output(["bazel", "info", "output_base"], stderr=subprocess. DEVNULL).decode("utf-8").strip() + execution_root = subprocess.check_output(["bazel", "info", "execution_root"], stderr=subprocess. DEVNULL).decode("utf-8").strip() + + if pure_path.is_relative_to(workspace_root): + file_path = str(pure_path.relative_to(workspace_root)) + # Execution_root is longer then output_base + elif pure_path.is_relative_to(execution_root): + file_path = str(pure_path.relative_to(execution_root)) + elif pure_path.is_relative_to(output_base): + file_path = str(pure_path.relative_to(output_base)) + else: + # Treat pure_path as system absolute path + pass + found = False target_statement_candidates = [] if file_path.endswith(_get_files.source_extensions): target_statement_candidates.append(f"inputs('{re.escape(file_path)}', {target_statement})") else: - fname = os.path.basename(file_path) # TODO consider running a preliminary aquery to make this more specific, getting the targets that generate the given file. Use outputs() aquery function. Should also test the case where the file is generated/is coming in as a filegroup--that's likely to be fixed by this change. - header_target_statement = f"let v = {target_statement} in attr(hdrs, '{fname}', $v) + attr(srcs, '{fname}', $v)" # Bazel does not list headers as direct inputs, but rather hides them behind "middlemen", necessitating a query like this. + middle_target_candidates = subprocess.Popen(f"bazel aquery 'outputs('{re.escape(file_path)}', {target_statement})' | grep -o 'Target: .*' | grep -o '[@/].*'", shell=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL).communicate()[0].decode("utf-8").strip().splitlines() + if middle_target_candidates: + log_info(f">>> File detected as intermediate of {middle_target_candidates}") + target_statement_candidates.extend([ + # TODO there seems a bug in bazel aquery? There are many targets listed output when executing bazel query "somepath('TARGET', 'SUBTARGET')" but there isn't any actions output when executing bazel aquery "somepath('TARGET', 'SUBTARGET')" and it works when executing bazel aquery "" + f"allpaths({target}, {'+'.join(middle_target_candidates)})" + ]) + else: + fname = os.path.basename(file_path) # TODO consider running a preliminary aquery to make this more specific, getting the targets that generate the given file. Use outputs() aquery function. Should also test the case where the file is generated/is coming in as a filegroup--that's likely to be fixed by this change. + header_target_statement = f"let v = {target_statement} in attr(hdrs, '{fname}', $v) + attr(srcs, '{fname}', $v)" # Bazel does not list headers as direct inputs, but rather hides them behind "middlemen", necessitating a query like this. + target_statement_candidates.extend([ + header_target_statement, + f"allpaths({target}, {header_target_statement})", # Ordering is ideal, breadth-first from the deepest dependency, despite the docs. TODO (1) There's a bazel bug that produces extra actions, not on the path but downstream, so we probably want to pass --noinclude_aspects per https://github.com/bazelbuild/bazel/issues/18289 and https://github.com/bazelbuild/bazel/issues/16310 to eliminate them (at the cost of some valid aspects). (2) We might want to benchmark with --infer_universe_scope (if supported) and --universe-scope=target with query allrdeps({header_target_statement}, ) or rdeps, checking speed but also ordering (the docs indicate it is likely to be lost, which is a problem) and for inclusion of the header target. We'd guess it'll have the same aspects bug as allpaths. (3) We probably also also want to *just* run this query, not the whole list, since it captures the former and is therefore unlikely to add much latency, since a given header is probabably either used internally to the target (find on first match) for header-only (must traverse all paths in all targets until you get a match) for all top-level targets, and since we can separate out the last, see below. + ]) target_statement_candidates.extend([ - header_target_statement, - f"allpaths({target}, {header_target_statement})", # Ordering is ideal, breadth-first from the deepest dependency, despite the docs. TODO (1) There's a bazel bug that produces extra actions, not on the path but downstream, so we probably want to pass --noinclude_aspects per https://github.com/bazelbuild/bazel/issues/18289 and https://github.com/bazelbuild/bazel/issues/16310 to eliminate them (at the cost of some valid aspects). (2) We might want to benchmark with --infer_universe_scope (if supported) and --universe-scope=target with query allrdeps({header_target_statement}, ) or rdeps, checking speed but also ordering (the docs indicate it is likely to be lost, which is a problem) and for inclusion of the header target. We'd guess it'll have the same aspects bug as allpaths. (3) We probably also also want to *just* run this query, not the whole list, since it captures the former and is therefore unlikely to add much latency, since a given header is probabably either used internally to the target (find on first match) for header-only (must traverse all paths in all targets until you get a match) for all top-level targets, and since we can separate out the last, see below. f'deps({target})', # TODO: Let's detect out-of-bazel, absolute paths and run this if and only if we're looking for a system header. We need to think about how we want to handle absolute paths more generally, perhaps normalizing them to relative if possible, like with the windows absolute path issue, above. ]) for target_statement in target_statement_candidates: From 4453cdd0104891d8d2b805294abe7d0cda73af62 Mon Sep 17 00:00:00 2001 From: Snorlax Date: Mon, 5 Jun 2023 12:10:12 +0800 Subject: [PATCH 04/10] revert outputs var back to with scope --- refresh.template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/refresh.template.py b/refresh.template.py index 8b6a31c..d91a994 100644 --- a/refresh.template.py +++ b/refresh.template.py @@ -825,7 +825,6 @@ def _convert_compile_commands(aquery_output, focused_on_file: str = None): assert not target.startswith('//external'), f"Expecting external targets will start with @. Found //external for action {action}, target {target}" action.is_external = target.startswith('@') and not target.startswith('@//') - outputs = [] # Process each action from Bazelisms -> file paths and their clang commands # Threads instead of processes because most of the execution time is farmed out to subprocesses. No need to sidestep the GIL. Might change after https://github.com/clangd/clangd/issues/123 resolved with concurrent.futures.ThreadPoolExecutor( @@ -838,6 +837,7 @@ def _convert_compile_commands(aquery_output, focused_on_file: str = None): timeout=3 if focused_on_file else None # If running in fast, interactive mode with --file, we need to cap latency. #TODO test timeout--if it doesn't work, cap input length here, before passing in array. Might also want to divide timeout/cap by #targets ) # Collect outputs, tolerating any timeouts + outputs = [] try: for output in output_iterator: outputs.append(output) From b6b0b301978d99608e2a05f0d9166f0f9423310f Mon Sep 17 00:00:00 2001 From: Snorlax Date: Mon, 5 Jun 2023 12:14:13 +0800 Subject: [PATCH 05/10] fix timeout log indent in execution block --- refresh.template.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/refresh.template.py b/refresh.template.py index d91a994..eeea1dc 100644 --- a/refresh.template.py +++ b/refresh.template.py @@ -845,8 +845,8 @@ def _convert_compile_commands(aquery_output, focused_on_file: str = None): assert focused_on_file, "Timeout should only have been set in the fast, interactive --file mode, focused on a single file." if not found_header_focused_upon.is_set(): log_warning(f""">>> Timed out looking for a command for {focused_on_file} - If that's a source file, please report this. We should work to improve the performance. - If that's a header file, we should probably do the same, but it may be unavoidable.""") + If that's a source file, please report this. We should work to improve the performance. + If that's a header file, we should probably do the same, but it may be unavoidable.""") # Yield as compile_commands.json entries header_files_already_written = set() From e17af2edbd6fe49872d56c348d66cfa1e84907d7 Mon Sep 17 00:00:00 2001 From: Snorlax Date: Mon, 5 Jun 2023 12:15:14 +0800 Subject: [PATCH 06/10] resolve todo for quoting target_statement --- refresh.template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/refresh.template.py b/refresh.template.py index eeea1dc..4066f68 100644 --- a/refresh.template.py +++ b/refresh.template.py @@ -979,7 +979,7 @@ def _get_commands(target: str, flags: str): # Then, actually query Bazel's compile actions for that configured target - target_statement = f"deps('{target}')" # TODO we should always be quoting targets when we splice them in. Let's use single quotes like with mnemonic, above. See slightly down in https://bazel.build/query/language#tokens + target_statement = f"deps('{target}')" compile_commands = [] # TODO simplify loop, especially if we can reduce it to one command per case (see below)? Move warning messages outside? if file_flags: file_path = file_flags[0] From 08481e0d4bfc389e6d71abc83ffc389fa1a0cd35 Mon Sep 17 00:00:00 2001 From: Snorlax Date: Sun, 18 Jun 2023 00:38:16 +0800 Subject: [PATCH 07/10] using subprocess.run instead of deprecated api --- refresh.template.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/refresh.template.py b/refresh.template.py index 4066f68..38085c6 100644 --- a/refresh.template.py +++ b/refresh.template.py @@ -990,9 +990,9 @@ def _get_commands(target: str, flags: str): pure_path = pathlib.PurePath(file_path) if pure_path.is_absolute(): - workspace_root = subprocess.check_output(["bazel", "info", "workspace"], stderr=subprocess. DEVNULL).decode("utf-8").strip() - output_base = subprocess.check_output(["bazel", "info", "output_base"], stderr=subprocess. DEVNULL).decode("utf-8").strip() - execution_root = subprocess.check_output(["bazel", "info", "execution_root"], stderr=subprocess. DEVNULL).decode("utf-8").strip() + workspace_root = subprocess.run(["bazel", "info", "workspace"], capture_output=True, text=True, check=True).stdout.strip() + output_base = subprocess.run(["bazel", "info", "output_base"], capture_output=True, text=True, check=True).stdout.strip() + execution_root = subprocess.run(["bazel", "info", "execution_root"], capture_output=True, text=True, check=True).stdout.strip() if pure_path.is_relative_to(workspace_root): file_path = str(pure_path.relative_to(workspace_root)) From 6b098e83ea8c1d1218c8afca555b445dbdcd6b5d Mon Sep 17 00:00:00 2001 From: Snorlax Date: Sun, 18 Jun 2023 00:40:21 +0800 Subject: [PATCH 08/10] workaround for bug in aquery https://github.com/bazelbuild/bazel/issues/18633 --- refresh.template.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/refresh.template.py b/refresh.template.py index 38085c6..5add6f1 100644 --- a/refresh.template.py +++ b/refresh.template.py @@ -1010,13 +1010,16 @@ def _get_commands(target: str, flags: str): if file_path.endswith(_get_files.source_extensions): target_statement_candidates.append(f"inputs('{re.escape(file_path)}', {target_statement})") else: - middle_target_candidates = subprocess.Popen(f"bazel aquery 'outputs('{re.escape(file_path)}', {target_statement})' | grep -o 'Target: .*' | grep -o '[@/].*'", shell=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL).communicate()[0].decode("utf-8").strip().splitlines() + cmd = f"bazel aquery 'outputs('{re.escape(file_path)}', {target_statement})' {' '.join(additional_flags)} | grep -o 'Target: .*' | grep -o '[@/].*'" + middle_target_candidates = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=False).stdout.strip().splitlines() + # TODO We should remove prefix @ in labels since it will cause empty result in bazel query + middle_target_candidates = [re.sub(r'^@', '', candidate) for candidate in middle_target_candidates] + if middle_target_candidates: log_info(f">>> File detected as intermediate of {middle_target_candidates}") - target_statement_candidates.extend([ - # TODO there seems a bug in bazel aquery? There are many targets listed output when executing bazel query "somepath('TARGET', 'SUBTARGET')" but there isn't any actions output when executing bazel aquery "somepath('TARGET', 'SUBTARGET')" and it works when executing bazel aquery "" - f"allpaths({target}, {'+'.join(middle_target_candidates)})" - ]) + # TODO We should replace filter('{candidate}', deps('{target}')) with '{candidate}' until https://github.com/bazelbuild/bazel/issues/18633 is fixed. + target_statement_candidates.extend( + [f"allpaths('{target}', filter('{re.escape(candidate)}$', {target_statement}))" for candidate in middle_target_candidates]) else: fname = os.path.basename(file_path) # TODO consider running a preliminary aquery to make this more specific, getting the targets that generate the given file. Use outputs() aquery function. Should also test the case where the file is generated/is coming in as a filegroup--that's likely to be fixed by this change. header_target_statement = f"let v = {target_statement} in attr(hdrs, '{fname}', $v) + attr(srcs, '{fname}', $v)" # Bazel does not list headers as direct inputs, but rather hides them behind "middlemen", necessitating a query like this. From 7d44b947d9a2ba89e78dac83308b5d0b577ebe4b Mon Sep 17 00:00:00 2001 From: Snorlax Date: Sun, 18 Jun 2023 00:41:18 +0800 Subject: [PATCH 09/10] quoting targets which missed in query statment --- refresh.template.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/refresh.template.py b/refresh.template.py index 5add6f1..a6bf60d 100644 --- a/refresh.template.py +++ b/refresh.template.py @@ -1025,10 +1025,10 @@ def _get_commands(target: str, flags: str): header_target_statement = f"let v = {target_statement} in attr(hdrs, '{fname}', $v) + attr(srcs, '{fname}', $v)" # Bazel does not list headers as direct inputs, but rather hides them behind "middlemen", necessitating a query like this. target_statement_candidates.extend([ header_target_statement, - f"allpaths({target}, {header_target_statement})", # Ordering is ideal, breadth-first from the deepest dependency, despite the docs. TODO (1) There's a bazel bug that produces extra actions, not on the path but downstream, so we probably want to pass --noinclude_aspects per https://github.com/bazelbuild/bazel/issues/18289 and https://github.com/bazelbuild/bazel/issues/16310 to eliminate them (at the cost of some valid aspects). (2) We might want to benchmark with --infer_universe_scope (if supported) and --universe-scope=target with query allrdeps({header_target_statement}, ) or rdeps, checking speed but also ordering (the docs indicate it is likely to be lost, which is a problem) and for inclusion of the header target. We'd guess it'll have the same aspects bug as allpaths. (3) We probably also also want to *just* run this query, not the whole list, since it captures the former and is therefore unlikely to add much latency, since a given header is probabably either used internally to the target (find on first match) for header-only (must traverse all paths in all targets until you get a match) for all top-level targets, and since we can separate out the last, see below. + f"allpaths('{target}', {header_target_statement})", # Ordering is ideal, breadth-first from the deepest dependency, despite the docs. TODO (1) There's a bazel bug that produces extra actions, not on the path but downstream, so we probably want to pass --noinclude_aspects per https://github.com/bazelbuild/bazel/issues/18289 and https://github.com/bazelbuild/bazel/issues/16310 to eliminate them (at the cost of some valid aspects). (2) We might want to benchmark with --infer_universe_scope (if supported) and --universe-scope=target with query allrdeps({header_target_statement}, ) or rdeps, checking speed but also ordering (the docs indicate it is likely to be lost, which is a problem) and for inclusion of the header target. We'd guess it'll have the same aspects bug as allpaths. (3) We probably also also want to *just* run this query, not the whole list, since it captures the former and is therefore unlikely to add much latency, since a given header is probabably either used internally to the target (find on first match) for header-only (must traverse all paths in all targets until you get a match) for all top-level targets, and since we can separate out the last, see below. ]) target_statement_candidates.extend([ - f'deps({target})', # TODO: Let's detect out-of-bazel, absolute paths and run this if and only if we're looking for a system header. We need to think about how we want to handle absolute paths more generally, perhaps normalizing them to relative if possible, like with the windows absolute path issue, above. + f"deps('{target}')", # TODO: Let's detect out-of-bazel, absolute paths and run this if and only if we're looking for a system header. We need to think about how we want to handle absolute paths more generally, perhaps normalizing them to relative if possible, like with the windows absolute path issue, above. ]) for target_statement in target_statement_candidates: commands = list(_get_compile_commands_for_aquery(target_statement, additional_flags, file_path)) From 5823908ec75892945f54cd8e8a5f86743c0b1e59 Mon Sep 17 00:00:00 2001 From: Snorlax Date: Sun, 18 Jun 2023 00:41:45 +0800 Subject: [PATCH 10/10] add more log in query loop --- refresh.template.py | 1 + 1 file changed, 1 insertion(+) diff --git a/refresh.template.py b/refresh.template.py index a6bf60d..a59931d 100644 --- a/refresh.template.py +++ b/refresh.template.py @@ -1031,6 +1031,7 @@ def _get_commands(target: str, flags: str): f"deps('{target}')", # TODO: Let's detect out-of-bazel, absolute paths and run this if and only if we're looking for a system header. We need to think about how we want to handle absolute paths more generally, perhaps normalizing them to relative if possible, like with the windows absolute path issue, above. ]) for target_statement in target_statement_candidates: + log_info(f">>> Trying {target_statement}") commands = list(_get_compile_commands_for_aquery(target_statement, additional_flags, file_path)) compile_commands.extend(commands) # If we did the work to generate a command, we'll update it, whether it's for the requested file or not. if any(command['file'].endswith(file_path) for command in commands):