
Make incremental not propagate staleness if the public interface of a module is unchanged (v2) #2014

Merged: 17 commits, Aug 18, 2016
149 changes: 120 additions & 29 deletions mypy/build.py
@@ -13,6 +13,7 @@
import binascii
import collections
import contextlib
import hashlib
import json
import os
import os.path
@@ -290,6 +291,7 @@ def default_lib_path(data_dir: str, pyversion: Tuple[int, int]) -> List[str]:
('child_modules', List[str]), # all submodules of the given module
('options', Optional[Dict[str, bool]]), # build options
('dep_prios', List[int]),
('interface_hash', str), # hash representing the public interface
('version_id', str), # mypy version for cache invalidation
])
# NOTE: dependencies + suppressed == all reachable imports;
@@ -351,6 +353,7 @@ def __init__(self, data_dir: str,
self.type_checker = TypeChecker(self.errors, self.modules, options=options)
self.missing_modules = set() # type: Set[str]
self.stale_modules = set() # type: Set[str]
self.rechecked_modules = set() # type: Set[str]

def all_imported_modules_in_file(self,
file: MypyFile) -> List[Tuple[int, str, int]]:
@@ -728,6 +731,7 @@ def find_cache_meta(id: str, path: str, manager: BuildManager) -> Optional[Cache
meta.get('child_modules', []),
meta.get('options'),
meta.get('dep_prios', []),
meta.get('interface_hash', ''),
meta.get('version_id'),
)
if (m.id != id or m.path != path or
@@ -750,20 +754,27 @@ def find_cache_meta(id: str, path: str, manager: BuildManager) -> Optional[Cache
manager.trace('Metadata abandoned for {}: options differ'.format(id))
return None

return m


def is_meta_fresh(meta: CacheMeta, id: str, path: str, manager: BuildManager) -> bool:
if meta is None:
return False

# TODO: Share stat() outcome with find_module()
st = os.stat(path) # TODO: Errors
if st.st_mtime != m.mtime or st.st_size != m.size:
if st.st_mtime != meta.mtime or st.st_size != meta.size:
manager.log('Metadata abandoned for {}: file {} is modified'.format(id, path))
return None

# It's a match on (id, path, mtime, size).
# Check data_json; assume if its mtime matches it's good.
# TODO: stat() errors
if os.path.getmtime(data_json) != m.data_mtime:
if os.path.getmtime(meta.data_json) != meta.data_mtime:
manager.log('Metadata abandoned for {}: data cache is modified'.format(id))
return None
manager.log('Found {} {} (metadata is fresh)'.format(id, meta_json))
return m
return False
manager.log('Found {} {} (metadata is fresh)'.format(id, meta.data_json))
return True
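
The split lets a caller load the metadata once and decide freshness separately. A minimal sketch of the calling pattern, mirroring State.__init__ further down (id, path, and manager assumed in scope):

meta = find_cache_meta(id, path, manager)   # None if no usable cache entry
if is_meta_fresh(meta, id, path, manager):  # tolerates meta being None
    pass  # safe to reuse the cached dependency lists
else:
    pass  # fall back to re-parsing the source file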


def select_options_affecting_cache(options: Options) -> Mapping[str, bool]:
@@ -783,10 +794,17 @@ def random_string() -> str:
return binascii.hexlify(os.urandom(8)).decode('ascii')


def compute_hash(text: str) -> str:
# We use md5 instead of the builtin hash(...) function because the output of hash(...)
# can differ between runs due to hash randomization (enabled by default in Python 3.3).
# See the note in https://docs.python.org/3/reference/datamodel.html#object.__hash__.
return hashlib.md5(text.encode('utf-8')).hexdigest()
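
As a standalone illustration of the comment above (independent of the diff): the md5 digest is stable across interpreter runs and machines, whereas hash() is not once hash randomization kicks in, so only the former can be persisted in a cache file.

import hashlib

def compute_hash(text: str) -> str:
    return hashlib.md5(text.encode('utf-8')).hexdigest()

print(compute_hash('foo'))  # always 'acbd18db4cc2f85cedef654fccc4a4d8'
# By contrast, hash('foo') varies between interpreter runs unless
# PYTHONHASHSEED is pinned (randomization is on by default since 3.3).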


def write_cache(id: str, path: str, tree: MypyFile,
dependencies: List[str], suppressed: List[str],
child_modules: List[str], dep_prios: List[int],
manager: BuildManager) -> None:
old_interface_hash: str, manager: BuildManager) -> str:
"""Write cache files for a module.

Args:
@@ -796,28 +814,52 @@ def write_cache(id: str, path: str, tree: MypyFile,
dependencies: module IDs on which this module depends
suppressed: module IDs which were suppressed as dependencies
dep_prios: priorities (parallel array to dependencies)
old_interface_hash: the hash from the previous version of the data cache file
manager: the build manager (for pyversion, log/trace)

Return:
The new interface hash based on the serialized tree
"""
# Obtain file paths
path = os.path.abspath(path)
manager.trace('Dumping {} {}'.format(id, path))
st = os.stat(path) # TODO: Errors
mtime = st.st_mtime
size = st.st_size
meta_json, data_json = get_cache_names(
id, path, manager.options.cache_dir, manager.options.python_version)
manager.log('Writing {} {} {}'.format(id, meta_json, data_json))
data = tree.serialize()
manager.log('Writing {} {} {} {}'.format(id, path, meta_json, data_json))

# Make sure directory for cache files exists
parent = os.path.dirname(data_json)
if not os.path.isdir(parent):
os.makedirs(parent)
assert os.path.dirname(meta_json) == parent

# Construct temp file names
nonce = '.' + random_string()
data_json_tmp = data_json + nonce
meta_json_tmp = meta_json + nonce
with open(data_json_tmp, 'w') as f:
json.dump(data, f, indent=2, sort_keys=True)
f.write('\n')
data_mtime = os.path.getmtime(data_json_tmp)

# Serialize data and analyze interface
data = tree.serialize()
data_str = json.dumps(data, indent=2, sort_keys=True)
interface_hash = compute_hash(data_str)

# Write data cache file, if applicable
if old_interface_hash == interface_hash:
# If the interface is unchanged, the cached data is guaranteed
# to be equivalent, and we only need to update the metadata.
data_mtime = os.path.getmtime(data_json)
manager.trace("Interface for {} is unchanged".format(id))
else:
with open(data_json_tmp, 'w') as f:
f.write(data_str)
f.write('\n')
data_mtime = os.path.getmtime(data_json_tmp)
os.replace(data_json_tmp, data_json)
manager.trace("Interface for {} has changed".format(id))

# Obtain and set up metadata
st = os.stat(path) # TODO: Handle errors
mtime = st.st_mtime
size = st.st_size
meta = {'id': id,
'path': path,
'mtime': mtime,
@@ -828,14 +870,18 @@ def write_cache(id: str, path: str, tree: MypyFile,
'child_modules': child_modules,
'options': select_options_affecting_cache(manager.options),
'dep_prios': dep_prios,
'interface_hash': interface_hash,
'version_id': manager.version_id,
}

# Write meta cache file
with open(meta_json_tmp, 'w') as f:
json.dump(meta, f, sort_keys=True)
f.write('\n')
os.replace(data_json_tmp, data_json)
os.replace(meta_json_tmp, meta_json)

return interface_hash
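
To make the returned hash's contract concrete, here is a hedged sketch of two consecutive calls from the caller's side (the module name, tree, deps, dep_prios, and manager values are placeholders):

# First run: no previous hash, so both data and meta files are written.
h1 = write_cache('pkg.mod', 'pkg/mod.py', tree, deps, [], [], dep_prios,
                 '', manager)

# Second run after a body-only edit: the serialized tree hashes to the
# same value, so only the meta file is rewritten; data_json keeps its
# old mtime and downstream modules still pass the data_mtime check.
h2 = write_cache('pkg.mod', 'pkg/mod.py', tree, deps, [], [], dep_prios,
                 h1, manager)
assert h1 == h2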


"""Dependency manager.

@@ -1021,6 +1067,12 @@ class State:
# If caller_state is set, the line number in the caller where the import occurred
caller_line = 0

# If True, indicate that the public interface of this module is unchanged
externally_same = True

# Contains a hash of the public interface in incremental mode
interface_hash = "" # type: str

def __init__(self,
id: Optional[str],
path: Optional[str],
@@ -1100,8 +1152,10 @@ def __init__(self,
if path and source is None and manager.options.incremental:
self.meta = find_cache_meta(self.id, self.path, manager)
# TODO: Get mtime if not cached.
if self.meta is not None:
self.interface_hash = self.meta.interface_hash
self.add_ancestors()
if self.meta:
if is_meta_fresh(self.meta, self.id, self.path, manager):
# Make copies, since we may modify these and want to
# compare them to the originals later.
self.dependencies = list(self.meta.dependencies)
@@ -1113,6 +1167,7 @@ def __init__(self,
self.dep_line_map = {}
else:
# Parse the file (and then some) to get the dependencies.
self.meta = None
self.parse_file()
self.suppressed = []
self.child_modules = set()
@@ -1163,16 +1218,25 @@ def is_fresh(self) -> bool:
# suppression by --silent-imports. However when a suppressed
# dependency is added back we find out later in the process.
return (self.meta is not None
and self.is_interface_fresh()
and self.dependencies == self.meta.dependencies
and self.child_modules == set(self.meta.child_modules))

def is_interface_fresh(self) -> bool:
return self.externally_same

def has_new_submodules(self) -> bool:
"""Return if this module has new submodules after being loaded from a warm cache."""
return self.meta is not None and self.child_modules != set(self.meta.child_modules)

def mark_stale(self) -> None:
"""Throw away the cache data for this file, marking it as stale."""
def mark_as_rechecked(self) -> None:
"""Marks this module as having been fully re-analyzed by the type-checker."""
self.manager.rechecked_modules.add(self.id)

def mark_interface_stale(self) -> None:
"""Marks this module as having a stale public interface, and discards the cache data."""
self.meta = None
self.externally_same = False
self.manager.stale_modules.add(self.id)

def check_blockers(self) -> None:
@@ -1362,10 +1426,17 @@ def type_check(self) -> None:
def write_cache(self) -> None:
if self.path and self.manager.options.incremental and not self.manager.errors.is_errors():
dep_prios = [self.priorities.get(dep, PRI_HIGH) for dep in self.dependencies]
write_cache(self.id, self.path, self.tree,
list(self.dependencies), list(self.suppressed), list(self.child_modules),
dep_prios,
self.manager)
new_interface_hash = write_cache(
self.id, self.path, self.tree,
list(self.dependencies), list(self.suppressed), list(self.child_modules),
dep_prios, self.interface_hash,
self.manager)
if new_interface_hash == self.interface_hash:
self.manager.log("Cached module {} has same interface".format(self.id))
else:
self.manager.log("Cached module {} has changed interface".format(self.id))
self.mark_interface_stale()
self.interface_hash = new_interface_hash


def dispatch(sources: List[BuildSource], manager: BuildManager) -> None:
@@ -1434,6 +1505,7 @@ def load_graph(sources: List[BuildSource], manager: BuildManager) -> Graph:
for id, g in graph.items():
if g.has_new_submodules():
g.parse_file()
g.mark_interface_stale()
return graph


@@ -1472,7 +1544,7 @@ def process_graph(graph: Graph, manager: BuildManager) -> None:
for id in scc:
deps.update(graph[id].dependencies)
deps -= ascc
stale_deps = {id for id in deps if not graph[id].is_fresh()}
stale_deps = {id for id in deps if not graph[id].is_interface_fresh()}
fresh = fresh and not stale_deps
undeps = set()
if fresh:
@@ -1488,9 +1560,10 @@ def process_graph(graph: Graph, manager: BuildManager) -> None:
# All cache files are fresh. Check that no dependency's
# cache file is newer than any scc node's cache file.
oldest_in_scc = min(graph[id].meta.data_mtime for id in scc)
newest_in_deps = 0 if not deps else max(graph[dep].meta.data_mtime for dep in deps)
viable = {id for id in deps if not graph[id].is_interface_fresh()}
newest_in_deps = 0 if not viable else max(graph[dep].meta.data_mtime for dep in viable)
if manager.options.verbosity >= 3: # Dump all mtimes for extreme debugging.
all_ids = sorted(ascc | deps, key=lambda id: graph[id].meta.data_mtime)
all_ids = sorted(ascc | viable, key=lambda id: graph[id].meta.data_mtime)
for id in all_ids:
if id in scc:
if graph[id].meta.data_mtime < newest_in_deps:
@@ -1528,6 +1601,25 @@ def process_graph(graph: Graph, manager: BuildManager) -> None:
else:
process_stale_scc(graph, scc)

# TODO: This is a workaround to get around the "chaining imports" problem
# with the interface checks.
#
# That is, if we have a file named `module_a.py` which does:
#
# import module_b
# module_b.module_c.foo(3)
#
# ...and if the type signature of `module_c.foo(...)` were to change,
# `module_a` would not be rechecked since the interface of `module_b`
# would not be considered changed.
#
# As a workaround, this check will force a module's interface to be

Member:
Doesn't this workaround defeat the purpose of all the changes in this PR?

Collaborator (Author):
With the workaround, the PR makes incremental mode better than what it currently does, though not as good as it could be. My plan is to break up the work by landing this first, then figuring out how to remove the workaround in a second pull request, probably by doing something similar to Jukka's suggestions.

More specifically, suppose we had the following test case:

[case testIncrementalChain]
import a

[file a.py]
import b
def func_a() -> int:
    return b.func_b()

[file b.py]
import c
def func_b() -> int:
    return c.func_c()

[file c.py]
def func_c() -> int:
    return 42

With the version of incremental mode in master, changing c.py in any way causes the main module, a.py, b.py, and c.py to all be rechecked.

This pull request modifies incremental mode so that if c.py is modified without its public interface changing (say, changing func_c to return a different value), then only c.py is rechecked. However, changing c.py so that its interface changes causes mypy to fall back to its current behavior and recheck everything. This behaves essentially like my original (closed) pull request, except the checks are more robust and I didn't need to resort to writing another visitor.

Ideally, after the workaround is removed, changing the interface of c.py should cause only b.py and c.py to be rechecked, but that isn't implemented yet and is the next thing I'm planning to work on.
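
For example (a hedged illustration of the two kinds of edits to c.py described above):

def func_c() -> int:
    return 0             # body-only edit: interface hash unchanged,
                         # so only c.py is rechecked

def func_c() -> str:     # signature edit: interface hash differs,
    return '42'          # so (for now) everything is rechecked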

# considered stale if anything it imports has a stale interface,
# which ensures these changes are caught and propagated.
if len(stale_deps) > 0:
for id in scc:
graph[id].mark_interface_stale()


def order_ascc(graph: Graph, ascc: AbstractSet[str], pri_max: int = PRI_ALL) -> List[str]:
"""Come up with the ideal processing order within an SCC.
@@ -1590,8 +1682,6 @@ def process_fresh_scc(graph: Graph, scc: List[str]) -> None:

def process_stale_scc(graph: Graph, scc: List[str]) -> None:
"""Process the modules in one SCC from source code."""
for id in scc:
graph[id].mark_stale()
for id in scc:
# We may already have parsed the module, or not.
# If the former, parse_file() is a no-op.
@@ -1606,6 +1696,7 @@ def process_stale_scc(graph: Graph, scc: List[str]) -> None:
for id in scc:
graph[id].type_check()
graph[id].write_cache()
graph[id].mark_as_rechecked()


def sorted_components(graph: Graph,
19 changes: 17 additions & 2 deletions mypy/test/data.py
@@ -47,6 +47,7 @@ def parse_test_cases(

files = [] # type: List[Tuple[str, str]] # path and contents
stale_modules = None # type: Optional[Set[str]] # module names
rechecked_modules = None # type: Optional[Set[str]] # module names
while i < len(p) and p[i].id not in ['out', 'case']:
if p[i].id == 'file':
# Record an extra file needed for the test case.
@@ -68,12 +69,25 @@
stale_modules = set()
else:
stale_modules = {item.strip() for item in p[i].arg.split(',')}
elif p[i].id == 'rechecked':
if p[i].arg is None:
rechecked_modules = set()
else:
rechecked_modules = {item.strip() for item in p[i].arg.split(',')}
else:
raise ValueError(
'Invalid section header {} in {} at line {}'.format(
p[i].id, path, p[i].line))
i += 1

if rechecked_modules is None:
# If the set of rechecked modules isn't specified, make it the same as the set of
# modules with a stale public interface.
rechecked_modules = stale_modules
if stale_modules is not None and not stale_modules.issubset(rechecked_modules):
raise ValueError(
'Stale modules must be a subset of rechecked modules ({})'.format(path))

tcout = [] # type: List[str]
if i < len(p) and p[i].id == 'out':
tcout = p[i].data
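
Based on the parsing logic above, a hedged sketch of how a test case might use the new header (second-run file variants omitted for brevity; names are illustrative):

[case testIncrementalBodyOnlyChange]
import a
[file a.py]
import b
[file b.py]
def f() -> int: return 1
[stale]
[rechecked b]

Here [stale] with no argument asserts that no module's public interface went stale, while [rechecked b] asserts that only b was re-analyzed; omitting [rechecked] entirely defaults it to the stale set.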
@@ -90,7 +104,7 @@
lastline = p[i].line if i < len(p) else p[i - 1].line + 9999
tc = DataDrivenTestCase(p[i0].arg, input, tcout, path,
p[i0].line, lastline, perform,
files, stale_modules)
files, stale_modules, rechecked_modules)
out.append(tc)
if not ok:
raise ValueError(
@@ -116,7 +130,7 @@ class DataDrivenTestCase(TestCase):
clean_up = None # type: List[Tuple[bool, str]]

def __init__(self, name, input, output, file, line, lastline,
perform, files, expected_stale_modules):
perform, files, expected_stale_modules, expected_rechecked_modules):
super().__init__(name)
self.input = input
self.output = output
@@ -126,6 +140,7 @@ def __init__(self, name, input, output, file, line, lastline,
self.perform = perform
self.files = files
self.expected_stale_modules = expected_stale_modules
self.expected_rechecked_modules = expected_rechecked_modules

def set_up(self) -> None:
super().set_up()