diff --git a/Cargo.lock b/Cargo.lock index 909ae597fd5..bd526b0380e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1519,6 +1519,7 @@ dependencies = [ name = "gix-diff" version = "0.37.0" dependencies = [ + "bstr", "document-features", "getrandom", "gix-hash 0.13.1", @@ -1795,7 +1796,9 @@ dependencies = [ "gix-object 0.38.0", "gix-traverse 0.34.0", "itoa", + "libc", "memmap2 0.7.1", + "rustix 0.38.20", "serde", "smallvec", "thiserror", @@ -2082,7 +2085,7 @@ dependencies = [ "gix-config-value", "gix-testtools", "parking_lot", - "rustix 0.38.19", + "rustix 0.38.20", "serial_test", "thiserror", ] @@ -2282,7 +2285,6 @@ dependencies = [ "gix-index 0.26.0", "gix-object 0.38.0", "gix-path 0.10.0", - "gix-pathspec", "gix-worktree 0.27.0", "thiserror", ] @@ -2933,7 +2935,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi", - "rustix 0.38.19", + "rustix 0.38.20", "windows-sys", ] @@ -3859,9 +3861,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.19" +version = "0.38.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745ecfa778e66b2b63c88a61cb36e0eea109e803b0b86bf9879fbc77c70e86ed" +checksum = "67ce50cb2e16c2903e30d1cbccfd8387a74b9d4c938b6a4c5ec6cc7556f7a8a0" dependencies = [ "bitflags 2.4.0", "errno", @@ -4289,7 +4291,7 @@ dependencies = [ "cfg-if", "fastrand 2.0.1", "redox_syscall", - "rustix 0.38.19", + "rustix 0.38.20", "windows-sys", ] diff --git a/crate-status.md b/crate-status.md index e9983498d5d..a6aa1f119eb 100644 --- a/crate-status.md +++ b/crate-status.md @@ -19,72 +19,153 @@ and itself relies on all `gix-*` crates. It's not meant for consumption, for app * [x] **estimate-hours** - estimate the time invested into a repository by evaluating commit dates. * Based on the [git-hours] algorithm. * See the [discussion][git-hours-discussion] for some performance data. 
-* **the `gix` program** _(plumbing)_ - lower level commands for use in automation - * **progress** - provide an overview of what works and what doesn't from the perspective of the git configuration. - This is likely to change a lot over time depending on actual needs, but maybe useful for you to see - if particular git-configuration is picked up and where it deviates. - * **config** - list the complete git configuration in human-readable form and optionally filter sections by name. - * **exclude** - * [x] **query** - check if path specs are excluded via gits exclusion rules like `.gitignore`. - * **verify** - validate a whole repository, for now only the object database. - * **commit** - * [x] **describe** - identify a commit by its closest tag in its past - * **tree** - * [x] **entries** - list tree entries for a single tree or recursively - * [x] **info** - display tree statistics - * **odb** - * [x] **info** - display odb statistics - * [x] **entries** - display all object ids in the object database - * **mailmap** - * [x] **entries** - display all entries of the aggregated mailmap git would use for substitution - * **revision** - * [x] **list** - list plain revision hashes from a starting point, similar to a very simple version of `git rev-list`. - * [x] **explain** - show what would be done while parsing a revision specification like `HEAD~1` - * [x] **resolve** - show which objects a revspec resolves to, similar to `git rev-parse` but faster and with much better error handling - * [x] **previous-branches** - list all previously checked out branches, powered by the ref-log. - * **remote** - * [x] **refs** - list all references available on the remote based on the current remote configuration. - * [x] **ref-map** - show how remote references relate to their local tracking branches as mapped by refspecs. - * [x] **fetch** - fetch the current remote or the given one, optionally just as dry-run. 
- * **clone** - * [x] initialize a new **bare** repository and fetch all objects. - * [x] initialize a new repository, fetch all objects and checkout the main worktree. - * **credential** - * [x] **fill/approve/reject** - The same as `git credential`, but implemented in Rust, calling helpers only when from trusted configuration. - * **free** - no git repository necessary - * **pack** - * [x] [verify](https://asciinema.org/a/352942) - * [x] [index verify](https://asciinema.org/a/352945) including each object sha1 and statistics - * [x] [explode](https://asciinema.org/a/352951), useful for transforming packs into loose objects for inspection or restoration - * [x] verify written objects (by reading them back from disk) - * [x] [receive](https://asciinema.org/a/359321) - receive a whole pack produced by **pack-send** or _git-upload-pack_, useful for `clone` like operations. - * [x] **create** - create a pack from given objects or tips of the commit graph. - * [ ] **send** - create a pack and send it using the pack protocol to stdout, similar to 'git-upload-pack', - for consumption by **pack-receive** or _git-receive-pack_ - - **multi-index** - * [x] **info** - print information about the file - * [x] **create** - create a multi-index from pack indices - * [x] **verify** - check the file for consistency - * [x] **entries** - list all entries of the file - - **index** - * [x] [create](https://asciinema.org/a/352941) - create an index file by streaming a pack file as done during clone - * [x] support for thin packs (as needed for fetch/pull) - * **commit-graph** - * [x] **verify** - assure that a commit-graph is consistent - * **mailmap** - * [x] **verify** - check entries of a mailmap file for parse errors and display them - * **index** - * [x] **entries** - show detailed entry information for human or machine consumption (via JSON) - * [x] **verify** - check the index for consistency - * [x] **info** - display general information about the index itself, with detailed 
extension information by default - * [x] detailed information about the TREE extension - * [ ] …other extensions details aren't implemented yet - * [x] **checkout-exclusive** - a predecessor of `git worktree`, providing flexible options to evaluate checkout performance from an index and/or an object database. +* **the `gix` program** _(plumbing)_ - lower level commands for use during development + - As its main purpose is to help running the latest improvements in the real world, it's self-documenting without + duplicating its features here. Use `gix --help` to start discovery. [skim]: https://github.com/lotabout/skim [git-hours]: https://github.com/kimmobrunfeldt/git-hours/blob/8aaeee237cb9d9028e7a2592a25ad8468b1f45e4/index.js#L114-L143 [git-hours-discussion]: https://github.com/Byron/gitoxide/discussions/78 +### gix + +The top-level crate that acts as hub to all functionality provided by the `gix-*` plumbing crates. + +* [x] utilities for applications to make long running operations interruptible gracefully and to support timeouts in servers. 
+* [x] handle `core.repositoryFormatVersion` and extensions +* [x] support for unicode-precomposition of command-line arguments (needs explicit use in parent application) +* [ ] strict object creation (validate objects referenced by newly created objects exist) +* [ ] strict hash verification (validate that objects actually have the hashes they claim to have) +* **Repository** + * [x] discovery + * [x] option to not cross file systems (default) + * [x] handle git-common-dir + * [x] support for `GIT_CEILING_DIRECTORIES` environment variable + * [ ] handle other non-discovery modes and provide control over environment variable usage required in applications + * [x] rev-parse + * [x] rev-walk + * [x] include tips + * [ ] exclude commits + * [x] instantiation + * [x] access to refs and objects + * **credentials** + * [x] run `git credential` directly + * [x] use credential helper configuration and to obtain credentials with `gix_credentials::helper::Cascade` + * **config** + * [ ] facilities to apply the [url-match](https://git-scm.com/docs/git-config#Documentation/git-config.txt-httplturlgt) algorithm and to + [normalize urls](https://github.com/git/git/blob/be1a02a17ede4082a86dfbfee0f54f345e8b43ac/urlmatch.c#L109:L109) before comparison. + * **traverse** + * [x] commit graphs + * [ ] make [git-notes](https://git-scm.com/docs/git-notes) accessible + * [x] tree entries + * **diffs/changes** + * [x] tree with other tree + * [ ] respect case-sensitivity of host filesystem. + * [x] a way to access various diff related settings or use them + * [ ] respect `diff.*.textconv`, `diff.*.cachetextconv` and external diff viewers with `diff.*.command`, + [along with support for reading `diff` gitattributes](https://github.com/git/git/blob/73876f4861cd3d187a4682290ab75c9dccadbc56/Documentation/gitattributes.txt#L699:L699). + * **rewrite tracking** + * **deviation** - git keeps up to four candidates whereas we use the first-found candidate that matches the similarity percentage. 
+ This can lead to different sources being found. As such, we also don't consider the filename at all. + * [ ] handle binary files correctly, and apply filters for that matter + * [x] computation limit with observable reduction of precision when it is hit, for copies and renames separately + * **by identity** + * [x] renames (sym-links are only ever compared by identity) + * [x] copies + * **by similarity** - similarity factor controllable separately from renames + * [x] renames + * [x] copies + * [x] 'find-copies-harder' - find copies with the source being the entire tree. + * [ ] tree or index with working tree + - [ ] rename tracking + - [ ] submodule status (recursive) + * [x] diffs between modified blobs with various algorithms + * [ ] tree with index (via index-from-tree and index) + - [ ] rename tracking + - [ ] submodule status (recursive) + * [x] initialize + * [x] Proper configuration depending on platform (e.g. ignorecase, filemode, …) + * **Id** + * [x] short hashes with detection of ambiguity. + * **Commit** + * [x] `git describe` like functionality, with optional commit-graph acceleration + * [x] create new commit from tree + * **Objects** + * [x] lookup + * [x] peel to object kind + * [ ] create [signed commits and tags](https://github.com/Byron/gitoxide/issues/12) + * **trees** + * [x] lookup path + * **references** + * [x] peel to end + * [x] ref-log access + * [x] remote name + * [x] find remote itself + - [ ] respect `branch.<name>.merge` in the returned remote. + * **remotes** + * [x] clone + * [x] shallow + * [ ] include-tags when shallow is used (needs separate fetch) + * [ ] prune non-existing shallow commits + * [ ] [bundles](https://git-scm.com/docs/git-bundle) + * [x] fetch + * [x] shallow (remains shallow, options to adjust shallow boundary) + * [ ] a way to auto-explode small packs to keep them from piling up + * [x] 'ref-in-want' + * [ ] 'wanted-ref' + * [x] standard negotiation algorithms `consecutive`, `skipping` and `noop`. 
+ * [ ] push + * [x] ls-refs + * [x] ls-refs with ref-spec filter + * [x] list, find by name + * [x] create in memory + * [ ] groups + * [ ] [remote and branch files](https://github.com/git/git/blob/master/remote.c#L300) + * [ ] execute hooks + * **refs** + * [ ] run transaction hooks and handle special repository states like quarantine + * [ ] support for different backends like `files` and `reftable` + * **main or linked worktree** + * [ ] add files with `.gitignore` handling + * [ ] checkout with conversions like clean + smudge as in `.gitattributes` + * [ ] _diff_ index with working tree + * [ ] sparse checkout support + * [x] read per-worktree config if `extensions.worktreeConfig` is enabled. + * **index** + * [ ] tree from index + * [x] index from tree + * **worktrees** + * [x] open a repository with worktrees + * [x] read locked state + * [ ] obtain 'prunable' information + * [x] proper handling of worktree related refs + * [x] create a byte stream and create archives for such a stream, including worktree filters and conversions + * [ ] create, move, remove, and repair + * [x] access exclude information + * [x] access attribute information + * [x] respect `core.worktree` configuration + - **deviation** + * The delicate interplay between `GIT_COMMON_DIR` and `GIT_WORK_TREE` isn't implemented. 
+ * **config** + * [x] read the primitive types `boolean`, `integer`, `string` + * [x] read and interpolate trusted paths + * [x] low-level API for more elaborate access to all details of `git-config` files + * [ ] a way to make changes to individual configuration files + * [x] mailmap + * [x] object replacements (`git replace`) + * [x] read git configuration + * [ ] merging + * [ ] stashing + * [ ] Use _Commit Graph_ to speed up certain queries + * [ ] subtree + * [ ] interactive rebase status/manipulation + * **submodules** + * [x] handle 'old' form for reading and detect old form + * [x] list + * [ ] edit +* [ ] API documentation + * [ ] Some examples + ### gix-actor * [x] read and write a signature that uniquely identifies an actor within a git repository * [x] a way to parse `name ` tuples (instead of full signatures) to facilitate parsing @@ -611,138 +692,6 @@ See its [README.md](https://github.com/Byron/gitoxide/blob/main/gix-lock/README. * [x] API documentation * [x] Some examples -### gix -* [x] utilities for applications to make long running operations interruptible gracefully and to support timeouts in servers. 
-* [x] handle `core.repositoryFormatVersion` and extensions -* [x] support for unicode-precomposition of command-line arguments (needs explicit use in parent application) -* [ ] strict object creation (validate objects referenced by newly created objects exist) -* [ ] strict hash verification (validate that objects actually have the hashes they claim to have) -* **Repository** - * [x] discovery - * [x] option to not cross file systems (default) - * [x] handle git-common-dir - * [x] support for `GIT_CEILING_DIRECTORIES` environment variable - * [ ] handle other non-discovery modes and provide control over environment variable usage required in applications - * [x] rev-parse - * [x] rev-walk - * [x] include tips - * [ ] exclude commits - * [x] instantiation - * [x] access to refs and objects - * **credentials** - * [x] run `git credential` directly - * [x] use credential helper configuration and to obtain credentials with `gix_credentials::helper::Cascade` - * **config** - * [ ] facilities to apply the [url-match](https://git-scm.com/docs/git-config#Documentation/git-config.txt-httplturlgt) algorithm and to - [normalize urls](https://github.com/git/git/blob/be1a02a17ede4082a86dfbfee0f54f345e8b43ac/urlmatch.c#L109:L109) before comparison. - * **traverse** - * [x] commit graphs - * [ ] make [git-notes](https://git-scm.com/docs/git-notes) accessible - * [x] tree entries - * **diffs/changes** - * [x] tree with other tree - * [ ] respect case-sensitivity of host filesystem. - * [x] a way to access various diff related settings or use them - * [ ] respect `diff.*.textconv`, `diff.*.cachetextconv` and external diff viewers with `diff.*.command`, - [along with support for reading `diff` gitattributes](https://github.com/git/git/blob/73876f4861cd3d187a4682290ab75c9dccadbc56/Documentation/gitattributes.txt#L699:L699). - * **rewrite tracking** - * **deviation** - git keeps up to four candidates whereas we use the first-found candidate that matches the similarity percentage. 
- This can lead to different sources being found. As such, we also don't consider the filename at all. - * [ ] handle binary files correctly, and apply filters for that matter - * [x] computation limit with observable reduction of precision when it is hit, for copies and renames separately - * **by identity** - * [x] renames (sym-links are only ever compared by identity) - * [x] copies - * **by similarity** - similarity factor controllable separately from renames - * [x] renames - * [x] copies - * [x] 'find-copies-harder' - find copies with the source being the entire tree. - * [ ] tree or index with working tree - * [x] diffs between modified blobs with various algorithms - * [ ] tree with index - * [x] initialize - * [x] Proper configuration depending on platform (e.g. ignorecase, filemode, …) - * **Id** - * [x] short hashes with detection of ambiguity. - * **Commit** - * [x] `git describe` like functionality, with optional commit-graph acceleration - * [x] create new commit from tree - * **Objects** - * [x] lookup - * [x] peel to object kind - * [ ] create [signed commits and tags](https://github.com/Byron/gitoxide/issues/12) - * **trees** - * [x] lookup path - * **references** - * [x] peel to end - * [x] ref-log access - * [x] remote name - * [x] find remote itself - - [ ] respect `branch..merge` in the returned remote. - * **remotes** - * [x] clone - * [x] shallow - * [ ] include-tags when shallow is used (needs separate fetch) - * [ ] prune non-existing shallow commits - * [ ] [bundles](https://git-scm.com/docs/git-bundle) - * [x] fetch - * [x] shallow (remains shallow, options to adjust shallow boundary) - * [ ] a way to auto-explode small packs to avoid them to pile up - * [x] 'ref-in-want' - * [ ] 'wanted-ref' - * [x] standard negotiation algorithms `consecutive`, `skipping` and `noop`. 
- * [ ] push - * [x] ls-refs - * [x] ls-refs with ref-spec filter - * [x] list, find by name - * [x] create in memory - * [ ] groups - * [ ] [remote and branch files](https://github.com/git/git/blob/master/remote.c#L300) - * [ ] execute hooks - * **refs** - * [ ] run transaction hooks and handle special repository states like quarantine - * [ ] support for different backends like `files` and `reftable` - * **main or linked worktree** - * [ ] add files with `.gitignore` handling - * [ ] checkout with conversions like clean + smudge as in `.gitattributes` - * [ ] _diff_ index with working tree - * [ ] sparse checkout support - * [x] read per-worktree config if `extensions.worktreeConfig` is enabled. - * **index** - * [ ] tree from index - * [x] index from tree - * **worktrees** - * [x] open a repository with worktrees - * [x] read locked state - * [ ] obtain 'prunable' information - * [x] proper handling of worktree related refs - * [x] create a byte stream and create archives for such a stream, including worktree filters and conversions - * [ ] create, move, remove, and repair - * [x] access exclude information - * [x] access attribute information - * [x] respect `core.worktree` configuration - - **deviation** - * The delicate interplay between `GIT_COMMON_DIR` and `GIT_WORK_TREE` isn't implemented. 
- * **config** - * [x] read the primitive types `boolean`, `integer`, `string` - * [x] read and interpolate trusted paths - * [x] low-level API for more elaborate access to all details of `git-config` files - * [ ] a way to make changes to individual configuration files - * [x] mailmap - * [x] object replacements (`git replace`) - * [x] read git configuration - * [ ] merging - * [ ] stashing - * [ ] Use _Commit Graph_ to speed up certain queries - * [ ] subtree - * [ ] interactive rebase status/manipulation - * **submodules** - * [x] handle 'old' form for reading and detect old form - * [x] list - * [ ] edit -* [ ] API documentation - * [ ] Some examples - ### gix-worktree-stream * [x] encode git-tree as stream of bytes (with large file support and actual streaming) diff --git a/gitoxide-core/src/query/engine/update.rs b/gitoxide-core/src/query/engine/update.rs index 0e8281cf950..1dcf57ace7b 100644 --- a/gitoxide-core/src/query/engine/update.rs +++ b/gitoxide-core/src/query/engine/update.rs @@ -9,8 +9,8 @@ use anyhow::{anyhow, bail}; use gix::objs::find::Error; use gix::{ bstr::{BStr, BString, ByteSlice}, + diff::rewrites::CopySource, features::progress, - object::tree::diff::rewrites::CopySource, parallel::{InOrderIter, SequenceId}, prelude::ObjectIdExt, Count, Progress, @@ -139,11 +139,10 @@ pub fn update( }); let rewrites = { - let mut r = - gix::object::tree::diff::Rewrites::try_from_config(&repo.config_snapshot(), true)?.unwrap_or_default(); - r.copies = Some(gix::object::tree::diff::rewrites::Copies { + let mut r = gix::diff::new_rewrites(&repo.config_snapshot(), true)?.unwrap_or_default(); + r.copies = Some(gix::diff::rewrites::Copies { source: if find_copies_harder { - CopySource::FromSetOfModifiedFilesAndSourceTree + CopySource::FromSetOfModifiedFilesAndAllSources } else { CopySource::FromSetOfModifiedFiles }, diff --git a/gix-diff/Cargo.toml b/gix-diff/Cargo.toml index b51eaaaa7da..589b3c34b2d 100644 --- a/gix-diff/Cargo.toml +++ b/gix-diff/Cargo.toml @@ 
-12,7 +12,7 @@ autotests = false [features] default = ["blob"] -## Enable diffing of blobs using imara-diff. +## Enable diffing of blobs using imara-diff, which also allows for a generic rewrite tracking implementation. blob = ["dep:imara-diff"] ## Data structures implement `serde::Serialize` and `serde::Deserialize`. serde = ["dep:serde", "gix-hash/serde", "gix-object/serde"] @@ -25,10 +25,12 @@ doctest = false [dependencies] gix-hash = { version = "^0.13.1", path = "../gix-hash" } gix-object = { version = "^0.38.0", path = "../gix-object" } + thiserror = "1.0.32" imara-diff = { version = "0.1.3", optional = true } serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"]} getrandom = { version = "0.2.8", optional = true, default-features = false, features = ["js"] } +bstr = { version = "1.5.0", default-features = false } document-features = { version = "0.2.0", optional = true } diff --git a/gix-diff/src/blob.rs b/gix-diff/src/blob.rs index 27c1a131724..7b2a082bd1e 100644 --- a/gix-diff/src/blob.rs +++ b/gix-diff/src/blob.rs @@ -1,3 +1,18 @@ //! For using text diffs, please have a look at the [`imara-diff` documentation](https://docs.rs/imara-diff), //! maintained by [Pascal Kuthe](https://github.com/pascalkuthe). +//! +//! +/// Information about the diff performed to detect similarity. +#[derive(Debug, Default, Clone, Copy, Eq, PartialEq)] +pub struct DiffLineStats { + /// The amount of lines to remove from the source to get to the destination. + pub removals: u32, + /// The amount of lines to add to the source to get to the destination. + pub insertions: u32, + /// The amount of lines of the previous state, in the source. + pub before: u32, + /// The amount of lines of the new state, in the destination. 
+ pub after: u32, +} + pub use imara_diff::*; diff --git a/gix-diff/src/lib.rs b/gix-diff/src/lib.rs index 6d94a75919f..b3a61b2b97b 100644 --- a/gix-diff/src/lib.rs +++ b/gix-diff/src/lib.rs @@ -8,6 +8,34 @@ cfg_attr(doc, doc = ::document_features::document_features!()) #![deny(missing_docs, rust_2018_idioms)] #![forbid(unsafe_code)] +/// A structure to capture how to perform rename and copy tracking, used by the [rewrites::Tracker]. +#[derive(Debug, Copy, Clone, PartialEq)] +#[cfg(feature = "blob")] +pub struct Rewrites { + /// If `Some(…)`, also find copies. `None` is the default which does not try to detect copies at all. + /// + /// Note that this is an even more expensive operation than detecting renames stemming from additions and deletions + /// as the resulting set to search through is usually larger. + pub copies: Option, + /// The percentage of similarity needed for files to be considered renamed, defaulting to `Some(0.5)`. + /// This field is similar to `git diff -M50%`. + /// + /// If `None`, files are only considered equal if their content matches 100%. + /// Note that values greater than 1.0 have no different effect than 1.0. + pub percentage: Option, + /// The amount of files to consider for fuzzy rename or copy tracking. Defaults to 1000, meaning that only 1000*1000 + /// combinations can be tested for fuzzy matches, i.e. the ones that try to find matches by comparing similarity. + /// If 0, there is no limit. + /// + /// If the limit would not be enough to test the entire set of combinations, the algorithm will trade in precision and not + /// run the fuzzy version of identity tests at all. That way results are never partial. + pub limit: usize, +} + +/// Contains a [Tracker](rewrites::Tracker) to detect rewrites. 
+#[cfg(feature = "blob")] +pub mod rewrites; + /// pub mod tree; diff --git a/gix-diff/src/rewrites/mod.rs b/gix-diff/src/rewrites/mod.rs new file mode 100644 index 00000000000..8af13165f6f --- /dev/null +++ b/gix-diff/src/rewrites/mod.rs @@ -0,0 +1,77 @@ +use crate::Rewrites; + +/// Types related to the rename tracker for renames, rewrites and copies. +pub mod tracker; + +/// A type that holds the state of an ongoing tracking operation: it collects sets of interesting changes, +/// some of which are retained so that renames and copies can be computed from them at a later stage. +pub struct Tracker { + /// The tracked items thus far, which will be used to determine renames/copies and rewrites later. + items: Vec>, + /// A place to store all paths in, to reduce the amount of allocations. + path_backing: Vec, + /// A buffer for use when fetching objects for similarity tests. + buf1: Vec, + /// Another buffer for use when fetching objects for similarity tests. + buf2: Vec, + /// How to track copies and/or rewrites. + rewrites: Rewrites, + /// The diff algorithm to use when checking for similarity. + diff_algo: crate::blob::Algorithm, +} + +/// Determine in which set of files to search for copies. +#[derive(Default, Debug, Copy, Clone, Eq, PartialEq)] +pub enum CopySource { + /// Find copies from the set of modified files only. + #[default] + FromSetOfModifiedFiles, + /// Find copies from the set of modified files, as well as all files known to the source (i.e. previous state of the tree). + /// + /// This can be an expensive operation as the number of combinations to test grows quadratically with the total amount of files in the set. + FromSetOfModifiedFilesAndAllSources, +} + +/// Under which circumstances we consider a file to be a copy. +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct Copies { + /// The set of files to search when finding the source of copies. + pub source: CopySource, + /// Equivalent to [`Rewrites::percentage`], but used for copy tracking. 
+ /// + /// Useful to have similarity-based rename tracking and cheaper copy tracking. + pub percentage: Option, +} + +impl Default for Copies { + fn default() -> Self { + Copies { + source: CopySource::default(), + percentage: Some(0.5), + } + } +} + +/// Information collected while handling rewrites of files which may be tracked. +#[derive(Default, Clone, Copy, Debug, PartialEq)] +pub struct Outcome { + /// The options used to guide the rewrite tracking. Either fully provided by the caller or retrieved from git configuration. + pub options: Rewrites, + /// The amount of similarity checks that have been conducted to find renamed files and potentially copies. + pub num_similarity_checks: usize, + /// Set to the amount of worst-case rename permutations we didn't search as our limit didn't allow it. + pub num_similarity_checks_skipped_for_rename_tracking_due_to_limit: usize, + /// Set to the amount of worst-case copy permutations we didn't search as our limit didn't allow it. + pub num_similarity_checks_skipped_for_copy_tracking_due_to_limit: usize, +} + +/// The default settings for rewrites according to the git configuration defaults. +impl Default for Rewrites { + fn default() -> Self { + Rewrites { + copies: None, + percentage: Some(0.5), + limit: 1000, + } + } +} diff --git a/gix-diff/src/rewrites/tracker.rs b/gix-diff/src/rewrites/tracker.rs new file mode 100644 index 00000000000..09d3c724608 --- /dev/null +++ b/gix-diff/src/rewrites/tracker.rs @@ -0,0 +1,488 @@ +use std::ops::Range; + +use gix_object::tree::{EntryKind, EntryMode}; + +use crate::blob::DiffLineStats; +use crate::rewrites::{CopySource, Outcome}; +use crate::{rewrites::Tracker, Rewrites}; +use bstr::BStr; +use gix_object::FindExt; + +/// The kind of a change. +#[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)] +pub enum ChangeKind { + /// The change represents the *deletion* of an item. + Deletion, + /// The change represents the *modification* of an item. 
+ Modification, + /// The change represents the *addition* of an item. + Addition, +} + +/// A trait providing all functionality to abstract over the concept of a change, as seen by the [`Tracker`]. +pub trait Change: Clone { + /// Return the hash of this change for identification. + fn id(&self) -> &gix_hash::oid; + /// Return the kind of this change. + fn kind(&self) -> ChangeKind; + /// Return more information about the kind of entry affected by this change. + fn entry_mode(&self) -> EntryMode; + /// Return the id of the change along with its mode. + fn id_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode); +} + +/// A tracked item; a set of these makes it possible to determine how they relate by comparing their similarity. +pub(crate) struct Item { + /// The underlying raw change. + change: T, + /// The range into the backing buffer for paths that holds the path of this item. + path: Range, + /// If true, this item was already emitted, i.e. seen by the caller. + emitted: bool, +} + +impl Item { + fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr { + backing[self.path.clone()].as_ref() + } + fn entry_mode_compatible(&self, mode: EntryMode) -> bool { + use EntryKind::*; + matches!( + (mode.kind(), self.change.entry_mode().kind()), + (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link) + ) + } + + fn is_source_for_destination_of(&self, kind: visit::SourceKind, dest_item_mode: EntryMode) -> bool { + self.entry_mode_compatible(dest_item_mode) + && match kind { + visit::SourceKind::Rename => !self.emitted && matches!(self.change.kind(), ChangeKind::Deletion), + visit::SourceKind::Copy => { + matches!(self.change.kind(), ChangeKind::Modification) + } + } + } +} + +/// A module with types used in the user-callback in [Tracker::emit()](crate::rewrites::Tracker::emit()). +pub mod visit { + use crate::blob::DiffLineStats; + use bstr::BStr; + use gix_object::tree::EntryMode; + + /// The source of a rewrite, rename or copy. + pub struct Source<'a> { + /// The kind of entry. 
+ pub entry_mode: EntryMode, + /// The hash of the state of the source as seen in the object database. + pub id: gix_hash::ObjectId, + /// Further specify what kind of source this is. + pub kind: SourceKind, + /// The repository-relative location of this entry. + pub location: &'a BStr, + /// If this is a rewrite, indicate how many lines would need to change to turn this source into the destination. + pub diff: Option, + } + + /// Further identify the kind of [Source]. + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + pub enum SourceKind { + /// This is the source of an entry that was renamed, as `source` was renamed to `destination`. + Rename, + /// This is the source of a copy, as `source` was copied into `destination`. + Copy, + } + + /// A change along with a location. + pub struct Destination<'a, T> { + /// The change at the given `location`. + pub change: T, + /// The repository-relative location of this destination. + pub location: &'a BStr, + } +} + +/// +pub mod emit { + /// The error returned by [Tracker::emit()](super::Tracker::emit()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Could not find blob for similarity checking")] + FindExistingBlob(#[from] gix_object::find::existing_object::Error), + #[error("Could not obtain exhaustive item set to use as possible sources for copy detection")] + GetItemsForExhaustiveCopyDetection(#[source] Box), + } +} + +/// Lifecycle +impl Tracker { + /// Create a new instance with `rewrites` configuration, and the `diff_algo` to use when performing + /// similarity checking. + pub fn new(rewrites: Rewrites, diff_algo: crate::blob::Algorithm) -> Self { + Tracker { + items: vec![], + path_backing: vec![], + buf1: Vec::new(), + buf2: Vec::new(), + rewrites, + diff_algo, + } + } +} + +/// build state and find matches. +impl Tracker { + /// We may refuse the push if that information isn't needed for what we have to track. 
+ pub fn try_push_change(&mut self, change: T, location: &BStr) -> Option { + if !change.entry_mode().is_blob_or_symlink() { + return Some(change); + } + let keep = match (self.rewrites.copies, change.kind()) { + (Some(_find_copies), _) => true, + (None, ChangeKind::Modification { .. }) => false, + (None, _) => true, + }; + + if !keep { + return Some(change); + } + + let start = self.path_backing.len(); + self.path_backing.extend_from_slice(location); + self.items.push(Item { + path: start..self.path_backing.len(), + change, + emitted: false, + }); + None + } + + /// Can only be called once effectively as it alters its own state. + /// + /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's + /// the destination of a copy or rename, or with `None` for source if no relation to other + /// items in the tracked set exists. + /// + /// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database. + /// Worktree filters and diff conversions will be applied afterwards automatically. + /// + /// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source + /// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push` + /// will panic if `change` is not a blob or symlink (only those are accepted for tracking), and it's valid to not call `push` at all. 
+ pub fn emit( + &mut self, + mut cb: impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, + objects: &dyn gix_object::Find, + mut push_source_tree: PushSourceTreeFn, + ) -> Result + where + PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>, + E: std::error::Error + Send + Sync + 'static, + { + fn by_id_and_location(a: &Item, b: &Item) -> std::cmp::Ordering { + a.change + .id() + .cmp(b.change.id()) + .then_with(|| a.path.start.cmp(&b.path.start).then(a.path.end.cmp(&b.path.end))) + } + self.items.sort_by(by_id_and_location); + + let mut out = Outcome { + options: self.rewrites, + ..Default::default() + }; + out = self.match_pairs_of_kind( + visit::SourceKind::Rename, + &mut cb, + self.rewrites.percentage, + out, + objects, + )?; + + if let Some(copies) = self.rewrites.copies { + out = self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, out, objects)?; + + match copies.source { + CopySource::FromSetOfModifiedFiles => {} + CopySource::FromSetOfModifiedFilesAndAllSources => { + push_source_tree(&mut |change, location| { + assert!( + self.try_push_change(change, location).is_none(), + "we must accept every change" + ); + // make sure these aren't viable to be emitted anymore. 
+ self.items.last_mut().expect("just pushed").emitted = true; + }) + .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?; + self.items.sort_by(by_id_and_location); + + out = + self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, out, objects)?; + } + } + } + + self.items + .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing))); + for item in self.items.drain(..).filter(|item| !item.emitted) { + if cb( + visit::Destination { + location: item.location(&self.path_backing), + change: item.change, + }, + None, + ) == crate::tree::visit::Action::Cancel + { + break; + } + } + Ok(out) + } +} + +impl Tracker { + fn match_pairs_of_kind( + &mut self, + kind: visit::SourceKind, + cb: &mut impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, + percentage: Option, + mut out: Outcome, + objects: &dyn gix_object::Find, + ) -> Result { + // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively. + let needs_second_pass = !needs_exact_match(percentage); + if self.match_pairs(cb, None /* by identity */, kind, &mut out, objects)? 
== crate::tree::visit::Action::Cancel + { + return Ok(out); + } + if needs_second_pass { + let is_limited = if self.rewrites.limit == 0 { + false + } else if let Some(permutations) = permutations_over_limit(&self.items, self.rewrites.limit, kind) { + match kind { + visit::SourceKind::Rename => { + out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations; + } + visit::SourceKind::Copy => { + out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations; + } + } + true + } else { + false + }; + if !is_limited { + self.match_pairs(cb, percentage, kind, &mut out, objects)?; + } + } + Ok(out) + } + + fn match_pairs( + &mut self, + cb: &mut impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, + percentage: Option, + kind: visit::SourceKind, + stats: &mut Outcome, + objects: &dyn gix_object::Find, + ) -> Result { + // TODO(perf): reuse object data and interner state and interned tokens, make these available to `find_match()` + let mut dest_ofs = 0; + while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| { + (!item.emitted && matches!(item.change.kind(), ChangeKind::Addition)).then_some((idx, item)) + }) { + dest_idx += dest_ofs; + dest_ofs = dest_idx + 1; + let src = find_match( + &self.items, + dest, + dest_idx, + percentage.map(|p| (p, self.diff_algo)), + kind, + stats, + objects, + &mut self.buf1, + &mut self.buf2, + )? 
+ .map(|(src_idx, src, diff)| { + let (id, entry_mode) = src.change.id_and_entry_mode(); + let id = id.to_owned(); + let location = src.location(&self.path_backing); + ( + visit::Source { + entry_mode, + id, + kind, + location, + diff, + }, + src_idx, + ) + }); + if src.is_none() { + continue; + } + let location = dest.location(&self.path_backing); + let change = dest.change.clone(); + let dest = visit::Destination { change, location }; + self.items[dest_idx].emitted = true; + if let Some(src_idx) = src.as_ref().map(|t| t.1) { + self.items[src_idx].emitted = true; + } + if cb(dest, src.map(|t| t.0)) == crate::tree::visit::Action::Cancel { + return Ok(crate::tree::visit::Action::Cancel); + } + } + Ok(crate::tree::visit::Action::Continue) + } +} + +fn permutations_over_limit(items: &[Item], limit: usize, kind: visit::SourceKind) -> Option { + let (sources, destinations) = items + .iter() + .filter(|item| match kind { + visit::SourceKind::Rename => !item.emitted, + visit::SourceKind::Copy => true, + }) + .fold((0, 0), |(mut src, mut dest), item| { + match item.change.kind() { + ChangeKind::Addition => { + dest += 1; + } + ChangeKind::Deletion => { + if kind == visit::SourceKind::Rename { + src += 1 + } + } + ChangeKind::Modification => { + if kind == visit::SourceKind::Copy { + src += 1 + } + } + } + (src, dest) + }); + let permutations = sources * destinations; + (permutations > limit * limit).then_some(permutations) +} + +fn needs_exact_match(percentage: Option) -> bool { + percentage.map_or(true, |p| p >= 1.0) +} + +/// <`src_idx`, src, possibly diff stat> +type SourceTuple<'a, T> = (usize, &'a Item, Option); + +/// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`. +/// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity. +/// We also ignore emitted items entirely. 
+/// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition, or +/// any non-deletion otherwise. +/// Note that we always try to find by identity first even if a percentage is given as it's much faster and may reduce the set +/// of items to be searched. +#[allow(clippy::too_many_arguments)] +fn find_match<'a, T: Change>( + items: &'a [Item], + item: &Item, + item_idx: usize, + percentage: Option<(f32, crate::blob::Algorithm)>, + kind: visit::SourceKind, + stats: &mut Outcome, + objects: &dyn gix_object::Find, + buf1: &mut Vec, + buf2: &mut Vec, +) -> Result>, emit::Error> { + let (item_id, item_mode) = item.change.id_and_entry_mode(); + if needs_exact_match(percentage.map(|t| t.0)) || item_mode.is_link() { + let first_idx = items.partition_point(|a| a.change.id() < item_id); + let range = match items.get(first_idx..).map(|items| { + let end = items + .iter() + .position(|a| a.change.id() != item_id) + .map_or(items.len(), |idx| first_idx + idx); + first_idx..end + }) { + Some(range) => range, + None => return Ok(None), + }; + if range.is_empty() { + return Ok(None); + } + let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| { + src_idx += range.start; + (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None)) + }); + if let Some(src) = res { + return Ok(Some(src)); + } + } else { + let new = objects.find_blob(item_id, buf1)?; + let (percentage, algo) = percentage.expect("it's set to something below 1.0 and we assured this"); + debug_assert_eq!( + item.change.entry_mode().kind(), + EntryKind::Blob, + "symlinks are matched exactly, and trees aren't used here" + ); + for (can_idx, src) in items + .iter() + .enumerate() + .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)) + { + let old = objects.find_blob(src.change.id(), buf2)?; + // TODO: make sure we get attribute handling/worktree 
conversion and binary skips and filters right here. + let tokens = crate::blob::intern::InternedInput::new( + crate::blob::sources::byte_lines_with_terminator(old.data), + crate::blob::sources::byte_lines_with_terminator(new.data), + ); + let counts = crate::blob::diff( + algo, + &tokens, + crate::blob::sink::Counter::new(diff::Statistics { + removed_bytes: 0, + input: &tokens, + }), + ); + let similarity = (old.data.len() - counts.wrapped) as f32 / old.data.len().max(new.data.len()) as f32; + stats.num_similarity_checks += 1; + if similarity >= percentage { + return Ok(Some(( + can_idx, + src, + DiffLineStats { + removals: counts.removals, + insertions: counts.insertions, + before: tokens.before.len().try_into().expect("interner handles only u32"), + after: tokens.after.len().try_into().expect("interner handles only u32"), + } + .into(), + ))); + } + } + } + Ok(None) +} + +mod diff { + use std::ops::Range; + + pub struct Statistics<'a, 'data> { + pub removed_bytes: usize, + pub input: &'a crate::blob::intern::InternedInput<&'data [u8]>, + } + + impl<'a, 'data> crate::blob::Sink for Statistics<'a, 'data> { + type Out = usize; + + fn process_change(&mut self, before: Range, _after: Range) { + self.removed_bytes = self.input.before[before.start as usize..before.end as usize] + .iter() + .map(|token| self.input.interner[*token].len()) + .sum(); + } + + fn finish(self) -> Self::Out { + self.removed_bytes + } + } +} diff --git a/gix-diff/src/tree/visit.rs b/gix-diff/src/tree/visit.rs index 82e38931dc2..c279ed90888 100644 --- a/gix-diff/src/tree/visit.rs +++ b/gix-diff/src/tree/visit.rs @@ -92,6 +92,46 @@ pub trait Visit { fn visit(&mut self, change: Change) -> Action; } +#[cfg(feature = "blob")] +mod change_impls { + use crate::rewrites::tracker::ChangeKind; + use crate::tree::visit::Change; + use gix_hash::oid; + use gix_object::tree::EntryMode; + + impl crate::rewrites::tracker::Change for crate::tree::visit::Change { + fn id(&self) -> &oid { + match self { + 
Change::Addition { oid, .. } | Change::Deletion { oid, .. } | Change::Modification { oid, .. } => oid, + } + } + + fn kind(&self) -> ChangeKind { + match self { + Change::Addition { .. } => ChangeKind::Addition, + Change::Deletion { .. } => ChangeKind::Deletion, + Change::Modification { .. } => ChangeKind::Modification, + } + } + + fn entry_mode(&self) -> EntryMode { + match self { + Change::Addition { entry_mode, .. } + | Change::Deletion { entry_mode, .. } + | Change::Modification { entry_mode, .. } => *entry_mode, + } + } + + fn id_and_entry_mode(&self) -> (&oid, EntryMode) { + match self { + Change::Addition { entry_mode, oid, .. } + | Change::Deletion { entry_mode, oid, .. } + | Change::Modification { entry_mode, oid, .. } => (oid, *entry_mode), + } + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/gix-index/Cargo.toml b/gix-index/Cargo.toml index 9320daa51b0..cd845b326b7 100644 --- a/gix-index/Cargo.toml +++ b/gix-index/Cargo.toml @@ -41,6 +41,10 @@ bitflags = "2" document-features = { version = "0.2.0", optional = true } +[target.'cfg(not(windows))'.dependencies] +rustix = { version = "0.38.20", default-features = false, features = ["std", "fs"] } +libc = { version = "0.2.149" } + [package.metadata.docs.rs] features = ["document-features", "serde"] rustdoc-args = ["--cfg", "docsrs"] diff --git a/gix-index/src/entry/mode.rs b/gix-index/src/entry/mode.rs index 0301df43800..583c295bc7b 100644 --- a/gix-index/src/entry/mode.rs +++ b/gix-index/src/entry/mode.rs @@ -37,7 +37,7 @@ impl Mode { /// can not be committed to git). 
pub fn change_to_match_fs( self, - stat: &std::fs::Metadata, + stat: &crate::fs::Metadata, has_symlinks: bool, executable_bit: bool, ) -> Option { @@ -46,15 +46,13 @@ impl Mode { Mode::SYMLINK if has_symlinks && !stat.is_symlink() => (), Mode::SYMLINK if !has_symlinks && !stat.is_file() => (), Mode::COMMIT | Mode::DIR if !stat.is_dir() => (), - Mode::FILE if executable_bit && gix_fs::is_executable(stat) => return Some(Change::ExecutableBit), - Mode::FILE_EXECUTABLE if executable_bit && !gix_fs::is_executable(stat) => { - return Some(Change::ExecutableBit) - } + Mode::FILE if executable_bit && stat.is_executable() => return Some(Change::ExecutableBit), + Mode::FILE_EXECUTABLE if executable_bit && !stat.is_executable() => return Some(Change::ExecutableBit), _ => return None, }; let new_mode = if stat.is_dir() { Mode::COMMIT - } else if executable_bit && gix_fs::is_executable(stat) { + } else if executable_bit && stat.is_executable() { Mode::FILE_EXECUTABLE } else { Mode::FILE diff --git a/gix-index/src/entry/stat.rs b/gix-index/src/entry/stat.rs index 5e60f8540be..9e279e784ea 100644 --- a/gix-index/src/entry/stat.rs +++ b/gix-index/src/entry/stat.rs @@ -76,11 +76,11 @@ impl Stat { } /// Creates stat information from the result of `symlink_metadata`. - pub fn from_fs(fstat: &std::fs::Metadata) -> Result { - let mtime = fstat.modified().unwrap_or(std::time::UNIX_EPOCH); - let ctime = fstat.created().unwrap_or(std::time::UNIX_EPOCH); + pub fn from_fs(stat: &crate::fs::Metadata) -> Result { + let mtime = stat.modified().unwrap_or(std::time::UNIX_EPOCH); + let ctime = stat.created().unwrap_or(std::time::UNIX_EPOCH); - #[cfg(not(unix))] + #[cfg(windows)] let res = Stat { mtime: mtime.try_into()?, ctime: ctime.try_into()?, @@ -89,11 +89,10 @@ impl Stat { uid: 0, gid: 0, // truncation to 32 bits is on purpose (git does the same). 
- size: fstat.len() as u32, + size: stat.len() as u32, }; - #[cfg(unix)] + #[cfg(not(windows))] let res = { - use std::os::unix::fs::MetadataExt; Stat { mtime: mtime.try_into().unwrap_or_default(), ctime: ctime.try_into().unwrap_or_default(), @@ -101,12 +100,12 @@ impl Stat { // that's what the linux syscalls returns // just rust upcasts to 64 bits for some reason? // numbers this large are impractical anyway (that's a lot of hard-drives). - dev: fstat.dev() as u32, - ino: fstat.ino() as u32, - uid: fstat.uid(), - gid: fstat.gid(), + dev: stat.dev() as u32, + ino: stat.ino() as u32, + uid: stat.uid(), + gid: stat.gid(), // truncation to 32 bits is on purpose (git does the same). - size: fstat.len() as u32, + size: stat.len() as u32, } }; diff --git a/gix-index/src/fs.rs b/gix-index/src/fs.rs new file mode 100644 index 00000000000..21422f9b804 --- /dev/null +++ b/gix-index/src/fs.rs @@ -0,0 +1,166 @@ +//! This module contains a `Metadata` implementation that must be used instead of `std::fs::Metadata` to assure +//! that the `ctime` information is populated exactly like the one in `git`, which wouldn't be the case on unix. +#![allow(clippy::useless_conversion)] // on macOS some conversions are required, but on linux usually not. +#![allow(clippy::unnecessary_cast)] + +// it's allowed for good measure, in case there are systems that use different types for that. +use std::path::Path; +use std::time::{Duration, SystemTime}; + +/// A structure to partially mirror [`std::fs::Metadata`]. +#[cfg(not(windows))] +pub struct Metadata(rustix::fs::Stat); + +#[cfg(windows)] +/// A structure to partially mirror [`std::fs::Metadata`]. +pub struct Metadata(std::fs::Metadata); + +/// Lifecycle +impl Metadata { + /// Obtain the metadata at `path` without following symlinks.
+ pub fn from_path_no_follow(path: &Path) -> Result { + #[cfg(not(windows))] + { + rustix::fs::lstat(path).map(Metadata).map_err(Into::into) + } + #[cfg(windows)] + path.symlink_metadata().map(Metadata) + } + + /// Obtain the metadata of the already opened `file`. + pub fn from_file(file: &std::fs::File) -> Result { + #[cfg(not(windows))] + { + rustix::fs::fstat(file).map(Metadata).map_err(Into::into) + } + #[cfg(windows)] + file.metadata().map(Metadata) + } +} + +/// Access +#[allow(clippy::len_without_is_empty)] +impl Metadata { + /// Return true if the metadata belongs to a directory + pub fn is_dir(&self) -> bool { + #[cfg(not(windows))] + { + (self.0.st_mode & libc::S_IFMT) == libc::S_IFDIR + } + #[cfg(windows)] + self.0.is_dir() + } + + /// Return the time at which the underlying file was modified. + pub fn modified(&self) -> Option { + #[cfg(not(windows))] + { + Some(system_time_from_secs_nanos( + self.0.st_mtime.try_into().ok()?, + self.0.st_mtime_nsec.try_into().ok()?, + )) + } + #[cfg(windows)] + self.0.modified().ok() + } + + /// Return the time at which the underlying file was created. + /// + /// Note that this differs from [`std::fs::Metadata::created()`] which would return + /// the inode birth time, which is notably different to what `git` does. + pub fn created(&self) -> Option { + #[cfg(not(windows))] + { + Some(system_time_from_secs_nanos( + self.0.st_ctime.try_into().ok()?, + self.0.st_ctime_nsec.try_into().ok()?, + )) + } + #[cfg(windows)] + self.0.created().ok() + } + + /// Return the size of the file in bytes. + pub fn len(&self) -> u64 { + #[cfg(not(windows))] + { + self.0.st_size as u64 + } + #[cfg(windows)] + self.0.len() + } + + /// Return the device id on which the file is located, or 0 on windows. + pub fn dev(&self) -> u64 { + #[cfg(not(windows))] + { + self.0.st_dev as u64 + } + #[cfg(windows)] + 0 + } + + /// Return the inode id tracking the file, or 0 on windows.
+ pub fn ino(&self) -> u64 { + #[cfg(not(windows))] + { + self.0.st_ino as u64 + } + #[cfg(windows)] + 0 + } + + /// Return the user-id of the file or 0 on windows. + pub fn uid(&self) -> u32 { + #[cfg(not(windows))] + { + self.0.st_uid as u32 + } + #[cfg(windows)] + 0 + } + + /// Return the group-id of the file or 0 on windows. + pub fn gid(&self) -> u32 { + #[cfg(not(windows))] + { + self.0.st_gid as u32 + } + #[cfg(windows)] + 0 + } + + /// Return `true` if the file's executable bit is set, or `false` on windows. + pub fn is_executable(&self) -> bool { + #[cfg(not(windows))] + { + (self.0.st_mode & libc::S_IFMT) == libc::S_IFREG && self.0.st_mode & libc::S_IXUSR == libc::S_IXUSR + } + #[cfg(windows)] + gix_fs::is_executable(&self.0) + } + + /// Return `true` if the file is a symbolic link. + pub fn is_symlink(&self) -> bool { + #[cfg(not(windows))] + { + (self.0.st_mode & libc::S_IFMT) == libc::S_IFLNK + } + #[cfg(windows)] + self.0.is_symlink() + } + + /// Return `true` if this is a regular file, executable or not. + pub fn is_file(&self) -> bool { + #[cfg(not(windows))] + { + (self.0.st_mode & libc::S_IFMT) == libc::S_IFREG + } + #[cfg(windows)] + self.0.is_file() + } +} + +fn system_time_from_secs_nanos(secs: u64, nanos: u32) -> SystemTime { + std::time::UNIX_EPOCH + Duration::new(secs, nanos) +} diff --git a/gix-index/src/lib.rs b/gix-index/src/lib.rs index 55b332a8280..e54c4aaf1fa 100644 --- a/gix-index/src/lib.rs +++ b/gix-index/src/lib.rs @@ -33,6 +33,8 @@ pub mod verify; /// pub mod write; +pub mod fs; + /// All known versions of a git index file.
#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] diff --git a/gix-status/Cargo.toml b/gix-status/Cargo.toml index ea3263d0f35..0f7d58ddd59 100644 --- a/gix-status/Cargo.toml +++ b/gix-status/Cargo.toml @@ -20,11 +20,9 @@ gix-hash = { version = "^0.13.1", path = "../gix-hash" } gix-object = { version = "^0.38.0", path = "../gix-object" } gix-path = { version = "^0.10.0", path = "../gix-path" } gix-features = { version = "^0.36.0", path = "../gix-features" } -gix-pathspec = { version = "^0.4.0", path = "../gix-pathspec" } gix-filter = { version = "^0.6.0", path = "../gix-filter" } gix-worktree = { version = "^0.27.0", path = "../gix-worktree", default-features = false, features = ["attributes"] } thiserror = "1.0.26" filetime = "0.2.15" bstr = { version = "1.3.0", default-features = false } - diff --git a/gix-status/src/index_as_worktree/function.rs b/gix-status/src/index_as_worktree/function.rs index 7e1b9c86465..d14b2b7234d 100644 --- a/gix-status/src/index_as_worktree/function.rs +++ b/gix-status/src/index_as_worktree/function.rs @@ -25,9 +25,11 @@ use crate::{ /// `submodule` which can take a look at submodules in detail to produce status information (BASE version if its conflicting). /// `options` are used to configure the operation. /// -/// Note that `index` may require changes to be up-to-date with the working tree and avoid expensive computations by updating respective entries -/// with stat information from the worktree, and its timestamp is adjusted to the current time for which it will be considered fresh -/// as long as it is included which depends on `pathspec`. All this is delegated to the caller. 
+/// Note that `index` may require changes to be up-to-date with the working tree and avoid expensive computations by updating +/// respective entries with stat information from the worktree, and its timestamp is adjusted to the current time for which it +/// will be considered fresh. All changes that would be applied to the index are delegated to the caller, which receives these +/// as [`EntryStatus`]. +/// The `pathspec` is used to determine which index entries to check for status in the first place. /// /// `should_interrupt` can be used to stop all processing. /// `filter` is used to convert worktree files back to their internal git representation. For this to be correct, @@ -348,7 +350,7 @@ impl<'index> State<'_, 'index> { Err(err) => return Err(Error::Io(err)), }; self.symlink_metadata_calls.fetch_add(1, Ordering::Relaxed); - let metadata = match worktree_path.symlink_metadata() { + let metadata = match gix_index::fs::Metadata::from_path_no_follow(worktree_path) { Ok(metadata) if metadata.is_dir() => { // index entries are normally only for files/symlinks // if a file turned into a directory it was removed diff --git a/gix-worktree-state/src/checkout/entry.rs b/gix-worktree-state/src/checkout/entry.rs index b913c3bbda3..77db18daa1e 100644 --- a/gix-worktree-state/src/checkout/entry.rs +++ b/gix-worktree-state/src/checkout/entry.rs @@ -161,7 +161,7 @@ where file.close()?; } - entry.stat = Stat::from_fs(&std::fs::symlink_metadata(dest)?)?; + entry.stat = Stat::from_fs(&gix_index::fs::Metadata::from_path_no_follow(dest)?)?; obj.data.len() } gix_index::entry::Mode::DIR => { @@ -285,7 +285,7 @@ pub(crate) fn finalize_entry( } // NOTE: we don't call `file.sync_all()` here knowing that some filesystems don't handle this well. // revisit this once there is a bug to fix. 
- entry.stat = Stat::from_fs(&file.metadata()?)?; + entry.stat = Stat::from_fs(&gix_index::fs::Metadata::from_file(&file)?)?; file.close()?; Ok(()) } diff --git a/gix/src/config/cache/access.rs b/gix/src/config/cache/access.rs index e8363e1f6b7..ec3e7e1b424 100644 --- a/gix/src/config/cache/access.rs +++ b/gix/src/config/cache/access.rs @@ -93,13 +93,9 @@ impl Cache { } #[cfg(feature = "blob-diff")] - pub(crate) fn diff_renames( - &self, - ) -> Result, crate::object::tree::diff::rewrites::Error> { + pub(crate) fn diff_renames(&self) -> Result, crate::diff::new_rewrites::Error> { self.diff_renames - .get_or_try_init(|| { - crate::object::tree::diff::Rewrites::try_from_config(&self.resolved, self.lenient_config) - }) + .get_or_try_init(|| crate::diff::new_rewrites(&self.resolved, self.lenient_config)) .copied() } diff --git a/gix/src/config/mod.rs b/gix/src/config/mod.rs index 438c54378a9..f48575c174f 100644 --- a/gix/src/config/mod.rs +++ b/gix/src/config/mod.rs @@ -515,7 +515,7 @@ pub(crate) struct Cache { pub(crate) url_rewrite: OnceCell, /// The lazy-loaded rename information for diffs. #[cfg(feature = "blob-diff")] - pub(crate) diff_renames: OnceCell>, + pub(crate) diff_renames: OnceCell>, /// A lazily loaded mapping to know which url schemes to allow #[cfg(any(feature = "blocking-network-client", feature = "async-network-client"))] pub(crate) url_scheme: OnceCell, diff --git a/gix/src/diff.rs b/gix/src/diff.rs index b1081929394..445698cea39 100644 --- a/gix/src/diff.rs +++ b/gix/src/diff.rs @@ -15,3 +15,66 @@ pub mod rename { RenamesAndCopies, } } + +/// +#[cfg(feature = "blob-diff")] +mod utils { + use crate::config::cache::util::ApplyLeniency; + use crate::config::tree::Diff; + use crate::diff::rename::Tracking; + use gix_diff::rewrites::Copies; + use gix_diff::Rewrites; + + /// + pub mod new_rewrites { + /// The error returned by [`new_rewrites()`](super::new_rewrites()). 
+ #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error(transparent)] + ConfigDiffRenames(#[from] crate::config::key::GenericError), + #[error(transparent)] + ConfigDiffRenameLimit(#[from] crate::config::unsigned_integer::Error), + } + } + + /// Create an instance by reading all relevant information from the `config`uration, while being `lenient` or not. + /// Returns `Ok(None)` if nothing is configured. + /// + /// Note that missing values will be defaulted similar to what git does. + #[allow(clippy::result_large_err)] + pub fn new_rewrites( + config: &gix_config::File<'static>, + lenient: bool, + ) -> Result, new_rewrites::Error> { + let key = "diff.renames"; + let copies = match config + .boolean_by_key(key) + .map(|value| Diff::RENAMES.try_into_renames(value)) + .transpose() + .with_leniency(lenient)? + { + Some(renames) => match renames { + Tracking::Disabled => return Ok(None), + Tracking::Renames => None, + Tracking::RenamesAndCopies => Some(Copies::default()), + }, + None => return Ok(None), + }; + + let default = Rewrites::default(); + Ok(Rewrites { + copies, + limit: config + .integer_by_key("diff.renameLimit") + .map(|value| Diff::RENAME_LIMIT.try_into_usize(value)) + .transpose() + .with_leniency(lenient)? + .unwrap_or(default.limit), + ..default + } + .into()) + } +} +#[cfg(feature = "blob-diff")] +pub use utils::new_rewrites; diff --git a/gix/src/object/tree/diff/change.rs b/gix/src/object/tree/diff/change.rs index e6826d6ed32..a95770d6656 100644 --- a/gix/src/object/tree/diff/change.rs +++ b/gix/src/object/tree/diff/change.rs @@ -1,18 +1,6 @@ +use crate::diff::blob::DiffLineStats; use crate::{bstr::BStr, Id}; -/// Information about the diff performed to detect similarity of a [Rewrite][Event::Rewrite]. -#[derive(Debug, Default, Clone, Copy, Eq, PartialEq)] -pub struct DiffLineStats { - /// The amount of lines to remove from the source to get to the destination. 
- pub removals: u32, - /// The amount of lines to add to the source to get to the destination. - pub insertions: u32, - /// The amount of lines of the previous state, in the source. - pub before: u32, - /// The amount of lines of the new state, in the destination. - pub after: u32, -} - /// An event emitted when finding differences between two trees. #[derive(Debug, Clone, Copy)] pub enum Event<'a, 'old, 'new> { diff --git a/gix/src/object/tree/diff/for_each.rs b/gix/src/object/tree/diff/for_each.rs index cd9c60f547d..404e804327d 100644 --- a/gix/src/object/tree/diff/for_each.rs +++ b/gix/src/object/tree/diff/for_each.rs @@ -1,15 +1,8 @@ use gix_object::TreeRefIter; use super::{change, Action, Change, Platform}; -use crate::{ - bstr::BStr, - ext::ObjectIdExt, - object::tree::{ - diff, - diff::{rewrites, tracked}, - }, - Repository, Tree, -}; +use crate::diff::rewrites::tracker; +use crate::{bstr::BStr, diff::rewrites, ext::ObjectIdExt, object::tree::diff, Repository, Tree}; /// The error return by methods on the [diff platform][Platform]. 
#[derive(Debug, thiserror::Error)] @@ -19,12 +12,10 @@ pub enum Error { Diff(#[from] gix_diff::tree::changes::Error), #[error("The user-provided callback failed")] ForEach(#[source] Box), - #[error("Could not find blob for similarity checking")] - FindExistingBlob(#[from] crate::object::find::existing::Error), #[error("Could not configure diff algorithm prior to checking similarity")] ConfigureDiffAlgorithm(#[from] crate::config::diff::algorithm::Error), - #[error("Could not traverse tree to obtain possible sources for copies")] - TraverseTreeForExhaustiveCopyDetection(#[from] gix_traverse::tree::breadthfirst::Error), + #[error("Failure during rename tracking")] + RenameTracking(#[from] tracker::emit::Error), } /// @@ -49,12 +40,14 @@ impl<'a, 'old> Platform<'a, 'old> { E: std::error::Error + Sync + Send + 'static, { let repo = self.lhs.repo; + let diff_algo = repo.config.diff_algorithm()?; let mut delegate = Delegate { src_tree: self.lhs, other_repo: other.repo, recorder: gix_diff::tree::Recorder::default().track_location(self.tracking), visit: for_each, - tracked: self.rewrites.map(|r| tracked::State::new(r, self.tracking)), + location: self.tracking, + tracked: self.rewrites.map(|r| rewrites::Tracker::new(r, diff_algo)), err: None, }; match gix_diff::tree::Changes::from(TreeRefIter::from_bytes(&self.lhs.data)).needed_to_obtain( @@ -87,7 +80,8 @@ struct Delegate<'a, 'old, 'new, VisitFn, E> { other_repo: &'new Repository, recorder: gix_diff::tree::Recorder, visit: VisitFn, - tracked: Option, + tracked: Option>, + location: Option, err: Option, } @@ -151,14 +145,14 @@ where location: dest.location, event: diff::change::Event::Rewrite { source_location: source.location, - source_entry_mode: source.mode, + source_entry_mode: source.entry_mode, source_id: source.id.attach(self.src_tree.repo), entry_mode: mode, id: oid.to_owned().attach(self.other_repo), diff: source.diff, copy: match source.kind { - tracked::visit::Kind::RenameTarget => false, - 
tracked::visit::Kind::CopyDestination => true, + tracker::visit::SourceKind::Rename => false, + tracker::visit::SourceKind::Copy => true, }, }, }; @@ -180,7 +174,12 @@ where &mut self.err, ), }, - self.src_tree, + &self.src_tree.repo.objects, + |push| { + self.src_tree + .traverse() + .breadthfirst(&mut tree_to_changes::Delegate::new(push, self.location)) + }, )?; Ok(Some(outcome)) } @@ -233,3 +232,68 @@ where } } } + +mod tree_to_changes { + use gix_diff::tree::visit::Change; + use gix_object::tree::EntryRef; + + use crate::bstr::BStr; + + pub struct Delegate<'a> { + push: &'a mut dyn FnMut(Change, &BStr), + recorder: gix_traverse::tree::Recorder, + } + + impl<'a> Delegate<'a> { + pub fn new( + push: &'a mut dyn FnMut(Change, &BStr), + location: Option, + ) -> Self { + let location = location.map(|t| match t { + gix_diff::tree::recorder::Location::FileName => gix_traverse::tree::recorder::Location::FileName, + gix_diff::tree::recorder::Location::Path => gix_traverse::tree::recorder::Location::Path, + }); + Self { + push, + recorder: gix_traverse::tree::Recorder::default().track_location(location), + } + } + } + + impl gix_traverse::tree::Visit for Delegate<'_> { + fn pop_front_tracked_path_and_set_current(&mut self) { + self.recorder.pop_front_tracked_path_and_set_current() + } + + fn push_back_tracked_path_component(&mut self, component: &BStr) { + self.recorder.push_back_tracked_path_component(component) + } + + fn push_path_component(&mut self, component: &BStr) { + self.recorder.push_path_component(component) + } + + fn pop_path_component(&mut self) { + self.recorder.pop_path_component(); + } + + fn visit_tree(&mut self, _entry: &EntryRef<'_>) -> gix_traverse::tree::visit::Action { + gix_traverse::tree::visit::Action::Continue + } + + fn visit_nontree(&mut self, entry: &EntryRef<'_>) -> gix_traverse::tree::visit::Action { + if entry.mode.is_blob() { + (self.push)( + Change::Modification { + previous_entry_mode: entry.mode, + previous_oid: 
gix_hash::ObjectId::null(entry.oid.kind()), + entry_mode: entry.mode, + oid: entry.oid.to_owned(), + }, + self.recorder.path(), + ); + } + gix_traverse::tree::visit::Action::Continue + } + } +} diff --git a/gix/src/object/tree/diff/mod.rs b/gix/src/object/tree/diff/mod.rs index 5f7a041e4df..b5e6c5bae4d 100644 --- a/gix/src/object/tree/diff/mod.rs +++ b/gix/src/object/tree/diff/mod.rs @@ -1,5 +1,6 @@ use gix_diff::tree::recorder::Location; +use crate::diff::Rewrites; use crate::{bstr::BStr, Tree}; /// Returned by the `for_each` function to control flow. @@ -39,7 +40,7 @@ impl<'repo> Tree<'repo> { /// try to access blobs to compute a similarity metric. Thus, it's more compatible to turn rewrite tracking off /// using [`Platform::track_rewrites()`]. #[allow(clippy::result_large_err)] - pub fn changes<'a>(&'a self) -> Result, rewrites::Error> { + pub fn changes<'a>(&'a self) -> Result, crate::diff::new_rewrites::Error> { Ok(Platform { state: Default::default(), lhs: self, @@ -58,34 +59,6 @@ pub struct Platform<'a, 'repo> { rewrites: Option, } -/// A structure to capture how to perform rename and copy tracking -#[derive(Debug, Copy, Clone, PartialEq)] -pub struct Rewrites { - /// If `Some(…)`, do also find copies. `None` is the default which does not try to detect copies at all. - /// - /// Note that this is an even more expensive operation than detecting renames as files. - pub copies: Option, - /// The percentage of similarity needed for files to be considered renamed, defaulting to `Some(0.5)`. - /// This field is similar to `git diff -M50%`. - /// - /// If `None`, files are only considered equal if their content matches 100%. - /// Note that values greater than 1.0 have no different effect than 1.0. - pub percentage: Option, - /// The amount of files to consider for fuzzy rename or copy tracking. Defaults to 1000, meaning that only 1000*1000 - /// combinations can be tested for fuzzy matches, i.e. the ones that try to find matches by comparing similarity. 
- /// If 0, there is no limit. - /// - /// If the limit would not be enough to test the entire set of combinations, the algorithm will trade in precision and not - /// run the fuzzy version of identity tests at all. That way results are never partial. - pub limit: usize, -} - -/// -pub mod rewrites; - -/// types to actually perform rename tracking. -pub(crate) mod tracked; - /// Configuration impl<'a, 'repo> Platform<'a, 'repo> { /// Keep track of file-names, which makes the [`location`][Change::location] field usable with the filename of the changed item. diff --git a/gix/src/object/tree/diff/rewrites.rs b/gix/src/object/tree/diff/rewrites.rs deleted file mode 100644 index e434726d9e6..00000000000 --- a/gix/src/object/tree/diff/rewrites.rs +++ /dev/null @@ -1,108 +0,0 @@ -use crate::{ - config::{cache::util::ApplyLeniency, tree::Diff}, - diff::rename::Tracking, - object::tree::diff::Rewrites, -}; - -/// From where to source copies -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -pub enum CopySource { - /// Find copies from the set of modified files only. - FromSetOfModifiedFiles, - /// Find copies from the set of changed files, as well as all files known to the source (i.e. previous state) of the tree. - /// - /// This can be an expensive operation as it scales exponentially with the total amount of files in the tree. - FromSetOfModifiedFilesAndSourceTree, -} - -/// How to determine copied files. -#[derive(Debug, Copy, Clone, PartialEq)] -pub struct Copies { - /// The set of files to search when finding the source of copies. - pub source: CopySource, - /// Equivalent to [`Rewrites::percentage`], but used for copy tracking. - /// - /// Useful to have similarity-based rename tracking and cheaper copy tracking, which also is the default - /// as only identity plays a role. 
- pub percentage: Option, -} - -impl Default for Copies { - fn default() -> Self { - Copies { - source: CopySource::FromSetOfModifiedFiles, - percentage: Some(0.5), - } - } -} - -/// Information collected while handling rewrites of files which may be tracked. -#[derive(Default, Clone, Copy, Debug, PartialEq)] -pub struct Outcome { - /// The options used to guide the rewrite tracking. Either fully provided by the caller or retrieved from git configuration. - pub options: Rewrites, - /// The amount of similarity checks that have been conducted to find renamed files and potentially copies. - pub num_similarity_checks: usize, - /// Set to the amount of worst-case rename permutations we didn't search as our limit didn't allow it. - pub num_similarity_checks_skipped_for_rename_tracking_due_to_limit: usize, - /// Set to the amount of worst-case copy permutations we didn't search as our limit didn't allow it. - pub num_similarity_checks_skipped_for_copy_tracking_due_to_limit: usize, -} - -/// The error returned by [`Rewrites::try_from_config()`]. -#[derive(Debug, thiserror::Error)] -#[allow(missing_docs)] -pub enum Error { - #[error(transparent)] - ConfigDiffRenames(#[from] crate::config::key::GenericError), - #[error(transparent)] - ConfigDiffRenameLimit(#[from] crate::config::unsigned_integer::Error), -} - -/// The default settings for rewrites according to the git configuration defaults. -impl Default for Rewrites { - fn default() -> Self { - Rewrites { - copies: None, - percentage: Some(0.5), - limit: 1000, - } - } -} - -impl Rewrites { - /// Create an instance by reading all relevant information from the `config`uration, while being `lenient` or not. - /// Returns `Ok(None)` if nothing is configured. - /// - /// Note that missing values will be defaulted similar to what git does. 
- #[allow(clippy::result_large_err)] - pub fn try_from_config(config: &gix_config::File<'static>, lenient: bool) -> Result, Error> { - let key = "diff.renames"; - let copies = match config - .boolean_by_key(key) - .map(|value| Diff::RENAMES.try_into_renames(value)) - .transpose() - .with_leniency(lenient)? - { - Some(renames) => match renames { - Tracking::Disabled => return Ok(None), - Tracking::Renames => None, - Tracking::RenamesAndCopies => Some(Copies::default()), - }, - None => return Ok(None), - }; - - let default = Self::default(); - Ok(Rewrites { - copies, - limit: config - .integer_by_key("diff.renameLimit") - .map(|value| Diff::RENAME_LIMIT.try_into_usize(value)) - .transpose() - .with_leniency(lenient)? - .unwrap_or(default.limit), - ..default - } - .into()) - } -} diff --git a/gix/src/object/tree/diff/tracked.rs b/gix/src/object/tree/diff/tracked.rs deleted file mode 100644 index 318ce295063..00000000000 --- a/gix/src/object/tree/diff/tracked.rs +++ /dev/null @@ -1,554 +0,0 @@ -use std::ops::Range; - -use gix_diff::tree::visit::Change; -use gix_object::tree::{EntryKind, EntryMode}; - -use crate::{ - bstr::BStr, - ext::ObjectIdExt, - object::tree::diff::{ - change::DiffLineStats, - rewrites::{CopySource, Outcome}, - Rewrites, - }, - Repository, Tree, -}; - -/// A set of tracked items allows to figure out their relations by figuring out their similarity. -pub struct Item { - /// The underlying raw change - change: Change, - /// That slice into the backing for paths. - location: Range, - /// If true, this item was already emitted, i.e. seen by the caller. 
- emitted: bool, -} - -impl Item { - fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr { - backing[self.location.clone()].as_ref() - } - fn entry_mode_compatible(&self, mode: EntryMode) -> bool { - use EntryKind::*; - matches!( - (mode.kind(), self.change.entry_mode().kind()), - (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link) - ) - } - - fn is_source_for_destination_of(&self, kind: visit::Kind, dest_item_mode: EntryMode) -> bool { - self.entry_mode_compatible(dest_item_mode) - && match kind { - visit::Kind::RenameTarget => !self.emitted && matches!(self.change, Change::Deletion { .. }), - visit::Kind::CopyDestination => { - matches!(self.change, Change::Modification { .. }) - } - } - } -} - -pub struct State { - items: Vec, - path_backing: Vec, - rewrites: Rewrites, - tracking: Option, -} - -pub mod visit { - use crate::{bstr::BStr, object::tree::diff::change::DiffLineStats}; - - pub struct Source<'a> { - pub mode: gix_object::tree::EntryMode, - pub id: gix_hash::ObjectId, - pub kind: Kind, - pub location: &'a BStr, - pub diff: Option, - } - - #[derive(Debug, Copy, Clone, Eq, PartialEq)] - pub enum Kind { - RenameTarget, - CopyDestination, - } - - pub struct Destination<'a> { - pub change: gix_diff::tree::visit::Change, - pub location: &'a BStr, - } -} - -impl State { - pub(crate) fn new(renames: Rewrites, tracking: Option) -> Self { - State { - items: vec![], - path_backing: vec![], - rewrites: renames, - tracking, - } - } -} - -/// build state and find matches. -impl State { - /// We may refuse the push if that information isn't needed for what we have to track. - pub fn try_push_change(&mut self, change: Change, location: &BStr) -> Option { - if !change.entry_mode().is_blob_or_symlink() { - return Some(change); - } - let keep = match (self.rewrites.copies, &change) { - (Some(_find_copies), _) => true, - (None, Change::Modification { .. 
}) => false, - (None, _) => true, - }; - - if !keep { - return Some(change); - } - - let start = self.path_backing.len(); - self.path_backing.extend_from_slice(location); - self.items.push(Item { - location: start..self.path_backing.len(), - change, - emitted: false, - }); - None - } - - /// Can only be called once effectively as it alters its own state. - /// - /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's - /// the destination of a copy or rename, or with `None` for source if no relation to other - /// items in the tracked set exist. - pub fn emit( - &mut self, - mut cb: impl FnMut(visit::Destination<'_>, Option>) -> gix_diff::tree::visit::Action, - src_tree: &Tree<'_>, - ) -> Result { - fn by_id_and_location(a: &Item, b: &Item) -> std::cmp::Ordering { - a.change.oid().cmp(b.change.oid()).then_with(|| { - a.location - .start - .cmp(&b.location.start) - .then(a.location.end.cmp(&b.location.end)) - }) - } - self.items.sort_by(by_id_and_location); - - let mut out = Outcome { - options: self.rewrites, - ..Default::default() - }; - out = self.match_pairs_of_kind( - visit::Kind::RenameTarget, - &mut cb, - self.rewrites.percentage, - out, - src_tree.repo, - )?; - - if let Some(copies) = self.rewrites.copies { - out = self.match_pairs_of_kind( - visit::Kind::CopyDestination, - &mut cb, - copies.percentage, - out, - src_tree.repo, - )?; - - match copies.source { - CopySource::FromSetOfModifiedFiles => {} - CopySource::FromSetOfModifiedFilesAndSourceTree => { - src_tree - .traverse() - .breadthfirst(&mut tree_to_events::Delegate::new(self))?; - self.items.sort_by(by_id_and_location); - - out = self.match_pairs_of_kind( - visit::Kind::CopyDestination, - &mut cb, - copies.percentage, - out, - src_tree.repo, - )?; - } - } - } - - self.items - .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing))); - for item in self.items.drain(..).filter(|item| !item.emitted) { - if cb( - visit::Destination { - 
location: item.location(&self.path_backing), - change: item.change, - }, - None, - ) == gix_diff::tree::visit::Action::Cancel - { - break; - } - } - Ok(out) - } - - fn match_pairs_of_kind( - &mut self, - kind: visit::Kind, - cb: &mut impl FnMut(visit::Destination<'_>, Option>) -> gix_diff::tree::visit::Action, - percentage: Option, - mut out: Outcome, - repo: &Repository, - ) -> Result { - // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively. - let needs_second_pass = !needs_exact_match(percentage); - if self.match_pairs(cb, None /* by identity */, kind, repo, &mut out)? == gix_diff::tree::visit::Action::Cancel - { - return Ok(out); - } - if needs_second_pass { - let is_limited = if self.rewrites.limit == 0 { - false - } else if let Some(permutations) = permutations_over_limit(&self.items, self.rewrites.limit, kind) { - match kind { - visit::Kind::RenameTarget => { - out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations; - } - visit::Kind::CopyDestination => { - out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations; - } - } - true - } else { - false - }; - if !is_limited { - self.match_pairs(cb, self.rewrites.percentage, kind, repo, &mut out)?; - } - } - Ok(out) - } - - fn match_pairs( - &mut self, - cb: &mut impl FnMut(visit::Destination<'_>, Option>) -> gix_diff::tree::visit::Action, - percentage: Option, - kind: visit::Kind, - repo: &Repository, - stats: &mut Outcome, - ) -> Result { - // TODO(perf): reuse object data and interner state and interned tokens, make these available to `find_match()` - let mut dest_ofs = 0; - while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| { - (!item.emitted && matches!(item.change, Change::Addition { .. 
})).then_some((idx, item)) - }) { - dest_idx += dest_ofs; - dest_ofs = dest_idx + 1; - let src = - find_match(&self.items, dest, dest_idx, percentage, kind, repo, stats)?.map(|(src_idx, src, diff)| { - let (id, mode) = src.change.oid_and_entry_mode(); - let id = id.to_owned(); - let location = src.location(&self.path_backing); - ( - visit::Source { - mode, - id, - kind, - location, - diff, - }, - src_idx, - ) - }); - if src.is_none() { - continue; - } - let location = dest.location(&self.path_backing); - let change = dest.change.clone(); - let dest = visit::Destination { change, location }; - self.items[dest_idx].emitted = true; - if let Some(src_idx) = src.as_ref().map(|t| t.1) { - self.items[src_idx].emitted = true; - } - if cb(dest, src.map(|t| t.0)) == gix_diff::tree::visit::Action::Cancel { - return Ok(gix_diff::tree::visit::Action::Cancel); - } - } - Ok(gix_diff::tree::visit::Action::Continue) - } -} - -fn permutations_over_limit(items: &[Item], limit: usize, kind: visit::Kind) -> Option { - let (sources, destinations) = items - .iter() - .filter(|item| match kind { - visit::Kind::RenameTarget => !item.emitted, - visit::Kind::CopyDestination => true, - }) - .fold((0, 0), |(mut src, mut dest), item| { - match item.change { - Change::Addition { .. } => { - dest += 1; - } - Change::Deletion { .. } => { - if kind == visit::Kind::RenameTarget { - src += 1 - } - } - Change::Modification { .. } => { - if kind == visit::Kind::CopyDestination { - src += 1 - } - } - } - (src, dest) - }); - let permutations = sources * destinations; - (permutations > limit * limit).then_some(permutations) -} - -fn needs_exact_match(percentage: Option) -> bool { - percentage.map_or(true, |p| p >= 1.0) -} - -/// <`src_idx`, src, possibly diff stat> -type SourceTuple<'a> = (usize, &'a Item, Option); - -/// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`. 
-/// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity. -/// We also ignore emitted items entirely. -/// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition, or -/// any non-deletion otherwise. -/// Note that we always try to find by identity first even if a percentage is given as it's much faster and may reduce the set -/// of items to be searched. -fn find_match<'a>( - items: &'a [Item], - item: &Item, - item_idx: usize, - percentage: Option, - kind: visit::Kind, - repo: &Repository, - stats: &mut Outcome, -) -> Result>, crate::object::tree::diff::for_each::Error> { - let (item_id, item_mode) = item.change.oid_and_entry_mode(); - if needs_exact_match(percentage) || item_mode.is_link() { - let first_idx = items.partition_point(|a| a.change.oid() < item_id); - let range = match items.get(first_idx..).map(|items| { - let end = items - .iter() - .position(|a| a.change.oid() != item_id) - .map_or(items.len(), |idx| first_idx + idx); - first_idx..end - }) { - Some(range) => range, - None => return Ok(None), - }; - if range.is_empty() { - return Ok(None); - } - let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| { - src_idx += range.start; - (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None)) - }); - if let Some(src) = res { - return Ok(Some(src)); - } - } else { - let new = item_id.to_owned().attach(repo).object()?; - let percentage = percentage.expect("it's set to something below 1.0 and we assured this"); - debug_assert!( - item.change.entry_mode().is_blob(), - "symlinks are matched exactly, and trees aren't used here" - ); - let algo = repo.config.diff_algorithm()?; - for (can_idx, src) in items - .iter() - .enumerate() - .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)) - { - let old = 
src.change.oid().to_owned().attach(repo).object()?; - // TODO: make sure we get attribute handling and binary skips and filters right here. There is crate::object::blob::diff::Platform - // which should have facilities for that one day, but we don't use it because we need newlines in our tokens. - let tokens = gix_diff::blob::intern::InternedInput::new( - gix_diff::blob::sources::byte_lines_with_terminator(&old.data), - gix_diff::blob::sources::byte_lines_with_terminator(&new.data), - ); - let counts = gix_diff::blob::diff( - algo, - &tokens, - gix_diff::blob::sink::Counter::new(diff::Statistics { - removed_bytes: 0, - input: &tokens, - }), - ); - let similarity = (old.data.len() - counts.wrapped) as f32 / old.data.len().max(new.data.len()) as f32; - stats.num_similarity_checks += 1; - if similarity >= percentage { - return Ok(Some(( - can_idx, - src, - DiffLineStats { - removals: counts.removals, - insertions: counts.insertions, - before: tokens.before.len().try_into().expect("interner handles only u32"), - after: tokens.after.len().try_into().expect("interner handles only u32"), - } - .into(), - ))); - } - } - } - Ok(None) -} - -mod diff { - use std::ops::Range; - - pub struct Statistics<'a, 'data> { - pub removed_bytes: usize, - pub input: &'a gix_diff::blob::intern::InternedInput<&'data [u8]>, - } - - impl<'a, 'data> gix_diff::blob::Sink for Statistics<'a, 'data> { - type Out = usize; - - fn process_change(&mut self, before: Range, _after: Range) { - self.removed_bytes += self.input.before[before.start as usize..before.end as usize] - .iter() - .map(|token| self.input.interner[*token].len()) - .sum::(); - } - - fn finish(self) -> Self::Out { - self.removed_bytes - } - } - - #[cfg(test)] - mod tests { - fn removed_bytes(before: &[u8], after: &[u8]) -> usize { - let input = gix_diff::blob::intern::InternedInput::new(before, after); - gix_diff::blob::diff( - gix_diff::blob::Algorithm::Myers, - &input, - super::Statistics { - removed_bytes: 0, - input: &input, - }, 
- ) - } - - #[test] - fn counts_removed_bytes_correctly() { - assert_eq!(1, removed_bytes(b"a", b"")); - assert_eq!(0, removed_bytes(b"", b"a")); - - // need the inputs to have more than one "hunk" - // of differences to stress the fact that - // process_change is called multiple times - let before = b" -a -a -a -b -b -b -b -a -a -a -a -a -b -b -a -a -a -a -a -"; - // it's `before`, with the "b" lines removed - let after = b" -a -a -a -a -a -a -a -a -a -a -a -a -a - "; - - assert_eq!(6, removed_bytes(before, after)); - } - } -} - -mod tree_to_events { - use gix_diff::tree::visit::Change; - use gix_object::tree::EntryRef; - - use crate::bstr::BStr; - - pub struct Delegate<'a> { - parent: &'a mut super::State, - recorder: gix_traverse::tree::Recorder, - } - - impl<'a> Delegate<'a> { - pub fn new(parent: &'a mut super::State) -> Self { - let tracking = parent.tracking.map(|t| match t { - gix_diff::tree::recorder::Location::FileName => gix_traverse::tree::recorder::Location::FileName, - gix_diff::tree::recorder::Location::Path => gix_traverse::tree::recorder::Location::Path, - }); - Self { - parent, - recorder: gix_traverse::tree::Recorder::default().track_location(tracking), - } - } - } - - impl gix_traverse::tree::Visit for Delegate<'_> { - fn pop_front_tracked_path_and_set_current(&mut self) { - self.recorder.pop_front_tracked_path_and_set_current() - } - - fn push_back_tracked_path_component(&mut self, component: &BStr) { - self.recorder.push_back_tracked_path_component(component) - } - - fn push_path_component(&mut self, component: &BStr) { - self.recorder.push_path_component(component) - } - - fn pop_path_component(&mut self) { - self.recorder.pop_path_component(); - } - - fn visit_tree(&mut self, _entry: &EntryRef<'_>) -> gix_traverse::tree::visit::Action { - gix_traverse::tree::visit::Action::Continue - } - - fn visit_nontree(&mut self, entry: &EntryRef<'_>) -> gix_traverse::tree::visit::Action { - if entry.mode.is_blob() { - self.parent.try_push_change( - 
Change::Modification { - previous_entry_mode: entry.mode, - previous_oid: gix_hash::ObjectId::null(entry.oid.kind()), - entry_mode: entry.mode, - oid: entry.oid.to_owned(), - }, - self.recorder.path(), - ); - // make sure these aren't viable to be emitted anymore. - self.parent.items.last_mut().expect("just pushed").emitted = true; - } - gix_traverse::tree::visit::Action::Continue - } - } -} diff --git a/gix/tests/object/tree/diff.rs b/gix/tests/object/tree/diff.rs index 75a6549d8ea..c3304965b3f 100644 --- a/gix/tests/object/tree/diff.rs +++ b/gix/tests/object/tree/diff.rs @@ -104,11 +104,12 @@ fn tree_named(repo: &gix::Repository, rev_spec: impl AsRef) -> gix::Tree { mod track_rewrites { use std::convert::Infallible; - use gix::object::tree::diff::{ - change::{DiffLineStats, Event}, + use gix::diff::blob::DiffLineStats; + use gix::diff::{ rewrites::{Copies, CopySource}, Rewrites, }; + use gix::object::tree::diff::change::Event; use gix_ref::bstr::BStr; use crate::{ @@ -484,7 +485,7 @@ mod track_rewrites { .track_rewrites( Rewrites { copies: Some(Copies { - source: CopySource::FromSetOfModifiedFilesAndSourceTree, + source: CopySource::FromSetOfModifiedFilesAndAllSources, ..Default::default() }), ..Default::default() @@ -555,7 +556,7 @@ mod track_rewrites { .track_rewrites( Rewrites { copies: Some(Copies { - source: CopySource::FromSetOfModifiedFilesAndSourceTree, + source: CopySource::FromSetOfModifiedFilesAndAllSources, ..Default::default() }), limit: 2, // similarity checks can't be made that way