|
1 | 1 | use crate::common::Config;
|
| 2 | +use std::cmp; |
2 | 3 | use std::env;
|
3 | 4 | use std::ffi::OsStr;
|
| 5 | +use std::mem; |
4 | 6 | use std::path::PathBuf;
|
5 | 7 | use std::process::Command;
|
6 | 8 |
|
@@ -76,3 +78,256 @@ pub fn add_dylib_path(cmd: &mut Command, paths: impl Iterator<Item = impl Into<P
|
76 | 78 | let new_paths = paths.map(Into::into).chain(old_paths.into_iter().flatten());
|
77 | 79 | cmd.env(dylib_env_var(), env::join_paths(new_paths).unwrap());
|
78 | 80 | }
|
| 81 | + |
| 82 | +// Edit distance taken from rustc's `rustc_span::edit_distance`. |
| 83 | + |
| 84 | +/// Finds the [edit distance] between two strings. |
| 85 | +/// |
| 86 | +/// Returns `None` if the distance exceeds the limit. |
| 87 | +/// |
| 88 | +/// [edit distance]: https://en.wikipedia.org/wiki/Edit_distance |
| 89 | +pub fn edit_distance(a: &str, b: &str, limit: usize) -> Option<usize> { |
| 90 | + let mut a = &a.chars().collect::<Vec<_>>()[..]; |
| 91 | + let mut b = &b.chars().collect::<Vec<_>>()[..]; |
| 92 | + |
| 93 | + // Ensure that `b` is the shorter string, minimizing memory use. |
| 94 | + if a.len() < b.len() { |
| 95 | + mem::swap(&mut a, &mut b); |
| 96 | + } |
| 97 | + |
| 98 | + let min_dist = a.len() - b.len(); |
| 99 | + // If we know the limit will be exceeded, we can return early. |
| 100 | + if min_dist > limit { |
| 101 | + return None; |
| 102 | + } |
| 103 | + |
| 104 | + // Strip common prefix. |
| 105 | + loop { |
| 106 | + let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_first().zip(a.split_first()) |
| 107 | + else { |
| 108 | + break; |
| 109 | + }; |
| 110 | + |
| 111 | + if a_char != b_char { |
| 112 | + break; |
| 113 | + } |
| 114 | + |
| 115 | + a = a_rest; |
| 116 | + b = b_rest; |
| 117 | + } |
| 118 | + |
| 119 | + // Strip common suffix. |
| 120 | + loop { |
| 121 | + let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_last().zip(a.split_last()) else { |
| 122 | + break; |
| 123 | + }; |
| 124 | + |
| 125 | + if a_char != b_char { |
| 126 | + break; |
| 127 | + } |
| 128 | + |
| 129 | + a = a_rest; |
| 130 | + b = b_rest; |
| 131 | + } |
| 132 | + |
| 133 | + // If either string is empty, the distance is the length of the other. |
| 134 | + // We know that `b` is the shorter string, so we don't need to check `a`. |
| 135 | + if b.len() == 0 { |
| 136 | + return Some(min_dist); |
| 137 | + } |
| 138 | + |
| 139 | + let mut prev_prev = vec![usize::MAX; b.len() + 1]; |
| 140 | + let mut prev = (0..=b.len()).collect::<Vec<_>>(); |
| 141 | + let mut current = vec![0; b.len() + 1]; |
| 142 | + |
| 143 | + // row by row |
| 144 | + for i in 1..=a.len() { |
| 145 | + current[0] = i; |
| 146 | + let a_idx = i - 1; |
| 147 | + |
| 148 | + // column by column |
| 149 | + for j in 1..=b.len() { |
| 150 | + let b_idx = j - 1; |
| 151 | + |
| 152 | + // There is no cost to substitute a character with itself. |
| 153 | + let substitution_cost = if a[a_idx] == b[b_idx] { 0 } else { 1 }; |
| 154 | + |
| 155 | + current[j] = cmp::min( |
| 156 | + // deletion |
| 157 | + prev[j] + 1, |
| 158 | + cmp::min( |
| 159 | + // insertion |
| 160 | + current[j - 1] + 1, |
| 161 | + // substitution |
| 162 | + prev[j - 1] + substitution_cost, |
| 163 | + ), |
| 164 | + ); |
| 165 | + |
| 166 | + if (i > 1) && (j > 1) && (a[a_idx] == b[b_idx - 1]) && (a[a_idx - 1] == b[b_idx]) { |
| 167 | + // transposition |
| 168 | + current[j] = cmp::min(current[j], prev_prev[j - 2] + 1); |
| 169 | + } |
| 170 | + } |
| 171 | + |
| 172 | + // Rotate the buffers, reusing the memory. |
| 173 | + [prev_prev, prev, current] = [prev, current, prev_prev]; |
| 174 | + } |
| 175 | + |
| 176 | + // `prev` because we already rotated the buffers. |
| 177 | + let distance = prev[b.len()]; |
| 178 | + (distance <= limit).then_some(distance) |
| 179 | +} |
| 180 | + |
| 181 | +/// Provides a word similarity score between two words that accounts for substrings being more |
| 182 | +/// meaningful than a typical edit distance. The lower the score, the closer the match. 0 is an |
| 183 | +/// identical match. |
| 184 | +/// |
| 185 | +/// Uses the edit distance between the two strings and removes the cost of the length difference. |
| 186 | +/// If this is 0 then it is either a substring match or a full word match, in the substring match |
| 187 | +/// case we detect this and return `1`. To prevent finding meaningless substrings, eg. "in" in |
| 188 | +/// "shrink", we only perform this subtraction of length difference if one of the words is not |
| 189 | +/// greater than twice the length of the other. For cases where the words are close in size but not |
| 190 | +/// an exact substring then the cost of the length difference is discounted by half. |
| 191 | +/// |
| 192 | +/// Returns `None` if the distance exceeds the limit. |
| 193 | +pub fn edit_distance_with_substrings(a: &str, b: &str, limit: usize) -> Option<usize> { |
| 194 | + let n = a.chars().count(); |
| 195 | + let m = b.chars().count(); |
| 196 | + |
| 197 | + // Check one isn't less than half the length of the other. If this is true then there is a |
| 198 | + // big difference in length. |
| 199 | + let big_len_diff = (n * 2) < m || (m * 2) < n; |
| 200 | + let len_diff = if n < m { m - n } else { n - m }; |
| 201 | + let distance = edit_distance(a, b, limit + len_diff)?; |
| 202 | + |
| 203 | + // This is the crux, subtracting length difference means exact substring matches will now be 0 |
| 204 | + let score = distance - len_diff; |
| 205 | + |
| 206 | + // If the score is 0 but the words have different lengths then it's a substring match not a full |
| 207 | + // word match |
| 208 | + let score = if score == 0 && len_diff > 0 && !big_len_diff { |
| 209 | + 1 // Exact substring match, but not a total word match so return non-zero |
| 210 | + } else if !big_len_diff { |
| 211 | + // Not a big difference in length, discount cost of length difference |
| 212 | + score + (len_diff + 1) / 2 |
| 213 | + } else { |
| 214 | + // A big difference in length, add back the difference in length to the score |
| 215 | + score + len_diff |
| 216 | + }; |
| 217 | + |
| 218 | + (score <= limit).then_some(score) |
| 219 | +} |
| 220 | + |
| 221 | +/// Finds the best match for given word in the given iterator where substrings are meaningful. |
| 222 | +/// |
| 223 | +/// A version of [`find_best_match_for_name`] that uses [`edit_distance_with_substrings`] as the |
| 224 | +/// score for word similarity. This takes an optional distance limit which defaults to one-third of |
| 225 | +/// the given word. |
| 226 | +/// |
| 227 | +/// We use case insensitive comparison to improve accuracy on an edge case with a lower(upper)case |
| 228 | +/// letters mismatch. |
| 229 | +pub fn find_best_match_for_name_with_substrings<'c, 'd, 'l>( |
| 230 | + candidates: &'c [&'d str], |
| 231 | + lookup: &'l str, |
| 232 | + dist: Option<usize>, |
| 233 | +) -> Option<&'d str> { |
| 234 | + find_best_match_for_name_impl(true, candidates, lookup, dist) |
| 235 | +} |
| 236 | + |
| 237 | +/// Finds the best match for a given word in the given iterator. |
| 238 | +/// |
| 239 | +/// As a loose rule to avoid the obviously incorrect suggestions, it takes |
| 240 | +/// an optional limit for the maximum allowable edit distance, which defaults |
| 241 | +/// to one-third of the given word. |
| 242 | +/// |
| 243 | +/// We use case insensitive comparison to improve accuracy on an edge case with a lower(upper)case |
| 244 | +/// letters mismatch. |
| 245 | +pub fn find_best_match_for_name<'c, 'd, 'l>( |
| 246 | + candidates: &'c [&'d str], |
| 247 | + lookup: &'l str, |
| 248 | + dist: Option<usize>, |
| 249 | +) -> Option<&'d str> { |
| 250 | + find_best_match_for_name_impl(false, candidates, lookup, dist) |
| 251 | +} |
| 252 | + |
| 253 | +#[cold] |
| 254 | +fn find_best_match_for_name_impl<'c, 'd, 'l>( |
| 255 | + use_substring_score: bool, |
| 256 | + candidates: &'c [&'d str], |
| 257 | + lookup: &'l str, |
| 258 | + dist: Option<usize>, |
| 259 | +) -> Option<&'d str> { |
| 260 | + let lookup_uppercase = lookup.to_uppercase(); |
| 261 | + |
| 262 | + // Priority of matches: |
| 263 | + // 1. Exact case insensitive match |
| 264 | + // 2. Edit distance match |
| 265 | + // 3. Sorted word match |
| 266 | + if let Some(c) = candidates.iter().find(|c| c.to_uppercase() == lookup_uppercase) { |
| 267 | + return Some(*c); |
| 268 | + } |
| 269 | + |
| 270 | + // `fn edit_distance()` use `chars()` to calculate edit distance, so we must |
| 271 | + // also use `chars()` (and not `str::len()`) to calculate length here. |
| 272 | + let lookup_len = lookup.chars().count(); |
| 273 | + |
| 274 | + let mut dist = dist.unwrap_or_else(|| cmp::max(lookup_len, 3) / 3); |
| 275 | + let mut best = None; |
| 276 | + // store the candidates with the same distance, only for `use_substring_score` current. |
| 277 | + let mut next_candidates = vec![]; |
| 278 | + for c in candidates { |
| 279 | + match if use_substring_score { |
| 280 | + edit_distance_with_substrings(lookup, c, dist) |
| 281 | + } else { |
| 282 | + edit_distance(lookup, c, dist) |
| 283 | + } { |
| 284 | + Some(0) => return Some(*c), |
| 285 | + Some(d) => { |
| 286 | + if use_substring_score { |
| 287 | + if d < dist { |
| 288 | + dist = d; |
| 289 | + next_candidates.clear(); |
| 290 | + } else { |
| 291 | + // `d == dist` here, we need to store the candidates with the same distance |
| 292 | + // so we won't decrease the distance in the next loop. |
| 293 | + } |
| 294 | + next_candidates.push(*c); |
| 295 | + } else { |
| 296 | + dist = d - 1; |
| 297 | + } |
| 298 | + best = Some(*c); |
| 299 | + } |
| 300 | + None => {} |
| 301 | + } |
| 302 | + } |
| 303 | + |
| 304 | + // We have a tie among several candidates, try to select the best among them ignoring substrings. |
| 305 | + // For example, the candidates list `force_capture`, `capture`, and user inputted `forced_capture`, |
| 306 | + // we select `force_capture` with a extra round of edit distance calculation. |
| 307 | + if next_candidates.len() > 1 { |
| 308 | + debug_assert!(use_substring_score); |
| 309 | + best = find_best_match_for_name_impl(false, &next_candidates, lookup, Some(lookup.len())); |
| 310 | + } |
| 311 | + if best.is_some() { |
| 312 | + return best; |
| 313 | + } |
| 314 | + |
| 315 | + find_match_by_sorted_words(candidates, lookup) |
| 316 | +} |
| 317 | + |
| 318 | +fn find_match_by_sorted_words<'c, 'd, 'l>( |
| 319 | + iter_names: &'c [&'d str], |
| 320 | + lookup: &'l str, |
| 321 | +) -> Option<&'d str> { |
| 322 | + let lookup_sorted_by_words = sort_by_words(lookup); |
| 323 | + iter_names.iter().fold(None, |result, candidate| { |
| 324 | + if sort_by_words(candidate) == lookup_sorted_by_words { Some(*candidate) } else { result } |
| 325 | + }) |
| 326 | +} |
| 327 | + |
| 328 | +fn sort_by_words(name: &str) -> Vec<&str> { |
| 329 | + let mut split_words: Vec<&str> = name.split('_').collect(); |
| 330 | + // We are sorting primitive &strs and can use unstable sort here. |
| 331 | + split_words.sort_unstable(); |
| 332 | + split_words |
| 333 | +} |
0 commit comments