Skip to content

Commit 4dfe2ee

Browse files
committed
worker: add a job to check for potential typosquatting
This only fires when new crates are published: updates to existing crates will not cause this job to run. On a technical level, the major impact here is that the background worker will keep an in-memory cache of the top 3000 crates and their owners. I don't expect the impact of this to be significant in practice. As this is an experiment at present, configuration is hardcoded into the new worker job module. If this becomes a longer-term thing, it would be split out into our normal configuration system for easier management.
1 parent 10a5672 commit 4dfe2ee

File tree

11 files changed

+707
-2
lines changed

11 files changed

+707
-2
lines changed

Cargo.lock

Lines changed: 29 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,12 +88,20 @@ tar = "=0.4.40"
8888
tempfile = "=3.8.0"
8989
thiserror = "=1.0.49"
9090
threadpool = "=1.8.1"
91-
tokio = { version = "=1.32.0", features = ["net", "signal", "io-std", "io-util", "rt-multi-thread", "macros"]}
91+
tokio = { version = "=1.32.0", features = [
92+
"net",
93+
"signal",
94+
"io-std",
95+
"io-util",
96+
"rt-multi-thread",
97+
"macros",
98+
] }
9299
toml = "=0.8.1"
93100
tower = "=0.4.13"
94101
tower-http = { version = "=0.4.4", features = ["fs", "catch-panic"] }
95102
tracing = "=0.1.37"
96103
tracing-subscriber = { version = "=0.3.17", features = ["env-filter"] }
104+
typomania = { version = "=0.1.0", default-features = false }
97105
url = "=2.4.1"
98106

99107
[dev-dependencies]

src/background_jobs.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ macro_rules! job_variant_from_value {
7777

7878
jobs! {
7979
pub enum Job {
80+
CheckTyposquat(CheckTyposquatJob),
8081
DailyDbMaintenance,
8182
DumpDb(DumpDbJob),
8283
NormalizeIndex(NormalizeIndexJob),
@@ -166,6 +167,12 @@ impl Job {
166167
Ok(())
167168
}
168169

170+
pub fn check_typosquat<T: ToString>(krate: T) -> Self {
171+
Self::CheckTyposquat(CheckTyposquatJob {
172+
krate: krate.to_string(),
173+
})
174+
}
175+
169176
pub fn daily_db_maintenance() -> Self {
170177
Self::DailyDbMaintenance
171178
}
@@ -250,6 +257,7 @@ impl Job {
250257
.as_ref()
251258
.expect("Application should configure a background runner environment");
252259
match self {
260+
Job::CheckTyposquat(args) => worker::check_typosquat(env, conn, &args.krate),
253261
Job::DailyDbMaintenance => {
254262
worker::perform_daily_db_maintenance(&mut *fresh_connection(pool)?)
255263
}
@@ -298,6 +306,11 @@ pub struct AddCrateJob {
298306
pub(super) krate: crates_io_index::Crate,
299307
}
300308

309+
/// Payload of the `Job::CheckTyposquat` variant.
#[derive(Serialize, Deserialize)]
310+
pub struct CheckTyposquatJob {
311+
/// Crate name handed to `worker::check_typosquat` when the job runs.
pub(super) krate: String,
312+
}
313+
301314
#[derive(Serialize, Deserialize)]
302315
pub struct UpdateCrateIndexJob {
303316
pub(super) crate_name: String,

src/controllers/krate/publish.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,11 @@ pub async fn publish(app: AppState, req: BytesRequest) -> AppResult<Json<GoodCra
290290

291291
Job::enqueue_sync_to_index(&krate.name, conn)?;
292292

293+
// Experiment: check new crates for potential typosquatting.
294+
if existing_crate.is_none() {
295+
Job::check_typosquat(&krate.name).enqueue(conn)?;
296+
}
297+
293298
// The `other` field on `PublishWarnings` was introduced to handle a temporary warning
294299
// that is no longer needed. As such, crates.io currently does not return any `other`
295300
// warnings at this time, but if we need to, the field is available.

src/email.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,32 @@ or go to https://{domain}/me/pending-invites to manage all of your crate ownersh
9191
self.send(email, subject, &body)
9292
}
9393

94+
/// Attempts to send a notification that a new crate may be typosquatting another crate.
95+
pub fn send_possible_typosquat_notification(
96+
&self,
97+
email: &str,
98+
crate_name: &str,
99+
squats: &[typomania::checks::Squat],
100+
) -> AppResult<()> {
101+
let subject = "Possible typosquatting in new crate";
102+
let body = format!(
103+
"New crate {crate_name} may be typosquatting one or more other crates.\n
104+
Visit https://{domain}/crates/{crate_name} to see the offending crate.\n
105+
\n
106+
Specific squat checks that triggered:\n
107+
\n
108+
- {squats}\n",
109+
domain = crate::config::domain_name(),
110+
squats = squats
111+
.iter()
112+
.map(|squat| format!("{squat}"))
113+
.collect::<Vec<_>>()
114+
.join("\n- "),
115+
);
116+
117+
self.send(email, subject, &body)
118+
}
119+
94120
/// Attempts to send an API token exposure notification email
95121
pub fn send_token_exposed_notification(
96122
&self,

src/worker/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ pub mod dump_db;
99
pub mod fastly;
1010
mod git;
1111
mod readmes;
12+
mod typosquat;
1213
mod update_downloads;
1314

1415
pub(crate) use daily_db_maintenance::perform_daily_db_maintenance;
@@ -17,4 +18,5 @@ pub(crate) use git::{
1718
perform_index_squash, perform_normalize_index, sync_to_git_index, sync_to_sparse_index,
1819
};
1920
pub(crate) use readmes::perform_render_and_upload_readme;
21+
pub(crate) use typosquat::check_typosquat;
2022
pub(crate) use update_downloads::perform_update_downloads;

src/worker/typosquat.rs

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
use diesel::PgConnection;
2+
use typomania::Package;
3+
4+
use crate::{background_jobs::Environment, swirl::PerformError, Emails};
5+
6+
use self::types::OwnedCrate;
7+
8+
mod cache;
9+
mod config;
10+
mod types;
11+
12+
#[cfg(test)]
13+
mod test_util;
14+
15+
#[instrument(skip_all, fields(krate.name = ?name))]
16+
pub fn check_typosquat(
17+
env: &Environment,
18+
conn: &mut PgConnection,
19+
name: &str,
20+
) -> Result<(), PerformError> {
21+
check_typosquat_inner(env.emails(), conn, name)
22+
}
23+
24+
fn check_typosquat_inner(
25+
emails: &Emails,
26+
conn: &mut PgConnection,
27+
name: &str,
28+
) -> Result<(), PerformError> {
29+
info!("Checking new crate for potential typosquatting");
30+
31+
let krate: Box<dyn Package> = Box::new(OwnedCrate::from_name(conn, name)?);
32+
let squats = cache::get_harness(conn)?.check_package(name, krate)?;
33+
if !squats.is_empty() {
34+
// Well, well, well. For now, the only action we'll take is to e-mail people who hopefully
35+
// care to check into things more closely.
36+
info!(?squats, "Found potential typosquatting");
37+
38+
for email in config::NOTIFY_EMAILS.iter() {
39+
if let Err(e) = emails.send_possible_typosquat_notification(email, name, &squats) {
40+
error!(?e, ?email, "sending possible typosquat notification");
41+
}
42+
}
43+
}
44+
45+
Ok(())
46+
}
47+
48+
#[cfg(test)]
mod tests {
    use crate::test_util::pg_connection;

    use super::test_util::Faker;
    use super::*;

    /// Exercises the whole check: a crate with an unrelated name sends no mail, while a
    /// crate whose name resembles a cached crate does.
    #[test]
    fn integration() -> Result<(), PerformError> {
        let emails = Emails::new_in_memory();
        let mut faker = Faker::new(pg_connection());

        // The existing crate (and its owner) that later crates are compared against.
        let established_owner = faker.user("a")?;
        faker.crate_and_version("my-crate", "It's awesome", &established_owner, 100)?;

        // Build the cache now so that it contains only the crate created above.
        //
        // In theory this is flaky if the test outlives the cache TTL, but with a TTL of
        // 12 hours that would point at much bigger problems.
        let _harness = super::cache::get_harness(faker.borrow_conn())?;

        // Two newly published crates from another user: one harmless, one suspicious.
        let newcomer = faker.user("b")?;
        let (angel, _version) = faker.crate_and_version(
            "innocent-crate",
            "I'm just a simple, innocent crate",
            &newcomer,
            0,
        )?;
        let (demon, _version) = faker.crate_and_version(
            "mycrate",
            "I'm even more innocent, obviously",
            &newcomer,
            0,
        )?;

        // Fixture set-up is complete; take the connection back.
        let mut conn = faker.into_conn();

        // The harmless crate must not produce any notification.
        check_typosquat_inner(&emails, &mut conn, &angel.name)?;
        assert!(emails.mails_in_memory().unwrap().is_empty());

        // The look-alike crate must produce at least one.
        check_typosquat_inner(&emails, &mut conn, &demon.name)?;
        assert!(!emails.mails_in_memory().unwrap().is_empty());

        Ok(())
    }
}

src/worker/typosquat/cache.rs

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
use std::{
2+
sync::{Arc, Mutex},
3+
time::Instant,
4+
};
5+
6+
use diesel::PgConnection;
7+
use typomania::{
8+
checks::{Bitflips, Omitted, SwappedWords, Typos},
9+
Harness,
10+
};
11+
12+
use crate::swirl::PerformError;
13+
14+
use super::{config, types::TopCrates};
15+
16+
/// Gets the typomania harness for the cached top crates, regenerating it if it is out of date.
17+
pub(super) fn get_harness(
18+
conn: &mut PgConnection,
19+
) -> Result<Arc<Harness<TopCrates>>, PerformError> {
20+
HARNESS_CACHE.get(conn)
21+
}
22+
23+
static HARNESS_CACHE: Cache = Cache::new();
24+
25+
struct Cache(Mutex<Inner>);
26+
27+
impl Cache {
28+
const fn new() -> Self {
29+
Self(Mutex::new(Inner::new()))
30+
}
31+
32+
fn get(&self, conn: &mut PgConnection) -> Result<Arc<Harness<TopCrates>>, PerformError> {
33+
let mut inner = self.0.lock().unwrap();
34+
35+
Ok(if let Some(harness) = inner.get() {
36+
harness
37+
} else {
38+
let harness = Arc::new(new_harness(conn)?);
39+
40+
inner.update(harness.clone());
41+
harness
42+
})
43+
}
44+
}
45+
46+
#[instrument(skip_all)]
47+
fn new_harness(conn: &mut PgConnection) -> Result<Harness<TopCrates>, PerformError> {
48+
debug!("Rebuilding top crate cache");
49+
let start = Instant::now();
50+
51+
let top_crates = TopCrates::new(conn, config::TOP_CRATES)?;
52+
53+
// This is essentially the standard set of checks that was implemented by typogard-crates.
54+
let harness = Harness::builder()
55+
.with_check(Bitflips::new(
56+
config::CRATE_NAME_ALPHABET,
57+
top_crates.iter_names(),
58+
))
59+
.with_check(Omitted::new(config::CRATE_NAME_ALPHABET))
60+
.with_check(SwappedWords::new("-_"))
61+
.with_check(Typos::new(config::TYPOS.iter().map(|(c, typos)| {
62+
(*c, typos.iter().map(|ss| ss.to_string()).collect())
63+
})))
64+
.build(top_crates);
65+
66+
let elapsed = Instant::now() - start;
67+
debug!(?elapsed, "Top crate cache rebuilt");
68+
69+
Ok(harness)
70+
}
71+
72+
struct Inner {
73+
harness: Option<Arc<Harness<TopCrates>>>,
74+
last_update: Option<Instant>,
75+
}
76+
77+
impl Inner {
78+
const fn new() -> Self {
79+
Self {
80+
harness: None,
81+
last_update: None,
82+
}
83+
}
84+
85+
fn get(&self) -> Option<Arc<Harness<TopCrates>>> {
86+
if let Some(harness) = &self.harness {
87+
if let Some(when) = self.last_update {
88+
if when >= Instant::now() - config::CACHE_TTL {
89+
return Some(harness.clone());
90+
}
91+
}
92+
}
93+
None
94+
}
95+
96+
fn update(&mut self, harness: Arc<Harness<TopCrates>>) {
97+
self.harness = Some(harness);
98+
self.last_update = Some(Instant::now());
99+
}
100+
}

0 commit comments

Comments
 (0)