
Commit 002691a

pwnall authored and djmitche committed
Fix solution in Link Checker in Concurrency Morning exercises (google#904)
* Fix solution in Link Checker in Concurrency Morning exercises.

  This change fixes the following issues with the current solution:

  1. It is not listed on the "solutions" page.
  2. It is not multi-threaded and does not use channels.

---------

Co-authored-by: Dustin J. Mitchell <[email protected]>
1 parent 35d3742 commit 002691a
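For readers skimming the diff below, here is a minimal, self-contained sketch (not part of this commit) of the worker-pool pattern the revised solution adopts: commands fan out to worker threads over an mpsc channel whose single receiver is shared behind an `Arc<Mutex<..>>`, and results fan back in over a second channel. The task strings are placeholder stand-ins for the solution's `CrawlCommand` and `CrawlResult` types.

```rust
use std::sync::{mpsc, Arc, Mutex};
use std::thread;

fn main() {
    // Commands flow from the controller to the workers; results flow back.
    let (command_sender, command_receiver) = mpsc::channel::<String>();
    let (result_sender, result_receiver) = mpsc::channel::<String>();

    // mpsc::Receiver is not Clone, so the workers share it behind a Mutex.
    let command_receiver = Arc::new(Mutex::new(command_receiver));

    for _ in 0..4 {
        let command_receiver = Arc::clone(&command_receiver);
        let result_sender = result_sender.clone();
        thread::spawn(move || loop {
            // Hold the lock only long enough to take one command.
            let command = {
                let receiver_guard = command_receiver.lock().unwrap();
                receiver_guard.recv()
            };
            let Ok(command) = command else {
                break; // Sender dropped: no more commands are coming.
            };
            result_sender.send(format!("processed {command}")).unwrap();
        });
    }
    // Drop the controller's copy so the result channel closes once all
    // worker clones are gone.
    drop(result_sender);

    for i in 0..8 {
        command_sender.send(format!("task {i}")).unwrap();
    }
    // Close the command channel so the workers exit after draining it.
    drop(command_sender);

    for result in result_receiver {
        println!("{result}");
    }
}
```

The actual solution in this commit ends the crawl by counting pending URLs in the controller rather than by draining the channel, but the channel wiring is the same.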

File tree

3 files changed: +145 -45 lines changed

src/exercises/concurrency/link-checker.md

Lines changed: 4 additions & 3 deletions
@@ -57,12 +57,13 @@ Your `src/main.rs` file should look something like this:
 ```rust,compile_fail
 {{#include link-checker.rs:setup}}
 
-{{#include link-checker.rs:extract_links}}
+{{#include link-checker.rs:visit_page}}
 
 fn main() {
+    let client = Client::new();
     let start_url = Url::parse("https://www.google.org").unwrap();
-    let response = get(start_url).unwrap();
-    match extract_links(response) {
+    let crawl_command = CrawlCommand{ url: start_url, extract_links: true };
+    match visit_page(&client, &crawl_command) {
         Ok(links) => println!("Links: {links:#?}"),
         Err(err) => println!("Could not extract links: {err:#}"),
     }

src/exercises/concurrency/link-checker.rs

Lines changed: 134 additions & 42 deletions
@@ -12,78 +12,170 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+use std::{sync::Arc, sync::Mutex, sync::mpsc, thread};
+
 // ANCHOR: setup
-use reqwest::blocking::{get, Response};
-use reqwest::Url;
+use reqwest::{blocking::Client, Url};
 use scraper::{Html, Selector};
 use thiserror::Error;
 
 #[derive(Error, Debug)]
 enum Error {
     #[error("request error: {0}")]
     ReqwestError(#[from] reqwest::Error),
+    #[error("bad http response: {0}")]
+    BadResponse(String),
 }
 // ANCHOR_END: setup
 
-// ANCHOR: extract_links
-fn extract_links(response: Response) -> Result<Vec<Url>, Error> {
+// ANCHOR: visit_page
+#[derive(Debug)]
+struct CrawlCommand {
+    url: Url,
+    extract_links: bool,
+}
+
+fn visit_page(client: &Client, command: &CrawlCommand) -> Result<Vec<Url>, Error> {
+    println!("Checking {:#}", command.url);
+    let response = client.get(command.url.clone()).send()?;
+    if !response.status().is_success() {
+        return Err(Error::BadResponse(response.status().to_string()));
+    }
+
+    let mut link_urls = Vec::new();
+    if !command.extract_links {
+        return Ok(link_urls);
+    }
+
     let base_url = response.url().to_owned();
-    let document = response.text()?;
-    let html = Html::parse_document(&document);
-    let selector = Selector::parse("a").unwrap();
+    let body_text = response.text()?;
+    let document = Html::parse_document(&body_text);
 
-    let mut valid_urls = Vec::new();
-    for element in html.select(&selector) {
-        if let Some(href) = element.value().attr("href") {
-            match base_url.join(href) {
-                Ok(url) => valid_urls.push(url),
-                Err(err) => {
-                    println!("On {base_url}: could not parse {href:?}: {err} (ignored)",);
-                }
+    let selector = Selector::parse("a").unwrap();
+    let href_values = document
+        .select(&selector)
+        .filter_map(|element| element.value().attr("href"));
+    for href in href_values {
+        match base_url.join(href) {
+            Ok(link_url) => {
+                link_urls.push(link_url);
+            }
+            Err(err) => {
+                println!("On {base_url:#}: ignored unparsable {href:?}: {err}");
             }
         }
     }
+    Ok(link_urls)
+}
+// ANCHOR_END: visit_page
 
-    Ok(valid_urls)
+struct CrawlState {
+    domain: String,
+    visited_pages: std::collections::HashSet<String>,
 }
-// ANCHOR_END: extract_links
 
-fn check_links(url: Url) -> Result<Vec<Url>, Error> {
-    println!("Checking {url}");
+impl CrawlState {
+    fn new(start_url: &Url) -> CrawlState {
+        let mut visited_pages = std::collections::HashSet::new();
+        visited_pages.insert(start_url.as_str().to_string());
+        CrawlState {
+            domain: start_url.domain().unwrap().to_string(),
+            visited_pages,
+        }
+    }
 
-    let response = get(url.to_owned())?;
+    /// Determine whether links within the given page should be extracted.
+    fn should_extract_links(&self, url: &Url) -> bool {
+        let Some(url_domain) = url.domain() else {
+            return false;
+        };
+        url_domain == self.domain
+    }
 
-    if !response.status().is_success() {
-        return Ok(vec![url.to_owned()]);
+    /// Mark the given page as visited, returning false if it had already
+    /// been visited.
+    fn mark_visited(&mut self, url: &Url) -> bool {
+        self.visited_pages.insert(url.as_str().to_string())
     }
+}
 
-    let links = extract_links(response)?;
-    for link in &links {
-        println!("{link}, {:?}", link.domain());
+type CrawlResult = Result<Vec<Url>, (Url, Error)>;
+fn spawn_crawler_threads(
+    command_receiver: mpsc::Receiver<CrawlCommand>,
+    result_sender: mpsc::Sender<CrawlResult>,
+    thread_count: u32,
+) {
+    let command_receiver = Arc::new(Mutex::new(command_receiver));
+
+    for _ in 0..thread_count {
+        let result_sender = result_sender.clone();
+        let command_receiver = command_receiver.clone();
+        thread::spawn(move || {
+            let client = Client::new();
+            loop {
+                let command_result = {
+                    let receiver_guard = command_receiver.lock().unwrap();
+                    receiver_guard.recv()
+                };
+                let Ok(crawl_command) = command_result else {
+                    // The sender got dropped. No more commands coming in.
+                    break;
+                };
+                let crawl_result = match visit_page(&client, &crawl_command) {
+                    Ok(link_urls) => Ok(link_urls),
+                    Err(error) => Err((crawl_command.url, error)),
+                };
+                result_sender.send(crawl_result).unwrap();
+            }
+        });
     }
+}
 
-    let mut failed_links = Vec::new();
-    for link in links {
-        if link.domain() != url.domain() {
-            println!("Checking external link: {link}");
-            let response = get(link.clone())?;
-            if !response.status().is_success() {
-                println!("Error on {url}: {link} failed: {}", response.status());
-                failed_links.push(link);
+fn control_crawl(
+    start_url: Url,
+    command_sender: mpsc::Sender<CrawlCommand>,
+    result_receiver: mpsc::Receiver<CrawlResult>,
+) -> Vec<Url> {
+    let mut crawl_state = CrawlState::new(&start_url);
+    let start_command = CrawlCommand { url: start_url, extract_links: true };
+    command_sender.send(start_command).unwrap();
+    let mut pending_urls = 1;
+
+    let mut bad_urls = Vec::new();
+    while pending_urls > 0 {
+        let crawl_result = result_receiver.recv().unwrap();
+        pending_urls -= 1;
+
+        match crawl_result {
+            Ok(link_urls) => {
+                for url in link_urls {
+                    if crawl_state.mark_visited(&url) {
+                        let extract_links = crawl_state.should_extract_links(&url);
+                        let crawl_command = CrawlCommand { url, extract_links };
+                        command_sender.send(crawl_command).unwrap();
+                        pending_urls += 1;
+                    }
+                }
+            }
+            Err((url, error)) => {
+                bad_urls.push(url);
+                println!("Got crawling error: {:#}", error);
+                continue;
             }
-        } else {
-            println!("Checking link in same domain: {link}");
-            failed_links.extend(check_links(link)?)
         }
     }
+    bad_urls
+}
 
-    Ok(failed_links)
+fn check_links(start_url: Url) -> Vec<Url> {
+    let (result_sender, result_receiver) = mpsc::channel::<CrawlResult>();
+    let (command_sender, command_receiver) = mpsc::channel::<CrawlCommand>();
+    spawn_crawler_threads(command_receiver, result_sender, 16);
+    control_crawl(start_url, command_sender, result_receiver)
 }
 
 fn main() {
-    let start_url = Url::parse("https://www.google.org").unwrap();
-    match check_links(start_url) {
-        Ok(links) => println!("Links: {links:#?}"),
-        Err(err) => println!("Could not extract links: {err:#}"),
-    }
+    let start_url = reqwest::Url::parse("https://www.google.org").unwrap();
+    let bad_urls = check_links(start_url);
+    println!("Bad URLs: {:#?}", bad_urls);
 }

src/exercises/concurrency/solutions-morning.md

Lines changed: 7 additions & 0 deletions
@@ -8,3 +8,10 @@
 {{#include dining-philosophers.rs}}
 ```
 
+## Link Checker
+
+([back to exercise](link-checker.md))
+
+```rust,compile_fail
+{{#include link-checker.rs}}
+```
