Fix solution in Link Checker in Concurrency Morning exercises #904

Merged 2 commits on Jul 13, 2023

7 changes: 4 additions & 3 deletions src/exercises/concurrency/link-checker.md
@@ -57,12 +57,13 @@ Your `src/main.rs` file should look something like this:
 ```rust,compile_fail
 {{#include link-checker.rs:setup}}

-{{#include link-checker.rs:extract_links}}
+{{#include link-checker.rs:visit_page}}

 fn main() {
+    let client = Client::new();
     let start_url = Url::parse("https://www.google.org").unwrap();
-    let response = get(start_url).unwrap();
-    match extract_links(response) {
+    let crawl_command = CrawlCommand{ url: start_url, extract_links: true };
+    match visit_page(&client, &crawl_command) {
         Ok(links) => println!("Links: {links:#?}"),
         Err(err) => println!("Could not extract links: {err:#}"),
     }
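
Aside (not part of the diff): the updated starter `main` above only checks the start page. As a rough sketch of how the `CrawlCommand`/`visit_page` API from this PR composes, a single-threaded follow-up could status-check each discovered link once without recursing. The `main` below is hypothetical and not course material; it only uses the types and functions defined in the solution file.

```rust
// Hypothetical extension of the starter `main`, not part of the exercise:
// status-check every link found on the start page, without recursing.
fn main() {
    let client = Client::new();
    let start_url = Url::parse("https://www.google.org").unwrap();
    let start = CrawlCommand { url: start_url, extract_links: true };
    match visit_page(&client, &start) {
        Ok(links) => {
            for link in links {
                // Only fetch the linked page; don't extract its links.
                let command = CrawlCommand { url: link, extract_links: false };
                if let Err(err) = visit_page(&client, &command) {
                    println!("Bad link {}: {err:#}", command.url);
                }
            }
        }
        Err(err) => println!("Could not extract links: {err:#}"),
    }
}
```

Setting `extract_links: false` mirrors how the full solution below decides whether to keep crawling a page at all.
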
176 changes: 134 additions & 42 deletions src/exercises/concurrency/link-checker.rs
@@ -12,78 +12,170 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use std::{sync::Arc, sync::Mutex, sync::mpsc, thread};
+
 // ANCHOR: setup
-use reqwest::blocking::{get, Response};
-use reqwest::Url;
+use reqwest::{blocking::Client, Url};
 use scraper::{Html, Selector};
 use thiserror::Error;

 #[derive(Error, Debug)]
 enum Error {
     #[error("request error: {0}")]
     ReqwestError(#[from] reqwest::Error),
+    #[error("bad http response: {0}")]
+    BadResponse(String),
 }
 // ANCHOR_END: setup

-// ANCHOR: extract_links
-fn extract_links(response: Response) -> Result<Vec<Url>, Error> {
+// ANCHOR: visit_page
+#[derive(Debug)]
+struct CrawlCommand {
+    url: Url,
+    extract_links: bool,
+}
+
+fn visit_page(client: &Client, command: &CrawlCommand) -> Result<Vec<Url>, Error> {
+    println!("Checking {:#}", command.url);
+    let response = client.get(command.url.clone()).send()?;
+    if !response.status().is_success() {
+        return Err(Error::BadResponse(response.status().to_string()));
+    }
+
+    let mut link_urls = Vec::new();
+    if !command.extract_links {
+        return Ok(link_urls);
+    }
+
     let base_url = response.url().to_owned();
-    let document = response.text()?;
-    let html = Html::parse_document(&document);
-    let selector = Selector::parse("a").unwrap();
+    let body_text = response.text()?;
+    let document = Html::parse_document(&body_text);

-    let mut valid_urls = Vec::new();
-    for element in html.select(&selector) {
-        if let Some(href) = element.value().attr("href") {
-            match base_url.join(href) {
-                Ok(url) => valid_urls.push(url),
-                Err(err) => {
-                    println!("On {base_url}: could not parse {href:?}: {err} (ignored)",);
-                }
+    let selector = Selector::parse("a").unwrap();
+    let href_values = document
+        .select(&selector)
+        .filter_map(|element| element.value().attr("href"));
+    for href in href_values {
+        match base_url.join(href) {
+            Ok(link_url) => {
+                link_urls.push(link_url);
             }
+            Err(err) => {
+                println!("On {base_url:#}: ignored unparsable {href:?}: {err}");
+            }
         }
     }
+    Ok(link_urls)
+}
+// ANCHOR_END: visit_page

-    Ok(valid_urls)
+struct CrawlState {
+    domain: String,
+    visited_pages: std::collections::HashSet<String>,
 }
-// ANCHOR_END: extract_links

-fn check_links(url: Url) -> Result<Vec<Url>, Error> {
-    println!("Checking {url}");
+impl CrawlState {
+    fn new(start_url: &Url) -> CrawlState {
+        let mut visited_pages = std::collections::HashSet::new();
+        visited_pages.insert(start_url.as_str().to_string());
+        CrawlState {
+            domain: start_url.domain().unwrap().to_string(),
+            visited_pages,
+        }
+    }

-    let response = get(url.to_owned())?;
+    /// Determine whether links within the given page should be extracted.
+    fn should_extract_links(&self, url: &Url) -> bool {
+        let Some(url_domain) = url.domain() else {
+            return false;
+        };
+        url_domain == self.domain
+    }

-    if !response.status().is_success() {
-        return Ok(vec![url.to_owned()]);
+    /// Mark the given page as visited, returning true if it had already
+    /// been visited.
+    fn mark_visited(&mut self, url: &Url) -> bool {
+        self.visited_pages.insert(url.as_str().to_string())
     }
+}

-    let links = extract_links(response)?;
-    for link in &links {
-        println!("{link}, {:?}", link.domain());
+type CrawlResult = Result<Vec<Url>, (Url, Error)>;
+fn spawn_crawler_threads(
+    command_receiver: mpsc::Receiver<CrawlCommand>,
+    result_sender: mpsc::Sender<CrawlResult>,
+    thread_count: u32,
+) {
+    let command_receiver = Arc::new(Mutex::new(command_receiver));
+
+    for _ in 0..thread_count {
+        let result_sender = result_sender.clone();
+        let command_receiver = command_receiver.clone();
+        thread::spawn(move || {
+            let client = Client::new();
+            loop {
+                let command_result = {
+                    let receiver_guard = command_receiver.lock().unwrap();
+                    receiver_guard.recv()
+                };
+                let Ok(crawl_command) = command_result else {
+                    // The sender got dropped. No more commands coming in.
+                    break;
+                };
+                let crawl_result = match visit_page(&client, &crawl_command) {
+                    Ok(link_urls) => Ok(link_urls),
+                    Err(error) => Err((crawl_command.url, error)),
+                };
+                result_sender.send(crawl_result).unwrap();
+            }
+        });
     }
+}

-    let mut failed_links = Vec::new();
-    for link in links {
-        if link.domain() != url.domain() {
-            println!("Checking external link: {link}");
-            let response = get(link.clone())?;
-            if !response.status().is_success() {
-                println!("Error on {url}: {link} failed: {}", response.status());
-                failed_links.push(link);
+fn control_crawl(
+    start_url: Url,
+    command_sender: mpsc::Sender<CrawlCommand>,
+    result_receiver: mpsc::Receiver<CrawlResult>,
+) -> Vec<Url> {
+    let mut crawl_state = CrawlState::new(&start_url);
+    let start_command = CrawlCommand { url: start_url, extract_links: true };
+    command_sender.send(start_command).unwrap();
+    let mut pending_urls = 1;
+
+    let mut bad_urls = Vec::new();
+    while pending_urls > 0 {
+        let crawl_result = result_receiver.recv().unwrap();
+        pending_urls -= 1;
+
+        match crawl_result {
+            Ok(link_urls) => {
+                for url in link_urls {
+                    if crawl_state.mark_visited(&url) {
+                        let extract_links = crawl_state.should_extract_links(&url);
+                        let crawl_command = CrawlCommand { url, extract_links };
+                        command_sender.send(crawl_command).unwrap();
+                        pending_urls += 1;
+                    }
+                }
+            }
+            Err((url, error)) => {
+                bad_urls.push(url);
+                println!("Got crawling error: {:#}", error);
+                continue;
             }
-        } else {
-            println!("Checking link in same domain: {link}");
-            failed_links.extend(check_links(link)?)
         }
     }
+    bad_urls
+}

-    Ok(failed_links)
+fn check_links(start_url: Url) -> Vec<Url> {
+    let (result_sender, result_receiver) = mpsc::channel::<CrawlResult>();
+    let (command_sender, command_receiver) = mpsc::channel::<CrawlCommand>();
+    spawn_crawler_threads(command_receiver, result_sender, 16);
+    control_crawl(start_url, command_sender, result_receiver)
 }

 fn main() {
-    let start_url = Url::parse("https://www.google.org").unwrap();
-    match check_links(start_url) {
-        Ok(links) => println!("Links: {links:#?}"),
-        Err(err) => println!("Could not extract links: {err:#}"),
-    }
+    let start_url = reqwest::Url::parse("https://www.google.org").unwrap();
+    let bad_urls = check_links(start_url);
+    println!("Bad URLs: {:#?}", bad_urls);
 }
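
For readers skimming the diff: the core concurrency technique in the new solution is sharing a single `mpsc::Receiver` among worker threads behind `Arc<Mutex<...>>`, as `spawn_crawler_threads` does above. Below is a minimal, std-only sketch of that worker-pool pattern, independent of the link-checker types; the job payload and thread count are illustrative.

```rust
use std::sync::{mpsc, Arc, Mutex};
use std::thread;

fn main() {
    let (sender, receiver) = mpsc::channel::<u32>();
    // mpsc::Receiver cannot be cloned, so all workers share one receiver
    // behind Arc<Mutex<..>> -- the same shape as spawn_crawler_threads.
    let receiver = Arc::new(Mutex::new(receiver));

    let mut handles = Vec::new();
    for worker_id in 0..4 {
        let receiver = Arc::clone(&receiver);
        handles.push(thread::spawn(move || loop {
            // Hold the lock only while receiving, not while working.
            let job = receiver.lock().unwrap().recv();
            let Ok(job) = job else {
                break; // Sender dropped: no more jobs will arrive.
            };
            println!("worker {worker_id} handled job {job}");
        }));
    }

    for job in 0..10 {
        sender.send(job).unwrap();
    }
    drop(sender); // Lets the workers' recv() fail and their loops exit.

    for handle in handles {
        handle.join().unwrap();
    }
}
```

Taking the lock only around `recv()` means workers serialize on dequeuing commands, not on the actual page fetches, which is what makes the 16-thread pool in the solution worthwhile.
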
7 changes: 7 additions & 0 deletions src/exercises/concurrency/solutions-morning.md
@@ -8,3 +8,10 @@
 {{#include dining-philosophers.rs}}
 ```

+## Link Checker
+
+([back to exercise](link-checker.md))
+
+```rust,compile_fail
+{{#include link-checker.rs}}
+```