link_checking: prevent rate-limiting (#1421)

* link_checking: prevent rate-limiting

Fix for https://github.com/getzola/zola/issues/1056.

- assign all links for a domain to the same thread
- reduce the number of threads from 32 to 8
- add a sleep between HTTP calls (the combined strategy is sketched after the commit message)

* Add get_link_domain(), use for loops

* Do not sleep after last link for domain

* Avoid quadratic complexity

* remove prints
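The combined effect of the three bullets, as a minimal standalone sketch. This is hypothetical illustration code, not the commit's code: check_url is a stand-in for the real HTTP check, plain std::thread replaces the rayon pool used in the diff below, and the url crate (which this commit adds as a dependency) is assumed.

use std::collections::HashMap;
use std::thread;
use std::time::Duration;

// Stand-in for the real HTTP check (hypothetical).
fn check_url(link: &str) -> bool {
    !link.is_empty()
}

fn main() {
    let links = vec![
        "https://example.com/a",
        "https://example.com/b",
        "https://example.org/c",
    ];

    // Group links by host so a single domain is never crawled
    // from two threads at once.
    let mut by_domain: HashMap<String, Vec<&str>> = HashMap::new();
    for link in links {
        let host = url::Url::parse(link)
            .ok()
            .and_then(|u| u.host_str().map(String::from))
            .unwrap_or_default();
        by_domain.entry(host).or_default().push(link);
    }

    // One worker per domain; requests within a domain are serialized,
    // with a pause between calls but not after the last one.
    let handles: Vec<_> = by_domain
        .into_iter()
        .map(|(domain, domain_links)| {
            thread::spawn(move || {
                let last = domain_links.len().saturating_sub(1);
                for (i, link) in domain_links.into_iter().enumerate() {
                    println!("{}: {} -> {}", domain, link, check_url(link));
                    if i < last {
                        thread::sleep(Duration::from_millis(500));
                    }
                }
            })
        })
        .collect();

    for handle in handles {
        handle.join().unwrap();
    }
}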
Stanislas 2021-04-21 21:13:38 +02:00 committed by GitHub
parent 3346439a32
commit 47b920777a
3 changed files with 84 additions and 39 deletions

Cargo.lock

@@ -2449,6 +2449,7 @@ dependencies = [
  "tempfile",
  "templates",
  "tera",
+ "url",
  "utils",
  "walkdir",
 ]


@@ -16,6 +16,7 @@ sass-rs = "0.2"
 lazy_static = "1.1"
 relative-path = "1"
 slotmap = "0.4"
+url = "2"
 
 errors = { path = "../errors" }
 config = { path = "../config" }


@@ -1,7 +1,11 @@
 use rayon::prelude::*;
 
 use crate::Site;
-use errors::{Error, ErrorKind, Result};
+use core::time;
+use errors::{bail, Result};
+use errors::{Error, ErrorKind};
+use std::{collections::HashMap, path::PathBuf, thread};
+use url::Url;
 
 /// Check whether all internal links pointing to explicit anchor fragments are valid.
 ///
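One note on the new imports (illustration code, not part of the commit): use core::time; works here because std::time::Duration is a re-export of core::time::Duration, so the two paths name the same type.

// Demonstration: the Duration from core::time and std::time is one type,
// so `time::Duration::from_millis(500)` via `use core::time;` is valid.
fn main() {
    let d: core::time::Duration = std::time::Duration::from_millis(500);
    println!("{:?}", d); // 500ms
}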
@@ -91,60 +95,98 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
     }
 }
 
+fn get_link_domain(link: &str) -> Result<String> {
+    return match Url::parse(&link) {
+        Ok(url) => match url.host_str().map(String::from) {
+            Some(domain_str) => Ok(domain_str),
+            None => bail!("could not parse domain `{}` from link", link),
+        },
+        Err(err) => bail!("could not parse domain `{}` from link: `{}`", link, err),
+    };
+}
+
 pub fn check_external_links(site: &Site) -> Result<()> {
     let library = site.library.write().expect("Get lock for check_external_links");
-    let page_links = library
-        .pages()
-        .values()
-        .map(|p| {
-            let path = &p.file.path;
-            p.external_links.iter().map(move |l| (path.clone(), l))
-        })
-        .flatten();
-    let section_links = library
-        .sections()
-        .values()
-        .map(|p| {
-            let path = &p.file.path;
-            p.external_links.iter().map(move |l| (path.clone(), l))
-        })
-        .flatten();
-    let all_links = page_links.chain(section_links).collect::<Vec<_>>();
+
+    let mut all_links: Vec<(PathBuf, String, String)> = vec![];
+
+    for p in library.pages_values().into_iter() {
+        for external_link in p.clone().external_links.into_iter() {
+            let domain = get_link_domain(&external_link)?;
+            all_links.push((p.file.path.clone(), external_link, domain));
+        }
+    }
+
+    for s in library.sections_values().into_iter() {
+        for external_link in s.clone().external_links.into_iter() {
+            let domain = get_link_domain(&external_link)?;
+            all_links.push((s.file.path.clone(), external_link, domain));
+        }
+    }
+
     println!("Checking {} external link(s).", all_links.len());
 
+    let mut links_by_domain: HashMap<String, Vec<(PathBuf, String)>> = HashMap::new();
+
+    for link in all_links.iter() {
+        links_by_domain.entry(link.2.to_string()).or_insert(Vec::new());
+        // Insert content path and link under the domain key
+        links_by_domain
+            .get_mut(&link.2.to_string())
+            .unwrap()
+            .push((link.0.clone(), link.1.clone()));
+    }
+
     if all_links.is_empty() {
         return Ok(());
     }
 
     // create thread pool with lots of threads so we can fetch
-    // (almost) all pages simultaneously
-    let threads = std::cmp::min(all_links.len(), 32);
+    // (almost) all pages simultaneously, limiting all links for a single
+    // domain to one thread to avoid rate-limiting
+    let threads = std::cmp::min(links_by_domain.len(), 8);
     let pool = rayon::ThreadPoolBuilder::new()
         .num_threads(threads)
         .build()
         .map_err(|e| Error { kind: ErrorKind::Msg(e.to_string()), source: None })?;
 
-    let errors: Vec<_> = pool.install(|| {
-        all_links
+    let errors = pool.install(|| {
+        links_by_domain
             .par_iter()
-            .filter_map(|(page_path, link)| {
-                if site
-                    .config
-                    .link_checker
-                    .skip_prefixes
-                    .iter()
-                    .any(|prefix| link.starts_with(prefix))
-                {
-                    return None;
-                }
-                let res = link_checker::check_url(&link, &site.config.link_checker);
-                if link_checker::is_valid(&res) {
-                    None
-                } else {
-                    Some((page_path, link, res))
-                }
+            .map(|(_domain, links)| {
+                let mut links_to_process = links.len();
+                links
+                    .into_iter()
+                    .filter_map(move |(page_path, link)| {
+                        links_to_process -= 1;
+
+                        if site
+                            .config
+                            .link_checker
+                            .skip_prefixes
+                            .iter()
+                            .any(|prefix| link.starts_with(prefix))
+                        {
+                            return None;
+                        }
+
+                        let res = link_checker::check_url(&link, &site.config.link_checker);
+
+                        if links_to_process > 0 {
+                            // Prevent rate-limiting, wait before next crawl unless we're done with this domain
+                            thread::sleep(time::Duration::from_millis(500));
+                        }
+
+                        if link_checker::is_valid(&res) {
+                            None
+                        } else {
+                            Some((page_path, link, res))
+                        }
+                    })
+                    .collect::<Vec<_>>()
             })
-            .collect()
+            .flatten()
+            .collect::<Vec<_>>()
     });
 
     println!("> Checked {} external link(s): {} error(s) found.", all_links.len(), errors.len());
@@ -165,5 +207,6 @@ pub fn check_external_links(site: &Site) -> Result<()> {
         })
         .collect::<Vec<_>>()
         .join("\n");
+
     Err(Error { kind: ErrorKind::Msg(msg), source: None })
 }