link_checking: prevent rate-limiting (#1421)
* link_checking: prevent rate-limiting

  Fix for https://github.com/getzola/zola/issues/1056.
  - assign all links for a domain to the same thread
  - reduce the number of threads from 32 to 8
  - add a sleep between HTTP calls

* Add get_link_domain(), use for loops
* Do not sleep after the last link for a domain
* Avoid quadratic complexity
* Remove prints
This commit is contained in:
parent
3346439a32
commit
47b920777a
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -2449,6 +2449,7 @@ dependencies = [
|
|||
"tempfile",
|
||||
"templates",
|
||||
"tera",
|
||||
"url",
|
||||
"utils",
|
||||
"walkdir",
|
||||
]
|
||||
|
|
|
@ -16,6 +16,7 @@ sass-rs = "0.2"
|
|||
lazy_static = "1.1"
|
||||
relative-path = "1"
|
||||
slotmap = "0.4"
|
||||
url = "2"
|
||||
|
||||
errors = { path = "../errors" }
|
||||
config = { path = "../config" }
|
||||
|
|
|
@ -1,7 +1,11 @@
|
|||
use rayon::prelude::*;
|
||||
|
||||
use crate::Site;
|
||||
use errors::{Error, ErrorKind, Result};
|
||||
use core::time;
|
||||
use errors::{bail, Result};
|
||||
use errors::{Error, ErrorKind};
|
||||
use std::{collections::HashMap, path::PathBuf, thread};
|
||||
use url::Url;
|
||||
|
||||
/// Check whether all internal links pointing to explicit anchor fragments are valid.
|
||||
///
|
||||
|
@ -91,43 +95,71 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
|
|||
}
|
||||
}
|
||||
|
||||
fn get_link_domain(link: &str) -> Result<String> {
|
||||
return match Url::parse(&link) {
|
||||
Ok(url) => match url.host_str().map(String::from) {
|
||||
Some(domain_str) => Ok(domain_str),
|
||||
None => bail!("could not parse domain `{}` from link", link),
|
||||
},
|
||||
Err(err) => bail!("could not parse domain `{}` from link: `{}`", link, err),
|
||||
};
|
||||
}
|
||||
|
||||
pub fn check_external_links(site: &Site) -> Result<()> {
|
||||
let library = site.library.write().expect("Get lock for check_external_links");
|
||||
let page_links = library
|
||||
.pages()
|
||||
.values()
|
||||
.map(|p| {
|
||||
let path = &p.file.path;
|
||||
p.external_links.iter().map(move |l| (path.clone(), l))
|
||||
})
|
||||
.flatten();
|
||||
let section_links = library
|
||||
.sections()
|
||||
.values()
|
||||
.map(|p| {
|
||||
let path = &p.file.path;
|
||||
p.external_links.iter().map(move |l| (path.clone(), l))
|
||||
})
|
||||
.flatten();
|
||||
let all_links = page_links.chain(section_links).collect::<Vec<_>>();
|
||||
|
||||
let mut all_links: Vec<(PathBuf, String, String)> = vec![];
|
||||
|
||||
for p in library.pages_values().into_iter() {
|
||||
for external_link in p.clone().external_links.into_iter() {
|
||||
let domain = get_link_domain(&external_link)?;
|
||||
all_links.push((p.file.path.clone(), external_link, domain));
|
||||
}
|
||||
}
|
||||
|
||||
for s in library.sections_values().into_iter() {
|
||||
for external_link in s.clone().external_links.into_iter() {
|
||||
let domain = get_link_domain(&external_link)?;
|
||||
all_links.push((s.file.path.clone(), external_link, domain));
|
||||
}
|
||||
}
|
||||
|
||||
println!("Checking {} external link(s).", all_links.len());
|
||||
|
||||
let mut links_by_domain: HashMap<String, Vec<(PathBuf, String)>> = HashMap::new();
|
||||
|
||||
for link in all_links.iter() {
|
||||
links_by_domain.entry(link.2.to_string()).or_insert(Vec::new());
|
||||
// Insert content path and link under the domain key
|
||||
links_by_domain
|
||||
.get_mut(&link.2.to_string())
|
||||
.unwrap()
|
||||
.push((link.0.clone(), link.1.clone()));
|
||||
}
|
||||
|
||||
if all_links.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// create thread pool with lots of threads so we can fetch
|
||||
// (almost) all pages simultaneously
|
||||
let threads = std::cmp::min(all_links.len(), 32);
|
||||
// (almost) all pages simultaneously, limiting all links for a single
|
||||
// domain to one thread to avoid rate-limiting
|
||||
let threads = std::cmp::min(links_by_domain.len(), 8);
|
||||
let pool = rayon::ThreadPoolBuilder::new()
|
||||
.num_threads(threads)
|
||||
.build()
|
||||
.map_err(|e| Error { kind: ErrorKind::Msg(e.to_string()), source: None })?;
|
||||
|
||||
let errors: Vec<_> = pool.install(|| {
|
||||
all_links
|
||||
let errors = pool.install(|| {
|
||||
links_by_domain
|
||||
.par_iter()
|
||||
.filter_map(|(page_path, link)| {
|
||||
.map(|(_domain, links)| {
|
||||
let mut links_to_process = links.len();
|
||||
links
|
||||
.into_iter()
|
||||
.filter_map(move |(page_path, link)| {
|
||||
links_to_process -= 1;
|
||||
|
||||
if site
|
||||
.config
|
||||
.link_checker
|
||||
|
@ -137,14 +169,24 @@ pub fn check_external_links(site: &Site) -> Result<()> {
|
|||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let res = link_checker::check_url(&link, &site.config.link_checker);
|
||||
|
||||
if links_to_process > 0 {
|
||||
// Prevent rate-limiting, wait before next crawl unless we're done with this domain
|
||||
thread::sleep(time::Duration::from_millis(500));
|
||||
}
|
||||
|
||||
if link_checker::is_valid(&res) {
|
||||
None
|
||||
} else {
|
||||
Some((page_path, link, res))
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.flatten()
|
||||
.collect::<Vec<_>>()
|
||||
});
|
||||
|
||||
println!("> Checked {} external link(s): {} error(s) found.", all_links.len(), errors.len());
|
||||
|
@ -165,5 +207,6 @@ pub fn check_external_links(site: &Site) -> Result<()> {
|
|||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
|
||||
Err(Error { kind: ErrorKind::Msg(msg), source: None })
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue