From 47b920777ab732a0e20de2207127339331a7992a Mon Sep 17 00:00:00 2001
From: Stanislas
Date: Wed, 21 Apr 2021 21:13:38 +0200
Subject: [PATCH] link_checking: prevent rate-limiting (#1421)

* link_checking: prevent rate-limiting

Fix for https://github.com/getzola/zola/issues/1056.

- assign all links for a domain to the same thread
- reduce number of threads from 32 to 8
- add sleep between HTTP calls

* Add get_link_domain(), use for loops

* Do not sleep after last link for domain

* Avoid quadratic complexity

* remove prints
---
 Cargo.lock                           |   1 +
 components/site/Cargo.toml           |   1 +
 components/site/src/link_checking.rs | 121 ++++++++++++++++++---------
 3 files changed, 84 insertions(+), 39 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 6c72a084..b7984b7e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2449,6 +2449,7 @@ dependencies = [
  "tempfile",
  "templates",
  "tera",
+ "url",
  "utils",
  "walkdir",
 ]
diff --git a/components/site/Cargo.toml b/components/site/Cargo.toml
index eb90d9c8..0a69fd43 100644
--- a/components/site/Cargo.toml
+++ b/components/site/Cargo.toml
@@ -16,6 +16,7 @@ sass-rs = "0.2"
 lazy_static = "1.1"
 relative-path = "1"
 slotmap = "0.4"
+url = "2"
 
 errors = { path = "../errors" }
 config = { path = "../config" }
diff --git a/components/site/src/link_checking.rs b/components/site/src/link_checking.rs
index 46c34389..4774afa2 100644
--- a/components/site/src/link_checking.rs
+++ b/components/site/src/link_checking.rs
@@ -1,7 +1,11 @@
 use rayon::prelude::*;
 
 use crate::Site;
-use errors::{Error, ErrorKind, Result};
+use core::time;
+use errors::{bail, Result};
+use errors::{Error, ErrorKind};
+use std::{collections::HashMap, path::PathBuf, thread};
+use url::Url;
 
 /// Check whether all internal links pointing to explicit anchor fragments are valid.
 ///
@@ -91,60 +95,98 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
     }
 }
 
+fn get_link_domain(link: &str) -> Result<String> {
+    return match Url::parse(&link) {
+        Ok(url) => match url.host_str().map(String::from) {
+            Some(domain_str) => Ok(domain_str),
+            None => bail!("could not parse domain `{}` from link", link),
+        },
+        Err(err) => bail!("could not parse domain `{}` from link: `{}`", link, err),
+    };
+}
+
 pub fn check_external_links(site: &Site) -> Result<()> {
     let library = site.library.write().expect("Get lock for check_external_links");
-    let page_links = library
-        .pages()
-        .values()
-        .map(|p| {
-            let path = &p.file.path;
-            p.external_links.iter().map(move |l| (path.clone(), l))
-        })
-        .flatten();
-    let section_links = library
-        .sections()
-        .values()
-        .map(|p| {
-            let path = &p.file.path;
-            p.external_links.iter().map(move |l| (path.clone(), l))
-        })
-        .flatten();
-    let all_links = page_links.chain(section_links).collect::<Vec<_>>();
+
+    let mut all_links: Vec<(PathBuf, String, String)> = vec![];
+
+    for p in library.pages_values().into_iter() {
+        for external_link in p.clone().external_links.into_iter() {
+            let domain = get_link_domain(&external_link)?;
+            all_links.push((p.file.path.clone(), external_link, domain));
+        }
+    }
+
+    for s in library.sections_values().into_iter() {
+        for external_link in s.clone().external_links.into_iter() {
+            let domain = get_link_domain(&external_link)?;
+            all_links.push((s.file.path.clone(), external_link, domain));
+        }
+    }
+
     println!("Checking {} external link(s).", all_links.len());
 
+    let mut links_by_domain: HashMap<String, Vec<(PathBuf, String)>> = HashMap::new();
+
+    for link in all_links.iter() {
+        links_by_domain.entry(link.2.to_string()).or_insert(Vec::new());
+        // Insert content path and link under the domain key
+        links_by_domain
+            .get_mut(&link.2.to_string())
+            .unwrap()
+            .push((link.0.clone(), link.1.clone()));
+    }
+
     if all_links.is_empty() {
         return Ok(());
     }
 
     // create thread pool with lots of threads so we can fetch
-    // (almost) all pages simultaneously
-    let threads = std::cmp::min(all_links.len(), 32);
+    // (almost) all pages simultaneously, limiting all links for a single
+    // domain to one thread to avoid rate-limiting
+    let threads = std::cmp::min(links_by_domain.len(), 8);
     let pool = rayon::ThreadPoolBuilder::new()
         .num_threads(threads)
         .build()
         .map_err(|e| Error { kind: ErrorKind::Msg(e.to_string()), source: None })?;
 
-    let errors: Vec<_> = pool.install(|| {
-        all_links
+    let errors = pool.install(|| {
+        links_by_domain
             .par_iter()
-            .filter_map(|(page_path, link)| {
-                if site
-                    .config
-                    .link_checker
-                    .skip_prefixes
-                    .iter()
-                    .any(|prefix| link.starts_with(prefix))
-                {
-                    return None;
-                }
-                let res = link_checker::check_url(&link, &site.config.link_checker);
-                if link_checker::is_valid(&res) {
-                    None
-                } else {
-                    Some((page_path, link, res))
-                }
+            .map(|(_domain, links)| {
+                let mut links_to_process = links.len();
+                links
+                    .into_iter()
+                    .filter_map(move |(page_path, link)| {
+                        links_to_process -= 1;
+
+                        if site
+                            .config
+                            .link_checker
+                            .skip_prefixes
+                            .iter()
+                            .any(|prefix| link.starts_with(prefix))
+                        {
+                            return None;
+                        }
+
+                        let res = link_checker::check_url(&link, &site.config.link_checker);
+
+                        if links_to_process > 0 {
+                            // Prevent rate-limiting, wait before next crawl unless we're done with this domain
+                            thread::sleep(time::Duration::from_millis(500));
+                        }
+
+                        if link_checker::is_valid(&res) {
+                            None
+                        } else {
+                            Some((page_path, link, res))
+                        }
+                    })
+                    .collect::<Vec<_>>()
             })
-            .collect()
+            .flatten()
+            .collect::<Vec<_>>()
     });
 
     println!("> Checked {} external link(s): {} error(s) found.", all_links.len(), errors.len());
@@ -165,5 +207,6 @@ pub fn check_external_links(site: &Site) -> Result<()> {
         })
         .collect::<Vec<_>>()
         .join("\n");
+
     Err(Error { kind: ErrorKind::Msg(msg), source: None })
 }
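
Note (illustration, not part of the patch): the strategy above is to group links
by domain, check each domain's links sequentially on a single thread, and sleep
between requests to the same domain, skipping the sleep after that domain's last
link. The following is a minimal, std-only Rust sketch of that idea. It is not
Zola's code: check_url() is a hypothetical stub standing in for
link_checker::check_url(), naive string splitting replaces the url crate, and
one OS thread per domain replaces the capped rayon pool.

use std::collections::HashMap;
use std::thread;
use std::time::Duration;

// Hypothetical stub; a real implementation would issue an HTTP request.
fn check_url(url: &str) -> bool {
    println!("checking {}", url);
    true
}

fn main() {
    let links = vec![
        "https://example.com/a".to_string(),
        "https://example.com/b".to_string(),
        "https://example.org/c".to_string(),
    ];

    // Group links by host so a single domain is never hit concurrently.
    // (The patch parses hosts with url::Url; this split is a crude stand-in.)
    let mut by_domain: HashMap<String, Vec<String>> = HashMap::new();
    for link in links {
        let host = link.split('/').nth(2).unwrap_or("").to_string();
        by_domain.entry(host).or_default().push(link);
    }

    // One thread per domain: links within a domain run sequentially,
    // while domains run in parallel (the patch caps parallelism at 8).
    let handles: Vec<_> = by_domain
        .into_values()
        .map(|urls| {
            thread::spawn(move || {
                let mut remaining = urls.len();
                for url in urls {
                    remaining -= 1;
                    check_url(&url);
                    // Pause before the next request to the same domain,
                    // unless this was the domain's last link.
                    if remaining > 0 {
                        thread::sleep(Duration::from_millis(500));
                    }
                }
            })
        })
        .collect();

    for handle in handles {
        handle.join().unwrap();
    }
}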