zola/components/site/src/link_checking.rs


use rayon::prelude::*;

use crate::Site;
use core::time;
use errors::{bail, Error, ErrorKind, Result};
use std::{collections::HashMap, path::PathBuf, thread};
use url::Url;

/// Check whether all internal links pointing to explicit anchor fragments are valid.
///
/// This is very similar to `check_external_links`, although internal link checking
/// is always performed, while external links are only checked by `zola check`.
pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
    println!("Checking all internal links with anchors.");
    let library = site.library.write().expect("Get lock for check_internal_links_with_anchors");

    // Chain all internal links, from both sections and pages.
    let page_links = library.pages().values().flat_map(|p| {
        let path = &p.file.path;
        p.internal_links.iter().map(move |l| (path.clone(), l))
    });
    let section_links = library.sections().values().flat_map(|s| {
        let path = &s.file.path;
        s.internal_links.iter().map(move |l| (path.clone(), l))
    });
    let all_links = page_links.chain(section_links);
    // Only keep links with anchor fragments, and count them too.
    // Bare files have already been checked elsewhere, thus they are not interesting here.
    let mut anchors_total = 0usize;
    let links_with_anchors = all_links
        .filter_map(|(page_path, link)| match link {
            (md_path, Some(anchor)) => Some((page_path, md_path, anchor)),
            _ => None,
        })
        .inspect(|_| anchors_total = anchors_total.saturating_add(1));
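    // NOTE: `inspect` runs lazily, so `anchors_total` only reaches its final
    // value once the iterator chain is fully drained by the `collect` below.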
    // Check for the targets' existence (including anchors), then keep only the
    // faulty entries for error reporting purposes.
    let missing_targets = links_with_anchors.filter(|(_, md_path, anchor)| {
        // There are a few `expect`s here since the presence of the .md file
        // will already have been checked during markdown rendering.
        let mut full_path = site.base_path.clone();
        full_path.push("content");
        for part in md_path.split('/') {
            full_path.push(part);
        }

        if md_path.contains("_index.md") {
            let section = library
                .get_section(&full_path)
                .expect("Couldn't find section in check_internal_links_with_anchors");
            !section.has_anchor(&anchor)
        } else {
            let page = library
                .get_page(&full_path)
                .expect("Couldn't find page in check_internal_links_with_anchors");
            !page.has_anchor(&anchor)
        }
    });
    // Format faulty entries into error messages, and collect them.
    let errors = missing_targets
        .map(|(page_path, md_path, anchor)| {
            format!(
                "The anchor in the link `@/{}#{}` in {} does not exist.",
                md_path,
                anchor,
                page_path.to_string_lossy(),
            )
        })
        .collect::<Vec<_>>();
    // Finally emit a summary, and return the overall anchor-checking result.
    match errors.len() {
        0 => {
            println!("> Successfully checked {} internal link(s) with anchors.", anchors_total);
            Ok(())
        }
        errors_total => {
            println!(
                "> Checked {} internal link(s) with anchors: {} target(s) missing.",
                anchors_total, errors_total,
            );
            Err(Error { kind: ErrorKind::Msg(errors.join("\n")), source: None })
        }
    }
}
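
/// Extract the domain (host) of an external link, e.g. `example.com` from
/// `https://example.com/page`; relative links and host-less URLs (such as
/// `mailto:` links) are errors.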
fn get_link_domain(link: &str) -> Result<String> {
    match Url::parse(link) {
        Ok(url) => match url.host_str().map(String::from) {
            Some(domain_str) => Ok(domain_str),
            None => bail!("could not parse domain `{}` from link", link),
        },
        Err(err) => bail!("could not parse domain `{}` from link: `{}`", link, err),
    }
}
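
/// Check all external links found in pages and sections, grouped by domain.
/// Unlike internal links, these are only checked by `zola check`. Note that a
/// single link whose domain cannot be parsed aborts the whole check early.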
pub fn check_external_links(site: &Site) -> Result<()> {
    let library = site.library.write().expect("Get lock for check_external_links");

    let mut all_links: Vec<(PathBuf, String, String)> = vec![];
    for p in library.pages_values().into_iter() {
        for external_link in p.external_links.iter() {
            let domain = get_link_domain(external_link)?;
            all_links.push((p.file.path.clone(), external_link.clone(), domain));
        }
    }
    for s in library.sections_values().into_iter() {
        for external_link in s.external_links.iter() {
            let domain = get_link_domain(external_link)?;
            all_links.push((s.file.path.clone(), external_link.clone(), domain));
        }
    }
println!("Checking {} external link(s).", all_links.len());
let mut links_by_domain: HashMap<String, Vec<(PathBuf, String)>> = HashMap::new();
for link in all_links.iter() {
links_by_domain.entry(link.2.to_string()).or_insert(Vec::new());
// Insert content path and link under the domain key
links_by_domain
.get_mut(&link.2.to_string())
.unwrap()
.push((link.0.clone(), link.1.clone()));
}
if all_links.is_empty() {
return Ok(());
}
    // Create a thread pool with up to 8 threads so we can check several
    // domains simultaneously, while all links for a single domain stay on
    // one thread to avoid rate-limiting.
    let threads = std::cmp::min(links_by_domain.len(), 8);
    let pool = rayon::ThreadPoolBuilder::new()
        .num_threads(threads)
        .build()
        .map_err(|e| Error { kind: ErrorKind::Msg(e.to_string()), source: None })?;
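    // `install` runs the closure inside the pool, so the `par_iter` below is
    // scheduled on at most `threads` workers, with one domain's link list
    // handled sequentially within each task.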
    let errors = pool.install(|| {
        links_by_domain
            .par_iter()
            .map(|(_domain, links)| {
                let mut links_to_process = links.len();
                links
                    .iter()
                    .filter_map(move |(page_path, link)| {
                        links_to_process -= 1;

                        if site
                            .config
                            .link_checker
                            .skip_prefixes
                            .iter()
                            .any(|prefix| link.starts_with(prefix))
                        {
                            return None;
                        }

                        let res = link_checker::check_url(link, &site.config.link_checker);

                        if links_to_process > 0 {
                            // Prevent rate-limiting: wait before the next request,
                            // unless we're done with this domain.
                            thread::sleep(time::Duration::from_millis(500));
                        }

                        if link_checker::is_valid(&res) {
                            None
                        } else {
                            Some((page_path, link, res))
                        }
                    })
                    .collect::<Vec<_>>()
            })
            .flatten()
            .collect::<Vec<_>>()
    });
println!("> Checked {} external link(s): {} error(s) found.", all_links.len(), errors.len());
if errors.is_empty() {
return Ok(());
}
let msg = errors
.into_iter()
.map(|(page_path, link, check_res)| {
format!(
"Dead link in {} to {}: {}",
page_path.to_string_lossy(),
link,
link_checker::message(&check_res)
)
})
.collect::<Vec<_>>()
.join("\n");
Err(Error { kind: ErrorKind::Msg(msg), source: None })
}