From 3346439a32854d5dd4f2ff87465c47fba2407d32 Mon Sep 17 00:00:00 2001 From: Luca Bruno Date: Wed, 21 Apr 2021 19:13:11 +0000 Subject: [PATCH] rendering: keep track of all internal links (#1424) This updates rendered markdown structures in order to keep track of all internal links, no longer limited to those targeting an explicit anchor fragment. The goal of this rework is to allow building other features, such as backlinks, on top of the existing collection of internal links. --- components/library/src/content/page.rs | 13 ++- components/library/src/content/section.rs | 13 ++- components/rendering/src/markdown.rs | 17 ++-- components/site/src/link_checking.rs | 119 +++++++++++----------- components/utils/src/site.rs | 40 ++++---- 5 files changed, 98 insertions(+), 104 deletions(-) diff --git a/components/library/src/content/page.rs b/components/library/src/content/page.rs index 216de4ce..9a37ecc8 100644 --- a/components/library/src/content/page.rs +++ b/components/library/src/content/page.rs @@ -82,12 +82,11 @@ pub struct Page { pub lang: String, /// Contains all the translated version of that page pub translations: Vec, - /// Contains the internal links that have an anchor: we can only check the anchor - /// after all pages have been built and their ToC compiled. The page itself should exist otherwise - /// it would have errored before getting there - /// (path to markdown, anchor value) - pub internal_links_with_anchors: Vec<(String, String)>, - /// Contains the external links that need to be checked + /// The list of all internal links (as path to markdown file), with optional anchor fragments. + /// We can only check the anchor after all pages have been built and their ToC compiled. + /// The page itself should exist otherwise it would have errored before getting there. + pub internal_links: Vec<(String, Option)>, + /// The list of all links to external webpages. They can be validated by the `link_checker`. 
pub external_links: Vec, } @@ -268,7 +267,7 @@ impl Page { self.content = res.body; self.toc = res.toc; self.external_links = res.external_links; - self.internal_links_with_anchors = res.internal_links_with_anchors; + self.internal_links = res.internal_links; Ok(()) } diff --git a/components/library/src/content/section.rs b/components/library/src/content/section.rs index 1b284cca..3d4fb6fd 100644 --- a/components/library/src/content/section.rs +++ b/components/library/src/content/section.rs @@ -56,12 +56,11 @@ pub struct Section { /// The language of that section. Equal to the default lang if the user doesn't setup `languages` in config. /// Corresponds to the lang in the _index.{lang}.md file scheme pub lang: String, - /// Contains the internal links that have an anchor: we can only check the anchor - /// after all pages have been built and their ToC compiled. The page itself should exist otherwise - /// it would have errored before getting there - /// (path to markdown, anchor value) - pub internal_links_with_anchors: Vec<(String, String)>, - /// Contains the external links that need to be checked + /// The list of all internal links (as path to markdown file), with optional anchor fragments. + /// We can only check the anchor after all pages have been built and their ToC compiled. + /// The page itself should exist otherwise it would have errored before getting there. + pub internal_links: Vec<(String, Option)>, + /// The list of all links to external webpages. They can be validated by the `link_checker`. 
pub external_links: Vec, } @@ -186,7 +185,7 @@ impl Section { self.content = res.body; self.toc = res.toc; self.external_links = res.external_links; - self.internal_links_with_anchors = res.internal_links_with_anchors; + self.internal_links = res.internal_links; Ok(()) } diff --git a/components/rendering/src/markdown.rs b/components/rendering/src/markdown.rs index 731b2c7b..5929e1fe 100644 --- a/components/rendering/src/markdown.rs +++ b/components/rendering/src/markdown.rs @@ -26,7 +26,9 @@ pub struct Rendered { pub body: String, pub summary_len: Option, pub toc: Vec, - pub internal_links_with_anchors: Vec<(String, String)>, + /// Links to site-local pages: relative path plus optional anchor target. + pub internal_links: Vec<(String, Option)>, + /// Outgoing links to external webpages (i.e. HTTP(S) targets). pub external_links: Vec, } @@ -93,7 +95,7 @@ fn fix_link( link_type: LinkType, link: &str, context: &RenderContext, - internal_links_with_anchors: &mut Vec<(String, String)>, + internal_links: &mut Vec<(String, Option)>, external_links: &mut Vec, ) -> Result { if link_type == LinkType::Email { @@ -107,10 +109,7 @@ fn fix_link( let result = if link.starts_with("@/") { match resolve_internal_link(&link, &context.permalinks) { Ok(resolved) => { - if resolved.anchor.is_some() { - internal_links_with_anchors - .push((resolved.md_path.unwrap(), resolved.anchor.unwrap())); - } + internal_links.push((resolved.md_path, resolved.anchor)); resolved.permalink } Err(_) => { @@ -175,7 +174,7 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> Result = vec![]; let mut headings: Vec = vec![]; - let mut internal_links_with_anchors = Vec::new(); + let mut internal_links = Vec::new(); let mut external_links = Vec::new(); let mut opts = Options::empty(); @@ -294,7 +293,7 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> Result fixed_link, @@ -429,7 +428,7 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> Result Result<()> 
{ + println!("Checking all internal links with anchors."); let library = site.library.write().expect("Get lock for check_internal_links_with_anchors"); + + // Chain all internal links, from both sections and pages. let page_links = library .pages() .values() .map(|p| { let path = &p.file.path; - p.internal_links_with_anchors.iter().map(move |l| (path.clone(), l)) + p.internal_links.iter().map(move |l| (path.clone(), l)) }) .flatten(); let section_links = library @@ -20,67 +25,46 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> { .values() .map(|p| { let path = &p.file.path; - p.internal_links_with_anchors.iter().map(move |l| (path.clone(), l)) + p.internal_links.iter().map(move |l| (path.clone(), l)) }) .flatten(); - let all_links = page_links.chain(section_links).collect::>(); + let all_links = page_links.chain(section_links); - if site.config.is_in_check_mode() { - println!("Checking {} internal link(s) with an anchor.", all_links.len()); - } - - if all_links.is_empty() { - return Ok(()); - } - - let mut full_path = site.base_path.clone(); - full_path.push("content"); - - let errors: Vec<_> = all_links - .iter() - .filter_map(|(page_path, (md_path, anchor))| { - // There are a few `expect` here since the presence of the .md file will - // already have been checked in the markdown rendering - let mut p = full_path.clone(); - for part in md_path.split('/') { - p.push(part); - } - if md_path.contains("_index.md") { - let section = library - .get_section(&p) - .expect("Couldn't find section in check_internal_links_with_anchors"); - if section.has_anchor(&anchor) { - None - } else { - Some((page_path, md_path, anchor)) - } - } else { - let page = library - .get_page(&p) - .expect("Couldn't find section in check_internal_links_with_anchors"); - if page.has_anchor(&anchor) { - None - } else { - Some((page_path, md_path, anchor)) - } - } + // Only keep links with anchor fragments, and count them too. 
+ // Bare files have already been checked elsewhere, thus they are not interesting here. + let mut anchors_total = 0usize; + let links_with_anchors = all_links + .filter_map(|(page_path, link)| match link { + (md_path, Some(anchor)) => Some((page_path, md_path, anchor)), + _ => None, }) - .collect(); + .inspect(|_| anchors_total = anchors_total.saturating_add(1)); - if site.config.is_in_check_mode() { - println!( - "> Checked {} internal link(s) with an anchor: {} error(s) found.", - all_links.len(), - errors.len() - ); - } + // Check for targets existence (including anchors), then keep only the faulty + // entries for error reporting purposes. + let missing_targets = links_with_anchors.filter(|(_, md_path, anchor)| { + // There are a few `expect` here since the presence of the .md file will + // already have been checked in the markdown rendering + let mut full_path = site.base_path.clone(); + full_path.push("content"); + for part in md_path.split('/') { + full_path.push(part); + } + if md_path.contains("_index.md") { + let section = library + .get_section(&full_path) + .expect("Couldn't find section in check_internal_links_with_anchors"); + !section.has_anchor(&anchor) + } else { + let page = library + .get_page(&full_path) + .expect("Couldn't find section in check_internal_links_with_anchors"); + !page.has_anchor(&anchor) + } + }); - if errors.is_empty() { - return Ok(()); - } - - let msg = errors - .into_iter() + // Format faulty entries into error messages, and collect them. + let errors = missing_targets .map(|(page_path, md_path, anchor)| { format!( "The anchor in the link `@/{}#{}` in {} does not exist.", @@ -89,9 +73,22 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> { page_path.to_string_lossy(), ) }) - .collect::>() - .join("\n"); - Err(Error { kind: ErrorKind::Msg(msg), source: None }) + .collect::>(); + + // Finally emit a summary, and return overall anchors-checking result. 
+ match errors.len() { + 0 => { + println!("> Succesfully checked {} internal link(s) with anchors.", anchors_total); + Ok(()) + } + errors_total => { + println!( + "> Checked {} internal link(s) with anchors: {} target(s) missing.", + anchors_total, errors_total, + ); + Err(Error { kind: ErrorKind::Msg(errors.join("\n")), source: None }) + } + } } pub fn check_external_links(site: &Site) -> Result<()> { diff --git a/components/utils/src/site.rs b/components/utils/src/site.rs index 26e0f8ac..d49e559a 100644 --- a/components/utils/src/site.rs +++ b/components/utils/src/site.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use std::hash::BuildHasher; use unicode_segmentation::UnicodeSegmentation; -use errors::{bail, Result}; +use errors::Result; /// Get word count and estimated reading time pub fn get_reading_analytics(content: &str) -> (usize, usize) { @@ -14,12 +14,15 @@ pub fn get_reading_analytics(content: &str) -> (usize, usize) { (word_count, ((word_count + 199) / 200)) } +/// Result of a successful resolution of an internal link. #[derive(Debug, PartialEq, Clone)] pub struct ResolvedInternalLink { + /// Resolved link target, as absolute URL address. pub permalink: String, - // The 2 fields below are only set when there is an anchor - // as we will need that to check if it exists after the markdown rendering is done - pub md_path: Option, + /// Internal path to the .md file, without the leading `@/`. + pub md_path: String, + /// Optional anchor target. + /// We can check whether it exists only after all the markdown rendering is done. 
pub anchor: Option, } @@ -36,20 +39,17 @@ pub fn resolve_internal_link( let parts = clean_link.split('#').collect::>(); // If we have slugification turned off, we might end up with some escaped characters so we need // to decode them first - let decoded = &*percent_decode(parts[0].as_bytes()).decode_utf8_lossy(); - match permalinks.get(decoded) { - Some(p) => { - if parts.len() > 1 { - Ok(ResolvedInternalLink { - permalink: format!("{}#{}", p, parts[1]), - md_path: Some(decoded.to_string()), - anchor: Some(parts[1].to_string()), - }) - } else { - Ok(ResolvedInternalLink { permalink: p.to_string(), md_path: None, anchor: None }) - } - } - None => bail!(format!("Relative link {} not found.", link)), + let decoded = percent_decode(parts[0].as_bytes()).decode_utf8_lossy().to_string(); + let target = + permalinks.get(&decoded).ok_or_else(|| format!("Relative link {} not found.", link))?; + if parts.len() > 1 { + Ok(ResolvedInternalLink { + permalink: format!("{}#{}", target, parts[1]), + md_path: decoded, + anchor: Some(parts[1].to_string()), + }) + } else { + Ok(ResolvedInternalLink { permalink: target.to_string(), md_path: decoded, anchor: None }) } } @@ -81,7 +81,7 @@ mod tests { permalinks.insert("pages/about.md".to_string(), "https://vincent.is/about".to_string()); let res = resolve_internal_link("@/pages/about.md#hello", &permalinks).unwrap(); assert_eq!(res.permalink, "https://vincent.is/about#hello"); - assert_eq!(res.md_path, Some("pages/about.md".to_string())); + assert_eq!(res.md_path, "pages/about.md".to_string()); assert_eq!(res.anchor, Some("hello".to_string())); } @@ -94,7 +94,7 @@ mod tests { ); let res = resolve_internal_link("@/pages/about%20space.md#hello", &permalinks).unwrap(); assert_eq!(res.permalink, "https://vincent.is/about%20space/#hello"); - assert_eq!(res.md_path, Some("pages/about space.md".to_string())); + assert_eq!(res.md_path, "pages/about space.md".to_string()); assert_eq!(res.anchor, Some("hello".to_string())); }