rendering: keep track of all internal links (#1424)

This updates the rendered markdown structures in order to keep track
of all internal links, no longer limiting them to only those targeting
an explicit anchor fragment.
The goal of this rework is to allow building other features, such
as backlinks, on top of the existing collection of internal links.
This commit is contained in:
Luca Bruno 2021-04-21 19:13:11 +00:00 committed by GitHub
parent 4f7b960985
commit 3346439a32
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 98 additions and 104 deletions

View file

@ -82,12 +82,11 @@ pub struct Page {
pub lang: String, pub lang: String,
/// Contains all the translated version of that page /// Contains all the translated version of that page
pub translations: Vec<DefaultKey>, pub translations: Vec<DefaultKey>,
/// Contains the internal links that have an anchor: we can only check the anchor /// The list of all internal links (as path to markdown file), with optional anchor fragments.
/// after all pages have been built and their ToC compiled. The page itself should exist otherwise /// We can only check the anchor after all pages have been built and their ToC compiled.
/// it would have errored before getting there /// The page itself should exist otherwise it would have errored before getting there.
/// (path to markdown, anchor value) pub internal_links: Vec<(String, Option<String>)>,
pub internal_links_with_anchors: Vec<(String, String)>, /// The list of all links to external webpages. They can be validated by the `link_checker`.
/// Contains the external links that need to be checked
pub external_links: Vec<String>, pub external_links: Vec<String>,
} }
@ -268,7 +267,7 @@ impl Page {
self.content = res.body; self.content = res.body;
self.toc = res.toc; self.toc = res.toc;
self.external_links = res.external_links; self.external_links = res.external_links;
self.internal_links_with_anchors = res.internal_links_with_anchors; self.internal_links = res.internal_links;
Ok(()) Ok(())
} }

View file

@ -56,12 +56,11 @@ pub struct Section {
/// The language of that section. Equal to the default lang if the user doesn't setup `languages` in config. /// The language of that section. Equal to the default lang if the user doesn't setup `languages` in config.
/// Corresponds to the lang in the _index.{lang}.md file scheme /// Corresponds to the lang in the _index.{lang}.md file scheme
pub lang: String, pub lang: String,
/// Contains the internal links that have an anchor: we can only check the anchor /// The list of all internal links (as path to markdown file), with optional anchor fragments.
/// after all pages have been built and their ToC compiled. The page itself should exist otherwise /// We can only check the anchor after all pages have been built and their ToC compiled.
/// it would have errored before getting there /// The page itself should exist otherwise it would have errored before getting there.
/// (path to markdown, anchor value) pub internal_links: Vec<(String, Option<String>)>,
pub internal_links_with_anchors: Vec<(String, String)>, /// The list of all links to external webpages. They can be validated by the `link_checker`.
/// Contains the external links that need to be checked
pub external_links: Vec<String>, pub external_links: Vec<String>,
} }
@ -186,7 +185,7 @@ impl Section {
self.content = res.body; self.content = res.body;
self.toc = res.toc; self.toc = res.toc;
self.external_links = res.external_links; self.external_links = res.external_links;
self.internal_links_with_anchors = res.internal_links_with_anchors; self.internal_links = res.internal_links;
Ok(()) Ok(())
} }

View file

@ -26,7 +26,9 @@ pub struct Rendered {
pub body: String, pub body: String,
pub summary_len: Option<usize>, pub summary_len: Option<usize>,
pub toc: Vec<Heading>, pub toc: Vec<Heading>,
pub internal_links_with_anchors: Vec<(String, String)>, /// Links to site-local pages: relative path plus optional anchor target.
pub internal_links: Vec<(String, Option<String>)>,
/// Outgoing links to external webpages (i.e. HTTP(S) targets).
pub external_links: Vec<String>, pub external_links: Vec<String>,
} }
@ -93,7 +95,7 @@ fn fix_link(
link_type: LinkType, link_type: LinkType,
link: &str, link: &str,
context: &RenderContext, context: &RenderContext,
internal_links_with_anchors: &mut Vec<(String, String)>, internal_links: &mut Vec<(String, Option<String>)>,
external_links: &mut Vec<String>, external_links: &mut Vec<String>,
) -> Result<String> { ) -> Result<String> {
if link_type == LinkType::Email { if link_type == LinkType::Email {
@ -107,10 +109,7 @@ fn fix_link(
let result = if link.starts_with("@/") { let result = if link.starts_with("@/") {
match resolve_internal_link(&link, &context.permalinks) { match resolve_internal_link(&link, &context.permalinks) {
Ok(resolved) => { Ok(resolved) => {
if resolved.anchor.is_some() { internal_links.push((resolved.md_path, resolved.anchor));
internal_links_with_anchors
.push((resolved.md_path.unwrap(), resolved.anchor.unwrap()));
}
resolved.permalink resolved.permalink
} }
Err(_) => { Err(_) => {
@ -175,7 +174,7 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> Result<Render
let mut inserted_anchors: Vec<String> = vec![]; let mut inserted_anchors: Vec<String> = vec![];
let mut headings: Vec<Heading> = vec![]; let mut headings: Vec<Heading> = vec![];
let mut internal_links_with_anchors = Vec::new(); let mut internal_links = Vec::new();
let mut external_links = Vec::new(); let mut external_links = Vec::new();
let mut opts = Options::empty(); let mut opts = Options::empty();
@ -294,7 +293,7 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> Result<Render
link_type, link_type,
&link, &link,
context, context,
&mut internal_links_with_anchors, &mut internal_links,
&mut external_links, &mut external_links,
) { ) {
Ok(fixed_link) => fixed_link, Ok(fixed_link) => fixed_link,
@ -429,7 +428,7 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> Result<Render
summary_len: if has_summary { html.find(CONTINUE_READING) } else { None }, summary_len: if has_summary { html.find(CONTINUE_READING) } else { None },
body: html, body: html,
toc: make_table_of_contents(headings), toc: make_table_of_contents(headings),
internal_links_with_anchors, internal_links,
external_links, external_links,
}) })
} }

View file

@ -3,16 +3,21 @@ use rayon::prelude::*;
use crate::Site; use crate::Site;
use errors::{Error, ErrorKind, Result}; use errors::{Error, ErrorKind, Result};
/// Very similar to check_external_links but can't be merged as far as I can see since we always /// Check whether all internal links pointing to explicit anchor fragments are valid.
/// want to check the internal links but only the external in zola check :/ ///
/// This is very similar to `check_external_links`, although internal links checking
/// is always performed (while external ones only conditionally in `zola check`).
pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> { pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
println!("Checking all internal links with anchors.");
let library = site.library.write().expect("Get lock for check_internal_links_with_anchors"); let library = site.library.write().expect("Get lock for check_internal_links_with_anchors");
// Chain all internal links, from both sections and pages.
let page_links = library let page_links = library
.pages() .pages()
.values() .values()
.map(|p| { .map(|p| {
let path = &p.file.path; let path = &p.file.path;
p.internal_links_with_anchors.iter().map(move |l| (path.clone(), l)) p.internal_links.iter().map(move |l| (path.clone(), l))
}) })
.flatten(); .flatten();
let section_links = library let section_links = library
@ -20,67 +25,46 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
.values() .values()
.map(|p| { .map(|p| {
let path = &p.file.path; let path = &p.file.path;
p.internal_links_with_anchors.iter().map(move |l| (path.clone(), l)) p.internal_links.iter().map(move |l| (path.clone(), l))
}) })
.flatten(); .flatten();
let all_links = page_links.chain(section_links).collect::<Vec<_>>(); let all_links = page_links.chain(section_links);
if site.config.is_in_check_mode() { // Only keep links with anchor fragments, and count them too.
println!("Checking {} internal link(s) with an anchor.", all_links.len()); // Bare files have already been checked elsewhere, thus they are not interesting here.
} let mut anchors_total = 0usize;
let links_with_anchors = all_links
if all_links.is_empty() { .filter_map(|(page_path, link)| match link {
return Ok(()); (md_path, Some(anchor)) => Some((page_path, md_path, anchor)),
} _ => None,
let mut full_path = site.base_path.clone();
full_path.push("content");
let errors: Vec<_> = all_links
.iter()
.filter_map(|(page_path, (md_path, anchor))| {
// There are a few `expect` here since the presence of the .md file will
// already have been checked in the markdown rendering
let mut p = full_path.clone();
for part in md_path.split('/') {
p.push(part);
}
if md_path.contains("_index.md") {
let section = library
.get_section(&p)
.expect("Couldn't find section in check_internal_links_with_anchors");
if section.has_anchor(&anchor) {
None
} else {
Some((page_path, md_path, anchor))
}
} else {
let page = library
.get_page(&p)
.expect("Couldn't find section in check_internal_links_with_anchors");
if page.has_anchor(&anchor) {
None
} else {
Some((page_path, md_path, anchor))
}
}
}) })
.collect(); .inspect(|_| anchors_total = anchors_total.saturating_add(1));
if site.config.is_in_check_mode() { // Check for targets existence (including anchors), then keep only the faulty
println!( // entries for error reporting purposes.
"> Checked {} internal link(s) with an anchor: {} error(s) found.", let missing_targets = links_with_anchors.filter(|(_, md_path, anchor)| {
all_links.len(), // There are a few `expect` here since the presence of the .md file will
errors.len() // already have been checked in the markdown rendering
); let mut full_path = site.base_path.clone();
} full_path.push("content");
for part in md_path.split('/') {
full_path.push(part);
}
if md_path.contains("_index.md") {
let section = library
.get_section(&full_path)
.expect("Couldn't find section in check_internal_links_with_anchors");
!section.has_anchor(&anchor)
} else {
let page = library
.get_page(&full_path)
.expect("Couldn't find section in check_internal_links_with_anchors");
!page.has_anchor(&anchor)
}
});
if errors.is_empty() { // Format faulty entries into error messages, and collect them.
return Ok(()); let errors = missing_targets
}
let msg = errors
.into_iter()
.map(|(page_path, md_path, anchor)| { .map(|(page_path, md_path, anchor)| {
format!( format!(
"The anchor in the link `@/{}#{}` in {} does not exist.", "The anchor in the link `@/{}#{}` in {} does not exist.",
@ -89,9 +73,22 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
page_path.to_string_lossy(), page_path.to_string_lossy(),
) )
}) })
.collect::<Vec<_>>() .collect::<Vec<_>>();
.join("\n");
Err(Error { kind: ErrorKind::Msg(msg), source: None }) // Finally emit a summary, and return overall anchors-checking result.
match errors.len() {
0 => {
println!("> Succesfully checked {} internal link(s) with anchors.", anchors_total);
Ok(())
}
errors_total => {
println!(
"> Checked {} internal link(s) with anchors: {} target(s) missing.",
anchors_total, errors_total,
);
Err(Error { kind: ErrorKind::Msg(errors.join("\n")), source: None })
}
}
} }
pub fn check_external_links(site: &Site) -> Result<()> { pub fn check_external_links(site: &Site) -> Result<()> {

View file

@ -3,7 +3,7 @@ use std::collections::HashMap;
use std::hash::BuildHasher; use std::hash::BuildHasher;
use unicode_segmentation::UnicodeSegmentation; use unicode_segmentation::UnicodeSegmentation;
use errors::{bail, Result}; use errors::Result;
/// Get word count and estimated reading time /// Get word count and estimated reading time
pub fn get_reading_analytics(content: &str) -> (usize, usize) { pub fn get_reading_analytics(content: &str) -> (usize, usize) {
@ -14,12 +14,15 @@ pub fn get_reading_analytics(content: &str) -> (usize, usize) {
(word_count, ((word_count + 199) / 200)) (word_count, ((word_count + 199) / 200))
} }
/// Result of a successful resolution of an internal link.
#[derive(Debug, PartialEq, Clone)] #[derive(Debug, PartialEq, Clone)]
pub struct ResolvedInternalLink { pub struct ResolvedInternalLink {
/// Resolved link target, as absolute URL address.
pub permalink: String, pub permalink: String,
// The 2 fields below are only set when there is an anchor /// Internal path to the .md file, without the leading `@/`.
// as we will need that to check if it exists after the markdown rendering is done pub md_path: String,
pub md_path: Option<String>, /// Optional anchor target.
/// We can check whether it exists only after all the markdown rendering is done.
pub anchor: Option<String>, pub anchor: Option<String>,
} }
@ -36,20 +39,17 @@ pub fn resolve_internal_link<S: BuildHasher>(
let parts = clean_link.split('#').collect::<Vec<_>>(); let parts = clean_link.split('#').collect::<Vec<_>>();
// If we have slugification turned off, we might end up with some escaped characters so we need // If we have slugification turned off, we might end up with some escaped characters so we need
// to decode them first // to decode them first
let decoded = &*percent_decode(parts[0].as_bytes()).decode_utf8_lossy(); let decoded = percent_decode(parts[0].as_bytes()).decode_utf8_lossy().to_string();
match permalinks.get(decoded) { let target =
Some(p) => { permalinks.get(&decoded).ok_or_else(|| format!("Relative link {} not found.", link))?;
if parts.len() > 1 { if parts.len() > 1 {
Ok(ResolvedInternalLink { Ok(ResolvedInternalLink {
permalink: format!("{}#{}", p, parts[1]), permalink: format!("{}#{}", target, parts[1]),
md_path: Some(decoded.to_string()), md_path: decoded,
anchor: Some(parts[1].to_string()), anchor: Some(parts[1].to_string()),
}) })
} else { } else {
Ok(ResolvedInternalLink { permalink: p.to_string(), md_path: None, anchor: None }) Ok(ResolvedInternalLink { permalink: target.to_string(), md_path: decoded, anchor: None })
}
}
None => bail!(format!("Relative link {} not found.", link)),
} }
} }
@ -81,7 +81,7 @@ mod tests {
permalinks.insert("pages/about.md".to_string(), "https://vincent.is/about".to_string()); permalinks.insert("pages/about.md".to_string(), "https://vincent.is/about".to_string());
let res = resolve_internal_link("@/pages/about.md#hello", &permalinks).unwrap(); let res = resolve_internal_link("@/pages/about.md#hello", &permalinks).unwrap();
assert_eq!(res.permalink, "https://vincent.is/about#hello"); assert_eq!(res.permalink, "https://vincent.is/about#hello");
assert_eq!(res.md_path, Some("pages/about.md".to_string())); assert_eq!(res.md_path, "pages/about.md".to_string());
assert_eq!(res.anchor, Some("hello".to_string())); assert_eq!(res.anchor, Some("hello".to_string()));
} }
@ -94,7 +94,7 @@ mod tests {
); );
let res = resolve_internal_link("@/pages/about%20space.md#hello", &permalinks).unwrap(); let res = resolve_internal_link("@/pages/about%20space.md#hello", &permalinks).unwrap();
assert_eq!(res.permalink, "https://vincent.is/about%20space/#hello"); assert_eq!(res.permalink, "https://vincent.is/about%20space/#hello");
assert_eq!(res.md_path, Some("pages/about space.md".to_string())); assert_eq!(res.md_path, "pages/about space.md".to_string());
assert_eq!(res.anchor, Some("hello".to_string())); assert_eq!(res.anchor, Some("hello".to_string()));
} }