rendering: keep track of all internal links (#1424)

This updates the rendered markdown structures to keep track
of all internal links, no longer limiting them to those that target
an explicit anchor fragment.
The goal of this rework is to allow building other features, such
as backlinks, on top of the existing collection of internal links.
This commit is contained in:
Luca Bruno 2021-04-21 19:13:11 +00:00 committed by GitHub
parent 4f7b960985
commit 3346439a32
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 98 additions and 104 deletions

View file

@ -82,12 +82,11 @@ pub struct Page {
pub lang: String,
/// Contains all the translated version of that page
pub translations: Vec<DefaultKey>,
/// Contains the internal links that have an anchor: we can only check the anchor
/// after all pages have been built and their ToC compiled. The page itself should exist otherwise
/// it would have errored before getting there
/// (path to markdown, anchor value)
pub internal_links_with_anchors: Vec<(String, String)>,
/// Contains the external links that need to be checked
/// The list of all internal links (as path to markdown file), with optional anchor fragments.
/// We can only check the anchor after all pages have been built and their ToC compiled.
/// The page itself should exist otherwise it would have errored before getting there.
pub internal_links: Vec<(String, Option<String>)>,
/// The list of all links to external webpages. They can be validated by the `link_checker`.
pub external_links: Vec<String>,
}
@ -268,7 +267,7 @@ impl Page {
self.content = res.body;
self.toc = res.toc;
self.external_links = res.external_links;
self.internal_links_with_anchors = res.internal_links_with_anchors;
self.internal_links = res.internal_links;
Ok(())
}

View file

@ -56,12 +56,11 @@ pub struct Section {
/// The language of that section. Equal to the default lang if the user doesn't setup `languages` in config.
/// Corresponds to the lang in the _index.{lang}.md file scheme
pub lang: String,
/// Contains the internal links that have an anchor: we can only check the anchor
/// after all pages have been built and their ToC compiled. The page itself should exist otherwise
/// it would have errored before getting there
/// (path to markdown, anchor value)
pub internal_links_with_anchors: Vec<(String, String)>,
/// Contains the external links that need to be checked
/// The list of all internal links (as path to markdown file), with optional anchor fragments.
/// We can only check the anchor after all pages have been built and their ToC compiled.
/// The page itself should exist otherwise it would have errored before getting there.
pub internal_links: Vec<(String, Option<String>)>,
/// The list of all links to external webpages. They can be validated by the `link_checker`.
pub external_links: Vec<String>,
}
@ -186,7 +185,7 @@ impl Section {
self.content = res.body;
self.toc = res.toc;
self.external_links = res.external_links;
self.internal_links_with_anchors = res.internal_links_with_anchors;
self.internal_links = res.internal_links;
Ok(())
}

View file

@ -26,7 +26,9 @@ pub struct Rendered {
pub body: String,
pub summary_len: Option<usize>,
pub toc: Vec<Heading>,
pub internal_links_with_anchors: Vec<(String, String)>,
/// Links to site-local pages: relative path plus optional anchor target.
pub internal_links: Vec<(String, Option<String>)>,
/// Outgoing links to external webpages (i.e. HTTP(S) targets).
pub external_links: Vec<String>,
}
@ -93,7 +95,7 @@ fn fix_link(
link_type: LinkType,
link: &str,
context: &RenderContext,
internal_links_with_anchors: &mut Vec<(String, String)>,
internal_links: &mut Vec<(String, Option<String>)>,
external_links: &mut Vec<String>,
) -> Result<String> {
if link_type == LinkType::Email {
@ -107,10 +109,7 @@ fn fix_link(
let result = if link.starts_with("@/") {
match resolve_internal_link(&link, &context.permalinks) {
Ok(resolved) => {
if resolved.anchor.is_some() {
internal_links_with_anchors
.push((resolved.md_path.unwrap(), resolved.anchor.unwrap()));
}
internal_links.push((resolved.md_path, resolved.anchor));
resolved.permalink
}
Err(_) => {
@ -175,7 +174,7 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> Result<Render
let mut inserted_anchors: Vec<String> = vec![];
let mut headings: Vec<Heading> = vec![];
let mut internal_links_with_anchors = Vec::new();
let mut internal_links = Vec::new();
let mut external_links = Vec::new();
let mut opts = Options::empty();
@ -294,7 +293,7 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> Result<Render
link_type,
&link,
context,
&mut internal_links_with_anchors,
&mut internal_links,
&mut external_links,
) {
Ok(fixed_link) => fixed_link,
@ -429,7 +428,7 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> Result<Render
summary_len: if has_summary { html.find(CONTINUE_READING) } else { None },
body: html,
toc: make_table_of_contents(headings),
internal_links_with_anchors,
internal_links,
external_links,
})
}

View file

@ -3,16 +3,21 @@ use rayon::prelude::*;
use crate::Site;
use errors::{Error, ErrorKind, Result};
/// Very similar to check_external_links but can't be merged as far as I can see since we always
/// want to check the internal links but only the external in zola check :/
/// Check whether all internal links pointing to explicit anchor fragments are valid.
///
/// This is very similar to `check_external_links`, although internal links checking
/// is always performed (while external ones only conditionally in `zola check`).
pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
println!("Checking all internal links with anchors.");
let library = site.library.write().expect("Get lock for check_internal_links_with_anchors");
// Chain all internal links, from both sections and pages.
let page_links = library
.pages()
.values()
.map(|p| {
let path = &p.file.path;
p.internal_links_with_anchors.iter().map(move |l| (path.clone(), l))
p.internal_links.iter().map(move |l| (path.clone(), l))
})
.flatten();
let section_links = library
@ -20,67 +25,46 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
.values()
.map(|p| {
let path = &p.file.path;
p.internal_links_with_anchors.iter().map(move |l| (path.clone(), l))
p.internal_links.iter().map(move |l| (path.clone(), l))
})
.flatten();
let all_links = page_links.chain(section_links).collect::<Vec<_>>();
let all_links = page_links.chain(section_links);
if site.config.is_in_check_mode() {
println!("Checking {} internal link(s) with an anchor.", all_links.len());
}
if all_links.is_empty() {
return Ok(());
}
let mut full_path = site.base_path.clone();
full_path.push("content");
let errors: Vec<_> = all_links
.iter()
.filter_map(|(page_path, (md_path, anchor))| {
// There are a few `expect` here since the presence of the .md file will
// already have been checked in the markdown rendering
let mut p = full_path.clone();
for part in md_path.split('/') {
p.push(part);
}
if md_path.contains("_index.md") {
let section = library
.get_section(&p)
.expect("Couldn't find section in check_internal_links_with_anchors");
if section.has_anchor(&anchor) {
None
} else {
Some((page_path, md_path, anchor))
}
} else {
let page = library
.get_page(&p)
.expect("Couldn't find section in check_internal_links_with_anchors");
if page.has_anchor(&anchor) {
None
} else {
Some((page_path, md_path, anchor))
}
}
// Only keep links with anchor fragments, and count them too.
// Bare files have already been checked elsewhere, thus they are not interesting here.
let mut anchors_total = 0usize;
let links_with_anchors = all_links
.filter_map(|(page_path, link)| match link {
(md_path, Some(anchor)) => Some((page_path, md_path, anchor)),
_ => None,
})
.collect();
.inspect(|_| anchors_total = anchors_total.saturating_add(1));
if site.config.is_in_check_mode() {
println!(
"> Checked {} internal link(s) with an anchor: {} error(s) found.",
all_links.len(),
errors.len()
);
}
// Check for targets existence (including anchors), then keep only the faulty
// entries for error reporting purposes.
let missing_targets = links_with_anchors.filter(|(_, md_path, anchor)| {
// There are a few `expect` here since the presence of the .md file will
// already have been checked in the markdown rendering
let mut full_path = site.base_path.clone();
full_path.push("content");
for part in md_path.split('/') {
full_path.push(part);
}
if md_path.contains("_index.md") {
let section = library
.get_section(&full_path)
.expect("Couldn't find section in check_internal_links_with_anchors");
!section.has_anchor(&anchor)
} else {
let page = library
.get_page(&full_path)
.expect("Couldn't find section in check_internal_links_with_anchors");
!page.has_anchor(&anchor)
}
});
if errors.is_empty() {
return Ok(());
}
let msg = errors
.into_iter()
// Format faulty entries into error messages, and collect them.
let errors = missing_targets
.map(|(page_path, md_path, anchor)| {
format!(
"The anchor in the link `@/{}#{}` in {} does not exist.",
@ -89,9 +73,22 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
page_path.to_string_lossy(),
)
})
.collect::<Vec<_>>()
.join("\n");
Err(Error { kind: ErrorKind::Msg(msg), source: None })
.collect::<Vec<_>>();
// Finally emit a summary, and return overall anchors-checking result.
match errors.len() {
0 => {
println!("> Succesfully checked {} internal link(s) with anchors.", anchors_total);
Ok(())
}
errors_total => {
println!(
"> Checked {} internal link(s) with anchors: {} target(s) missing.",
anchors_total, errors_total,
);
Err(Error { kind: ErrorKind::Msg(errors.join("\n")), source: None })
}
}
}
pub fn check_external_links(site: &Site) -> Result<()> {

View file

@ -3,7 +3,7 @@ use std::collections::HashMap;
use std::hash::BuildHasher;
use unicode_segmentation::UnicodeSegmentation;
use errors::{bail, Result};
use errors::Result;
/// Get word count and estimated reading time
pub fn get_reading_analytics(content: &str) -> (usize, usize) {
@ -14,12 +14,15 @@ pub fn get_reading_analytics(content: &str) -> (usize, usize) {
(word_count, ((word_count + 199) / 200))
}
/// Result of a successful resolution of an internal link.
#[derive(Debug, PartialEq, Clone)]
pub struct ResolvedInternalLink {
/// Resolved link target, as absolute URL address.
pub permalink: String,
// The 2 fields below are only set when there is an anchor
// as we will need that to check if it exists after the markdown rendering is done
pub md_path: Option<String>,
/// Internal path to the .md file, without the leading `@/`.
pub md_path: String,
/// Optional anchor target.
/// We can check whether it exists only after all the markdown rendering is done.
pub anchor: Option<String>,
}
@ -36,20 +39,17 @@ pub fn resolve_internal_link<S: BuildHasher>(
let parts = clean_link.split('#').collect::<Vec<_>>();
// If we have slugification turned off, we might end up with some escaped characters so we need
// to decode them first
let decoded = &*percent_decode(parts[0].as_bytes()).decode_utf8_lossy();
match permalinks.get(decoded) {
Some(p) => {
if parts.len() > 1 {
Ok(ResolvedInternalLink {
permalink: format!("{}#{}", p, parts[1]),
md_path: Some(decoded.to_string()),
anchor: Some(parts[1].to_string()),
})
} else {
Ok(ResolvedInternalLink { permalink: p.to_string(), md_path: None, anchor: None })
}
}
None => bail!(format!("Relative link {} not found.", link)),
let decoded = percent_decode(parts[0].as_bytes()).decode_utf8_lossy().to_string();
let target =
permalinks.get(&decoded).ok_or_else(|| format!("Relative link {} not found.", link))?;
if parts.len() > 1 {
Ok(ResolvedInternalLink {
permalink: format!("{}#{}", target, parts[1]),
md_path: decoded,
anchor: Some(parts[1].to_string()),
})
} else {
Ok(ResolvedInternalLink { permalink: target.to_string(), md_path: decoded, anchor: None })
}
}
@ -81,7 +81,7 @@ mod tests {
permalinks.insert("pages/about.md".to_string(), "https://vincent.is/about".to_string());
let res = resolve_internal_link("@/pages/about.md#hello", &permalinks).unwrap();
assert_eq!(res.permalink, "https://vincent.is/about#hello");
assert_eq!(res.md_path, Some("pages/about.md".to_string()));
assert_eq!(res.md_path, "pages/about.md".to_string());
assert_eq!(res.anchor, Some("hello".to_string()));
}
@ -94,7 +94,7 @@ mod tests {
);
let res = resolve_internal_link("@/pages/about%20space.md#hello", &permalinks).unwrap();
assert_eq!(res.permalink, "https://vincent.is/about%20space/#hello");
assert_eq!(res.md_path, Some("pages/about space.md".to_string()));
assert_eq!(res.md_path, "pages/about space.md".to_string());
assert_eq!(res.anchor, Some("hello".to_string()));
}