Skip anchor checking for URL with prefix in config (#812)

* cargo fmt & clippy

* Skip anchor checking for URL with prefix in config
This commit is contained in:
Tjeu Kayim 2019-10-14 18:31:03 +02:00 committed by Vincent Prouillet
parent 4aa2ba84fc
commit 6149fd17e1
16 changed files with 133 additions and 28 deletions

1
Cargo.lock generated
View file

@ -1241,6 +1241,7 @@ dependencies = [
name = "link_checker"
version = "0.1.0"
dependencies = [
"config 0.1.0",
"errors 0.1.0",
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"reqwest 0.9.21 (registry+https://github.com/rust-lang/crates.io-index)",

View file

@ -7,8 +7,8 @@ use syntect::parsing::{SyntaxSet, SyntaxSetBuilder};
use toml;
use toml::Value as Toml;
use errors::Result;
use errors::Error;
use errors::Result;
use highlighting::THEME_SET;
use theme::Theme;
use utils::fs::read_file_with_error;
@ -86,7 +86,20 @@ impl Default for Taxonomy {
}
}
type TranslateTerm = HashMap<String, String>;
type TranslateTerm = HashMap<String, String>;
/// Settings for the link checker, read from the `[link_checker]` table of the site config.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
// Any field missing from the config falls back to the value from the `Default` impl.
#[serde(default)]
pub struct LinkChecker {
/// URL prefixes for which anchor checking is skipped: a link starting with
/// one of these prefixes is still requested, but its `#anchor` is not looked up
/// in the returned page.
pub skip_anchor_prefixes: Vec<String>,
}
impl Default for LinkChecker {
fn default() -> LinkChecker {
LinkChecker { skip_anchor_prefixes: Vec::new() }
}
}
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(default)]
@ -152,6 +165,8 @@ pub struct Config {
#[serde(skip_serializing, skip_deserializing)] // not a typo, both are needed
pub extra_syntax_set: Option<SyntaxSet>,
pub link_checker: LinkChecker,
/// All user params set in [extra] in the config
pub extra: HashMap<String, Toml>,
@ -317,9 +332,16 @@ impl Config {
Error::msg(format!("Translation for language '{}' is missing", lang.as_ref()))
})?;
terms.get(key.as_ref()).ok_or_else(|| {
Error::msg(format!("Translation key '{}' for language '{}' is missing", key.as_ref(), lang.as_ref()))
}).map(|term| term.to_string())
terms
.get(key.as_ref())
.ok_or_else(|| {
Error::msg(format!(
"Translation key '{}' for language '{}' is missing",
key.as_ref(),
lang.as_ref()
))
})
.map(|term| term.to_string())
}
}
@ -346,6 +368,7 @@ impl Default for Config {
translations: HashMap::new(),
extra_syntaxes: Vec::new(),
extra_syntax_set: None,
link_checker: LinkChecker::default(),
extra: HashMap::new(),
build_timestamp: Some(1),
}
@ -551,4 +574,25 @@ ignored_content = ["*.{graphml,iso}", "*.py?"]
assert!(g.is_match("foo.py3"));
assert!(!g.is_match("foo.py"));
}
// The `skip_anchor_prefixes` array of the `[link_checker]` table must end up,
// in order, in `Config::link_checker`.
#[test]
fn link_checker_skip_anchor_prefixes() {
    let raw_config = r#"
title = "My site"
base_url = "example.com"
[link_checker]
skip_anchor_prefixes = [
"https://caniuse.com/#feat=",
"https://github.com/rust-lang/rust/blob/",
]
"#;
    let parsed = Config::parse(raw_config).unwrap();
    let expected = vec!["https://caniuse.com/#feat=", "https://github.com/rust-lang/rust/blob/"];
    assert_eq!(parsed.link_checker.skip_anchor_prefixes, expected);
}
}

View file

@ -14,7 +14,7 @@ extern crate utils;
mod config;
pub mod highlighting;
mod theme;
pub use config::{Config, Language, Taxonomy};
pub use config::{Config, Language, LinkChecker, Taxonomy};
use std::path::Path;

View file

@ -272,7 +272,7 @@ impl ImageOp {
} else {
img
}
},
}
Fill(w, h) => {
let factor_w = img_w as f32 / w as f32;
let factor_h = img_h as f32 / h as f32;

View file

@ -1,7 +1,7 @@
use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use slotmap::{DenseSlotMap, DefaultKey};
use slotmap::{DefaultKey, DenseSlotMap};
use front_matter::SortBy;

View file

@ -21,7 +21,9 @@ pub fn sort_actual_pages_by_date(a: &&Page, b: &&Page) -> Ordering {
/// Takes a list of (page key, date, permalink) and sorts them by date if possible
/// Pages without date will be put in the unsortable bucket
/// The permalink is used to break ties
pub fn sort_pages_by_date(pages: Vec<(&DefaultKey, Option<NaiveDateTime>, &str)>) -> (Vec<DefaultKey>, Vec<DefaultKey>) {
pub fn sort_pages_by_date(
pages: Vec<(&DefaultKey, Option<NaiveDateTime>, &str)>,
) -> (Vec<DefaultKey>, Vec<DefaultKey>) {
let (mut can_be_sorted, cannot_be_sorted): (Vec<_>, Vec<_>) =
pages.into_par_iter().partition(|page| page.1.is_some());
@ -40,7 +42,9 @@ pub fn sort_pages_by_date(pages: Vec<(&DefaultKey, Option<NaiveDateTime>, &str)>
/// Takes a list of (page key, weight, permalink) and sorts them by weight if possible
/// Pages without weight will be put in the unsortable bucket
/// The permalink is used to break ties
pub fn sort_pages_by_weight(pages: Vec<(&DefaultKey, Option<usize>, &str)>) -> (Vec<DefaultKey>, Vec<DefaultKey>) {
pub fn sort_pages_by_weight(
pages: Vec<(&DefaultKey, Option<usize>, &str)>,
) -> (Vec<DefaultKey>, Vec<DefaultKey>) {
let (mut can_be_sorted, cannot_be_sorted): (Vec<_>, Vec<_>) =
pages.into_par_iter().partition(|page| page.1.is_some());
@ -57,7 +61,9 @@ pub fn sort_pages_by_weight(pages: Vec<(&DefaultKey, Option<usize>, &str)>) -> (
}
/// Find the lighter/heavier and earlier/later pages for all pages having a date/weight
pub fn find_siblings(sorted: &[DefaultKey]) -> Vec<(DefaultKey, Option<DefaultKey>, Option<DefaultKey>)> {
pub fn find_siblings(
sorted: &[DefaultKey],
) -> Vec<(DefaultKey, Option<DefaultKey>, Option<DefaultKey>)> {
let mut res = Vec::with_capacity(sorted.len());
let length = sorted.len();

View file

@ -7,4 +7,5 @@ authors = ["Vincent Prouillet <prouillet.vincent@gmail.com>"]
reqwest = "0.9"
lazy_static = "1"
config = { path = "../config" }
errors = { path = "../errors" }

View file

@ -2,11 +2,13 @@ extern crate reqwest;
#[macro_use]
extern crate lazy_static;
extern crate config;
extern crate errors;
use reqwest::header::{HeaderMap, ACCEPT};
use reqwest::StatusCode;
use config::LinkChecker;
use errors::Result;
use std::collections::HashMap;
@ -51,7 +53,7 @@ lazy_static! {
static ref LINKS: Arc<RwLock<HashMap<String, LinkResult>>> = Arc::new(RwLock::new(HashMap::new()));
}
pub fn check_url(url: &str) -> LinkResult {
pub fn check_url(url: &str, config: &LinkChecker) -> LinkResult {
{
let guard = LINKS.read().unwrap();
if let Some(res) = guard.get(url) {
@ -65,9 +67,11 @@ pub fn check_url(url: &str) -> LinkResult {
let client = reqwest::Client::new();
let check_anchor = !config.skip_anchor_prefixes.iter().any(|prefix| url.starts_with(prefix));
// Need to actually do the link checking
let res = match client.get(url).headers(headers).send() {
Ok(ref mut response) if has_anchor(url) => {
Ok(ref mut response) if check_anchor && has_anchor(url) => {
match check_page_for_anchor(url, response.text()) {
Ok(_) => LinkResult { code: Some(response.status()), error: None },
Err(e) => LinkResult { code: None, error: Some(e.to_string()) },
@ -111,21 +115,21 @@ fn check_page_for_anchor(url: &str, body: reqwest::Result<String>) -> Result<()>
#[cfg(test)]
mod tests {
use super::{check_page_for_anchor, check_url, has_anchor, LINKS};
use super::{check_page_for_anchor, check_url, has_anchor, LinkChecker, LINKS};
#[test]
fn can_validate_ok_links() {
let url = "https://google.com";
let res = check_url(url);
let res = check_url(url, &LinkChecker::default());
assert!(res.is_valid());
assert!(LINKS.read().unwrap().get(url).is_some());
let res = check_url(url);
let res = check_url(url, &LinkChecker::default());
assert!(res.is_valid());
}
#[test]
fn can_fail_404_links() {
let res = check_url("https://google.comys");
let res = check_url("https://google.comys", &LinkChecker::default());
assert_eq!(res.is_valid(), false);
assert!(res.code.is_none());
assert!(res.error.is_some());
@ -190,4 +194,23 @@ mod tests {
let res = has_anchor(url);
assert_eq!(res, false);
}
// Checks that anchor validation is skipped only for URLs matching a configured
// prefix. Every assertion goes through `check_url`, which sends a real HTTP
// request, so this test needs network access.
#[test]
fn skip_anchor_prefixes() {
    let config = LinkChecker {
        skip_anchor_prefixes: vec!["https://github.com/rust-lang/rust/blob/".to_owned()],
    };

    // anchor check is ignored because the url matches the prefix
    let permalink = "https://github.com/rust-lang/rust/blob/c772948b687488a087356cb91432425662e034b9/src/librustc_back/target/mod.rs#L194-L214";
    assert!(check_url(permalink, &config).is_valid());

    // other anchors are checked
    let glossary = "https://help.github.com/en/articles/github-glossary#blame";
    assert!(check_url(glossary, &config).is_valid());

    // an anchor that is absent from the page makes the link invalid
    let glossary_invalid =
        "https://help.github.com/en/articles/github-glossary#anchor-does-not-exist";
    assert!(!check_url(glossary_invalid, &config).is_valid());
}
}

View file

@ -335,7 +335,7 @@ fn is_section(path: &str, languages_codes: &[&str]) -> bool {
}
}
return false;
false
}
/// What happens when a section or a page is created/edited

View file

@ -296,8 +296,9 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> Result<Render
let start_idx = heading_ref.start_idx;
let end_idx = heading_ref.end_idx;
let title = get_text(&events[start_idx + 1..end_idx]);
let id =
heading_ref.id.unwrap_or_else(|| find_anchor(&inserted_anchors, slugify(&title), 0));
let id = heading_ref
.id
.unwrap_or_else(|| find_anchor(&inserted_anchors, slugify(&title), 0));
inserted_anchors.push(id.clone());
// insert `id` to the tag
@ -326,7 +327,8 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> Result<Render
// record heading to make table of contents
let permalink = format!("{}#{}", context.current_page_permalink, id);
let h = Heading { level: heading_ref.level, id, permalink, title, children: Vec::new() };
let h =
Heading { level: heading_ref.level, id, permalink, title, children: Vec::new() };
headings.push(h);
}

View file

@ -399,7 +399,7 @@ impl Site {
all_links
.par_iter()
.filter_map(|(page_path, link)| {
let res = check_url(&link);
let res = check_url(&link, &self.config.link_checker);
if res.is_valid() {
None
} else {

View file

@ -662,3 +662,14 @@ fn can_ignore_markdown_content() {
let (_, _tmp_dir, public) = build_site("test_site");
assert!(!file_exists!(public, "posts/ignored/index.html"));
}
// Builds `test_site`, verifies that its link-checker prefixes were read from
// the config, then loads the site with check mode enabled.
#[test]
fn check_site() {
    let (mut site, _tmp_dir, _public) = build_site("test_site");

    let expected = vec!["https://github.com/rust-lang/rust/blob/"];
    assert_eq!(site.config.link_checker.skip_anchor_prefixes, expected);

    site.config.enable_check_mode();
    site.load().expect("link check test_site");
}

View file

@ -34,9 +34,10 @@ impl TeraFn for Trans {
let lang = optional_arg!(String, args.get("lang"), "`trans`: `lang` must be a string.")
.unwrap_or_else(|| self.config.default_language.clone());
let term = self.config.get_translation(lang, key).map_err(|e| {
Error::chain("Failed to retreive term translation", e)
})?;
let term = self
.config
.get_translation(lang, key)
.map_err(|e| Error::chain("Failed to retreive term translation", e))?;
Ok(to_value(term).unwrap())
}
@ -509,7 +510,6 @@ mod tests {
assert!(static_fn.call(&args).is_err());
}
const TRANS_CONFIG: &str = r#"
base_url = "https://remplace-par-ton-url.fr"
default_language = "fr"

View file

@ -95,8 +95,14 @@ extra_syntaxes = []
#
# [translations.en]
# title = "A title"
#
[translations]
# Configure the link checker
[link_checker]
# Skip anchor checking for external URLs that start with these prefixes
skip_anchor_prefixes = [
"https://caniuse.com/",
]
# You can put any kind of data in there and it
# will be accessible in all templates

View file

@ -13,5 +13,10 @@ extra_syntaxes = ["syntaxes"]
ignored_content = ["*/ignored.md"]
[link_checker]
skip_anchor_prefixes = [
"https://github.com/rust-lang/rust/blob/",
]
[extra.author]
name = "Vincent Prouillet"

View file

@ -5,3 +5,9 @@ date = 2017-01-01
+++
A simple page
<!-- more -->
Link to some rust-lang [source code][permalink].
[permalink]: https://github.com/rust-lang/rust/blob/c772948b687488a087356cb91432425662e034b9/src/librustc_back/target/mod.rs#L194-L214