From ceb9bc8ed7c04e654794d2d30880d544b73b9c27 Mon Sep 17 00:00:00 2001 From: Vincent Prouillet Date: Sat, 21 Dec 2019 10:44:13 +0100 Subject: [PATCH] Optionally do not slugify paths (#875) * maybe_slugify() only does simple sanitation if config.slugify is false * slugify is disabled by default, turn on for backwards-compatibility * First docs changes for optional slugification * Remove # from slugs but not & * Add/fix tests for utf8 slugs * Fix test sites for i18n slugs * fix templates tests for i18n slugs * Rename slugify setting to slugify_paths * Default slugify_paths * Update documentation for slugify_paths * quasi_slugify removes ?, /, # and newlines * Remove forbidden NTFS chars in quasi_slugify() * Slugification forbidden chars can be configured * Remove trailing dot/space in quasi_slugify * Fix NTFS path sanitation * Revert configurable slugification charset * Remove \r for windows newlines and \t tabulations in quasi_slugify() * Update docs for output paths * Replace slugify with slugify_paths * Fix test * Default to not slugifying * Move slugs utils to utils crate * Use slugify_paths for anchors as well --- .gitignore | 2 + CHANGELOG.md | 2 + Cargo.lock | 16 +- components/config/src/config.rs | 3 + components/library/Cargo.toml | 1 - components/library/src/content/page.rs | 48 +++- components/library/src/lib.rs | 1 - components/library/src/taxonomies/mod.rs | 251 +++++++++++++++++- components/rendering/Cargo.toml | 1 - components/rendering/src/lib.rs | 1 - components/rendering/src/markdown.rs | 4 +- components/rendering/tests/markdown.rs | 11 + components/templates/src/global_fns/mod.rs | 6 +- components/utils/Cargo.toml | 1 + components/utils/src/lib.rs | 2 + components/utils/src/slugs.rs | 107 ++++++++ docs/content/documentation/content/linking.md | 8 +- docs/content/documentation/content/page.md | 43 +++ .../documentation/content/taxonomies.md | 41 ++- .../getting-started/configuration.md | 4 + test_site/config.toml | 1 + 21 files changed, 515 
insertions(+), 39 deletions(-) create mode 100644 components/utils/src/slugs.rs diff --git a/.gitignore b/.gitignore index 423dde2a..b60977ca 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,5 @@ stage # nixos dependencies snippet shell.nix +# vim temporary files +**/.*.sw* diff --git a/CHANGELOG.md b/CHANGELOG.md index 64fe14cc..57e619f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ ### Breaking - Remove `toc` variable in section/page context and pass it to `page.toc` and `section.toc` instead so they are accessible everywhere +- [Slugification](https://en.wikipedia.org/wiki/Slug_(web_publishing)#Slug) of page paths is now optional. By default, every path is still slugified, exactly as before. +To keep non-ASCII characters, set `slugify_paths = false` in your config. ### Other - Add zenburn syntax highlighting theme diff --git a/Cargo.lock b/Cargo.lock index 0aa7d60f..29db8b7b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -344,10 +344,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "bincode" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.103 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -1141,7 +1140,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)", "gif 0.10.3 (registry+https://github.com/rust-lang/crates.io-index)", - "jpeg-decoder 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)", + "jpeg-decoder 0.1.18 (registry+https://github.com/rust-lang/crates.io-index)", "num-iter 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)", "num-rational 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", "num-traits
0.2.10 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1223,7 +1222,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "jpeg-decoder" -version = "0.1.16" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1275,7 +1274,6 @@ dependencies = [ "serde 1.0.103 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.103 (registry+https://github.com/rust-lang/crates.io-index)", "slotmap 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - "slug 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", "tempfile 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "tera 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "toml 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", @@ -2099,7 +2097,6 @@ dependencies = [ "regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.103 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.103 (registry+https://github.com/rust-lang/crates.io-index)", - "slug 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", "syntect 3.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "templates 0.1.0", "tera 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -2520,7 +2517,7 @@ name = "syntect" version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "bincode 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "bincode 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "flate2 1.0.13 (registry+https://github.com/rust-lang/crates.io-index)", "fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)", @@ -3009,6 +3006,7 @@ version = "0.1.0" dependencies = [ 
"errors 0.1.0", "serde 1.0.103 (registry+https://github.com/rust-lang/crates.io-index)", + "slug 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", "tempfile 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "tera 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "toml 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", @@ -3259,7 +3257,7 @@ dependencies = [ "checksum backtrace-sys 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "5d6575f128516de27e3ce99689419835fce9643a9b215a14d2b5b685be018491" "checksum base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0b25d992356d2eb0ed82172f5248873db5560c4721f564b13cb5193bda5e668e" "checksum base64 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b41b7ea54a0c9d92199de89e20e58d49f02f8e699814ef3fdf266f6f748d15c7" -"checksum bincode 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b8ab639324e3ee8774d296864fbc0dbbb256cf1a41c490b94cba90c082915f92" +"checksum bincode 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "5753e2a71534719bf3f4e57006c3a4f0d2c672a4b676eec84161f763eca87dbf" "checksum bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" "checksum block-buffer 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)" = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b" "checksum block-padding 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5" @@ -3351,7 +3349,7 @@ dependencies = [ "checksum iovec 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" "checksum ipconfig 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "aa79fa216fbe60834a9c0737d7fcd30425b32d1c58854663e24d4c4b328ed83f" "checksum itoa 0.4.4 
(registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f" -"checksum jpeg-decoder 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)" = "c1aae18ffeeae409c6622c3b6a7ee49792a7e5a062eea1b135fbb74e301792ba" +"checksum jpeg-decoder 0.1.18 (registry+https://github.com/rust-lang/crates.io-index)" = "0256f0aec7352539102a9efbcb75543227b7ab1117e0f95450023af730128451" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" "checksum language-tags 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a91d884b6667cd606bb5a69aa0c99ba811a115fc68915e7056ec08a46e93199a" "checksum lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" diff --git a/components/config/src/config.rs b/components/config/src/config.rs index 5f92e67a..c1dbe289 100644 --- a/components/config/src/config.rs +++ b/components/config/src/config.rs @@ -130,6 +130,8 @@ pub struct Config { /// key into different language. translations: HashMap, + /// Whether to slugify page and taxonomy URLs (disable for UTF-8 URLs) + pub slugify_paths: bool, /// Whether to highlight all code blocks found in markdown files. Defaults to false pub highlight_code: bool, /// Which themes to use for code highlighting. 
See Readme for supported themes @@ -354,6 +356,7 @@ impl Default for Config { title: None, description: None, theme: None, + slugify_paths: true, highlight_code: false, highlight_theme: "base16-ocean-dark".to_string(), default_language: "en".to_string(), diff --git a/components/library/Cargo.toml b/components/library/Cargo.toml index cc26d5cc..b9788dd3 100644 --- a/components/library/Cargo.toml +++ b/components/library/Cargo.toml @@ -10,7 +10,6 @@ chrono = { version = "0.4", features = ["serde"] } tera = "1" serde = "1" serde_derive = "1" -slug = "0.1" regex = "1" lazy_static = "1" diff --git a/components/library/src/content/page.rs b/components/library/src/content/page.rs index cb1a0c4e..4f3af1ac 100644 --- a/components/library/src/content/page.rs +++ b/components/library/src/content/page.rs @@ -4,7 +4,6 @@ use std::path::{Path, PathBuf}; use regex::Regex; use slotmap::DefaultKey; -use slug::slugify; use tera::{Context as TeraContext, Tera}; use config::Config; @@ -19,6 +18,7 @@ use utils::templates::render_template; use content::file_info::FileInfo; use content::has_anchor; use content::ser::SerializingPage; +use utils::slugs::maybe_slugify_paths; lazy_static! 
{ // Based on https://regex101.com/r/H2n38Z/1/tests @@ -160,21 +160,21 @@ impl Page { page.slug = { if let Some(ref slug) = page.meta.slug { - slugify(&slug.trim()) + maybe_slugify_paths(&slug.trim(), config.slugify_paths) } else if page.file.name == "index" { if let Some(parent) = page.file.path.parent() { if let Some(slug) = slug_from_dated_filename { - slugify(&slug) + maybe_slugify_paths(&slug, config.slugify_paths) } else { - slugify(parent.file_name().unwrap().to_str().unwrap()) + maybe_slugify_paths(parent.file_name().unwrap().to_str().unwrap(), config.slugify_paths) } } else { - slugify(&page.file.name) + maybe_slugify_paths(&page.file.name, config.slugify_paths) } } else if let Some(slug) = slug_from_dated_filename { - slugify(&slug) + maybe_slugify_paths(&slug, config.slugify_paths) } else { - slugify(&page.file.name) + maybe_slugify_paths(&page.file.name, config.slugify_paths) } }; @@ -443,7 +443,8 @@ Hello world"#; slug = "hello-&-world" +++ Hello world"#; - let config = Config::default(); + let mut config = Config::default(); + config.slugify_paths = true; let res = Page::parse(Path::new("start.md"), content, &config, &PathBuf::new()); assert!(res.is_ok()); let page = res.unwrap(); @@ -452,6 +453,23 @@ Hello world"#; assert_eq!(page.permalink, config.make_permalink("hello-world")); } + #[test] + fn can_make_url_from_utf8_slug_frontmatter() { + let content = r#" + +++ + slug = "日本" + +++ + Hello world"#; + let mut config = Config::default(); + config.slugify_paths = false; + let res = Page::parse(Path::new("start.md"), content, &config, &PathBuf::new()); + assert!(res.is_ok()); + let page = res.unwrap(); + assert_eq!(page.path, "日本/"); + assert_eq!(page.components, vec!["日本"]); + assert_eq!(page.permalink, config.make_permalink("日本")); + } + #[test] fn can_make_url_from_path() { let content = r#" @@ -508,7 +526,8 @@ Hello world"#; #[test] fn can_make_slug_from_non_slug_filename() { - let config = Config::default(); + let mut config = Config::default(); 
+ config.slugify_paths = true; let res = Page::parse(Path::new(" file with space.md"), "+++\n+++", &config, &PathBuf::new()); assert!(res.is_ok()); @@ -517,6 +536,17 @@ Hello world"#; assert_eq!(page.permalink, config.make_permalink(&page.slug)); } + #[test] + fn can_make_path_from_utf8_filename() { + let mut config = Config::default(); + config.slugify_paths = false; + let res = Page::parse(Path::new("日本.md"), "+++\n++++", &config, &PathBuf::new()); + assert!(res.is_ok()); + let page = res.unwrap(); + assert_eq!(page.slug, "日本"); + assert_eq!(page.permalink, config.make_permalink(&page.slug)); + } + #[test] fn can_specify_summary() { let config = Config::default(); diff --git a/components/library/src/lib.rs b/components/library/src/lib.rs index 9f851e05..a9b1242c 100644 --- a/components/library/src/lib.rs +++ b/components/library/src/lib.rs @@ -1,5 +1,4 @@ extern crate serde; -extern crate slug; extern crate tera; #[macro_use] extern crate serde_derive; diff --git a/components/library/src/taxonomies/mod.rs b/components/library/src/taxonomies/mod.rs index 8805af4b..43f55e54 100644 --- a/components/library/src/taxonomies/mod.rs +++ b/components/library/src/taxonomies/mod.rs @@ -1,7 +1,6 @@ use std::collections::HashMap; use slotmap::DefaultKey; -use slug::slugify; use tera::{Context, Tera}; use config::{Config, Taxonomy as TaxonomyConfig}; @@ -10,6 +9,7 @@ use utils::templates::render_template; use content::SerializingPage; use library::Library; +use utils::slugs::maybe_slugify_paths; use sorting::sort_pages_by_date; #[derive(Debug, Clone, PartialEq, Serialize)] @@ -69,7 +69,7 @@ impl TaxonomyItem { }) .collect(); let (mut pages, ignored_pages) = sort_pages_by_date(data); - let slug = slugify(name); + let slug = maybe_slugify_paths(name, config.slugify_paths); let permalink = if taxonomy.lang != config.default_language { config.make_permalink(&format!("/{}/{}/{}", taxonomy.lang, taxonomy.name, slug)) } else { @@ -169,7 +169,6 @@ impl Taxonomy { 
self.items.iter().map(|i| SerializedTaxonomyItem::from_item(i, library)).collect(); context.insert("terms", &terms); context.insert("taxonomy", &self.kind); - context.insert("lang", &self.kind.lang); context.insert("current_url", &config.make_permalink(&self.kind.name)); context.insert("current_path", &self.kind.name); @@ -331,6 +330,101 @@ mod tests { assert_eq!(categories.items[1].pages.len(), 1); } + #[test] + fn can_make_slugified_taxonomies() { + let mut config = Config::default(); + let mut library = Library::new(2, 0, false); + + config.taxonomies = vec![ + TaxonomyConfig { + name: "categories".to_string(), + lang: config.default_language.clone(), + ..TaxonomyConfig::default() + }, + TaxonomyConfig { + name: "tags".to_string(), + lang: config.default_language.clone(), + ..TaxonomyConfig::default() + }, + TaxonomyConfig { + name: "authors".to_string(), + lang: config.default_language.clone(), + ..TaxonomyConfig::default() + }, + ]; + + let mut page1 = Page::default(); + let mut taxo_page1 = HashMap::new(); + taxo_page1.insert("tags".to_string(), vec!["rust".to_string(), "db".to_string()]); + taxo_page1.insert("categories".to_string(), vec!["Programming tutorials".to_string()]); + page1.meta.taxonomies = taxo_page1; + page1.lang = config.default_language.clone(); + library.insert_page(page1); + + let mut page2 = Page::default(); + let mut taxo_page2 = HashMap::new(); + taxo_page2.insert("tags".to_string(), vec!["rust".to_string(), "js".to_string()]); + taxo_page2.insert("categories".to_string(), vec!["Other".to_string()]); + page2.meta.taxonomies = taxo_page2; + page2.lang = config.default_language.clone(); + library.insert_page(page2); + + let mut page3 = Page::default(); + let mut taxo_page3 = HashMap::new(); + taxo_page3.insert("tags".to_string(), vec!["js".to_string()]); + taxo_page3.insert("authors".to_string(), vec!["Vincent Prouillet".to_string()]); + page3.meta.taxonomies = taxo_page3; + page3.lang = config.default_language.clone(); + 
library.insert_page(page3); + + let taxonomies = find_taxonomies(&config, &library).unwrap(); + let (tags, categories, authors) = { + let mut t = None; + let mut c = None; + let mut a = None; + for x in taxonomies { + match x.kind.name.as_ref() { + "tags" => t = Some(x), + "categories" => c = Some(x), + "authors" => a = Some(x), + _ => unreachable!(), + } + } + (t.unwrap(), c.unwrap(), a.unwrap()) + }; + assert_eq!(tags.items.len(), 3); + assert_eq!(categories.items.len(), 2); + assert_eq!(authors.items.len(), 1); + + assert_eq!(tags.items[0].name, "db"); + assert_eq!(tags.items[0].slug, "db"); + assert_eq!(tags.items[0].permalink, "http://a-website.com/tags/db/"); + assert_eq!(tags.items[0].pages.len(), 1); + + assert_eq!(tags.items[1].name, "js"); + assert_eq!(tags.items[1].slug, "js"); + assert_eq!(tags.items[1].permalink, "http://a-website.com/tags/js/"); + assert_eq!(tags.items[1].pages.len(), 2); + + assert_eq!(tags.items[2].name, "rust"); + assert_eq!(tags.items[2].slug, "rust"); + assert_eq!(tags.items[2].permalink, "http://a-website.com/tags/rust/"); + assert_eq!(tags.items[2].pages.len(), 2); + + assert_eq!(categories.items[0].name, "Other"); + assert_eq!(categories.items[0].slug, "other"); + assert_eq!(categories.items[0].permalink, "http://a-website.com/categories/other/"); + assert_eq!(categories.items[0].pages.len(), 1); + + assert_eq!(categories.items[1].name, "Programming tutorials"); + assert_eq!(categories.items[1].slug, "programming-tutorials"); + assert_eq!( + categories.items[1].permalink, + "http://a-website.com/categories/programming-tutorials/" + ); + assert_eq!(categories.items[1].pages.len(), 1); + } + #[test] fn errors_on_unknown_taxonomy() { let mut config = Config::default(); @@ -466,4 +560,155 @@ mod tests { ); assert_eq!(categories.items[1].pages.len(), 1); } + + #[test] + fn can_make_utf8_taxonomies() { + let mut config = Config::default(); + config.slugify_paths = false; + config.languages.push(Language { + rss: false, + code: 
"fr".to_string(), + ..Language::default() + }); + let mut library = Library::new(2, 0, true); + + config.taxonomies = vec![TaxonomyConfig { + name: "catégories".to_string(), + lang: "fr".to_string(), + ..TaxonomyConfig::default() + }]; + + let mut page = Page::default(); + page.lang = "fr".to_string(); + let mut taxo_page = HashMap::new(); + taxo_page.insert("catégories".to_string(), vec!["Écologie".to_string()]); + page.meta.taxonomies = taxo_page; + library.insert_page(page); + + let taxonomies = find_taxonomies(&config, &library).unwrap(); + let categories = &taxonomies[0]; + + assert_eq!(categories.items.len(), 1); + assert_eq!(categories.items[0].name, "Écologie"); + assert_eq!( + categories.items[0].permalink, + "http://a-website.com/fr/catégories/Écologie/" + ); + assert_eq!(categories.items[0].pages.len(), 1); + } + + #[test] + fn can_make_slugified_taxonomies_in_multiple_languages() { + let mut config = Config::default(); + config.slugify_paths = true; + config.languages.push(Language { + rss: false, + code: "fr".to_string(), + ..Language::default() + }); + let mut library = Library::new(2, 0, true); + + config.taxonomies = vec![ + TaxonomyConfig { + name: "categories".to_string(), + lang: config.default_language.clone(), + ..TaxonomyConfig::default() + }, + TaxonomyConfig { + name: "tags".to_string(), + lang: config.default_language.clone(), + ..TaxonomyConfig::default() + }, + TaxonomyConfig { + name: "auteurs".to_string(), + lang: "fr".to_string(), + ..TaxonomyConfig::default() + }, + TaxonomyConfig { + name: "tags".to_string(), + lang: "fr".to_string(), + ..TaxonomyConfig::default() + }, + ]; + + let mut page1 = Page::default(); + let mut taxo_page1 = HashMap::new(); + taxo_page1.insert("tags".to_string(), vec!["rust".to_string(), "db".to_string()]); + taxo_page1.insert("categories".to_string(), vec!["Programming tutorials".to_string()]); + page1.meta.taxonomies = taxo_page1; + page1.lang = config.default_language.clone(); + library.insert_page(page1); 
+ + let mut page2 = Page::default(); + let mut taxo_page2 = HashMap::new(); + taxo_page2.insert("tags".to_string(), vec!["rust".to_string()]); + taxo_page2.insert("categories".to_string(), vec!["Other".to_string()]); + page2.meta.taxonomies = taxo_page2; + page2.lang = config.default_language.clone(); + library.insert_page(page2); + + let mut page3 = Page::default(); + page3.lang = "fr".to_string(); + let mut taxo_page3 = HashMap::new(); + taxo_page3.insert("tags".to_string(), vec!["rust".to_string()]); + taxo_page3.insert("auteurs".to_string(), vec!["Vincent Prouillet".to_string()]); + page3.meta.taxonomies = taxo_page3; + library.insert_page(page3); + + let taxonomies = find_taxonomies(&config, &library).unwrap(); + let (tags, categories, authors) = { + let mut t = None; + let mut c = None; + let mut a = None; + for x in taxonomies { + match x.kind.name.as_ref() { + "tags" => { + if x.kind.lang == "en" { + t = Some(x) + } + } + "categories" => c = Some(x), + "auteurs" => a = Some(x), + _ => unreachable!(), + } + } + (t.unwrap(), c.unwrap(), a.unwrap()) + }; + + assert_eq!(tags.items.len(), 2); + assert_eq!(categories.items.len(), 2); + assert_eq!(authors.items.len(), 1); + + assert_eq!(tags.items[0].name, "db"); + assert_eq!(tags.items[0].slug, "db"); + assert_eq!(tags.items[0].permalink, "http://a-website.com/tags/db/"); + assert_eq!(tags.items[0].pages.len(), 1); + + assert_eq!(tags.items[1].name, "rust"); + assert_eq!(tags.items[1].slug, "rust"); + assert_eq!(tags.items[1].permalink, "http://a-website.com/tags/rust/"); + assert_eq!(tags.items[1].pages.len(), 2); + + assert_eq!(authors.items[0].name, "Vincent Prouillet"); + assert_eq!(authors.items[0].slug, "vincent-prouillet"); + assert_eq!( + authors.items[0].permalink, + "http://a-website.com/fr/auteurs/vincent-prouillet/" + ); + assert_eq!(authors.items[0].pages.len(), 1); + + assert_eq!(categories.items[0].name, "Other"); + assert_eq!(categories.items[0].slug, "other"); + 
assert_eq!(categories.items[0].permalink, "http://a-website.com/categories/other/"); + assert_eq!(categories.items[0].pages.len(), 1); + + assert_eq!(categories.items[1].name, "Programming tutorials"); + assert_eq!(categories.items[1].slug, "programming-tutorials"); + assert_eq!( + categories.items[1].permalink, + "http://a-website.com/categories/programming-tutorials/" + ); + assert_eq!(categories.items[1].pages.len(), 1); + } + } diff --git a/components/rendering/Cargo.toml b/components/rendering/Cargo.toml index f9efd4a1..af84f848 100644 --- a/components/rendering/Cargo.toml +++ b/components/rendering/Cargo.toml @@ -7,7 +7,6 @@ authors = ["Vincent Prouillet "] tera = { version = "1", features = ["preserve_order"] } syntect = "=3.2.0" pulldown-cmark = "0.6" -slug = "0.1" serde = "1" serde_derive = "1" pest = "2" diff --git a/components/rendering/src/lib.rs b/components/rendering/src/lib.rs index 542b32ff..8c340431 100644 --- a/components/rendering/src/lib.rs +++ b/components/rendering/src/lib.rs @@ -1,5 +1,4 @@ extern crate pulldown_cmark; -extern crate slug; extern crate syntect; extern crate tera; #[macro_use] diff --git a/components/rendering/src/markdown.rs b/components/rendering/src/markdown.rs index c5fcbe85..89ae825d 100644 --- a/components/rendering/src/markdown.rs +++ b/components/rendering/src/markdown.rs @@ -1,6 +1,5 @@ use pulldown_cmark as cmark; use regex::Regex; -use slug::slugify; use syntect::easy::HighlightLines; use syntect::html::{ start_highlighted_html_snippet, styled_line_to_highlighted_html, IncludeBackground, @@ -13,6 +12,7 @@ use front_matter::InsertAnchor; use table_of_contents::{make_table_of_contents, Heading}; use utils::site::resolve_internal_link; use utils::vec::InsertMany; +use utils::slugs::maybe_slugify_anchors; use self::cmark::{Event, LinkType, Options, Parser, Tag}; @@ -298,7 +298,7 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> ResultHello\n

Hello

\n"); } +#[test] +fn can_add_non_slug_id_to_headings() { + let tera_ctx = Tera::default(); + let permalinks_ctx = HashMap::new(); + let mut config = Config::default(); + config.slugify_paths = false; + let context = RenderContext::new(&tera_ctx, &config, "", &permalinks_ctx, InsertAnchor::None); + let res = render_content(r#"# L'écologie et vous"#, &context).unwrap(); + assert_eq!(res.body, "

L'écologie et vous

\n"); +} + #[test] fn can_handle_manual_ids_on_headings() { let tera_ctx = Tera::default(); diff --git a/components/templates/src/global_fns/mod.rs b/components/templates/src/global_fns/mod.rs index 08e7c70b..50fc1463 100644 --- a/components/templates/src/global_fns/mod.rs +++ b/components/templates/src/global_fns/mod.rs @@ -389,7 +389,8 @@ mod tests { #[test] fn can_get_taxonomy() { - let config = Config::default(); + let mut config = Config::default(); + config.slugify_paths = true; let taxo_config = TaxonomyConfig { name: "tags".to_string(), lang: config.default_language.clone(), @@ -466,7 +467,8 @@ mod tests { #[test] fn can_get_taxonomy_url() { - let config = Config::default(); + let mut config = Config::default(); + config.slugify_paths = true; let taxo_config = TaxonomyConfig { name: "tags".to_string(), lang: config.default_language.clone(), diff --git a/components/utils/Cargo.toml b/components/utils/Cargo.toml index 9f8dd134..072a11ef 100644 --- a/components/utils/Cargo.toml +++ b/components/utils/Cargo.toml @@ -10,6 +10,7 @@ unicode-segmentation = "1.2" walkdir = "2" toml = "0.5" serde = "1" +slug = "0.1" [dev-dependencies] tempfile = "3" diff --git a/components/utils/src/lib.rs b/components/utils/src/lib.rs index 8e462ccf..8dbbe868 100644 --- a/components/utils/src/lib.rs +++ b/components/utils/src/lib.rs @@ -8,6 +8,7 @@ extern crate tera; extern crate toml; extern crate unicode_segmentation; extern crate walkdir; +extern crate slug; pub mod de; pub mod fs; @@ -15,3 +16,4 @@ pub mod net; pub mod site; pub mod templates; pub mod vec; +pub mod slugs; diff --git a/components/utils/src/slugs.rs b/components/utils/src/slugs.rs new file mode 100644 index 00000000..073a9e41 --- /dev/null +++ b/components/utils/src/slugs.rs @@ -0,0 +1,107 @@ +fn strip_chars(s: &str, chars: &str) -> String { + let mut sanitized_string = s.to_string(); + sanitized_string.retain( |c| !chars.contains(c)); + sanitized_string +} + +fn strip_invalid_paths_chars(s: &str) -> String { + // 
NTFS forbidden characters : https://gist.github.com/doctaphred/d01d05291546186941e1b7ddc02034d3 + // Also we need to trim . from the end of filename + let trimmed = s.trim_end_matches(|c| c == ' ' || c == '.'); + let cleaned = trimmed.replace(" ", "_"); + // And () [] since they are not allowed in markdown links + strip_chars(&cleaned, "<>:/|?*#()[]\n\"\\\r\t") +} + +fn strip_invalid_anchors_chars(s: &str) -> String { + // spaces are not valid in markdown links + let cleaned = s.replace(" ", "_"); + // https://tools.ietf.org/html/rfc3986#section-3.5 + strip_chars(&cleaned, "\"#%<>[\\]()^`{|}") +} + +pub fn maybe_slugify_paths(s: &str, slugify: bool) -> String { + if slugify { + // ASCII slugification + slug::slugify(s) + } + else { + // Only remove forbidden characters + strip_invalid_paths_chars(s) + } +} + +pub fn maybe_slugify_anchors(s: &str, slugify: bool) -> String { + if slugify { + // ASCII slugification + slug::slugify(s) + } + else { + // Only remove forbidden characters + strip_invalid_anchors_chars(s) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn strip_invalid_paths_chars_works() { + let tests = vec![ + // no newlines + ("test\ntest", "testtest"), + // no whitespaces + ("test ", "test"), + ("t est ", "t_est"), + // invalid NTFS + ("test .", "test"), + ("test. 
", "test"), + ("test#test/test?test", "testtesttesttest"), + // Invalid CommonMark chars in links + ("test (hey)", "test_hey"), + ("test (hey", "test_hey"), + ("test hey)", "test_hey"), + ("test [hey]", "test_hey"), + ("test [hey", "test_hey"), + ("test hey]", "test_hey"), + // UTF-8 + ("日本", "日本"), + ]; + + for (input, expected) in tests { + assert_eq!(strip_invalid_paths_chars(&input), expected); + } + } + + #[test] + fn strip_invalid_anchors_chars_works() { + let tests = vec![ + ("日本", "日本"), + // Some invalid chars get removed + ("test#", "test"), + ("test<", "test"), + ("test%", "test"), + ("test^", "test"), + ("test{", "test"), + ("test|", "test"), + ("test(", "test"), + // Spaces are replaced by `_` + ("test hey", "test_hey"), + ]; + + for (input, expected) in tests { + assert_eq!(strip_invalid_anchors_chars(&input), expected); + } + } + + #[test] + fn maybe_slugify_paths_enabled() { + assert_eq!(maybe_slugify_paths("héhé", true), "hehe"); + } + + #[test] + fn maybe_slugify_paths_disabled() { + assert_eq!(maybe_slugify_paths("héhé", false), "héhé"); + } +} diff --git a/docs/content/documentation/content/linking.md b/docs/content/documentation/content/linking.md index 793f7087..20bc065c 100644 --- a/docs/content/documentation/content/linking.md +++ b/docs/content/documentation/content/linking.md @@ -4,9 +4,11 @@ weight = 50 +++ ## Heading id and anchor insertion -While rendering the Markdown content, a unique id will automatically be assigned to each heading. This id is created -by converting the heading text to a [slug](https://en.wikipedia.org/wiki/Semantic_URL#Slug), and appending numbers at -the end if the slug already exists for that article. For example: +While rendering the Markdown content, a unique id will automatically be assigned to each heading. +This id is created by converting the heading text to a [slug](https://en.wikipedia.org/wiki/Semantic_URL#Slug) if `slugify_paths` is enabled. 
+If `slugify_paths` is disabled, whitespaces are replaced by `_` and the following characters are stripped: `"`, `#`, `%`, `<`, `>`, `[`, `\`, `]`, `(`, `)`, \`, `^`, `{`, `|`, `}`. +A number is appended at the end if the slug already exists for that article. +For example: ```md # Something exciting! <- something-exciting diff --git a/docs/content/documentation/content/page.md b/docs/content/documentation/content/page.md index bd1b3781..de731841 100644 --- a/docs/content/documentation/content/page.md +++ b/docs/content/documentation/content/page.md @@ -27,6 +27,49 @@ As you can see, creating an `about.md` file is equivalent to creating an the `about` directory allows you to use asset co-location, as discussed in the [overview](@/documentation/content/overview.md#asset-colocation) section. +## Output paths + +For any page within your content folder, its output path will be defined by either: + +- its `slug` frontmatter key +- its filename + +Either way, this proposed path will be sanitized before being used. +If `slugify_paths` is enabled in the site's config - the default - paths are [slugified](https://en.wikipedia.org/wiki/Clean_URL#Slug). +Otherwise, a simpler sanitization is performed, outputting only valid NTFS paths. +The following characters are removed: `<`, `>`, `:`, `/`, `|`, `?`, `*`, `#`, `"`, `\\`, `(`, `)`, `[`, `]` as well as newlines and tabulations. +Additionally, trailing whitespace and dots are removed and whitespaces are replaced by `_`. + +**NOTE:** To produce URLs containing non-English characters (UTF8), `slugify_paths` needs to be set to `false`. + +### Path from frontmatter + +The output path for the page will first be read from the `slug` key in the page's frontmatter. + +**Example:** (file `content/zines/mlf-kurdistan.md`) + +``` ++++ +title = "Le mouvement des Femmes Libres, à la tête de la libération kurde" +slug = "femmes-libres-libération-kurde" ++++ +This is my article.
+``` + +This frontmatter will output the article to `[base_url]/zines/femmes-libres-libération-kurde` with `slugify_paths` disabled, and to `[base_url]/zines/femmes-libres-liberation-kurde` with `slugify_paths` enabled. + +### Path from filename + +When the article's output path is not specified in the frontmatter, it is extracted from the file's path in the content folder. Consider a file `content/foo/bar/thing.md`. The output path is constructed: +- if the filename is `index.md`, its parent folder name (`bar`) is used as output path +- otherwise, the output path is extracted from `thing` (the filename without the `.md` extension) + +If the path found starts with a datetime string (`YYYY-mm-dd` or [an RFC3339 datetime](https://www.ietf.org/rfc/rfc3339.txt)) followed by an underscore (`_`) or a dash (`-`), this date is removed from the output path and will be used as the page date (unless already set in the front-matter). Note that the full RFC3339 datetime contains colons, which are not valid characters in a filename on Windows. + +The output path extracted from the file path is then slugified or not depending on the `slugify_paths` config, as explained previously. + +**Example:** The file `content/blog/2018-10-10-hello-world.md` will generate a page available at `[base_url]/blog/hello-world`. + ## Front matter The TOML front matter is a set of metadata embedded in a file at the beginning of the file enclosed diff --git a/docs/content/documentation/content/taxonomies.md b/docs/content/documentation/content/taxonomies.md index 5bdc0636..e312f169 100644 --- a/docs/content/documentation/content/taxonomies.md +++ b/docs/content/documentation/content/taxonomies.md @@ -5,7 +5,7 @@ weight = 90 Zola has built-in support for taxonomies. -The first step is to define the taxonomies in your [config.toml](@/documentation/getting-started/configuration.md). 
+## Configuration A taxonomy has five variables: @@ -16,21 +16,48 @@ For example the default would be page/1. - `rss`: if set to `true`, an RSS feed will be generated for each term. - `lang`: only set this if you are making a multilingual site and want to indicate which language this taxonomy is for -Once this is done, you can then set taxonomies in your content and Zola will pick -them up: +**Example 1:** (one language) + +```toml +taxonomies = [ {name = "categories", rss = true} ] +``` + +**Example 2:** (multilingual site) + +```toml +taxonomies = [ + {name = "tags", lang = "fr"}, + {name = "tags", lang = "eo"}, + {name = "tags", lang = "en"}, +] +``` + +## Using taxonomies + +Once the configuration is done, you can then set taxonomies in your content and Zola will pick them up: + +**Example:** ```toml +++ -... +title = "Writing a static-site generator in Rust" +date = 2019-08-15 [taxonomies] tags = ["rust", "web"] categories = ["programming"] +++ ``` -The taxonomy pages are available at the following paths: +## Output paths + +In a similar manner to how sections and pages calculate their output path: +- the taxonomy name is never slugified +- the taxonomy entry (e.g. a specific tag) is slugified when `slugify_paths` is enabled in the configuration + +The taxonomy pages are then available at the following paths: ```plain -$BASE_URL/$NAME/ -$BASE_URL/$NAME/$SLUG +$BASE_URL/$NAME/ (taxonomy) +$BASE_URL/$NAME/$SLUG (taxonomy entry) ``` + diff --git a/docs/content/documentation/getting-started/configuration.md b/docs/content/documentation/getting-started/configuration.md index 00309f76..6f5c806e 100644 --- a/docs/content/documentation/getting-started/configuration.md +++ b/docs/content/documentation/getting-started/configuration.md @@ -27,6 +27,10 @@ default_language = "en" # The site theme to use. 
theme = "" +# Slugify paths for compatibility with ASCII-only URLs produced by Zola < 0.9 +# Enabling this setting transliterates or strips non-ASCII (UTF-8) characters in URLs +slugify_paths = false + # When set to "true", all code blocks are highlighted. highlight_code = false diff --git a/test_site/config.toml b/test_site/config.toml index b326b5f9..7d837c8b 100644 --- a/test_site/config.toml +++ b/test_site/config.toml @@ -4,6 +4,7 @@ highlight_code = true compile_sass = true generate_rss = true theme = "sample" +slugify_paths = true taxonomies = [ {name = "categories", rss = true},