Filter ignored content in page.rs.

* Add ignored_content to the Config structure. * Use the globset crate to parse the glob patterns into a matcher, which is created once at program initialization. If there are no patterns in ignored_content, an empty globber is created, which excludes no files. This is consistent with the existing behaviour of Gutenberg, before this feature was added. * Bail if there are any errors in the glob patterns. * Add a call to the globber in page.rs to actually do the filtering. * Update documentation. A note on the Config structure ------------------------------ * I had to remove the PartialEq derive from the Config structure as it does not work for the GlobSet type. No harm is done; Config does not need to be PartialEq anyway, since there is no need to compare Configs for equality. * The implementation follows the pattern of the existing config settings in that it uses an Option<...>. This would appear unnecessary, in that an empty vec could be used as the default, but it appears to be needed by the TOML parsing. A better approach would be to use a separate SerializableConfig and map to/from a Config struct. This would also allow the elimination of most, if not all, of the other Options in the Config structure, but that ought to be another PR.
2018-02-25 11:42:31 +00:00 · 2018-02-25 11:42:31 +00:00 · 3e1221064b
parent 972687fd76
commit 3e1221064b
8 changed files with 179 additions and 16 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -172,6 +172,7 @@ version = "0.1.0"
 dependencies = [
 "chrono 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "errors 0.1.0",
+ "globset 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "highlighting 0.1.0",
 "serde 1.0.27 (registry+https://github.com/rust-lang/crates.io-index)",
 "serde_derive 1.0.27 (registry+https://github.com/rust-lang/crates.io-index)",
@ -185,6 +186,7 @@ dependencies = [
 "config 0.1.0",
 "errors 0.1.0",
 "front_matter 0.1.0",
+ "globset 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "rayon 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "rendering 0.1.0",
 "serde 1.0.27 (registry+https://github.com/rust-lang/crates.io-index)",
@ -364,6 +366,18 @@ name = "glob"
 version = "0.2.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"

+[[package]]
+name = "globset"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "aho-corasick 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)",
+ "fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
+ "log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "regex 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
 [[package]]
 name = "gutenberg"
 version = "0.3.1"
@ -1546,6 +1560,7 @@ dependencies = [
 "checksum gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5e33ec290da0d127825013597dbdfc28bee4964690c7ce1166cbc2a7bd08b1bb"
 "checksum getopts 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)" = "b900c08c1939860ce8b54dc6a89e26e00c04c380fd0e09796799bd7f12861e05"
 "checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
+"checksum globset 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1e96ab92362c06811385ae9a34d2698e8a1160745e0c78fbb434a44c8de3fabc"
 "checksum httparse 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "c2f407128745b78abc95c0ffbe4e5d37427fdc0d45470710cfef8c44522a2e37"
 "checksum humansize 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b6cab2627acfc432780848602f3f558f7e9dd427352224b0d9324025796d2a5e"
 "checksum hyper 0.10.13 (registry+https://github.com/rust-lang/crates.io-index)" = "368cb56b2740ebf4230520e2b90ebb0461e69034d85d1945febd9b3971426db2"
--- a/components/config/Cargo.toml
+++ b/components/config/Cargo.toml
@ -8,6 +8,7 @@ toml = "0.4"
 serde = "1"
 serde_derive = "1"
 chrono = "0.4"
+globset = "0.3.0"

 errors = { path = "../errors" }
 highlighting = { path = "../highlighting"}
--- a/components/config/src/lib.rs
+++ b/components/config/src/lib.rs
@ -5,6 +5,7 @@ extern crate toml;
 extern crate errors;
 extern crate highlighting;
 extern crate chrono;
+extern crate globset;

 use std::collections::HashMap;
 use std::fs::File;
@ -13,6 +14,7 @@ use std::path::{Path, PathBuf};

 use toml::{Value as Toml};
 use chrono::Utc;
+use globset::{Glob, GlobSet, GlobSetBuilder};

 use errors::{Result, ResultExt};
 use highlighting::THEME_SET;
@ -22,7 +24,7 @@ mod theme;

 use theme::Theme;

-#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct Config {
    /// Base URL of the site, the only required config argument
    pub base_url: String,
@ -49,6 +51,12 @@ pub struct Config {
    pub generate_categories_pages: Option<bool>,
    /// Whether to compile the `sass` directory and output the css files into the static folder
    pub compile_sass: Option<bool>,
+    /// A list of file glob patterns to ignore when processing the content folder. Defaults to none.
+    /// Had to remove the PartialEq derive because GlobSet does not implement it. No impact
+    /// because it's unused anyway (who wants to compare Configs for equality?).
+    pub ignored_content: Option<Vec<String>>,
+    #[serde(skip_serializing, skip_deserializing)]
+    pub ignored_content_globber: Option<GlobSet>,

    /// Languages list and translated strings
    pub translations: Option<HashMap<String, Toml>>,
@ -84,6 +92,7 @@ impl Config {
        set_default!(config.generate_tags_pages, false);
        set_default!(config.generate_categories_pages, false);
        set_default!(config.compile_sass, false);
+        set_default!(config.ignored_content, Vec::new());
        set_default!(config.translations, HashMap::new());
        set_default!(config.extra, HashMap::new());

@ -97,6 +106,27 @@ impl Config {
        };

        config.build_timestamp = Some(Utc::now().timestamp());
+
+        // Convert the file glob strings into a compiled glob set matcher. We want to do this once,
+        // at program initialization, rather than for every page, for example. We arrange for the
+        // globset matcher to always exist (even though it has to be inside an Option at the
+        // moment because of the TOML serializer); if the glob set is empty the `is_match` function
+        // of the globber always returns false.
+        let mut glob_set_builder = GlobSetBuilder::new();
+
+        if let Some(ref v) = config.ignored_content {
+            if v.len() > 0 {
+                for pat in v {
+                    let glob = match Glob::new(pat) {
+                        Ok(g) => g,
+                        Err(e) => bail!("Invalid ignored_content glob pattern: {}, error = {}", pat, e)
+                    };
+                    glob_set_builder.add(glob);
+                }
+            }
+        }
+        config.ignored_content_globber = Some(glob_set_builder.build().expect("Bad ignored_content in config file."));
+
        Ok(config)
    }

@ -176,6 +206,8 @@ impl Default for Config {
            generate_tags_pages: Some(true),
            generate_categories_pages: Some(true),
            compile_sass: Some(false),
+            ignored_content: Some(Vec::new()),
+            ignored_content_globber: Some(GlobSetBuilder::new().build().unwrap()),
            translations: None,
            extra: None,
            build_timestamp: Some(1),
@ -330,4 +362,51 @@ title = "A title"
        assert_eq!(translations["en"]["title"].as_str().unwrap(), "A title");
    }

+    #[test]
+    fn missing_ignored_content_results_in_empty_vector_and_empty_globber() {
+        let config_str = r#"
+title = "My site"
+base_url = "example.com"
+        "#;
+
+        let config = Config::parse(config_str).unwrap();
+        let v = config.ignored_content.unwrap();
+        assert_eq!(v.len(), 0);
+        assert!(config.ignored_content_globber.unwrap().is_empty());
+    }
+
+    #[test]
+    fn empty_ignored_content_results_in_empty_vector_and_empty_globber() {
+        let config_str = r#"
+title = "My site"
+base_url = "example.com"
+ignored_content = []
+        "#;
+
+        let config = Config::parse(config_str).unwrap();
+        assert_eq!(config.ignored_content.unwrap().len(), 0);
+        assert!(config.ignored_content_globber.unwrap().is_empty());
+    }
+
+    #[test]
+    fn non_empty_ignored_content_results_in_vector_of_patterns_and_configured_globber() {
+        let config_str = r#"
+title = "My site"
+base_url = "example.com"
+ignored_content = ["*.{graphml,iso}", "*.py?"]
+        "#;
+
+        let config = Config::parse(config_str).unwrap();
+        let v = config.ignored_content.unwrap();
+        assert_eq!(v, vec!["*.{graphml,iso}", "*.py?"]);
+
+        let g = config.ignored_content_globber.unwrap();
+        assert_eq!(g.len(), 2);
+        assert!(g.is_match("foo.graphml"));
+        assert!(g.is_match("foo.iso"));
+        assert!(!g.is_match("foo.png"));
+        assert!(g.is_match("foo.py2"));
+        assert!(g.is_match("foo.py3"));
+        assert!(!g.is_match("foo.py"));
+    }
 }
--- a/components/content/Cargo.toml
+++ b/components/content/Cargo.toml
@ -8,6 +8,7 @@ tera = "0.11"
 serde = "1"
 slug = "0.1"
 rayon = "1"
+globset = "0.3.0"

 errors = { path = "../errors" }
 config = { path = "../config" }
--- a/components/content/src/lib.rs
+++ b/components/content/src/lib.rs
@ -13,6 +13,8 @@ extern crate utils;
 extern crate tempdir;
 #[cfg(test)]
 extern crate toml;
+#[cfg(test)]
+extern crate globset;

 mod file_info;
 mod page;
--- a/components/content/src/page.rs
+++ b/components/content/src/page.rs
@ -128,10 +128,27 @@ impl Page {
        let path = path.as_ref();
        let content = read_file(path)?;
        let mut page = Page::parse(path, &content, config)?;
-        page.assets = vec![];

        if page.file.name == "index" {
-            page.assets = find_related_assets(path.parent().unwrap());
+            // `find_related_assets` only scans the immediate directory (it is not recursive) so our
+            // filtering only needs to match against the file_name component, not the full path. If
+            // `find_related_assets` were changed to also return files in subdirectories, we could
+            // use `Path::strip_prefix` to remove the parent directory and then glob-filter
+            // against the remaining path. Note that the current behaviour effectively means that
+            // the `ignored_content` setting in the config file is limited to single-file glob
+            // patterns (no "**" patterns).
+            let globber = config.ignored_content_globber.as_ref().unwrap();
+            let parent_dir = path.parent().unwrap();
+            page.assets = find_related_assets(parent_dir).into_iter()
+                .filter(|path|
+                    match path.file_name() {
+                        None => true,
+                        Some(file) => !globber.is_match(file)
+                    }
+                ).collect();
+
+        } else {
+            page.assets = vec![];
        }

        Ok(page)
@ -240,6 +257,7 @@ mod tests {

    use tera::Tera;
    use tempdir::TempDir;
+    use globset::{Glob, GlobSetBuilder};

    use config::Config;
    use super::Page;
@ -419,4 +437,34 @@ Hello world
        assert_eq!(page.assets.len(), 3);
        assert_eq!(page.permalink, "http://a-website.com/posts/hey/");
    }
+
+    #[test]
+    fn page_with_ignored_assets_filters_out_correct_files() {
+        let tmp_dir = TempDir::new("example").expect("create temp dir");
+        let path = tmp_dir.path();
+        create_dir(&path.join("content")).expect("create content temp dir");
+        create_dir(&path.join("content").join("posts")).expect("create posts temp dir");
+        let nested_path = path.join("content").join("posts").join("with-assets");
+        create_dir(&nested_path).expect("create nested temp dir");
+        let mut f = File::create(nested_path.join("index.md")).unwrap();
+        f.write_all(b"+++\nslug=\"hey\"\n+++\n").unwrap();
+        File::create(nested_path.join("example.js")).unwrap();
+        File::create(nested_path.join("graph.jpg")).unwrap();
+        File::create(nested_path.join("fail.png")).unwrap();
+
+        let mut gsb = GlobSetBuilder::new();
+        gsb.add(Glob::new("*.{js,png}").unwrap());
+        let mut config = Config::default();
+        config.ignored_content_globber = Some(gsb.build().unwrap());
+
+        let res = Page::from_file(
+            nested_path.join("index.md").as_path(),
+            &config
+        );
+
+        assert!(res.is_ok());
+        let page = res.unwrap();
+        assert_eq!(page.assets.len(), 1);
+        assert_eq!(page.assets[0].file_name().unwrap().to_str(), Some("graph.jpg"));
+    }
 }
--- a/docs/content/documentation/content/overview.md
+++ b/docs/content/documentation/content/overview.md
@ -52,3 +52,14 @@ Those assets will be copied in the same folder when building the site which allo
 ```

 By default, this page will get the folder name (`with-assets` in this case) as its slug.
+
+It is possible to ignore selected asset files using the
+[ignored_content](./documentation/getting-started/configuration.md) setting in the config file.
+For example, say you have an Excel spreadsheet from which you are taking several screenshots and
+then linking to those image files on your website. For maintainability purposes, you want to keep
+the spreadsheet in the same folder as the markdown, but you don't want to copy the spreadsheet to
+the public website. You can achieve this by simply setting `ignored_content` in the config file:
+
+```
+ignored_content = ["*.xlsx"]
+```
--- a/docs/content/documentation/getting-started/configuration.md
+++ b/docs/content/documentation/getting-started/configuration.md
@ -51,6 +51,12 @@ generate_categories_pages = false
 # Whether to compile the Sass files found in the `sass` directory
 compile_sass = false

+# A list of glob patterns specifying asset files to ignore when
+# copying content. Defaults to none, which means all asset files
+# are copied over to the public folder. Example:
+#     ignored_content = ["*.{graphml,xlsx}", "temp.*"]
+ignored_content = []
+
 # Optional translation object. The key if present should be a language code
 [translations]