Filter ignored content in page.rs.

* Add ignored_content to the Config structure. * Use the globset crate to parse the glob patterns into a matcher, which is created once at program initialization. If there are no patterns in ignored_content, an empty globber is created, which excludes no files. This is consistent with the existing behaviour of Gutenberg, before this feature was added. * Bail if there are any errors in the glob patterns. * Add a call to the globber in page.rs to actually do the filtering. * Update documentation. A note on the Config structure ------------------------------ * I had to remove the PartialEq derive from the Config structure as the GlobSet type does not implement it. No harm is done: Config does not need to be PartialEq anyway, since there is no need to compare Configs for equality. * The implementation follows the pattern of the existing config settings in that it uses an Option<...>. This would appear unnecessary, in that an empty vec could be used as the default, but it appears to be needed by the TOML parsing. A better approach would be to use a separate SerializableConfig and map to/from a Config struct. This would also allow the elimination of most, if not all, of the other Options in the Config structure, but that ought to be another PR.
2018-02-25 11:42:31 +00:00 · 2018-02-25 11:42:31 +00:00 · 3e1221064b
parent 972687fd76
commit 3e1221064b
8 changed files with 179 additions and 16 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -172,6 +172,7 @@ version = "0.1.0"
 dependencies = [
 "chrono 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "errors 0.1.0",
+ "globset 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "highlighting 0.1.0",
 "serde 1.0.27 (registry+https://github.com/rust-lang/crates.io-index)",
 "serde_derive 1.0.27 (registry+https://github.com/rust-lang/crates.io-index)",
@ -185,6 +186,7 @@ dependencies = [
 "config 0.1.0",
 "errors 0.1.0",
 "front_matter 0.1.0",
+ "globset 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "rayon 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "rendering 0.1.0",
 "serde 1.0.27 (registry+https://github.com/rust-lang/crates.io-index)",
@ -364,6 +366,18 @@ name = "glob"
 version = "0.2.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"

+[[package]]
+name = "globset"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "aho-corasick 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)",
+ "fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
+ "log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "regex 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
 [[package]]
 name = "gutenberg"
 version = "0.3.1"
@ -1546,6 +1560,7 @@ dependencies = [
 "checksum gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5e33ec290da0d127825013597dbdfc28bee4964690c7ce1166cbc2a7bd08b1bb"
 "checksum getopts 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)" = "b900c08c1939860ce8b54dc6a89e26e00c04c380fd0e09796799bd7f12861e05"
 "checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
+"checksum globset 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1e96ab92362c06811385ae9a34d2698e8a1160745e0c78fbb434a44c8de3fabc"
 "checksum httparse 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "c2f407128745b78abc95c0ffbe4e5d37427fdc0d45470710cfef8c44522a2e37"
 "checksum humansize 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b6cab2627acfc432780848602f3f558f7e9dd427352224b0d9324025796d2a5e"
 "checksum hyper 0.10.13 (registry+https://github.com/rust-lang/crates.io-index)" = "368cb56b2740ebf4230520e2b90ebb0461e69034d85d1945febd9b3971426db2"
--- a/components/config/Cargo.toml
+++ b/components/config/Cargo.toml
@ -8,6 +8,7 @@ toml = "0.4"
 serde = "1"
 serde_derive = "1"
 chrono = "0.4"
+globset = "0.3.0"

 errors = { path = "../errors" }
 highlighting = { path = "../highlighting"}
--- a/components/config/src/lib.rs
+++ b/components/config/src/lib.rs
@ -5,6 +5,7 @@ extern crate toml;
 extern crate errors;
 extern crate highlighting;
 extern crate chrono;
+extern crate globset;

 use std::collections::HashMap;
 use std::fs::File;
@ -13,6 +14,7 @@ use std::path::{Path, PathBuf};

 use toml::{Value as Toml};
 use chrono::Utc;
+use globset::{Glob, GlobSet, GlobSetBuilder};

 use errors::{Result, ResultExt};
 use highlighting::THEME_SET;
@ -22,7 +24,7 @@ mod theme;

 use theme::Theme;

-#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct Config {
    /// Base URL of the site, the only required config argument
    pub base_url: String,
@ -49,6 +51,12 @@ pub struct Config {
    pub generate_categories_pages: Option<bool>,
    /// Whether to compile the `sass` directory and output the css files into the static folder
    pub compile_sass: Option<bool>,
+    /// A list of file glob patterns to ignore when processing the content folder. Defaults to none.
+    /// Note: the PartialEq derive on Config was removed because GlobSet does not implement it.
+    /// No impact, because the derive was unused anyway (nothing compares Configs for equality).
+    pub ignored_content: Option<Vec<String>>,
+    #[serde(skip_serializing, skip_deserializing)]
+    pub ignored_content_globber: Option<GlobSet>,

    /// Languages list and translated strings
    pub translations: Option<HashMap<String, Toml>>,
@ -84,6 +92,7 @@ impl Config {
        set_default!(config.generate_tags_pages, false);
        set_default!(config.generate_categories_pages, false);
        set_default!(config.compile_sass, false);
+        set_default!(config.ignored_content, Vec::new());
        set_default!(config.translations, HashMap::new());
        set_default!(config.extra, HashMap::new());

@ -97,6 +106,27 @@ impl Config {
        };

        config.build_timestamp = Some(Utc::now().timestamp());
+
+        // Convert the file glob strings into a compiled glob set matcher. We want to do this once,
+        // at program initialization, rather than for every page, for example. We arrange for the
+        // globset matcher to always exist (even though it has to be inside an Option at the
+        // moment because of the TOML serializer); if the glob set is empty the `is_match` function
+        // of the globber always returns false.
+        let mut glob_set_builder = GlobSetBuilder::new();
+
+        if let Some(ref v) = config.ignored_content {
+            if v.len() > 0 {
+                for pat in v {
+                    let glob = match Glob::new(pat) {
+                        Ok(g) => g,
+                        Err(e) => bail!("Invalid ignored_content glob pattern: {}, error = {}", pat, e)
+                    };
+                    glob_set_builder.add(glob);
+                }
+            }
+        }
+        config.ignored_content_globber = Some(glob_set_builder.build().expect("Bad ignored_content in config file."));
+
        Ok(config)
    }

@ -176,6 +206,8 @@ impl Default for Config {
            generate_tags_pages: Some(true),
            generate_categories_pages: Some(true),
            compile_sass: Some(false),
+            ignored_content: Some(Vec::new()),
+            ignored_content_globber: Some(GlobSetBuilder::new().build().unwrap()),
            translations: None,
            extra: None,
            build_timestamp: Some(1),
@ -330,4 +362,51 @@ title = "A title"
        assert_eq!(translations["en"]["title"].as_str().unwrap(), "A title");
    }

+    #[test]
+    fn missing_ignored_content_results_in_empty_vector_and_empty_globber() {
+        let config_str = r#"
+title = "My site"
+base_url = "example.com"
+        "#;
+
+        let config = Config::parse(config_str).unwrap();
+        let v = config.ignored_content.unwrap();
+        assert_eq!(v.len(), 0);
+        assert!(config.ignored_content_globber.unwrap().is_empty());
+    }
+
+    #[test]
+    fn empty_ignored_content_results_in_empty_vector_and_empty_globber() {
+        let config_str = r#"
+title = "My site"
+base_url = "example.com"
+ignored_content = []
+        "#;
+
+        let config = Config::parse(config_str).unwrap();
+        assert_eq!(config.ignored_content.unwrap().len(), 0);
+        assert!(config.ignored_content_globber.unwrap().is_empty());
+    }
+
+    #[test]
+    fn non_empty_ignored_content_results_in_vector_of_patterns_and_configured_globber() {
+        let config_str = r#"
+title = "My site"
+base_url = "example.com"
+ignored_content = ["*.{graphml,iso}", "*.py?"]
+        "#;
+
+        let config = Config::parse(config_str).unwrap();
+        let v = config.ignored_content.unwrap();
+        assert_eq!(v, vec!["*.{graphml,iso}", "*.py?"]);
+
+        let g = config.ignored_content_globber.unwrap();
+        assert_eq!(g.len(), 2);
+        assert!(g.is_match("foo.graphml"));
+        assert!(g.is_match("foo.iso"));
+        assert!(!g.is_match("foo.png"));
+        assert!(g.is_match("foo.py2"));
+        assert!(g.is_match("foo.py3"));
+        assert!(!g.is_match("foo.py"));
+    }
 }
--- a/components/content/Cargo.toml
+++ b/components/content/Cargo.toml
@ -8,6 +8,7 @@ tera = "0.11"
 serde = "1"
 slug = "0.1"
 rayon = "1"
+globset = "0.3.0"

 errors = { path = "../errors" }
 config = { path = "../config" }
--- a/components/content/src/lib.rs
+++ b/components/content/src/lib.rs
@ -13,6 +13,8 @@ extern crate utils;
 extern crate tempdir;
 #[cfg(test)]
 extern crate toml;
+#[cfg(test)]
+extern crate globset;

 mod file_info;
 mod page;
--- a/components/content/src/page.rs
+++ b/components/content/src/page.rs
@ -128,10 +128,27 @@ impl Page {
        let path = path.as_ref();
        let content = read_file(path)?;
        let mut page = Page::parse(path, &content, config)?;
-        page.assets = vec![];

        if page.file.name == "index" {
-            page.assets = find_related_assets(path.parent().unwrap());
+            // `find_related_assets` only scans the immediate directory (it is not recursive) so our
+            // filtering only needs to work against the file_name component, not a relative path. If
+            // `find_related_assets` were changed to also return files in subdirectories, we could
+            // use `Path::strip_prefix` to remove the parent directory and then glob-filter
+            // against the remaining path. Note that the current behaviour effectively means that
+            // the `ignored_content` setting in the config file is limited to single-file glob
+            // patterns (no "**" patterns).
+            let globber = config.ignored_content_globber.as_ref().unwrap();
+            let parent_dir = path.parent().unwrap();
+            page.assets = find_related_assets(parent_dir).into_iter()
+                .filter(|path|
+                    match path.file_name() {
+                        None => true,
+                        Some(file) => !globber.is_match(file)
+                    }
+                ).collect();
+
+        } else {
+            page.assets = vec![];
        }

        Ok(page)
@ -240,6 +257,7 @@ mod tests {

    use tera::Tera;
    use tempdir::TempDir;
+    use globset::{Glob, GlobSetBuilder};

    use config::Config;
    use super::Page;
@ -419,4 +437,34 @@ Hello world
        assert_eq!(page.assets.len(), 3);
        assert_eq!(page.permalink, "http://a-website.com/posts/hey/");
    }
+
+    #[test]
+    fn page_with_ignored_assets_filters_out_correct_files() {
+        let tmp_dir = TempDir::new("example").expect("create temp dir");
+        let path = tmp_dir.path();
+        create_dir(&path.join("content")).expect("create content temp dir");
+        create_dir(&path.join("content").join("posts")).expect("create posts temp dir");
+        let nested_path = path.join("content").join("posts").join("with-assets");
+        create_dir(&nested_path).expect("create nested temp dir");
+        let mut f = File::create(nested_path.join("index.md")).unwrap();
+        f.write_all(b"+++\nslug=\"hey\"\n+++\n").unwrap();
+        File::create(nested_path.join("example.js")).unwrap();
+        File::create(nested_path.join("graph.jpg")).unwrap();
+        File::create(nested_path.join("fail.png")).unwrap();
+
+        let mut gsb = GlobSetBuilder::new();
+        gsb.add(Glob::new("*.{js,png}").unwrap());
+        let mut config = Config::default();
+        config.ignored_content_globber = Some(gsb.build().unwrap());
+
+        let res = Page::from_file(
+            nested_path.join("index.md").as_path(),
+            &config
+        );
+
+        assert!(res.is_ok());
+        let page = res.unwrap();
+        assert_eq!(page.assets.len(), 1);
+        assert_eq!(page.assets[0].file_name().unwrap().to_str(), Some("graph.jpg"));
+    }
 }
--- a/docs/content/documentation/content/overview.md
+++ b/docs/content/documentation/content/overview.md
@ -5,8 +5,8 @@ weight = 10


 Gutenberg uses the folder structure to determine the site structure.
-Each folder in the `content` directory represents a [section](./documentation/content/section.md) 
-that contains [pages](./documentation/content/page.md): your `.md` files. 
+Each folder in the `content` directory represents a [section](./documentation/content/section.md)
+that contains [pages](./documentation/content/page.md): your `.md` files.

 ```bash
 .
@ -26,21 +26,21 @@ that contains [pages](./documentation/content/page.md): your `.md` files.
 Each page path (the part after the `base_url`, for example `blog/cli-usage/`) can be customised by changing the `path` or `slug`
 attribute of the [page front-matter](./documentation/content/page.md#front-matter).

-You might have noticed a file named `_index.md` in the example above. 
+You might have noticed a file named `_index.md` in the example above.
 This file will be used for the metadata and content of the section itself and is not considered a page.

 To make sure the terminology used in the rest of the documentation is understood, let's go over the example above.

 The `content` directory in this case has three `sections`: `content`, `blog` and `landing`. The `content` section has only
-one page, `something.md`, the `landing` section has no page and the `blog` section has 4 pages: `cli-usage.md`, `configuration.md`, `directory-structure.md` 
+one page, `something.md`, the `landing` section has no page and the `blog` section has 4 pages: `cli-usage.md`, `configuration.md`, `directory-structure.md`
 and `installation.md`.

 While not shown in the example, sections can be nested indefinitely.

 ## Assets colocation

-The `content` directory is not limited to markup files though: it's natural to want to co-locate a page and some related 
-assets. 
+The `content` directory is not limited to markup files though: it's natural to want to co-locate a page and some related
+assets.

 Gutenberg supports that pattern out of the box: create a folder, add a `index.md` file and as many non-markdown files as you want.
 Those assets will be copied in the same folder when building the site which allows you to use a relative path to access them.
@ -52,3 +52,14 @@ Those assets will be copied in the same folder when building the site which allo
 ```

 By default, this page will get the folder name (`with-assets` in this case) as its slug.
+
+It is possible to ignore selected asset files using the
+[ignored_content](./documentation/getting-started/configuration.md) setting in the config file.
+For example, say you have an Excel spreadsheet from which you are taking several screenshots and
+then linking to those image files on your website. For maintainability purposes, you want to keep
+the spreadsheet in the same folder as the markdown, but you don't want to copy the spreadsheet to
+the public website. You can achieve this by setting `ignored_content` in the config file:
+
+```
+ignored_content = ["*.xlsx"]
+```
--- a/docs/content/documentation/getting-started/configuration.md
+++ b/docs/content/documentation/getting-started/configuration.md
@ -3,10 +3,10 @@ title = "Configuration"
 weight = 4
 +++

-The default configuration will be enough to get Gutenberg running locally but not more than that. 
+The default configuration will be enough to get Gutenberg running locally but not more than that.
 It follows the philosophy of only paying for what you need: almost everything is turned off by default.

-To change the config, edit the `config.toml` file. 
+To change the config, edit the `config.toml` file.
 If you are not familiar with TOML, have a look at [the TOML Spec](https://github.com/toml-lang/toml)
 to learn about it.

@ -30,7 +30,7 @@ theme = ""
 # Highlight all code blocks found
 highlight_code = false

-# Which theme to use for the code highlighting. 
+# Which theme to use for the code highlighting.
 # See below for list of accepted values
 highlight_theme = "base16-ocean-dark"

@ -40,21 +40,27 @@ generate_rss = false
 # The number of articles to include in the RSS feed
 rss_limit = 20

-# Whether to generate a tags page and individual 
+# Whether to generate a tags page and individual
 # tag pages for pages with tags
 generate_tags_pages = false

-# Whether to generate a categories page and individual 
+# Whether to generate a categories page and individual
 # category pages for pages with a category
 generate_categories_pages = false

 # Whether to compile the Sass files found in the `sass` directory
 compile_sass = false

+# A list of glob patterns specifying asset files to ignore when
+# copying content. Defaults to none, which means all asset files
+# are copied over to the public folder. Example:
+#     ignored_content = ["*.{graphml,xlsx}", "temp.*"]
+ignored_content = []
+
 # Optional translation object. The key if present should be a language code
 [translations]

-# You can put any kind of data in there and it 
+# You can put any kind of data in there and it
 # will be accessible in all templates
 [extra]
 ```
@ -76,5 +82,5 @@ Gutenberg currently has the following highlight themes available:
 - solarized-light
 - 1337

-Gutenberg uses the Sublime Text themes, making it very easy to add more. 
+Gutenberg uses the Sublime Text themes, making it very easy to add more.
 If you want a theme not on that list, please open an issue or a pull request on the [Gutenberg repo](https://github.com/Keats/gutenberg).