zola/components/search/src/lib.rs

use std::collections::{HashMap, HashSet};
use elasticlunr::pipeline;
use elasticlunr::pipeline::TokenizerFn;
use elasticlunr::{Index, Language};
use lazy_static::lazy_static;

use config::{Config, Search};
use errors::{bail, Result};
use library::{Library, Section};
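
/// The minified elasticlunr.js source, bundled at compile time so it can be
/// shipped with the generated site.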
pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js");
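
// HTML sanitizer used to strip markup from content before indexing: no tags or
// attributes are allowed through, and the text inside `<script>` and `<style>`
// elements is removed entirely.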
lazy_static! {
    static ref AMMONIA: ammonia::Builder<'static> = {
        let mut clean_content = HashSet::new();
        clean_content.insert("script");
        clean_content.insert("style");
        let mut builder = ammonia::Builder::new();
        builder
            .tags(HashSet::new())
            .tag_attributes(HashMap::new())
            .generic_attributes(HashSet::new())
            .link_rel(None)
            .allowed_classes(HashMap::new())
            .clean_content_tags(clean_content);
        builder
    };
}
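
/// Returns the names of the fields to index, in the order defined by the search config.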
fn build_fields(search_config: &Search) -> Vec<String> {
    let mut fields = vec![];

    if search_config.include_title {
        fields.push("title".to_owned());
    }

    if search_config.include_description {
        fields.push("description".to_owned());
    }
    if search_config.include_path {
        fields.push("path".to_owned());
    }

    if search_config.include_content {
        fields.push("body".to_owned());
    }

    fields
}
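
/// Tokenizes a path by splitting on whitespace, `-` and `/` and lowercasing:
/// for example `/blog/my-post/` yields `blog`, `my` and `post`.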
fn path_tokenizer(text: &str) -> Vec<String> {
    text.split(|c: char| c.is_whitespace() || c == '-' || c == '/')
        .filter(|s| !s.is_empty())
        .map(|s| s.trim().to_lowercase())
        .collect()
}
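
/// Returns one tokenizer per indexed field, in the same order as `build_fields`:
/// the path field uses `path_tokenizer`, every other field uses the text tokenizer
/// for the given language (Chinese/Japanese variants when the matching feature is enabled).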
fn build_tokenizers(search_config: &Search, language: Language) -> Vec<TokenizerFn> {
    let text_tokenizer = match language {
        #[cfg(feature = "indexing-zh")]
        Language::Chinese => pipeline::tokenize_chinese,
        #[cfg(feature = "indexing-ja")]
        Language::Japanese => pipeline::tokenize_japanese,
        _ => pipeline::tokenize,
    };
    let mut tokenizers: Vec<TokenizerFn> = vec![];
    if search_config.include_title {
        tokenizers.push(text_tokenizer);
    }
    if search_config.include_description {
        tokenizers.push(text_tokenizer);
    }
    if search_config.include_path {
        tokenizers.push(path_tokenizer);
    }
    if search_config.include_content {
        tokenizers.push(text_tokenizer);
    }

    tokenizers
}
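
/// Builds the values stored for a single document, in the same field order as
/// `build_fields`. The content is cleaned of HTML and, if configured, truncated
/// to `truncate_content_length` characters.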
fn fill_index(
    search_config: &Search,
    title: &Option<String>,
    description: &Option<String>,
    path: &str,
    content: &str,
) -> Vec<String> {
    let mut row = vec![];

    if search_config.include_title {
        row.push(title.clone().unwrap_or_default());
    }

    if search_config.include_description {
        row.push(description.clone().unwrap_or_default());
    }
    if search_config.include_path {
        row.push(path.to_string());
    }

    if search_config.include_content {
        let body = AMMONIA.clean(&content).to_string();

        if let Some(truncate_len) = search_config.truncate_content_length {
            // Not great for unicode
            // TODO: fix it like the truncate in Tera
            match body.char_indices().nth(truncate_len) {
                None => row.push(body),
                Some((idx, _)) => row.push((&body[..idx]).to_string()),
            };
        } else {
            row.push(body);
        };
    }

    row
}

/// Returns the generated JSON index with all the documents of the site added,
/// using the given language.
/// Errors if the given language is not available in Elasticlunr.
/// TODO: is making `in_search_index` apply to subsections of a `false` section useful?
pub fn build_index(lang: &str, library: &Library, config: &Config) -> Result<String> {
    let language = match Language::from_code(lang) {
        Some(l) => l,
        None => {
            bail!("Tried to build search index for language {} which is not supported", lang);
        }
    };
    let language_options = &config.languages[lang];
    let mut index = Index::with_language(language, &build_fields(&language_options.search));

    let tokenizers = build_tokenizers(&language_options.search, language);

    for section in library.sections_values() {
        if section.lang == lang {
            add_section_to_index(
                &mut index,
                section,
                library,
                &language_options.search,
                tokenizers.clone(),
            );
        }
    }

    Ok(index.to_json())
}
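
/// Adds a section and its pages to the index, skipping anything with
/// `in_search_index = false`. A section that redirects is not indexed itself,
/// but its pages still are.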
fn add_section_to_index(
    index: &mut Index,
    section: &Section,
    library: &Library,
    search_config: &Search,
    tokenizers: Vec<TokenizerFn>,
) {
    if !section.meta.in_search_index {
        return;
    }

    // Don't index redirecting sections
    if section.meta.redirect_to.is_none() {
        index.add_doc_with_tokenizers(
            &section.permalink,
            &fill_index(
                search_config,
                &section.meta.title,
                &section.meta.description,
                &section.path,
                &section.content,
            ),
            tokenizers.clone(),
        );
    }

    for key in &section.pages {
        let page = library.get_page_by_key(*key);
        if !page.meta.in_search_index {
            continue;
        }

        index.add_doc_with_tokenizers(
            &page.permalink,
            &fill_index(
                search_config,
                &page.meta.title,
                &page.meta.description,
                &page.path,
                &page.content,
            ),
            tokenizers.clone(),
        );
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use config::Config;

    #[test]
    fn can_build_fields() {
        let mut config = Config::default();
        let fields = build_fields(&config.search);
        assert_eq!(fields, vec!["title", "body"]);

        config.search.include_content = false;
        config.search.include_description = true;
        let fields = build_fields(&config.search);
        assert_eq!(fields, vec!["title", "description"]);

        config.search.include_content = true;
        let fields = build_fields(&config.search);
        assert_eq!(fields, vec!["title", "description", "body"]);

        config.search.include_title = false;
        let fields = build_fields(&config.search);
        assert_eq!(fields, vec!["description", "body"]);
    }

    #[test]
    fn can_fill_index_default() {
        let config = Config::default();
        let title = Some("A title".to_string());
        let description = Some("A description".to_string());
        let path = "/a/page/".to_string();
        let content = "Some content".to_string();

        let res = fill_index(&config.search, &title, &description, &path, &content);
        assert_eq!(res.len(), 2);
        assert_eq!(res[0], title.unwrap());
        assert_eq!(res[1], content);
    }

    #[test]
    fn can_fill_index_description() {
        let mut config = Config::default();
        config.search.include_description = true;
        let title = Some("A title".to_string());
        let description = Some("A description".to_string());
        let path = "/a/page/".to_string();
        let content = "Some content".to_string();

        let res = fill_index(&config.search, &title, &description, &path, &content);
        assert_eq!(res.len(), 3);
        assert_eq!(res[0], title.unwrap());
        assert_eq!(res[1], description.unwrap());
        assert_eq!(res[2], content);
    }

    #[test]
    fn can_fill_index_truncated_content() {
        let mut config = Config::default();
        config.search.truncate_content_length = Some(5);
        let title = Some("A title".to_string());
        let description = Some("A description".to_string());
        let path = "/a/page/".to_string();
        let content = "Some content".to_string();

        let res = fill_index(&config.search, &title, &description, &path, &content);
        assert_eq!(res.len(), 2);
        assert_eq!(res[0], title.unwrap());
        assert_eq!(res[1], content[..5]);
    }
}