2018-03-15 17:58:32 +00:00
|
|
|
use std::collections::{HashMap, HashSet};
|
|
|
|
|
2018-03-20 20:27:33 +00:00
|
|
|
use elasticlunr::{Index, Language};
|
2021-06-02 07:18:39 +00:00
|
|
|
use elasticlunr::pipeline;
|
|
|
|
use elasticlunr::pipeline::TokenizerFn;
|
2019-12-21 21:52:39 +00:00
|
|
|
use lazy_static::lazy_static;
|
2018-03-20 20:27:33 +00:00
|
|
|
|
2021-03-13 20:27:17 +00:00
|
|
|
use config::{Config, Search};
|
2019-12-21 21:52:39 +00:00
|
|
|
use errors::{bail, Result};
|
2018-10-31 07:18:57 +00:00
|
|
|
use library::{Library, Section};
|
2018-03-15 17:58:32 +00:00
|
|
|
|
2018-09-30 19:15:09 +00:00
|
|
|
/// Contents of `elasticlunr.min.js`, embedded as a string at compile time via `include_str!`.
pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js");
|
2018-03-15 17:58:32 +00:00
|
|
|
|
|
|
|
lazy_static! {
    /// Shared HTML sanitizer used to clean content before it is stored in the search index.
    ///
    /// Every allow-list (tags, per-tag attributes, generic attributes, classes) is empty and
    /// no `rel` attribute is configured for links. `script` and `style` are registered as
    /// clean-content tags; per the ammonia documentation those are removed together with
    /// their contents.
    static ref AMMONIA: ammonia::Builder<'static> = {
        let dropped_with_content: HashSet<&str> = ["script", "style"].iter().copied().collect();

        let mut sanitizer = ammonia::Builder::new();
        sanitizer.tags(HashSet::new());
        sanitizer.tag_attributes(HashMap::new());
        sanitizer.generic_attributes(HashSet::new());
        sanitizer.link_rel(None);
        sanitizer.allowed_classes(HashMap::new());
        sanitizer.clean_content_tags(dropped_with_content);
        sanitizer
    };
}
|
|
|
|
|
2021-03-13 20:27:17 +00:00
|
|
|
fn build_fields(search_config: &Search) -> Vec<String> {
|
2020-06-29 18:02:05 +00:00
|
|
|
let mut fields = vec![];
|
2021-03-13 20:27:17 +00:00
|
|
|
if search_config.include_title {
|
2020-06-29 18:02:05 +00:00
|
|
|
fields.push("title".to_owned());
|
|
|
|
}
|
|
|
|
|
2021-03-13 20:27:17 +00:00
|
|
|
if search_config.include_description {
|
2020-06-29 18:02:05 +00:00
|
|
|
fields.push("description".to_owned());
|
|
|
|
}
|
|
|
|
|
2021-06-02 07:18:39 +00:00
|
|
|
if search_config.include_path {
|
|
|
|
fields.push("path".to_owned());
|
|
|
|
}
|
|
|
|
|
2021-03-13 20:27:17 +00:00
|
|
|
if search_config.include_content {
|
2020-06-29 18:02:05 +00:00
|
|
|
fields.push("body".to_owned());
|
|
|
|
}
|
|
|
|
|
|
|
|
fields
|
|
|
|
}
|
|
|
|
|
2021-06-02 07:18:39 +00:00
|
|
|
/// Tokenizer for the `path` field.
///
/// Splits `text` on whitespace, `-` and `/`, drops the empty pieces produced by leading,
/// trailing or consecutive separators, and lowercases what is left.
fn path_tokenizer(text: &str) -> Vec<String> {
    let is_separator = |c: char| c == '/' || c == '-' || c.is_whitespace();

    let mut tokens = Vec::new();
    for piece in text.split(is_separator) {
        if piece.is_empty() {
            continue;
        }
        tokens.push(piece.trim().to_lowercase());
    }
    tokens
}
|
|
|
|
|
|
|
|
fn build_tokenizers(search_config: &Search, language: Language) -> Vec<TokenizerFn> {
|
|
|
|
let text_tokenizer = match language {
|
|
|
|
#[cfg(feature = "indexing-zh")]
|
|
|
|
Language::Chinese => pipeline::tokenize_chinese,
|
|
|
|
#[cfg(feature = "indexing-ja")]
|
|
|
|
Language::Japanese => pipeline::tokenize_japanese,
|
|
|
|
_ => pipeline::tokenize,
|
|
|
|
};
|
|
|
|
let mut tokenizers: Vec<TokenizerFn> = vec![];
|
|
|
|
if search_config.include_title {
|
|
|
|
tokenizers.push(text_tokenizer);
|
|
|
|
}
|
|
|
|
|
|
|
|
if search_config.include_description {
|
|
|
|
tokenizers.push(text_tokenizer);
|
|
|
|
}
|
|
|
|
|
|
|
|
if search_config.include_path {
|
|
|
|
tokenizers.push(path_tokenizer);
|
|
|
|
}
|
|
|
|
|
|
|
|
if search_config.include_content {
|
|
|
|
tokenizers.push(text_tokenizer);
|
|
|
|
}
|
|
|
|
|
|
|
|
tokenizers
|
|
|
|
}
|
|
|
|
|
2020-06-29 18:02:05 +00:00
|
|
|
fn fill_index(
|
2021-03-13 20:27:17 +00:00
|
|
|
search_config: &Search,
|
2020-06-29 18:02:05 +00:00
|
|
|
title: &Option<String>,
|
|
|
|
description: &Option<String>,
|
2021-06-02 07:18:39 +00:00
|
|
|
path: &str,
|
2020-06-29 18:02:05 +00:00
|
|
|
content: &str,
|
|
|
|
) -> Vec<String> {
|
|
|
|
let mut row = vec![];
|
|
|
|
|
2021-03-13 20:27:17 +00:00
|
|
|
if search_config.include_title {
|
2020-06-29 18:02:05 +00:00
|
|
|
row.push(title.clone().unwrap_or_default());
|
|
|
|
}
|
|
|
|
|
2021-03-13 20:27:17 +00:00
|
|
|
if search_config.include_description {
|
2020-06-29 18:02:05 +00:00
|
|
|
row.push(description.clone().unwrap_or_default());
|
|
|
|
}
|
|
|
|
|
2021-06-02 07:18:39 +00:00
|
|
|
if search_config.include_path {
|
|
|
|
row.push(path.to_string());
|
|
|
|
}
|
|
|
|
|
2021-03-13 20:27:17 +00:00
|
|
|
if search_config.include_content {
|
2020-06-29 18:02:05 +00:00
|
|
|
let body = AMMONIA.clean(&content).to_string();
|
2021-03-13 20:27:17 +00:00
|
|
|
if let Some(truncate_len) = search_config.truncate_content_length {
|
2020-06-29 18:02:05 +00:00
|
|
|
// Not great for unicode
|
|
|
|
// TODO: fix it like the truncate in Tera
|
|
|
|
match body.char_indices().nth(truncate_len) {
|
|
|
|
None => row.push(body),
|
|
|
|
Some((idx, _)) => row.push((&body[..idx]).to_string()),
|
|
|
|
};
|
|
|
|
} else {
|
|
|
|
row.push(body);
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
row
|
|
|
|
}
|
|
|
|
|
2018-03-20 20:27:33 +00:00
|
|
|
/// Returns the generated JSON index with all the documents of the site added using
|
|
|
|
/// the language given
|
|
|
|
/// Errors if the language given is not available in Elasticlunr
|
2018-03-15 17:58:32 +00:00
|
|
|
/// TODO: is making `in_search_index` apply to subsections of a `false` section useful?
|
2020-06-29 18:02:05 +00:00
|
|
|
pub fn build_index(lang: &str, library: &Library, config: &Config) -> Result<String> {
|
2018-03-20 20:27:33 +00:00
|
|
|
let language = match Language::from_code(lang) {
|
|
|
|
Some(l) => l,
|
2018-10-31 07:18:57 +00:00
|
|
|
None => {
|
|
|
|
bail!("Tried to build search index for language {} which is not supported", lang);
|
|
|
|
}
|
2018-03-20 20:27:33 +00:00
|
|
|
};
|
2021-03-13 20:27:17 +00:00
|
|
|
let language_options = &config.languages[lang];
|
|
|
|
let mut index = Index::with_language(language, &build_fields(&language_options.search));
|
2018-03-15 17:58:32 +00:00
|
|
|
|
2021-06-02 07:18:39 +00:00
|
|
|
let tokenizers = build_tokenizers(&language_options.search, language);
|
|
|
|
|
2018-10-02 14:42:34 +00:00
|
|
|
for section in library.sections_values() {
|
2019-09-03 14:50:23 +00:00
|
|
|
if section.lang == lang {
|
2021-06-02 07:18:39 +00:00
|
|
|
add_section_to_index(&mut index, section, library, &language_options.search, tokenizers.clone());
|
2019-09-03 14:50:23 +00:00
|
|
|
}
|
2018-03-15 17:58:32 +00:00
|
|
|
}
|
|
|
|
|
2018-03-20 20:27:33 +00:00
|
|
|
Ok(index.to_json())
|
2018-03-15 17:58:32 +00:00
|
|
|
}
|
|
|
|
|
2021-03-13 20:27:17 +00:00
|
|
|
fn add_section_to_index(
|
|
|
|
index: &mut Index,
|
|
|
|
section: &Section,
|
|
|
|
library: &Library,
|
|
|
|
search_config: &Search,
|
2021-06-02 07:18:39 +00:00
|
|
|
tokenizers: Vec<TokenizerFn>,
|
2021-03-13 20:27:17 +00:00
|
|
|
) {
|
2018-03-15 17:58:32 +00:00
|
|
|
if !section.meta.in_search_index {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Don't index redirecting sections
|
|
|
|
if section.meta.redirect_to.is_none() {
|
2021-06-02 07:18:39 +00:00
|
|
|
index.add_doc_with_tokenizers(
|
2018-03-15 17:58:32 +00:00
|
|
|
§ion.permalink,
|
2021-03-13 20:27:17 +00:00
|
|
|
&fill_index(
|
|
|
|
search_config,
|
|
|
|
§ion.meta.title,
|
|
|
|
§ion.meta.description,
|
2021-06-02 07:18:39 +00:00
|
|
|
§ion.path,
|
2021-03-13 20:27:17 +00:00
|
|
|
§ion.content,
|
|
|
|
),
|
2021-06-02 07:18:39 +00:00
|
|
|
tokenizers.clone(),
|
2018-03-15 17:58:32 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
|
2018-10-02 14:42:34 +00:00
|
|
|
for key in §ion.pages {
|
|
|
|
let page = library.get_page_by_key(*key);
|
2019-07-19 09:10:28 +00:00
|
|
|
if !page.meta.in_search_index {
|
2018-03-15 17:58:32 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2021-06-02 07:18:39 +00:00
|
|
|
index.add_doc_with_tokenizers(
|
2018-03-15 17:58:32 +00:00
|
|
|
&page.permalink,
|
2021-06-02 07:18:39 +00:00
|
|
|
&fill_index(search_config, &page.meta.title, &page.meta.description, &page.path, &page.content),
|
|
|
|
tokenizers.clone(),
|
2018-03-15 17:58:32 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
2020-06-29 18:02:05 +00:00
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    use config::Config;

    /// Field names follow the `include_*` flags and keep a fixed order.
    #[test]
    fn can_build_fields() {
        let mut config = Config::default();
        let fields = build_fields(&config.search);
        assert_eq!(fields, vec!["title", "body"]);

        config.search.include_content = false;
        config.search.include_description = true;
        let fields = build_fields(&config.search);
        assert_eq!(fields, vec!["title", "description"]);

        config.search.include_content = true;
        let fields = build_fields(&config.search);
        assert_eq!(fields, vec!["title", "description", "body"]);

        config.search.include_title = false;
        let fields = build_fields(&config.search);
        assert_eq!(fields, vec!["description", "body"]);
    }

    /// The `path` field sits between `description` and `body` when enabled.
    #[test]
    fn can_build_fields_with_path() {
        let mut config = Config::default();
        config.search.include_path = true;
        let fields = build_fields(&config.search);
        assert_eq!(fields, vec!["title", "path", "body"]);

        config.search.include_description = true;
        let fields = build_fields(&config.search);
        assert_eq!(fields, vec!["title", "description", "path", "body"]);
    }

    /// With the default settings only title and body are emitted.
    #[test]
    fn can_fill_index_default() {
        let config = Config::default();
        let title = Some("A title".to_string());
        let description = Some("A description".to_string());
        let path = "/a/page/".to_string();
        let content = "Some content".to_string();

        let res = fill_index(&config.search, &title, &description, &path, &content);
        assert_eq!(res.len(), 2);
        assert_eq!(res[0], title.unwrap());
        assert_eq!(res[1], content);
    }

    /// Enabling the description adds it between title and body.
    #[test]
    fn can_fill_index_description() {
        let mut config = Config::default();
        config.search.include_description = true;
        let title = Some("A title".to_string());
        let description = Some("A description".to_string());
        let path = "/a/page/".to_string();
        let content = "Some content".to_string();

        let res = fill_index(&config.search, &title, &description, &path, &content);
        assert_eq!(res.len(), 3);
        assert_eq!(res[0], title.unwrap());
        assert_eq!(res[1], description.unwrap());
        assert_eq!(res[2], content);
    }

    /// Enabling the path adds it, unmodified, between title and body.
    #[test]
    fn can_fill_index_path() {
        let mut config = Config::default();
        config.search.include_path = true;
        let title = Some("A title".to_string());
        let description = Some("A description".to_string());
        let path = "/a/page/".to_string();
        let content = "Some content".to_string();

        let res = fill_index(&config.search, &title, &description, &path, &content);
        assert_eq!(res.len(), 3);
        assert_eq!(res[0], title.unwrap());
        assert_eq!(res[1], path);
        assert_eq!(res[2], content);
    }

    /// `truncate_content_length` caps the body length.
    #[test]
    fn can_fill_index_truncated_content() {
        let mut config = Config::default();
        config.search.truncate_content_length = Some(5);
        let title = Some("A title".to_string());
        let description = Some("A description".to_string());
        let path = "/a/page/".to_string();
        let content = "Some content".to_string();

        let res = fill_index(&config.search, &title, &description, &path, &content);
        assert_eq!(res.len(), 2);
        assert_eq!(res[0], title.unwrap());
        assert_eq!(res[1], content[..5]);
    }

    /// Paths are split on `/`, `-` and whitespace, lowercased, and empty pieces are dropped.
    #[test]
    fn can_tokenize_path() {
        assert_eq!(path_tokenizer("/a/page-name/"), vec!["a", "page", "name"]);
        assert_eq!(path_tokenizer("Docs/Getting Started"), vec!["docs", "getting", "started"]);
        assert!(path_tokenizer("//--  ").is_empty());
    }
}
|