Include path in the search index with include_path (#1509)

This commit is contained in:
Tim Schumacher 2021-06-02 09:18:39 +02:00 committed by GitHub
parent 4c22996e11
commit 16c123aa20
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 89 additions and 30 deletions

45
Cargo.lock generated
View file

@ -155,9 +155,9 @@ dependencies = [
[[package]] [[package]]
name = "bumpalo" name = "bumpalo"
version = "3.6.1" version = "3.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63396b8a4b9de3f4fdfb320ab6080762242f66a8ef174c49d8e19b674db4cdbe" checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631"
[[package]] [[package]]
name = "byte-tools" name = "byte-tools"
@ -207,9 +207,9 @@ checksum = "b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040"
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.0.67" version = "1.0.68"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd" checksum = "4a72c244c1ff497a746a7e1fb3d14bd08420ecda70c8f25c7112f2781652d787"
dependencies = [ dependencies = [
"jobserver", "jobserver",
] ]
@ -348,9 +348,9 @@ dependencies = [
[[package]] [[package]]
name = "crossbeam-epoch" name = "crossbeam-epoch"
version = "0.9.4" version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52fb27eab85b17fbb9f6fd667089e07d6a2eb8743d02639ee7f6a7a7729c9c94" checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd"
dependencies = [ dependencies = [
"cfg-if 1.0.0", "cfg-if 1.0.0",
"crossbeam-utils", "crossbeam-utils",
@ -361,11 +361,10 @@ dependencies = [
[[package]] [[package]]
name = "crossbeam-utils" name = "crossbeam-utils"
version = "0.8.4" version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4feb231f0d4d6af81aed15928e58ecf5816aa62a2393e2c82f46973e92a9a278" checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db"
dependencies = [ dependencies = [
"autocfg",
"cfg-if 1.0.0", "cfg-if 1.0.0",
"lazy_static", "lazy_static",
] ]
@ -462,9 +461,9 @@ checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]] [[package]]
name = "elasticlunr-rs" name = "elasticlunr-rs"
version = "2.3.11" version = "2.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "959fbc9a6ebced545cbe365fdce5e25c6ab7683f2ca4ecc9fb9d0db663bf73d5" checksum = "2f8cf73b19a7aece6942f5745a2fc1ae3c8b0533569707d596b5d6baa7d6c600"
dependencies = [ dependencies = [
"jieba-rs", "jieba-rs",
"lazy_static", "lazy_static",
@ -922,9 +921,9 @@ checksum = "f3a87b616e37e93c22fb19bcd386f02f3af5ea98a25670ad0fce773de23c5e68"
[[package]] [[package]]
name = "httpdate" name = "httpdate"
version = "1.0.0" version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05842d0d43232b23ccb7060ecb0f0626922c21f30012e97b767b30afd4a5d4b9" checksum = "6456b8a6c8f33fee7d958fcd1b60d55b11940a79e63ae87013e6d22e26034440"
[[package]] [[package]]
name = "humansize" name = "humansize"
@ -934,9 +933,9 @@ checksum = "02296996cb8796d7c6e3bc2d9211b7802812d36999a51bb754123ead7d37d026"
[[package]] [[package]]
name = "hyper" name = "hyper"
version = "0.14.7" version = "0.14.8"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e5f105c494081baa3bf9e200b279e27ec1623895cd504c7dbef8d0b080fcf54" checksum = "d3f71a7eea53a3f8257a7b4795373ff886397178cd634430ea94e12d7fe4fe34"
dependencies = [ dependencies = [
"bytes 1.0.1", "bytes 1.0.1",
"futures-channel", "futures-channel",
@ -1177,9 +1176,9 @@ dependencies = [
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.94" version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e" checksum = "789da6d93f1b866ffe175afc5322a4d76c038605a1c3319bb57b06967ca98a36"
[[package]] [[package]]
name = "library" name = "library"
@ -1364,9 +1363,9 @@ checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
[[package]] [[package]]
name = "memoffset" name = "memoffset"
version = "0.6.3" version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f83fb6581e8ed1f85fd45c116db8405483899489e38406156c25eb743554361d" checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9"
dependencies = [ dependencies = [
"autocfg", "autocfg",
] ]
@ -2719,9 +2718,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]] [[package]]
name = "tokio" name = "tokio"
version = "1.6.0" version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37" checksum = "0a38d31d7831c6ed7aad00aa4c12d9375fd225a6dd77da1d25b707346319a975"
dependencies = [ dependencies = [
"autocfg", "autocfg",
"bytes 1.0.1", "bytes 1.0.1",
@ -2898,9 +2897,9 @@ dependencies = [
[[package]] [[package]]
name = "unicode-normalization" name = "unicode-normalization"
version = "0.1.17" version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07fbfce1c8a97d547e8b5334978438d9d6ec8c20e38f56d4a4374d181493eaef" checksum = "33717dca7ac877f497014e10d73f3acf948c342bee31b5ca7892faf94ccc6b49"
dependencies = [ dependencies = [
"tinyvec", "tinyvec",
] ]

View file

@ -13,6 +13,8 @@ pub struct Search {
/// Includes the description in the search index. When the site becomes too large, you can switch /// Includes the description in the search index. When the site becomes too large, you can switch
/// to that instead. `false` by default /// to that instead. `false` by default
pub include_description: bool, pub include_description: bool,
/// Include the path of the page in the search index. `false` by default.
pub include_path: bool,
} }
impl Default for Search { impl Default for Search {
@ -21,6 +23,7 @@ impl Default for Search {
include_title: true, include_title: true,
include_content: true, include_content: true,
include_description: false, include_description: false,
include_path: false,
truncate_content_length: None, truncate_content_length: None,
} }
} }

View file

@ -1,6 +1,8 @@
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use elasticlunr::{Index, Language}; use elasticlunr::{Index, Language};
use elasticlunr::pipeline;
use elasticlunr::pipeline::TokenizerFn;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use config::{Config, Search}; use config::{Config, Search};
@ -36,6 +38,10 @@ fn build_fields(search_config: &Search) -> Vec<String> {
fields.push("description".to_owned()); fields.push("description".to_owned());
} }
if search_config.include_path {
fields.push("path".to_owned());
}
if search_config.include_content { if search_config.include_content {
fields.push("body".to_owned()); fields.push("body".to_owned());
} }
@ -43,10 +49,46 @@ fn build_fields(search_config: &Search) -> Vec<String> {
fields fields
} }
/// Breaks a page/section path into lowercase tokens for the search index.
///
/// Whitespace, `-` and `/` are all treated as separators. Empty fragments
/// (produced by leading, trailing or adjacent separators) are discarded.
fn path_tokenizer(text: &str) -> Vec<String> {
    let is_separator = |c: char| c == '/' || c == '-' || c.is_whitespace();

    let mut tokens = Vec::new();
    for fragment in text.split(is_separator) {
        if fragment.is_empty() {
            continue;
        }
        // Fragments were split on whitespace, so there is nothing left to trim;
        // only case normalisation remains.
        tokens.push(fragment.to_lowercase());
    }
    tokens
}
/// Builds the list of tokenizers matching the fields enabled in `search_config`.
///
/// One tokenizer is produced per enabled field, in the order title,
/// description, path, content. Title, description and content share the
/// tokenizer chosen for `language`; the path field uses `path_tokenizer`.
fn build_tokenizers(search_config: &Search, language: Language) -> Vec<TokenizerFn> {
    // Language-specific tokenizers are only selectable when the matching
    // cargo feature is compiled in; everything else uses the default one.
    let text_tokenizer: TokenizerFn = match language {
        #[cfg(feature = "indexing-zh")]
        Language::Chinese => pipeline::tokenize_chinese,
        #[cfg(feature = "indexing-ja")]
        Language::Japanese => pipeline::tokenize_japanese,
        _ => pipeline::tokenize,
    };
    let path_fn: TokenizerFn = path_tokenizer;

    // (enabled?, tokenizer) for every possible field, in index order.
    let per_field: [(bool, TokenizerFn); 4] = [
        (search_config.include_title, text_tokenizer),
        (search_config.include_description, text_tokenizer),
        (search_config.include_path, path_fn),
        (search_config.include_content, text_tokenizer),
    ];

    per_field
        .iter()
        .filter(|(enabled, _)| *enabled)
        .map(|&(_, tokenizer)| tokenizer)
        .collect()
}
fn fill_index( fn fill_index(
search_config: &Search, search_config: &Search,
title: &Option<String>, title: &Option<String>,
description: &Option<String>, description: &Option<String>,
path: &str,
content: &str, content: &str,
) -> Vec<String> { ) -> Vec<String> {
let mut row = vec![]; let mut row = vec![];
@ -59,6 +101,10 @@ fn fill_index(
row.push(description.clone().unwrap_or_default()); row.push(description.clone().unwrap_or_default());
} }
if search_config.include_path {
row.push(path.to_string());
}
if search_config.include_content { if search_config.include_content {
let body = AMMONIA.clean(&content).to_string(); let body = AMMONIA.clean(&content).to_string();
if let Some(truncate_len) = search_config.truncate_content_length { if let Some(truncate_len) = search_config.truncate_content_length {
@ -90,9 +136,11 @@ pub fn build_index(lang: &str, library: &Library, config: &Config) -> Result<Str
let language_options = &config.languages[lang]; let language_options = &config.languages[lang];
let mut index = Index::with_language(language, &build_fields(&language_options.search)); let mut index = Index::with_language(language, &build_fields(&language_options.search));
let tokenizers = build_tokenizers(&language_options.search, language);
for section in library.sections_values() { for section in library.sections_values() {
if section.lang == lang { if section.lang == lang {
add_section_to_index(&mut index, section, library, &language_options.search); add_section_to_index(&mut index, section, library, &language_options.search, tokenizers.clone());
} }
} }
@ -104,6 +152,7 @@ fn add_section_to_index(
section: &Section, section: &Section,
library: &Library, library: &Library,
search_config: &Search, search_config: &Search,
tokenizers: Vec<TokenizerFn>,
) { ) {
if !section.meta.in_search_index { if !section.meta.in_search_index {
return; return;
@ -111,14 +160,16 @@ fn add_section_to_index(
// Don't index redirecting sections // Don't index redirecting sections
if section.meta.redirect_to.is_none() { if section.meta.redirect_to.is_none() {
index.add_doc( index.add_doc_with_tokenizers(
&section.permalink, &section.permalink,
&fill_index( &fill_index(
search_config, search_config,
&section.meta.title, &section.meta.title,
&section.meta.description, &section.meta.description,
&section.path,
&section.content, &section.content,
), ),
tokenizers.clone(),
); );
} }
@ -128,9 +179,10 @@ fn add_section_to_index(
continue; continue;
} }
index.add_doc( index.add_doc_with_tokenizers(
&page.permalink, &page.permalink,
&fill_index(search_config, &page.meta.title, &page.meta.description, &page.content), &fill_index(search_config, &page.meta.title, &page.meta.description, &page.path, &page.content),
tokenizers.clone(),
); );
} }
} }
@ -166,9 +218,10 @@ mod tests {
let config = Config::default(); let config = Config::default();
let title = Some("A title".to_string()); let title = Some("A title".to_string());
let description = Some("A description".to_string()); let description = Some("A description".to_string());
let path = "/a/page/".to_string();
let content = "Some content".to_string(); let content = "Some content".to_string();
let res = fill_index(&config.search, &title, &description, &content); let res = fill_index(&config.search, &title, &description, &path, &content);
assert_eq!(res.len(), 2); assert_eq!(res.len(), 2);
assert_eq!(res[0], title.unwrap()); assert_eq!(res[0], title.unwrap());
assert_eq!(res[1], content); assert_eq!(res[1], content);
@ -180,9 +233,10 @@ mod tests {
config.search.include_description = true; config.search.include_description = true;
let title = Some("A title".to_string()); let title = Some("A title".to_string());
let description = Some("A description".to_string()); let description = Some("A description".to_string());
let path = "/a/page/".to_string();
let content = "Some content".to_string(); let content = "Some content".to_string();
let res = fill_index(&config.search, &title, &description, &content); let res = fill_index(&config.search, &title, &description, &path, &content);
assert_eq!(res.len(), 3); assert_eq!(res.len(), 3);
assert_eq!(res[0], title.unwrap()); assert_eq!(res[0], title.unwrap());
assert_eq!(res[1], description.unwrap()); assert_eq!(res[1], description.unwrap());
@ -195,9 +249,10 @@ mod tests {
config.search.truncate_content_length = Some(5); config.search.truncate_content_length = Some(5);
let title = Some("A title".to_string()); let title = Some("A title".to_string());
let description = Some("A description".to_string()); let description = Some("A description".to_string());
let path = "/a/page/".to_string();
let content = "Some content".to_string(); let content = "Some content".to_string();
let res = fill_index(&config.search, &title, &description, &content); let res = fill_index(&config.search, &title, &description, &path, &content);
assert_eq!(res.len(), 2); assert_eq!(res.len(), 2);
assert_eq!(res[0], title.unwrap()); assert_eq!(res[0], title.unwrap());
assert_eq!(res[1], content[..5]); assert_eq!(res[1], content[..5]);

View file

@ -151,6 +151,8 @@ build_search_index = false
include_title = true include_title = true
# Whether to include the description of the page/section in the index # Whether to include the description of the page/section in the index
include_description = false include_description = false
# Whether to include the path of the page/section in the index
include_path = false
# Whether to include the rendered content of the page/section in the index # Whether to include the rendered content of the page/section in the index
include_content = true include_content = true
# At which character to truncate the content to. Useful if you have a lot of pages and the index would # At which character to truncate the content to. Useful if you have a lot of pages and the index would