Include path in the search index with include_path (#1509)

This commit is contained in:
Tim Schumacher 2021-06-02 09:18:39 +02:00 committed by GitHub
parent 4c22996e11
commit 16c123aa20
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 89 additions and 30 deletions

45
Cargo.lock generated
View file

@ -155,9 +155,9 @@ dependencies = [
[[package]]
name = "bumpalo"
version = "3.6.1"
version = "3.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63396b8a4b9de3f4fdfb320ab6080762242f66a8ef174c49d8e19b674db4cdbe"
checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631"
[[package]]
name = "byte-tools"
@ -207,9 +207,9 @@ checksum = "b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040"
[[package]]
name = "cc"
version = "1.0.67"
version = "1.0.68"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd"
checksum = "4a72c244c1ff497a746a7e1fb3d14bd08420ecda70c8f25c7112f2781652d787"
dependencies = [
"jobserver",
]
@ -348,9 +348,9 @@ dependencies = [
[[package]]
name = "crossbeam-epoch"
version = "0.9.4"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52fb27eab85b17fbb9f6fd667089e07d6a2eb8743d02639ee7f6a7a7729c9c94"
checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd"
dependencies = [
"cfg-if 1.0.0",
"crossbeam-utils",
@ -361,11 +361,10 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.4"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4feb231f0d4d6af81aed15928e58ecf5816aa62a2393e2c82f46973e92a9a278"
checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db"
dependencies = [
"autocfg",
"cfg-if 1.0.0",
"lazy_static",
]
@ -462,9 +461,9 @@ checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]]
name = "elasticlunr-rs"
version = "2.3.11"
version = "2.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "959fbc9a6ebced545cbe365fdce5e25c6ab7683f2ca4ecc9fb9d0db663bf73d5"
checksum = "2f8cf73b19a7aece6942f5745a2fc1ae3c8b0533569707d596b5d6baa7d6c600"
dependencies = [
"jieba-rs",
"lazy_static",
@ -922,9 +921,9 @@ checksum = "f3a87b616e37e93c22fb19bcd386f02f3af5ea98a25670ad0fce773de23c5e68"
[[package]]
name = "httpdate"
version = "1.0.0"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05842d0d43232b23ccb7060ecb0f0626922c21f30012e97b767b30afd4a5d4b9"
checksum = "6456b8a6c8f33fee7d958fcd1b60d55b11940a79e63ae87013e6d22e26034440"
[[package]]
name = "humansize"
@ -934,9 +933,9 @@ checksum = "02296996cb8796d7c6e3bc2d9211b7802812d36999a51bb754123ead7d37d026"
[[package]]
name = "hyper"
version = "0.14.7"
version = "0.14.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e5f105c494081baa3bf9e200b279e27ec1623895cd504c7dbef8d0b080fcf54"
checksum = "d3f71a7eea53a3f8257a7b4795373ff886397178cd634430ea94e12d7fe4fe34"
dependencies = [
"bytes 1.0.1",
"futures-channel",
@ -1177,9 +1176,9 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.94"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e"
checksum = "789da6d93f1b866ffe175afc5322a4d76c038605a1c3319bb57b06967ca98a36"
[[package]]
name = "library"
@ -1364,9 +1363,9 @@ checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
[[package]]
name = "memoffset"
version = "0.6.3"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f83fb6581e8ed1f85fd45c116db8405483899489e38406156c25eb743554361d"
checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9"
dependencies = [
"autocfg",
]
@ -2719,9 +2718,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "tokio"
version = "1.6.0"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37"
checksum = "0a38d31d7831c6ed7aad00aa4c12d9375fd225a6dd77da1d25b707346319a975"
dependencies = [
"autocfg",
"bytes 1.0.1",
@ -2898,9 +2897,9 @@ dependencies = [
[[package]]
name = "unicode-normalization"
version = "0.1.17"
version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07fbfce1c8a97d547e8b5334978438d9d6ec8c20e38f56d4a4374d181493eaef"
checksum = "33717dca7ac877f497014e10d73f3acf948c342bee31b5ca7892faf94ccc6b49"
dependencies = [
"tinyvec",
]

View file

@ -13,6 +13,8 @@ pub struct Search {
/// Includes the description in the search index. When the site becomes too large, you can switch
/// to that instead. `false` by default
pub include_description: bool,
/// Include the path of the page in the search index. `false` by default.
pub include_path: bool,
}
impl Default for Search {
@ -21,6 +23,7 @@ impl Default for Search {
include_title: true,
include_content: true,
include_description: false,
include_path: false,
truncate_content_length: None,
}
}

View file

@ -1,6 +1,8 @@
use std::collections::{HashMap, HashSet};
use elasticlunr::{Index, Language};
use elasticlunr::pipeline;
use elasticlunr::pipeline::TokenizerFn;
use lazy_static::lazy_static;
use config::{Config, Search};
@ -36,6 +38,10 @@ fn build_fields(search_config: &Search) -> Vec<String> {
fields.push("description".to_owned());
}
if search_config.include_path {
fields.push("path".to_owned());
}
if search_config.include_content {
fields.push("body".to_owned());
}
@ -43,10 +49,46 @@ fn build_fields(search_config: &Search) -> Vec<String> {
fields
}
/// Splits a page path into lowercase search tokens.
///
/// Separators are any whitespace character, `-`, and `/`, so a path like
/// `/some-section/a-page/` yields `["some", "section", "a", "page"]`.
/// Empty segments produced by consecutive separators (or the leading and
/// trailing `/`) are discarded.
fn path_tokenizer(text: &str) -> Vec<String> {
    // `split` removes every separator char (including all whitespace), so the
    // surviving segments cannot carry leading/trailing whitespace — the
    // original `.trim()` here was a no-op and has been dropped.
    text.split(|c: char| c.is_whitespace() || c == '-' || c == '/')
        .filter(|s| !s.is_empty())
        .map(str::to_lowercase)
        .collect()
}
/// Picks one tokenizer per indexed field, in the same order as `build_fields`
/// pushes the fields (title, description, path, content).
///
/// The prose fields (title, description, body) all share a language-aware
/// text tokenizer; the path field uses the separator-based `path_tokenizer`.
fn build_tokenizers(search_config: &Search, language: Language) -> Vec<TokenizerFn> {
    // Select the tokenizer used for ordinary prose fields; CJK languages get
    // a dedicated tokenizer when the matching cargo feature is enabled.
    let text_tokenizer = match language {
        #[cfg(feature = "indexing-zh")]
        Language::Chinese => pipeline::tokenize_chinese,
        #[cfg(feature = "indexing-ja")]
        Language::Japanese => pipeline::tokenize_japanese,
        _ => pipeline::tokenize,
    };

    // One (enabled, tokenizer) pair per potential field, in field order.
    let candidates: [(bool, TokenizerFn); 4] = [
        (search_config.include_title, text_tokenizer),
        (search_config.include_description, text_tokenizer),
        (search_config.include_path, path_tokenizer),
        (search_config.include_content, text_tokenizer),
    ];

    candidates
        .iter()
        .filter_map(|&(enabled, tokenizer)| if enabled { Some(tokenizer) } else { None })
        .collect()
}
fn fill_index(
search_config: &Search,
title: &Option<String>,
description: &Option<String>,
path: &str,
content: &str,
) -> Vec<String> {
let mut row = vec![];
@ -59,6 +101,10 @@ fn fill_index(
row.push(description.clone().unwrap_or_default());
}
if search_config.include_path {
row.push(path.to_string());
}
if search_config.include_content {
let body = AMMONIA.clean(&content).to_string();
if let Some(truncate_len) = search_config.truncate_content_length {
@ -90,9 +136,11 @@ pub fn build_index(lang: &str, library: &Library, config: &Config) -> Result<Str
let language_options = &config.languages[lang];
let mut index = Index::with_language(language, &build_fields(&language_options.search));
let tokenizers = build_tokenizers(&language_options.search, language);
for section in library.sections_values() {
if section.lang == lang {
add_section_to_index(&mut index, section, library, &language_options.search);
add_section_to_index(&mut index, section, library, &language_options.search, tokenizers.clone());
}
}
@ -104,6 +152,7 @@ fn add_section_to_index(
section: &Section,
library: &Library,
search_config: &Search,
tokenizers: Vec<TokenizerFn>,
) {
if !section.meta.in_search_index {
return;
@ -111,14 +160,16 @@ fn add_section_to_index(
// Don't index redirecting sections
if section.meta.redirect_to.is_none() {
index.add_doc(
index.add_doc_with_tokenizers(
&section.permalink,
&fill_index(
search_config,
&section.meta.title,
&section.meta.description,
&section.path,
&section.content,
),
tokenizers.clone(),
);
}
@ -128,9 +179,10 @@ fn add_section_to_index(
continue;
}
index.add_doc(
index.add_doc_with_tokenizers(
&page.permalink,
&fill_index(search_config, &page.meta.title, &page.meta.description, &page.content),
&fill_index(search_config, &page.meta.title, &page.meta.description, &page.path, &page.content),
tokenizers.clone(),
);
}
}
@ -166,9 +218,10 @@ mod tests {
let config = Config::default();
let title = Some("A title".to_string());
let description = Some("A description".to_string());
let path = "/a/page/".to_string();
let content = "Some content".to_string();
let res = fill_index(&config.search, &title, &description, &content);
let res = fill_index(&config.search, &title, &description, &path, &content);
assert_eq!(res.len(), 2);
assert_eq!(res[0], title.unwrap());
assert_eq!(res[1], content);
@ -180,9 +233,10 @@ mod tests {
config.search.include_description = true;
let title = Some("A title".to_string());
let description = Some("A description".to_string());
let path = "/a/page/".to_string();
let content = "Some content".to_string();
let res = fill_index(&config.search, &title, &description, &content);
let res = fill_index(&config.search, &title, &description, &path, &content);
assert_eq!(res.len(), 3);
assert_eq!(res[0], title.unwrap());
assert_eq!(res[1], description.unwrap());
@ -195,9 +249,10 @@ mod tests {
config.search.truncate_content_length = Some(5);
let title = Some("A title".to_string());
let description = Some("A description".to_string());
let path = "/a/page/".to_string();
let content = "Some content".to_string();
let res = fill_index(&config.search, &title, &description, &content);
let res = fill_index(&config.search, &title, &description, &path, &content);
assert_eq!(res.len(), 2);
assert_eq!(res[0], title.unwrap());
assert_eq!(res[1], content[..5]);

View file

@ -151,6 +151,8 @@ build_search_index = false
include_title = true
# Whether to include the description of the page/section in the index
include_description = false
# Whether to include the path of the page/section in the index
include_path = false
# Whether to include the rendered content of the page/section in the index
include_content = true
# At which character to truncate the content to. Useful if you have a lot of pages and the index would