Skip to content
Snippets Groups Projects
Commit 20a54aac authored by Benjamin Lee's avatar Benjamin Lee Committed by 🥺
Browse files

factor search tokenization out into a function


This ensures that the tokenization algorithm will remain in sync between
querying, indexing, and deindexing. The existing code had slightly
different behavior for querying, because it did not discard words with
>50 bytes. This was inconsequential, because >50 byte tokens are never
present in the index.

Signed-off-by: default avatarstrawberry <strawberry@puppygock.gay>
parent 81cd677b
No related branches found
No related tags found
No related merge requests found
......@@ -10,20 +10,26 @@ pub trait Data: Send + Sync {
fn search_pdus<'a>(&'a self, room_id: &RoomId, search_string: &str) -> SearchPdusResult<'a>;
}
/// Splits a string into tokens used as keys in the search inverted index
///
/// This may be used to tokenize both message bodies (for indexing) or search
/// queries (for querying).
fn tokenize(body: &str) -> impl Iterator<Item = String> + '_ {
body.split_terminator(|c: char| !c.is_alphanumeric())
.filter(|s| !s.is_empty())
.filter(|word| word.len() <= 50)
.map(str::to_lowercase)
}
impl Data for KeyValueDatabase {
fn index_pdu(&self, shortroomid: u64, pdu_id: &[u8], message_body: &str) -> Result<()> {
let mut batch = message_body
.split_terminator(|c: char| !c.is_alphanumeric())
.filter(|s| !s.is_empty())
.filter(|word| word.len() <= 50)
.map(str::to_lowercase)
.map(|word| {
let mut key = shortroomid.to_be_bytes().to_vec();
key.extend_from_slice(word.as_bytes());
key.push(0xFF);
key.extend_from_slice(pdu_id); // TODO: currently we save the room id a second time here
(key, Vec::new())
});
let mut batch = tokenize(message_body).map(|word| {
let mut key = shortroomid.to_be_bytes().to_vec();
key.extend_from_slice(word.as_bytes());
key.push(0xFF);
key.extend_from_slice(pdu_id); // TODO: currently we save the room id a second time here
(key, Vec::new())
});
self.tokenids.insert_batch(&mut batch)
}
......@@ -37,11 +43,7 @@ fn search_pdus<'a>(&'a self, room_id: &RoomId, search_string: &str) -> SearchPdu
.to_be_bytes()
.to_vec();
let words: Vec<_> = search_string
.split_terminator(|c: char| !c.is_alphanumeric())
.filter(|s| !s.is_empty())
.map(str::to_lowercase)
.collect();
let words: Vec<_> = tokenize(search_string).collect();
let iterators = words.clone().into_iter().map(move |word| {
let mut prefix2 = prefix.clone();
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment