Skip to content

Commit 9586c5b

Browse files
authored
Add code tokenizer (#3647)
* Add code tokenizer. * Rename and add bench. * Add 'source_code' tokenizer in tokenizer manager.
1 parent 65121c4 commit 9586c5b

File tree

5 files changed

+482
-3
lines changed

5 files changed

+482
-3
lines changed

quickwit/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

quickwit/quickwit-query/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,10 @@ tantivy = { workspace = true }
2222
thiserror = { workspace = true }
2323

2424
[dev-dependencies]
25+
criterion = { workspace = true }
2526
proptest = { workspace = true }
2627
time = { workspace = true }
28+
29+
[[bench]]
30+
name = "tokenizers_bench"
31+
harness = false
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
// Copyright (C) 2023 Quickwit, Inc.
2+
//
3+
// Quickwit is offered under the AGPL v3.0 and as commercial software.
4+
// For commercial licensing, contact us at hello@quickwit.io.
5+
//
6+
// AGPL:
7+
// This program is free software: you can redistribute it and/or modify
8+
// it under the terms of the GNU Affero General Public License as
9+
// published by the Free Software Foundation, either version 3 of the
10+
// License, or (at your option) any later version.
11+
//
12+
// This program is distributed in the hope that it will be useful,
13+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
// GNU Affero General Public License for more details.
16+
//
17+
// You should have received a copy of the GNU Affero General Public License
18+
// along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
20+
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
21+
use quickwit_query::CodeTokenizer;
22+
use tantivy::tokenizer::{RegexTokenizer, TextAnalyzer, Token, TokenStream};
23+
24+
// A sample of Python-like source code mixing camelCase, snake_case, and
// ALL-CAPS identifiers, used as input for the tokenizer throughput benches.
// NOTE(review): indentation inside the raw string may have been lost by
// extraction — confirm against the original file; it only affects byte count.
static CODE_TEXT: &str = r#"
# Camel case variables
firstName = "John"
lastName = "Doe"
ageOfPerson = 30
isEmployed = True

# Snake case variables
first_name = "Jane"
last_name = "Smith"
age_of_person = 25
is_employed = False

# Mixed case variables
fullName = firstName + " " + lastName
isPersonEmployed = isEmployed and is_employed

# Code logic
if isEmployed and is_employed:
print(f"{firstName} {first_name} is currently employed.")
else:
print(f"{lastName} {last_name} is not employed at the moment.")

totalAge = ageOfPerson + age_of_person
print(f"The combined age is: {totalAge}")

# Longer word examples
longCamelCaseWord = "LongCamelCase"
longSnakeCaseWord = "long_snake_case"
mixedCaseWord = "ThisIsAMixedCaseWord"
longCamelCaseWord = "LongCamelCase"
longSnakeCaseWord = "long_snake_case"
mixedCaseWord = "ThisIsAMixedCaseWord"

# Words with consecutive uppercase letters
WORDWITHConsecutiveUppercase1 = "1"
WORDWITHCONSECUTIVEUppercase2 = "2"
WORDWITHCONSECUTIVEUPPERCASE2 = "3"
"#;
64+
65+
fn process_tokens(analyzer: &mut TextAnalyzer, text: &str) -> Vec<Token> {
66+
let mut token_stream = analyzer.token_stream(text);
67+
let mut tokens: Vec<Token> = vec![];
68+
token_stream.process(&mut |token: &Token| tokens.push(token.clone()));
69+
tokens
70+
}
71+
72+
pub fn tokenizers_throughput_benchmark(c: &mut Criterion) {
73+
let mut group = c.benchmark_group("code_tokenizer");
74+
let mut regex_tokenizer = TextAnalyzer::from(
75+
RegexTokenizer::new("(\\p{Ll}+|\\p{Lu}\\p{Ll}+|\\p{Lu}+|\\d+)").unwrap(),
76+
);
77+
let mut code_tokenizer = TextAnalyzer::from(CodeTokenizer::default());
78+
79+
group
80+
.throughput(Throughput::Bytes(CODE_TEXT.len() as u64))
81+
.bench_with_input("regex-tokenize", CODE_TEXT, |b, text| {
82+
b.iter(|| process_tokens(&mut regex_tokenizer, black_box(text)));
83+
});
84+
group
85+
.throughput(Throughput::Bytes(CODE_TEXT.len() as u64))
86+
.bench_with_input("code-tokenize", CODE_TEXT, |b, text| {
87+
b.iter(|| process_tokens(&mut code_tokenizer, black_box(text)));
88+
});
89+
}
90+
91+
criterion_group!(
92+
tokenizers_throughput_benches,
93+
tokenizers_throughput_benchmark
94+
);
95+
criterion_main!(tokenizers_throughput_benches);

quickwit/quickwit-query/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ use serde::{Deserialize, Serialize};
4444
pub use tantivy::query::Query as TantivyQuery;
4545
pub use tokenizers::{
4646
create_default_quickwit_tokenizer_manager, get_quickwit_fastfield_normalizer_manager,
47-
DEFAULT_REMOVE_TOKEN_LENGTH,
47+
CodeTokenizer, DEFAULT_REMOVE_TOKEN_LENGTH,
4848
};
4949

5050
#[derive(Serialize, Deserialize, Debug, Default, Copy, Clone, Eq, PartialEq)]

0 commit comments

Comments
 (0)