optimize scoring performance

This commit is contained in:
Armaël Guéneau 2024-11-23 11:10:44 +01:00
parent d9251ce395
commit 876488bb4b
5 changed files with 24 additions and 6 deletions

1
.gitignore vendored
View file

@@ -2,3 +2,4 @@
classification.json
db.json
api_token
profile.json

View file

@@ -19,3 +19,7 @@ tera = "1"
lazy_static = "1"
actix-files = "0.6"
unicode-segmentation = "1"
[profile.profiling]
inherits = "dev"
debug = true

File diff suppressed because one or more lines are too long

View file

@@ -20,6 +20,8 @@ struct Counter {
#[derive(Default, Debug, Serialize, Deserialize)]
pub struct Classifier {
token_table: HashMap<String, Counter>,
spam_total_count: u32,
ham_total_count: u32,
}
impl Classifier {
@@ -56,6 +58,7 @@ impl Classifier {
for word in tokens {
let counter = self.token_table.entry(word.to_string()).or_default();
counter.spam += 1;
self.spam_total_count += 1;
}
}
@@ -64,17 +67,18 @@ impl Classifier {
for word in tokens {
let counter = self.token_table.entry(word.to_string()).or_default();
counter.ham += 1;
self.ham_total_count += 1;
}
}
/// Return the total number of spam in token table.
fn spam_total_count(&self) -> u32 {
self.token_table.values().map(|x| x.spam).sum()
self.spam_total_count
}
/// Return the total number of ham in token table.
fn ham_total_count(&self) -> u32 {
self.token_table.values().map(|x| x.ham).sum()
self.ham_total_count
}
/// Compute the probability of `tokens` to be part of a spam.

View file

@@ -334,14 +334,22 @@ async fn load_db() -> anyhow::Result<(Db, Classifier)> {
}
fn update_user(db: &mut Db, classifier: &mut Classifier, id: UserId) {
let tokens = db.users.get(&id).unwrap().to_tokens();
let userdata = db.users.get(&id).unwrap();
let tokens =
match db.tokens.get(&id) {
Some(tokens) => tokens,
None => {
let tokens = userdata.to_tokens();
db.tokens.insert(id, tokens);
db.tokens.get(&id).unwrap()
}
};
let score = classifier.score(&tokens);
for tok in &tokens {
for tok in tokens {
db.users_of_token.entry(tok.to_string()).or_default().push(id)
};
db.tokens.insert(id, tokens);
db.score.insert(id, score);
}