always recompute the score of all users

(the perf δ is insignificant and the code is simpler)
This commit is contained in:
Armaël Guéneau 2024-11-23 11:35:25 +01:00
parent 876488bb4b
commit ff95f3807b

View file

@ -63,7 +63,6 @@ struct Db {
// caches: derived from the rest
score: HashMap<UserId, f32>,
tokens: HashMap<UserId, Vec<String>>,
users_of_token: HashMap<String, Vec<UserId>>,
}
impl UserData {
@ -117,9 +116,12 @@ impl Db {
tokens: HashMap::new(),
classification: HashMap::new(),
score: HashMap::new(),
users_of_token: HashMap::new(),
}
}
/// Returns the id of every entry in `self.users`, in the iteration order of
/// that collection.
///
/// The ids are copied into a freshly allocated vector, so the result does not
/// borrow from `self`; callers can loop over it while passing the database
/// mutably to other functions.
fn all_users(&self) -> Vec<UserId> {
    let mut ids = Vec::with_capacity(self.users.len());
    for (id, _) in self.users.iter() {
        ids.push(*id);
    }
    ids
}
}
async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Repository>> {
@ -320,8 +322,7 @@ async fn load_db() -> anyhow::Result<(Db, Classifier)> {
db.users = get_users_data(&forge).await?;
eprintln!("Scoring users...");
let ids: Vec<_> = db.users.iter().map(|(id, _)| *id).collect();
for &user_id in &ids {
for &user_id in &db.all_users() {
update_user(&mut db, &mut classifier, user_id);
}
@ -344,13 +345,7 @@ fn update_user(db: &mut Db, classifier: &mut Classifier, id: UserId) {
db.tokens.get(&id).unwrap()
}
};
let score = classifier.score(&tokens);
for tok in tokens {
db.users_of_token.entry(tok.to_string()).or_default().push(id)
};
db.score.insert(id, score);
db.score.insert(id, classifier.score(&tokens));
}
fn unclassified_users<'a>(db: &'a Db) -> Vec<(&'a UserId, &'a UserData)> {
@ -363,7 +358,7 @@ fn unclassified_users<'a>(db: &'a Db) -> Vec<(&'a UserId, &'a UserData)> {
fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)]) {
let mut all_tokens = HashSet::new();
eprintln!("training classifier");
eprintln!("updating classifier");
for (id, is_spam) in ids {
let tokens = db.tokens.get(id).unwrap();
@ -379,25 +374,10 @@ fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)]) {
}
}
eprintln!("computing users to update");
eprintln!("recomputing user scores");
let mut users_to_update = HashSet::new();
for token in all_tokens {
match db.users_of_token.get(&token) {
None => (),
Some(users) => {
for user in users {
users_to_update.insert(*user);
}
},
}
}
eprintln!("recomputing scores for {}/{} users", users_to_update.len(), db.users.len());
for user in users_to_update {
update_user(db, classifier, user)
for &user_id in &db.all_users() {
update_user(db, classifier, user_id)
}
}