always recompute the score of all users

(the performance difference is insignificant and the code is simpler)
This commit is contained in:
Armaël Guéneau 2024-11-23 11:35:25 +01:00
parent 876488bb4b
commit ff95f3807b

View file

@ -63,7 +63,6 @@ struct Db {
// caches: derived from the rest // caches: derived from the rest
score: HashMap<UserId, f32>, score: HashMap<UserId, f32>,
tokens: HashMap<UserId, Vec<String>>, tokens: HashMap<UserId, Vec<String>>,
users_of_token: HashMap<String, Vec<UserId>>,
} }
impl UserData { impl UserData {
@ -117,9 +116,12 @@ impl Db {
tokens: HashMap::new(), tokens: HashMap::new(),
classification: HashMap::new(), classification: HashMap::new(),
score: HashMap::new(), score: HashMap::new(),
users_of_token: HashMap::new(),
} }
} }
/// Returns the id of every entry currently in `self.users`, as an owned `Vec`.
///
/// The ids are copied out rather than borrowed, so the result holds no
/// reference to `self`; callers in this file iterate over it while handing
/// a mutable `Db` to `update_user`.
fn all_users(&self) -> Vec<UserId> {
self.users.iter().map(|(id, _)| *id).collect()
}
} }
async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Repository>> { async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Repository>> {
@ -320,8 +322,7 @@ async fn load_db() -> anyhow::Result<(Db, Classifier)> {
db.users = get_users_data(&forge).await?; db.users = get_users_data(&forge).await?;
eprintln!("Scoring users..."); eprintln!("Scoring users...");
let ids: Vec<_> = db.users.iter().map(|(id, _)| *id).collect(); for &user_id in &db.all_users() {
for &user_id in &ids {
update_user(&mut db, &mut classifier, user_id); update_user(&mut db, &mut classifier, user_id);
} }
@ -344,13 +345,7 @@ fn update_user(db: &mut Db, classifier: &mut Classifier, id: UserId) {
db.tokens.get(&id).unwrap() db.tokens.get(&id).unwrap()
} }
}; };
let score = classifier.score(&tokens); db.score.insert(id, classifier.score(&tokens));
for tok in tokens {
db.users_of_token.entry(tok.to_string()).or_default().push(id)
};
db.score.insert(id, score);
} }
fn unclassified_users<'a>(db: &'a Db) -> Vec<(&'a UserId, &'a UserData)> { fn unclassified_users<'a>(db: &'a Db) -> Vec<(&'a UserId, &'a UserData)> {
@ -363,7 +358,7 @@ fn unclassified_users<'a>(db: &'a Db) -> Vec<(&'a UserId, &'a UserData)> {
fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)]) { fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)]) {
let mut all_tokens = HashSet::new(); let mut all_tokens = HashSet::new();
eprintln!("training classifier"); eprintln!("updating classifier");
for (id, is_spam) in ids { for (id, is_spam) in ids {
let tokens = db.tokens.get(id).unwrap(); let tokens = db.tokens.get(id).unwrap();
@ -379,25 +374,10 @@ fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)]) {
} }
} }
eprintln!("computing users to update"); eprintln!("recomputing user scores");
let mut users_to_update = HashSet::new(); for &user_id in &db.all_users() {
update_user(db, classifier, user_id)
for token in all_tokens {
match db.users_of_token.get(&token) {
None => (),
Some(users) => {
for user in users {
users_to_update.insert(*user);
}
},
}
}
eprintln!("recomputing scores for {}/{} users", users_to_update.len(), db.users.len());
for user in users_to_update {
update_user(db, classifier, user)
} }
} }