From d4af61fb35accafe6e09c060d31b357de7f3973a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arma=C3=ABl=20Gu=C3=A9neau?= Date: Thu, 19 Dec 2024 15:21:33 +0100 Subject: [PATCH] WIP: lock spam accounts then delete after a grace period --- src/db.rs | 6 ++- src/main.rs | 140 +++++++++++++++++++++++++++++++++++++++++-------- src/scrape.rs | 2 +- src/workers.rs | 52 +++++++++++++++++- 4 files changed, 174 insertions(+), 26 deletions(-) diff --git a/src/db.rs b/src/db.rs index 04690a4..bc26e02 100644 --- a/src/db.rs +++ b/src/db.rs @@ -110,14 +110,15 @@ impl Db { Ok(()) } - pub fn unclassified_users<'a>(&'a self) -> Vec<(&'a UserId, &'a UserData)> { + pub fn unclassified_users<'a>(&'a self) -> Vec<(UserId, &'a UserData)> { self.users .iter() .filter(|(user_id, _)| !self.is_spam.contains_key(&user_id)) + .map(|(id, d)| (*id, d)) .collect() } - pub fn classified_users<'a>(&'a self) -> Vec<(&'a UserId, &'a UserData, IsSpam)> { + pub fn classified_users<'a>(&'a self) -> Vec<(UserId, &'a UserData, IsSpam)> { self.users .iter() .filter_map(|(user_id, user_data)| { @@ -125,6 +126,7 @@ impl Db { .get(&user_id) .map(|is_spam| (user_id, user_data, *is_spam)) }) + .map(|(id, d, s)| (*id, d, s)) .collect() } } diff --git a/src/main.rs b/src/main.rs index 849941a..c613a1a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -23,6 +23,15 @@ use db::{Db, IsSpam}; // Fetch user data from forgejo from time to time const FORGEJO_POLL_DELAY: Duration = Duration::from_secs(11 * 3600); // 11 hours +// Duration of the grace period. + +// The grace period starts after a user is marked as spam and we block their +// account. This gives time for the user to contact us and ask that they be +// unblocked. +// If the grace period expires and the user is still marked as spam, their +// account is deleted. 
+const GRACE_PERIOD: Duration = Duration::from_secs(30 * 24 * 3600); // 30 days + // Heuristic score thresholds used for: // - the display color when displaying unclassified users (green/orange/red) // - chosing when to remove an existing classification after a user's data changes @@ -66,10 +75,28 @@ async fn load_db(forge: &Forgejo) -> anyhow::Result<(Db, Classifier)> { Ok((db, classifier)) } -// XXX: This function looks like it is doing too many things at once. -fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)], overwrite: bool) { +// Register a list of decisions taken by the admin using the webpage, checking +// for classification conflicts. +// +// Only updates the database and the classifier. +// Returns the list of newly found spammers whose account must be blocked. +// +// The [overwrite] parameter is true for "edit" mode (when updating +// existing classifications), and false when classifying new users. +// +// NB: some of the input decisions may be no-ops: when using the page to edit +// existing classifications, the webform sends the list of all existing and +// changed classifications. +fn set_spam( + db: &mut Db, + classifier: &mut Classifier, + ids: &[(UserId, bool)], + overwrite: bool, +) -> Vec<UserId> { + let mut spammers = Vec::new(); + for &(user_id, is_spam) in ids { - let mut train_classifier = false; + let mut update_classification = false; match db.is_spam.get(&user_id) { Some(&was_spam) if overwrite && was_spam.as_bool() != is_spam => { @@ -79,16 +106,16 @@ fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)], ov was_spam, is_spam ); - db.is_spam.insert(user_id, IsSpam::from_bool(is_spam)); - // This is somewhat hackish: we already trained the classifier - // on the previous classification, possibly with the same - // tokens.
+ // Training the classifier again is somewhat hackish in this + // case: we already trained the classifier on the previous + // classification, possibly with the same tokens. + // // Ideally we would undo the previous training and train with // the correct classification now, but the classifier has no way // to easily undo a previous training (we don't know whether the // tokens that we have now are the same as the one that were // used previously). - train_classifier = true; + update_classification = true; } Some(&was_spam) if !overwrite && was_spam.as_bool() != is_spam => { // Classification conflict between concurrent queries. @@ -101,16 +128,22 @@ fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)], ov db.is_spam.remove(&user_id); } None => { - db.is_spam.insert(user_id, IsSpam::from_bool(is_spam)); - train_classifier = true; + update_classification = true; } Some(was_spam) => { assert!(was_spam.as_bool() == is_spam); - // nothing to do + // nothing to do. + // In particular, keep the spam classification time as is. 
} } - if train_classifier { + if update_classification { + db.is_spam.insert(user_id, IsSpam::from_bool(is_spam)); + // if we just classified the user as spam, add it to the list + if is_spam { + spammers.push(user_id) + } + // Train the classifier with tokens from the user let tokens = db.tokens.get(&user_id).unwrap(); if is_spam { @@ -123,6 +156,56 @@ fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)], ov eprintln!("recomputing user scores"); db.recompute_scores(&classifier); + + spammers +} + +async fn lock_user_account(forge: &Forgejo, username: &str) -> anyhow::Result<()> { + let opts = forgejo_api::structs::EditUserOption { + // boilerplate: we do not change these settings + active: None, + admin: None, + allow_create_organization: None, + allow_git_hook: None, + allow_import_local: None, + description: None, + email: None, + full_name: None, + location: None, + login_name: None, + max_repo_creation: None, + must_change_password: None, + password: None, + pronouns: None, + restricted: None, + source_id: None, + website: None, + // lock the account and set its visibility to private: the user's + // description and info will not be publicly visible + prohibit_login: Some(true), + visibility: Some("private".to_string()), + }; + forge.admin_edit_user(username, opts).await?; + Ok(()) +} + +async fn apply_classification( + forge: &Forgejo, + db: &mut Db, + classifier: &mut Classifier, + ids: &[(UserId, bool)], + overwrite: bool, +) -> anyhow::Result<()> { + let spammers = set_spam(db, classifier, ids, overwrite); + + for user in spammers { + // TODO: send email (what do we do if sending the email didn't work?) + // TODO: batch the email sending? (only open one smtp connection) + lock_user_account(forge, &db.users.get(&user).unwrap().login).await?; + // TODO: better error handling: retries, ..? + } + + Ok(()) } lazy_static! { @@ -140,6 +223,7 @@ lazy_static! 
{ struct AppState { db: Arc<Mutex<Db>>, classifier: Arc<Mutex<Classifier>>, + forge: Arc<Forgejo>, } #[derive(Debug, Deserialize)] @@ -175,10 +259,10 @@ async fn index( let db = &data.db.lock().unwrap(); - let mut users: Vec<(&UserId, &UserData, f32)> = db + let mut users: Vec<(UserId, &UserData, f32)> = db .unclassified_users() .into_iter() - .map(|(id, u)| (id, u, *db.score.get(id).unwrap())) + .map(|(id, u)| (id, u, *db.score.get(&id).unwrap())) .collect(); let mut rng = rand::thread_rng(); @@ -200,7 +284,7 @@ async fn index( } // compute the rough "spam score" (low/mid/high) and spam guess (true/false) - let users: Vec<(&UserId, &UserData, f32, ApproxScore, bool)> = users + let users: Vec<(UserId, &UserData, f32, ApproxScore, bool)> = users .into_iter() .map(|(id, u, score)| { ( @@ -239,13 +323,16 @@ async fn post_classified( let db = &mut data.db.lock().unwrap(); let classifier = &mut data.classifier.lock().unwrap(); + let forge = &data.forge; let updates: Vec<(UserId, bool)> = form .iter() .map(|(id, classification)| (UserId(*id), classification == "spam")) .collect(); - set_spam(db, classifier, &updates, overwrite); + apply_classification(forge, db, classifier, &updates, overwrite) + .await + .unwrap(); // FIXME db.store_to_path(Path::new("db.json")).unwrap(); // FIXME classifier @@ -286,10 +373,10 @@ async fn classified( let db = &data.db.lock().unwrap(); - let mut users: Vec<(&UserId, &UserData, f32, bool)> = db + let mut users: Vec<(UserId, &UserData, f32, bool)> = db .classified_users() .into_iter() - .map(|(id, u, s)| (id, u, *db.score.get(id).unwrap(), s.as_bool())) + .map(|(id, u, s)| (id, u, *db.score.get(&id).unwrap(), s.as_bool())) .collect(); // sort "spam first" users.sort_by_key(|(_, _, score, _)| 1000 - (score * 1000.)
as u64); @@ -313,7 +400,7 @@ async fn main() -> std::io::Result<()> { let _ = *TEMPLATES; eprintln!("Load users and repos"); - let forge = Arc::new(forge().unwrap()); // FIXME + let forge = Arc::new(forge().unwrap() /* FIXME */); let (db, classifier) = load_db(&forge).await.unwrap(); // FIXME let db = Arc::new(Mutex::new(db)); let classifier = Arc::new(Mutex::new(classifier)); @@ -321,11 +408,22 @@ async fn main() -> std::io::Result<()> { let st = web::Data::new(AppState { db: db.clone(), classifier: classifier.clone(), + forge: forge.clone(), }); - let _ = tokio::spawn(async move { - workers::refresh_user_data(forge.clone(), db.clone(), classifier.clone()) - }); + // Background workers. The `.await` inside each task is required: calling an + // async fn only builds a future, which does nothing until it is polled. + let _ = { + let forge = forge.clone(); + let db = db.clone(); + let classifier = classifier.clone(); + tokio::spawn(async move { workers::refresh_user_data(forge, db, classifier).await }) + }; + let _ = { + let forge = forge.clone(); + let db = db.clone(); + tokio::spawn(async move { workers::purge_spammer_accounts(forge, db).await }) + }; println!("Listening on http://127.0.0.1:8080"); diff --git a/src/scrape.rs b/src/scrape.rs index 64ba532..a8c07d0 100644 --- a/src/scrape.rs +++ b/src/scrape.rs @@ -63,7 +63,7 @@ async fn scrape_users(forge: &Forgejo) -> anyhow::Result todo!("scrape_users: implement retries"), } page += 1; - sleep(Duration::from_millis(100)).await; + sleep(Duration::from_millis(20)).await; } Ok(users) } diff --git a/src/workers.rs b/src/workers.rs index 4fb5c60..b4ae437 100644 --- a/src/workers.rs +++ b/src/workers.rs @@ -1,5 +1,5 @@ use crate::classifier::Classifier; -use crate::db::Db; +use crate::db::{Db, IsSpam}; use crate::scrape; use forgejo_api::Forgejo; use std::collections::HashMap; @@ -7,8 +7,11 @@ use std::path::Path; use std::sync::{Arc, Mutex}; use crate::FORGEJO_POLL_DELAY; +use crate::GRACE_PERIOD; use crate::{GUESS_LEGIT_THRESHOLD, GUESS_SPAM_THRESHOLD}; +// Worker to refresh user data by periodically polling Forgejo + async fn try_refresh_user_data( forge: &Forgejo, db: Arc<Mutex<Db>>, classifier: Arc<Mutex<Classifier>>, @@
-73,3 +76,66 @@ pub async fn refresh_user_data( } } } + +// Worker to delete spam accounts after their grace period expired + +// Delete the account `login` through the Forgejo admin API, with `purge` enabled. +async fn try_purge_account(forge: &Forgejo, login: &str) -> anyhow::Result<()> { + forge + .admin_delete_user( + login, + forgejo_api::structs::AdminDeleteUserQuery { purge: Some(true) }, + ) + .await?; + Ok(()) +} + +// Every hour, delete the accounts that have been classified as spam for +// longer than GRACE_PERIOD, and remove them from the database. +pub async fn purge_spammer_accounts(forge: Arc<Forgejo>, db: Arc<Mutex<Db>>) { + loop { + tokio::time::sleep(std::time::Duration::from_secs(3600)).await; + + let mut classified_users = Vec::new(); + { + let db = &db.lock().unwrap(); + for (id, user, is_spam) in db.classified_users() { + classified_users.push((id, user.login.clone(), is_spam)); + } + } + + for (user_id, login, is_spam) in classified_users { + if let IsSpam::Spam { classified_at } = is_spam { + match classified_at.elapsed() { + Ok(duration) if duration > GRACE_PERIOD => { + // Deletion cannot be undone and the snapshot above may be stale: + // check that the user is still classified as spam. + let still_spam = matches!( + db.lock().unwrap().is_spam.get(&user_id), + Some(IsSpam::Spam { .. }) + ); + if !still_spam { + continue; + } + if let Err(e) = try_purge_account(&forge, &login).await { + eprintln!("Error while deleting spammer account {login}: {:?}", e) + } else { + eprintln!("Deleted spammer account {login}"); + let db = &mut db.lock().unwrap(); + db.users.remove(&user_id); + db.is_spam.remove(&user_id); + db.score.remove(&user_id); + db.tokens.remove(&user_id); + // Do not panic on failure: that would poison the mutex and + // make every later `lock().unwrap()` panic. + if let Err(e) = db.store_to_path(Path::new("db.json")) { + eprintln!("Error while saving db.json: {:?}", e) + } + } + } + _ => (), + } + } + } + } +}