From d4af61fb35accafe6e09c060d31b357de7f3973a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arma=C3=ABl=20Gu=C3=A9neau?= <armael.gueneau@ens-lyon.org>
Date: Thu, 19 Dec 2024 15:21:33 +0100
Subject: [PATCH] WIP: lock spam accounts then delete after a grace period

---
 src/db.rs      |   6 ++-
 src/main.rs    | 140 +++++++++++++++++++++++++++++++++++++++++--------
 src/scrape.rs  |   2 +-
 src/workers.rs |  52 +++++++++++++++++-
 4 files changed, 174 insertions(+), 26 deletions(-)
diff --git a/src/db.rs b/src/db.rs
index 04690a4..bc26e02 100644
--- a/src/db.rs
+++ b/src/db.rs
@@ -110,14 +110,15 @@ impl Db {
         Ok(())
     }
 
-    pub fn unclassified_users<'a>(&'a self) -> Vec<(&'a UserId, &'a UserData)> {
+    pub fn unclassified_users<'a>(&'a self) -> Vec<(UserId, &'a UserData)> {
         self.users
             .iter()
             .filter(|(user_id, _)| !self.is_spam.contains_key(&user_id))
+            .map(|(&user_id, user_data)| (user_id, user_data)) // ids are handed out by value
             .collect()
     }
 
-    pub fn classified_users<'a>(&'a self) -> Vec<(&'a UserId, &'a UserData, IsSpam)> {
+    pub fn classified_users<'a>(&'a self) -> Vec<(UserId, &'a UserData, IsSpam)> {
         self.users
             .iter()
             .filter_map(|(user_id, user_data)| {
@@ -125,6 +126,7 @@ impl Db {
                     .get(&user_id)
                     .map(|is_spam| (user_id, user_data, *is_spam))
             })
+            .map(|(id, d, s)| (*id, d, s))
             .collect()
     }
 }
diff --git a/src/main.rs b/src/main.rs
index 849941a..c613a1a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -23,6 +23,15 @@ use db::{Db, IsSpam};
 // Fetch user data from forgejo from time to time
 const FORGEJO_POLL_DELAY: Duration = Duration::from_secs(11 * 3600); // 11 hours
 
+// Duration of the grace period.
+//
+// The grace period starts after a user is marked as spam and we block their
+// account. This gives time for the user to contact us and ask that they be
+// unblocked.
+// If the grace period expires and the user is still marked as spam, their
+// account is deleted.
+const GRACE_PERIOD: Duration = Duration::from_secs(30 * 24 * 3600); // 30 days
+
 // Heuristic score thresholds used for:
 // - the display color when displaying unclassified users (green/orange/red)
 // - chosing when to remove an existing classification after a user's data changes
@@ -66,10 +75,28 @@ async fn load_db(forge: &Forgejo) -> anyhow::Result<(Db, Classifier)> {
     Ok((db, classifier))
 }
 
-// XXX: This function looks like it is doing too many things at once.
-fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)], overwrite: bool) {
+// Register a list of decisions taken by the admin using the webpage, checking
+// for classification conflicts.
+//
+// Only updates the database and the classifier.
+// Returns the list of newly found spammers whose account must be blocked.
+//
+// The [overwrite] parameter is true in "edit" mode (when updating
+// existing classifications), and false when classifying new users.
+//
+// NB: some of the input decisions may be no-ops: when using the page to edit
+// existing classifications, the webform sends the list of all existing and
+// changed classifications.
+fn set_spam(
+    db: &mut Db,
+    classifier: &mut Classifier,
+    ids: &[(UserId, bool)],
+    overwrite: bool,
+) -> Vec<UserId> {
+    let mut spammers = Vec::new();
+
     for &(user_id, is_spam) in ids {
-        let mut train_classifier = false;
+        let mut update_classification = false;
 
         match db.is_spam.get(&user_id) {
             Some(&was_spam) if overwrite && was_spam.as_bool() != is_spam => {
@@ -79,16 +106,16 @@ fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)], ov
                     was_spam,
                     is_spam
                 );
-                db.is_spam.insert(user_id, IsSpam::from_bool(is_spam));
-                // This is somewhat hackish: we already trained the classifier
-                // on the previous classification, possibly with the same
-                // tokens.
+                // Training the classifier again is somewhat hackish in this
+                // case: we already trained the classifier on the previous
+                // classification, possibly with the same tokens.
+                //
                 // Ideally we would undo the previous training and train with
                 // the correct classification now, but the classifier has no way
                 // to easily undo a previous training (we don't know whether the
                 // tokens that we have now are the same as the one that were
                 // used previously).
-                train_classifier = true;
+                update_classification = true;
             }
             Some(&was_spam) if !overwrite && was_spam.as_bool() != is_spam => {
                 // Classification conflict between concurrent queries.
@@ -101,16 +128,22 @@ fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)], ov
                 db.is_spam.remove(&user_id);
             }
             None => {
-                db.is_spam.insert(user_id, IsSpam::from_bool(is_spam));
-                train_classifier = true;
+                update_classification = true;
             }
             Some(was_spam) => {
                 assert!(was_spam.as_bool() == is_spam);
-                // nothing to do
+                // nothing to do.
+                // In particular, keep the spam classification time as is.
             }
         }
 
-        if train_classifier {
+        if update_classification {
+            db.is_spam.insert(user_id, IsSpam::from_bool(is_spam));
+            // if we just classified the user as spam, add it to the list
+            if is_spam {
+                spammers.push(user_id)
+            }
+
             // Train the classifier with tokens from the user
             let tokens = db.tokens.get(&user_id).unwrap();
             if is_spam {
@@ -123,6 +156,56 @@ fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)], ov
 
     eprintln!("recomputing user scores");
     db.recompute_scores(&classifier);
+
+    spammers
+}
+
+// Lock the account `username` and set its visibility to private: the user's
+// description and info will not be publicly visible.
+async fn lock_user_account(forge: &Forgejo, username: &str) -> anyhow::Result<()> {
+    let lock_only = forgejo_api::structs::EditUserOption {
+        prohibit_login: Some(true),
+        visibility: Some("private".to_string()),
+        // boilerplate: we do not change any of the other settings
+        active: None,
+        admin: None,
+        allow_create_organization: None,
+        allow_git_hook: None,
+        allow_import_local: None,
+        description: None,
+        email: None,
+        full_name: None,
+        location: None,
+        login_name: None,
+        max_repo_creation: None,
+        must_change_password: None,
+        password: None,
+        pronouns: None,
+        restricted: None,
+        source_id: None,
+        website: None,
+    };
+    forge.admin_edit_user(username, lock_only).await?;
+    Ok(())
+}
+
+// Registers the decisions, then locks the account of every newly found spammer.
+async fn apply_classification(
+    forge: &Forgejo,
+    db: &mut Db,
+    classifier: &mut Classifier,
+    ids: &[(UserId, bool)],
+    overwrite: bool,
+) -> anyhow::Result<()> {
+    let mut result: anyhow::Result<()> = Ok(());
+    for user in set_spam(db, classifier, ids, overwrite) {
+        // TODO: send an email (and decide what to do if sending it fails),
+        // batching the emails over one smtp connection; retry failed locks
+        let login = &db.users.get(&user).unwrap().login;
+        // keep locking the other accounts after a failure; report the first error
+        result = result.and(lock_user_account(forge, login).await);
+    }
+    result
 }
 
 lazy_static! {
@@ -140,6 +223,7 @@ lazy_static! {
 struct AppState {
     db: Arc<Mutex<Db>>,
     classifier: Arc<Mutex<Classifier>>,
+    forge: Arc<Forgejo>, // Forgejo API client, used to lock the accounts of new spammers
 }
 
 #[derive(Debug, Deserialize)]
@@ -175,10 +259,10 @@ async fn index(
 
     let db = &data.db.lock().unwrap();
 
-    let mut users: Vec<(&UserId, &UserData, f32)> = db
+    let mut users: Vec<(UserId, &UserData, f32)> = db
         .unclassified_users()
         .into_iter()
-        .map(|(id, u)| (id, u, *db.score.get(id).unwrap()))
+        .map(|(id, u)| (id, u, *db.score.get(&id).unwrap()))
         .collect();
     let mut rng = rand::thread_rng();
 
@@ -200,7 +284,7 @@ async fn index(
     }
 
     // compute the rough "spam score" (low/mid/high) and spam guess (true/false)
-    let users: Vec<(&UserId, &UserData, f32, ApproxScore, bool)> = users
+    let users: Vec<(UserId, &UserData, f32, ApproxScore, bool)> = users
         .into_iter()
         .map(|(id, u, score)| {
             (
@@ -239,13 +323,16 @@ async fn post_classified(
 
     let db = &mut data.db.lock().unwrap();
     let classifier = &mut data.classifier.lock().unwrap();
+    let forge = &data.forge;
 
     let updates: Vec<(UserId, bool)> = form
         .iter()
         .map(|(id, classification)| (UserId(*id), classification == "spam"))
         .collect();
 
-    set_spam(db, classifier, &updates, overwrite);
+    apply_classification(forge, db, classifier, &updates, overwrite)
+        .await
+        .unwrap(); // FIXME
 
     db.store_to_path(Path::new("db.json")).unwrap(); // FIXME
     classifier
@@ -286,10 +373,10 @@ async fn classified(
 
     let db = &data.db.lock().unwrap();
 
-    let mut users: Vec<(&UserId, &UserData, f32, bool)> = db
+    let mut users: Vec<(UserId, &UserData, f32, bool)> = db
         .classified_users()
         .into_iter()
-        .map(|(id, u, s)| (id, u, *db.score.get(id).unwrap(), s.as_bool()))
+        .map(|(id, u, s)| (id, u, *db.score.get(&id).unwrap(), s.as_bool()))
         .collect();
     // sort "spam first"
     users.sort_by_key(|(_, _, score, _)| 1000 - (score * 1000.) as u64);
@@ -313,7 +400,7 @@ async fn main() -> std::io::Result<()> {
     let _ = *TEMPLATES;
 
     eprintln!("Load users and repos");
-    let forge = Arc::new(forge().unwrap()); // FIXME
+    let forge = Arc::new(forge().unwrap() /* FIXME */);
     let (db, classifier) = load_db(&forge).await.unwrap(); // FIXME
     let db = Arc::new(Mutex::new(db));
     let classifier = Arc::new(Mutex::new(classifier));
@@ -321,11 +408,20 @@ async fn main() -> std::io::Result<()> {
     let st = web::Data::new(AppState {
         db: db.clone(),
         classifier: classifier.clone(),
+        forge: forge.clone(),
     });
 
-    let _ = tokio::spawn(async move {
-        workers::refresh_user_data(forge.clone(), db.clone(), classifier.clone())
-    });
+    // Start the background workers.
+    //
+    // The futures returned by the worker functions are handed directly to
+    // tokio::spawn so that they are polled. An async block that only calls a
+    // worker function without awaiting it evaluates to the unpolled future.
+    let _ = tokio::spawn(workers::refresh_user_data(
+        forge.clone(),
+        db.clone(),
+        classifier.clone(),
+    ));
+    let _ = tokio::spawn(workers::purge_spammer_accounts(forge.clone(), db.clone()));
 
     println!("Listening on http://127.0.0.1:8080");
 
diff --git a/src/scrape.rs b/src/scrape.rs
index 64ba532..a8c07d0 100644
--- a/src/scrape.rs
+++ b/src/scrape.rs
@@ -63,7 +63,7 @@ async fn scrape_users(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::struct
             _ => todo!("scrape_users: implement retries"),
         }
         page += 1;
-        sleep(Duration::from_millis(100)).await;
+        sleep(Duration::from_millis(20)).await;
     }
     Ok(users)
 }
diff --git a/src/workers.rs b/src/workers.rs
index 4fb5c60..b4ae437 100644
--- a/src/workers.rs
+++ b/src/workers.rs
@@ -1,5 +1,5 @@
 use crate::classifier::Classifier;
-use crate::db::Db;
+use crate::db::{Db, IsSpam};
 use crate::scrape;
 use forgejo_api::Forgejo;
 use std::collections::HashMap;
@@ -7,8 +7,11 @@ use std::path::Path;
 use std::sync::{Arc, Mutex};
 
 use crate::FORGEJO_POLL_DELAY;
+use crate::GRACE_PERIOD;
 use crate::{GUESS_LEGIT_THRESHOLD, GUESS_SPAM_THRESHOLD};
 
+// Worker to refresh user data by periodically polling Forgejo
+
 async fn try_refresh_user_data(
     forge: &Forgejo,
     db: Arc<Mutex<Db>>,
@@ -73,3 +76,50 @@ pub async fn refresh_user_data(
         }
     }
 }
+
+// Worker to delete spam accounts after their grace period expired
+
+// Deletes the account `login` through the forge's admin API.
+//
+// The deletion is requested with the `purge` option set to `Some(true)`.
+// An error returned by the API call is propagated to the caller.
+async fn try_purge_account(forge: &Forgejo, login: &str) -> anyhow::Result<()> {
+    let query = forgejo_api::structs::AdminDeleteUserQuery { purge: Some(true) };
+    forge.admin_delete_user(login, query).await?;
+    Ok(())
+}
+
+// Worker: every hour, delete the spam accounts whose grace period has expired and
+// remove them from the database. Errors are logged; nothing here panics on I/O.
+pub async fn purge_spammer_accounts(forge: Arc<Forgejo>, db: Arc<Mutex<Db>>) {
+    loop {
+        tokio::time::sleep(std::time::Duration::from_secs(3600)).await;
+
+        // Copy the candidates out of the db: no lock is held across an await.
+        let mut candidates = Vec::new();
+        for (id, user, is_spam) in db.lock().unwrap().classified_users() {
+            if let IsSpam::Spam { classified_at } = is_spam {
+                candidates.push((id, user.login.clone(), classified_at));
+            }
+        }
+
+        for (user_id, login, classified_at) in candidates {
+            if !matches!(classified_at.elapsed(), Ok(d) if d > GRACE_PERIOD) {
+                continue;
+            }
+            if let Err(e) = try_purge_account(&forge, &login).await {
+                eprintln!("Error while deleting spammer account {login}: {:?}", e);
+                continue;
+            }
+            eprintln!("Deleted spammer account {login}");
+            let db = &mut db.lock().unwrap();
+            db.users.remove(&user_id);
+            db.is_spam.remove(&user_id);
+            db.score.remove(&user_id);
+            db.tokens.remove(&user_id);
+            if let Err(e) = db.store_to_path(Path::new("db.json")) {
+                eprintln!("Error while saving the database: {:?}", e);
+            }
+        }
+    }
+}