WIP: lock spam accounts then delete after a grace period
This commit is contained in:
parent
45ff1f3ea5
commit
d4af61fb35
4 changed files with 174 additions and 26 deletions
|
@ -110,14 +110,15 @@ impl Db {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Lists every user that has no entry in `is_spam`, i.e. that has not been
/// classified yet.
///
/// Each element pairs a copy of the user's id with a reference to the
/// user's data.
pub fn unclassified_users<'a>(&'a self) -> Vec<(UserId, &'a UserData)> {
    let mut pending = Vec::new();
    for (id, data) in self.users.iter() {
        // Users with a recorded classification are left out.
        if !self.is_spam.contains_key(id) {
            pending.push((*id, data));
        }
    }
    pending
}
|
||||||
|
|
||||||
pub fn classified_users<'a>(&'a self) -> Vec<(&'a UserId, &'a UserData, IsSpam)> {
|
pub fn classified_users<'a>(&'a self) -> Vec<(UserId, &'a UserData, IsSpam)> {
|
||||||
self.users
|
self.users
|
||||||
.iter()
|
.iter()
|
||||||
.filter_map(|(user_id, user_data)| {
|
.filter_map(|(user_id, user_data)| {
|
||||||
|
@ -125,6 +126,7 @@ impl Db {
|
||||||
.get(&user_id)
|
.get(&user_id)
|
||||||
.map(|is_spam| (user_id, user_data, *is_spam))
|
.map(|is_spam| (user_id, user_data, *is_spam))
|
||||||
})
|
})
|
||||||
|
.map(|(id, d, s)| (*id, d, s))
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
140
src/main.rs
140
src/main.rs
|
@ -23,6 +23,15 @@ use db::{Db, IsSpam};
|
||||||
// Fetch user data from forgejo from time to time
|
// Fetch user data from forgejo from time to time
|
||||||
const FORGEJO_POLL_DELAY: Duration = Duration::from_secs(11 * 3600); // 11 hours
|
const FORGEJO_POLL_DELAY: Duration = Duration::from_secs(11 * 3600); // 11 hours
|
||||||
|
|
||||||
|
// Duration of the grace period.
|
||||||
|
|
||||||
|
// The grace period starts after a user is marked as spam and we block their
|
||||||
|
// account. This gives time for the user to contact us and ask that they be
|
||||||
|
// unblocked.
|
||||||
|
// If the grace period expires and the user is still marked as spam, their
|
||||||
|
// account is deleted.
|
||||||
|
const GRACE_PERIOD: Duration = Duration::from_secs(30 * 24 * 3600); // 30 days
|
||||||
|
|
||||||
// Heuristic score thresholds used for:
|
// Heuristic score thresholds used for:
|
||||||
// - the display color when displaying unclassified users (green/orange/red)
|
// - the display color when displaying unclassified users (green/orange/red)
|
||||||
// - choosing when to remove an existing classification after a user's data changes
|
// - choosing when to remove an existing classification after a user's data changes
|
||||||
|
@ -66,10 +75,28 @@ async fn load_db(forge: &Forgejo) -> anyhow::Result<(Db, Classifier)> {
|
||||||
Ok((db, classifier))
|
Ok((db, classifier))
|
||||||
}
|
}
|
||||||
|
|
||||||
// XXX: This function looks like it is doing too many things at once.
|
// Register a list of decisions taken by the admin using the webpage, checking
|
||||||
fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)], overwrite: bool) {
|
// for classification conflicts.
|
||||||
|
//
|
||||||
|
// Only updates the database and the classifier.
|
||||||
|
// Returns the list of newly found spammers whose accounts must be blocked.
|
||||||
|
//
|
||||||
|
// The [overwrite] parameter is true for "edit" mode (when updating
|
||||||
|
// existing classifications), and false when classifying new users.
|
||||||
|
//
|
||||||
|
// NB: some of the input decisions may be no-ops: when using the page to edit
|
||||||
|
// existing classifications, the webform sends the list of all existing and
|
||||||
|
// changed classifications.
|
||||||
|
fn set_spam(
|
||||||
|
db: &mut Db,
|
||||||
|
classifier: &mut Classifier,
|
||||||
|
ids: &[(UserId, bool)],
|
||||||
|
overwrite: bool,
|
||||||
|
) -> Vec<UserId> {
|
||||||
|
let mut spammers = Vec::new();
|
||||||
|
|
||||||
for &(user_id, is_spam) in ids {
|
for &(user_id, is_spam) in ids {
|
||||||
let mut train_classifier = false;
|
let mut update_classification = false;
|
||||||
|
|
||||||
match db.is_spam.get(&user_id) {
|
match db.is_spam.get(&user_id) {
|
||||||
Some(&was_spam) if overwrite && was_spam.as_bool() != is_spam => {
|
Some(&was_spam) if overwrite && was_spam.as_bool() != is_spam => {
|
||||||
|
@ -79,16 +106,16 @@ fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)], ov
|
||||||
was_spam,
|
was_spam,
|
||||||
is_spam
|
is_spam
|
||||||
);
|
);
|
||||||
db.is_spam.insert(user_id, IsSpam::from_bool(is_spam));
|
// Training the classifier again is somewhat hackish in this
|
||||||
// This is somewhat hackish: we already trained the classifier
|
// case: we already trained the classifier on the previous
|
||||||
// on the previous classification, possibly with the same
|
// classification, possibly with the same tokens.
|
||||||
// tokens.
|
//
|
||||||
// Ideally we would undo the previous training and train with
|
// Ideally we would undo the previous training and train with
|
||||||
// the correct classification now, but the classifier has no way
|
// the correct classification now, but the classifier has no way
|
||||||
// to easily undo a previous training (we don't know whether the
|
// to easily undo a previous training (we don't know whether the
|
||||||
// tokens that we have now are the same as the one that were
|
// tokens that we have now are the same as the one that were
|
||||||
// used previously).
|
// used previously).
|
||||||
train_classifier = true;
|
update_classification = true;
|
||||||
}
|
}
|
||||||
Some(&was_spam) if !overwrite && was_spam.as_bool() != is_spam => {
|
Some(&was_spam) if !overwrite && was_spam.as_bool() != is_spam => {
|
||||||
// Classification conflict between concurrent queries.
|
// Classification conflict between concurrent queries.
|
||||||
|
@ -101,16 +128,22 @@ fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)], ov
|
||||||
db.is_spam.remove(&user_id);
|
db.is_spam.remove(&user_id);
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
db.is_spam.insert(user_id, IsSpam::from_bool(is_spam));
|
update_classification = true;
|
||||||
train_classifier = true;
|
|
||||||
}
|
}
|
||||||
Some(was_spam) => {
|
Some(was_spam) => {
|
||||||
assert!(was_spam.as_bool() == is_spam);
|
assert!(was_spam.as_bool() == is_spam);
|
||||||
// nothing to do
|
// nothing to do.
|
||||||
|
// In particular, keep the spam classification time as is.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if train_classifier {
|
if update_classification {
|
||||||
|
db.is_spam.insert(user_id, IsSpam::from_bool(is_spam));
|
||||||
|
// if we just classified the user as spam, add it to the list
|
||||||
|
if is_spam {
|
||||||
|
spammers.push(user_id)
|
||||||
|
}
|
||||||
|
|
||||||
// Train the classifier with tokens from the user
|
// Train the classifier with tokens from the user
|
||||||
let tokens = db.tokens.get(&user_id).unwrap();
|
let tokens = db.tokens.get(&user_id).unwrap();
|
||||||
if is_spam {
|
if is_spam {
|
||||||
|
@ -123,6 +156,56 @@ fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)], ov
|
||||||
|
|
||||||
eprintln!("recomputing user scores");
|
eprintln!("recomputing user scores");
|
||||||
db.recompute_scores(&classifier);
|
db.recompute_scores(&classifier);
|
||||||
|
|
||||||
|
spammers
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn lock_user_account(forge: &Forgejo, username: &str) -> anyhow::Result<()> {
|
||||||
|
let opts = forgejo_api::structs::EditUserOption {
|
||||||
|
// boilerplate: we do not change these settings
|
||||||
|
active: None,
|
||||||
|
admin: None,
|
||||||
|
allow_create_organization: None,
|
||||||
|
allow_git_hook: None,
|
||||||
|
allow_import_local: None,
|
||||||
|
description: None,
|
||||||
|
email: None,
|
||||||
|
full_name: None,
|
||||||
|
location: None,
|
||||||
|
login_name: None,
|
||||||
|
max_repo_creation: None,
|
||||||
|
must_change_password: None,
|
||||||
|
password: None,
|
||||||
|
pronouns: None,
|
||||||
|
restricted: None,
|
||||||
|
source_id: None,
|
||||||
|
website: None,
|
||||||
|
// lock the account and set its visibility to private: the user's
|
||||||
|
// description and info will not be publicly visible
|
||||||
|
prohibit_login: Some(true),
|
||||||
|
visibility: Some("private".to_string()),
|
||||||
|
};
|
||||||
|
forge.admin_edit_user(username, opts).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn apply_classification(
|
||||||
|
forge: &Forgejo,
|
||||||
|
db: &mut Db,
|
||||||
|
classifier: &mut Classifier,
|
||||||
|
ids: &[(UserId, bool)],
|
||||||
|
overwrite: bool,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let spammers = set_spam(db, classifier, ids, overwrite);
|
||||||
|
|
||||||
|
for user in spammers {
|
||||||
|
// TODO: send email (what do we do if sending the email didn't work?)
|
||||||
|
// TODO: batch the email sending? (only open one smtp connection)
|
||||||
|
lock_user_account(forge, &db.users.get(&user).unwrap().login).await?;
|
||||||
|
// TODO: better error handling: retries, ..?
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
|
@ -140,6 +223,7 @@ lazy_static! {
|
||||||
struct AppState {
|
struct AppState {
|
||||||
db: Arc<Mutex<Db>>,
|
db: Arc<Mutex<Db>>,
|
||||||
classifier: Arc<Mutex<Classifier>>,
|
classifier: Arc<Mutex<Classifier>>,
|
||||||
|
forge: Arc<Forgejo>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Deserialize)]
|
||||||
|
@ -175,10 +259,10 @@ async fn index(
|
||||||
|
|
||||||
let db = &data.db.lock().unwrap();
|
let db = &data.db.lock().unwrap();
|
||||||
|
|
||||||
let mut users: Vec<(&UserId, &UserData, f32)> = db
|
let mut users: Vec<(UserId, &UserData, f32)> = db
|
||||||
.unclassified_users()
|
.unclassified_users()
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(id, u)| (id, u, *db.score.get(id).unwrap()))
|
.map(|(id, u)| (id, u, *db.score.get(&id).unwrap()))
|
||||||
.collect();
|
.collect();
|
||||||
let mut rng = rand::thread_rng();
|
let mut rng = rand::thread_rng();
|
||||||
|
|
||||||
|
@ -200,7 +284,7 @@ async fn index(
|
||||||
}
|
}
|
||||||
|
|
||||||
// compute the rough "spam score" (low/mid/high) and spam guess (true/false)
|
// compute the rough "spam score" (low/mid/high) and spam guess (true/false)
|
||||||
let users: Vec<(&UserId, &UserData, f32, ApproxScore, bool)> = users
|
let users: Vec<(UserId, &UserData, f32, ApproxScore, bool)> = users
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(id, u, score)| {
|
.map(|(id, u, score)| {
|
||||||
(
|
(
|
||||||
|
@ -239,13 +323,16 @@ async fn post_classified(
|
||||||
|
|
||||||
let db = &mut data.db.lock().unwrap();
|
let db = &mut data.db.lock().unwrap();
|
||||||
let classifier = &mut data.classifier.lock().unwrap();
|
let classifier = &mut data.classifier.lock().unwrap();
|
||||||
|
let forge = &data.forge;
|
||||||
|
|
||||||
let updates: Vec<(UserId, bool)> = form
|
let updates: Vec<(UserId, bool)> = form
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(id, classification)| (UserId(*id), classification == "spam"))
|
.map(|(id, classification)| (UserId(*id), classification == "spam"))
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
set_spam(db, classifier, &updates, overwrite);
|
apply_classification(forge, db, classifier, &updates, overwrite)
|
||||||
|
.await
|
||||||
|
.unwrap(); // FIXME
|
||||||
|
|
||||||
db.store_to_path(Path::new("db.json")).unwrap(); // FIXME
|
db.store_to_path(Path::new("db.json")).unwrap(); // FIXME
|
||||||
classifier
|
classifier
|
||||||
|
@ -286,10 +373,10 @@ async fn classified(
|
||||||
|
|
||||||
let db = &data.db.lock().unwrap();
|
let db = &data.db.lock().unwrap();
|
||||||
|
|
||||||
let mut users: Vec<(&UserId, &UserData, f32, bool)> = db
|
let mut users: Vec<(UserId, &UserData, f32, bool)> = db
|
||||||
.classified_users()
|
.classified_users()
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(id, u, s)| (id, u, *db.score.get(id).unwrap(), s.as_bool()))
|
.map(|(id, u, s)| (id, u, *db.score.get(&id).unwrap(), s.as_bool()))
|
||||||
.collect();
|
.collect();
|
||||||
// sort "spam first"
|
// sort "spam first"
|
||||||
users.sort_by_key(|(_, _, score, _)| 1000 - (score * 1000.) as u64);
|
users.sort_by_key(|(_, _, score, _)| 1000 - (score * 1000.) as u64);
|
||||||
|
@ -313,7 +400,7 @@ async fn main() -> std::io::Result<()> {
|
||||||
let _ = *TEMPLATES;
|
let _ = *TEMPLATES;
|
||||||
|
|
||||||
eprintln!("Load users and repos");
|
eprintln!("Load users and repos");
|
||||||
let forge = Arc::new(forge().unwrap()); // FIXME
|
let forge = Arc::new(forge().unwrap() /* FIXME */);
|
||||||
let (db, classifier) = load_db(&forge).await.unwrap(); // FIXME
|
let (db, classifier) = load_db(&forge).await.unwrap(); // FIXME
|
||||||
let db = Arc::new(Mutex::new(db));
|
let db = Arc::new(Mutex::new(db));
|
||||||
let classifier = Arc::new(Mutex::new(classifier));
|
let classifier = Arc::new(Mutex::new(classifier));
|
||||||
|
@ -321,11 +408,20 @@ async fn main() -> std::io::Result<()> {
|
||||||
let st = web::Data::new(AppState {
|
let st = web::Data::new(AppState {
|
||||||
db: db.clone(),
|
db: db.clone(),
|
||||||
classifier: classifier.clone(),
|
classifier: classifier.clone(),
|
||||||
|
forge: forge.clone(),
|
||||||
});
|
});
|
||||||
|
|
||||||
let _ = tokio::spawn(async move {
|
let _ = {
|
||||||
workers::refresh_user_data(forge.clone(), db.clone(), classifier.clone())
|
let forge = forge.clone();
|
||||||
});
|
let db = db.clone();
|
||||||
|
let classifier = classifier.clone();
|
||||||
|
tokio::spawn(async move { workers::refresh_user_data(forge, db, classifier) })
|
||||||
|
};
|
||||||
|
let _ = {
|
||||||
|
let forge = forge.clone();
|
||||||
|
let db = db.clone();
|
||||||
|
tokio::spawn(async move { workers::purge_spammer_accounts(forge, db) })
|
||||||
|
};
|
||||||
|
|
||||||
println!("Listening on http://127.0.0.1:8080");
|
println!("Listening on http://127.0.0.1:8080");
|
||||||
|
|
||||||
|
|
|
@ -63,7 +63,7 @@ async fn scrape_users(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::struct
|
||||||
_ => todo!("scrape_users: implement retries"),
|
_ => todo!("scrape_users: implement retries"),
|
||||||
}
|
}
|
||||||
page += 1;
|
page += 1;
|
||||||
sleep(Duration::from_millis(100)).await;
|
sleep(Duration::from_millis(20)).await;
|
||||||
}
|
}
|
||||||
Ok(users)
|
Ok(users)
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
use crate::classifier::Classifier;
|
use crate::classifier::Classifier;
|
||||||
use crate::db::Db;
|
use crate::db::{Db, IsSpam};
|
||||||
use crate::scrape;
|
use crate::scrape;
|
||||||
use forgejo_api::Forgejo;
|
use forgejo_api::Forgejo;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
@ -7,8 +7,11 @@ use std::path::Path;
|
||||||
use std::sync::{Arc, Mutex};
|
use std::sync::{Arc, Mutex};
|
||||||
|
|
||||||
use crate::FORGEJO_POLL_DELAY;
|
use crate::FORGEJO_POLL_DELAY;
|
||||||
|
use crate::GRACE_PERIOD;
|
||||||
use crate::{GUESS_LEGIT_THRESHOLD, GUESS_SPAM_THRESHOLD};
|
use crate::{GUESS_LEGIT_THRESHOLD, GUESS_SPAM_THRESHOLD};
|
||||||
|
|
||||||
|
// Worker to refresh user data by periodically polling Forgejo
|
||||||
|
|
||||||
async fn try_refresh_user_data(
|
async fn try_refresh_user_data(
|
||||||
forge: &Forgejo,
|
forge: &Forgejo,
|
||||||
db: Arc<Mutex<Db>>,
|
db: Arc<Mutex<Db>>,
|
||||||
|
@ -73,3 +76,50 @@ pub async fn refresh_user_data(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Worker to delete spam accounts after their grace period expired
|
||||||
|
|
||||||
|
async fn try_purge_account(forge: &Forgejo, login: &str) -> anyhow::Result<()> {
|
||||||
|
forge
|
||||||
|
.admin_delete_user(
|
||||||
|
login,
|
||||||
|
forgejo_api::structs::AdminDeleteUserQuery { purge: Some(true) },
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn purge_spammer_accounts(forge: Arc<Forgejo>, db: Arc<Mutex<Db>>) {
|
||||||
|
loop {
|
||||||
|
tokio::time::sleep(std::time::Duration::from_secs(3600)).await;
|
||||||
|
|
||||||
|
let mut classified_users = Vec::new();
|
||||||
|
{
|
||||||
|
let db = &db.lock().unwrap();
|
||||||
|
for (id, user, is_spam) in db.classified_users() {
|
||||||
|
classified_users.push((id, user.login.clone(), is_spam));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (user_id, login, is_spam) in classified_users {
|
||||||
|
if let IsSpam::Spam { classified_at } = is_spam {
|
||||||
|
match classified_at.elapsed() {
|
||||||
|
Ok(duration) if duration > GRACE_PERIOD => {
|
||||||
|
if let Err(e) = try_purge_account(&forge, &login).await {
|
||||||
|
eprintln!("Error while deleting spammer account {login}: {:?}", e)
|
||||||
|
} else {
|
||||||
|
eprintln!("Deleted spammer account {login}");
|
||||||
|
let db = &mut db.lock().unwrap();
|
||||||
|
db.users.remove(&user_id);
|
||||||
|
db.is_spam.remove(&user_id);
|
||||||
|
db.score.remove(&user_id);
|
||||||
|
db.tokens.remove(&user_id);
|
||||||
|
db.store_to_path(Path::new("db.json")).unwrap(); // FIXME
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => (),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue