Add S3 as storage backend, refactor db & storage code
This commit is contained in:
parent
af38eae2c3
commit
edc49a6d1d
11 changed files with 1608 additions and 535 deletions
1118
Cargo.lock
generated
1118
Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
@ -1,5 +1,5 @@
|
||||||
[package]
|
[package]
|
||||||
name = "forgejo-antispam"
|
name = "forgery"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
|
@ -21,6 +21,8 @@ actix-files = "0.6"
|
||||||
unicode-segmentation = "1"
|
unicode-segmentation = "1"
|
||||||
lettre = { version = "0.11", features = ["builder", "smtp-transport", "rustls-tls"], default-features = false }
|
lettre = { version = "0.11", features = ["builder", "smtp-transport", "rustls-tls"], default-features = false }
|
||||||
include_dir = "0.7"
|
include_dir = "0.7"
|
||||||
|
aws-config = { version = "1.1.7", features = ["behavior-version-latest"] }
|
||||||
|
aws-sdk-s3 = "1.66.0"
|
||||||
|
|
||||||
[profile.profiling]
|
[profile.profiling]
|
||||||
inherits = "dev"
|
inherits = "dev"
|
||||||
|
|
20
README.md
20
README.md
|
@ -27,12 +27,28 @@ Forgery reads the following environment variables:
|
||||||
default) or set to `false`, no actual action is taken: spammers are only
|
default) or set to `false`, no actual action is taken: spammers are only
|
||||||
listed in the database. The variable should be set in production, but probably
|
listed in the database. The variable should be set in production, but probably
|
||||||
not for testing.
|
not for testing.
|
||||||
|
- `STORAGE_BACKEND`: either `local` (default) or `s3`. Chose `local` to store
|
||||||
|
the application state to local files, or `s3` to store them in S3-compatible
|
||||||
|
storage (see below for corresponding configuration variables).
|
||||||
|
|
||||||
Environment variables that are relevant when `ACTUALLY_BAN_USERS=true`:
|
Environment variables read when `ACTUALLY_BAN_USERS=true`:
|
||||||
- `SMTP_ADDRESS`: address of the SMTP relay used to send email notifications
|
- `SMTP_ADDRESS`: address of the SMTP relay used to send email notifications
|
||||||
- `SMTP_USERNAME`: SMTP username
|
- `SMTP_USERNAME`: SMTP username
|
||||||
- `SMTP_PASSWORD`: SMTP password
|
- `SMTP_PASSWORD`: SMTP password
|
||||||
|
|
||||||
|
Environment variables read when `STORAGE_BACKEND=local`:
|
||||||
|
- `STORAGE_LOCAL_DIR`: path to a local directory where to store the application
|
||||||
|
data (as two files `db.json` and `model.json`). Defaults to `.` if not
|
||||||
|
defined.
|
||||||
|
|
||||||
|
Environment variables read when `STORAGE_BACKEND=s3`:
|
||||||
|
- `STORAGE_S3_BUCKET`: name of the bucket where to store the application data
|
||||||
|
(as two entries `db.json` and `model.json`).
|
||||||
|
- `AWS_DEFAULT_REGION`: S3 endpoint region
|
||||||
|
- `AWS_ENDPOINT_URL`: S3 endpoint URL
|
||||||
|
- `AWS_ACCESS_KEY_ID`: S3 key id
|
||||||
|
- `AWS_SECRET_ACCESS_KEY`: S3 key secret
|
||||||
|
|
||||||
## Todos
|
## Todos
|
||||||
|
|
||||||
- discuss the current design choices for when locking the account/sending a
|
- discuss the current design choices for when locking the account/sending a
|
||||||
|
@ -40,5 +56,5 @@ Environment variables that are relevant when `ACTUALLY_BAN_USERS=true`:
|
||||||
(Current behavior is to periodically retry, avoid deleting if the account
|
(Current behavior is to periodically retry, avoid deleting if the account
|
||||||
could not be locked, but delete the account after the grace period even if
|
could not be locked, but delete the account after the grace period even if
|
||||||
the email could not be sent…)
|
the email could not be sent…)
|
||||||
- add backend to store data on garage instead of local files
|
- auth: add support for connecting to the forge using oauth?
|
||||||
- improve error handling
|
- improve error handling
|
||||||
|
|
|
@ -1,11 +1,8 @@
|
||||||
// code based on the bayespam crate
|
// code based on the bayespam crate
|
||||||
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::fs::File;
|
|
||||||
use std::io;
|
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::{from_reader, to_writer, to_writer_pretty};
|
|
||||||
use unicode_segmentation::UnicodeSegmentation;
|
use unicode_segmentation::UnicodeSegmentation;
|
||||||
|
|
||||||
const INITIAL_RATING: f32 = 0.5;
|
const INITIAL_RATING: f32 = 0.5;
|
||||||
|
@ -30,23 +27,6 @@ impl Classifier {
|
||||||
Default::default()
|
Default::default()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build a new classifier with a pre-trained model loaded from `file`.
|
|
||||||
pub fn new_from_pre_trained(file: &mut File) -> Result<Self, io::Error> {
|
|
||||||
let pre_trained_model = from_reader(file)?;
|
|
||||||
Ok(pre_trained_model)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Save the classifier to `file` as JSON.
|
|
||||||
/// The JSON will be pretty printed if `pretty` is `true`.
|
|
||||||
pub fn save(&self, file: &mut File, pretty: bool) -> Result<(), io::Error> {
|
|
||||||
if pretty {
|
|
||||||
to_writer_pretty(file, &self)?;
|
|
||||||
} else {
|
|
||||||
to_writer(file, &self)?;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Split `msg` into a list of words.
|
/// Split `msg` into a list of words.
|
||||||
pub fn into_word_list(msg: &str) -> Vec<String> {
|
pub fn into_word_list(msg: &str) -> Vec<String> {
|
||||||
let word_list = msg.unicode_words().collect::<Vec<&str>>();
|
let word_list = msg.unicode_words().collect::<Vec<&str>>();
|
||||||
|
|
|
@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};
|
||||||
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
pub struct UserId(pub i64);
|
pub struct UserId(pub i64);
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
pub struct UserData {
|
pub struct UserData {
|
||||||
pub login: String,
|
pub login: String,
|
||||||
pub email: String,
|
pub email: String,
|
||||||
|
@ -20,7 +20,7 @@ pub struct UserData {
|
||||||
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
pub struct RepoId(pub i64);
|
pub struct RepoId(pub i64);
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
pub struct RepoData {
|
pub struct RepoData {
|
||||||
pub name: String,
|
pub name: String,
|
||||||
pub description: Option<String>,
|
pub description: Option<String>,
|
||||||
|
@ -29,7 +29,7 @@ pub struct RepoData {
|
||||||
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
pub struct IssueId(pub i64);
|
pub struct IssueId(pub i64);
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
pub struct IssueData {
|
pub struct IssueData {
|
||||||
pub title: String,
|
pub title: String,
|
||||||
pub body: String,
|
pub body: String,
|
||||||
|
|
232
src/db.rs
232
src/db.rs
|
@ -1,138 +1,126 @@
|
||||||
use crate::classifier::Classifier;
|
use crate::classifier::Classifier;
|
||||||
use crate::data::*;
|
use crate::data::*;
|
||||||
use serde::{Deserialize, Serialize};
|
use crate::userdb::{IsSpam, UserDb};
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::fmt;
|
use std::sync::{Arc, Mutex};
|
||||||
use std::fs::File;
|
|
||||||
use std::io::{BufReader, BufWriter};
|
|
||||||
use std::path::Path;
|
|
||||||
use std::time::SystemTime;
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug, Clone, Copy)]
|
#[derive(Clone)]
|
||||||
pub enum IsSpam {
|
|
||||||
Legit,
|
|
||||||
Spam {
|
|
||||||
classified_at: SystemTime,
|
|
||||||
locked: bool,
|
|
||||||
notified: bool,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
impl IsSpam {
|
|
||||||
pub fn as_bool(&self) -> bool {
|
|
||||||
match self {
|
|
||||||
IsSpam::Legit => true,
|
|
||||||
IsSpam::Spam { .. } => false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn from_bool(b: bool) -> IsSpam {
|
|
||||||
if b {
|
|
||||||
IsSpam::Spam {
|
|
||||||
classified_at: SystemTime::now(),
|
|
||||||
locked: false,
|
|
||||||
notified: false,
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
IsSpam::Legit
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl fmt::Display for IsSpam {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
||||||
match self {
|
|
||||||
IsSpam::Legit => write!(f, "legit"),
|
|
||||||
IsSpam::Spam { .. } => write!(f, "spam"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO (?): make the fields private and provide an API that automatically
|
|
||||||
// recomputes the caches when necessary?
|
|
||||||
pub struct Db {
|
pub struct Db {
|
||||||
// persisted data
|
userdb: Arc<Mutex<UserDb>>,
|
||||||
pub users: HashMap<UserId, UserData>,
|
classifier: Arc<Mutex<Classifier>>,
|
||||||
pub is_spam: HashMap<UserId, IsSpam>,
|
cache: Arc<Mutex<Cache>>,
|
||||||
pub last_scrape: SystemTime,
|
}
|
||||||
// caches: computed from persisted data on load
|
|
||||||
pub score: HashMap<UserId, f32>,
|
struct Cache {
|
||||||
pub tokens: HashMap<UserId, Vec<String>>,
|
score: HashMap<UserId, f32>,
|
||||||
|
tokens: HashMap<UserId, Vec<String>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Db {
|
impl Db {
|
||||||
pub fn recompute_tokens(&mut self) {
|
// Creating
|
||||||
for (id, user) in &self.users {
|
|
||||||
self.tokens.insert(*id, user.to_tokens());
|
pub fn create(userdb: UserDb, classifier: Classifier) -> Self {
|
||||||
|
let cache = Cache::create(&userdb, &classifier);
|
||||||
|
Self {
|
||||||
|
userdb: Arc::new(Mutex::new(userdb)),
|
||||||
|
classifier: Arc::new(Mutex::new(classifier)),
|
||||||
|
cache: Arc::new(Mutex::new(cache)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn recompute_scores(&mut self, classifier: &Classifier) {
|
pub fn replace_userdb(&self, newdb: UserDb) {
|
||||||
for (id, tokens) in &self.tokens {
|
let userdb: &mut UserDb = &mut self.userdb.lock().unwrap();
|
||||||
self.score.insert(*id, classifier.score(tokens));
|
let _ = std::mem::replace(userdb, newdb);
|
||||||
|
let new_cache = Cache::create(userdb, &self.classifier.lock().unwrap());
|
||||||
|
let cache: &mut Cache = &mut self.cache.lock().unwrap();
|
||||||
|
let _ = std::mem::replace(cache, new_cache);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reading
|
||||||
|
|
||||||
|
pub fn with_userdb<F, T>(&self, f: F) -> T
|
||||||
|
where
|
||||||
|
F: FnOnce(&UserDb) -> T,
|
||||||
|
{
|
||||||
|
let lock = &self.userdb.lock().unwrap();
|
||||||
|
f(lock)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn login(&self, uid: UserId) -> Option<String> {
|
||||||
|
self.with_userdb(|u| u.userdata(uid).map(|d| d.login.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn score(&self, uid: UserId) -> Option<f32> {
|
||||||
|
self.cache.lock().unwrap().score.get(&uid).copied()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_tokens<F>(&self, uid: UserId, f: F)
|
||||||
|
where
|
||||||
|
F: FnOnce(Option<&[String]>),
|
||||||
|
{
|
||||||
|
let lock = self.cache.lock().unwrap();
|
||||||
|
f(lock.tokens.get(&uid).map(|v| &**v))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Updating
|
||||||
|
|
||||||
|
// pub fn recompute_scores(&self, classifier: &Classifier) {
|
||||||
|
// let lock = &mut self.inner.lock().unwrap();
|
||||||
|
// lock.recompute_scores(classifier)
|
||||||
|
// }
|
||||||
|
|
||||||
|
pub fn set_spam(&self, uid: UserId, is_spam: Option<IsSpam>) {
|
||||||
|
let udb = &mut self.userdb.lock().unwrap();
|
||||||
|
udb.set_spam(uid, is_spam)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_classifier<F, T>(&self, f: F) -> T
|
||||||
|
where
|
||||||
|
F: FnOnce(&Classifier) -> T,
|
||||||
|
{
|
||||||
|
let classifier = &self.classifier.lock().unwrap();
|
||||||
|
f(classifier)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_classifier_mut<F, T>(&self, f: F) -> T
|
||||||
|
where
|
||||||
|
F: FnOnce(&mut Classifier) -> T,
|
||||||
|
{
|
||||||
|
let classifier = &mut self.classifier.lock().unwrap();
|
||||||
|
let res = f(classifier);
|
||||||
|
// recompute scores
|
||||||
|
let cache: &mut Cache = &mut self.cache.lock().unwrap();
|
||||||
|
for (id, tokens) in &cache.tokens {
|
||||||
|
cache.score.insert(*id, classifier.score(tokens));
|
||||||
}
|
}
|
||||||
|
res
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn from_path(path: &Path, classifier: &Classifier) -> anyhow::Result<Self> {
|
pub fn remove_user(&self, uid: UserId) {
|
||||||
let file = File::open(path)?;
|
let userdb = &mut self.userdb.lock().unwrap();
|
||||||
let (users, is_spam, last_scrape) = serde_json::from_reader(BufReader::new(file))?;
|
userdb.remove_user(uid);
|
||||||
let mut db = Db {
|
let cache = &mut self.cache.lock().unwrap();
|
||||||
users,
|
cache.remove_user(uid);
|
||||||
is_spam,
|
}
|
||||||
last_scrape,
|
}
|
||||||
tokens: HashMap::new(),
|
|
||||||
score: HashMap::new(),
|
impl Cache {
|
||||||
};
|
fn create(userdb: &UserDb, classifier: &Classifier) -> Self {
|
||||||
db.recompute_tokens();
|
let mut tokens = HashMap::new();
|
||||||
db.recompute_scores(classifier);
|
let mut score = HashMap::new();
|
||||||
Ok(db)
|
|
||||||
}
|
for (id, user, _) in userdb {
|
||||||
|
let user_tokens = user.to_tokens();
|
||||||
pub fn from_users(
|
let user_score = classifier.score(&user_tokens);
|
||||||
users: HashMap<UserId, UserData>,
|
tokens.insert(id, user_tokens);
|
||||||
is_spam: HashMap<UserId, IsSpam>,
|
score.insert(id, user_score);
|
||||||
classifier: &Classifier,
|
}
|
||||||
) -> Db {
|
|
||||||
let mut db = Db {
|
Cache { tokens, score }
|
||||||
users,
|
}
|
||||||
is_spam,
|
|
||||||
last_scrape: SystemTime::now(),
|
fn remove_user(&mut self, uid: UserId) {
|
||||||
tokens: HashMap::new(),
|
self.score.remove(&uid);
|
||||||
score: HashMap::new(),
|
self.tokens.remove(&uid);
|
||||||
};
|
|
||||||
db.recompute_tokens();
|
|
||||||
db.recompute_scores(classifier);
|
|
||||||
db
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn store_to_path(&self, path: &Path) -> anyhow::Result<()> {
|
|
||||||
let file = File::create(path)?;
|
|
||||||
let dat: (
|
|
||||||
&HashMap<UserId, UserData>,
|
|
||||||
&HashMap<UserId, IsSpam>,
|
|
||||||
SystemTime,
|
|
||||||
) = (&self.users, &self.is_spam, self.last_scrape);
|
|
||||||
serde_json::to_writer(BufWriter::new(file), &dat)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn unclassified_users(&self) -> Vec<(UserId, &UserData)> {
|
|
||||||
self.users
|
|
||||||
.iter()
|
|
||||||
.filter(|(user_id, _)| !self.is_spam.contains_key(user_id))
|
|
||||||
.map(|(id, d)| (*id, d))
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn classified_users(&self) -> Vec<(UserId, &UserData, IsSpam)> {
|
|
||||||
self.users
|
|
||||||
.iter()
|
|
||||||
.filter_map(|(user_id, user_data)| {
|
|
||||||
self.is_spam
|
|
||||||
.get(user_id)
|
|
||||||
.map(|is_spam| (user_id, user_data, *is_spam))
|
|
||||||
})
|
|
||||||
.map(|(id, d, s)| (*id, d, s))
|
|
||||||
.collect()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -50,6 +50,7 @@ pub async fn send_locked_account_notice(
|
||||||
let email = Message::builder()
|
let email = Message::builder()
|
||||||
.from(smtp.username.parse().unwrap())
|
.from(smtp.username.parse().unwrap())
|
||||||
.to(email.parse()?)
|
.to(email.parse()?)
|
||||||
|
.reply_to(admin_contact_email.parse().unwrap())
|
||||||
.subject(format!(
|
.subject(format!(
|
||||||
"[Forgejo {org_name}] Your account was marked as spam and will be deleted in {} days",
|
"[Forgejo {org_name}] Your account was marked as spam and will be deleted in {} days",
|
||||||
grace_period_days
|
grace_period_days
|
||||||
|
|
226
src/main.rs
226
src/main.rs
|
@ -5,9 +5,7 @@ use lazy_static::lazy_static;
|
||||||
use rand::prelude::*;
|
use rand::prelude::*;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::fs::File;
|
use std::sync::Arc;
|
||||||
use std::path::Path;
|
|
||||||
use std::sync::{Arc, Mutex};
|
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use tera::Tera;
|
use tera::Tera;
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
@ -17,12 +15,15 @@ mod data;
|
||||||
mod db;
|
mod db;
|
||||||
mod email;
|
mod email;
|
||||||
mod scrape;
|
mod scrape;
|
||||||
|
mod storage;
|
||||||
|
mod userdb;
|
||||||
mod workers;
|
mod workers;
|
||||||
|
|
||||||
use classifier::Classifier;
|
|
||||||
use data::*;
|
use data::*;
|
||||||
use db::{Db, IsSpam};
|
use db::Db;
|
||||||
use email::SmtpConfig;
|
use email::SmtpConfig;
|
||||||
|
use storage::Storage;
|
||||||
|
use userdb::{IsSpam, UserDb};
|
||||||
|
|
||||||
// Fetch user data from forgejo from time to time
|
// Fetch user data from forgejo from time to time
|
||||||
const FORGEJO_POLL_DELAY: Duration = Duration::from_secs(11 * 3600); // 11 hours
|
const FORGEJO_POLL_DELAY: Duration = Duration::from_secs(11 * 3600); // 11 hours
|
||||||
|
@ -95,9 +96,10 @@ struct AppState {
|
||||||
config: Arc<Config>,
|
config: Arc<Config>,
|
||||||
// authenticated access to the forgejo instance
|
// authenticated access to the forgejo instance
|
||||||
forge: Arc<Forgejo>,
|
forge: Arc<Forgejo>,
|
||||||
// runtime state (to be persisted in the storage when modified)
|
// handle to the storage backend
|
||||||
db: Arc<Mutex<Db>>,
|
storage: Arc<Storage>,
|
||||||
classifier: Arc<Mutex<Classifier>>,
|
// persistent state (written to the storage when modified)
|
||||||
|
db: Db,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn forge(url: &Url) -> anyhow::Result<Forgejo> {
|
fn forge(url: &Url) -> anyhow::Result<Forgejo> {
|
||||||
|
@ -108,28 +110,22 @@ fn forge(url: &Url) -> anyhow::Result<Forgejo> {
|
||||||
Ok(forge)
|
Ok(forge)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn load_db(forge: &Forgejo) -> anyhow::Result<(Db, Classifier)> {
|
async fn load_db(storage: &Storage, forge: &Forgejo) -> anyhow::Result<Db> {
|
||||||
let model_path = Path::new("model.json");
|
let classifier = storage::load_classifier(storage).await?;
|
||||||
let classifier = if model_path.is_file() {
|
let userdb = match storage::load_userdb(storage).await? {
|
||||||
Classifier::new_from_pre_trained(&mut File::open(model_path)?)?
|
Some(db) => db,
|
||||||
} else {
|
None => {
|
||||||
Classifier::new()
|
let db = UserDb::from_users(
|
||||||
|
scrape::get_user_data(forge).await?,
|
||||||
|
HashMap::new(),
|
||||||
|
std::time::SystemTime::now(),
|
||||||
|
);
|
||||||
|
storage::store_userdb(storage, &db).await?;
|
||||||
|
db
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let db_path = Path::new("db.json");
|
Ok(Db::create(userdb, classifier))
|
||||||
let db: Db = if db_path.is_file() {
|
|
||||||
Db::from_path(db_path, &classifier)?
|
|
||||||
} else {
|
|
||||||
let db = Db::from_users(
|
|
||||||
scrape::get_user_data(forge).await?,
|
|
||||||
HashMap::new(),
|
|
||||||
&classifier,
|
|
||||||
);
|
|
||||||
db.store_to_path(db_path)?;
|
|
||||||
db
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok((db, classifier))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Register a list of decisions taken by the admin using the webpage, checking
|
// Register a list of decisions taken by the admin using the webpage, checking
|
||||||
|
@ -144,97 +140,89 @@ async fn load_db(forge: &Forgejo) -> anyhow::Result<(Db, Classifier)> {
|
||||||
// NB: some of the input decisions may be no-ops: when using the page to edit
|
// NB: some of the input decisions may be no-ops: when using the page to edit
|
||||||
// existing classifications, the webform sends the list of all existing and
|
// existing classifications, the webform sends the list of all existing and
|
||||||
// changed classifications.
|
// changed classifications.
|
||||||
fn set_spam(
|
fn set_spam(db: &Db, ids: &[(UserId, bool)], overwrite: bool) -> Vec<UserId> {
|
||||||
db: &mut Db,
|
let mut updated_spam = vec![];
|
||||||
classifier: &mut Classifier,
|
|
||||||
ids: &[(UserId, bool)],
|
|
||||||
overwrite: bool,
|
|
||||||
) -> Vec<UserId> {
|
|
||||||
let mut spammers = Vec::new();
|
|
||||||
|
|
||||||
for &(user_id, is_spam) in ids {
|
for &(user_id, set_spam) in ids {
|
||||||
let mut update_classification = false;
|
let login = db.login(user_id).unwrap();
|
||||||
|
|
||||||
match db.is_spam.get(&user_id) {
|
match db.with_userdb(|u| u.is_spam(user_id)) {
|
||||||
Some(&was_spam) if overwrite && was_spam.as_bool() != is_spam => {
|
Some(was_spam) if overwrite && was_spam.as_bool() != set_spam => {
|
||||||
eprintln!(
|
eprintln!("User {login}: changing classification from {was_spam} to {set_spam}");
|
||||||
"User {}: changing classification from {} to {}",
|
db.set_spam(user_id, Some(IsSpam::from_bool(set_spam)));
|
||||||
db.users.get(&user_id).unwrap().login,
|
// We train the classifier again, which is somewhat hackish: we
|
||||||
was_spam,
|
// already trained it on the previous classification, possibly
|
||||||
is_spam
|
// with the same tokens.
|
||||||
);
|
|
||||||
// Training the classifier again is somewhat hackish in this
|
|
||||||
// case: we already trained the classifier on the previous
|
|
||||||
// classification, possibly with the same tokens.
|
|
||||||
//
|
//
|
||||||
// Ideally we would undo the previous training and train with
|
// Ideally we would undo the previous training and train with
|
||||||
// the correct classification now, but the classifier has no way
|
// the correct classification now, but the classifier has no way
|
||||||
// to easily undo a previous training (we don't know whether the
|
// to easily undo a previous training (we don't know whether the
|
||||||
// tokens that we have now are the same as the one that were
|
// tokens that we have now are the same as the one that were
|
||||||
// used previously).
|
// used previously).
|
||||||
update_classification = true;
|
updated_spam.push((user_id, set_spam));
|
||||||
}
|
}
|
||||||
Some(&was_spam) if !overwrite && was_spam.as_bool() != is_spam => {
|
Some(was_spam) if !overwrite && was_spam.as_bool() != set_spam => {
|
||||||
// Classification conflict between concurrent queries.
|
// Classification conflict between concurrent queries.
|
||||||
// In this case we play it safe and discard the classification
|
// In this case we play it safe and discard the classification
|
||||||
// for this user; the user will need to be manually classified again.
|
// for this user; the user will need to be manually classified again.
|
||||||
eprintln!(
|
eprintln!(
|
||||||
"Classification conflict for user {}; discarding our current classification",
|
"Classification conflict for user {login}; discarding our current classification"
|
||||||
db.users.get(&user_id).unwrap().login
|
|
||||||
);
|
);
|
||||||
db.is_spam.remove(&user_id);
|
db.set_spam(user_id, None);
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
update_classification = true;
|
db.set_spam(user_id, Some(IsSpam::from_bool(set_spam)));
|
||||||
|
updated_spam.push((user_id, set_spam));
|
||||||
}
|
}
|
||||||
Some(was_spam) => {
|
Some(was_spam) => {
|
||||||
assert!(was_spam.as_bool() == is_spam);
|
assert!(was_spam.as_bool() == set_spam);
|
||||||
// nothing to do.
|
// nothing to do.
|
||||||
// In particular, keep the spam classification time as is.
|
// In particular, keep the spam classification time as is.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if update_classification {
|
let mut new_spammers = vec![];
|
||||||
db.is_spam.insert(user_id, IsSpam::from_bool(is_spam));
|
|
||||||
// if we just classified the user as spam, add it to the list
|
// update the classifier
|
||||||
if is_spam {
|
db.with_classifier_mut(|classifier| {
|
||||||
spammers.push(user_id)
|
for &(user_id, set_spam) in &updated_spam {
|
||||||
|
// if we just classified the user as spam, add it to the list of new
|
||||||
|
// spammers
|
||||||
|
if set_spam {
|
||||||
|
new_spammers.push(user_id)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Train the classifier with tokens from the user
|
// Train the classifier with tokens from the user
|
||||||
let tokens = db.tokens.get(&user_id).unwrap();
|
db.with_tokens(user_id, |tokens| {
|
||||||
if is_spam {
|
let tokens = tokens.unwrap();
|
||||||
classifier.train_spam(tokens)
|
if set_spam {
|
||||||
} else {
|
classifier.train_spam(tokens)
|
||||||
classifier.train_ham(tokens)
|
} else {
|
||||||
}
|
classifier.train_ham(tokens)
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
eprintln!("recomputing user scores");
|
new_spammers
|
||||||
db.recompute_scores(classifier);
|
|
||||||
|
|
||||||
spammers
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn apply_classification(
|
async fn apply_classification(
|
||||||
config: &Config,
|
config: &Config,
|
||||||
|
storage: &Storage,
|
||||||
forge: &Forgejo,
|
forge: &Forgejo,
|
||||||
db: Arc<Mutex<Db>>,
|
db: &Db,
|
||||||
classifier: Arc<Mutex<Classifier>>,
|
|
||||||
ids: &[(UserId, bool)],
|
ids: &[(UserId, bool)],
|
||||||
overwrite: bool,
|
overwrite: bool,
|
||||||
) {
|
) {
|
||||||
let spammers = {
|
let spammers = set_spam(db, ids, overwrite);
|
||||||
let classifier = &mut classifier.lock().unwrap();
|
|
||||||
set_spam(&mut db.lock().unwrap(), classifier, ids, overwrite)
|
|
||||||
};
|
|
||||||
|
|
||||||
for user in spammers {
|
for user in spammers {
|
||||||
let login = db.lock().unwrap().users.get(&user).unwrap().login.clone();
|
let login = db.login(user).unwrap();
|
||||||
// It is ok for any of these calls to fail now: a worker will periodically retry
|
// It is ok for any of these calls to fail now: a worker will periodically retry
|
||||||
// TODO: signal the worker to wake up instead of performing a manual call here
|
// TODO: signal the worker to wake up instead of performing a manual call here
|
||||||
workers::try_lock_and_notify_user(config, forge, db.clone(), user)
|
workers::try_lock_and_notify_user(config, storage, forge, db, user)
|
||||||
.await
|
.await
|
||||||
.unwrap_or_else(|err| eprintln!("Failed to lock or notify user {login}: {err}"));
|
.unwrap_or_else(|err| eprintln!("Failed to lock or notify user {login}: {err}"));
|
||||||
}
|
}
|
||||||
|
@ -297,13 +285,14 @@ async fn index(
|
||||||
) -> impl Responder {
|
) -> impl Responder {
|
||||||
eprintln!("GET {}", req.uri());
|
eprintln!("GET {}", req.uri());
|
||||||
|
|
||||||
let db = &data.db.lock().unwrap();
|
let db = &data.db;
|
||||||
|
|
||||||
let mut users: Vec<(UserId, &UserData, f32)> = db
|
let mut users: Vec<(UserId, UserData, f32)> = db.with_userdb(|udb| {
|
||||||
.unclassified_users()
|
udb.unclassified_users()
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(id, u)| (id, u, *db.score.get(&id).unwrap()))
|
.map(|(id, u)| (id, u.clone(), db.score(id).unwrap()))
|
||||||
.collect();
|
.collect()
|
||||||
|
});
|
||||||
let mut rng = rand::thread_rng();
|
let mut rng = rand::thread_rng();
|
||||||
|
|
||||||
users.shuffle(&mut rng);
|
users.shuffle(&mut rng);
|
||||||
|
@ -324,7 +313,7 @@ async fn index(
|
||||||
}
|
}
|
||||||
|
|
||||||
// compute the rough "spam score" (low/mid/high) and spam guess (true/false)
|
// compute the rough "spam score" (low/mid/high) and spam guess (true/false)
|
||||||
let users: Vec<(UserId, &UserData, f32, ApproxScore, bool)> = users
|
let users: Vec<(UserId, UserData, f32, ApproxScore, bool)> = users
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(id, u, score)| {
|
.map(|(id, u, score)| {
|
||||||
(
|
(
|
||||||
|
@ -337,8 +326,8 @@ async fn index(
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
let users_count = db.users.len();
|
let users_count = db.with_userdb(|udb| udb.nb_users());
|
||||||
let classified_count = db.is_spam.len();
|
let classified_count = db.with_userdb(|udb| udb.nb_classified());
|
||||||
|
|
||||||
let mut context = tera::Context::new();
|
let mut context = tera::Context::new();
|
||||||
context.insert("forge_url", &data.config.forge_url.to_string());
|
context.insert("forge_url", &data.config.forge_url.to_string());
|
||||||
|
@ -369,30 +358,26 @@ async fn post_classified(
|
||||||
|
|
||||||
apply_classification(
|
apply_classification(
|
||||||
&data.config,
|
&data.config,
|
||||||
|
&data.storage,
|
||||||
&data.forge,
|
&data.forge,
|
||||||
data.db.clone(),
|
&data.db,
|
||||||
data.classifier.clone(),
|
|
||||||
&updates,
|
&updates,
|
||||||
overwrite,
|
overwrite,
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
data.db
|
let res = storage::store_db(&data.storage, &data.db).await;
|
||||||
.lock()
|
|
||||||
.unwrap()
|
|
||||||
.store_to_path(Path::new("db.json"))
|
|
||||||
.unwrap(); // FIXME
|
|
||||||
|
|
||||||
data.classifier
|
|
||||||
.lock()
|
|
||||||
.unwrap()
|
|
||||||
.save(&mut File::create(Path::new("model.json")).unwrap(), false)
|
|
||||||
.unwrap(); // FIXME
|
|
||||||
|
|
||||||
eprintln!("done");
|
eprintln!("done");
|
||||||
HttpResponse::SeeOther()
|
|
||||||
.insert_header(("Location", req.uri().to_string()))
|
match res {
|
||||||
.finish()
|
Ok(()) => HttpResponse::SeeOther()
|
||||||
|
.insert_header(("Location", req.uri().to_string()))
|
||||||
|
.finish(),
|
||||||
|
Err(e) => {
|
||||||
|
HttpResponse::InternalServerError().body(format!("Internal server error:\n\n{e}"))
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[post("/")]
|
#[post("/")]
|
||||||
|
@ -421,13 +406,13 @@ async fn classified(
|
||||||
) -> impl Responder {
|
) -> impl Responder {
|
||||||
eprintln!("GET {}", req.uri());
|
eprintln!("GET {}", req.uri());
|
||||||
|
|
||||||
let db = &data.db.lock().unwrap();
|
let db = &data.db;
|
||||||
|
let mut users: Vec<(UserId, UserData, f32, bool)> = db.with_userdb(|udb| {
|
||||||
let mut users: Vec<(UserId, &UserData, f32, bool)> = db
|
udb.classified_users()
|
||||||
.classified_users()
|
.into_iter()
|
||||||
.into_iter()
|
.map(|(id, u, s)| (id, u.clone(), db.score(id).unwrap(), s.as_bool()))
|
||||||
.map(|(id, u, s)| (id, u, *db.score.get(&id).unwrap(), s.as_bool()))
|
.collect()
|
||||||
.collect();
|
});
|
||||||
// sort "spam first"
|
// sort "spam first"
|
||||||
users.sort_by_key(|(_, _, score, _)| 1000 - (score * 1000.) as u64);
|
users.sort_by_key(|(_, _, score, _)| 1000 - (score * 1000.) as u64);
|
||||||
|
|
||||||
|
@ -465,15 +450,14 @@ async fn main() -> anyhow::Result<()> {
|
||||||
|
|
||||||
let config = Arc::new(Config::from_env().await?);
|
let config = Arc::new(Config::from_env().await?);
|
||||||
let forge = Arc::new(forge(&config.forge_url)?);
|
let forge = Arc::new(forge(&config.forge_url)?);
|
||||||
|
let storage = Arc::new(Storage::from_env().await?);
|
||||||
|
|
||||||
eprintln!("Load users and repos");
|
eprintln!("Load users and repos");
|
||||||
let (db, classifier) = load_db(&forge).await?;
|
let db = load_db(&storage, &forge).await?;
|
||||||
let db = Arc::new(Mutex::new(db));
|
|
||||||
let classifier = Arc::new(Mutex::new(classifier));
|
|
||||||
|
|
||||||
let st = web::Data::new(AppState {
|
let st = web::Data::new(AppState {
|
||||||
db: db.clone(),
|
db: db.clone(),
|
||||||
classifier: classifier.clone(),
|
storage: storage.clone(),
|
||||||
forge: forge.clone(),
|
forge: forge.clone(),
|
||||||
config: config.clone(),
|
config: config.clone(),
|
||||||
});
|
});
|
||||||
|
@ -481,22 +465,26 @@ async fn main() -> anyhow::Result<()> {
|
||||||
let mut workers = tokio::task::JoinSet::new();
|
let mut workers = tokio::task::JoinSet::new();
|
||||||
|
|
||||||
let _ = {
|
let _ = {
|
||||||
|
let storage = storage.clone();
|
||||||
let forge = forge.clone();
|
let forge = forge.clone();
|
||||||
let db = db.clone();
|
let db = db.clone();
|
||||||
let classifier = classifier.clone();
|
workers.spawn(async move { workers::refresh_user_data(storage, forge, db).await })
|
||||||
workers.spawn(async move { workers::refresh_user_data(forge, db, classifier).await })
|
|
||||||
};
|
};
|
||||||
let _ = {
|
let _ = {
|
||||||
let config = config.clone();
|
let config = config.clone();
|
||||||
|
let storage = storage.clone();
|
||||||
let forge = forge.clone();
|
let forge = forge.clone();
|
||||||
let db = db.clone();
|
let db = db.clone();
|
||||||
workers.spawn(async move { workers::purge_spammer_accounts(config, forge, db).await })
|
workers
|
||||||
|
.spawn(async move { workers::purge_spammer_accounts(config, storage, forge, db).await })
|
||||||
};
|
};
|
||||||
let _ = {
|
let _ = {
|
||||||
let config = config.clone();
|
let config = config.clone();
|
||||||
|
let storage = storage.clone();
|
||||||
let forge = forge.clone();
|
let forge = forge.clone();
|
||||||
let db = db.clone();
|
let db = db.clone();
|
||||||
workers.spawn(async move { workers::lock_and_notify_users(config, forge, db).await })
|
workers
|
||||||
|
.spawn(async move { workers::lock_and_notify_users(config, storage, forge, db).await })
|
||||||
};
|
};
|
||||||
|
|
||||||
println!("Listening on http://127.0.0.1:8080");
|
println!("Listening on http://127.0.0.1:8080");
|
||||||
|
|
167
src/storage.rs
Normal file
167
src/storage.rs
Normal file
|
@ -0,0 +1,167 @@
|
||||||
|
use anyhow::Context;
|
||||||
|
use aws_sdk_s3 as s3;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::prelude::{Read, Write};
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
|
pub enum Storage {
|
||||||
|
LocalFiles { dir: PathBuf },
|
||||||
|
S3 { client: s3::Client, bucket: String },
|
||||||
|
}
|
||||||
|
|
||||||
|
use Storage::*;
|
||||||
|
|
||||||
|
impl Storage {
|
||||||
|
pub fn from_local_dir(dir: PathBuf) -> Self {
|
||||||
|
LocalFiles { dir }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn from_s3(bucket: String) -> Self {
|
||||||
|
let sdk_config = aws_config::load_from_env().await;
|
||||||
|
let config = aws_sdk_s3::config::Builder::from(&sdk_config)
|
||||||
|
.force_path_style(true)
|
||||||
|
.build();
|
||||||
|
let client = aws_sdk_s3::Client::from_conf(config);
|
||||||
|
S3 { client, bucket }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn from_env() -> anyhow::Result<Self> {
|
||||||
|
match std::env::var("STORAGE_BACKEND")
|
||||||
|
.context("reading the STORAGE_BACKEND environment variable")?
|
||||||
|
.as_ref()
|
||||||
|
{
|
||||||
|
"local" => {
|
||||||
|
let dir = match std::env::var("STORAGE_LOCAL_DIR") {
|
||||||
|
Ok(dir) => dir,
|
||||||
|
Err(_) => ".".to_string(),
|
||||||
|
};
|
||||||
|
Ok(Self::from_local_dir(PathBuf::from(dir)))
|
||||||
|
}
|
||||||
|
"s3" => {
|
||||||
|
let bucket = std::env::var("STORAGE_S3_BUCKET")
|
||||||
|
.context("reading the STORAGE_S3_BUCKET environment variable")?;
|
||||||
|
Ok(Self::from_s3(bucket).await)
|
||||||
|
}
|
||||||
|
other => {
|
||||||
|
anyhow::bail!("STORAGE_BACKEND: unexpected value {other} (expected: local/s3)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_file(dir: &Path, path: &str) -> anyhow::Result<Option<Vec<u8>>> {
|
||||||
|
let path = dir.join(path);
|
||||||
|
if path.is_file() {
|
||||||
|
let mut file = File::open(path)?;
|
||||||
|
let mut data = vec![];
|
||||||
|
file.read_to_end(&mut data)?;
|
||||||
|
Ok(Some(data))
|
||||||
|
} else {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn write_file(dir: &Path, path: &str, data: Vec<u8>) -> anyhow::Result<()> {
|
||||||
|
let path = dir.join(path);
|
||||||
|
let mut file = File::create(path)?;
|
||||||
|
file.write_all(&data)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn read_s3(
|
||||||
|
client: &s3::Client,
|
||||||
|
bucket: &str,
|
||||||
|
path: &str,
|
||||||
|
) -> anyhow::Result<Option<Vec<u8>>> {
|
||||||
|
let output = client.get_object().bucket(bucket).key(path).send().await;
|
||||||
|
match output {
|
||||||
|
Ok(output) => {
|
||||||
|
let data = output
|
||||||
|
.body
|
||||||
|
.collect()
|
||||||
|
.await
|
||||||
|
.context(format!("error reading {} from bucket {}", path, bucket))?
|
||||||
|
.into_bytes()
|
||||||
|
.to_vec();
|
||||||
|
Ok(Some(data))
|
||||||
|
}
|
||||||
|
Err(e) if is_no_such_key_error(&e) => Ok(None),
|
||||||
|
Err(err) => Err(err)?,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn write_s3(
|
||||||
|
client: &s3::Client,
|
||||||
|
bucket: &str,
|
||||||
|
path: &str,
|
||||||
|
data: Vec<u8>,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
client
|
||||||
|
.put_object()
|
||||||
|
.bucket(bucket)
|
||||||
|
.key(path)
|
||||||
|
.body(s3::primitives::ByteStream::from(data))
|
||||||
|
.send()
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn write(&self, path: &str, data: Vec<u8>) -> anyhow::Result<()> {
|
||||||
|
match self {
|
||||||
|
LocalFiles { dir } => Self::write_file(dir, path, data),
|
||||||
|
S3 { client, bucket } => Self::write_s3(client, bucket, path, data).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn read(&self, path: &str) -> anyhow::Result<Option<Vec<u8>>> {
|
||||||
|
match self {
|
||||||
|
LocalFiles { dir } => Self::read_file(dir, path),
|
||||||
|
S3 { client, bucket } => Self::read_s3(client, bucket, path).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
use s3::error::SdkError;
|
||||||
|
use s3::operation::get_object::GetObjectError;
|
||||||
|
|
||||||
|
fn is_no_such_key_error<R>(err: &SdkError<GetObjectError, R>) -> bool {
|
||||||
|
match err {
|
||||||
|
SdkError::ServiceError(e) => matches!(e.err(), GetObjectError::NoSuchKey(_)),
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
use crate::classifier::Classifier;
|
||||||
|
|
||||||
|
pub async fn load_classifier(storage: &Storage) -> anyhow::Result<Classifier> {
|
||||||
|
match storage.read("model.json").await? {
|
||||||
|
Some(data) => Ok(serde_json::from_slice(&data)?),
|
||||||
|
None => Ok(Classifier::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
use crate::userdb::UserDb;
|
||||||
|
|
||||||
|
pub async fn load_userdb(storage: &Storage) -> anyhow::Result<Option<UserDb>> {
|
||||||
|
if let Some(data) = storage.read("db.json").await? {
|
||||||
|
Ok(Some(serde_json::from_slice(&data)?))
|
||||||
|
} else {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn store_userdb(storage: &Storage, userdb: &UserDb) -> anyhow::Result<()> {
|
||||||
|
storage
|
||||||
|
.write("db.json", serde_json::to_vec(userdb)?)
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
use crate::db::Db;
|
||||||
|
|
||||||
|
pub async fn store_db(storage: &Storage, db: &Db) -> anyhow::Result<()> {
|
||||||
|
let userdb_bytes = db.with_userdb(serde_json::to_vec)?;
|
||||||
|
let classifier_bytes = db.with_classifier(serde_json::to_vec)?;
|
||||||
|
storage.write("db.json", userdb_bytes).await?;
|
||||||
|
storage.write("model.json", classifier_bytes).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
169
src/userdb.rs
Normal file
169
src/userdb.rs
Normal file
|
@ -0,0 +1,169 @@
|
||||||
|
use crate::data::*;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::{hash_map, HashMap};
|
||||||
|
use std::fmt;
|
||||||
|
use std::time::SystemTime;
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone, Copy)]
|
||||||
|
pub enum IsSpam {
|
||||||
|
Legit,
|
||||||
|
Spam {
|
||||||
|
classified_at: SystemTime,
|
||||||
|
locked: bool,
|
||||||
|
notified: bool,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
impl IsSpam {
|
||||||
|
pub fn as_bool(&self) -> bool {
|
||||||
|
match self {
|
||||||
|
IsSpam::Legit => true,
|
||||||
|
IsSpam::Spam { .. } => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn from_bool(b: bool) -> IsSpam {
|
||||||
|
if b {
|
||||||
|
IsSpam::Spam {
|
||||||
|
classified_at: SystemTime::now(),
|
||||||
|
locked: false,
|
||||||
|
notified: false,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
IsSpam::Legit
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for IsSpam {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
match self {
|
||||||
|
IsSpam::Legit => write!(f, "legit"),
|
||||||
|
IsSpam::Spam { .. } => write!(f, "spam"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
pub struct UserDb {
|
||||||
|
users: HashMap<UserId, UserData>,
|
||||||
|
is_spam: HashMap<UserId, IsSpam>,
|
||||||
|
last_scrape: SystemTime,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl UserDb {
|
||||||
|
// Creating
|
||||||
|
|
||||||
|
pub fn from_users(
|
||||||
|
users: HashMap<UserId, UserData>,
|
||||||
|
is_spam: HashMap<UserId, IsSpam>,
|
||||||
|
last_scrape: SystemTime,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
users,
|
||||||
|
is_spam,
|
||||||
|
last_scrape,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reading
|
||||||
|
|
||||||
|
pub fn userdata(&self, uid: UserId) -> Option<&UserData> {
|
||||||
|
self.users.get(&uid)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_spam(&self, uid: UserId) -> Option<IsSpam> {
|
||||||
|
self.is_spam.get(&uid).copied()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn last_scrape(&self) -> SystemTime {
|
||||||
|
self.last_scrape
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn nb_users(&self) -> usize {
|
||||||
|
self.users.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn nb_classified(&self) -> usize {
|
||||||
|
self.is_spam.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn unclassified_users(&self) -> Vec<(UserId, &UserData)> {
|
||||||
|
self.users
|
||||||
|
.iter()
|
||||||
|
.filter(|(user_id, _)| !self.is_spam.contains_key(user_id))
|
||||||
|
.map(|(id, d)| (*id, d))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn classified_users(&self) -> Vec<(UserId, &UserData, IsSpam)> {
|
||||||
|
self.users
|
||||||
|
.iter()
|
||||||
|
.filter_map(|(user_id, user_data)| {
|
||||||
|
self.is_spam
|
||||||
|
.get(user_id)
|
||||||
|
.map(|is_spam| (user_id, user_data, *is_spam))
|
||||||
|
})
|
||||||
|
.map(|(id, d, s)| (*id, d, s))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Updating
|
||||||
|
|
||||||
|
pub fn set_spam(&mut self, uid: UserId, is_spam: Option<IsSpam>) {
|
||||||
|
match is_spam {
|
||||||
|
Some(is_spam) => self.is_spam.insert(uid, is_spam),
|
||||||
|
None => self.is_spam.remove(&uid),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn remove_user(&mut self, uid: UserId) {
|
||||||
|
self.users.remove(&uid);
|
||||||
|
self.is_spam.remove(&uid);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Internal helpers
|
||||||
|
|
||||||
|
// XXX remove?
|
||||||
|
// fn recompute_tokens_for(&mut self, uid: UserId) {
|
||||||
|
// self.tokens.insert(uid, self.users.get(&uid).unwrap().to_tokens());
|
||||||
|
// }
|
||||||
|
|
||||||
|
// fn recompute_tokens(&mut self) {
|
||||||
|
// for (id, user) in &self.users {
|
||||||
|
// self.tokens.insert(*id, user.to_tokens());
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
// fn recompute_scores(&mut self, classifier: &Classifier) {
|
||||||
|
// for (id, tokens) in &self.tokens {
|
||||||
|
// self.score.insert(*id, classifier.score(tokens));
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct Iter<'a> {
|
||||||
|
iter_users: hash_map::Iter<'a, UserId, UserData>,
|
||||||
|
is_spam: &'a HashMap<UserId, IsSpam>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Iterator for Iter<'a> {
|
||||||
|
type Item = (UserId, &'a UserData, Option<IsSpam>);
|
||||||
|
fn next(&mut self) -> Option<(UserId, &'a UserData, Option<IsSpam>)> {
|
||||||
|
self.iter_users.next().map(|(uid, udata)| {
|
||||||
|
let is_spam = self.is_spam.get(uid).copied();
|
||||||
|
(*uid, udata, is_spam)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> IntoIterator for &'a UserDb {
|
||||||
|
type Item = (UserId, &'a UserData, Option<IsSpam>);
|
||||||
|
type IntoIter = Iter<'a>;
|
||||||
|
fn into_iter(self) -> Iter<'a> {
|
||||||
|
Iter {
|
||||||
|
iter_users: self.users.iter(),
|
||||||
|
is_spam: &self.is_spam,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
180
src/workers.rs
180
src/workers.rs
|
@ -1,13 +1,14 @@
|
||||||
use crate::classifier::Classifier;
|
|
||||||
use crate::data::UserId;
|
use crate::data::UserId;
|
||||||
use crate::db::{Db, IsSpam};
|
use crate::db::Db;
|
||||||
use crate::email;
|
use crate::email;
|
||||||
use crate::scrape;
|
use crate::scrape;
|
||||||
|
use crate::userdb::{IsSpam, UserDb};
|
||||||
|
use crate::{storage, storage::Storage};
|
||||||
use anyhow::anyhow;
|
use anyhow::anyhow;
|
||||||
use forgejo_api::Forgejo;
|
use forgejo_api::Forgejo;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::path::Path;
|
use std::sync::Arc;
|
||||||
use std::sync::{Arc, Mutex};
|
use std::time::SystemTime;
|
||||||
|
|
||||||
use crate::FORGEJO_POLL_DELAY;
|
use crate::FORGEJO_POLL_DELAY;
|
||||||
use crate::GRACE_PERIOD;
|
use crate::GRACE_PERIOD;
|
||||||
|
@ -16,14 +17,9 @@ use crate::{GUESS_LEGIT_THRESHOLD, GUESS_SPAM_THRESHOLD};
|
||||||
|
|
||||||
// Worker to refresh user data by periodically polling Forgejo
|
// Worker to refresh user data by periodically polling Forgejo
|
||||||
|
|
||||||
async fn try_refresh_user_data(
|
async fn try_refresh_user_data(storage: &Storage, forge: &Forgejo, db: &Db) -> anyhow::Result<()> {
|
||||||
forge: &Forgejo,
|
|
||||||
db: Arc<Mutex<Db>>,
|
|
||||||
classifier: Arc<Mutex<Classifier>>,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
{
|
{
|
||||||
let db = &db.lock().unwrap();
|
let d = db.with_userdb(|udb| udb.last_scrape().elapsed())?;
|
||||||
let d = db.last_scrape.elapsed()?;
|
|
||||||
if d < FORGEJO_POLL_DELAY {
|
if d < FORGEJO_POLL_DELAY {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
@ -32,49 +28,51 @@ async fn try_refresh_user_data(
|
||||||
eprintln!("Fetching user data");
|
eprintln!("Fetching user data");
|
||||||
let users = scrape::get_user_data(forge).await?;
|
let users = scrape::get_user_data(forge).await?;
|
||||||
|
|
||||||
let db: &mut Db = &mut db.lock().unwrap();
|
{
|
||||||
let classifier = &classifier.lock().unwrap();
|
// NB: Some user accounts may have been deleted since last fetch (hopefully
|
||||||
|
// they were spammers).
|
||||||
|
// Such users will appear in the current [db] but not in the new [users].
|
||||||
|
// We don't want to keep them in the database, so we rebuild a fresh [db]
|
||||||
|
// containing only data for users who still exist.
|
||||||
|
|
||||||
// NB: Some user accounts may have been deleted since last fetch (hopefully
|
let mut newdb = UserDb::from_users(users, HashMap::new(), SystemTime::now());
|
||||||
// they were spammers).
|
|
||||||
// Such users will appear in the current [db] but not in the new [users].
|
|
||||||
// We don't want to keep them in the database, so we rebuild a fresh [db]
|
|
||||||
// containing only data for users who still exist.
|
|
||||||
|
|
||||||
let mut newdb = Db::from_users(users, HashMap::new(), classifier);
|
let users: Vec<(UserId, Vec<String>, String)> = newdb
|
||||||
|
.unclassified_users()
|
||||||
|
.iter()
|
||||||
|
.map(|(user_id, user_data)| (*user_id, user_data.to_tokens(), user_data.login.clone()))
|
||||||
|
.collect();
|
||||||
|
|
||||||
// Import spam classification from the previous Db
|
// Import spam classification from the previous Db.
|
||||||
for (&user_id, user_data) in &newdb.users {
|
// (Initially, all users are "unclassified" in newdb.)
|
||||||
let &score = newdb.score.get(&user_id).unwrap();
|
for (user_id, tokens, login) in users.into_iter() {
|
||||||
if let Some(&user_was_spam) = db.is_spam.get(&user_id) {
|
let score = db.with_classifier(|c| c.score(&tokens));
|
||||||
if (user_was_spam.as_bool() && score < GUESS_SPAM_THRESHOLD)
|
if let Some(user_was_spam) = db.with_userdb(|u| u.is_spam(user_id)) {
|
||||||
|| (!user_was_spam.as_bool() && score > GUESS_LEGIT_THRESHOLD)
|
if (user_was_spam.as_bool() && score < GUESS_SPAM_THRESHOLD)
|
||||||
{
|
|| (!user_was_spam.as_bool() && score > GUESS_LEGIT_THRESHOLD)
|
||||||
eprintln!(
|
{
|
||||||
"Score for user {} changed past threshold; discarding our current classification",
|
eprintln!(
|
||||||
user_data.login
|
"Score for user {login} changed past threshold; discarding our current classification",
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
newdb.is_spam.insert(user_id, user_was_spam);
|
newdb.set_spam(user_id, Some(user_was_spam));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// switch to [newdb]
|
||||||
|
db.replace_userdb(newdb);
|
||||||
}
|
}
|
||||||
|
|
||||||
// switch to [newdb]
|
let res = storage::store_db(storage, db).await;
|
||||||
let _ = std::mem::replace(db, newdb);
|
res.unwrap(); // FIXME
|
||||||
|
|
||||||
db.store_to_path(Path::new("db.json")).unwrap(); // FIXME
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn refresh_user_data(
|
pub async fn refresh_user_data(storage: Arc<Storage>, forge: Arc<Forgejo>, db: Db) {
|
||||||
forge: Arc<Forgejo>,
|
|
||||||
db: Arc<Mutex<Db>>,
|
|
||||||
classifier: Arc<Mutex<Classifier>>,
|
|
||||||
) {
|
|
||||||
loop {
|
loop {
|
||||||
if let Err(e) = try_refresh_user_data(&forge, db.clone(), classifier.clone()).await {
|
if let Err(e) = try_refresh_user_data(&storage, &forge, &db).await {
|
||||||
eprintln!("Error refreshing user data: {:?}", e);
|
eprintln!("Error refreshing user data: {:?}", e);
|
||||||
}
|
}
|
||||||
tokio::time::sleep(FORGEJO_POLL_DELAY.mul_f32(0.1)).await;
|
tokio::time::sleep(FORGEJO_POLL_DELAY.mul_f32(0.1)).await;
|
||||||
|
@ -100,15 +98,19 @@ async fn try_purge_account(config: &Config, forge: &Forgejo, login: &str) -> any
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn purge_spammer_accounts(config: Arc<Config>, forge: Arc<Forgejo>, db: Arc<Mutex<Db>>) {
|
pub async fn purge_spammer_accounts(
|
||||||
|
config: Arc<Config>,
|
||||||
|
storage: Arc<Storage>,
|
||||||
|
forge: Arc<Forgejo>,
|
||||||
|
db: Db,
|
||||||
|
) {
|
||||||
loop {
|
loop {
|
||||||
let mut classified_users = Vec::new();
|
let classified_users: Vec<_> = db.with_userdb(|u| {
|
||||||
{
|
u.classified_users()
|
||||||
let db = &db.lock().unwrap();
|
.into_iter()
|
||||||
for (id, user, is_spam) in db.classified_users() {
|
.map(|(user_id, user, is_spam)| (user_id, user.login.clone(), is_spam))
|
||||||
classified_users.push((id, user.login.clone(), is_spam));
|
.collect()
|
||||||
}
|
});
|
||||||
}
|
|
||||||
|
|
||||||
for (user_id, login, is_spam) in classified_users {
|
for (user_id, login, is_spam) in classified_users {
|
||||||
if let IsSpam::Spam {
|
if let IsSpam::Spam {
|
||||||
|
@ -141,12 +143,11 @@ pub async fn purge_spammer_accounts(config: Arc<Config>, forge: Arc<Forgejo>, db
|
||||||
eprintln!("Error while deleting spammer account {login}: {:?}", e)
|
eprintln!("Error while deleting spammer account {login}: {:?}", e)
|
||||||
} else {
|
} else {
|
||||||
eprintln!("Deleted spammer account {login}");
|
eprintln!("Deleted spammer account {login}");
|
||||||
let db = &mut db.lock().unwrap();
|
{
|
||||||
db.users.remove(&user_id);
|
db.remove_user(user_id);
|
||||||
db.is_spam.remove(&user_id);
|
}
|
||||||
db.score.remove(&user_id);
|
let res = storage::store_db(&storage, &db).await;
|
||||||
db.tokens.remove(&user_id);
|
res.unwrap(); // FIXME
|
||||||
db.store_to_path(Path::new("db.json")).unwrap(); // FIXME
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_ => (),
|
_ => (),
|
||||||
|
@ -193,22 +194,22 @@ async fn lock_user_account(forge: &Forgejo, username: &str) -> anyhow::Result<()
|
||||||
|
|
||||||
pub async fn try_lock_and_notify_user(
|
pub async fn try_lock_and_notify_user(
|
||||||
config: &Config,
|
config: &Config,
|
||||||
|
storage: &Storage,
|
||||||
forge: &Forgejo,
|
forge: &Forgejo,
|
||||||
db: Arc<Mutex<Db>>,
|
db: &Db,
|
||||||
user_id: UserId,
|
user_id: UserId,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let (login, email, is_spam) = {
|
let (login, email, is_spam) = db.with_userdb(|u| {
|
||||||
let db = &db.lock().unwrap();
|
let user = u.userdata(user_id).unwrap();
|
||||||
let user = db.users.get(&user_id).unwrap();
|
(user.login.clone(), user.email.clone(), u.is_spam(user_id))
|
||||||
let is_spam = match db.is_spam.get(&user_id) {
|
});
|
||||||
Some(IsSpam::Spam {
|
let is_spam = match is_spam{
|
||||||
classified_at,
|
Some(IsSpam::Spam {
|
||||||
locked,
|
classified_at,
|
||||||
notified,
|
locked,
|
||||||
}) => Some((*classified_at, *locked, *notified)),
|
notified,
|
||||||
_ => None,
|
}) => Some((classified_at, locked, notified)),
|
||||||
};
|
_ => None,
|
||||||
(user.login.clone(), user.email.clone(), is_spam)
|
|
||||||
};
|
};
|
||||||
|
|
||||||
if let Some((classified_at, locked, notified)) = is_spam {
|
if let Some((classified_at, locked, notified)) = is_spam {
|
||||||
|
@ -222,16 +223,16 @@ pub async fn try_lock_and_notify_user(
|
||||||
ActuallyBan::No => eprintln!("[Simulating: lock account of user {login}]"),
|
ActuallyBan::No => eprintln!("[Simulating: lock account of user {login}]"),
|
||||||
}
|
}
|
||||||
|
|
||||||
let db = &mut db.lock().unwrap();
|
db.set_spam(
|
||||||
db.is_spam.insert(
|
|
||||||
user_id,
|
user_id,
|
||||||
IsSpam::Spam {
|
Some(IsSpam::Spam {
|
||||||
classified_at,
|
classified_at,
|
||||||
locked: true,
|
locked: true,
|
||||||
notified,
|
notified,
|
||||||
},
|
}),
|
||||||
);
|
);
|
||||||
db.store_to_path(Path::new("db.json")).unwrap(); // FIXME
|
|
||||||
|
storage::store_db(storage, db).await.unwrap(); // FIXME
|
||||||
}
|
}
|
||||||
|
|
||||||
if !notified {
|
if !notified {
|
||||||
|
@ -245,16 +246,17 @@ pub async fn try_lock_and_notify_user(
|
||||||
eprintln!("[Simulating: send notification email to user {login}]")
|
eprintln!("[Simulating: send notification email to user {login}]")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let db = &mut db.lock().unwrap();
|
|
||||||
db.is_spam.insert(
|
db.set_spam(
|
||||||
user_id,
|
user_id,
|
||||||
IsSpam::Spam {
|
Some(IsSpam::Spam {
|
||||||
classified_at,
|
classified_at,
|
||||||
locked: true,
|
locked: true,
|
||||||
notified: true,
|
notified: true,
|
||||||
},
|
}),
|
||||||
);
|
);
|
||||||
db.store_to_path(Path::new("db.json")).unwrap(); // FIXME
|
|
||||||
|
storage::store_db(storage, db).await.unwrap(); // FIXME
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -266,19 +268,25 @@ pub async fn try_lock_and_notify_user(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn lock_and_notify_users(config: Arc<Config>, forge: Arc<Forgejo>, db: Arc<Mutex<Db>>) {
|
pub async fn lock_and_notify_users(
|
||||||
|
config: Arc<Config>,
|
||||||
|
storage: Arc<Storage>,
|
||||||
|
forge: Arc<Forgejo>,
|
||||||
|
db: Db,
|
||||||
|
) {
|
||||||
let mut spammers = Vec::new();
|
let mut spammers = Vec::new();
|
||||||
{
|
{
|
||||||
let db = &db.lock().unwrap();
|
db.with_userdb(|udb| {
|
||||||
for (id, user, is_spam) in db.classified_users() {
|
for (id, user, is_spam) in udb.classified_users() {
|
||||||
if is_spam.as_bool() {
|
if is_spam.as_bool() {
|
||||||
spammers.push((id, user.login.clone()))
|
spammers.push((id, user.login.clone()))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
for (user_id, login) in spammers {
|
for (user_id, login) in spammers {
|
||||||
try_lock_and_notify_user(&config, &forge, db.clone(), user_id)
|
try_lock_and_notify_user(&config, &storage, &forge, &db, user_id)
|
||||||
.await
|
.await
|
||||||
.unwrap_or_else(|err| eprintln!("Failed to lock or notify user {login}: {err}"));
|
.unwrap_or_else(|err| eprintln!("Failed to lock or notify user {login}: {err}"));
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue