Add a new /classified page that allows editing classifications

This commit is contained in:
Armaël Guéneau 2024-12-18 13:22:56 +01:00
parent 6a23483073
commit b2406dd883
9 changed files with 315 additions and 183 deletions

View file

@ -17,7 +17,6 @@
- take concrete actions for spam accounts: lock the account, send a warning
email, then delete+purge account after some time.
- allow changing the classification of already-classified users
- add backend to store data on garage instead of local files
- replace the `api_token` file with a better mechanism: oauth maybe?
- improve error handling

File diff suppressed because one or more lines are too long

View file

@ -78,7 +78,8 @@ impl Db {
/// Serializes the database to `path` as JSON.
///
/// The file is created (or truncated if it already exists) and receives a
/// single JSON array holding, in order, the `users` map, the `is_spam` map
/// and the `last_scrape` value.
///
/// # Errors
///
/// Returns an error if the file cannot be created, if serialization fails,
/// or if the buffered output cannot be flushed.
pub fn store_to_path(&self, path: &Path) -> anyhow::Result<()> {
    let file = File::create(path)?;
    let dat: (&HashMap<UserId, UserData>, &HashMap<UserId, bool>, u64) =
        (&self.users, &self.is_spam, self.last_scrape);
    let mut writer = BufWriter::new(file);
    serde_json::to_writer(&mut writer, &dat)?;
    // Flush explicitly: a `BufWriter` that is merely dropped ignores any
    // error raised while writing out its remaining buffered bytes, which
    // would let a truncated file go unnoticed.
    std::io::Write::flush(&mut writer)?;
    Ok(())
}
@ -89,4 +90,13 @@ impl Db {
.filter(|(user_id, _)| !self.is_spam.contains_key(&user_id))
.collect()
}
/// Returns every user that already has a classification, as
/// `(id, data, is_spam)` triples.
///
/// Users without an entry in `is_spam` are left out. The result follows the
/// iteration order of the `users` map.
pub fn classified_users<'a>(&'a self) -> Vec<(&'a UserId, &'a UserData, bool)> {
    let mut classified = Vec::new();
    for (id, data) in &self.users {
        // Only keep users for which a verdict has been recorded.
        if let Some(&verdict) = self.is_spam.get(id) {
            classified.push((id, data, verdict));
        }
    }
    classified
}
}

View file

@ -2,7 +2,7 @@ use actix_web::{get, post, web, App, HttpResponse, HttpServer, Responder};
use forgejo_api::{Auth, Forgejo};
use lazy_static::lazy_static;
use rand::prelude::*;
use serde::Deserialize;
use serde::{Serialize, Deserialize};
use std::collections::HashMap;
use std::fs::File;
use std::path::Path;
@ -62,31 +62,57 @@ async fn load_db(forge: &Forgejo) -> anyhow::Result<(Db, Classifier)> {
Ok((db, classifier))
}
fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)]) {
eprintln!("updating classifier");
// XXX: This function looks like it is doing too many things at once.
fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)], overwrite: bool) {
for (user_id, is_spam) in ids {
// Train classifier with tokens from the user
let tokens = db.tokens.get(user_id).unwrap();
if *is_spam {
classifier.train_spam(tokens);
} else {
classifier.train_ham(tokens);
}
let mut train_classifier = false;
match db.is_spam.get(user_id) {
Some(b) if b != is_spam => {
// classification conflict between concurrent queries.
// In this case we play it safe and erase the classification for this user;
// it will need to be manually classified again.
Some(was_spam) if overwrite && was_spam != is_spam => {
eprintln!(
"User {}: changing classification from {} to {}",
db.users.get(user_id).unwrap().login,
(if *was_spam { "spam" } else { "legit" }),
(if *is_spam { "spam" } else { "legit" })
);
db.is_spam.insert(*user_id, *is_spam);
// This is somewhat hackish: we already trained the classifier
// on the previous classification, possibly with the same
// tokens.
// Ideally we would undo the previous training and train with
// the correct classification now, but the classifier has no way
// to easily undo a previous training (we don't know whether the
// tokens that we have now are the same as the one that were
// used previously).
train_classifier = true;
},
Some(was_spam) if !overwrite && was_spam != is_spam => {
// Classification conflict between concurrent queries.
// In this case we play it safe and discard the classification
// for this user; the user will need to be manually classified again.
eprintln!(
"Classification conflict for user {}; discarding our current classification",
db.users.get(user_id).unwrap().login
);
db.is_spam.remove(user_id);
},
_ => {
None => {
db.is_spam.insert(*user_id, *is_spam);
train_classifier = true;
},
Some(was_spam) => {
assert!(was_spam == is_spam);
// nothing to do
}
}
if train_classifier {
// Train the classifier with tokens from the user
let tokens = db.tokens.get(user_id).unwrap();
if *is_spam {
classifier.train_spam(tokens);
} else {
classifier.train_ham(tokens);
}
}
}
@ -117,13 +143,26 @@ struct SortSetting {
sort: Option<String>,
}
/// Coarse three-level bucket for a user's spam score.
///
/// NOTE(review): once serialized for the templates, the variant names appear
/// to double as CSS class suffixes (`score-Low`, `score-Mid`, `score-High`),
/// so renaming a variant likely requires updating the stylesheet as well.
#[derive(Serialize, Deserialize)]
enum ApproxScore { Low, Mid, High }
/// Buckets a raw score into a coarse [`ApproxScore`], for feeding to the
/// template.
///
/// Scores at or below `GUESS_LEGIT_THRESHOLD` are `Low`, remaining scores
/// strictly below `GUESS_SPAM_THRESHOLD` are `Mid`, and everything else is
/// `High`.
fn approx_score(score: f32) -> ApproxScore {
    match score {
        s if s <= GUESS_LEGIT_THRESHOLD => ApproxScore::Low,
        s if s < GUESS_SPAM_THRESHOLD => ApproxScore::Mid,
        _ => ApproxScore::High,
    }
}
#[get("/")]
async fn index(data: web::Data<AppState>, q: web::Query<SortSetting>) -> impl Responder {
eprintln!("GET /");
let db = &data.db.lock().unwrap();
eprintln!("scoring users...");
let mut users: Vec<(&UserId, &UserData, f32)> = db
.unclassified_users()
.into_iter()
@ -131,10 +170,8 @@ async fn index(data: web::Data<AppState>, q: web::Query<SortSetting>) -> impl Re
.collect();
let mut rng = rand::thread_rng();
eprintln!("randomizing...");
users.shuffle(&mut rng);
eprintln!("sorting...");
let sorting_req = q.sort.as_ref().map(|s| s.as_str());
match &sorting_req {
// sort "legit first": by increasing score
@ -150,12 +187,16 @@ async fn index(data: web::Data<AppState>, q: web::Query<SortSetting>) -> impl Re
users.sort_by_key(|(_, _, score)| 1000 - (score * 1000.) as u64)
}
// compute the rough "spam score" (low/mid/high) and spam guess (true/false)
let users: Vec<(&UserId, &UserData, f32, ApproxScore, bool)> =
users.into_iter()
.map(|(id, u, score)| (id, u, score, approx_score(score), score >= GUESS_SPAM_THRESHOLD))
.collect();
let users_count = db.users.len();
let classified_count = db.is_spam.len();
let mut context = tera::Context::new();
context.insert("spam_threshold", &GUESS_SPAM_THRESHOLD);
context.insert("legit_threshold", &GUESS_LEGIT_THRESHOLD);
context.insert("users", &users);
context.insert(
"unclassified_users_count",
@ -168,8 +209,7 @@ async fn index(data: web::Data<AppState>, q: web::Query<SortSetting>) -> impl Re
HttpResponse::Ok().body(page)
}
#[post("/")]
async fn apply(data: web::Data<AppState>, req: web::Form<HashMap<i64, String>>) -> impl Responder {
async fn post_classified(data: web::Data<AppState>, req: web::Form<HashMap<i64, String>>, overwrite: bool, redirect: &str) -> impl Responder {
eprintln!("POST /");
let db = &mut data.db.lock().unwrap();
@ -180,18 +220,56 @@ async fn apply(data: web::Data<AppState>, req: web::Form<HashMap<i64, String>>)
.map(|(id, classification)| (UserId(*id), classification == "spam"))
.collect();
set_spam(db, classifier, &updates);
set_spam(db, classifier, &updates, overwrite);
db.store_to_path(Path::new("db.json")).unwrap(); // FIXME
classifier
.save(&mut File::create(Path::new("model.json")).unwrap(), false)
.unwrap(); // FIXME
eprintln!("done");
HttpResponse::SeeOther()
.insert_header(("Location", ""))
.insert_header(("Location", redirect))
.finish()
}
/// `POST /`: handles the classification form submitted from the home page.
///
/// Delegates to `post_classified` with overwriting disabled and redirects
/// back to `/` afterwards.
#[post("/")]
async fn post_classified_index(
    data: web::Data<AppState>,
    req: web::Form<HashMap<i64, String>>,
) -> impl Responder {
    let overwrite = false;
    let redirect = "/";
    post_classified(data, req, overwrite, redirect).await
}
/// `POST /classified`: handles the form submitted from the "edit classified
/// users" page.
///
/// Delegates to `post_classified` with overwriting enabled and redirects
/// back to `/classified` afterwards.
#[post("/classified")]
async fn post_classified_edit(
    data: web::Data<AppState>,
    req: web::Form<HashMap<i64, String>>,
) -> impl Responder {
    let overwrite = true;
    let redirect = "/classified";
    post_classified(data, req, overwrite, redirect).await
}
/// `GET /classified`: renders the page listing already-classified users so
/// that their classification can be edited.
///
/// Users are listed by decreasing score ("spam first"). Each entry handed to
/// the `classified.html` template is a
/// `(id, user data, score, approximate score, is_spam)` tuple.
///
/// # Panics
///
/// Panics if `data.db.lock()` fails, if a classified user has no entry in
/// `db.score`, or if the template fails to render.
#[get("/classified")]
async fn classified(data: web::Data<AppState>, _q: web::Query<SortSetting>) -> impl Responder {
    eprintln!("GET /classified");
    let db = &data.db.lock().unwrap();

    let mut users: Vec<(&UserId, &UserData, f32, bool)> = db
        .classified_users()
        .into_iter()
        .map(|(id, u, s)| (id, u, *db.score.get(id).unwrap(), s))
        .collect();

    // sort "spam first"
    // `saturating_sub` keeps the key well-defined should a score ever exceed
    // 1.0: a plain `1000 - x` would underflow (panic in debug builds, wrap
    // around and mis-sort in release builds). Keys for scores within [0, 1]
    // are unchanged.
    users.sort_by_key(|(_, _, score, _)| 1000u64.saturating_sub((*score * 1000.) as u64));

    // attach the rough low/mid/high score expected by the template
    let users: Vec<_> = users
        .into_iter()
        .map(|(id, u, score, is_spam)| (id, u, score, approx_score(score), is_spam))
        .collect();

    let mut context = tera::Context::new();
    context.insert("users", &users);

    eprintln!("rendering template...");
    let page = TEMPLATES.render("classified.html", &context).unwrap();
    eprintln!("done");
    HttpResponse::Ok().body(page)
}
#[actix_web::main]
async fn main() -> std::io::Result<()> {
eprintln!("Eval templates");
@ -219,7 +297,9 @@ async fn main() -> std::io::Result<()> {
.service(actix_files::Files::new("/static/", "./static"))
.app_data(st.clone())
.service(index)
.service(apply)
.service(classified)
.service(post_classified_index)
.service(post_classified_edit)
})
.bind(("127.0.0.1", 8080))?
.run()

85
static/style.css Normal file
View file

@ -0,0 +1,85 @@
/* Page layout: a single centered column. */
.main {
    display: flex;
    flex-direction: column;
    align-items: center;
    gap: 30px;
}

/* Vertical list of user entries. */
.users {
    display: flex;
    flex-direction: column;
    gap: 15px;
}

/* One user entry: classification buttons, score badge, then the card. */
.user {
    display: flex;
    flex-direction: row;
    align-items: center;
    gap: 10px;
    margin: 3px;
    padding: 3px 8px;
    border: 1px dotted #000;
}

.user-card {
    display: flex;
    flex-direction: column;
}

/* Name line and info line share the same wrapping row layout. */
.user-name,
.user-info {
    display: flex;
    flex-flow: row wrap;
    gap: 10px;
}

.user-classification {
    display: flex;
    flex-direction: column;
    gap: 3px;
}

/* The radio inputs are hidden; their labels act as toggle buttons. */
input.radio-classify {
    display: none;
}

input.radio-classify + label {
    padding: 2px;
    border: 1px solid #000;
    text-align: center;
}

input.radio-spam:checked + label {
    color: #fff;
    background: #d00400;
    border: 1px solid #d00400;
}

input.radio-legit:checked + label {
    color: #fff;
    background: #048e02;
    border: 1px solid #048e02;
}

/* Fixed-width score badge. */
.score {
    width: 2.8em;
    padding-left: 3px;
    padding-right: 3px;
    text-align: center;
    flex-grow: 0;
    flex-shrink: 0;
}

/* Badge colours; the suffix is the approximate score passed to the template. */
.score-High {
    background: #ff696b;
}

.score-Mid {
    background: #ffa769;
}

.score-Low {
    background: #5fd770;
}

16
templates/base.html Normal file
View file

@ -0,0 +1,16 @@
<!DOCTYPE html>
{# Base layout. Child templates fill in the `title` block (shown before
   " - Forgejo Spam Admin" in the page title) and the `content` block
   (rendered inside the `.main` container). #}
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
<meta http-equiv="x-ua-compatible" content="ie=edge" />
<link rel="stylesheet" type="text/css" href="static/style.css" />
<title>{% block title %}{% endblock title %} - Forgejo Spam Admin</title>
</head>
<body>
<div class="main">
{% block content %}
{% endblock content %}
</div>
</body>
</html>

29
templates/classified.html Normal file
View file

@ -0,0 +1,29 @@
{% import "ui.html" as ui %}
{% extends "base.html" %}
{% block title %}Classified{% endblock title %}
{# Page for editing already-classified users. Expects `users` in the
   context: a list whose entries are indexed as
   [0] user id, [1] user data, [2] score, [3] approximate score, [4] is_spam.
   The form has no `action`, so it posts back to the URL of this page. #}
{% block content %}
<div>
Editing classified users
</div>
<div>
<a href="/">Home</a>
</div>
<form method="post">
<div class="users">
{% for user_data in users %}
{{ ui::user_card(
user_id=user_data[0],
user=user_data[1],
score=user_data[2],
score_approx=user_data[3],
is_spam=user_data[4]
) }}
{% endfor %}
</div>
<input type="submit" value="Apply" class="button" style="width: 200px; height: 30px"/>
</form>
{% endblock content %}

View file

@ -1,170 +1,35 @@
{% import "macros.html" as macros %}
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
<meta http-equiv="x-ua-compatible" content="ie=edge" />
<title>Forgejo Spam Admin</title>
</head>
<style>
.main {
display: flex;
flex-direction: column;
gap: 30px;
align-items: center;
}
.users {
display: flex;
flex-direction: column;
gap: 15px;
}
.user {
display: flex;
flex-direction: row;
gap: 10px;
align-items: center;
border: 1px dotted #000;
padding: 3px 8px 3px 8px;
margin: 3px;
}
.user-card {
display: flex;
flex-direction: column;
}
.user-name {
display: flex;
flex-direction: row;
flex-wrap: wrap;
gap: 10px;
}
{% import "ui.html" as ui %}
{% extends "base.html" %}
{% block title %}Home{% endblock title %}
.user-info {
display: flex;
flex-direction: row;
flex-wrap: wrap;
gap: 10px;
}
.user-classification {
display: flex;
flex-direction: column;
gap: 3px;
}
input.radio-classify {
display: none;
}
input.radio-classify + label {
border: 1px solid #000;
padding: 2px;
text-align: center;
}
input.radio-spam:checked + label {
border: 1px solid #d00400;
background: #d00400;
color: #fff;
}
input.radio-legit:checked + label {
border: 1px solid #048e02;
background: #048e02;
color: #fff;
}
.score {
padding-left: 3px;
padding-right: 3px;
width: 2.8em;
text-align: center;
flex-grow: 0;
flex-shrink: 0;
}
.score-high {
background: #ff696b;
}
.score-mid {
background: #ffa769;
}
.score-low {
background: #5fd770;
}
</style>
<body>
<div class="main">
{% block content %}
<div class="stats">
Users: unclassified: {{unclassified_users_count}} | total: {{total_users_count}}
</div>
<div>
<a href="/classified">Edit classified users</a>
</div>
<div class="sort-options">
<a href="/?sort=spam">Sort: Spam first</a> |
<a href="/?sort=legit">Sort: Legit first</a> |
<a href="/?sort=random">Sort: Random</a>
</div>
<form method="post">
<div class="users">
{% for id_user_score in users %}
{% set user_id = id_user_score[0] %}
{% set user = id_user_score[1] %}
{% set score = id_user_score[2] %}
<div class="user">
<div class="user-classification">
<input type="radio" name="{{user_id}}" id="{{user_id}}-spam" value="spam"
class="radio-classify radio-spam"
{% if score >= 0.8 %}checked{% endif %}
/>
<label for="{{user_id}}-spam">Spam</label>
<input type="radio" name="{{user_id}}" id="{{user_id}}-legit" value="legit"
class="radio-classify radio-legit"
{% if score < 0.8 %}checked{% endif %}
/>
<label for="{{user_id}}-legit">Legit</label>
</div>
<div class="score
{% if score >= spam_threshold %} score-high {% endif %}
{% if score < spam_threshold and score > legit_threshold %} score-mid {% endif %}
{% if score <= legit_threshold %} score-low {% endif %}
">
{{ score | round(precision=2) }}
</div>
<div class="user-card">
<div class="user-name">
<div><strong><a href="https://git.deuxfleurs.fr/{{user.login}}">{{ user.login }}</a></strong></div>
{%- if user.full_name %}<div><strong>({{ user.full_name }})</strong></div>{% endif -%}
</div>
<div class="user-info">
{%- if user.location %}<div>[L] {{ user.location }}</div>{% endif -%}
{%- if user.website %}<div>[W] {{ user.website }}</div>{% endif -%}
</div>
{%- if user.description %}<div>[D] {{ user.description }}</div>{% endif -%}
{%- if user.repos | length > 0 %}
<div class="user-repos">
<div>Repositories:</div>
{% for repo in user.repos %}
<div>{{ macros::compact(name=repo[1].name, desc=repo[1].description) }}</div>
{% endfor %}
</div>
{% endif -%}
{%- if user.issues | length > 0 %}
<div class="user-issues">
<div>Issues:</div>
{% for issue in user.issues %}
<div>{{ macros::compact(name=issue[1].title, desc=issue[1].body) }}</div>
{% endfor %}
</div>
{% endif -%}
</div>
</div>
{% for user_data in users %}
{{ ui::user_card(
user_id=user_data[0],
user=user_data[1],
score=user_data[2],
score_approx=user_data[3],
is_spam=user_data[4]
) }}
{% endfor %}
</div>
<input type="submit" value="Apply" class="button" style="width: 200px; height: 30px"/>
</form>
</div>
</body>
</html>
{% endblock content %}

48
templates/ui.html Normal file
View file

@ -0,0 +1,48 @@
{% import "macros.html" as macros %}
{# Renders one user entry.
   - user_id: used as the name of the radio group and as the prefix of the
     radio input ids.
   - user: user data; reads login, full_name, location, website, description,
     repos and issues.
   - score: numeric score, displayed rounded to 2 decimals.
   - score_approx: appended to "score-" to form the CSS class of the badge.
   - is_spam: when true the "Spam" radio is pre-checked, otherwise "Legit". #}
{% macro user_card(user_id, user, score, score_approx, is_spam) %}
<div class="user">
<div class="user-classification">
<input type="radio" name="{{user_id}}" id="{{user_id}}-spam" value="spam"
class="radio-classify radio-spam"
{% if is_spam %}checked{% endif %}
/>
<label for="{{user_id}}-spam">Spam</label>
<input type="radio" name="{{user_id}}" id="{{user_id}}-legit" value="legit"
class="radio-classify radio-legit"
{% if not is_spam %}checked{% endif %}
/>
<label for="{{user_id}}-legit">Legit</label>
</div>
<div class="score score-{{score_approx}}">
{{ score | round(precision=2) }}
</div>
<div class="user-card">
<div class="user-name">
<div><strong><a href="https://git.deuxfleurs.fr/{{user.login}}">{{ user.login }}</a></strong></div>
{%- if user.full_name %}<div><strong>({{ user.full_name }})</strong></div>{% endif -%}
</div>
<div class="user-info">
{%- if user.location %}<div>[L] {{ user.location }}</div>{% endif -%}
{%- if user.website %}<div>[W] {{ user.website }}</div>{% endif -%}
</div>
{%- if user.description %}<div>[D] {{ user.description }}</div>{% endif -%}
{%- if user.repos | length > 0 %}
<div class="user-repos">
<div>Repositories:</div>
{% for repo in user.repos %}
<div>{{ macros::compact(name=repo[1].name, desc=repo[1].description) }}</div>
{% endfor %}
</div>
{% endif -%}
{%- if user.issues | length > 0 %}
<div class="user-issues">
<div>Issues:</div>
{% for issue in user.issues %}
<div>{{ macros::compact(name=issue[1].title, desc=issue[1].body) }}</div>
{% endfor %}
</div>
{% endif -%}
</div>
</div>
{% endmacro user_card %}