Add a new /classified page that allows editing classifications

2024-12-18 13:22:56 +01:00 · 2024-12-18 13:22:56 +01:00 · b2406dd883
commit b2406dd883
parent 6a23483073
9 changed files with 315 additions and 183 deletions
--- a/README.md
+++ b/README.md
@ -17,7 +17,6 @@

 - take concrete actions for spam accounts: lock the account, send a warning
  email, then delete+purge account after some time.
- allow changing the classification of already-classified users
 - add backend to store data on garage instead of local files
 - replace the `api_token` file with a better mechanism: oauth maybe?
 - improve error handling 
--- a/model.json
+++ b/model.json
--- a/src/db.rs
+++ b/src/db.rs
@ -78,7 +78,8 @@ impl Db {

    pub fn store_to_path(&self, path: &Path) -> anyhow::Result<()> {
        let file = File::create(path)?;
-            (&self.users, &self.is_spam);
+        let dat: (&HashMap<UserId, UserData>, &HashMap<UserId, bool>, u64) =
+            (&self.users, &self.is_spam, self.last_scrape);
        serde_json::to_writer(BufWriter::new(file), &dat)?;
        Ok(())
    }
@ -89,4 +90,13 @@ impl Db {
            .filter(|(user_id, _)| !self.is_spam.contains_key(&user_id))
            .collect()
    }
+
+    /// Returns every user that has an entry in `is_spam`, paired with its
+    /// data and the recorded classification (`true` means spam).
+    pub fn classified_users<'a>(&'a self) -> Vec<(&'a UserId, &'a UserData, bool)> {
+        let mut classified = Vec::new();
+        for (user_id, user_data) in self.users.iter() {
+            // Users without a recorded classification are left out.
+            if let Some(is_spam) = self.is_spam.get(&user_id) {
+                classified.push((user_id, user_data, *is_spam));
+            }
+        }
+        classified
+    }
 }
--- a/src/main.rs
+++ b/src/main.rs
@ -2,7 +2,7 @@ use actix_web::{get, post, web, App, HttpResponse, HttpServer, Responder};
 use forgejo_api::{Auth, Forgejo};
 use lazy_static::lazy_static;
 use rand::prelude::*;
-use serde::Deserialize;
+use serde::{Serialize, Deserialize};
 use std::collections::HashMap;
 use std::fs::File;
 use std::path::Path;
@ -62,31 +62,57 @@ async fn load_db(forge: &Forgejo) -> anyhow::Result<(Db, Classifier)> {
    Ok((db, classifier))
 }

-fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)]) {
-    eprintln!("updating classifier");
-
+// XXX: This function looks like it is doing too many things at once.
+fn set_spam(db: &mut Db, classifier: &mut Classifier, ids: &[(UserId, bool)], overwrite: bool) {
    for (user_id, is_spam) in ids {
-        // Train classifier with tokens from the user
-        let tokens = db.tokens.get(user_id).unwrap();
-        if *is_spam {
-            classifier.train_spam(tokens);
-        } else {
-            classifier.train_ham(tokens);
-        }
+        let mut train_classifier = false;

        match db.is_spam.get(user_id) {
-            Some(b) if b != is_spam => {
-                // classification conflict between concurrent queries.
-                // In this case we play it safe and erase the classification for this user;
-                // it will need to be manually classified again.
+            Some(was_spam) if overwrite && was_spam != is_spam => {
+                eprintln!(
+                    "User {}: changing classification from {} to {}",
+                    db.users.get(user_id).unwrap().login,
+                    (if *was_spam { "spam" } else { "legit" }),
+                    (if *is_spam { "spam" } else { "legit" })
+                );
+                db.is_spam.insert(*user_id, *is_spam);
+                // This is somewhat hackish: we already trained the classifier
+                // on the previous classification, possibly with the same
+                // tokens.
+                // Ideally we would undo the previous training and train with
+                // the correct classification now, but the classifier has no way
+                // to easily undo a previous training (we don't know whether the
+                // tokens that we have now are the same as the one that were
+                // used previously).
+                train_classifier = true;
+            },
+            Some(was_spam) if !overwrite && was_spam != is_spam => {
+                // Classification conflict between concurrent queries.
+                // In this case we play it safe and discard the classification
+                // for this user; the user will need to be manually classified again.
                eprintln!(
                    "Classification conflict for user {}; discarding our current classification",
                    db.users.get(user_id).unwrap().login
                );
                db.is_spam.remove(user_id);
            },
-            _ => {
+            None => {
                db.is_spam.insert(*user_id, *is_spam);
+                train_classifier = true;
+            },
+            Some(was_spam) => {
+                assert!(was_spam == is_spam);
+                // nothing to do
+            }
+        }
+
+        if train_classifier {
+            // Train the classifier with tokens from the user
+            let tokens = db.tokens.get(user_id).unwrap();
+            if *is_spam {
+                classifier.train_spam(tokens);
+            } else {
+                classifier.train_ham(tokens);
            }
        }
    }
@ -117,13 +143,26 @@ struct SortSetting {
    sort: Option<String>,
 }

+/// Coarse three-level bucket of a spam score. The template interpolates the
+/// serialized value into the `score-…` CSS class of each user card.
+#[derive(Serialize, Deserialize)]
+enum ApproxScore {
+    Low,
+    Mid,
+    High,
+}
+
+/// Buckets `score` for feeding to the template: `Low` when it is at most
+/// `GUESS_LEGIT_THRESHOLD`, otherwise `Mid` when it is below
+/// `GUESS_SPAM_THRESHOLD`, otherwise `High`.
+fn approx_score(score: f32) -> ApproxScore {
+    match score {
+        s if s <= GUESS_LEGIT_THRESHOLD => ApproxScore::Low,
+        s if s < GUESS_SPAM_THRESHOLD => ApproxScore::Mid,
+        // Everything else, including NaN (both comparisons above are false).
+        _ => ApproxScore::High,
+    }
+}
+
 #[get("/")]
 async fn index(data: web::Data<AppState>, q: web::Query<SortSetting>) -> impl Responder {
    eprintln!("GET /");

    let db = &data.db.lock().unwrap();

-    eprintln!("scoring users...");
    let mut users: Vec<(&UserId, &UserData, f32)> = db
        .unclassified_users()
        .into_iter()
@ -131,10 +170,8 @@ async fn index(data: web::Data<AppState>, q: web::Query<SortSetting>) -> impl Re
        .collect();
    let mut rng = rand::thread_rng();

-    eprintln!("randomizing...");
    users.shuffle(&mut rng);

-    eprintln!("sorting...");
    let sorting_req = q.sort.as_ref().map(|s| s.as_str());
    match &sorting_req {
        // sort "legit first": by increasing score
@ -150,12 +187,16 @@ async fn index(data: web::Data<AppState>, q: web::Query<SortSetting>) -> impl Re
        users.sort_by_key(|(_, _, score)| 1000 - (score * 1000.) as u64)
    }

+    // compute the rough "spam score" (low/mid/high) and spam guess (true/false)
+    let users: Vec<(&UserId, &UserData, f32, ApproxScore, bool)> =
+        users.into_iter()
+        .map(|(id, u, score)| (id, u, score, approx_score(score), score >= GUESS_SPAM_THRESHOLD))
+        .collect();
+
    let users_count = db.users.len();
    let classified_count = db.is_spam.len();

    let mut context = tera::Context::new();
-    context.insert("spam_threshold", &GUESS_SPAM_THRESHOLD);
-    context.insert("legit_threshold", &GUESS_LEGIT_THRESHOLD);
    context.insert("users", &users);
    context.insert(
        "unclassified_users_count",
@ -168,8 +209,7 @@ async fn index(data: web::Data<AppState>, q: web::Query<SortSetting>) -> impl Re
    HttpResponse::Ok().body(page)
 }

-#[post("/")]
-async fn apply(data: web::Data<AppState>, req: web::Form<HashMap<i64, String>>) -> impl Responder {
+async fn post_classified(data: web::Data<AppState>, req: web::Form<HashMap<i64, String>>, overwrite: bool, redirect: &str) -> impl Responder {
    eprintln!("POST /");

    let db = &mut data.db.lock().unwrap();
@ -180,18 +220,56 @@ async fn apply(data: web::Data<AppState>, req: web::Form<HashMap<i64, String>>)
        .map(|(id, classification)| (UserId(*id), classification == "spam"))
        .collect();

-    set_spam(db, classifier, &updates);
+    set_spam(db, classifier, &updates, overwrite);

    db.store_to_path(Path::new("db.json")).unwrap(); // FIXME
    classifier
        .save(&mut File::create(Path::new("model.json")).unwrap(), false)
        .unwrap(); // FIXME

+    eprintln!("done");
    HttpResponse::SeeOther()
-        .insert_header(("Location", ""))
+        .insert_header(("Location", redirect))
        .finish()
 }

+/// Handles the form posted from the index page: forwards it to
+/// `post_classified` with overwriting disabled and `/` as the redirect target.
+#[post("/")]
+async fn post_classified_index(
+    data: web::Data<AppState>,
+    req: web::Form<HashMap<i64, String>>,
+) -> impl Responder {
+    let overwrite = false;
+    post_classified(data, req, overwrite, "/").await
+}
+
+/// Handles the form posted from the `/classified` page: forwards it to
+/// `post_classified` with overwriting enabled and `/classified` as the
+/// redirect target.
+#[post("/classified")]
+async fn post_classified_edit(
+    data: web::Data<AppState>,
+    req: web::Form<HashMap<i64, String>>,
+) -> impl Responder {
+    let overwrite = true;
+    post_classified(data, req, overwrite, "/classified").await
+}
+
+/// Renders the `/classified` page: every already-classified user with its
+/// stored score, approximate score bucket and current classification,
+/// ordered with the highest scores first.
+///
+/// Panics if the database lock is poisoned, if a classified user has no
+/// entry in `db.score`, or if the template fails to render.
+#[get("/classified")]
+async fn classified(data: web::Data<AppState>, _q: web::Query<SortSetting>) -> impl Responder {
+    eprintln!("GET /classified");
+
+    let db = &data.db.lock().unwrap();
+
+    // Pair each classified user with its stored score.
+    let mut scored: Vec<(&UserId, &UserData, f32, bool)> = Vec::new();
+    for (id, user, is_spam) in db.classified_users() {
+        scored.push((id, user, *db.score.get(id).unwrap(), is_spam));
+    }
+    // sort "spam first": the key shrinks as the score grows
+    scored.sort_by_key(|entry| 1000 - (entry.2 * 1000.) as u64);
+
+    // Attach the low/mid/high bucket consumed by the template.
+    let users = scored
+        .into_iter()
+        .map(|(id, user, score, is_spam)| (id, user, score, approx_score(score), is_spam))
+        .collect::<Vec<_>>();
+
+    let mut context = tera::Context::new();
+    context.insert("users", &users);
+    eprintln!("rendering template...");
+    let page = TEMPLATES.render("classified.html", &context).unwrap();
+    eprintln!("done");
+    HttpResponse::Ok().body(page)
+}
+
 #[actix_web::main]
 async fn main() -> std::io::Result<()> {
    eprintln!("Eval templates");
@ -219,7 +297,9 @@ async fn main() -> std::io::Result<()> {
            .service(actix_files::Files::new("/static/", "./static"))
            .app_data(st.clone())
            .service(index)
-            .service(apply)
+            .service(classified)
+            .service(post_classified_index)
+            .service(post_classified_edit)
    })
    .bind(("127.0.0.1", 8080))?
    .run()
--- a/static/style.css
+++ b/static/style.css
@ -0,0 +1,85 @@
+.main {
+    display: flex;
+    flex-direction: column;
+    gap: 30px;
+    align-items: center;
+}
+
+.users {
+    display: flex;
+    flex-direction: column;
+    gap: 15px;
+}
+
+.user {
+    display: flex;
+    flex-direction: row;
+    gap: 10px;
+    align-items: center;
+    border: 1px dotted #000;
+    padding: 3px 8px 3px 8px;
+    margin: 3px;
+}
+
+.user-card {
+    display: flex;
+    flex-direction: column;
+}
+
+.user-name {
+    display: flex;
+    flex-direction: row;
+    flex-wrap: wrap;
+    gap: 10px;
+}
+
+.user-info {
+    display: flex;
+    flex-direction: row;
+    flex-wrap: wrap;
+    gap: 10px;
+}
+
+.user-classification {
+    display: flex;
+    flex-direction: column;
+    gap: 3px;
+}
+
+input.radio-classify {
+    display: none;
+}
+input.radio-classify + label {
+    border: 1px solid #000;
+    padding: 2px;
+    text-align: center;
+}
+input.radio-spam:checked + label {
+    border: 1px solid #d00400;
+    background: #d00400;
+    color: #fff;
+}
+input.radio-legit:checked + label {
+    border: 1px solid #048e02;
+    background: #048e02;
+    color: #fff;
+}
+
+.score {
+    padding-left: 3px;
+    padding-right: 3px;
+    width: 2.8em;
+    text-align: center;
+    flex-grow: 0;
+    flex-shrink: 0;
+}
+
+.score-High {
+    background: #ff696b;
+}
+.score-Mid {
+    background: #ffa769;
+}
+.score-Low {
+    background: #5fd770;
+}
--- a/templates/base.html
+++ b/templates/base.html
@ -0,0 +1,16 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
+  <meta http-equiv="x-ua-compatible" content="ie=edge" />
+  <link rel="stylesheet" type="text/css" href="static/style.css" />
+  <title>{% block title %}{% endblock title %} - Forgejo Spam Admin</title>
+</head>
+<body>
+  <div class="main">
+    {% block content %}
+    {% endblock content %}
+  </div>
+</body>
+</html>
--- a/templates/classified.html
+++ b/templates/classified.html
@ -0,0 +1,29 @@
+{% import "ui.html" as ui %}
+{% extends "base.html" %}
+{% block title %}Classified{% endblock title %}
+
+{% block content %}
+  <div>
+    Editing classified users
+  </div>
+
+  <div>
+    <a href="/">Home</a>
+  </div>
+
+  <form method="post">
+  <div class="users">
+    {% for user_data in users %}
+      {{ ui::user_card(
+           user_id=user_data[0],
+           user=user_data[1],
+           score=user_data[2],
+           score_approx=user_data[3],
+           is_spam=user_data[4]
+         ) }}
+    {% endfor %}
+  </div>
+
+  <input type="submit" value="Apply" class="button" style="width: 200px; height: 30px"/>
+  </form>
+{% endblock content %}
--- a/templates/index.html
+++ b/templates/index.html
@ -1,170 +1,35 @@
-{% import "macros.html" as macros %}
-<!DOCTYPE html>
-<html lang="en">
-<head>
-  <meta charset="UTF-8" />
-  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
-  <meta http-equiv="x-ua-compatible" content="ie=edge" />
-  <title>Forgejo Spam Admin</title>
-</head>
-<style>
-  .main {
-      display: flex;
-      flex-direction: column;
-      gap: 30px;
-      align-items: center;
-  }
-  
-  .users {
-      display: flex;
-      flex-direction: column;
-      gap: 15px;
-  }
-  
-  .user {
-      display: flex;
-      flex-direction: row;
-      gap: 10px;
-      align-items: center;
-      border: 1px dotted #000;
-      padding: 3px 8px 3px 8px;
-      margin: 3px;
-  }
-  
-  .user-card {
-      display: flex;
-      flex-direction: column;
-  }
-  
-  .user-name {
-      display: flex;
-      flex-direction: row;
-      flex-wrap: wrap;
-      gap: 10px;
-  }
+{% import "ui.html" as ui %}
+{% extends "base.html" %}
+{% block title %}Home{% endblock title %}

-  .user-info {
-      display: flex;
-      flex-direction: row;
-      flex-wrap: wrap;
-      gap: 10px;
-  }
-
-  .user-classification {
-      display: flex;
-      flex-direction: column;
-      gap: 3px;
-  }
-
-  input.radio-classify {
-      display: none;
-  }
-  input.radio-classify + label {
-      border: 1px solid #000;
-      padding: 2px;
-      text-align: center;
-  }
-  input.radio-spam:checked + label {
-      border: 1px solid #d00400;
-      background: #d00400;
-      color: #fff;
-  }
-  input.radio-legit:checked + label {
-      border: 1px solid #048e02;
-      background: #048e02;
-      color: #fff;
-  }
-
-  .score {
-      padding-left: 3px;
-      padding-right: 3px;
-      width: 2.8em;
-      text-align: center;
-      flex-grow: 0;
-      flex-shrink: 0;
-  }
-
-  .score-high {
-      background: #ff696b;
-  }
-  .score-mid {
-      background: #ffa769;
-  }
-  .score-low {
-      background: #5fd770;
-  }
-</style>
-<body>
-  <div class="main">
+{% block content %}
  <div class="stats">
    Users: unclassified: {{unclassified_users_count}} | total: {{total_users_count}}
  </div>
+
+  <div>
+    <a href="/classified">Edit classified users</a>
+  </div>
+
  <div class="sort-options">
    <a href="/?sort=spam">Sort: Spam first</a> |
    <a href="/?sort=legit">Sort: Legit first</a> |
    <a href="/?sort=random">Sort: Random</a>
  </div>
+
  <form method="post">
  <div class="users">
-    {% for id_user_score in users %}
-    {% set user_id = id_user_score[0] %}
-    {% set user = id_user_score[1] %}
-    {% set score = id_user_score[2] %}
-    <div class="user">
-      <div class="user-classification">
-        <input type="radio" name="{{user_id}}" id="{{user_id}}-spam" value="spam"
-               class="radio-classify radio-spam"
-               {% if score >= 0.8 %}checked{% endif %}
-        />
-        <label for="{{user_id}}-spam">Spam</label>
-        <input type="radio" name="{{user_id}}" id="{{user_id}}-legit" value="legit"
-               class="radio-classify radio-legit"
-               {% if score < 0.8 %}checked{% endif %}
-        />
-        <label for="{{user_id}}-legit">Legit</label>
-      </div>
-      <div class="score
-                  {% if score >= spam_threshold %} score-high {% endif %}
-                  {% if score < spam_threshold and score > legit_threshold %} score-mid {% endif %}
-                  {% if score <= legit_threshold %} score-low {% endif %}
-                  ">
-        {{ score | round(precision=2) }}
-      </div>
-      <div class="user-card">
-        <div class="user-name">
-          <div><strong><a href="https://git.deuxfleurs.fr/{{user.login}}">{{ user.login }}</a></strong></div>
-          {%- if user.full_name %}<div><strong>({{ user.full_name }})</strong></div>{% endif -%}
-        </div>
-        <div class="user-info">
-          {%- if user.location %}<div>[L] {{ user.location }}</div>{% endif -%}
-          {%- if user.website %}<div>[W] {{ user.website }}</div>{% endif -%}
-        </div>
-        {%- if user.description %}<div>[D] {{ user.description }}</div>{% endif -%}
-        {%- if user.repos | length > 0 %}
-        <div class="user-repos">
-          <div>Repositories:</div>
-          {% for repo in user.repos %}
-          <div>{{ macros::compact(name=repo[1].name, desc=repo[1].description) }}</div>
-          {% endfor %}
-        </div>
-        {% endif -%}
-        {%- if user.issues | length > 0 %}
-        <div class="user-issues">
-          <div>Issues:</div>
-          {% for issue in user.issues %}
-          <div>{{ macros::compact(name=issue[1].title, desc=issue[1].body) }}</div>
-          {% endfor %}
-        </div>
-        {% endif -%}
-      </div>
-    </div>
+    {% for user_data in users %}
+      {{ ui::user_card(
+           user_id=user_data[0],
+           user=user_data[1],
+           score=user_data[2],
+           score_approx=user_data[3],
+           is_spam=user_data[4]
+         ) }}
    {% endfor %}
-
  </div>

  <input type="submit" value="Apply" class="button" style="width: 200px; height: 30px"/>
  </form>
-  </div>
-
-</body>
-</html>
+{% endblock content %}
--- a/templates/ui.html
+++ b/templates/ui.html
@ -0,0 +1,48 @@
+{% import "macros.html" as macros %}
+
+{% macro user_card(user_id, user, score, score_approx, is_spam) %}
+  <div class="user">
+    <div class="user-classification">
+      <input type="radio" name="{{user_id}}" id="{{user_id}}-spam" value="spam"
+             class="radio-classify radio-spam"
+             {% if is_spam %}checked{% endif %}
+      />
+      <label for="{{user_id}}-spam">Spam</label>
+      <input type="radio" name="{{user_id}}" id="{{user_id}}-legit" value="legit"
+             class="radio-classify radio-legit"
+             {% if not is_spam %}checked{% endif %}
+      />
+      <label for="{{user_id}}-legit">Legit</label>
+    </div>
+    <div class="score score-{{score_approx}}">
+      {{ score | round(precision=2) }}
+    </div>
+    <div class="user-card">
+      <div class="user-name">
+        <div><strong><a href="https://git.deuxfleurs.fr/{{user.login}}">{{ user.login }}</a></strong></div>
+        {%- if user.full_name %}<div><strong>({{ user.full_name }})</strong></div>{% endif -%}
+      </div>
+      <div class="user-info">
+        {%- if user.location %}<div>[L] {{ user.location }}</div>{% endif -%}
+        {%- if user.website %}<div>[W] {{ user.website }}</div>{% endif -%}
+      </div>
+      {%- if user.description %}<div>[D] {{ user.description }}</div>{% endif -%}
+      {%- if user.repos | length > 0 %}
+      <div class="user-repos">
+        <div>Repositories:</div>
+        {% for repo in user.repos %}
+        <div>{{ macros::compact(name=repo[1].name, desc=repo[1].description) }}</div>
+        {% endfor %}
+      </div>
+      {% endif -%}
+      {%- if user.issues | length > 0 %}
+      <div class="user-issues">
+        <div>Issues:</div>
+        {% for issue in user.issues %}
+        <div>{{ macros::compact(name=issue[1].title, desc=issue[1].body) }}</div>
+        {% endfor %}
+      </div>
+      {% endif -%}
+    </div>
+  </div>
+{% endmacro user_card %}