basic scraping & CLI classification

Armaël Guéneau 2024-11-18 17:12:07 +01:00
commit 3f4f93826c
5 changed files with 2185 additions and 0 deletions

.gitignore vendored Normal file (+1)

@@ -0,0 +1 @@
/target

Cargo.lock generated Normal file (+1925)

File diff suppressed because it is too large

Cargo.toml Normal file (+15)

@@ -0,0 +1,15 @@
[package]
name = "forgejo-antispam"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
tokio = { version = "1", features = ["full"] }
reqwest = { version = "0.12", features = ["json"] }
serde = { version = "1", features = ["derive"] }
forgejo-api = "0.4"
url = "2"
anyhow = "1.0.93"
bayespam = "1.1.0"

model.json Normal file (+1)

File diff suppressed because one or more lines are too long

src/main.rs Normal file (+243)

@@ -0,0 +1,243 @@
use std::collections::HashMap;
use std::fs::File;
use std::path::Path;

use bayespam::classifier::Classifier;
use forgejo_api::structs::{RepoSearchQuery, Repository, User, UserSearchQuery};
use forgejo_api::{Auth, Forgejo};
use serde::{Deserialize, Serialize};
use tokio::time::{sleep, Duration};
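
// Newtype wrappers around Forgejo's numeric ids, so that user ids and repo
// ids cannot be accidentally mixed up.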
#[derive(Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
struct RepoId(i64);

#[derive(Debug)]
struct RepoData {
    name: String,
    description: Option<String>,
}

#[derive(Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
struct UserId(i64);

#[derive(Debug)]
struct UserData {
    // login: String,
    email: String,
    location: Option<String>,
    website: Option<String>,
    description: Option<String>,
    repos: Vec<(RepoId, RepoData)>,
}

#[derive(Debug, Serialize, Deserialize)]
enum Classification {
    Spam,
    Legit,
    Unknown,
}
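
// On-disk database of manual classification results, keyed by user id.
// Defined here but not used yet in this commit.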
#[derive(Debug, Serialize, Deserialize)]
struct Db {
    users: HashMap<UserId, Classification>,
}

impl UserData {
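    /// Concatenate the user's profile fields and repositories into a single
    /// text, to be fed to the Bayes classifier. Missing fields are replaced
    /// by sentinel tokens, so that their absence is itself a signal.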
    fn to_text(&self) -> String {
        let mut text = String::new();
        let mut add = |s: &str| {
            text += s;
            text += " ";
        };
        for email_part in self.email.split('@') {
            add(email_part)
        }
        match &self.location {
            Some(s) => add(s),
            None => add("__NO_LOCATION__"),
        }
        match &self.website {
            Some(s) => add(s),
            None => add("__NO_WEBSITE__"),
        }
        match &self.description {
            Some(s) => add(s),
            None => add("__NO_USER_DESCRIPTION__"),
        }
        for (_id, repo) in &self.repos {
            add(&repo.name);
            match &repo.description {
                Some(s) => add(s),
                None => add("__NO_REPO_DESCRIPTION__"),
            }
        }
        text
    }
}
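
/// Fetch the list of all repositories on the instance, paginating through the
/// results 50 repos at a time, and sleeping between requests to avoid
/// hammering the server.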
async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<Repository>> {
    let mut repos = Vec::new();
    let mut query = RepoSearchQuery::default();
    query.limit = Some(50);
    let mut page: u32 = 1;
    loop {
        query.page = Some(page);
        let resp = forge.repo_search(query.clone()).await?;
        match (resp.ok, resp.data) {
            (Some(true), Some(mut query_repos)) => {
                if query_repos.is_empty() {
                    break;
                }
                repos.append(&mut query_repos);
            }
            _ => todo!("scrape_repos: implement retries"),
        }
        page += 1;
        sleep(Duration::from_millis(100)).await;
    }
    Ok(repos)
}
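
/// Fetch the list of all users on the instance, using the same pagination
/// scheme as scrape_repos.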
async fn scrape_users(forge: &Forgejo) -> anyhow::Result<Vec<User>> {
    let mut users = Vec::new();
    let mut query = UserSearchQuery::default();
    query.limit = Some(50);
    let mut page: u32 = 1;
    loop {
        query.page = Some(page);
        let resp = forge.user_search(query.clone()).await?;
        match (resp.ok, resp.data) {
            (Some(true), Some(mut query_users)) => {
                if query_users.is_empty() {
                    break;
                }
                users.append(&mut query_users);
            }
            _ => todo!("scrape_users: implement retries"),
        }
        page += 1;
        sleep(Duration::from_millis(100)).await;
    }
    Ok(users)
}
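
/// Scrape all users and all repositories, and group each user's repositories
/// with their profile data. Repos whose owner is not a scraped user (e.g.
/// repos owned by organizations) are skipped with a warning.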
async fn get_users_repos(forge: &Forgejo) -> anyhow::Result<HashMap<UserId, UserData>> {
    let mut data = HashMap::new();
    for user in scrape_users(forge).await? {
        let Some(id) = user.id else {
            eprintln!("WARN: user with no id");
            continue;
        };
        // let Some(login) = user.login else {
        //     eprintln!("WARN: missing login for user {id}");
        //     continue;
        // };
        // TODO: fetch those from the admin API instead
        let Some(email) = user.email else {
            eprintln!("WARN: missing email for user {id}");
            continue;
        };
        data.insert(
            UserId(id),
            UserData {
                // login,
                email,
                location: user.location,
                website: user.website,
                description: user.description,
                repos: Vec::new(),
            },
        );
    }
    for repo in scrape_repos(forge).await? {
        let Some(id) = repo.id else {
            eprintln!("WARN: repo with no id");
            continue;
        };
        let Some(owner) = repo.owner else {
            eprintln!("WARN: repo {} with no owner", id);
            continue;
        };
        let Some(owner_id) = owner.id else {
            eprintln!("WARN: owner for repo {} has no id", id);
            continue;
        };
        let Some(repo_name) = repo.name else {
            eprintln!("WARN: repo {} has no name", id);
            continue;
        };
        let Some(forge_owner) = data.get_mut(&UserId(owner_id)) else {
            // this currently happens for repos owned by organizations
            eprintln!(
                "WARN: repo owner {} for repo {} is not in database",
                owner.login.as_deref().unwrap_or("<unknown>"),
                repo_name
            );
            continue;
        };
        forge_owner.repos.push((
            RepoId(id),
            RepoData {
                name: repo_name,
                description: repo.description,
            },
        ));
    }
    Ok(data)
}
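
/// Interactive classification loop: for each user, print their data and the
/// current spam score, ask for a verdict on stdin, train the classifier on
/// the answer, and save the model back to disk.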
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let forge = Forgejo::new(Auth::None, url::Url::parse("https://git.deuxfleurs.fr")?)?;
    let data = get_users_repos(&forge).await?;
    println!("got {} users", data.len());
    let model_path = Path::new("model.json");
    let mut classifier = if model_path.is_file() {
        Classifier::new_from_pre_trained(&mut File::open(model_path)?)?
    } else {
        Classifier::new()
    };

    for (_id, user) in data {
        println!("{:#?}", user);
        let user_text = user.to_text();
        println!("SCORE: {}", classifier.score(&user_text));
        let is_spam = {
            let mut resp = String::new();
            loop {
                println!("SPAM? (y/n/?) ");
                resp.clear();
                std::io::stdin().read_line(&mut resp)?;
                // Trim the answer so that both "\n" and "\r\n" line endings
                // are recognized.
                match resp.trim() {
                    "y" => break Some(true),
                    "n" => break Some(false),
                    "?" => break None,
                    _ => (),
                }
            }
        };
        match is_spam {
            Some(true) => classifier.train_spam(&user_text),
            Some(false) => classifier.train_ham(&user_text),
            None => (),
        }
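        // Save the model after every answer, so that interrupting the run
        // does not lose the training done so far.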
        classifier.save(&mut File::create(model_path)?, false)?;
    }
    Ok(())
}