basic scraping & CLI classification
commit 3f4f93826c
5 changed files with 2185 additions and 0 deletions
.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
/target
Cargo.lock (generated, new file, 1925 lines)
File diff suppressed because it is too large.
Cargo.toml (new file, 15 lines)
@@ -0,0 +1,15 @@
[package]
name = "forgejo-antispam"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
tokio = { version = "1", features = ["full"] }
reqwest = { version = "0.12", features = ["json"] }
serde = { version = "1", features = ["derive"] }
forgejo-api = "0.4"
url = "2"
anyhow = "1.0.93"
bayespam = "1.1.0"
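
Of these dependencies, bayespam provides the Bayes classifier that drives the interactive training in src/main.rs. A minimal standalone sketch of the parts of its API this commit relies on (assuming bayespam 1.1 as pinned above; the example strings are invented):

use bayespam::classifier::Classifier;

fn main() {
    let mut classifier = Classifier::new();
    // Feed labelled examples into the model...
    classifier.train_spam("buy cheap followers best casino bonus");
    classifier.train_ham("toy implementation of a merkle tree in rust");
    // ...then score() returns a spam probability between 0.0 and 1.0.
    println!("{}", classifier.score("casino bonus"));
}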
model.json (new file, 1 line)
File diff suppressed because one or more lines are too long.
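model.json is the classifier state that main() writes back via classifier.save and reloads on startup through Classifier::new_from_pre_trained, so training progress persists across runs.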
src/main.rs (new file, 243 lines)
@@ -0,0 +1,243 @@
use forgejo_api::structs::{RepoSearchQuery, Repository, User, UserSearchQuery};
use forgejo_api::{Auth, Forgejo};
use std::collections::HashMap;
use tokio::time::{sleep, Duration};
use bayespam::classifier::Classifier;
use serde::{Serialize, Deserialize};
use std::path::Path;
use std::fs::File;

#[derive(Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
struct RepoId(i64);

#[derive(Debug)]
struct RepoData {
    name: String,
    description: Option<String>,
}

#[derive(Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
struct UserId(i64);

#[derive(Debug)]
struct UserData {
    // login: String,
    email: String,
    location: Option<String>,
    website: Option<String>,
    description: Option<String>,
    repos: Vec<(RepoId, RepoData)>,
}

#[derive(Debug, Serialize, Deserialize)]
enum Classification {
    Spam,
    Legit,
    Unknown,
}

#[derive(Debug, Serialize, Deserialize)]
struct Db {
    users: HashMap<UserId, Classification>,
}

impl UserData {
    /// Flatten the user's metadata into one whitespace-separated string
    /// for the Bayes classifier; absent fields become sentinel tokens so
    /// that their absence is itself a signal.
    fn to_text(&self) -> String {
        let mut text = String::new();
        let mut add = |s: &str| {
            text += s;
            text += " ";
        };

        for email_part in self.email.split('@') {
            add(email_part);
        }

        match &self.location {
            Some(s) => add(s),
            None => add("__NO_LOCATION__"),
        }

        match &self.website {
            Some(s) => add(s),
            None => add("__NO_WEBSITE__"),
        }

        match &self.description {
            Some(s) => add(s),
            None => add("__NO_USER_DESCRIPTION__"),
        }

        for (_id, repo) in &self.repos {
            add(&repo.name);
            match &repo.description {
                Some(s) => add(s),
                None => add("__NO_REPO_DESCRIPTION__"),
            }
        }

        text
    }
}

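// Worked example (hypothetical user): email "alice@example.com", no
// location, website "https://example.com", no bio, and one repo "dotfiles"
// with no description. to_text() then yields the token string
// "alice example.com __NO_LOCATION__ https://example.com __NO_USER_DESCRIPTION__ dotfiles __NO_REPO_DESCRIPTION__ "
// (note the trailing space), which is what the classifier consumes.
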
/// Page through the repository search API, 50 results at a time, until an
/// empty page signals the end; sleep briefly between calls as a crude
/// rate limit.
async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<Repository>> {
    let mut repos = Vec::new();
    let mut query = RepoSearchQuery::default();
    query.limit = Some(50);
    let mut page: u32 = 1;
    loop {
        query.page = Some(page);
        let resp = forge.repo_search(query.clone()).await?;
        match (resp.ok, resp.data) {
            (Some(true), Some(mut query_repos)) => {
                if query_repos.is_empty() {
                    break;
                }
                repos.append(&mut query_repos);
            }
            _ => todo!("scrape_repos: implement retries"),
        }
        page += 1;
        sleep(Duration::from_millis(100)).await;
    }
    Ok(repos)
}

/// Same pagination scheme as scrape_repos, but over the user search API.
async fn scrape_users(forge: &Forgejo) -> anyhow::Result<Vec<User>> {
    let mut users = Vec::new();
    let mut query = UserSearchQuery::default();
    query.limit = Some(50);
    let mut page: u32 = 1;
    loop {
        query.page = Some(page);
        let resp = forge.user_search(query.clone()).await?;
        match (resp.ok, resp.data) {
            (Some(true), Some(mut query_users)) => {
                if query_users.is_empty() {
                    break;
                }
                users.append(&mut query_users);
            }
            _ => todo!("scrape_users: implement retries"),
        }
        page += 1;
        sleep(Duration::from_millis(100)).await;
    }
    Ok(users)
}

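// get_users_repos joins the two scrapes: users are indexed by id first,
// then each repository is attached to its owner's entry. Repos owned by
// organizations have no matching user entry and are skipped with a warning.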
async fn get_users_repos(forge: &Forgejo) -> anyhow::Result<HashMap<UserId, UserData>> {
    let mut data = HashMap::new();
    for user in scrape_users(&forge).await? {
        let Some(id) = user.id else {
            eprintln!("WARN: user with no id");
            continue;
        };
        // let Some(login) = user.login else {
        //     eprintln!("WARN: missing login for user {id}");
        //     continue;
        // };

        // TODO: fetch those from the admin API instead
        let Some(email) = user.email else {
            eprintln!("WARN: missing email for user {id}");
            continue;
        };
        data.insert(
            UserId(id),
            UserData {
                // login,
                email,
                location: user.location,
                website: user.website,
                description: user.description,
                repos: Vec::new(),
            },
        );
    }

    for repo in scrape_repos(&forge).await? {
        let Some(id) = repo.id else {
            eprintln!("WARN: repo with no id");
            continue;
        };
        let Some(owner) = repo.owner else {
            eprintln!("WARN: repo {} with no owner", id);
            continue;
        };
        let Some(owner_id) = owner.id else {
            eprintln!("WARN: owner for repo {} has no id", id);
            continue;
        };
        let Some(repo_name) = repo.name else {
            eprintln!("WARN: repo {} has no name", id);
            continue;
        };
        let Some(forge_owner) = data.get_mut(&UserId(owner_id)) else {
            // this currently happens for repos owned by organizations
            eprintln!(
                "WARN: repo owner {} for repo {} is not in database",
                owner.login.as_deref().unwrap_or("<unknown>"),
                repo_name
            );
            continue;
        };
        forge_owner.repos.push((
            RepoId(id),
            RepoData {
                name: repo_name,
                description: repo.description,
            },
        ));
    }

    Ok(data)
}

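// Interactive classification: main() walks every scraped user, prints the
// current model's spam score, asks the operator for a verdict, and folds
// the answer back into the model.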
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let forge = Forgejo::new(Auth::None, url::Url::parse("https://git.deuxfleurs.fr")?)?;
    let data = get_users_repos(&forge).await?;
    println!("got {} users", data.len());

    // Load the pre-trained model if one exists, otherwise start fresh.
    let model_path = Path::new("model.json");
    let mut classifier = if model_path.is_file() {
        Classifier::new_from_pre_trained(&mut File::open(model_path)?)?
    } else {
        Classifier::new()
    };

    for (_, user) in data {
        println!("{:#?}", user);
        let user_text = user.to_text();

        println!("SCORE: {}", classifier.score(&user_text));

        // y = spam, n = legit, ? = skip this user.
        let is_spam = {
            let mut resp = String::new();
            loop {
                println!("SPAM? (y/n/?) ");
                std::io::stdin().read_line(&mut resp)?;
                match resp.as_str() {
                    "y\n" => break Some(true),
                    "n\n" => break Some(false),
                    "?\n" => break None,
                    _ => resp.clear(),
                }
            }
        };

        match is_spam {
            Some(true) => classifier.train_spam(&user_text),
            Some(false) => classifier.train_ham(&user_text),
            None => (),
        }

        // Save after every answer so progress survives an interrupt.
        classifier.save(&mut File::create(model_path)?, false)?;
    }

    Ok(())
}
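
Once model.json has accumulated enough verdicts, the same model can be reused without prompting. A sketch, not part of this commit (classify_user is a hypothetical helper, and it assumes bayespam's identify() method, which applies the crate's built-in spam threshold to the score):

use bayespam::classifier::Classifier;
use std::fs::File;

// Hypothetical helper: load the trained model and flag a user's text.
fn classify_user(user_text: &str) -> anyhow::Result<bool> {
    let mut model = File::open("model.json")?;
    let classifier = Classifier::new_from_pre_trained(&mut model)?;
    Ok(classifier.identify(user_text))
}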