store user classifications

This commit is contained in:
Armaël Guéneau 2024-11-18 17:27:06 +01:00
parent 3f4f93826c
commit 0f8368031f
5 changed files with 45 additions and 15 deletions

1
Cargo.lock generated
View file

@ -216,6 +216,7 @@ dependencies = [
"forgejo-api", "forgejo-api",
"reqwest 0.12.9", "reqwest 0.12.9",
"serde", "serde",
"serde_json",
"tokio", "tokio",
"url", "url",
] ]

View file

@ -13,3 +13,4 @@ forgejo-api = "0.4"
url = "2" url = "2"
anyhow = "1.0.93" anyhow = "1.0.93"
bayespam = "1.1.0" bayespam = "1.1.0"
serde_json = "1.0.133"

1
classification.json Normal file
View file

@ -0,0 +1 @@
{"users":{"5847":"Spam","5637":"Spam","4640":"Spam","3590":"Spam","137":"Legit","2176":"Spam","3489":"Spam","4357":"Spam","1985":"Legit","1905":"Spam","4683":"Spam","5006":"Spam","4248":"Spam","4780":"Spam","1790":"Spam","5778":"Spam","2101":"Spam","768":"Legit","2117":"Spam","5516":"Spam","1552":"Legit","946":"Legit","5968":"Spam","3077":"Spam","1376":"Legit","5571":"Spam","4832":"Spam","5513":"Spam","5620":"Spam","3879":"Spam","5366":"Spam","3299":"Spam","12":"Legit","4940":"Spam","5611":"Spam","5524":"Spam","3760":"Spam","4759":"Spam","5184":"Spam","400":"Legit","5695":"Spam","4629":"Spam","5235":"Spam"}}

File diff suppressed because one or more lines are too long

View file

@ -37,6 +37,7 @@ enum Classification {
Legit, Legit,
Unknown, Unknown,
} }
use Classification::*;
#[derive(Debug, Serialize, Deserialize)] #[derive(Debug, Serialize, Deserialize)]
struct Db { struct Db {
@ -81,6 +82,14 @@ impl UserData {
} }
} }
impl Db {
fn new() -> Db {
Db {
users: HashMap::new(),
}
}
}
async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<Repository>> { async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<Repository>> {
let mut repos = Vec::new(); let mut repos = Vec::new();
let mut query = RepoSearchQuery::default(); let mut query = RepoSearchQuery::default();
@ -196,10 +205,6 @@ async fn get_users_repos(forge: &Forgejo) -> anyhow::Result<HashMap<UserId, User
/// Interactive labelling loop: loads the classifier model and the stored
/// user classifications, fetches users from the forge, asks on stdin whether
/// each not-yet-classified user is spam, trains the classifier accordingly,
/// and writes both the model and the classifications back to disk.
///
/// Returns an error as soon as any `?`-propagated step (URL parsing, forge
/// requests, file I/O, JSON (de)serialization, stdin reads) fails.
// NOTE(review): this listing looks like a split-view diff rendering — rows
// whose text appears twice seem to be unchanged lines shown in both columns,
// and the three statements right below appear to be the old position of the
// forge/data setup that shows up again further down. Confirm against the
// actual source file before relying on line order.
#[tokio::main] #[tokio::main]
async fn main() -> anyhow::Result<()> { async fn main() -> anyhow::Result<()> {
let forge = Forgejo::new(Auth::None, url::Url::parse("https://git.deuxfleurs.fr")?)?;
let data = get_users_repos(&forge).await?;
println!("got {} users", data.len());
// Reuse a previously saved model when "model.json" exists; otherwise start
// from a fresh classifier.
let model_path = Path::new("model.json"); let model_path = Path::new("model.json");
let mut classifier = if model_path.is_file() { let mut classifier = if model_path.is_file() {
Classifier::new_from_pre_trained(&mut File::open(model_path)?)? Classifier::new_from_pre_trained(&mut File::open(model_path)?)?
@ -207,37 +212,59 @@ async fn main() -> anyhow::Result<()> {
Classifier::new() Classifier::new()
}; };
// Load the stored classifications when "classification.json" exists;
// otherwise start with an empty `Db`.
for (_, user) in data { let db_path = Path::new("classification.json");
let mut db = if db_path.is_file() {
let file = File::open(db_path)?;
let reader = std::io::BufReader::new(file);
serde_json::from_reader(reader)?
} else {
Db::new()
};
let forge = Forgejo::new(Auth::None, url::Url::parse("https://git.deuxfleurs.fr")?)?;
let data = get_users_repos(&forge).await?;
println!("got {} users", data.len());
for (user_id, user) in data {
// Users that already have an entry in the db are not asked about again.
if db.users.contains_key(&user_id) {
continue;
}
println!("{:#?}", user); println!("{:#?}", user);
let user_text = user.to_text(); let user_text = user.to_text();
println!("SCORE: {}", classifier.score(&user_text)); println!("SCORE: {}", classifier.score(&user_text));
// Prompt until the answer is exactly "y", "n" or "?" followed by a newline.
let is_spam = { let c = {
let mut resp = String::new(); let mut resp = String::new();
loop { loop {
println!("SPAM? (y/n/?) "); println!("SPAM? (y/n/?) ");
std::io::stdin().read_line(&mut resp)?; std::io::stdin().read_line(&mut resp)?;
// `read_line` keeps the trailing newline, hence the "\n" in each pattern.
// Any other input clears the buffer and asks again.
match resp.as_str() { match resp.as_str() {
"y\n" => break Some(true), "y\n" => break Spam,
"n\n" => break Some(false), "n\n" => break Legit,
"?\n" => break None, "?\n" => break Unknown,
_ => resp.clear() _ => resp.clear()
} }
} }
}; };
// Only a definite answer trains the classifier; the "?" answer leaves it
// untouched.
match is_spam { match c {
Some(true) => classifier.train_spam(&user_text), Spam => classifier.train_spam(&user_text),
Some(false) => classifier.train_ham(&user_text), Legit => classifier.train_ham(&user_text),
None => () Unknown => ()
} }
// Record the answer (including Unknown) so this user is skipped next time.
db.users.insert(user_id, c);
// Both files are rewritten inside the loop, i.e. after every answered user.
{ {
classifier.save(&mut File::create(model_path)?, false)?; classifier.save(&mut File::create(model_path)?, false)?;
let file = File::create(db_path)?;
let writer = std::io::BufWriter::new(file);
serde_json::to_writer(writer, &db)?;
} }
} }
Ok(()) Ok(())
} }