diff --git a/Cargo.lock b/Cargo.lock index e98eef8..caf0da2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -97,6 +97,12 @@ version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.8.0" @@ -214,6 +220,7 @@ dependencies = [ "anyhow", "bayespam", "forgejo-api", + "rand", "reqwest 0.12.9", "serde", "serde_json", @@ -914,6 +921,15 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + [[package]] name = "proc-macro2" version = "1.0.89" @@ -932,6 +948,36 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + [[package]] name = "redox_syscall" version = "0.5.7" @@ -1876,6 +1922,27 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "zerofrom" version = "0.1.4" diff --git a/Cargo.toml b/Cargo.toml index f5d5fe5..c99eadc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,3 +14,4 @@ url = "2" anyhow = "1.0.93" bayespam = "1.1.0" serde_json = "1.0.133" +rand = "0.8.5" diff --git a/data.json b/data.json new file mode 100644 index 0000000..6132a58 --- /dev/null +++ b/data.json @@ -0,0 +1 @@ +{"users":{"2176":"Spam","1376":"Legit","1552":"Legit","2101":"Spam","5366":"Spam","946":"Legit","1863":"Legit","5400":"Spam","4827":"Spam","5968":"Spam","5620":"Spam","5571":"Spam","3879":"Spam","548":"Legit","2487":"Spam","2103":"Spam","3881":"Unknown","4640":"Spam","1905":"Spam","4357":"Spam","3299":"Spam","5611":"Spam","3859":"Spam","5184":"Spam","2934":"Unknown","2897":"Spam","4485":"Unknown","5593":"Spam","5847":"Spam","2887":"Spam","5006":"Spam","5513":"Spam","5524":"Spam","5628":"Spam","5212":"Spam","1985":"Legit","768":"Legit","4683":"Spam","4759":"Spam","4743":"Unknown","4832":"Spam","2630":"Unknown","5516":"Spam","4780":"Spam","2077":"Spam","1231":"Legit","4950":"Spam","2651":"Unknown","4248":"Spam","3489":"Spam","4940":"Spam","2655":"Unknown","12":"Legit","4629":"Spam","2209":"Spam","3626":"Legit","5335":"Unknown","400":"Legit","3590":"Spam","3760":"Spam","5637":"Spam","3077":"Spam","1790":"Spam","5695":"Spam","5235":"Spam","2850":"Legit","2117":"Spam","137":"Legit","3851":"Spam","5778":"Spam","4261":"Unknown"}} \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index e6eb372..f4cc17e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,6 +6,7 @@ use serde::{Serialize, Deserialize}; use std::path::Path; use std::fs::File; use std::io::{BufReader, BufWriter}; +use rand::prelude::*; #[derive(Debug, Hash, PartialEq, Eq)] #[derive(Serialize, Deserialize)] @@ -285,9 +286,9 @@ async fn main() -> anyhow::Result<()> { Classifier::new() }; - let db_path = Path::new("classification.json"); - let mut db = if db_path.is_file() { - let file = File::open(db_path)?; + let classification_path = Path::new("classification.json"); + let mut classification = if classification_path.is_file() { + let file = File::open(classification_path)?; serde_json::from_reader(BufReader::new(file))? } else { Db::new() @@ -308,20 +309,31 @@ async fn main() -> anyhow::Result<()> { } else { let data = get_users_data(&forge).await?; let file = File::create(data_path)?; - serde_json::to_writer(BufWriter::new(file), &db)?; + serde_json::to_writer(BufWriter::new(file), &classification)?; data }; println!("got {} users", data.len()); - for (user_id, user) in data { - if db.users.contains_key(&user_id) { - continue; - } + let mut users: Vec<_> = + data.into_iter() + .filter_map( + |(user_id, user)| + if classification.users.contains_key(&user_id) { + None + } else { + let text = user.to_text(); + let score = classifier.score(&text); + Some((user_id, user, text, score)) + } + ) + .collect(); + let mut rng = rand::thread_rng(); + users.shuffle(&mut rng); + users.sort_by_key(|(_, _, _, score)| 1000 - (score * 1000.) as u64); + for (user_id, user, text, score) in users { println!("{:#?}", user); - let user_text = user.to_text(); - - println!("SCORE: {}", classifier.score(&user_text)); + println!("SCORE: {}", score); let c = { let mut resp = String::new(); @@ -338,18 +350,18 @@ async fn main() -> anyhow::Result<()> { }; match c { - Spam => classifier.train_spam(&user_text), - Legit => classifier.train_ham(&user_text), + Spam => classifier.train_spam(&text), + Legit => classifier.train_ham(&text), Unknown => () } - db.users.insert(user_id, c); + classification.users.insert(user_id, c); { classifier.save(&mut File::create(model_path)?, false)?; - let file = File::create(db_path)?; - serde_json::to_writer(BufWriter::new(file), &db)?; + let file = File::create(classification_path)?; + serde_json::to_writer(BufWriter::new(file), &classification)?; } }