randomize entries then sort by decreasing score

This commit is contained in:
Armaël Guéneau 2024-11-19 13:58:46 +01:00
parent b57ead4a5c
commit 797377734f
4 changed files with 97 additions and 16 deletions

67
Cargo.lock generated
View file

@@ -97,6 +97,12 @@ version = "3.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "bytes"
version = "1.8.0"
@@ -214,6 +220,7 @@ dependencies = [
"anyhow",
"bayespam",
"forgejo-api",
"rand",
"reqwest 0.12.9",
"serde",
"serde_json",
@@ -914,6 +921,15 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "ppv-lite86"
version = "0.2.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
dependencies = [
"zerocopy",
]
[[package]]
name = "proc-macro2"
version = "1.0.89"
@@ -932,6 +948,36 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom",
]
[[package]]
name = "redox_syscall"
version = "0.5.7"
@@ -1876,6 +1922,27 @@ dependencies = [
"synstructure",
]
[[package]]
name = "zerocopy"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
dependencies = [
"byteorder",
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "zerofrom"
version = "0.1.4"

View file

@@ -14,3 +14,4 @@ url = "2"
anyhow = "1.0.93"
bayespam = "1.1.0"
serde_json = "1.0.133"
rand = "0.8.5"

1
data.json Normal file
View file

@@ -0,0 +1 @@
{"users":{"2176":"Spam","1376":"Legit","1552":"Legit","2101":"Spam","5366":"Spam","946":"Legit","1863":"Legit","5400":"Spam","4827":"Spam","5968":"Spam","5620":"Spam","5571":"Spam","3879":"Spam","548":"Legit","2487":"Spam","2103":"Spam","3881":"Unknown","4640":"Spam","1905":"Spam","4357":"Spam","3299":"Spam","5611":"Spam","3859":"Spam","5184":"Spam","2934":"Unknown","2897":"Spam","4485":"Unknown","5593":"Spam","5847":"Spam","2887":"Spam","5006":"Spam","5513":"Spam","5524":"Spam","5628":"Spam","5212":"Spam","1985":"Legit","768":"Legit","4683":"Spam","4759":"Spam","4743":"Unknown","4832":"Spam","2630":"Unknown","5516":"Spam","4780":"Spam","2077":"Spam","1231":"Legit","4950":"Spam","2651":"Unknown","4248":"Spam","3489":"Spam","4940":"Spam","2655":"Unknown","12":"Legit","4629":"Spam","2209":"Spam","3626":"Legit","5335":"Unknown","400":"Legit","3590":"Spam","3760":"Spam","5637":"Spam","3077":"Spam","1790":"Spam","5695":"Spam","5235":"Spam","2850":"Legit","2117":"Spam","137":"Legit","3851":"Spam","5778":"Spam","4261":"Unknown"}}

View file

@@ -6,6 +6,7 @@ use serde::{Serialize, Deserialize};
use std::path::Path;
use std::fs::File;
use std::io::{BufReader, BufWriter};
use rand::prelude::*;
#[derive(Debug, Hash, PartialEq, Eq)]
#[derive(Serialize, Deserialize)]
@@ -285,9 +286,9 @@ async fn main() -> anyhow::Result<()> {
Classifier::new()
};
let db_path = Path::new("classification.json");
let mut db = if db_path.is_file() {
let file = File::open(db_path)?;
let classification_path = Path::new("classification.json");
let mut classification = if classification_path.is_file() {
let file = File::open(classification_path)?;
serde_json::from_reader(BufReader::new(file))?
} else {
Db::new()
@@ -308,20 +309,31 @@ async fn main() -> anyhow::Result<()> {
} else {
let data = get_users_data(&forge).await?;
let file = File::create(data_path)?;
serde_json::to_writer(BufWriter::new(file), &db)?;
serde_json::to_writer(BufWriter::new(file), &classification)?;
data
};
println!("got {} users", data.len());
for (user_id, user) in data {
if db.users.contains_key(&user_id) {
continue;
let mut users: Vec<_> =
data.into_iter()
.filter_map(
|(user_id, user)|
if classification.users.contains_key(&user_id) {
None
} else {
let text = user.to_text();
let score = classifier.score(&text);
Some((user_id, user, text, score))
}
)
.collect();
let mut rng = rand::thread_rng();
users.shuffle(&mut rng);
users.sort_by_key(|(_, _, _, score)| 1000 - (score * 1000.) as u64);
for (user_id, user, text, score) in users {
println!("{:#?}", user);
let user_text = user.to_text();
println!("SCORE: {}", classifier.score(&user_text));
println!("SCORE: {}", score);
let c = {
let mut resp = String::new();
@@ -338,18 +350,18 @@ async fn main() -> anyhow::Result<()> {
};
match c {
Spam => classifier.train_spam(&user_text),
Legit => classifier.train_ham(&user_text),
Spam => classifier.train_spam(&text),
Legit => classifier.train_ham(&text),
Unknown => ()
}
db.users.insert(user_id, c);
classification.users.insert(user_id, c);
{
classifier.save(&mut File::create(model_path)?, false)?;
let file = File::create(db_path)?;
serde_json::to_writer(BufWriter::new(file), &db)?;
let file = File::create(classification_path)?;
serde_json::to_writer(BufWriter::new(file), &classification)?;
}
}