randomize entries then sort by decreasing score
This commit is contained in:
parent
b57ead4a5c
commit
797377734f
4 changed files with 97 additions and 16 deletions
67
Cargo.lock
generated
67
Cargo.lock
generated
|
@ -97,6 +97,12 @@ version = "3.16.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
||||
|
||||
[[package]]
|
||||
name = "bytes"
|
||||
version = "1.8.0"
|
||||
|
@ -214,6 +220,7 @@ dependencies = [
|
|||
"anyhow",
|
||||
"bayespam",
|
||||
"forgejo-api",
|
||||
"rand",
|
||||
"reqwest 0.12.9",
|
||||
"serde",
|
||||
"serde_json",
|
||||
|
@ -914,6 +921,15 @@ version = "0.2.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
|
||||
dependencies = [
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.89"
|
||||
|
@ -932,6 +948,36 @@ dependencies = [
|
|||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rand_chacha",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.5.7"
|
||||
|
@ -1876,6 +1922,27 @@ dependencies = [
|
|||
"synstructure",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.7.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"zerocopy-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy-derive"
|
||||
version = "0.7.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerofrom"
|
||||
version = "0.1.4"
|
||||
|
|
|
@ -14,3 +14,4 @@ url = "2"
|
|||
anyhow = "1.0.93"
|
||||
bayespam = "1.1.0"
|
||||
serde_json = "1.0.133"
|
||||
rand = "0.8.5"
|
||||
|
|
1
data.json
Normal file
1
data.json
Normal file
|
@ -0,0 +1 @@
|
|||
{"users":{"2176":"Spam","1376":"Legit","1552":"Legit","2101":"Spam","5366":"Spam","946":"Legit","1863":"Legit","5400":"Spam","4827":"Spam","5968":"Spam","5620":"Spam","5571":"Spam","3879":"Spam","548":"Legit","2487":"Spam","2103":"Spam","3881":"Unknown","4640":"Spam","1905":"Spam","4357":"Spam","3299":"Spam","5611":"Spam","3859":"Spam","5184":"Spam","2934":"Unknown","2897":"Spam","4485":"Unknown","5593":"Spam","5847":"Spam","2887":"Spam","5006":"Spam","5513":"Spam","5524":"Spam","5628":"Spam","5212":"Spam","1985":"Legit","768":"Legit","4683":"Spam","4759":"Spam","4743":"Unknown","4832":"Spam","2630":"Unknown","5516":"Spam","4780":"Spam","2077":"Spam","1231":"Legit","4950":"Spam","2651":"Unknown","4248":"Spam","3489":"Spam","4940":"Spam","2655":"Unknown","12":"Legit","4629":"Spam","2209":"Spam","3626":"Legit","5335":"Unknown","400":"Legit","3590":"Spam","3760":"Spam","5637":"Spam","3077":"Spam","1790":"Spam","5695":"Spam","5235":"Spam","2850":"Legit","2117":"Spam","137":"Legit","3851":"Spam","5778":"Spam","4261":"Unknown"}}
|
42
src/main.rs
42
src/main.rs
|
@ -6,6 +6,7 @@ use serde::{Serialize, Deserialize};
|
|||
use std::path::Path;
|
||||
use std::fs::File;
|
||||
use std::io::{BufReader, BufWriter};
|
||||
use rand::prelude::*;
|
||||
|
||||
#[derive(Debug, Hash, PartialEq, Eq)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
|
@ -285,9 +286,9 @@ async fn main() -> anyhow::Result<()> {
|
|||
Classifier::new()
|
||||
};
|
||||
|
||||
let db_path = Path::new("classification.json");
|
||||
let mut db = if db_path.is_file() {
|
||||
let file = File::open(db_path)?;
|
||||
let classification_path = Path::new("classification.json");
|
||||
let mut classification = if classification_path.is_file() {
|
||||
let file = File::open(classification_path)?;
|
||||
serde_json::from_reader(BufReader::new(file))?
|
||||
} else {
|
||||
Db::new()
|
||||
|
@ -308,20 +309,31 @@ async fn main() -> anyhow::Result<()> {
|
|||
} else {
|
||||
let data = get_users_data(&forge).await?;
|
||||
let file = File::create(data_path)?;
|
||||
serde_json::to_writer(BufWriter::new(file), &db)?;
|
||||
serde_json::to_writer(BufWriter::new(file), &classification)?;
|
||||
data
|
||||
};
|
||||
println!("got {} users", data.len());
|
||||
|
||||
for (user_id, user) in data {
|
||||
if db.users.contains_key(&user_id) {
|
||||
continue;
|
||||
let mut users: Vec<_> =
|
||||
data.into_iter()
|
||||
.filter_map(
|
||||
|(user_id, user)|
|
||||
if classification.users.contains_key(&user_id) {
|
||||
None
|
||||
} else {
|
||||
let text = user.to_text();
|
||||
let score = classifier.score(&text);
|
||||
Some((user_id, user, text, score))
|
||||
}
|
||||
)
|
||||
.collect();
|
||||
let mut rng = rand::thread_rng();
|
||||
users.shuffle(&mut rng);
|
||||
users.sort_by_key(|(_, _, _, score)| 1000 - (score * 1000.) as u64);
|
||||
|
||||
for (user_id, user, text, score) in users {
|
||||
println!("{:#?}", user);
|
||||
let user_text = user.to_text();
|
||||
|
||||
println!("SCORE: {}", classifier.score(&user_text));
|
||||
println!("SCORE: {}", score);
|
||||
|
||||
let c = {
|
||||
let mut resp = String::new();
|
||||
|
@ -338,18 +350,18 @@ async fn main() -> anyhow::Result<()> {
|
|||
};
|
||||
|
||||
match c {
|
||||
Spam => classifier.train_spam(&user_text),
|
||||
Legit => classifier.train_ham(&user_text),
|
||||
Spam => classifier.train_spam(&text),
|
||||
Legit => classifier.train_ham(&text),
|
||||
Unknown => ()
|
||||
}
|
||||
|
||||
db.users.insert(user_id, c);
|
||||
classification.users.insert(user_id, c);
|
||||
|
||||
{
|
||||
classifier.save(&mut File::create(model_path)?, false)?;
|
||||
|
||||
let file = File::create(db_path)?;
|
||||
serde_json::to_writer(BufWriter::new(file), &db)?;
|
||||
let file = File::create(classification_path)?;
|
||||
serde_json::to_writer(BufWriter::new(file), &classification)?;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue