From f801c26d34c95e47268d0e2d9c3c50204febda2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arma=C3=ABl=20Gu=C3=A9neau?=
Date: Wed, 18 Dec 2024 09:28:07 +0100
Subject: [PATCH] refactoring: split off parts of main.rs into auxiliary files

---
 src/data.rs   |  89 +++++++++++
 src/db.rs     |  76 +++++++++++
 src/main.rs   | 352 ++------------------------------------------------
 src/scrape.rs | 190 +++++++++++++++++++++++++++
 4 files changed, 363 insertions(+), 344 deletions(-)
 create mode 100644 src/data.rs
 create mode 100644 src/db.rs
 create mode 100644 src/scrape.rs

diff --git a/src/data.rs b/src/data.rs
new file mode 100644
index 0000000..b634e20
--- /dev/null
+++ b/src/data.rs
@@ -0,0 +1,89 @@
+use serde::{Deserialize, Serialize};
+use crate::classifier::Classifier;
+
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
+pub struct UserId(pub i64);
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct UserData {
+    pub login: String,
+    pub email: String,
+    pub full_name: Option<String>,
+    pub location: Option<String>,
+    pub website: Option<String>,
+    pub description: Option<String>,
+    // TODO: visibility
+    pub repos: Vec<(RepoId, RepoData)>,
+    pub issues: Vec<(IssueId, IssueData)>,
+}
+
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
+pub struct RepoId(pub i64);
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct RepoData {
+    pub name: String,
+    pub description: Option<String>,
+}
+
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
+pub struct IssueId(pub i64);
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct IssueData {
+    pub title: String,
+    pub body: String,
+}
+
+impl UserData {
+    pub fn is_empty(&self) -> bool {
+        self.full_name.is_none()
+            && self.location.is_none()
+            && self.website.is_none()
+            && self.description.is_none()
+            && self.repos.is_empty()
+            && self.issues.is_empty()
+    }
+
+    pub fn to_tokens(&self) -> Vec<String> {
+        let mut text = String::new();
+        let mut add = |s: &str| {
+            text += s;
+            text += " "
+        };
+
+        for email_part in self.email.split('@') {
+            add(email_part)
+        }
+
+        match &self.location {
+            Some(s) => add(&s),
+            None => add("__NO_LOCATION__"),
+        }
+
+        match &self.website {
+            Some(s) => add(&s),
+            None => add("__NO_WEBSITE__"),
+        }
+
+        match &self.description {
+            Some(s) => add(&s),
+            None => add("__NO_USER_DESCRIPTION__"),
+        }
+
+        for (_id, repo) in &self.repos {
+            add(&repo.name);
+            match &repo.description {
+                Some(s) => add(s),
+                None => add("__NO_REPO_DESCRIPTION__"),
+            }
+        }
+
+        for (_id, issue) in &self.issues {
+            add(&issue.title);
+            add(&issue.body);
+        }
+
+        Classifier::into_word_list(&text)
+    }
+}
diff --git a/src/db.rs b/src/db.rs
new file mode 100644
index 0000000..2b1c35e
--- /dev/null
+++ b/src/db.rs
@@ -0,0 +1,76 @@
+use std::collections::HashMap;
+use std::path::Path;
+use std::fs::File;
+use std::io::{BufReader, BufWriter};
+use crate::data::*;
+use crate::classifier::Classifier;
+
+// TODO (?): make the fields private and provide an API that automatically
+// recomputes the caches when necessary?
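+//
+// As an illustration, such a write-through API could look like the sketch
+// below (hypothetical, not part of this patch): every update to a user goes
+// through one method that refreshes the derived maps for that user.
+//
+//     pub fn update_user(&mut self, id: UserId, user: UserData, classifier: &Classifier) {
+//         let tokens = user.to_tokens();
+//         self.score.insert(id, classifier.score(&tokens));
+//         self.tokens.insert(id, tokens);
+//         self.users.insert(id, user);
+//     }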
+pub struct Db {
+    // persisted data
+    pub users: HashMap<UserId, UserData>,
+    pub is_spam: HashMap<UserId, bool>,
+    // caches: computed from persisted data on load
+    pub score: HashMap<UserId, f32>,
+    pub tokens: HashMap<UserId, Vec<String>>,
+}
+
+impl Db {
+    pub fn recompute_tokens(&mut self) {
+        for (id, user) in &self.users {
+            self.tokens.insert(*id, user.to_tokens());
+        }
+    }
+
+    pub fn recompute_scores(&mut self, classifier: &Classifier) {
+        for (id, tokens) in &self.tokens {
+            self.score.insert(*id, classifier.score(tokens));
+        }
+    }
+
+    pub fn from_path(path: &Path, classifier: &Classifier) -> anyhow::Result<Db> {
+        let file = File::open(path)?;
+        let (users, is_spam) = serde_json::from_reader(BufReader::new(file))?;
+        let mut db = Db {
+            users,
+            is_spam,
+            tokens: HashMap::new(),
+            score: HashMap::new(),
+        };
+        db.recompute_tokens();
+        db.recompute_scores(classifier);
+        Ok(db)
+    }
+
+    pub fn from_users(
+        users: HashMap<UserId, UserData>,
+        is_spam: HashMap<UserId, bool>,
+        classifier: &Classifier,
+    ) -> Db {
+        let mut db = Db {
+            users,
+            is_spam,
+            tokens: HashMap::new(),
+            score: HashMap::new(),
+        };
+        db.recompute_tokens();
+        db.recompute_scores(classifier);
+        db
+    }
+
+    pub fn store_to_path(&self, path: &Path) -> anyhow::Result<()> {
+        let file = File::create(path)?;
+        let dat: (&HashMap<UserId, UserData>, &HashMap<UserId, bool>) =
+            (&self.users, &self.is_spam);
+        serde_json::to_writer(BufWriter::new(file), &dat)?;
+        Ok(())
+    }
+
+    pub fn unclassified_users<'a>(&'a self) -> Vec<(&'a UserId, &'a UserData)> {
+        self.users
+            .iter()
+            .filter(|(user_id, _)| !self.is_spam.contains_key(&user_id))
+            .collect()
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 7a6ffed..106f541 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,357 +2,21 @@ use actix_web::{get, post, web, App, HttpResponse, HttpServer, Responder};
 use forgejo_api::{Auth, Forgejo};
 use lazy_static::lazy_static;
 use rand::prelude::*;
-use serde::{Deserialize, Serialize};
+use serde::Deserialize;
 use std::collections::HashMap;
 use std::fs::File;
-use std::io::{BufReader, BufWriter};
 use std::path::Path;
 use std::sync::Mutex;
 use tera::Tera;
-use tokio::time::{sleep, Duration};
 
 mod classifier;
+mod data;
+mod scrape;
+mod db;
+
 use classifier::Classifier;
-
-#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
-struct RepoId(i64);
-
-#[derive(Debug, Serialize, Deserialize)]
-struct RepoData {
-    name: String,
-    description: Option<String>,
-}
-
-#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
-struct IssueId(i64);
-
-#[derive(Debug, Serialize, Deserialize)]
-struct IssueData {
-    title: String,
-    body: String,
-}
-
-#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
-struct UserId(i64);
-
-#[derive(Debug, Serialize, Deserialize)]
-struct UserData {
-    login: String,
-    email: String,
-    full_name: Option<String>,
-    location: Option<String>,
-    website: Option<String>,
-    description: Option<String>,
-    // TODO: visibility
-    repos: Vec<(RepoId, RepoData)>,
-    issues: Vec<(IssueId, IssueData)>,
-}
-
-struct Db {
-    users: HashMap<UserId, UserData>,
-    is_spam: HashMap<UserId, bool>,
-    // caches: derived from the rest
-    score: HashMap<UserId, f32>,
-    tokens: HashMap<UserId, Vec<String>>,
-}
-
-impl UserData {
-    fn is_empty(&self) -> bool {
-        self.full_name.is_none()
-            && self.location.is_none()
-            && self.website.is_none()
-            && self.description.is_none()
-            && self.repos.is_empty()
-            && self.issues.is_empty()
-    }
-
-    fn to_tokens(&self) -> Vec<String> {
-        let mut text = String::new();
-        let mut add = |s: &str| {
-            text += s;
-            text += " "
-        };
-
-        for email_part in self.email.split('@') {
-            add(email_part)
-        }
-
-        match &self.location {
-            Some(s) => add(&s),
-            None => add("__NO_LOCATION__"),
-        }
-
-        match &self.website {
-            Some(s) => add(&s),
-            None => add("__NO_WEBSITE__"),
-        }
-
-        match &self.description {
-            Some(s) => add(&s),
-            None => add("__NO_USER_DESCRIPTION__"),
-        }
-
-        for (_id, repo) in &self.repos {
-            add(&repo.name);
-            match &repo.description {
-                Some(s) => add(s),
-                None => add("__NO_REPO_DESCRIPTION__"),
-            }
-        }
-
-        for (_id, issue) in &self.issues {
-            add(&issue.title);
-            add(&issue.body);
-        }
-
-        Classifier::into_word_list(&text)
-    }
-}
-
-impl Db {
-    fn recompute_tokens(&mut self) {
-        for (id, user) in &self.users {
-            self.tokens.insert(*id, user.to_tokens());
-        }
-    }
-
-    fn recompute_scores(&mut self, classifier: &Classifier) {
-        for (id, tokens) in &self.tokens {
-            self.score.insert(*id, classifier.score(tokens));
-        }
-    }
-
-    fn from_path(path: &Path, classifier: &Classifier) -> anyhow::Result<Db> {
-        let file = File::open(path)?;
-        let (users, is_spam) = serde_json::from_reader(BufReader::new(file))?;
-        let mut db = Db {
-            users,
-            is_spam,
-            tokens: HashMap::new(),
-            score: HashMap::new(),
-        };
-        db.recompute_tokens();
-        db.recompute_scores(classifier);
-        Ok(db)
-    }
-
-    fn from_users(
-        users: HashMap<UserId, UserData>,
-        is_spam: HashMap<UserId, bool>,
-        classifier: &Classifier,
-    ) -> Db {
-        let mut db = Db {
-            users,
-            is_spam,
-            tokens: HashMap::new(),
-            score: HashMap::new(),
-        };
-        db.recompute_tokens();
-        db.recompute_scores(classifier);
-        db
-    }
-
-    fn store_to_path(&self, path: &Path) -> anyhow::Result<()> {
-        let file = File::create(path)?;
-        let dat: (&HashMap<UserId, UserData>, &HashMap<UserId, bool>) =
-            (&self.users, &self.is_spam);
-        serde_json::to_writer(BufWriter::new(file), &dat)?;
-        Ok(())
-    }
-
-    fn unclassified_users<'a>(&'a self) -> Vec<(&'a UserId, &'a UserData)> {
-        self.users
-            .iter()
-            .filter(|(user_id, _)| !self.is_spam.contains_key(&user_id))
-            .collect()
-    }
-}
-
-async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Repository>> {
-    let mut repos = Vec::new();
-    let mut query = forgejo_api::structs::RepoSearchQuery::default();
-    query.limit = Some(50);
-    let mut page: u32 = 1;
-    loop {
-        query.page = Some(page);
-        let resp = forge.repo_search(query.clone()).await?;
-        match (resp.ok, resp.data) {
-            (Some(true), Some(mut query_repos)) => {
-                if query_repos.is_empty() {
-                    break;
-                }
-                repos.append(&mut query_repos);
-            }
-            _ => todo!("scrape_repos: implement retries"),
-        }
-        page += 1;
-        sleep(Duration::from_millis(100)).await;
-    }
-    Ok(repos)
-}
-
-async fn scrape_issues(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Issue>> {
-    let mut issues = Vec::new();
-    let mut query = forgejo_api::structs::IssueSearchIssuesQuery::default();
-    query.limit = Some(50);
-    let mut page: u32 = 1;
-    loop {
-        query.page = Some(page);
-        let mut resp = forge.issue_search_issues(query.clone()).await?;
-        if resp.is_empty() {
-            break;
-        }
-        issues.append(&mut resp);
-        page += 1;
-        sleep(Duration::from_millis(100)).await;
-    }
-    Ok(issues)
-}
-
-async fn scrape_users(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::User>> {
-    let mut users = Vec::new();
-    let mut query = forgejo_api::structs::UserSearchQuery::default();
-    query.limit = Some(50);
-    let mut page: u32 = 1;
-    loop {
-        query.page = Some(page);
-        let resp = forge.user_search(query.clone()).await?;
-        match (resp.ok, resp.data) {
-            (Some(true), Some(mut query_repos)) => {
-                if query_repos.is_empty() {
-                    break;
-                }
-                users.append(&mut query_repos);
-            }
-            _ => todo!("scrape_users: implement retries"),
-        }
-        page += 1;
-        sleep(Duration::from_millis(100)).await;
-    }
-    Ok(users)
-}
-
-async fn get_users_data(forge: &Forgejo) -> anyhow::Result<HashMap<UserId, UserData>> {
-    let mut data = HashMap::new();
-
-    let discard_empty = |o: Option<String>| {
-        match o {
-            None => None,
-            Some(s) if s.is_empty() => None,
-            Some(s) => Some(s),
-        }
-    };
-
-    eprintln!("Fetching users...");
-    for user in scrape_users(&forge).await? {
-        let Some(id) = user.id else {
-            eprintln!("WARN: user with no id");
-            continue;
-        };
-        let Some(login) = user.login else {
-            eprintln!("WARN: missing login for user {id}");
-            continue;
-        };
-
-        // TODO: fetch those from the admin API instead
-        let Some(email) = user.email else {
-            eprintln!("WARN: missing email for user {id}");
-            continue;
-        };
-
-        data.insert(
-            UserId(id),
-            UserData {
-                login,
-                email,
-                full_name: discard_empty(user.full_name),
-                location: discard_empty(user.location),
-                website: discard_empty(user.website),
-                description: discard_empty(user.description),
-                repos: Vec::new(),
-                issues: Vec::new(),
-            },
-        );
-    }
-
-    eprintln!("Fetching repos...");
-    for repo in scrape_repos(&forge).await? {
-        let Some(id) = repo.id else {
-            eprintln!("WARN: repo with no id");
-            continue;
-        };
-        let Some(owner) = repo.owner else {
-            eprintln!("WARN: repo {} with no owner", id);
-            continue;
-        };
-        let Some(owner_id) = owner.id else {
-            eprintln!("WARN: owner for repo {} has no id", id);
-            continue;
-        };
-        let Some(repo_name) = repo.name else {
-            eprintln!("WARN: repo {} has no name", id);
-            continue;
-        };
-        let Some(forge_owner) = data.get_mut(&UserId(owner_id)) else {
-            // this currently happens for repos owned by organizations
-            eprintln!(
-                "WARN: repo owner {} for repo {} is not in database",
-                owner.login.unwrap_or_default(),
-                repo_name
-            );
-            continue;
-        };
-        forge_owner.repos.push((
-            RepoId(id),
-            RepoData {
-                name: repo_name,
-                description: discard_empty(repo.description),
-            },
-        ));
-    }
-
-    eprintln!("Fetching issues...");
-    for issue in scrape_issues(&forge).await? {
-        let Some(id) = issue.id else {
-            eprintln!("WARN: issue with no id");
-            continue;
-        };
-        let Some(user) = issue.user else {
-            eprintln!("WARN: issue {} has no owner", id);
-            continue;
-        };
-        let Some(user_id) = user.id else {
-            eprintln!("WARN: user for issue {} has no id", id);
-            continue;
-        };
-        let Some(forge_user) = data.get_mut(&UserId(user_id)) else {
-            eprintln!(
-                "WARN: issue user {} {} for issue {} is not in database",
-                user.login.unwrap_or_default(),
-                user_id,
-                issue
-                    .html_url
-                    .map_or(String::from(""), |url| url.as_str().to_string())
-            );
-            continue;
-        };
-        forge_user.issues.push((
-            IssueId(id),
-            IssueData {
-                title: issue.title.unwrap_or_default(),
-                body: issue.body.unwrap_or_default(),
-            },
-        ));
-    }
-
-    // discard users with an entirely empty profile: there is nothing useful we
-    // can say about them
-    let data = data
-        .into_iter()
-        .filter(|(_, user)| !user.is_empty())
-        .collect();
-    Ok(data)
-}
+use data::*;
+use db::Db;
 
 async fn load_db() -> anyhow::Result<(Db, Classifier)> {
     let model_path = Path::new("model.json");
@@ -374,7 +38,7 @@ async fn load_db() -> anyhow::Result<(Db, Classifier)> {
     let db: Db = if db_path.is_file() {
         Db::from_path(db_path, &classifier)?
     } else {
-        let db = Db::from_users(get_users_data(&forge).await?, HashMap::new(), &classifier);
+        let db = Db::from_users(scrape::get_users_data(&forge).await?, HashMap::new(), &classifier);
         db.store_to_path(db_path)?;
         db
     };
diff --git a/src/scrape.rs b/src/scrape.rs
new file mode 100644
index 0000000..c02f166
--- /dev/null
+++ b/src/scrape.rs
@@ -0,0 +1,190 @@
+use forgejo_api::Forgejo;
+use tokio::time::{sleep, Duration};
+use std::collections::HashMap;
+
+use crate::data::*;
+
+async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Repository>> {
+    let mut repos = Vec::new();
+    let mut query = forgejo_api::structs::RepoSearchQuery::default();
+    query.limit = Some(50);
+    let mut page: u32 = 1;
+    loop {
+        query.page = Some(page);
+        let resp = forge.repo_search(query.clone()).await?;
+        match (resp.ok, resp.data) {
+            (Some(true), Some(mut query_repos)) => {
+                if query_repos.is_empty() {
+                    break;
+                }
+                repos.append(&mut query_repos);
+            }
+            _ => todo!("scrape_repos: implement retries"),
+        }
+        page += 1;
+        sleep(Duration::from_millis(100)).await;
+    }
+    Ok(repos)
+}
+
+async fn scrape_issues(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Issue>> {
+    let mut issues = Vec::new();
+    let mut query = forgejo_api::structs::IssueSearchIssuesQuery::default();
+    query.limit = Some(50);
+    let mut page: u32 = 1;
+    loop {
+        query.page = Some(page);
+        let mut resp = forge.issue_search_issues(query.clone()).await?;
+        if resp.is_empty() {
+            break;
+        }
+        issues.append(&mut resp);
+        page += 1;
+        sleep(Duration::from_millis(100)).await;
+    }
+    Ok(issues)
+}
+
+async fn scrape_users(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::User>> {
+    let mut users = Vec::new();
+    let mut query = forgejo_api::structs::UserSearchQuery::default();
+    query.limit = Some(50);
+    let mut page: u32 = 1;
+    loop {
+        query.page = Some(page);
+        let resp = forge.user_search(query.clone()).await?;
+        match (resp.ok, resp.data) {
+            (Some(true), Some(mut query_repos)) => {
+                if query_repos.is_empty() {
+                    break;
+                }
+                users.append(&mut query_repos);
+            }
+            _ => todo!("scrape_users: implement retries"),
+        }
+        page += 1;
+        sleep(Duration::from_millis(100)).await;
+    }
+    Ok(users)
+}
+
+pub async fn get_users_data(forge: &Forgejo) -> anyhow::Result<HashMap<UserId, UserData>> {
+    let mut data = HashMap::new();
+
+    let discard_empty = |o: Option<String>| {
+        match o {
+            None => None,
+            Some(s) if s.is_empty() => None,
+            Some(s) => Some(s),
+        }
+    };
+
+    eprintln!("Fetching users...");
+    for user in scrape_users(&forge).await? {
+        let Some(id) = user.id else {
+            eprintln!("WARN: user with no id");
+            continue;
+        };
+        let Some(login) = user.login else {
+            eprintln!("WARN: missing login for user {id}");
+            continue;
+        };
+
+        let Some(email) = user.email else {
+            eprintln!("WARN: missing email for user {id}");
+            continue;
+        };
+
+        data.insert(
+            UserId(id),
+            UserData {
+                login,
+                email,
+                full_name: discard_empty(user.full_name),
+                location: discard_empty(user.location),
+                website: discard_empty(user.website),
+                description: discard_empty(user.description),
+                repos: Vec::new(),
+                issues: Vec::new(),
+            },
+        );
+    }
+
+    eprintln!("Fetching repos...");
+    for repo in scrape_repos(&forge).await? {
+        let Some(id) = repo.id else {
+            eprintln!("WARN: repo with no id");
+            continue;
+        };
+        let Some(owner) = repo.owner else {
+            eprintln!("WARN: repo {} with no owner", id);
+            continue;
+        };
+        let Some(owner_id) = owner.id else {
+            eprintln!("WARN: owner for repo {} has no id", id);
+            continue;
+        };
+        let Some(repo_name) = repo.name else {
+            eprintln!("WARN: repo {} has no name", id);
+            continue;
+        };
+        let Some(forge_owner) = data.get_mut(&UserId(owner_id)) else {
+            // this currently happens for repos owned by organizations
+            eprintln!(
+                "WARN: repo owner {} for repo {} is not in database",
+                owner.login.unwrap_or_default(),
+                repo_name
+            );
+            continue;
+        };
+        forge_owner.repos.push((
+            RepoId(id),
+            RepoData {
+                name: repo_name,
+                description: discard_empty(repo.description),
+            },
+        ));
+    }
+
+    eprintln!("Fetching issues...");
+    for issue in scrape_issues(&forge).await? {
+        let Some(id) = issue.id else {
+            eprintln!("WARN: issue with no id");
+            continue;
+        };
+        let Some(user) = issue.user else {
+            eprintln!("WARN: issue {} has no owner", id);
+            continue;
+        };
+        let Some(user_id) = user.id else {
+            eprintln!("WARN: user for issue {} has no id", id);
+            continue;
+        };
+        let Some(forge_user) = data.get_mut(&UserId(user_id)) else {
+            eprintln!(
+                "WARN: issue user {} {} for issue {} is not in database",
+                user.login.unwrap_or_default(),
+                user_id,
+                issue
+                    .html_url
+                    .map_or(String::from(""), |url| url.as_str().to_string())
+            );
+            continue;
+        };
+        forge_user.issues.push((
+            IssueId(id),
+            IssueData {
+                title: issue.title.unwrap_or_default(),
+                body: issue.body.unwrap_or_default(),
+            },
+        ));
+    }
+
+    // discard users with an entirely empty profile: there is nothing useful we
+    // can say about them
+    let data = data
+        .into_iter()
+        .filter(|(_, user)| !user.is_empty())
+        .collect();
+    Ok(data)
+}
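
Note on the todo!("scrape_repos: implement retries") and todo!("scrape_users:
implement retries") placeholders in src/scrape.rs: as written, a single search
response that does not carry ok = Some(true) together with a data payload
panics the whole scrape. A minimal retry sketch for one page of repo_search,
assuming a bounded retry count and a fixed backoff (both hypothetical, not
part of this patch):

    let mut retries_left = 3u32;
    let page_repos = loop {
        let resp = forge.repo_search(query.clone()).await?;
        match (resp.ok, resp.data) {
            (Some(true), Some(repos)) => break repos,
            _ if retries_left > 0 => {
                // re-request the same page after a longer pause than the
                // usual 100ms inter-page delay
                retries_left -= 1;
                sleep(Duration::from_millis(500)).await;
            }
            _ => anyhow::bail!("repo_search: page {page} failed after retries"),
        }
    };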