refactoring: split off parts of main.rs into auxiliary files

This commit is contained in:
Armaël Guéneau 2024-12-18 09:28:07 +01:00
parent 483b9860b0
commit f801c26d34
4 changed files with 363 additions and 344 deletions

89
src/data.rs Normal file
View file

@ -0,0 +1,89 @@
use serde::{Deserialize, Serialize};
use crate::classifier::Classifier;
/// Newtype over the forge's numeric user id.
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
pub struct UserId(pub i64);
/// Profile data scraped for one user, together with the repos and issues
/// they own. This is the raw material the spam classifier tokenizes.
#[derive(Debug, Serialize, Deserialize)]
pub struct UserData {
pub login: String,
pub email: String,
// Optional fields are `None` when unset on the forge (empty strings are
// normalized to `None` at scrape time).
pub full_name: Option<String>,
pub location: Option<String>,
pub website: Option<String>,
pub description: Option<String>,
// TODO: visibility
pub repos: Vec<(RepoId, RepoData)>,
pub issues: Vec<(IssueId, IssueData)>,
}
/// Newtype over the forge's numeric repository id.
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
pub struct RepoId(pub i64);
/// The repository fields used as classifier input.
#[derive(Debug, Serialize, Deserialize)]
pub struct RepoData {
pub name: String,
pub description: Option<String>,
}
/// Newtype over the forge's numeric issue id.
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
pub struct IssueId(pub i64);
/// The issue fields used as classifier input.
#[derive(Debug, Serialize, Deserialize)]
pub struct IssueData {
pub title: String,
pub body: String,
}
impl UserData {
pub fn is_empty(&self) -> bool {
self.full_name.is_none()
&& self.location.is_none()
&& self.website.is_none()
&& self.description.is_none()
&& self.repos.is_empty()
&& self.issues.is_empty()
}
pub fn to_tokens(&self) -> Vec<String> {
let mut text = String::new();
let mut add = |s: &str| {
text += s;
text += " "
};
for email_part in self.email.split('@') {
add(email_part)
}
match &self.location {
Some(s) => add(&s),
None => add("__NO_LOCATION__"),
}
match &self.website {
Some(s) => add(&s),
None => add("__NO_WEBSITE__"),
}
match &self.description {
Some(s) => add(&s),
None => add("__NO_USER_DESCRIPTION__"),
}
for (_id, repo) in &self.repos {
add(&repo.name);
match &repo.description {
Some(s) => add(s),
None => add("__NO_REPO_DESCRIPTION__"),
}
}
for (_id, issue) in &self.issues {
add(&issue.title);
add(&issue.body);
}
Classifier::into_word_list(&text)
}
}

76
src/db.rs Normal file
View file

@ -0,0 +1,76 @@
use std::collections::HashMap;
use std::path::Path;
use std::fs::File;
use std::io::{BufReader, BufWriter};
use crate::data::*;
use crate::classifier::Classifier;
// TODO (?): make the fields private and provide an API that automatically
// recomputes the caches when necessary?
/// In-memory database of scraped users plus human spam labels, with two
/// derived caches that must be recomputed whenever the persisted data changes.
pub struct Db {
// persisted data
pub users: HashMap<UserId, UserData>,
// human-assigned label: present only once a user has been classified
pub is_spam: HashMap<UserId, bool>,
// caches: computed from persisted data on load
// classifier spam score per user
pub score: HashMap<UserId, f32>,
// tokenized profile per user (classifier input)
pub tokens: HashMap<UserId, Vec<String>>,
}
impl Db {
pub fn recompute_tokens(&mut self) {
for (id, user) in &self.users {
self.tokens.insert(*id, user.to_tokens());
}
}
pub fn recompute_scores(&mut self, classifier: &Classifier) {
for (id, tokens) in &self.tokens {
self.score.insert(*id, classifier.score(tokens));
}
}
pub fn from_path(path: &Path, classifier: &Classifier) -> anyhow::Result<Self> {
let file = File::open(path)?;
let (users, is_spam) = serde_json::from_reader(BufReader::new(file))?;
let mut db = Db {
users,
is_spam,
tokens: HashMap::new(),
score: HashMap::new(),
};
db.recompute_tokens();
db.recompute_scores(classifier);
Ok(db)
}
pub fn from_users(
users: HashMap<UserId, UserData>,
is_spam: HashMap<UserId, bool>,
classifier: &Classifier,
) -> Db {
let mut db = Db {
users,
is_spam,
tokens: HashMap::new(),
score: HashMap::new(),
};
db.recompute_tokens();
db.recompute_scores(classifier);
db
}
pub fn store_to_path(&self, path: &Path) -> anyhow::Result<()> {
let file = File::create(path)?;
let dat: (&HashMap<UserId, UserData>, &HashMap<UserId, bool>) =
(&self.users, &self.is_spam);
serde_json::to_writer(BufWriter::new(file), &dat)?;
Ok(())
}
pub fn unclassified_users<'a>(&'a self) -> Vec<(&'a UserId, &'a UserData)> {
self.users
.iter()
.filter(|(user_id, _)| !self.is_spam.contains_key(&user_id))
.collect()
}
}

View file

@ -2,357 +2,21 @@ use actix_web::{get, post, web, App, HttpResponse, HttpServer, Responder};
use forgejo_api::{Auth, Forgejo};
use lazy_static::lazy_static;
use rand::prelude::*;
use serde::{Deserialize, Serialize};
use serde::Deserialize;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, BufWriter};
use std::path::Path;
use std::sync::Mutex;
use tera::Tera;
use tokio::time::{sleep, Duration};
mod classifier;
mod data;
mod scrape;
mod db;
use classifier::Classifier;
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
struct RepoId(i64);
#[derive(Debug, Serialize, Deserialize)]
struct RepoData {
name: String,
description: Option<String>,
}
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
struct IssueId(i64);
#[derive(Debug, Serialize, Deserialize)]
struct IssueData {
title: String,
body: String,
}
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
struct UserId(i64);
#[derive(Debug, Serialize, Deserialize)]
struct UserData {
login: String,
email: String,
full_name: Option<String>,
location: Option<String>,
website: Option<String>,
description: Option<String>,
// TODO: visibility
repos: Vec<(RepoId, RepoData)>,
issues: Vec<(IssueId, IssueData)>,
}
struct Db {
users: HashMap<UserId, UserData>,
is_spam: HashMap<UserId, bool>,
// caches: derived from the rest
score: HashMap<UserId, f32>,
tokens: HashMap<UserId, Vec<String>>,
}
impl UserData {
fn is_empty(&self) -> bool {
self.full_name.is_none()
&& self.location.is_none()
&& self.website.is_none()
&& self.description.is_none()
&& self.repos.is_empty()
&& self.issues.is_empty()
}
fn to_tokens(&self) -> Vec<String> {
let mut text = String::new();
let mut add = |s: &str| {
text += s;
text += " "
};
for email_part in self.email.split('@') {
add(email_part)
}
match &self.location {
Some(s) => add(&s),
None => add("__NO_LOCATION__"),
}
match &self.website {
Some(s) => add(&s),
None => add("__NO_WEBSITE__"),
}
match &self.description {
Some(s) => add(&s),
None => add("__NO_USER_DESCRIPTION__"),
}
for (_id, repo) in &self.repos {
add(&repo.name);
match &repo.description {
Some(s) => add(s),
None => add("__NO_REPO_DESCRIPTION__"),
}
}
for (_id, issue) in &self.issues {
add(&issue.title);
add(&issue.body);
}
Classifier::into_word_list(&text)
}
}
impl Db {
fn recompute_tokens(&mut self) {
for (id, user) in &self.users {
self.tokens.insert(*id, user.to_tokens());
}
}
fn recompute_scores(&mut self, classifier: &Classifier) {
for (id, tokens) in &self.tokens {
self.score.insert(*id, classifier.score(tokens));
}
}
fn from_path(path: &Path, classifier: &Classifier) -> anyhow::Result<Self> {
let file = File::open(path)?;
let (users, is_spam) = serde_json::from_reader(BufReader::new(file))?;
let mut db = Db {
users,
is_spam,
tokens: HashMap::new(),
score: HashMap::new(),
};
db.recompute_tokens();
db.recompute_scores(classifier);
Ok(db)
}
fn from_users(
users: HashMap<UserId, UserData>,
is_spam: HashMap<UserId, bool>,
classifier: &Classifier,
) -> Db {
let mut db = Db {
users,
is_spam,
tokens: HashMap::new(),
score: HashMap::new(),
};
db.recompute_tokens();
db.recompute_scores(classifier);
db
}
fn store_to_path(&self, path: &Path) -> anyhow::Result<()> {
let file = File::create(path)?;
let dat: (&HashMap<UserId, UserData>, &HashMap<UserId, bool>) =
(&self.users, &self.is_spam);
serde_json::to_writer(BufWriter::new(file), &dat)?;
Ok(())
}
fn unclassified_users<'a>(&'a self) -> Vec<(&'a UserId, &'a UserData)> {
self.users
.iter()
.filter(|(user_id, _)| !self.is_spam.contains_key(&user_id))
.collect()
}
}
async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Repository>> {
let mut repos = Vec::new();
let mut query = forgejo_api::structs::RepoSearchQuery::default();
query.limit = Some(50);
let mut page: u32 = 1;
loop {
query.page = Some(page);
let resp = forge.repo_search(query.clone()).await?;
match (resp.ok, resp.data) {
(Some(true), Some(mut query_repos)) => {
if query_repos.is_empty() {
break;
}
repos.append(&mut query_repos);
}
_ => todo!("scrape_repos: implement retries"),
}
page += 1;
sleep(Duration::from_millis(100)).await;
}
Ok(repos)
}
async fn scrape_issues(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Issue>> {
let mut issues = Vec::new();
let mut query = forgejo_api::structs::IssueSearchIssuesQuery::default();
query.limit = Some(50);
let mut page: u32 = 1;
loop {
query.page = Some(page);
let mut resp = forge.issue_search_issues(query.clone()).await?;
if resp.is_empty() {
break;
}
issues.append(&mut resp);
page += 1;
sleep(Duration::from_millis(100)).await;
}
Ok(issues)
}
async fn scrape_users(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::User>> {
let mut users = Vec::new();
let mut query = forgejo_api::structs::UserSearchQuery::default();
query.limit = Some(50);
let mut page: u32 = 1;
loop {
query.page = Some(page);
let resp = forge.user_search(query.clone()).await?;
match (resp.ok, resp.data) {
(Some(true), Some(mut query_repos)) => {
if query_repos.is_empty() {
break;
}
users.append(&mut query_repos);
}
_ => todo!("scrape_users: implement retries"),
}
page += 1;
sleep(Duration::from_millis(100)).await;
}
Ok(users)
}
async fn get_users_data(forge: &Forgejo) -> anyhow::Result<HashMap<UserId, UserData>> {
let mut data = HashMap::new();
let discard_empty = |o: Option<String>| {
match o {
None => None,
Some(s) if s.is_empty() => None,
Some(s) => Some(s),
}
};
eprintln!("Fetching users...");
for user in scrape_users(&forge).await? {
let Some(id) = user.id else {
eprintln!("WARN: user with no id");
continue;
};
let Some(login) = user.login else {
eprintln!("WARN: missing login for user {id}");
continue;
};
// TODO: fetch those from the admin API instead
let Some(email) = user.email else {
eprintln!("WARN: missing email for user {id}");
continue;
};
data.insert(
UserId(id),
UserData {
login,
email,
full_name: discard_empty(user.full_name),
location: discard_empty(user.location),
website: discard_empty(user.website),
description: discard_empty(user.description),
repos: Vec::new(),
issues: Vec::new(),
},
);
}
eprintln!("Fetching repos...");
for repo in scrape_repos(&forge).await? {
let Some(id) = repo.id else {
eprintln!("WARN: repo with no id");
continue;
};
let Some(owner) = repo.owner else {
eprintln!("WARN: repo {} with no owner", id);
continue;
};
let Some(owner_id) = owner.id else {
eprintln!("WARN: owner for repo {} has no id", id);
continue;
};
let Some(repo_name) = repo.name else {
eprintln!("WARN: repo {} has no name", id);
continue;
};
let Some(forge_owner) = data.get_mut(&UserId(owner_id)) else {
// this currently happens for repos owned by organizations
eprintln!(
"WARN: repo owner {} for repo {} is not in database",
owner.login.unwrap_or_default(),
repo_name
);
continue;
};
forge_owner.repos.push((
RepoId(id),
RepoData {
name: repo_name,
description: discard_empty(repo.description),
},
));
}
eprintln!("Fetching issues...");
for issue in scrape_issues(&forge).await? {
let Some(id) = issue.id else {
eprintln!("WARN: issue with no id");
continue;
};
let Some(user) = issue.user else {
eprintln!("WARN: issue {} has no owner", id);
continue;
};
let Some(user_id) = user.id else {
eprintln!("WARN: user for issue {} has no id", id);
continue;
};
let Some(forge_user) = data.get_mut(&UserId(user_id)) else {
eprintln!(
"WARN: issue user {} {} for issue {} is not in database",
user.login.unwrap_or_default(),
user_id,
issue
.html_url
.map_or(String::from(""), |url| url.as_str().to_string())
);
continue;
};
forge_user.issues.push((
IssueId(id),
IssueData {
title: issue.title.unwrap_or_default(),
body: issue.body.unwrap_or_default(),
},
));
}
// discard users with an entirely empty profile: there is nothing useful we
// can say about them
let data = data
.into_iter()
.filter(|(_, user)| !user.is_empty())
.collect();
Ok(data)
}
use data::*;
use db::Db;
async fn load_db() -> anyhow::Result<(Db, Classifier)> {
let model_path = Path::new("model.json");
@ -374,7 +38,7 @@ async fn load_db() -> anyhow::Result<(Db, Classifier)> {
let db: Db = if db_path.is_file() {
Db::from_path(db_path, &classifier)?
} else {
let db = Db::from_users(get_users_data(&forge).await?, HashMap::new(), &classifier);
let db = Db::from_users(scrape::get_users_data(&forge).await?, HashMap::new(), &classifier);
db.store_to_path(db_path)?;
db
};

190
src/scrape.rs Normal file
View file

@ -0,0 +1,190 @@
use forgejo_api::Forgejo;
use tokio::time::{sleep, Duration};
use std::collections::HashMap;
use crate::data::*;
/// Fetches every repository on the forge, 50 per page, stopping at the
/// first empty page.
async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Repository>> {
    let mut repos = Vec::new();
    let mut query = forgejo_api::structs::RepoSearchQuery {
        limit: Some(50),
        ..Default::default()
    };
    let mut page: u32 = 1;
    loop {
        query.page = Some(page);
        let resp = forge.repo_search(query.clone()).await?;
        match (resp.ok, resp.data) {
            (Some(true), Some(mut page_repos)) => {
                if page_repos.is_empty() {
                    break;
                }
                repos.append(&mut page_repos);
            }
            _ => todo!("scrape_repos: implement retries"),
        }
        page += 1;
        // Small delay between pages to avoid hammering the server.
        sleep(Duration::from_millis(100)).await;
    }
    Ok(repos)
}
/// Fetches every issue on the forge, 50 per page, stopping at the first
/// empty page.
async fn scrape_issues(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Issue>> {
    let mut issues = Vec::new();
    let mut query = forgejo_api::structs::IssueSearchIssuesQuery {
        limit: Some(50),
        ..Default::default()
    };
    let mut page: u32 = 1;
    loop {
        query.page = Some(page);
        let mut resp = forge.issue_search_issues(query.clone()).await?;
        if resp.is_empty() {
            break;
        }
        issues.append(&mut resp);
        page += 1;
        // Small delay between pages to avoid hammering the server.
        sleep(Duration::from_millis(100)).await;
    }
    Ok(issues)
}
/// Fetches every user on the forge, 50 per page, stopping at the first
/// empty page.
async fn scrape_users(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::User>> {
    let mut users = Vec::new();
    let mut query = forgejo_api::structs::UserSearchQuery {
        limit: Some(50),
        ..Default::default()
    };
    let mut page: u32 = 1;
    loop {
        query.page = Some(page);
        let resp = forge.user_search(query.clone()).await?;
        match (resp.ok, resp.data) {
            // NOTE: the original binding was named `query_repos` (copy-paste
            // from scrape_repos); these are users.
            (Some(true), Some(mut page_users)) => {
                if page_users.is_empty() {
                    break;
                }
                users.append(&mut page_users);
            }
            _ => todo!("scrape_users: implement retries"),
        }
        page += 1;
        // Small delay between pages to avoid hammering the server.
        sleep(Duration::from_millis(100)).await;
    }
    Ok(users)
}
/// Scrapes all users, repos and issues from the forge and assembles a
/// per-user profile map keyed by `UserId`.
///
/// Repos and issues are attached to their owner's entry; items whose owner
/// cannot be resolved (e.g. repos owned by organizations) are skipped with a
/// warning on stderr. Users whose profile is entirely empty are dropped from
/// the result, since there is nothing useful to classify about them.
pub async fn get_users_data(forge: &Forgejo) -> anyhow::Result<HashMap<UserId, UserData>> {
    let mut data = HashMap::new();
    // Normalize scraped fields: an empty string carries no information, so
    // treat it the same as an absent value.
    let discard_empty = |o: Option<String>| o.filter(|s| !s.is_empty());
    eprintln!("Fetching users...");
    for user in scrape_users(forge).await? {
        let Some(id) = user.id else {
            eprintln!("WARN: user with no id");
            continue;
        };
        let Some(login) = user.login else {
            eprintln!("WARN: missing login for user {id}");
            continue;
        };
        let Some(email) = user.email else {
            eprintln!("WARN: missing email for user {id}");
            continue;
        };
        data.insert(
            UserId(id),
            UserData {
                login,
                email,
                full_name: discard_empty(user.full_name),
                location: discard_empty(user.location),
                website: discard_empty(user.website),
                description: discard_empty(user.description),
                repos: Vec::new(),
                issues: Vec::new(),
            },
        );
    }
    eprintln!("Fetching repos...");
    for repo in scrape_repos(forge).await? {
        let Some(id) = repo.id else {
            eprintln!("WARN: repo with no id");
            continue;
        };
        let Some(owner) = repo.owner else {
            eprintln!("WARN: repo {} with no owner", id);
            continue;
        };
        let Some(owner_id) = owner.id else {
            eprintln!("WARN: owner for repo {} has no id", id);
            continue;
        };
        let Some(repo_name) = repo.name else {
            eprintln!("WARN: repo {} has no name", id);
            continue;
        };
        let Some(forge_owner) = data.get_mut(&UserId(owner_id)) else {
            // this currently happens for repos owned by organizations
            eprintln!(
                "WARN: repo owner {} for repo {} is not in database",
                owner.login.unwrap_or_default(),
                repo_name
            );
            continue;
        };
        forge_owner.repos.push((
            RepoId(id),
            RepoData {
                name: repo_name,
                description: discard_empty(repo.description),
            },
        ));
    }
    eprintln!("Fetching issues...");
    for issue in scrape_issues(forge).await? {
        let Some(id) = issue.id else {
            eprintln!("WARN: issue with no id");
            continue;
        };
        let Some(user) = issue.user else {
            eprintln!("WARN: issue {} has no owner", id);
            continue;
        };
        let Some(user_id) = user.id else {
            eprintln!("WARN: user for issue {} has no id", id);
            continue;
        };
        let Some(forge_user) = data.get_mut(&UserId(user_id)) else {
            eprintln!(
                "WARN: issue user {} {} for issue {} is not in database",
                user.login.unwrap_or_default(),
                user_id,
                issue
                    .html_url
                    .map_or(String::from(""), |url| url.as_str().to_string())
            );
            continue;
        };
        forge_user.issues.push((
            IssueId(id),
            IssueData {
                title: issue.title.unwrap_or_default(),
                body: issue.body.unwrap_or_default(),
            },
        ));
    }
    // discard users with an entirely empty profile: there is nothing useful we
    // can say about them
    let data = data
        .into_iter()
        .filter(|(_, user)| !user.is_empty())
        .collect();
    Ok(data)
}