refactoring: split off parts of main.rs into auxiliary files
parent 483b9860b0
commit f801c26d34
4 changed files with 363 additions and 344 deletions
89  src/data.rs  Normal file
@@ -0,0 +1,89 @@
+use serde::{Deserialize, Serialize};
+use crate::classifier::Classifier;
+
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
+pub struct UserId(pub i64);
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct UserData {
+    pub login: String,
+    pub email: String,
+    pub full_name: Option<String>,
+    pub location: Option<String>,
+    pub website: Option<String>,
+    pub description: Option<String>,
+    // TODO: visibility
+    pub repos: Vec<(RepoId, RepoData)>,
+    pub issues: Vec<(IssueId, IssueData)>,
+}
+
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
+pub struct RepoId(pub i64);
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct RepoData {
+    pub name: String,
+    pub description: Option<String>,
+}
+
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
+pub struct IssueId(pub i64);
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct IssueData {
+    pub title: String,
+    pub body: String,
+}
+
+impl UserData {
+    pub fn is_empty(&self) -> bool {
+        self.full_name.is_none()
+            && self.location.is_none()
+            && self.website.is_none()
+            && self.description.is_none()
+            && self.repos.is_empty()
+            && self.issues.is_empty()
+    }
+
+    pub fn to_tokens(&self) -> Vec<String> {
+        let mut text = String::new();
+        let mut add = |s: &str| {
+            text += s;
+            text += " "
+        };
+
+        for email_part in self.email.split('@') {
+            add(email_part)
+        }
+
+        match &self.location {
+            Some(s) => add(&s),
+            None => add("__NO_LOCATION__"),
+        }
+
+        match &self.website {
+            Some(s) => add(&s),
+            None => add("__NO_WEBSITE__"),
+        }
+
+        match &self.description {
+            Some(s) => add(&s),
+            None => add("__NO_USER_DESCRIPTION__"),
+        }
+
+        for (_id, repo) in &self.repos {
+            add(&repo.name);
+            match &repo.description {
+                Some(s) => add(s),
+                None => add("__NO_REPO_DESCRIPTION__"),
+            }
+        }
+
+        for (_id, issue) in &self.issues {
+            add(&issue.title);
+            add(&issue.body);
+        }
+
+        Classifier::into_word_list(&text)
+    }
+}
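The new data module is plain data plus two helpers, so it can be exercised in isolation. Below is a minimal sketch, not part of the commit (the function name is illustrative), that checks is_empty and round-trips a UserData through the same serde derives that Db::store_to_path relies on:

    use crate::data::{IssueData, IssueId, UserData};

    fn example_roundtrip() -> anyhow::Result<()> {
        let user = UserData {
            login: "alice".to_string(),
            email: "alice@example.org".to_string(),
            full_name: None,
            location: None,
            website: None,
            description: Some("hello".to_string()),
            repos: Vec::new(),
            issues: vec![(
                IssueId(1),
                IssueData {
                    title: "bug".to_string(),
                    body: "it breaks".to_string(),
                },
            )],
        };
        // not empty: description and issues are set
        assert!(!user.is_empty());
        // UserData derives Serialize/Deserialize, so it round-trips as JSON
        let json = serde_json::to_string(&user)?;
        let back: UserData = serde_json::from_str(&json)?;
        assert_eq!(back.login, "alice");
        Ok(())
    }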
76  src/db.rs  Normal file
@@ -0,0 +1,76 @@
+use std::collections::HashMap;
+use std::path::Path;
+use std::fs::File;
+use std::io::{BufReader, BufWriter};
+use crate::data::*;
+use crate::classifier::Classifier;
+
+// TODO (?): make the fields private and provide an API that automatically
+// recomputes the caches when necessary?
+pub struct Db {
+    // persisted data
+    pub users: HashMap<UserId, UserData>,
+    pub is_spam: HashMap<UserId, bool>,
+    // caches: computed from persisted data on load
+    pub score: HashMap<UserId, f32>,
+    pub tokens: HashMap<UserId, Vec<String>>,
+}
+
+impl Db {
+    pub fn recompute_tokens(&mut self) {
+        for (id, user) in &self.users {
+            self.tokens.insert(*id, user.to_tokens());
+        }
+    }
+
+    pub fn recompute_scores(&mut self, classifier: &Classifier) {
+        for (id, tokens) in &self.tokens {
+            self.score.insert(*id, classifier.score(tokens));
+        }
+    }
+
+    pub fn from_path(path: &Path, classifier: &Classifier) -> anyhow::Result<Self> {
+        let file = File::open(path)?;
+        let (users, is_spam) = serde_json::from_reader(BufReader::new(file))?;
+        let mut db = Db {
+            users,
+            is_spam,
+            tokens: HashMap::new(),
+            score: HashMap::new(),
+        };
+        db.recompute_tokens();
+        db.recompute_scores(classifier);
+        Ok(db)
+    }
+
+    pub fn from_users(
+        users: HashMap<UserId, UserData>,
+        is_spam: HashMap<UserId, bool>,
+        classifier: &Classifier,
+    ) -> Db {
+        let mut db = Db {
+            users,
+            is_spam,
+            tokens: HashMap::new(),
+            score: HashMap::new(),
+        };
+        db.recompute_tokens();
+        db.recompute_scores(classifier);
+        db
+    }
+
+    pub fn store_to_path(&self, path: &Path) -> anyhow::Result<()> {
+        let file = File::create(path)?;
+        let dat: (&HashMap<UserId, UserData>, &HashMap<UserId, bool>) =
+            (&self.users, &self.is_spam);
+        serde_json::to_writer(BufWriter::new(file), &dat)?;
+        Ok(())
+    }
+
+    pub fn unclassified_users<'a>(&'a self) -> Vec<(&'a UserId, &'a UserData)> {
+        self.users
+            .iter()
+            .filter(|(user_id, _)| !self.is_spam.contains_key(&user_id))
+            .collect()
+    }
+}
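Since score and tokens are caches that Db only recomputes on explicit request, a caller that mutates the persisted data is responsible for refreshing them. A hypothetical helper, not in the commit, showing the intended flow after labeling a user:

    use crate::classifier::Classifier;
    use crate::data::UserId;
    use crate::db::Db;

    // hypothetical: record a spam/ham label, then refresh the derived scores
    // (tokens depend only on UserData, so they stay valid here; scores depend
    // on the classifier, which the caller may have retrained on new labels)
    fn label_user(db: &mut Db, classifier: &Classifier, user: UserId, spam: bool) {
        db.is_spam.insert(user, spam);
        db.recompute_scores(classifier);
    }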
352  src/main.rs
@@ -2,357 +2,21 @@ use actix_web::{get, post, web, App, HttpResponse, HttpServer, Responder};
 use forgejo_api::{Auth, Forgejo};
 use lazy_static::lazy_static;
 use rand::prelude::*;
-use serde::{Deserialize, Serialize};
+use serde::Deserialize;
 use std::collections::HashMap;
 use std::fs::File;
 use std::io::{BufReader, BufWriter};
 use std::path::Path;
 use std::sync::Mutex;
 use tera::Tera;
 use tokio::time::{sleep, Duration};
 
 mod classifier;
+mod data;
+mod scrape;
+mod db;
 
 use classifier::Classifier;
-
-#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
-struct RepoId(i64);
-
-#[derive(Debug, Serialize, Deserialize)]
-struct RepoData {
-    name: String,
-    description: Option<String>,
-}
-
-#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
-struct IssueId(i64);
-
-#[derive(Debug, Serialize, Deserialize)]
-struct IssueData {
-    title: String,
-    body: String,
-}
-
-#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
-struct UserId(i64);
-
-#[derive(Debug, Serialize, Deserialize)]
-struct UserData {
-    login: String,
-    email: String,
-    full_name: Option<String>,
-    location: Option<String>,
-    website: Option<String>,
-    description: Option<String>,
-    // TODO: visibility
-    repos: Vec<(RepoId, RepoData)>,
-    issues: Vec<(IssueId, IssueData)>,
-}
-
-struct Db {
-    users: HashMap<UserId, UserData>,
-    is_spam: HashMap<UserId, bool>,
-    // caches: derived from the rest
-    score: HashMap<UserId, f32>,
-    tokens: HashMap<UserId, Vec<String>>,
-}
-
-impl UserData {
-    fn is_empty(&self) -> bool {
-        self.full_name.is_none()
-            && self.location.is_none()
-            && self.website.is_none()
-            && self.description.is_none()
-            && self.repos.is_empty()
-            && self.issues.is_empty()
-    }
-
-    fn to_tokens(&self) -> Vec<String> {
-        let mut text = String::new();
-        let mut add = |s: &str| {
-            text += s;
-            text += " "
-        };
-
-        for email_part in self.email.split('@') {
-            add(email_part)
-        }
-
-        match &self.location {
-            Some(s) => add(&s),
-            None => add("__NO_LOCATION__"),
-        }
-
-        match &self.website {
-            Some(s) => add(&s),
-            None => add("__NO_WEBSITE__"),
-        }
-
-        match &self.description {
-            Some(s) => add(&s),
-            None => add("__NO_USER_DESCRIPTION__"),
-        }
-
-        for (_id, repo) in &self.repos {
-            add(&repo.name);
-            match &repo.description {
-                Some(s) => add(s),
-                None => add("__NO_REPO_DESCRIPTION__"),
-            }
-        }
-
-        for (_id, issue) in &self.issues {
-            add(&issue.title);
-            add(&issue.body);
-        }
-
-        Classifier::into_word_list(&text)
-    }
-}
-
-impl Db {
-    fn recompute_tokens(&mut self) {
-        for (id, user) in &self.users {
-            self.tokens.insert(*id, user.to_tokens());
-        }
-    }
-
-    fn recompute_scores(&mut self, classifier: &Classifier) {
-        for (id, tokens) in &self.tokens {
-            self.score.insert(*id, classifier.score(tokens));
-        }
-    }
-
-    fn from_path(path: &Path, classifier: &Classifier) -> anyhow::Result<Self> {
-        let file = File::open(path)?;
-        let (users, is_spam) = serde_json::from_reader(BufReader::new(file))?;
-        let mut db = Db {
-            users,
-            is_spam,
-            tokens: HashMap::new(),
-            score: HashMap::new(),
-        };
-        db.recompute_tokens();
-        db.recompute_scores(classifier);
-        Ok(db)
-    }
-
-    fn from_users(
-        users: HashMap<UserId, UserData>,
-        is_spam: HashMap<UserId, bool>,
-        classifier: &Classifier,
-    ) -> Db {
-        let mut db = Db {
-            users,
-            is_spam,
-            tokens: HashMap::new(),
-            score: HashMap::new(),
-        };
-        db.recompute_tokens();
-        db.recompute_scores(classifier);
-        db
-    }
-
-    fn store_to_path(&self, path: &Path) -> anyhow::Result<()> {
-        let file = File::create(path)?;
-        let dat: (&HashMap<UserId, UserData>, &HashMap<UserId, bool>) =
-            (&self.users, &self.is_spam);
-        serde_json::to_writer(BufWriter::new(file), &dat)?;
-        Ok(())
-    }
-
-    fn unclassified_users<'a>(&'a self) -> Vec<(&'a UserId, &'a UserData)> {
-        self.users
-            .iter()
-            .filter(|(user_id, _)| !self.is_spam.contains_key(&user_id))
-            .collect()
-    }
-}
-
-async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Repository>> {
-    let mut repos = Vec::new();
-    let mut query = forgejo_api::structs::RepoSearchQuery::default();
-    query.limit = Some(50);
-    let mut page: u32 = 1;
-    loop {
-        query.page = Some(page);
-        let resp = forge.repo_search(query.clone()).await?;
-        match (resp.ok, resp.data) {
-            (Some(true), Some(mut query_repos)) => {
-                if query_repos.is_empty() {
-                    break;
-                }
-                repos.append(&mut query_repos);
-            }
-            _ => todo!("scrape_repos: implement retries"),
-        }
-        page += 1;
-        sleep(Duration::from_millis(100)).await;
-    }
-    Ok(repos)
-}
-
-async fn scrape_issues(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Issue>> {
-    let mut issues = Vec::new();
-    let mut query = forgejo_api::structs::IssueSearchIssuesQuery::default();
-    query.limit = Some(50);
-    let mut page: u32 = 1;
-    loop {
-        query.page = Some(page);
-        let mut resp = forge.issue_search_issues(query.clone()).await?;
-        if resp.is_empty() {
-            break;
-        }
-        issues.append(&mut resp);
-        page += 1;
-        sleep(Duration::from_millis(100)).await;
-    }
-    Ok(issues)
-}
-
-async fn scrape_users(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::User>> {
-    let mut users = Vec::new();
-    let mut query = forgejo_api::structs::UserSearchQuery::default();
-    query.limit = Some(50);
-    let mut page: u32 = 1;
-    loop {
-        query.page = Some(page);
-        let resp = forge.user_search(query.clone()).await?;
-        match (resp.ok, resp.data) {
-            (Some(true), Some(mut query_repos)) => {
-                if query_repos.is_empty() {
-                    break;
-                }
-                users.append(&mut query_repos);
-            }
-            _ => todo!("scrape_users: implement retries"),
-        }
-        page += 1;
-        sleep(Duration::from_millis(100)).await;
-    }
-    Ok(users)
-}
-
-async fn get_users_data(forge: &Forgejo) -> anyhow::Result<HashMap<UserId, UserData>> {
-    let mut data = HashMap::new();
-
-    let discard_empty = |o: Option<String>| {
-        match o {
-            None => None,
-            Some(s) if s.is_empty() => None,
-            Some(s) => Some(s),
-        }
-    };
-
-    eprintln!("Fetching users...");
-    for user in scrape_users(&forge).await? {
-        let Some(id) = user.id else {
-            eprintln!("WARN: user with no id");
-            continue;
-        };
-        let Some(login) = user.login else {
-            eprintln!("WARN: missing login for user {id}");
-            continue;
-        };
-
-        // TODO: fetch those from the admin API instead
-        let Some(email) = user.email else {
-            eprintln!("WARN: missing email for user {id}");
-            continue;
-        };
-
-        data.insert(
-            UserId(id),
-            UserData {
-                login,
-                email,
-                full_name: discard_empty(user.full_name),
-                location: discard_empty(user.location),
-                website: discard_empty(user.website),
-                description: discard_empty(user.description),
-                repos: Vec::new(),
-                issues: Vec::new(),
-            },
-        );
-    }
-
-    eprintln!("Fetching repos...");
-    for repo in scrape_repos(&forge).await? {
-        let Some(id) = repo.id else {
-            eprintln!("WARN: repo with no id");
-            continue;
-        };
-        let Some(owner) = repo.owner else {
-            eprintln!("WARN: repo {} with no owner", id);
-            continue;
-        };
-        let Some(owner_id) = owner.id else {
-            eprintln!("WARN: owner for repo {} has no id", id);
-            continue;
-        };
-        let Some(repo_name) = repo.name else {
-            eprintln!("WARN: repo {} has no name", id);
-            continue;
-        };
-        let Some(forge_owner) = data.get_mut(&UserId(owner_id)) else {
-            // this currently happens for repos owned by organizations
-            eprintln!(
-                "WARN: repo owner {} for repo {} is not in database",
-                owner.login.unwrap_or_default(),
-                repo_name
-            );
-            continue;
-        };
-        forge_owner.repos.push((
-            RepoId(id),
-            RepoData {
-                name: repo_name,
-                description: discard_empty(repo.description),
-            },
-        ));
-    }
-
-    eprintln!("Fetching issues...");
-    for issue in scrape_issues(&forge).await? {
-        let Some(id) = issue.id else {
-            eprintln!("WARN: issue with no id");
-            continue;
-        };
-        let Some(user) = issue.user else {
-            eprintln!("WARN: issue {} has no owner", id);
-            continue;
-        };
-        let Some(user_id) = user.id else {
-            eprintln!("WARN: user for issue {} has no id", id);
-            continue;
-        };
-        let Some(forge_user) = data.get_mut(&UserId(user_id)) else {
-            eprintln!(
-                "WARN: issue user {} {} for issue {} is not in database",
-                user.login.unwrap_or_default(),
-                user_id,
-                issue
-                    .html_url
-                    .map_or(String::from(""), |url| url.as_str().to_string())
-            );
-            continue;
-        };
-        forge_user.issues.push((
-            IssueId(id),
-            IssueData {
-                title: issue.title.unwrap_or_default(),
-                body: issue.body.unwrap_or_default(),
-            },
-        ));
-    }
-
-    // discard users with an entirely empty profile: there is nothing useful we
-    // can say about them
-    let data = data
-        .into_iter()
-        .filter(|(_, user)| !user.is_empty())
-        .collect();
-    Ok(data)
-}
+use data::*;
+use db::Db;
 
 async fn load_db() -> anyhow::Result<(Db, Classifier)> {
     let model_path = Path::new("model.json");
@@ -374,7 +38,7 @@ async fn load_db() -> anyhow::Result<(Db, Classifier)> {
     let db: Db = if db_path.is_file() {
         Db::from_path(db_path, &classifier)?
     } else {
-        let db = Db::from_users(get_users_data(&forge).await?, HashMap::new(), &classifier);
+        let db = Db::from_users(scrape::get_users_data(&forge).await?, HashMap::new(), &classifier);
         db.store_to_path(db_path)?;
         db
     };
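The only call-site change in main.rs is in load_db: the scraping entry point is now reached through the scrape module. A sketch of the load-or-scrape pattern the hunk above preserves, assuming the caller has already constructed the Forgejo client and the Classifier (the helper name is illustrative, not part of the commit):

    use std::collections::HashMap;
    use std::path::Path;

    use forgejo_api::Forgejo;

    use crate::classifier::Classifier;
    use crate::db::Db;
    use crate::scrape;

    async fn load_or_scrape(
        db_path: &Path,
        forge: &Forgejo,
        classifier: &Classifier,
    ) -> anyhow::Result<Db> {
        if db_path.is_file() {
            // reuse persisted users and labels; token/score caches are rebuilt on load
            Db::from_path(db_path, classifier)
        } else {
            // first run: scrape the forge, start with no labels, persist for next time
            let users = scrape::get_users_data(forge).await?;
            let db = Db::from_users(users, HashMap::new(), classifier);
            db.store_to_path(db_path)?;
            Ok(db)
        }
    }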
190  src/scrape.rs  Normal file
@@ -0,0 +1,190 @@
+use forgejo_api::Forgejo;
+use tokio::time::{sleep, Duration};
+use std::collections::HashMap;
+
+use crate::data::*;
+
+async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Repository>> {
+    let mut repos = Vec::new();
+    let mut query = forgejo_api::structs::RepoSearchQuery::default();
+    query.limit = Some(50);
+    let mut page: u32 = 1;
+    loop {
+        query.page = Some(page);
+        let resp = forge.repo_search(query.clone()).await?;
+        match (resp.ok, resp.data) {
+            (Some(true), Some(mut query_repos)) => {
+                if query_repos.is_empty() {
+                    break;
+                }
+                repos.append(&mut query_repos);
+            }
+            _ => todo!("scrape_repos: implement retries"),
+        }
+        page += 1;
+        sleep(Duration::from_millis(100)).await;
+    }
+    Ok(repos)
+}
+
+async fn scrape_issues(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Issue>> {
+    let mut issues = Vec::new();
+    let mut query = forgejo_api::structs::IssueSearchIssuesQuery::default();
+    query.limit = Some(50);
+    let mut page: u32 = 1;
+    loop {
+        query.page = Some(page);
+        let mut resp = forge.issue_search_issues(query.clone()).await?;
+        if resp.is_empty() {
+            break;
+        }
+        issues.append(&mut resp);
+        page += 1;
+        sleep(Duration::from_millis(100)).await;
+    }
+    Ok(issues)
+}
+
+async fn scrape_users(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::User>> {
+    let mut users = Vec::new();
+    let mut query = forgejo_api::structs::UserSearchQuery::default();
+    query.limit = Some(50);
+    let mut page: u32 = 1;
+    loop {
+        query.page = Some(page);
+        let resp = forge.user_search(query.clone()).await?;
+        match (resp.ok, resp.data) {
+            (Some(true), Some(mut query_repos)) => {
+                if query_repos.is_empty() {
+                    break;
+                }
+                users.append(&mut query_repos);
+            }
+            _ => todo!("scrape_users: implement retries"),
+        }
+        page += 1;
+        sleep(Duration::from_millis(100)).await;
+    }
+    Ok(users)
+}
+
+pub async fn get_users_data(forge: &Forgejo) -> anyhow::Result<HashMap<UserId, UserData>> {
+    let mut data = HashMap::new();
+
+    let discard_empty = |o: Option<String>| {
+        match o {
+            None => None,
+            Some(s) if s.is_empty() => None,
+            Some(s) => Some(s),
+        }
+    };
+
+    eprintln!("Fetching users...");
+    for user in scrape_users(&forge).await? {
+        let Some(id) = user.id else {
+            eprintln!("WARN: user with no id");
+            continue;
+        };
+        let Some(login) = user.login else {
+            eprintln!("WARN: missing login for user {id}");
+            continue;
+        };
+
+        let Some(email) = user.email else {
+            eprintln!("WARN: missing email for user {id}");
+            continue;
+        };
+
+        data.insert(
+            UserId(id),
+            UserData {
+                login,
+                email,
+                full_name: discard_empty(user.full_name),
+                location: discard_empty(user.location),
+                website: discard_empty(user.website),
+                description: discard_empty(user.description),
+                repos: Vec::new(),
+                issues: Vec::new(),
+            },
+        );
+    }
+
+    eprintln!("Fetching repos...");
+    for repo in scrape_repos(&forge).await? {
+        let Some(id) = repo.id else {
+            eprintln!("WARN: repo with no id");
+            continue;
+        };
+        let Some(owner) = repo.owner else {
+            eprintln!("WARN: repo {} with no owner", id);
+            continue;
+        };
+        let Some(owner_id) = owner.id else {
+            eprintln!("WARN: owner for repo {} has no id", id);
+            continue;
+        };
+        let Some(repo_name) = repo.name else {
+            eprintln!("WARN: repo {} has no name", id);
+            continue;
+        };
+        let Some(forge_owner) = data.get_mut(&UserId(owner_id)) else {
+            // this currently happens for repos owned by organizations
+            eprintln!(
+                "WARN: repo owner {} for repo {} is not in database",
+                owner.login.unwrap_or_default(),
+                repo_name
+            );
+            continue;
+        };
+        forge_owner.repos.push((
+            RepoId(id),
+            RepoData {
+                name: repo_name,
+                description: discard_empty(repo.description),
+            },
+        ));
+    }
+
+    eprintln!("Fetching issues...");
+    for issue in scrape_issues(&forge).await? {
+        let Some(id) = issue.id else {
+            eprintln!("WARN: issue with no id");
+            continue;
+        };
+        let Some(user) = issue.user else {
+            eprintln!("WARN: issue {} has no owner", id);
+            continue;
+        };
+        let Some(user_id) = user.id else {
+            eprintln!("WARN: user for issue {} has no id", id);
+            continue;
+        };
+        let Some(forge_user) = data.get_mut(&UserId(user_id)) else {
+            eprintln!(
+                "WARN: issue user {} {} for issue {} is not in database",
+                user.login.unwrap_or_default(),
+                user_id,
+                issue
+                    .html_url
+                    .map_or(String::from(""), |url| url.as_str().to_string())
+            );
+            continue;
+        };
+        forge_user.issues.push((
+            IssueId(id),
+            IssueData {
+                title: issue.title.unwrap_or_default(),
+                body: issue.body.unwrap_or_default(),
+            },
+        ));
+    }
+
+    // discard users with an entirely empty profile: there is nothing useful we
+    // can say about them
+    let data = data
+        .into_iter()
+        .filter(|(_, user)| !user.is_empty())
+        .collect();
+    Ok(data)
+}
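All three scrape_* functions share the same pagination shape: request pages 1, 2, ... with limit = 50, stop at the first empty batch, and sleep 100ms between requests to stay polite to the API. A hypothetical generalization of that shared loop, not part of the commit (adopting it would mean building the per-page query inside the closure):

    use tokio::time::{sleep, Duration};

    // fetch_page(n) returns the n-th page of results; paginate() collects
    // pages until one comes back empty, pausing 100ms between requests
    async fn paginate<T, F, Fut>(mut fetch_page: F) -> anyhow::Result<Vec<T>>
    where
        F: FnMut(u32) -> Fut,
        Fut: std::future::Future<Output = anyhow::Result<Vec<T>>>,
    {
        let mut all = Vec::new();
        let mut page: u32 = 1;
        loop {
            let mut batch = fetch_page(page).await?;
            if batch.is_empty() {
                break;
            }
            all.append(&mut batch);
            page += 1;
            sleep(Duration::from_millis(100)).await;
        }
        Ok(all)
    }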