also fetch issues created by users

This commit is contained in:
Armaël Guéneau 2024-11-19 09:51:52 +01:00
parent 0f8368031f
commit 4aa2aeb1fc
4 changed files with 87 additions and 11 deletions

3
.gitignore vendored
View file

@ -1 +1,4 @@
/target /target
classification.json
model.json
api_token

View file

@ -1 +0,0 @@
{"users":{"5847":"Spam","5637":"Spam","4640":"Spam","3590":"Spam","137":"Legit","2176":"Spam","3489":"Spam","4357":"Spam","1985":"Legit","1905":"Spam","4683":"Spam","5006":"Spam","4248":"Spam","4780":"Spam","1790":"Spam","5778":"Spam","2101":"Spam","768":"Legit","2117":"Spam","5516":"Spam","1552":"Legit","946":"Legit","5968":"Spam","3077":"Spam","1376":"Legit","5571":"Spam","4832":"Spam","5513":"Spam","5620":"Spam","3879":"Spam","5366":"Spam","3299":"Spam","12":"Legit","4940":"Spam","5611":"Spam","5524":"Spam","3760":"Spam","4759":"Spam","5184":"Spam","400":"Legit","5695":"Spam","4629":"Spam","5235":"Spam"}}

File diff suppressed because one or more lines are too long

View file

@ -1,4 +1,3 @@
use forgejo_api::structs::{RepoSearchQuery, Repository, User, UserSearchQuery};
use forgejo_api::{Auth, Forgejo}; use forgejo_api::{Auth, Forgejo};
use std::collections::HashMap; use std::collections::HashMap;
use tokio::time::{sleep, Duration}; use tokio::time::{sleep, Duration};
@ -17,6 +16,16 @@ struct RepoData {
description: Option<String>, description: Option<String>,
} }
/// Newtype around the numeric id of an issue as reported by the forge API.
///
/// Paired with an `IssueData` in each user's `issues` list so an issue can be
/// told apart from its textual content.
#[derive(Debug, Hash, PartialEq, Eq)]
#[derive(Serialize, Deserialize)]
struct IssueId(i64);
/// Textual content of one issue, kept alongside the user who created it.
///
/// Both fields are plain `String`s rather than `Option`s: when the API omits
/// the title or the body, the caller stores an empty string instead.
#[derive(Debug)]
struct IssueData {
// Issue title; fed into the per-user text together with `body`.
title: String,
// Issue description text.
body: String,
}
#[derive(Debug, Hash, PartialEq, Eq)] #[derive(Debug, Hash, PartialEq, Eq)]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
struct UserId(i64); struct UserId(i64);
@ -28,7 +37,9 @@ struct UserData {
location: Option<String>, location: Option<String>,
website: Option<String>, website: Option<String>,
description: Option<String>, description: Option<String>,
// TODO: visibility
repos: Vec<(RepoId, RepoData)>, repos: Vec<(RepoId, RepoData)>,
issues: Vec<(IssueId, IssueData)>,
} }
#[derive(Debug, Serialize, Deserialize)] #[derive(Debug, Serialize, Deserialize)]
@ -78,6 +89,11 @@ impl UserData {
} }
} }
for (_id, issue) in &self.issues {
add(&issue.title);
add(&issue.body);
}
text text
} }
} }
@ -90,9 +106,9 @@ impl Db {
} }
} }
async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<Repository>> { async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Repository>> {
let mut repos = Vec::new(); let mut repos = Vec::new();
let mut query = RepoSearchQuery::default(); let mut query = forgejo_api::structs::RepoSearchQuery::default();
query.limit = Some(50); query.limit = Some(50);
let mut page: u32 = 1; let mut page: u32 = 1;
loop { loop {
@ -113,9 +129,27 @@ async fn scrape_repos(forge: &Forgejo) -> anyhow::Result<Vec<Repository>> {
Ok(repos) Ok(repos)
} }
async fn scrape_users(forge: &Forgejo) -> anyhow::Result<Vec<User>> { async fn scrape_issues(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::Issue>> {
// scrape_issues: collects every issue returned by the forge's issue search,
// walking the result pages one at a time and returning them as a single Vec.
// Any request error aborts the whole scrape and is propagated to the caller.
let mut issues = Vec::new();
let mut query = forgejo_api::structs::IssueSearchIssuesQuery::default();
// Ask for 50 results per request via the query's `limit` field.
query.limit = Some(50);
// Page numbering starts at 1.
let mut page: u32 = 1;
loop {
query.page = Some(page);
let mut resp = forge.issue_search_issues(query.clone()).await?;
// An empty page is treated as the end of the result set.
if resp.is_empty() {
break;
}
// `append` moves the page's items into `issues`, leaving `resp` empty.
issues.append(&mut resp);
page += 1;
// Pause 100 ms between requests — presumably to avoid hammering the server.
sleep(Duration::from_millis(100)).await;
}
Ok(issues)
}
async fn scrape_users(forge: &Forgejo) -> anyhow::Result<Vec<forgejo_api::structs::User>> {
let mut users = Vec::new(); let mut users = Vec::new();
let mut query = UserSearchQuery::default(); let mut query = forgejo_api::structs::UserSearchQuery::default();
query.limit = Some(50); query.limit = Some(50);
let mut page: u32 = 1; let mut page: u32 = 1;
loop { loop {
@ -136,8 +170,10 @@ async fn scrape_users(forge: &Forgejo) -> anyhow::Result<Vec<User>> {
Ok(users) Ok(users)
} }
async fn get_users_repos(forge: &Forgejo) -> anyhow::Result<HashMap<UserId, UserData>> { async fn get_users_data(forge: &Forgejo) -> anyhow::Result<HashMap<UserId, UserData>> {
let mut data = HashMap::new(); let mut data = HashMap::new();
eprintln!("Fetching users...");
for user in scrape_users(&forge).await? { for user in scrape_users(&forge).await? {
let Some(id) = user.id else { let Some(id) = user.id else {
eprintln!("WARN: user with no id"); eprintln!("WARN: user with no id");
@ -153,6 +189,7 @@ async fn get_users_repos(forge: &Forgejo) -> anyhow::Result<HashMap<UserId, User
eprintln!("WARN: missing email for user {id}"); eprintln!("WARN: missing email for user {id}");
continue; continue;
}; };
data.insert( data.insert(
UserId(id), UserId(id),
UserData { UserData {
@ -162,10 +199,12 @@ async fn get_users_repos(forge: &Forgejo) -> anyhow::Result<HashMap<UserId, User
website: user.website, website: user.website,
description: user.description, description: user.description,
repos: Vec::new(), repos: Vec::new(),
issues: Vec::new(),
}, },
); );
} }
eprintln!("Fetching repos...");
for repo in scrape_repos(&forge).await? { for repo in scrape_repos(&forge).await? {
let Some(id) = repo.id else { let Some(id) = repo.id else {
eprintln!("WARN: repo with no id"); eprintln!("WARN: repo with no id");
@ -187,7 +226,7 @@ async fn get_users_repos(forge: &Forgejo) -> anyhow::Result<HashMap<UserId, User
// this currently happens for repos owned by organizations // this currently happens for repos owned by organizations
eprintln!( eprintln!(
"WARN: repo owner {} for repo {} is not in database", "WARN: repo owner {} for repo {} is not in database",
owner.login.unwrap(), owner.login.unwrap_or_default(),
repo_name repo_name
); );
continue; continue;
@ -200,6 +239,36 @@ async fn get_users_repos(forge: &Forgejo) -> anyhow::Result<HashMap<UserId, User
})); }));
} }
eprintln!("Fetching issues...");
for issue in scrape_issues(&forge).await? {
let Some(id) = issue.id else {
eprintln!("WARN: issue with no id");
continue;
};
let Some(user) = issue.user else {
eprintln!("WARN: issue {} has no owner", id);
continue;
};
let Some(user_id) = user.id else {
eprintln!("WARN: user for issue {} has no id", id);
continue;
};
let Some(forge_user) = data.get_mut(&UserId(user_id)) else {
eprintln!("WARN: issue user {} {} for issue {} is not in database",
user.login.unwrap_or_default(),
user_id,
issue.html_url.map_or(String::from(""), |url| url.as_str().to_string())
);
continue;
};
forge_user.issues.push((
IssueId(id),
IssueData {
title: issue.title.unwrap_or_default(),
body: issue.body.unwrap_or_default(),
}));
}
Ok(data) Ok(data)
} }
@ -221,8 +290,14 @@ async fn main() -> anyhow::Result<()> {
Db::new() Db::new()
}; };
let forge = Forgejo::new(Auth::None, url::Url::parse("https://git.deuxfleurs.fr")?)?; let api_token =
let data = get_users_repos(&forge).await?; std::fs::read_to_string(Path::new("api_token"))?
.trim().to_string();
let forge = Forgejo::new(
Auth::Token(&api_token),
url::Url::parse("https://git.deuxfleurs.fr")?
)?;
let data = get_users_data(&forge).await?;
println!("got {} users", data.len()); println!("got {} users", data.len());
for (user_id, user) in data { for (user_id, user) in data {