ID-list request & JSON parsing

This commit is contained in:
einisto 2022-06-02 01:19:04 +03:00
parent a34f3d310f
commit 576e426b03
4 changed files with 1017 additions and 16 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
/target /target
testdata.json

942
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -8,3 +8,6 @@ edition = "2021"
[dependencies] [dependencies]
clap = "3.1.18" clap = "3.1.18"
regex = "1.5.6" regex = "1.5.6"
reqwest = { version = "0.11", features = ["json"] }
tokio = { version = "1.18.2", features = ["full"] }
serde_json = "1.0"

View File

@ -1,21 +1,25 @@
use clap::{Arg, ArgGroup, Command}; use clap::{Arg, ArgGroup, Command};
use regex::Regex; use regex::Regex;
use serde_json::Value;
use std::{path::PathBuf, process::exit}; use std::{path::PathBuf, process::exit};
// General error type to make error handling easier
type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
/// Kind of target being processed.
///
/// `main` matches on this value to decide whether the JSON URL is built with
/// `create_thread_url` (`Thread`) or `create_board_url` (`Board`).
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
enum Mode { enum Mode {
Thread, Thread,
Board, Board,
} }
fn parse_cli_args() -> (PathBuf, String, Mode) { fn parse_cli_args() -> Result<(PathBuf, String, Mode)> {
let matches = Command::new("My Super Program") let matches = Command::new("WGDL-imagescraper written in Rust")
.version("1.0") .version("0.1.0")
.author("Kevin K. <kbknapp@gmail.com>") .author("Arttu Einistö <einisto@proton.me>")
.about("Does awesome things") .about("Scrapes images efficiently from 4chan.org")
.arg( .arg(
Arg::new("path") Arg::new("output")
.short('p') .short('o')
.long("path") .long("path")
.value_name("PATH") .value_name("PATH")
.help("Set an output directory") .help("Set an output directory")
@ -46,11 +50,10 @@ fn parse_cli_args() -> (PathBuf, String, Mode) {
.get_matches(); .get_matches();
let re = Regex::new( let re = Regex::new(
r"(http|https)://boards.(4chan|4channel).org/[a-zA-Z]{1,4}/(catalog|thread/\d+)", r"^((http|https)://)?boards.(4chan|4channel).org/[a-zA-Z]{1,4}/(catalog|thread/\d+)$",
) )?;
.unwrap();
let path = PathBuf::from(matches.value_of("path").unwrap()); let path = PathBuf::from(matches.value_of("output").unwrap());
let target_match = matches.value_of("target").unwrap(); let target_match = matches.value_of("target").unwrap();
let target = match re.is_match(target_match) { let target = match re.is_match(target_match) {
true => target_match, true => target_match,
@ -64,15 +67,67 @@ fn parse_cli_args() -> (PathBuf, String, Mode) {
false => Mode::Board, false => Mode::Board,
}; };
(path, String::from(target), mode) Ok((path, String::from(target), mode))
} }
/// Builds the JSON endpoint URL for a thread page URL.
///
/// `target` is split on `/`; the last segment is used as the thread id and the
/// third-from-last segment as the board name. The result has the form
/// `https://a.4cdn.org/<board>/thread/<thread_id>.json`.
///
/// # Panics
/// Panics if `target` has fewer than three `/`-separated segments, because the
/// `len() - 3` lookup has no element to return.
// NOTE(review): the text that precedes the new code on the next three lines
// appears to be the removed (pre-commit) side of the diff rendering.
fn main() { fn create_thread_url(target: String) -> String {
// TODO: add var for default output path (similar to wgdl.py) let url_vec: Vec<&str> = target.split("/").collect();
let (path, target, mode) = parse_cli_args(); let thread_id = url_vec.get(url_vec.len() - 1).unwrap();
let board = url_vec.get(url_vec.len() - 3).unwrap();
format!("https://a.4cdn.org/{}/thread/{}.json", board, thread_id)
}
/// Builds the JSON catalog endpoint URL for a board catalog page URL.
///
/// `target` is expected to end in `<board>/catalog` (with or without a
/// leading scheme and host); the board name is therefore the second-to-last
/// `/`-separated segment. The result has the form
/// `https://a.4cdn.org/<board>/catalog.json`.
///
/// # Panics
/// Panics if `target` contains no `/`, i.e. there is no segment before the
/// final one to use as the board name.
fn create_board_url(target: String) -> String {
    // Walk the segments from the right: index 0 is "catalog", index 1 is the
    // board. The previous code indexed at `len()`, which is always one past
    // the end and made every call panic.
    let board = target
        .rsplit('/')
        .nth(1)
        .expect("board URL must have the form .../<board>/catalog");
    format!("https://a.4cdn.org/{}/catalog.json", board)
}
/// Downloads a JSON document and collects the image identifiers it lists.
///
/// Fetches `json_url`, parses the response body as JSON and walks the
/// top-level `posts` array. Every post whose `tim` field is an integer
/// contributes one `(tim, ext)` pair: `tim` rendered as a decimal string and
/// `ext` copied verbatim. Posts without an integer `tim` are skipped.
///
/// # Errors
/// Returns an error when
/// - the HTTP request or reading the body fails,
/// - the body is not valid JSON,
/// - the document has no top-level `posts` array, or
/// - a post has an integer `tim` but no string `ext`.
///
/// The last two cases used to panic via `unwrap()`; they are now reported
/// through the `Result` like every other failure of this function.
async fn get_imagelist(json_url: &str) -> Result<Vec<(String, String)>> {
    let req_body = reqwest::get(json_url).await?.text().await?;
    let json_data: Value = serde_json::from_str(&req_body)?;

    // Indexing a `Value` with a missing key yields `Null`, so a document of a
    // different shape shows up here as `as_array()` returning `None`.
    let posts = json_data["posts"]
        .as_array()
        .ok_or("JSON response has no top-level `posts` array")?;

    let mut thread_img_data: Vec<(String, String)> = Vec::new();
    for post in posts {
        if let Some(tim) = post["tim"].as_i64() {
            let ext = post["ext"]
                .as_str()
                .ok_or("post has a `tim` field but no string `ext` field")?;
            thread_img_data.push((tim.to_string(), ext.to_string()));
        }
    }

    Ok(thread_img_data)
}
/// Program entry point.
///
/// Parses the command-line arguments, prints the resulting configuration,
/// builds the JSON URL that matches the selected mode, fetches the image list
/// from it and prints that list. Errors from argument parsing or from the
/// fetch are propagated out of `main` with `?`.
#[tokio::main]
async fn main() -> Result<()> {
// TODO: add possible config-file for default output path (similar to wgdl.py)
let (path, target, mode) = parse_cli_args()?;
println!( println!(
"CONFIG:\n\tPATH: {:?}\n\tTARGET: {}\n\tMODE: {:?}", "\nCONFIG:\n\tPATH: {:?}\n\tTARGET: {}\n\tMODE: {:?}\n",
path, target, mode path, target, mode
); );
// Both arms run the same steps and differ only in which URL builder is used;
// the actual image download is still a placeholder comment in each arm.
match mode {
Mode::Thread => {
let json_url = create_thread_url(target);
// `&json_url.as_str()` is a `&&str`; it is auto-dereferenced to the `&str`
// parameter of `get_imagelist`.
let id_list = get_imagelist(&json_url.as_str()).await?;
println!("{:#?}", id_list);
// 3.) download img based on json
}
Mode::Board => {
let json_url = create_board_url(target);
let id_list = get_imagelist(&json_url.as_str()).await?;
println!("{:#?}", id_list);
// 3.) download img based on json
}
};
Ok(())
} }