ID-list request & JSON parsing
This commit is contained in:
parent
a34f3d310f
commit
576e426b03
1
.gitignore
vendored
1
.gitignore
vendored
@ -1 +1,2 @@
|
|||||||
/target
|
/target
|
||||||
|
testdata.json
|
||||||
|
942
Cargo.lock
generated
942
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -8,3 +8,6 @@ edition = "2021"
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
clap = "3.1.18"
|
clap = "3.1.18"
|
||||||
regex = "1.5.6"
|
regex = "1.5.6"
|
||||||
|
reqwest = { version = "0.11", features = ["json"] }
|
||||||
|
tokio = { version = "1.18.2", features = ["full"] }
|
||||||
|
serde_json = "1.0"
|
||||||
|
87
src/main.rs
87
src/main.rs
@ -1,21 +1,25 @@
|
|||||||
use clap::{Arg, ArgGroup, Command};
|
use clap::{Arg, ArgGroup, Command};
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
|
use serde_json::Value;
|
||||||
use std::{path::PathBuf, process::exit};
|
use std::{path::PathBuf, process::exit};
|
||||||
|
|
||||||
|
// General error type to make error handling easier
|
||||||
|
type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
|
||||||
|
|
||||||
/// What kind of target is being scraped.
/// `main` matches on this to choose between the thread JSON URL (`Thread`)
/// and the board catalog JSON URL (`Board`).
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
enum Mode {
|
enum Mode {
|
||||||
Thread,
|
Thread,
|
||||||
Board,
|
Board,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_cli_args() -> (PathBuf, String, Mode) {
|
fn parse_cli_args() -> Result<(PathBuf, String, Mode)> {
|
||||||
let matches = Command::new("My Super Program")
|
let matches = Command::new("WGDL-imagescraper written in Rust")
|
||||||
.version("1.0")
|
.version("0.1.0")
|
||||||
.author("Kevin K. <kbknapp@gmail.com>")
|
.author("Arttu Einistö <einisto@proton.me>")
|
||||||
.about("Does awesome things")
|
.about("Scrapes images efficiently from 4chan.org")
|
||||||
.arg(
|
.arg(
|
||||||
Arg::new("path")
|
Arg::new("output")
|
||||||
.short('p')
|
.short('o')
|
||||||
.long("path")
|
.long("path")
|
||||||
.value_name("PATH")
|
.value_name("PATH")
|
||||||
.help("Set an output directory")
|
.help("Set an output directory")
|
||||||
@ -46,11 +50,10 @@ fn parse_cli_args() -> (PathBuf, String, Mode) {
|
|||||||
.get_matches();
|
.get_matches();
|
||||||
|
|
||||||
let re = Regex::new(
|
let re = Regex::new(
|
||||||
r"(http|https)://boards.(4chan|4channel).org/[a-zA-Z]{1,4}/(catalog|thread/\d+)",
|
r"^((http|https)://)?boards.(4chan|4channel).org/[a-zA-Z]{1,4}/(catalog|thread/\d+)$",
|
||||||
)
|
)?;
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let path = PathBuf::from(matches.value_of("path").unwrap());
|
let path = PathBuf::from(matches.value_of("output").unwrap());
|
||||||
let target_match = matches.value_of("target").unwrap();
|
let target_match = matches.value_of("target").unwrap();
|
||||||
let target = match re.is_match(target_match) {
|
let target = match re.is_match(target_match) {
|
||||||
true => target_match,
|
true => target_match,
|
||||||
@ -64,15 +67,67 @@ fn parse_cli_args() -> (PathBuf, String, Mode) {
|
|||||||
false => Mode::Board,
|
false => Mode::Board,
|
||||||
};
|
};
|
||||||
|
|
||||||
(path, String::from(target), mode)
|
Ok((path, String::from(target), mode))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() {
|
/// Builds the JSON URL `https://a.4cdn.org/<board>/thread/<id>.json` for a thread page URL.
///
/// `target` is split on `/`; the last segment is used as the thread id and the
/// third-from-last segment as the board name.
///
/// Panics if `target` has fewer than three `/`-separated segments
/// (the `len() - 3` index underflows or the `unwrap` hits `None`).
fn create_thread_url(target: String) -> String {
|
||||||
// TODO: add var for default output path (similar to wgdl.py)
|
let url_vec: Vec<&str> = target.split("/").collect();
|
||||||
let (path, target, mode) = parse_cli_args();
|
let thread_id = url_vec.get(url_vec.len() - 1).unwrap();
|
||||||
|
let board = url_vec.get(url_vec.len() - 3).unwrap();
|
||||||
|
|
||||||
|
format!("https://a.4cdn.org/{}/thread/{}.json", board, thread_id)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Builds the JSON URL `https://a.4cdn.org/<board>/catalog.json` for a board catalog page URL.
///
/// `target` is expected to end in `<board>/catalog` (for example
/// `https://boards.4chan.org/g/catalog`, scheme optional); the board name is
/// the second-to-last `/`-separated segment.
///
/// # Panics
/// Panics if `target` contains fewer than two `/`-separated segments.
fn create_board_url(target: String) -> String {
    // The final segment is the literal `catalog`, so the board is the one
    // before it. Indexing at `len()` (as the previous version did) is always
    // out of bounds and panicked on every call.
    let board = target
        .rsplit('/')
        .nth(1)
        .expect("board URL must contain a board segment before `catalog`");

    format!("https://a.4cdn.org/{}/catalog.json", board)
}
|
||||||
|
|
||||||
|
async fn get_imagelist(json_url: &str) -> Result<Vec<(String, String)>> {
|
||||||
|
let req_body = reqwest::get(json_url).await?.text().await?;
|
||||||
|
let json_data: Value = serde_json::from_str(req_body.as_str())?;
|
||||||
|
|
||||||
|
let mut thread_img_data: Vec<(String, String)> = Vec::new();
|
||||||
|
for post in json_data["posts"].as_array().unwrap() {
|
||||||
|
if post["tim"].is_i64() {
|
||||||
|
thread_img_data.push((
|
||||||
|
post["tim"].to_string(),
|
||||||
|
post["ext"].as_str().unwrap().to_string(),
|
||||||
|
));
|
||||||
|
} else {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(thread_img_data)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Program entry point, run on the Tokio runtime via `#[tokio::main]`.
///
/// Parses the CLI arguments, prints the resulting configuration, then builds
/// the JSON URL for the selected `Mode` and prints the list returned by
/// `get_imagelist`. Downloading the images is not implemented yet (see the
/// `3.)` placeholder comments). Errors from argument parsing and from the
/// request are propagated to the caller with `?`.
#[tokio::main]
|
||||||
|
async fn main() -> Result<()> {
|
||||||
|
// TODO: add possible config-file for default output path (similar to wgdl.py)
|
||||||
|
let (path, target, mode) = parse_cli_args()?;
|
||||||
|
|
||||||
println!(
|
println!(
|
||||||
"CONFIG:\n\tPATH: {:?}\n\tTARGET: {}\n\tMODE: {:?}",
|
"\nCONFIG:\n\tPATH: {:?}\n\tTARGET: {}\n\tMODE: {:?}\n",
|
||||||
path, target, mode
|
path, target, mode
|
||||||
);
|
);
|
||||||
|
|
||||||
|
match mode {
|
||||||
|
Mode::Thread => {
|
||||||
|
let json_url = create_thread_url(target);
|
||||||
|
let id_list = get_imagelist(&json_url.as_str()).await?;
|
||||||
|
println!("{:#?}", id_list);
|
||||||
|
// 3.) download img based on json
|
||||||
|
}
|
||||||
|
Mode::Board => {
|
||||||
|
let json_url = create_board_url(target);
|
||||||
|
let id_list = get_imagelist(&json_url.as_str()).await?;
|
||||||
|
println!("{:#?}", id_list);
|
||||||
|
// 3.) download img based on json
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user