feat(BREAKING): comprehensive filtering, fixes #1

This commit is contained in:
ae 2024-09-28 04:27:49 +03:00
parent 4f26e93fd2
commit 01de99663b
Signed by: ae
GPG Key ID: 995EFD5C1B532B3E
9 changed files with 1400 additions and 657 deletions

3
.gitignore vendored
View File

@ -1,2 +1 @@
/target
testdata.json
target/

1292
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,13 +1,17 @@
[package]
name = "dlrs"
version = "0.2.1"
description = "Imageboard media downloader"
version = "0.2.2"
edition = "2021"
[dependencies]
clap = "3.1.18"
regex = "1.5.6"
reqwest = { version = "0.11", features = ["json"] }
tokio = { version = "1.18.5", features = ["full"] }
serde_json = "1.0"
futures = "0.3"
colored = "2.0.0"
base64 = "0.22.1"
clap = { version = "4.5.18", features = ["derive"] }
colored = "2.1.0"
futures = "0.3.30"
lazy-regex = "3.3.0"
md-5 = "0.10.6"
num = "0.4.3"
reqwest = "0.12.7"
serde_json = "1.0.128"
tokio = { version = "1.40.0", features = ["full"] }

View File

@ -4,16 +4,16 @@
### Usage
Build the optimized binary with `cargo build --release`.
```
USAGE:
dlrs --output <PATH> <--thread <URL>|--board <URL>>
Usage: dlrs [OPTIONS] -o <PATH> <-t <URL>|-b <URL>>
OPTIONS:
-b, --board <URL> Set a board URL
-h, --help Print help information
-o, --output <PATH> Set an output directory
-t, --thread <URL> Set a thread URL
-V, --version Print version information
Options:
-o <PATH> Set the output directory
-t <URL> Set the thread URL
-b <URL> Set the board URL
-n <MIN_RES> Set the minimum image resolution (e.g. "1920x1080")
-m <MAX_RES> Set the maximum image resolution (e.g. "3840x2160")
-a <RATIOS> Set a comma-separated list of accepted image aspect ratios (e.g. "4:3,16:9")
-v Toggle verbose output
-h, --help Print help
```

View File

@ -1,37 +1,95 @@
use std::{error::Error, process::exit};
use colored::Colorize;
use reqwest::{header::USER_AGENT, Client};
use serde_json::Value;
type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
use crate::{Config, UA};
pub fn parse_url(url: &str) -> (String, String) {
/// Downloads the media of every thread listed in the catalogue of the board
/// that `cfg.target` points to.
///
/// Threads are processed one after another. Returns the summed
/// `(downloaded, filtered)` counts reported by `crate::thread::dl`; the first
/// error from any step aborts the run and is returned.
pub async fn dl(cfg: Config, client: Client) -> Result<(usize, usize), Box<dyn Error>> {
    let (catalog_url, board) = parse_url(cfg.target.clone());
    let banner = format!("Board JSON URL: {catalog_url}");
    println!("{}", banner.blue().bold());

    let (thread_count, thread_urls) = parse_json(&client, catalog_url, board.clone()).await?;
    let summary = format!("Current thread count of {board}: {thread_count}");
    println!("{}", summary.blue().bold());

    let mut totals = (0, 0);
    for thread_url in thread_urls {
        let combo = Some((thread_url, board.clone()));
        let (downloaded, filtered) = crate::thread::dl(cfg.clone(), client.clone(), combo).await?;
        totals.0 += downloaded;
        totals.1 += filtered;
    }

    Ok(totals)
}
fn parse_url(url: String) -> (String, String) {
let url_split: Vec<&str> = url.split('/').collect();
let board_name = url_split.get(url_split.len() - 2).unwrap();
let board = url_split.get(url_split.len() - 2).unwrap();
(
format!("https://a.4cdn.org/{}/catalog.json", board_name),
board_name.to_string(),
format!("https://a.4cdn.org/{board}/catalog.json",),
board.to_string(),
)
}
pub async fn get_threadlist(json_url: &str, board_name: &str) -> Result<(usize, Vec<String>)> {
let req_body = reqwest::get(json_url).await?.text().await?;
let json_data: Value = serde_json::from_str(req_body.as_str())?;
let board: Vec<Value> = json_data
async fn parse_json(
client: &Client,
url: String,
board: String,
) -> Result<(usize, Vec<String>), Box<dyn Error>> {
let mut threads = Vec::new();
let res_txt = match client.get(url.clone()).header(USER_AGENT, UA).send().await {
Ok(res) => res.text().await?,
Err(e) => {
eprintln!(
"{}",
format!("Failed to request the thread JSON data from {url}: {e}")
.red()
.bold()
);
exit(1);
}
};
let data: Value = match serde_json::from_str(&res_txt) {
Ok(data) => data,
Err(e) => {
eprintln!(
"{}",
format!("Failed to parse the raw data from {url}: {e}")
.red()
.bold()
);
exit(1);
}
};
let catalogue: Vec<Value> = data
.as_array()
.unwrap()
.iter()
.map(|page| page["threads"].clone())
.collect();
let mut board_data: Vec<String> = Vec::new();
board.iter().for_each(|thread_arr| {
thread_arr.as_array().unwrap().iter().for_each(|thread| {
let url = format!(
"https://a.4cdn.org/{}/thread/{}.json",
board_name, thread["no"]
);
board_data.push(url);
});
});
for thread_array in catalogue {
for thread in thread_array.as_array().unwrap() {
let thread_id = &thread["no"];
let url = format!("https://a.4cdn.org/{board}/thread/{thread_id}.json");
Ok((board_data.len(), board_data))
threads.push(url);
}
}
Ok((threads.len(), threads))
}

View File

@ -1,95 +0,0 @@
use colored::Colorize;
use futures::{stream, StreamExt};
use reqwest::Client;
use serde_json::Value;
use std::{
path::{Path, PathBuf},
process::exit,
};
use tokio::{fs::File, io::AsyncWriteExt};
type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
/// Fetches the thread JSON at `json_url` and lists every attachment in it.
///
/// Each returned pair is `(image URL on i.4cdn.org, output file path below
/// `output_path`)`. Posts whose `tim` field is not an integer are skipped.
/// A failed request or unparsable JSON prints an error and terminates the
/// process with exit code `0x0100`.
pub async fn get_imagelist(
    json_url: &str,
    board_name: &str,
    output_path: &Path,
) -> Result<Vec<(String, PathBuf)>> {
    let response = reqwest::get(json_url).await.unwrap_or_else(|_| {
        eprintln!("{}", format!("Error requesting {}", json_url).bold().red());
        exit(0x0100)
    });
    let body = response.text().await?;

    let thread: Value = serde_json::from_str(&body).unwrap_or_else(|e| {
        eprintln!(
            "{}",
            format!("Error parsing json from {}: {}", json_url, e)
                .bold()
                .red()
        );
        exit(0x0100)
    });

    let images = thread["posts"]
        .as_array()
        .unwrap()
        .iter()
        .filter(|post| post["tim"].is_i64())
        .map(|post| {
            let id = post["tim"].to_string();
            let ext = post["ext"].as_str().unwrap();
            let file_name = format!("{}{}", id, ext);
            let url = format!("https://i.4cdn.org/{}/{}", board_name, file_name);
            (url, output_path.join(file_name.as_str()))
        })
        .collect();

    Ok(images)
}
/// Downloads every `(url, path)` pair in `img_data` and writes each response
/// body to its path.
///
/// Requests are driven through `buffer_unordered(100)`. Request failures and
/// body-read failures are reported on stderr and otherwise ignored; failing to
/// create or write an output file panics because of the `unwrap()` calls.
///
/// Returns `img_data.len()`, i.e. the number of attempted downloads rather
/// than the number that succeeded.
pub async fn get_images(img_data: &Vec<(String, PathBuf)>) -> Result<usize> {
let client = Client::builder().build()?;
// One future per image; nothing runs until the stream is polled below.
let futures = stream::iter(img_data.iter().map(|data| async {
let (url, path) = data;
let send_fut = client.get(url).send();
match send_fut.await {
Ok(res) => match res.bytes().await {
Ok(bytes) => {
// The whole body is held in memory before being written out.
let mut file = File::create(path).await.unwrap();
file.write_all(&bytes).await.unwrap();
println!(
"{}",
format!("{} bytes: {:?} -> {:?}", bytes.len(), url, path)
.italic()
.purple()
);
}
Err(_) => eprintln!(
"{}",
format!("Error converting request from {} to bytes", url)
.bold()
.red()
),
},
Err(_) => eprintln!("{}", format!("Error requesting {}", url).bold().red()),
}
}))
.buffer_unordered(100)
.collect::<Vec<()>>();
// Drives all downloads to completion.
futures.await;
Ok(img_data.len())
}

84
src/http.rs Normal file
View File

@ -0,0 +1,84 @@
use std::{error::Error, path::PathBuf, sync::Arc, time::Duration};
use base64::{engine::general_purpose, Engine};
use colored::Colorize;
use futures::{lock::Mutex, stream, StreamExt};
use md5::{Digest, Md5};
use reqwest::{header::USER_AGENT, Client};
use tokio::{fs::File, io::AsyncWriteExt};
use crate::UA;
/// Value passed to `pool_idle_timeout` when the download client is built.
const KEEP_ALIVE_TIMEOUT: Duration = Duration::from_secs(5);
pub async fn concurrent_dl(
images: Vec<(String, PathBuf, String)>,
) -> Result<(usize, usize), Box<dyn Error>> {
let dl_count = Arc::new(Mutex::new(0));
let sk_count = Arc::new(Mutex::new(0));
let client = Client::builder()
.pool_idle_timeout(KEEP_ALIVE_TIMEOUT)
.build()?;
let futures = stream::iter(images.iter().map(|data| async {
let dl_count = Arc::clone(&dl_count);
let _sk_count = Arc::clone(&sk_count);
let client = client.clone();
let (url, path, _expct_md5) = data;
let send_fut = client.get(url).header(USER_AGENT, UA).send();
match send_fut.await {
Ok(res) => match res.bytes().await {
Ok(bytes) => {
let byte_count = bytes.len();
let mut hasher = Md5::new();
hasher.update(&bytes);
let result = hasher.finalize();
let b64_md5 = general_purpose::STANDARD.encode(result);
// 4chan file attachment hash is always 24 character packed base64 encoded MD5. Truly a fucking state of art CRC.
// TODO: Figure out how the MD5 should be converted before uncommenting the following filtering condition
// if b64_md5 != *expct_md5 {
// eprintln!("{}", format!("File skipped due to mismatched MD5 (expected {expct_md5}, got {b64_md5})").red().bold());
// let mut sk_count = sk_count.lock().await;
// *sk_count += 1;
// return;
// }
let mut file = File::create(path).await.unwrap();
file.write_all(&bytes).await.unwrap();
let mut dl_count = dl_count.lock().await;
*dl_count += 1;
println!(
"{}",
format!("{b64_md5}: {byte_count} bytes").truecolor(0, 209, 27)
);
}
Err(_) => eprintln!(
"{}",
format!("Failed to convert request from {} to bytes", url)
.red()
.bold()
),
},
Err(_) => eprintln!("{}", format!("Failed to request {}", url).red().bold()),
}
}))
.buffer_unordered(100)
.collect::<Vec<()>>();
futures.await;
let dl = *dl_count.lock().await;
let sk = *sk_count.lock().await;
Ok((dl, sk))
}

View File

@ -1,127 +1,199 @@
mod board;
mod downloader;
mod http;
mod thread;
use clap::{Arg, ArgGroup, Command};
use colored::Colorize;
use regex::Regex;
use std::{path::PathBuf, process::exit};
type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
use clap::{Args, Parser};
use colored::Colorize;
use lazy_regex::*;
#[derive(Debug)]
/// Matches a resolution argument such as `1920x1080`: 3 to 6 digits on each
/// side of a literal `x`.
static RES_REX: Lazy<Regex> = lazy_regex!(r"^\d{3,6}x\d{3,6}$");
/// Matches a single aspect ratio such as `16:9`: 1 or 2 digits on each side of
/// a colon.
static RATIO_REX: Lazy<Regex> = lazy_regex!(r"^\d{1,2}:\d{1,2}$");
/// Matches a board catalogue URL or a thread URL, with an optional scheme.
///
/// The dots of the host name are escaped so they only match a literal `.`, and
/// the board segment also accepts digits so boards such as `/3/` or `/s4s/`
/// are not rejected.
static URL_REX: Lazy<Regex> = lazy_regex!(
    r"^((http|https)://)?boards\.(4chan|4channel)\.org/[a-zA-Z0-9]{1,4}/(catalog|thread/\d+)$"
);
/// User-Agent header value sent with every HTTP request.
pub static UA: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.3";
/// Selects whether the target URL is handled as a single thread or as a board.
#[derive(Debug, Clone, Copy)]
enum Mode {
/// The target is a thread URL (`-t`).
Thread,
/// The target is a board URL (`-b`).
Board,
}
fn parse_cli_args() -> Result<(PathBuf, String, Mode)> {
let matches = Command::new("dlrs")
.arg(
Arg::new("output")
.short('o')
.long("output")
.value_name("PATH")
.help("Set an output directory")
.takes_value(true)
.required(true),
)
.arg(
Arg::new("thread")
.short('t')
.long("thread")
.value_name("URL")
.help("Set a thread URL")
.takes_value(true),
)
.arg(
Arg::new("board")
.short('b')
.long("board")
.value_name("URL")
.help("Set a board URL")
.takes_value(true),
)
.group(
ArgGroup::new("target")
.args(&["thread", "board"])
.required(true),
)
.get_matches();
/// Validated run configuration built from the parsed command-line arguments.
#[derive(Debug, Clone)]
struct Config {
/// Whether `target` is a thread URL or a board URL.
mode: Mode,
/// Target URL taken from the `-t` or `-b` argument.
target: String,
/// Minimum accepted image resolution as `(width, height)`, if one was given.
min_res: Option<(u64, u64)>,
/// Maximum accepted image resolution as `(width, height)`, if one was given.
max_res: Option<(u64, u64)>,
/// Accepted aspect ratios as `(width, height)` pairs, if any were given.
aspect_ratios: Option<Vec<(u32, u32)>>,
/// Directory the downloaded files are written to.
out_dir: PathBuf,
/// Enables printing of the configuration and of per-image filter reasons.
verbose: bool,
}
let re = Regex::new(
r"^((http|https)://)?boards.(4chan|4channel).org/[a-zA-Z]{1,4}/(catalog|thread/\d+)$",
)?;
impl From<Cli> for Config {
fn from(value: Cli) -> Self {
let mode: Mode;
let target: String;
let path = PathBuf::from(matches.value_of("output").unwrap());
let target_match = matches.value_of("target").unwrap();
let target = match re.is_match(target_match) {
true => target_match,
false => {
eprintln!("{}", "Error: Invalid URL format".to_string().bold().red());
exit(0x0100);
if let Some(thread_url) = value.target.thread {
mode = Mode::Thread;
target = thread_url;
} else if let Some(board_url) = value.target.board {
mode = Mode::Board;
target = board_url;
} else {
eprintln!("{}", "No target URL".red().bold());
exit(1);
}
};
let mode = match matches.is_present("thread") {
true => Mode::Thread,
false => Mode::Board,
};
Ok((path, target.to_string(), mode))
if !URL_REX.is_match(&target) {
eprintln!("{}", "Target URL doesn't pass the RegEx check".red().bold());
exit(1);
}
let out_dir = PathBuf::from(value.output);
Config {
mode,
target,
min_res: Self::parse_res(value.min_res),
max_res: Self::parse_res(value.max_res),
aspect_ratios: Self::parse_aspect_ratios(value.aspect_ratios),
out_dir,
verbose: value.verbose,
}
}
}
impl Config {
    /// Parses a `WIDTHxHEIGHT` string (e.g. `"1920x1080"`) into `(width, height)`.
    ///
    /// Returns `None` when no value was given. A value that does not match
    /// `RES_REX` prints an error and terminates the process with exit code 1.
    fn parse_res(res_str: Option<String>) -> Option<(u64, u64)> {
        let res = res_str?;

        if !RES_REX.is_match(&res) {
            eprintln!(
                "{}",
                "Given resolution doesn't pass the RegEx check".red().bold()
            );
            exit(1);
        }

        let (width, height) = res
            .split_once('x')
            .expect("RES_REX guarantees exactly one 'x' separator");
        Some((
            width
                .parse()
                .expect("RES_REX guarantees at most 6 decimal digits"),
            height
                .parse()
                .expect("RES_REX guarantees at most 6 decimal digits"),
        ))
    }

    /// Parses a comma-separated list of `W:H` aspect ratios (e.g. `"4:3,16:9"`).
    ///
    /// Every ratio is stored in lowest terms. Image ratios are reduced before
    /// they are compared against this list, so without the reduction an entry
    /// such as `16:10` (which reduces to `8:5`) could never match any image.
    ///
    /// Returns `None` when no value was given. An entry that does not match
    /// `RATIO_REX`, or that has a zero component, prints an error and
    /// terminates the process with exit code 1.
    fn parse_aspect_ratios(ratios_str: Option<String>) -> Option<Vec<(u32, u32)>> {
        let ratios = ratios_str?;
        let mut parsed_vec = Vec::new();

        for r in ratios.split(',') {
            let parsed = if RATIO_REX.is_match(r) {
                r.split_once(':')
                    .and_then(|(w, h)| Some((w.parse::<u32>().ok()?, h.parse::<u32>().ok()?)))
                    // A zero side is not a meaningful ratio and would make the gcd zero.
                    .filter(|&(w, h)| w != 0 && h != 0)
            } else {
                None
            };

            let Some((w, h)) = parsed else {
                eprintln!(
                    "{}",
                    "One or more of the given aspect ratios don't pass the RegEx check"
                        .red()
                        .bold()
                );
                exit(1);
            };

            let div = Self::gcd(w, h);
            parsed_vec.push((w / div, h / div));
        }

        Some(parsed_vec)
    }

    /// Greatest common divisor by Euclid's algorithm; kept local so ratio
    /// normalisation needs no additional imports in this file.
    fn gcd(mut a: u32, mut b: u32) -> u32 {
        while b != 0 {
            (a, b) = (b, a % b);
        }
        a
    }
}
// Raw command-line arguments as parsed by clap.
//
// The `///` comments on the fields are emitted by clap as the `--help` text,
// so they are user-facing strings rather than internal documentation. Plain
// `//` comments are used for this note to keep the generated help unchanged.
#[derive(Debug, Parser)]
struct Cli {
    /// Set the output directory
    #[arg(short = 'o', value_name = "PATH", required = true)]
    output: String,

    /// Set the target URL
    #[command(flatten)]
    target: Target,

    /// Set the minimum image resolution (e.g. "1920x1080")
    #[arg(short = 'n', value_name = "MIN_RES")]
    min_res: Option<String>,

    // Fixed: this help text used to say "minimum", duplicating the `-n` description.
    /// Set the maximum image resolution (e.g. "3840x2160")
    #[arg(short = 'm', value_name = "MAX_RES")]
    max_res: Option<String>,

    /// Set a comma-separated list of accepted image aspect ratios (e.g. "4:3,16:9")
    #[arg(short = 'a', value_name = "RATIOS")]
    aspect_ratios: Option<String>,

    /// Toggle verbose output
    #[arg(short = 'v', action, default_value_t = false)]
    verbose: bool,
}
// Target selection flattened into `Cli`. The group attribute below makes the
// group required and disallows multiple members, so exactly one of `-t` / `-b`
// must be given. The `///` field comments are clap help text.
#[derive(Debug, Args)]
#[group(required = true, multiple = false)]
struct Target {
/// Set the thread URL
#[arg(short = 't', value_name = "URL")]
thread: Option<String>,
/// Set the board URL
#[arg(short = 'b', value_name = "URL")]
board: Option<String>,
}
#[tokio::main]
async fn main() -> Result<()> {
let (path, target, mode) = parse_cli_args()?;
println!(
"{}",
format!(
"\nDownload configuration:\n\tOUTPUT PATH: {:?}\n\tURL: {}\n\tDOWNLOAD MODE: {:?}\n",
path, target, mode
)
.bold()
.green()
);
async fn main() {
let args = Cli::parse();
let cfg = Config::from(args);
let client = reqwest::Client::new();
match mode {
Mode::Thread => {
let (json_url, board_name) = thread::parse_url(&target);
println!(
"{}",
format!("Parsing JSON from {}", json_url).bold().blue()
);
let img_data = downloader::get_imagelist(&json_url, &board_name, &path).await?;
let filecount = downloader::get_images(&img_data).await?;
println!(
"{}",
format!("Total of {} files downloaded from 1 thread.\n", filecount)
.bold()
.green()
);
}
Mode::Board => {
let (json_url, board_name) = board::parse_url(&target);
let (thread_amt, thread_data) = board::get_threadlist(&json_url, &board_name).await?;
let mut filecount: usize = 0;
for url in &thread_data {
println!("{}", format!("Parsing JSON from {}", url).bold().blue());
let img_data = downloader::get_imagelist(url, &board_name, &path).await?;
let total_amt = downloader::get_images(&img_data).await?;
filecount += total_amt;
}
println!(
"{}",
format!(
"Total of {} files downloaded from {} threads.\n",
filecount, thread_amt
)
.bold()
.green()
);
}
if cfg.verbose {
println!("\nDLRS CONFIG:\n{cfg:#?}\n");
}
Ok(())
let res = match cfg.mode {
Mode::Thread => thread::dl(cfg, client, None).await,
Mode::Board => board::dl(cfg, client).await,
};
let (dl, fl) = match res {
Ok((dl, fl)) => (dl, fl),
Err(e) => {
eprintln!(
"{}",
format!("Error during thread download: {e}").red().bold()
);
exit(1);
}
};
println!(
"{}",
format!("A total of {dl} files downloaded and {fl} filtered")
.truecolor(252, 156, 12)
.bold()
);
}

View File

@ -1,13 +1,162 @@
pub fn parse_url(url: &str) -> (String, String) {
use core::fmt;
use std::{error::Error, path::PathBuf, process::exit};
use crate::{http, Config, UA};
use colored::Colorize;
use num::integer::gcd;
use reqwest::{header::USER_AGENT, Client};
use serde_json::Value;
/// Reason an image was rejected by the configured filters.
#[derive(Debug)]
enum FilterCondition {
    /// The image exceeds the configured maximum resolution.
    MaxRes,
    /// The image is below the configured minimum resolution.
    MinRes,
    /// The image's aspect ratio is not in the accepted list.
    AspectRatio,
    /// The image was not filtered.
    None,
}

impl fmt::Display for FilterCondition {
    /// Writes the human-readable reason shown in verbose output.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let reason = match self {
            Self::MaxRes => "Image is larger than the set maximum resolution",
            // Previously said "maximum", which made both resolution messages
            // refer to the same limit.
            Self::MinRes => "Image is smaller than the set minimum resolution",
            Self::AspectRatio => "Image's aspect ratio doesn't match the listed ones",
            Self::None => "No reason",
        };
        f.write_str(reason)
    }
}
/// Downloads the attachments of one thread.
///
/// `combo` is an already resolved `(thread JSON URL, board)` pair; when it is
/// `None` both values are derived from `cfg.target`.
///
/// Returns `(downloaded, filtered)`, where the second value is the number of
/// images rejected by the filters plus the number skipped by the downloader.
pub async fn dl(
    cfg: Config,
    client: Client,
    combo: Option<(String, String)>,
) -> Result<(usize, usize), Box<dyn Error>> {
    let (json_url, board) = combo.unwrap_or_else(|| parse_url(cfg.target.clone()));

    let banner = format!("Thread JSON URL: {json_url}");
    println!("{}", banner.blue().bold());

    let (images, filtered) = parse_json(cfg.clone(), &client, json_url, board).await?;
    let (downloaded, skipped) = http::concurrent_dl(images).await?;

    Ok((downloaded, filtered + skipped))
}
fn parse_url(url: String) -> (String, String) {
let url_split: Vec<&str> = url.split('/').collect();
let thread_id = url_split.last().unwrap();
let board_name = url_split.get(url_split.len() - 3).unwrap();
let board = url_split.get(url_split.len() - 3).unwrap();
(
format!(
"https://a.4cdn.org/{}/thread/{}.json",
board_name, thread_id
),
board_name.to_string(),
format!("https://a.4cdn.org/{board}/thread/{thread_id}.json"),
board.to_string(),
)
}
/// Fetches the thread JSON at `url` and collects the attachments that pass the
/// configured filters.
///
/// Returns the accepted images as `(image URL, output path, MD5)` tuples
/// together with the number of attachments that were filtered out. Posts whose
/// `tim` field is not an integer are ignored.
///
/// A failed request or unparsable response body prints an error and terminates
/// the process with exit code 1.
///
/// # Errors
/// Returns an error when the response body cannot be read, or when the parsed
/// JSON has no `posts` array (previously this case panicked).
async fn parse_json(
    cfg: Config,
    client: &Client,
    url: String,
    board: String,
) -> Result<(Vec<(String, PathBuf, String)>, usize), Box<dyn Error>> {
    let res_txt = match client.get(url.as_str()).header(USER_AGENT, UA).send().await {
        Ok(res) => res.text().await?,
        Err(e) => {
            eprintln!(
                "{}",
                format!("Failed to request the thread JSON data from {url}: {e}")
                    .red()
                    .bold()
            );
            exit(1);
        }
    };

    let data: Value = match serde_json::from_str(&res_txt) {
        Ok(data) => data,
        Err(e) => {
            eprintln!(
                "{}",
                format!("Failed to parse the raw data from {url}: {e}")
                    .red()
                    .bold()
            );
            exit(1);
        }
    };

    // Valid JSON without a `posts` array is reported as an error instead of panicking.
    let posts = data["posts"]
        .as_array()
        .ok_or_else(|| format!("No \"posts\" array in the JSON data from {url}"))?;

    let mut filtered = 0;
    let mut images = Vec::new();
    for entry in posts {
        // Reply without an attachment
        if !entry["tim"].is_i64() {
            continue;
        }

        let (is_filtered, cond) = img_filter(cfg.clone(), entry);
        if is_filtered {
            if cfg.verbose {
                println!("Image filtered: {cond}");
            }
            filtered += 1;
            continue;
        }

        // Format: (img url, output file, md5 hash)
        images.push(parse_img(cfg.clone(), entry, &board));
    }

    Ok((images, filtered))
}
/// Builds the `(image URL, output path, MD5)` tuple for one post entry.
///
/// The file name is the entry's `tim` value followed by its `ext`; the same
/// name is used on `i.4cdn.org` and below `cfg.out_dir`. Panics when `ext` or
/// `md5` is not a JSON string.
fn parse_img(cfg: Config, entry: &Value, board: &str) -> (String, PathBuf, String) {
    let id = entry["tim"].to_string();
    // `as_str` yields the string contents without the surrounding JSON quotes.
    let ext = entry["ext"].as_str().unwrap();
    let md5 = entry["md5"].as_str().unwrap().to_string();

    let file_name = format!("{id}{ext}");
    let url = format!("https://i.4cdn.org/{board}/{file_name}");

    (url, cfg.out_dir.join(file_name), md5)
}
/// Decides whether the image described by `entry` is rejected by the filters
/// in `cfg`.
///
/// Returns `(true, reason)` for a rejected image and
/// `(false, FilterCondition::None)` otherwise. The checks run in the order
/// minimum resolution, maximum resolution, aspect ratio; the first failing
/// check determines the reason. Panics when `w` or `h` is not an unsigned
/// integer.
fn img_filter(cfg: Config, entry: &Value) -> (bool, FilterCondition) {
    let width = entry["w"].as_u64().unwrap();
    let height = entry["h"].as_u64().unwrap();
    let ratio = img_aspect_ratio(width, height);

    let too_small = cfg
        .min_res
        .map_or(false, |(min_w, min_h)| width < min_w || height < min_h);
    if too_small {
        return (true, FilterCondition::MinRes);
    }

    let too_large = cfg
        .max_res
        .map_or(false, |(max_w, max_h)| width > max_w || height > max_h);
    if too_large {
        return (true, FilterCondition::MaxRes);
    }

    // Only applies when a list of preferred aspect ratios was given.
    let ratio_rejected = cfg
        .aspect_ratios
        .as_ref()
        .map_or(false, |accepted| !accepted.contains(&ratio));
    if ratio_rejected {
        return (true, FilterCondition::AspectRatio);
    }

    (false, FilterCondition::None)
}
fn img_aspect_ratio(w: u64, h: u64) -> (u32, u32) {
let div = gcd(w, h);
let simpl_w = (w / div) as u32;
let simpl_h = (h / div) as u32;
(simpl_w, simpl_h)
}