A simple scraper-based parser that collects stories from AP News. For now it just prints them to the console.

Cargo.toml

[package]
name = "scraper"
edition = "2021"

[dependencies]
prest = { path = "../../", version = "0.4" }
tokio = { version = "1.35", features = ["full"] }
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
scraper = "0.18"
futures = "0.3"
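
Here reqwest fetches the pages, scraper parses the HTML and runs CSS selectors over it, futures awaits many requests concurrently, and prest serves the small status page.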

src/main.rs

use futures::future::join_all;
use prest::*;
use reqwest::get as fetch;
use scraper::{Html, Selector};
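// describes a scraping target: where it lives and how to find stories on it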
struct Target {
    pub url: String,
    pub links: Selector,
    pub title: Selector,
    pub content: Selector,
}

fn main() {
    // start scraping on a separate OS thread because scraper's Html values are !Send
    std::thread::spawn(|| {
        if let Err(e) = scrape(Target {
            url: "https://apnews.com".to_owned(),
            links: Selector::parse(".Page-content .PageList-items-item a").unwrap(),
            title: Selector::parse("h1.Page-headline").unwrap(),
            content: Selector::parse(".RichTextStoryBody > p").unwrap(),
        }) {
            eprintln!("scraping failed: {e}");
        }
    });

    route("/", get(homepage)).run()
}

async fn homepage() -> Markup {
    html!(html {
        (Head::with_title("With scraping"))
        body { $"text-xl" {"Check out the terminal for scraping results!"} }
    })
}

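// a scraped story reduced to the parts we print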
struct Story {
    pub title: String,
    pub content: String,
}

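// #[tokio::main] works on any function, not just main: it wraps the async body
// in a blocking one that spins up its own runtime, which is exactly what this
// dedicated thread needs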
#[tokio::main]
async fn scrape(target: Target) -> AnyhowResult<()> {
    let mut stories = vec![];
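    // fetch the landing page and parse the response body as HTML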
    let response = fetch(&target.url).await?.text().await?;
    let document = Html::parse_document(&response);

    // select the target's links, skipping anchors without an href
    let mut links = document
        .select(&target.links)
        .filter_map(|x| x.value().attr("href"))
        .collect::<Vec<&str>>();
    // remove duplicates
    links.sort_unstable();
    links.dedup();

    // await requests to each link
    let results = join_all(links.into_iter().map(|link| fetch(link))).await;
    // filter out unsuccessful results
    let responses = results.into_iter().filter_map(|resp| resp.ok());
    // await bodies of successful responses
    let texts = join_all(responses.map(|resp| resp.text())).await;
    // filter out malformed bodies and parse as html
    let bodies: Vec<Html> = texts
        .into_iter()
        .filter_map(|text| text.ok())
        .map(|text| Html::parse_document(&text))
        .collect();

    for body in bodies {
        // select the title's inner html and take the first match,
        // skipping pages where the selector finds nothing
        let Some(title) = body.select(&target.title).map(|t| t.inner_html()).next() else {
            continue;
        };
        // select content's text nodes and fold them together
        let content = body.select(&target.content).fold(String::new(), |full, p| {
            p.text().fold(full, |full, text| full + text) + "\n"
        });

        stories.push(Story { title, content });
    }
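    // print each title along with the first 150 characters of its content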
    for story in stories {
        println!("---{}\n{:.150}...\n\n", story.title, story.content);
    }
    Ok(())
}
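
Note that the code above assumes every scraped href is already an absolute URL, which holds for this AP News page. For targets that emit relative links, a small normalization helper could resolve them against the page they came from. A minimal sketch, assuming the url crate is added to the dependencies (absolutize is a hypothetical helper, not part of this example):

use url::Url;

// resolve a possibly-relative href against the page it was scraped from
fn absolutize(base: &str, href: &str) -> Option<String> {
    let resolved = Url::parse(base).ok()?.join(href).ok()?;
    Some(resolved.to_string())
}

Since all the site-specific details live in Target, pointing the scraper at another source is just a matter of new selectors. For illustration (these selectors and the URL are made up):

scrape(Target {
    url: "https://example.com/news".to_owned(),
    links: Selector::parse("article a").unwrap(),
    title: Selector::parse("h1").unwrap(),
    content: Selector::parse("article > p").unwrap(),
})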