A simple scraper-based parser that collects posts from AP News. Besides scraper, it uses reqwest, which is a standard option in the tokio ecosystem for making requests:
name = "scraping"
edition = "2021"
prest = { version = "0.5", path = "../../" }
reqwest = { version = "0.12", default-features=false, features = ["rustls-tls"] }
scraper = "0.22"
This example spawns the scrape function, which fetches the provided URL and extracts links to other pages from it (only 5 of them are used later, to limit the load). It then uses the join_all function from the futures crate to request all of these pages concurrently, awaits their bodies, extracts the titles and contents, and saves the results:
use prest::*;
use reqwest::get as fetch;
use scraper::{Html, Selector};
/// A single scraped article.
///
/// Values are produced by `get_content`: `title` comes from the inner HTML of
/// the matched title element, and `content` is the text of the matched body
/// elements with a newline appended after each one.
// NOTE(review): `Storage` is a derive from the prest prelude; it presumably
// provides the `Story::get_all()` used when rendering the page — confirm.
#[derive(Storage, Serialize, Deserialize)]
struct Story {
pub title: String,
pub content: String,
/// Application entry point.
///
/// Builds the CSS selectors used for scraping and registers a handler that
/// renders every stored `Story` as a title plus a shortened content preview.
// The `log_filters` pairs name the `html5ever` and `selectors` crates with the
// level "info" — presumably to quiet their more verbose output; confirm against
// the prest `init` documentation.
#[init(log_filters=[("html5ever", "info"), ("selectors", "info")])]
async fn main() -> Result {
// Selects anchor elements inside the page's list items — presumably the
// `links_selector` argument of `scrape`; the surrounding call is not visible here.
Selector::parse(".Page-content .PageList-items-item a").unwrap(),
// Selects paragraphs that are direct children of the story body — presumably
// the `content_selector` argument of `scrape`.
Selector::parse(".RichTextStoryBody > p").unwrap(),
// Handler: loads all stories and renders each one; the `{:.150}` precision
// limits the shown content to at most 150 characters before the trailing "...".
get(|| async {
ok(html!(html {(Head::with_title("With scraping"))
body { @for story in Story::get_all().await? {
div $"my-2" {
h3 {(story.title)}
div $"text-sm" {(format!("{:.150}...", story.content))}
/// Fetches `url`, follows a handful of the links found on it and turns each
/// linked page into a `Story`.
///
/// * `url` - page to fetch first.
/// * `links_selector` - selector passed to `get_links` to find links on that page.
/// * `title_selector` / `content_selector` - selectors passed to `get_content`
///   for every linked page.
///
/// A failure fetching or reading the initial page is propagated with `?`;
/// failed responses and unreadable bodies of linked pages are dropped via
/// `filter_map(.. .ok())`.
async fn scrape(
url: &str,
links_selector: Selector,
title_selector: Selector,
content_selector: Selector,
) -> Somehow {
// Download the initial page and read its body as text.
let text = fetch(url).await?.text().await?;
let links = get_links(text, &links_selector);
// restricting amount of parsed pages
// NOTE(review): slicing `[0..5]` panics when fewer than 5 links were found —
// consider a checked alternative.
let links = &links[0..5];
// Request all selected links concurrently, keeping only successful responses.
let responses = join_all(links.into_iter().map(|link| fetch(link)))
.filter_map(|resp| resp.ok());
// Read the bodies concurrently, skip the ones that fail, and parse the rest.
// NOTE(review): the next line has unbalanced parentheses and appears to be
// missing the `responses.map(` part — verify against the original source.
let stories: Vec<Story> = join_all(|resp| resp.text()))
.filter_map(|text| text.ok())
.map(|text| get_content(text, &title_selector, &content_selector))
for story in stories {;
/// Parses an HTML page and builds a `Story` from it.
///
/// * `text` - the page body as HTML text.
/// * `title_selector` / `content_selector` - selectors for the title and body
///   elements (the selection calls themselves are not visible in this excerpt).
///
/// Returns a `Story` holding the extracted `title` and `content`.
fn get_content(text: String, title_selector: &Selector, content_selector: &Selector) -> Story {
let document = Html::parse_document(&text);
// The title is taken from the inner HTML of the matched element.
let title = document
.map(|t| t.inner_html())
// The content is accumulated into one string: every text node of each matched
// element is appended, followed by a newline per element.
let content = document
.fold(String::new(), |full, p| {
p.text().fold(full, |full, text| full + text) + "\n"
Story { title, content }
fn get_links(text: String, selector: &Selector) -> Vec<String> {
let document = Html::parse_document(&text);
let mut links = document
.filter_map(|x| x.value().attr("href"))