2,160 changes: 2,160 additions & 0 deletions topics/web-scraper/Cargo.lock

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions topics/web-scraper/Cargo.toml
@@ -0,0 +1,11 @@
[package]
name = "Despaux_Noa_WebScraper"
version = "0.1.0"
edition = "2021"

[dependencies]
clap = { version = "4.5", features = ["derive"] }
reqwest = { version = "0.12", features = ["blocking"] }
scraper = "0.20"
url = "2.5"
thiserror = "1.0"
58 changes: 58 additions & 0 deletions topics/web-scraper/docs/architecture.md
@@ -0,0 +1,58 @@
# Web Scraper Architecture

## Overview

A concurrent Rust web scraper that downloads pages from starting URLs, follows links up to a configurable depth, and saves results in a hierarchical directory structure. It uses multiple threads for parallel processing and avoids revisiting URLs.

## Main Modules

- **cli.rs**: Parses command-line arguments (output directory, depth, URLs) using `clap`.
- **downloader.rs**: Downloads HTML content via `reqwest`.
- **parser.rs**: Extracts and resolves links from HTML using `scraper` and `url` (a short usage example follows this list).
- **storage.rs**: Saves pages in a directory structure reflecting link relationships.
- **scraper.rs**: Coordinates crawling, concurrency, depth, and duplicate prevention.
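
The parser's role is easiest to see on a concrete input. The snippet below is illustrative only and not part of this change set: it is a unit test one could add to `src/parser.rs` (shown later in this diff), exercising `Parser::extract_links` on a hypothetical HTML fragment.

```rust
// Illustrative unit test (not part of this PR) for src/parser.rs.
#[cfg(test)]
mod tests {
    use super::Parser;

    #[test]
    fn resolves_relative_links_and_keeps_only_http() {
        let html = r#"<a href="/about">About</a>
                      <a href="https://other.org/page">Other</a>
                      <a href="mailto:hi@example.com">Mail</a>"#;
        let links = Parser::extract_links(html, "https://example.com/index.html");
        // Relative hrefs are resolved against the base URL; non-HTTP(S)
        // schemes such as mailto: are dropped.
        assert_eq!(
            links,
            vec![
                "https://example.com/about".to_string(),
                "https://other.org/page".to_string(),
            ]
        );
    }
}
```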

## Architecture Flow

```
CLI → Scraper Engine → [Worker Threads]
                              |
                 Task Queue (synchronized)
                              |
                Downloader, Parser, Storage
```

1. User provides URLs and options via CLI.
2. Scraper initializes shared queue and visited set.
3. Worker threads repeatedly take a URL from the queue, download the page, save it, and extract its links, queuing newly discovered URLs until the maximum depth is reached (see the sketch below).
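
The queue-and-worker pattern behind steps 2 and 3 boils down to the following sketch. It is a simplified stand-in, not the project's code: the authoritative implementation, which also tracks a parent path per task and backs off briefly before a worker exits, is in `src/scraper.rs` later in this diff.

```rust
use std::collections::{HashSet, VecDeque};
use std::sync::{Arc, Mutex};
use std::thread;

fn main() {
    // Shared crawl state: (url, depth) tasks plus the set of URLs already claimed.
    let queue = Arc::new(Mutex::new(VecDeque::from([
        ("https://example.com".to_string(), 0_usize),
    ])));
    let visited = Arc::new(Mutex::new(HashSet::<String>::new()));
    let max_depth = 2_usize;

    let handles: Vec<_> = (0..4)
        .map(|_| {
            let (queue, visited) = (Arc::clone(&queue), Arc::clone(&visited));
            thread::spawn(move || loop {
                // Pop under a short-lived lock so other workers are not blocked.
                let task = queue.lock().unwrap().pop_front();
                let Some((url, depth)) = task else { break };
                // Only the first worker to insert a URL processes it.
                if !visited.lock().unwrap().insert(url.clone()) {
                    continue;
                }
                // Downloading, parsing, and saving would happen here;
                // pretend exactly one new link was discovered.
                if depth < max_depth {
                    queue.lock().unwrap().push_back((format!("{url}/next"), depth + 1));
                }
            })
        })
        .collect();

    for handle in handles {
        handle.join().unwrap();
    }
}
```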

## Usage

```bash
cargo build --release
./target/release/Despaux_Noa_WebScraper --output ./output --depth 2 https://example.com
```

- `--output <DIR>`: Output directory
- `--depth <N>`: Maximum crawl depth (default: 1). Starting pages count as depth 0, so a depth of 1 also fetches the pages they link to directly.
- `<URL>...`: Starting URLs

## Output Example

```
output/
├── example.com.html
└── example.com/
    ├── example.com_page1.html
    └── example.com_page1/
        └── example.com_subpage1.html
```
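
This layout is produced by the storage module, whose source is not shown in this part of the diff. The sketch below is only a guess at its interface, inferred from how `src/scraper.rs` calls `Storage::new` and `save_page`; the real module may well differ (for example, it likely uses `thiserror` rather than plain `String` errors).

```rust
use std::fs;
use std::path::{Path, PathBuf};

/// Hypothetical sketch of the storage module; the real src/storage.rs may differ.
pub struct Storage {
    root: PathBuf,
}

impl Storage {
    /// Creates the output directory if it does not exist yet.
    pub fn new(output: &str) -> Result<Self, String> {
        fs::create_dir_all(output)
            .map_err(|e| format!("Failed to create output directory {}: {}", output, e))?;
        Ok(Storage { root: PathBuf::from(output) })
    }

    /// Saves a page under `parent` (or the output root) and returns the file path,
    /// e.g. "https://example.com/page1" becomes "<dir>/example.com_page1.html".
    pub fn save_page(&self, url: &str, html: &str, parent: Option<&Path>) -> Result<PathBuf, String> {
        // Flatten the URL into a file-system-safe name.
        let name: String = url
            .trim_start_matches("https://")
            .trim_start_matches("http://")
            .chars()
            .map(|c| if c.is_alphanumeric() || c == '.' || c == '-' { c } else { '_' })
            .collect();
        let dir = parent.unwrap_or(self.root.as_path());
        fs::create_dir_all(dir)
            .map_err(|e| format!("Failed to create directory {}: {}", dir.display(), e))?;
        let path = dir.join(format!("{}.html", name.trim_end_matches('_')));
        fs::write(&path, html)
            .map_err(|e| format!("Failed to write {}: {}", path.display(), e))?;
        Ok(path)
    }
}
```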

## Features

- Concurrent crawling (4 threads)
- Duplicate prevention
- Hierarchical storage
- Progress feedback
- Error handling
- Configurable depth
19 changes: 19 additions & 0 deletions topics/web-scraper/src/cli.rs
@@ -0,0 +1,19 @@
use clap::Parser;

/// Web Scraper CLI
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
pub struct Cli {
/// Output directory for scraped pages
#[arg(short, long)]
pub output: String,

/// Maximum crawl depth
#[arg(short, long, default_value_t = 1)]
pub depth: usize,

/// Starting URLs
#[arg(required = true)]
pub urls: Vec<String>,
}

36 changes: 36 additions & 0 deletions topics/web-scraper/src/downloader.rs
@@ -0,0 +1,36 @@
use reqwest::blocking::Client;
use std::time::Duration;

/// Downloader: fetches HTML content from URLs
pub struct Downloader {
client: Client,
}

impl Downloader {
/// Creates a new Downloader with a configured HTTP client
pub fn new() -> Self {
let client = Client::builder()
.timeout(Duration::from_secs(30))
.user_agent("Mozilla/5.0 (compatible; RustWebScraper/0.1)")
.build()
.expect("Failed to create HTTP client");

Downloader { client }
}

/// Downloads the HTML content from the given URL
pub fn download(&self, url: &str) -> Result<String, String> {
self.client
.get(url)
.send()
.map_err(|e| format!("Failed to fetch {}: {}", url, e))?
.text()
.map_err(|e| format!("Failed to read response body: {}", e))
}
}

impl Default for Downloader {
fn default() -> Self {
Self::new()
}
}
30 changes: 30 additions & 0 deletions topics/web-scraper/src/main.rs
@@ -0,0 +1,30 @@
use clap::Parser;

mod cli;
mod downloader;
mod parser;
mod scraper;
mod storage;

use cli::Cli;
use scraper::Scraper;

/// Main entry point for the web scraper CLI
fn main() {
let cli = Cli::parse();

// Validate depth
if cli.depth == 0 {
eprintln!("Error: depth must be at least 1");
std::process::exit(1);
}

// Validate URLs
if cli.urls.is_empty() {
eprintln!("Error: at least one URL must be provided");
std::process::exit(1);
}

let scraper = Scraper::new();
scraper.run(&cli.output, cli.depth, &cli.urls);
}
42 changes: 42 additions & 0 deletions topics/web-scraper/src/parser.rs
@@ -0,0 +1,42 @@
use scraper::{Html, Selector};
use url::Url;

/// Parser: extracts links from HTML content
pub struct Parser;

impl Parser {
/// Extracts all absolute URLs from HTML content
///
/// # Arguments
/// * `html` - The HTML content to parse
/// * `base_url` - The base URL to resolve relative links against
///
/// # Returns
/// A vector of absolute URLs found in the HTML
pub fn extract_links(html: &str, base_url: &str) -> Vec<String> {
let document = Html::parse_document(html);
let selector = Selector::parse("a[href]").unwrap();

let base = match Url::parse(base_url) {
Ok(url) => url,
Err(_) => return vec![],
};

let mut links = Vec::new();

for element in document.select(&selector) {
if let Some(href) = element.value().attr("href") {
// Convert relative URLs to absolute
if let Ok(absolute_url) = base.join(href) {
let url_str = absolute_url.to_string();
// Only include http/https URLs
if url_str.starts_with("http://") || url_str.starts_with("https://") {
links.push(url_str);
}
}
}
}

links
}
}
188 changes: 188 additions & 0 deletions topics/web-scraper/src/scraper.rs
@@ -0,0 +1,188 @@
use crate::downloader::Downloader;
use crate::parser::Parser;
use crate::storage::Storage;
use std::collections::{HashSet, VecDeque};
use std::path::PathBuf;
use std::sync::{Arc, Mutex};
use std::thread;

/// Represents a URL to be crawled with its depth and parent path
#[derive(Clone, Debug)]
struct CrawlTask {
url: String,
depth: usize,
parent_path: Option<PathBuf>,
}

/// Scraper engine: manages crawl queue, concurrency, and depth limits
pub struct Scraper;

impl Scraper {
/// Creates a new Scraper instance
pub fn new() -> Self {
Scraper
}

/// Runs the web scraper
///
/// # Arguments
/// * `output` - Output directory for scraped pages
/// * `max_depth` - Maximum crawl depth
/// * `urls` - Starting URLs to crawl
pub fn run(&self, output: &str, max_depth: usize, urls: &[String]) {
println!("Starting web scraper...");
println!("Output directory: {}", output);
println!("Maximum depth: {}", max_depth);
println!("Starting URLs: {:?}", urls);

// Initialize storage
let storage = match Storage::new(output) {
Ok(s) => Arc::new(s),
Err(e) => {
eprintln!("Error: {}", e);
return;
}
};

// Track visited URLs to avoid duplicates
let visited = Arc::new(Mutex::new(HashSet::new()));

// Initialize crawl queue with starting URLs
let queue = Arc::new(Mutex::new(VecDeque::new()));
for url in urls {
queue.lock().unwrap().push_back(CrawlTask {
url: url.clone(),
depth: 0,
parent_path: None,
});
}

// Number of worker threads
let num_threads = 4;
let mut handles = vec![];

// Spawn worker threads
for thread_id in 0..num_threads {
let queue = Arc::clone(&queue);
let visited = Arc::clone(&visited);
let storage = Arc::clone(&storage);
let downloader = Downloader::new();

let handle = thread::spawn(move || {
loop {
// Get next task from queue
let task = {
let mut q = queue.lock().unwrap();
q.pop_front()
};

match task {
Some(task) => {
// Check if already visited
let should_process = {
let mut v = visited.lock().unwrap();
if v.contains(&task.url) {
false
} else {
v.insert(task.url.clone());
true
}
};

if !should_process {
continue;
}

println!("[Thread {}] Crawling (depth {}): {}", thread_id, task.depth, task.url);

// Download the page
let html = match downloader.download(&task.url) {
Ok(content) => content,
Err(e) => {
eprintln!("[Thread {}] {}", thread_id, e);
continue;
}
};

// Save the page and get its path
let saved_file_path = match storage.save_page(&task.url, &html, task.parent_path.as_deref()) {
Ok(path) => path,
Err(e) => {
eprintln!("[Thread {}] {}", thread_id, e);
continue;
}
};

println!("[Thread {}] Saved: {} (depth {})", thread_id, task.url, task.depth);

// If we haven't reached max depth, extract and queue links
if task.depth < max_depth {
let links = Parser::extract_links(&html, &task.url);
let new_depth = task.depth + 1;

// Create subdirectory path from the saved file
// Convert "example.com.html" to "example.com/" for storing child pages
let child_directory = saved_file_path
.file_stem()
.and_then(|s| s.to_str())
.map(|s| {
if let Some(parent) = saved_file_path.parent() {
parent.join(s)
} else {
PathBuf::from(s)
}
})
.unwrap_or_else(|| PathBuf::from("links"));

let mut q = queue.lock().unwrap();
for link in links {
// Only queue if not visited
let should_queue = {
let v = visited.lock().unwrap();
!v.contains(&link)
};

if should_queue {
q.push_back(CrawlTask {
url: link,
depth: new_depth,
parent_path: Some(child_directory.clone()),
});
}
}
}
}
None => {
// Queue is empty, check if we should exit
thread::sleep(std::time::Duration::from_millis(100));

// If the queue is still empty after the brief wait, this worker exits. A worker
// that is still downloading will drain any links it enqueues itself, so no work
// is lost; parallelism just tapers off near the end of a crawl.
let q = queue.lock().unwrap();
if q.is_empty() {
break;
}
}
}
}
});

handles.push(handle);
}

// Wait for all threads to complete
for handle in handles {
handle.join().unwrap();
}

let visited_count = visited.lock().unwrap().len();
println!("\nScraping completed!");
println!("Total pages crawled: {}", visited_count);
}
}

impl Default for Scraper {
fn default() -> Self {
Self::new()
}
}
