2,160 changes: 2,160 additions & 0 deletions topics/web-scraper/Cargo.lock

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions topics/web-scraper/Cargo.toml
@@ -0,0 +1,11 @@
[package]
name = "Despaux_Noa_WebScraper"
version = "0.1.0"
edition = "2021"

[dependencies]
clap = { version = "4.5", features = ["derive"] }
reqwest = { version = "0.12", features = ["blocking"] }
scraper = "0.20"
url = "2.5"
thiserror = "1.0"
58 changes: 58 additions & 0 deletions topics/web-scraper/docs/architecture.md
@@ -0,0 +1,58 @@
# Web Scraper Architecture

## Overview

A concurrent Rust web scraper that downloads pages from starting URLs, follows links up to a configurable depth, and saves results in a hierarchical directory structure. It uses multiple threads for parallel processing and avoids revisiting URLs.

## Main Modules

- **cli.rs**: Parses command-line arguments (output directory, depth, URLs) using `clap`.
- **downloader.rs**: Downloads HTML content via `reqwest`.
- **parser.rs**: Extracts and resolves links from HTML using `scraper` and `url` (a short usage example follows this list).
- **storage.rs**: Saves pages in a directory structure reflecting link relationships.
- **scraper.rs**: Coordinates crawling, concurrency, depth, and duplicate prevention.
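
The parser's role is easiest to see on a concrete input. The snippet below is illustrative only and not part of this change set: it is a unit test one could add to `src/parser.rs` (shown later in this diff), exercising `Parser::extract_links` on a hypothetical HTML fragment.

```rust
// Illustrative unit test (not part of this PR) for src/parser.rs.
#[cfg(test)]
mod tests {
    use super::Parser;

    #[test]
    fn resolves_relative_links_and_keeps_only_http() {
        let html = r#"<a href="/about">About</a>
                      <a href="https://other.org/page">Other</a>
                      <a href="mailto:hi@example.com">Mail</a>"#;
        let links = Parser::extract_links(html, "https://example.com/index.html");
        // Relative hrefs are resolved against the base URL; non-HTTP(S)
        // schemes such as mailto: are dropped.
        assert_eq!(
            links,
            vec![
                "https://example.com/about".to_string(),
                "https://other.org/page".to_string(),
            ]
        );
    }
}
```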

## Architecture Flow

```
CLI → Scraper Engine → [Worker Threads]
                              |
                 Task Queue (synchronized)
                              |
                Downloader, Parser, Storage
```

1. User provides URLs and options via CLI.
2. Scraper initializes shared queue and visited set.
3. Worker threads repeatedly take a URL from the queue, download the page, save it, and extract its links, queuing newly discovered URLs until the maximum depth is reached (see the sketch below).
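
The queue-and-worker pattern behind steps 2 and 3 boils down to the following sketch. It is a simplified stand-in, not the project's code: the authoritative implementation, which also tracks a parent path per task and backs off briefly before a worker exits, is in `src/scraper.rs` later in this diff.

```rust
use std::collections::{HashSet, VecDeque};
use std::sync::{Arc, Mutex};
use std::thread;

fn main() {
    // Shared crawl state: (url, depth) tasks plus the set of URLs already claimed.
    let queue = Arc::new(Mutex::new(VecDeque::from([
        ("https://example.com".to_string(), 0_usize),
    ])));
    let visited = Arc::new(Mutex::new(HashSet::<String>::new()));
    let max_depth = 2_usize;

    let handles: Vec<_> = (0..4)
        .map(|_| {
            let (queue, visited) = (Arc::clone(&queue), Arc::clone(&visited));
            thread::spawn(move || loop {
                // Pop under a short-lived lock so other workers are not blocked.
                let task = queue.lock().unwrap().pop_front();
                let Some((url, depth)) = task else { break };
                // Only the first worker to insert a URL processes it.
                if !visited.lock().unwrap().insert(url.clone()) {
                    continue;
                }
                // Downloading, parsing, and saving would happen here;
                // pretend exactly one new link was discovered.
                if depth < max_depth {
                    queue.lock().unwrap().push_back((format!("{url}/next"), depth + 1));
                }
            })
        })
        .collect();

    for handle in handles {
        handle.join().unwrap();
    }
}
```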

## Usage

```bash
cargo build --release
./target/release/Despaux_Noa_WebScraper --output ./output --depth 2 https://example.com
```

- `--output <DIR>`: Output directory
- `--depth <N>`: Maximum crawl depth (default: 1). Starting pages count as depth 0, so a depth of 1 also fetches the pages they link to directly.
- `<URL>...`: Starting URLs

## Output Example

```
output/
├── example.com.html
└── example.com/
    ├── example.com_page1.html
    └── example.com_page1/
        └── example.com_subpage1.html
```
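
This layout is produced by the storage module, whose source is not shown in this part of the diff. The sketch below is only a guess at its interface, inferred from how `src/scraper.rs` calls `Storage::new` and `save_page`; the real module may well differ (for example, it likely uses `thiserror` rather than plain `String` errors).

```rust
use std::fs;
use std::path::{Path, PathBuf};

/// Hypothetical sketch of the storage module; the real src/storage.rs may differ.
pub struct Storage {
    root: PathBuf,
}

impl Storage {
    /// Creates the output directory if it does not exist yet.
    pub fn new(output: &str) -> Result<Self, String> {
        fs::create_dir_all(output)
            .map_err(|e| format!("Failed to create output directory {}: {}", output, e))?;
        Ok(Storage { root: PathBuf::from(output) })
    }

    /// Saves a page under `parent` (or the output root) and returns the file path,
    /// e.g. "https://example.com/page1" becomes "<dir>/example.com_page1.html".
    pub fn save_page(&self, url: &str, html: &str, parent: Option<&Path>) -> Result<PathBuf, String> {
        // Flatten the URL into a file-system-safe name.
        let name: String = url
            .trim_start_matches("https://")
            .trim_start_matches("http://")
            .chars()
            .map(|c| if c.is_alphanumeric() || c == '.' || c == '-' { c } else { '_' })
            .collect();
        let dir = parent.unwrap_or(self.root.as_path());
        fs::create_dir_all(dir)
            .map_err(|e| format!("Failed to create directory {}: {}", dir.display(), e))?;
        let path = dir.join(format!("{}.html", name.trim_end_matches('_')));
        fs::write(&path, html)
            .map_err(|e| format!("Failed to write {}: {}", path.display(), e))?;
        Ok(path)
    }
}
```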

## Features

- Concurrent crawling (4 threads)
- Duplicate prevention
- Hierarchical storage
- Progress feedback
- Error handling
- Configurable depth
19 changes: 19 additions & 0 deletions topics/web-scraper/src/cli.rs
@@ -0,0 +1,19 @@
use clap::Parser;

/// Web Scraper CLI
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
pub struct Cli {
/// Output directory for scraped pages
#[arg(short, long)]
pub output: String,

/// Maximum crawl depth
#[arg(short, long, default_value_t = 1)]
pub depth: usize,

/// Starting URLs
#[arg(required = true)]
pub urls: Vec<String>,
}

36 changes: 36 additions & 0 deletions topics/web-scraper/src/downloader.rs
@@ -0,0 +1,36 @@
use reqwest::blocking::Client;
use std::time::Duration;

/// Downloader: fetches HTML content from URLs
pub struct Downloader {
client: Client,
}

impl Downloader {
/// Creates a new Downloader with a configured HTTP client
pub fn new() -> Self {
let client = Client::builder()
.timeout(Duration::from_secs(30))
.user_agent("Mozilla/5.0 (compatible; RustWebScraper/0.1)")
.build()
.expect("Failed to create HTTP client");

Downloader { client }
}

/// Downloads the HTML content from the given URL
pub fn download(&self, url: &str) -> Result<String, String> {
self.client
.get(url)
.send()
.map_err(|e| format!("Failed to fetch {}: {}", url, e))?
.text()
.map_err(|e| format!("Failed to read response body: {}", e))
}
}

impl Default for Downloader {
fn default() -> Self {
Self::new()
}
}
30 changes: 30 additions & 0 deletions topics/web-scraper/src/main.rs
@@ -0,0 +1,30 @@
use clap::Parser;

mod cli;
mod downloader;
mod parser;
mod scraper;
mod storage;

use cli::Cli;
use scraper::Scraper;

/// Main entry point for the web scraper CLI
fn main() {
let cli = Cli::parse();

// Validate depth
if cli.depth == 0 {
eprintln!("Error: depth must be at least 1");
std::process::exit(1);
}

// Validate URLs
if cli.urls.is_empty() {
eprintln!("Error: at least one URL must be provided");
std::process::exit(1);
}

let scraper = Scraper::new();
scraper.run(&cli.output, cli.depth, &cli.urls);
}
42 changes: 42 additions & 0 deletions topics/web-scraper/src/parser.rs
@@ -0,0 +1,42 @@
use scraper::{Html, Selector};
use url::Url;

/// Parser: extracts links from HTML content
pub struct Parser;

impl Parser {
/// Extracts all absolute URLs from HTML content
///
/// # Arguments
/// * `html` - The HTML content to parse
/// * `base_url` - The base URL to resolve relative links against
///
/// # Returns
/// A vector of absolute URLs found in the HTML
pub fn extract_links(html: &str, base_url: &str) -> Vec<String> {
let document = Html::parse_document(html);
let selector = Selector::parse("a[href]").unwrap();

let base = match Url::parse(base_url) {
Ok(url) => url,
Err(_) => return vec![],
};

let mut links = Vec::new();

for element in document.select(&selector) {
if let Some(href) = element.value().attr("href") {
// Convert relative URLs to absolute
if let Ok(absolute_url) = base.join(href) {
let url_str = absolute_url.to_string();
// Only include http/https URLs
if url_str.starts_with("http://") || url_str.starts_with("https://") {
links.push(url_str);
}
}
}
}

links
}
}
188 changes: 188 additions & 0 deletions topics/web-scraper/src/scraper.rs
@@ -0,0 +1,188 @@
use crate::downloader::Downloader;
use crate::parser::Parser;
use crate::storage::Storage;
use std::collections::{HashSet, VecDeque};
use std::path::PathBuf;
use std::sync::{Arc, Mutex};
use std::thread;

/// Represents a URL to be crawled with its depth and parent path
#[derive(Clone, Debug)]
struct CrawlTask {
url: String,
depth: usize,
parent_path: Option<PathBuf>,
}

/// Scraper engine: manages crawl queue, concurrency, and depth limits
pub struct Scraper;

impl Scraper {
/// Creates a new Scraper instance
pub fn new() -> Self {
Scraper
}

/// Runs the web scraper
///
/// # Arguments
/// * `output` - Output directory for scraped pages
/// * `max_depth` - Maximum crawl depth
/// * `urls` - Starting URLs to crawl
pub fn run(&self, output: &str, max_depth: usize, urls: &[String]) {
println!("Starting web scraper...");
println!("Output directory: {}", output);
println!("Maximum depth: {}", max_depth);
println!("Starting URLs: {:?}", urls);

// Initialize storage
let storage = match Storage::new(output) {
Ok(s) => Arc::new(s),
Err(e) => {
eprintln!("Error: {}", e);
return;
}
};

// Track visited URLs to avoid duplicates
let visited = Arc::new(Mutex::new(HashSet::new()));

// Initialize crawl queue with starting URLs
let queue = Arc::new(Mutex::new(VecDeque::new()));
for url in urls {
queue.lock().unwrap().push_back(CrawlTask {
url: url.clone(),
depth: 0,
parent_path: None,
});
}

// Number of worker threads
let num_threads = 4;
let mut handles = vec![];

// Spawn worker threads
for thread_id in 0..num_threads {
let queue = Arc::clone(&queue);
let visited = Arc::clone(&visited);
let storage = Arc::clone(&storage);
let downloader = Downloader::new();

let handle = thread::spawn(move || {
loop {
// Get next task from queue
let task = {
let mut q = queue.lock().unwrap();
q.pop_front()
};

match task {
Some(task) => {
// Check if already visited
let should_process = {
let mut v = visited.lock().unwrap();
if v.contains(&task.url) {
false
} else {
v.insert(task.url.clone());
true
}
};

if !should_process {
continue;
}

println!("[Thread {}] Crawling (depth {}): {}", thread_id, task.depth, task.url);

// Download the page
let html = match downloader.download(&task.url) {
Ok(content) => content,
Err(e) => {
eprintln!("[Thread {}] {}", thread_id, e);
continue;
}
};

// Save the page and get its path
let saved_file_path = match storage.save_page(&task.url, &html, task.parent_path.as_deref()) {
Ok(path) => path,
Err(e) => {
eprintln!("[Thread {}] {}", thread_id, e);
continue;
}
};

println!("[Thread {}] Saved: {} (depth {})", thread_id, task.url, task.depth);

// If we haven't reached max depth, extract and queue links
if task.depth < max_depth {
let links = Parser::extract_links(&html, &task.url);
let new_depth = task.depth + 1;

// Create subdirectory path from the saved file
// Convert "example.com.html" to "example.com/" for storing child pages
let child_directory = saved_file_path
.file_stem()
.and_then(|s| s.to_str())
.map(|s| {
if let Some(parent) = saved_file_path.parent() {
parent.join(s)
} else {
PathBuf::from(s)
}
})
.unwrap_or_else(|| PathBuf::from("links"));

let mut q = queue.lock().unwrap();
for link in links {
// Only queue if not visited
let should_queue = {
let v = visited.lock().unwrap();
!v.contains(&link)
};

if should_queue {
q.push_back(CrawlTask {
url: link,
depth: new_depth,
parent_path: Some(child_directory.clone()),
});
}
}
}
}
None => {
// Queue is empty, check if we should exit
thread::sleep(std::time::Duration::from_millis(100));

// If the queue is still empty after the brief wait, this worker exits. A worker
// that is still downloading will drain any links it enqueues itself, so no work
// is lost; parallelism just tapers off near the end of a crawl.
let q = queue.lock().unwrap();
if q.is_empty() {
break;
}
}
}
}
});

handles.push(handle);
}

// Wait for all threads to complete
for handle in handles {
handle.join().unwrap();
}

let visited_count = visited.lock().unwrap().len();
println!("\nScraping completed!");
println!("Total pages crawled: {}", visited_count);
}
}

impl Default for Scraper {
fn default() -> Self {
Self::new()
}
}
