Scrape Module Usage Examples

This document provides comprehensive usage examples for the scrape module.

Basic Scraping

Simple Scrape

from codomyrmex.scrape import Scraper

scraper = Scraper()
result = scraper.scrape("https://example.com")

print(result.content)  # Markdown content
print(result.metadata.get("title"))  # Page title

Multiple Formats

from codomyrmex.scrape import Scraper, ScrapeOptions, ScrapeFormat

scraper = Scraper()
options = ScrapeOptions(
    formats=[ScrapeFormat.MARKDOWN, ScrapeFormat.HTML, ScrapeFormat.METADATA]
)
result = scraper.scrape("https://example.com", options)

print(result.formats.get("markdown"))  # Markdown
print(result.formats.get("html"))  # HTML
print(result.formats.get("metadata"))  # Metadata

With Custom Headers

from codomyrmex.scrape import Scraper, ScrapeOptions

scraper = Scraper()
options = ScrapeOptions(
    headers={
        "User-Agent": "MyBot/1.0",
        "Accept": "text/html",
    }
)
result = scraper.scrape("https://example.com", options)

Website Crawling

Basic Crawl

from codomyrmex.scrape import Scraper, ScrapeOptions, ScrapeFormat

scraper = Scraper()
options = ScrapeOptions(
    formats=[ScrapeFormat.MARKDOWN],
    limit=10,  # Maximum pages
)

crawl_result = scraper.crawl("https://example.com", options)

print(f"Job ID: {crawl_result.job_id}")
print(f"Status: {crawl_result.status}")
print(f"Total pages: {crawl_result.total}")

for page in crawl_result.results:
    print(f"  - {page.url}: {len(page.content)} chars")

Crawl with Depth Control

from codomyrmex.scrape import Scraper, ScrapeOptions

scraper = Scraper()
options = ScrapeOptions(
    max_depth=2,  # Only crawl 2 levels deep
    limit=50,
)

crawl_result = scraper.crawl("https://example.com", options)

Site Mapping

Map All Links

from codomyrmex.scrape import Scraper

scraper = Scraper()
map_result = scraper.map("https://example.com")

print(f"Found {map_result.total} links")
for link in map_result.links[:10]:
    print(f"  {link.get('title', 'No title')}: {link.get('url')}")

Search for Specific Links

from codomyrmex.scrape import Scraper

scraper = Scraper()
# Find links related to "docs"
map_result = scraper.map("https://example.com", search="docs")

print(f"Found {map_result.total} matching links")
for link in map_result.links:
    print(f"  {link.get('title')}: {link.get('url')}")

Web Search

Basic Search

from codomyrmex.scrape import Scraper

scraper = Scraper()
search_result = scraper.search("python web scraping")

print(f"Found {search_result.total} results")
for result in search_result.results:
    print(f"  {result.url}: {result.metadata.get('title')}")

Search with Content Scraping

from codomyrmex.scrape import Scraper, ScrapeOptions, ScrapeFormat

scraper = Scraper()
options = ScrapeOptions(
    formats=[ScrapeFormat.MARKDOWN],
    limit=5,  # Number of results to scrape
)

search_result = scraper.search("python web scraping", options)

for result in search_result.results:
    print(f"\n{result.url}")
    print(result.content[:200])  # First 200 chars

Structured Data Extraction

Extract with Schema

from codomyrmex.scrape import Scraper

scraper = Scraper()

schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "author": {"type": "string"},
        "published_date": {"type": "string"},
        "content": {"type": "string"}
    },
    "required": ["title", "content"]
}

extract_result = scraper.extract(
    urls=["https://example.com/article"],
    schema=schema
)

print(extract_result.data)

Extract with Prompt

from codomyrmex.scrape import Scraper

scraper = Scraper()

extract_result = scraper.extract(
    urls=["https://example.com/article"],
    prompt="Extract the main points and key takeaways from this article"
)

print(extract_result.data)

Extract from Multiple URLs

from codomyrmex.scrape import Scraper

scraper = Scraper()

extract_result = scraper.extract(
    urls=[
        "https://example.com/article1",
        "https://example.com/article2",
        "https://example.com/article3",
    ],
    prompt="Extract article summaries"
)

for url, data in zip(extract_result.urls, extract_result.data):
    print(f"{url}: {data}")

Configuration

Environment Variables

# Shell: set these in your environment before starting Python
export FIRECRAWL_API_KEY="fc-your-api-key"
export SCRAPE_TIMEOUT="60.0"
export SCRAPE_MAX_RETRIES="5"

from codomyrmex.scrape import Scraper

# Automatically uses environment variables
scraper = Scraper()

Programmatic Configuration

from codomyrmex.scrape import Scraper, ScrapeConfig, set_config

config = ScrapeConfig(
    api_key="fc-your-key",
    default_timeout=60.0,
    max_retries=5,
    respect_robots_txt=True,
)

set_config(config)  # Set as global
scraper = Scraper()  # Uses global config

Per-Instance Configuration

from codomyrmex.scrape import Scraper, ScrapeConfig

config = ScrapeConfig(api_key="fc-your-key")
scraper = Scraper(config=config)

Error Handling

Comprehensive Error Handling

from codomyrmex.scrape import Scraper
from codomyrmex.scrape.exceptions import (
    ScrapeValidationError,
    ScrapeConnectionError,
    ScrapeTimeoutError,
    FirecrawlError,
    ScrapeError,
)

scraper = Scraper()

try:
    result = scraper.scrape("https://example.com")
except ScrapeValidationError as e:
    print(f"Invalid input: {e}")
except ScrapeConnectionError as e:
    print(f"Connection failed: {e.url}")
except ScrapeTimeoutError as e:
    print(f"Timeout after {e.context.get('timeout')}s")
except FirecrawlError as e:
    print(f"Firecrawl error: {e}")
except ScrapeError as e:
    print(f"Scraping error: {e}")

Batch Processing

Process Multiple URLs

from codomyrmex.scrape import Scraper, ScrapeOptions, ScrapeFormat

scraper = Scraper()
options = ScrapeOptions(formats=[ScrapeFormat.MARKDOWN])

urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3",
]

results = []
for url in urls:
    try:
        result = scraper.scrape(url, options)
        results.append(result)
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")

print(f"Successfully scraped {len(results)} URLs")

Advanced Usage

Dynamic Content with Actions

from codomyrmex.scrape import Scraper, ScrapeOptions

scraper = Scraper()
options = ScrapeOptions(
    actions=[
        {"type": "wait", "milliseconds": 2000},
        {"type": "click", "selector": "button.load-more"},
        {"type": "wait", "milliseconds": 3000},
    ]
)

result = scraper.scrape("https://example.com/dynamic", options)

Custom Timeout

from codomyrmex.scrape import Scraper, ScrapeOptions

scraper = Scraper()
options = ScrapeOptions(timeout=120.0)  # 2 minute timeout

result = scraper.scrape("https://slow-site.com", options)

Navigation Links

Parent: Project Overview
Module Index: All Agents
Documentation: Reference Guides
Home: Root README

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Scrape Module Usage Examples

Basic Scraping

Simple Scrape

Multiple Formats

With Custom Headers

Website Crawling

Basic Crawl

Crawl with Depth Control

Site Mapping

Map All Links

Search for Specific Links

Web Search

Basic Search

Search with Content Scraping

Structured Data Extraction

Extract with Schema

Extract with Prompt

Extract from Multiple URLs

Configuration

Environment Variables

Programmatic Configuration

Per-Instance Configuration

Error Handling

Comprehensive Error Handling

Batch Processing

Process Multiple URLs

Advanced Usage

Dynamic Content with Actions

Custom Timeout

Navigation Links

FilesExpand file tree

USAGE_EXAMPLES.md

Latest commit

History

USAGE_EXAMPLES.md

File metadata and controls

Scrape Module Usage Examples

Basic Scraping

Simple Scrape

Multiple Formats

With Custom Headers

Website Crawling

Basic Crawl

Crawl with Depth Control

Site Mapping

Map All Links

Search for Specific Links

Web Search

Basic Search

Search with Content Scraping

Structured Data Extraction

Extract with Schema

Extract with Prompt

Extract from Multiple URLs

Configuration

Environment Variables

Programmatic Configuration

Per-Instance Configuration

Error Handling

Comprehensive Error Handling

Batch Processing

Process Multiple URLs

Advanced Usage

Dynamic Content with Actions

Custom Timeout

Navigation Links