Skip to content

Commit 5811bb5

Browse files
committed
download images
based on timf34#26
1 parent 4b8598a commit 5811bb5

File tree

1 file changed

+185
-1
lines changed

1 file changed

+185
-1
lines changed

src/substack2markdown/substack_scraper.py

Lines changed: 185 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
import argparse
22
import json
33
import os
4+
import io
5+
import re
6+
import base64
7+
import hashlib
8+
import mimetypes
9+
from pathlib import Path
10+
from urllib.parse import urlparse, unquote
411
from abc import ABC, abstractmethod
512
from typing import List, Optional, Tuple
613
from time import sleep
@@ -18,18 +25,54 @@
1825

1926
from selenium_driverless import webdriver
2027
from selenium_driverless.types.by import By
21-
from urllib.parse import urlparse
2228

2329
USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts
2430
BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown
2531
BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files
2632
BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files
33+
BASE_IMAGE_DIR: str = "substack_images"
2734
ASSETS_DIR: str = os.path.dirname(__file__) + "/assets"
2835
HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page
2936
JSON_DATA_DIR: str = "data"
3037
NUM_POSTS_TO_SCRAPE: int = 3 # Set to 0 if you want all posts
3138

3239

40+
def count_images_in_markdown(md_content: str) -> int:
    """Count the Substack CDN image URLs referenced in markdown content.

    Markdown embeds images as [![](cdn-url)](cdn-url); the lookahead
    rejects a ')' immediately followed by ']' so only the outer link's
    URL matches and each image is counted exactly once.
    """
    cdn_link = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)(?=[^\]]|$)')
    return sum(1 for _ in cdn_link.finditer(md_content))
47+
48+
49+
def sanitize_image_filename(url: str) -> str:
    """Create a safe filename from a URL.

    For Substack CDN URLs the original filename is recovered from the
    percent-encoded source URL embedded after the CDN parameters.
    Falls back to an md5-based name when the result is empty or too long.
    """
    # Extract original filename from CDN URL, e.g.
    # https://substackcdn.com/image/fetch/<params>/https%3A%2F%2F<host>%2F...%2Fname.png
    # Guard on the marker too: not every substackcdn.com URL embeds a
    # source URL, and the bare split()[1] would raise IndexError.
    if "substackcdn.com" in url and "/https%3A%2F%2F" in url:
        original_url = unquote(url.split("/https%3A%2F%2F", 1)[1])
        filename = original_url.split("/")[-1]
    else:
        # No embedded source URL: use the last path segment as-is
        filename = url.split("/")[-1]

    # Remove characters that are invalid in filenames (Windows superset)
    filename = re.sub(r'[<>:"/\\|?*]', '', filename)

    # If filename is too long or empty, create a hash-based name instead
    if len(filename) > 100 or not filename:
        hash_object = hashlib.md5(url.encode())
        try:
            # Best-effort: ask the server for a content type to pick the
            # extension; never let a network error break filename creation.
            content_type = requests.head(url).headers.get('content-type', '')
        except Exception:
            content_type = ''
        ext = mimetypes.guess_extension(content_type) or '.jpg'
        filename = f"{hash_object.hexdigest()}{ext}"

    return filename
69+
70+
71+
def get_post_slug(url: str) -> str:
    """Return the post slug (the path segment after '/p/'), or 'unknown_post'."""
    found = re.search(r'/p/([^/]+)', url)
    if found is None:
        return 'unknown_post'
    return found.group(1)
74+
75+
3376
def extract_main_part(url: str) -> str:
    """Return the first meaningful label of the URL's domain, skipping a leading 'www'."""
    labels = urlparse(url).netloc.split('.')
    if labels[0] == 'www':
        return labels[1]
    return labels[0]
@@ -96,6 +139,9 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir
96139
os.makedirs(self.html_save_dir)
97140
print(f"Created html directory {self.html_save_dir}")
98141

142+
if not self.args.no_images:
143+
os.makedirs(self.args.image_directory, exist_ok=True)
144+
99145
self.keywords: List[str] = ["about", "archive", "podcast"]
100146
self.post_urls: List[str] = self.get_all_post_urls()
101147

@@ -359,6 +405,13 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
359405
total += 1
360406
continue
361407
title, subtitle, like_count, date, md = self.extract_post_data(soup)
408+
409+
if not self.args.no_images:
410+
total_images = count_images_in_markdown(md)
411+
post_slug = get_post_slug(url)
412+
with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar:
413+
md = await self.process_markdown_images(md, self.writer_name, post_slug, img_pbar)
414+
362415
self.save_to_file(md_filepath, md)
363416

364417
# Convert markdown to HTML and save
@@ -383,6 +436,56 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
383436
self.save_essays_data_to_json(essays_data=essays_data)
384437
generate_html_file(self.args, author_name=self.writer_name)
385438

439+
async def download_image(
    self,
    url: str,
    save_path: Path,
    pbar: "Optional[tqdm]" = None
) -> Optional[str]:
    """Download an image from *url* and stream it to *save_path*.

    Creates parent directories as needed and advances *pbar* once per
    successfully saved image. Returns the saved path as a string, or
    None on any failure (non-200 response or exception).

    NOTE(review): requests is blocking, so this async method blocks the
    event loop for the duration of the download.
    """
    try:
        # Timeout so a single unresponsive CDN host cannot hang the scrape.
        response = requests.get(url, stream=True, timeout=30)
        if response.status_code == 200:
            save_path.parent.mkdir(parents=True, exist_ok=True)
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            if pbar:
                pbar.update(1)
            return str(save_path)
    except Exception as exc:
        if pbar:
            pbar.write(f"Error downloading image {url}: {str(exc)}")
        # raise exc # debug
    return None
462+
463+
async def process_markdown_images(
    self,
    md_content: str,
    author: str,
    post_slug: str,
    pbar=None
) -> str:
    """Download the Substack CDN images referenced in *md_content* and
    rewrite the markdown links to point at the local copies.

    Images are stored under <image_directory>/<author>/<post_slug>/.
    If a download fails, the original remote URL is kept so the
    markdown still renders instead of linking to a missing file.
    """
    image_dir = Path(self.args.image_directory) / author / post_slug
    # Matches each "(url)" part of
    # [![](https://substackcdn.com/image/fetch/x.png)](https://substackcdn.com/image/fetch/x.png)
    pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)')
    buf = io.StringIO()
    last_end = 0
    for match in pattern.finditer(md_content):
        buf.write(md_content[last_end:match.start()])
        url = match.group(0).strip("()")
        filename = sanitize_image_filename(url)
        save_path = image_dir / filename
        if save_path.exists():
            # Already downloaded on a previous run; reuse it.
            local = str(save_path)
        else:
            local = await self.download_image(url, save_path, pbar)
        if local is None:
            # Download failed: keep the remote URL instead of a dead local link.
            buf.write(f"({url})")
        else:
            # Path relative to the author's markdown dir; as_posix() keeps
            # forward slashes so the link also works when run on Windows.
            # NOTE(review): assumes md files live under args.directory/<author> — confirm.
            rel_path = Path(os.path.relpath(save_path, Path(self.args.directory) / author)).as_posix()
            buf.write(f"({rel_path})")
        last_end = match.end()
    buf.write(md_content[last_end:])
    return buf.getvalue()
488+
386489

387490
class SubstackScraper(BaseSubstackScraper):
388491
def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str):
@@ -515,6 +618,76 @@ async def get_url_soup(self, url: str):
515618
html = await self.driver.page_source
516619
return BeautifulSoup(html, "html.parser")
517620

621+
async def download_image_FIXME(
    self,
    url: str,
    save_path: Path,
    pbar: Optional[tqdm] = None
) -> Optional[str]:
    """Download image using selenium_driverless.

    Loads the image in the browser, draws it on a canvas and exports it
    as a base64 PNG. Returns the saved path as a string, or None on
    failure. Note the exported bytes are a PNG re-encode, not the
    original image bytes.
    """

    # NOTE for now this works with the default "def download_image"

    # WONTFIX "fetch" fails due to CORS policy
    # WONTFIX "canvas" does not return the original image bytes

    # we could fetch images with CDP Network.getResponseBody
    # but that requires lots of boilerplate code
    # fix: use https://github.com/milahu/aiohttp_chromium

    try:
        # Execute JS fetch inside browser
        result = await self.driver.execute_async_script(
            """
            const url = arguments[0];
            const callback = arguments[arguments.length - 1];

            const img = new Image();
            img.crossOrigin = 'Anonymous'; // try to avoid CORS issues
            img.onload = () => {
                try {
                    const canvas = document.createElement('canvas');
                    canvas.width = img.width;
                    canvas.height = img.height;
                    const ctx = canvas.getContext('2d');
                    ctx.drawImage(img, 0, 0);
                    const dataUrl = canvas.toDataURL('image/png'); // returns "data:image/png;base64,..."
                    const base64 = dataUrl.split(',')[1]; // strip prefix
                    callback({data: base64});
                } catch (err) {
                    callback({error: err.message, stack: err.stack});
                }
            };
            img.onerror = (err) => {
                callback({error: 'Image load error', stack: err.toString()});
            };
            img.src = url;
            """,
            url
        )

        if isinstance(result, dict) and "error" in result:
            raise RuntimeError(f"{result['error']}\nJS stack:\n{result['stack']}")

        # BUGFIX: the JS success callback returns {data: <base64 string>},
        # so decode the "data" field — b64decode(result) on the dict always
        # raised TypeError and made this method return None.
        payload = result["data"] if isinstance(result, dict) else result
        image_bytes = base64.b64decode(payload)

        save_path.parent.mkdir(parents=True, exist_ok=True)
        with open(save_path, "wb") as f:
            f.write(image_bytes)

        if pbar:
            pbar.update(1)

        return str(save_path)

    except Exception as exc:
        if pbar:
            pbar.write(f"Error downloading image {url}: {exc}")
        # raise exc # debug
        return None
690+
518691

519692
def parse_args() -> argparse.Namespace:
520693
parser = argparse.ArgumentParser(description="Scrape a Substack site.")
@@ -588,6 +761,17 @@ def parse_args() -> argparse.Namespace:
588761
default=BASE_HTML_DIR,
589762
help=f"The directory to save scraped posts as HTML files. Default: {BASE_HTML_DIR!r}",
590763
)
764+
parser.add_argument(
765+
"--image-directory", # args.image_directory
766+
type=str,
767+
default=BASE_IMAGE_DIR,
768+
help=f"The directory to save scraped image files. Default: {BASE_IMAGE_DIR!r}",
769+
)
770+
parser.add_argument(
771+
"--no-images", # args.no_images
772+
action="store_true",
773+
help=f"Do not download images.",
774+
)
591775

592776
return parser.parse_args()
593777

0 commit comments

Comments
 (0)