11import argparse
22import json
33import os
4+ import io
5+ import re
6+ import base64
7+ import hashlib
8+ import mimetypes
9+ from pathlib import Path
10+ from urllib .parse import urlparse , unquote
411from abc import ABC , abstractmethod
512from typing import List , Optional , Tuple
613from time import sleep
1825
1926from selenium_driverless import webdriver
2027from selenium_driverless .types .by import By
21- from urllib .parse import urlparse
2228
USE_PREMIUM: bool = True  # Set to True if you want to login to Substack and convert paid for posts
BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/"  # Substack you want to convert to markdown
BASE_MD_DIR: str = "substack_md_files"  # Name of the directory we'll save the .md essay files
BASE_HTML_DIR: str = "substack_html_pages"  # Name of the directory we'll save the .html essay files
BASE_IMAGE_DIR: str = "substack_images"  # Default directory for downloaded post images (see --image-directory)
ASSETS_DIR: str = os.path.dirname(__file__) + "/assets"  # Static assets shipped next to this script
HTML_TEMPLATE: str = "author_template.html"  # HTML template to use for the author page
JSON_DATA_DIR: str = "data"  # Directory for saved JSON data (presumably essays metadata — usage not shown here)
NUM_POSTS_TO_SCRAPE: int = 3  # Set to 0 if you want all posts
3138
3239
def count_images_in_markdown(md_content: str) -> int:
    """Count Substack CDN image links in markdown content.

    Matches the "(https://substackcdn.com/image/fetch/x.png)" part of a
    markdown image.  The lookahead rejects a match whose closing paren is
    immediately followed by ']' (the "...)]" suffix of a nested
    "[![...](img)](link)" construct).
    """
    cdn_image = re.compile(
        r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)(?=[^\]]|$)'
    )
    return sum(1 for _ in cdn_image.finditer(md_content))
47+
48+
def sanitize_image_filename(url: str) -> str:
    """Create a safe local filename for an image URL.

    For Substack CDN URLs the original filename is recovered from the
    percent-encoded source URL embedded after the CDN parameters; otherwise
    the last path segment is used.  Overlong or empty names fall back to an
    MD5-of-URL name with an extension guessed from a HEAD request.
    """
    # BUG FIX: the original unconditionally did url.split("/https%3A%2F%2F")[1]
    # for any substackcdn.com URL and raised IndexError when the encoded
    # source URL was absent; guard the split explicitly.
    if "substackcdn.com" in url and "/https%3A%2F%2F" in url:
        # Get the actual image URL after the CDN parameters
        original_url = unquote(url.split("/https%3A%2F%2F")[1])
        filename = original_url.split("/")[-1]
    else:
        filename = url.split("/")[-1]

    # Remove characters that are invalid in filenames
    filename = re.sub(r'[<>:"/\\|?*]', '', filename)

    # If filename is too long or empty, create hash-based name
    if len(filename) > 100 or not filename:
        hash_object = hashlib.md5(url.encode())
        try:
            # timeout so a stalled HEAD request cannot hang the scrape;
            # fall back to .jpg when the request or extension guess fails
            content_type = requests.head(url, timeout=10).headers.get('content-type', '')
            ext = mimetypes.guess_extension(content_type) or '.jpg'
        except Exception:
            ext = '.jpg'
        filename = f"{hash_object.hexdigest()}{ext}"

    return filename
69+
70+
def get_post_slug(url: str) -> str:
    """Return the post slug from a Substack post URL ('/p/<slug>'), or 'unknown_post'."""
    found = re.search(r'/p/([^/]+)', url)
    if found is None:
        return 'unknown_post'
    return found.group(1)
74+
75+
def extract_main_part(url: str) -> str:
    """Return the leading label of url's host, skipping a 'www' prefix.

    e.g. 'https://www.thefitzwilliam.com/' -> 'thefitzwilliam',
         'https://foo.substack.com/'       -> 'foo'
    """
    host_labels = urlparse(url).netloc.split('.')
    if host_labels[0] == 'www':
        return host_labels[1]
    return host_labels[0]
@@ -96,6 +139,9 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir
96139 os .makedirs (self .html_save_dir )
97140 print (f"Created html directory { self .html_save_dir } " )
98141
142+ if not self .args .no_images :
143+ os .makedirs (self .args .image_directory , exist_ok = True )
144+
99145 self .keywords : List [str ] = ["about" , "archive" , "podcast" ]
100146 self .post_urls : List [str ] = self .get_all_post_urls ()
101147
@@ -359,6 +405,13 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
359405 total += 1
360406 continue
361407 title , subtitle , like_count , date , md = self .extract_post_data (soup )
408+
409+ if not self .args .no_images :
410+ total_images = count_images_in_markdown (md )
411+ post_slug = get_post_slug (url )
412+ with tqdm (total = total_images , desc = f"Downloading images for { post_slug } " , leave = False ) as img_pbar :
413+ md = await self .process_markdown_images (md , self .writer_name , post_slug , img_pbar )
414+
362415 self .save_to_file (md_filepath , md )
363416
364417 # Convert markdown to HTML and save
@@ -383,6 +436,56 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
383436 self .save_essays_data_to_json (essays_data = essays_data )
384437 generate_html_file (self .args , author_name = self .writer_name )
385438
async def download_image(
    self,
    url: str,
    save_path: Path,
    pbar: Optional[tqdm] = None
) -> Optional[str]:
    """Download an image from URL and stream it to save_path.

    Returns the saved path as a string on success, None on failure.
    Progress/errors are reported through pbar when one is given.

    NOTE(review): requests is synchronous, so this blocks the event loop;
    acceptable while downloads are sequential, revisit if parallelized.
    """
    try:
        # BUG FIX: a timeout is required — without one a stalled CDN
        # connection hangs the whole scrape indefinitely.
        response = requests.get(url, stream=True, timeout=30)
        if response.status_code == 200:
            save_path.parent.mkdir(parents=True, exist_ok=True)
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            if pbar:
                pbar.update(1)
            return str(save_path)
        # BUG FIX: non-200 responses used to fail silently; report them.
        if pbar:
            pbar.write(f"Error downloading image {url}: HTTP {response.status_code}")
    except Exception as exc:
        if pbar:
            pbar.write(f"Error downloading image {url}: {str(exc)}")
        # raise exc # debug
    return None
462+
async def process_markdown_images(
    self,
    md_content: str,
    author: str,
    post_slug: str,
    pbar=None
) -> str:
    """Download every Substack CDN image referenced in md_content and
    rewrite its markdown link to the local copy.

    Images already on disk are not re-downloaded.  Returns the rewritten
    markdown.
    """
    image_dir = Path(self.args.image_directory) / author / post_slug
    # matches the "(https://substackcdn.com/image/fetch/x.png)" URL part
    # of a markdown image
    pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)')
    buf = io.StringIO()
    last_end = 0
    for match in pattern.finditer(md_content):
        buf.write(md_content[last_end:match.start()])
        url = match.group(0).strip("()")
        filename = sanitize_image_filename(url)
        save_path = image_dir / filename
        downloaded = save_path.exists()
        if not downloaded:
            downloaded = await self.download_image(url, save_path, pbar) is not None
        if downloaded:
            # link relative to the author's markdown directory
            # NOTE(review): assumes self.args.directory is the markdown
            # output root — confirm against parse_args
            rel_path = os.path.relpath(save_path, Path(self.args.directory) / author)
            buf.write(f"({rel_path})")
        else:
            # BUG FIX: on a failed download the original code still rewrote
            # the link to a nonexistent local file; keep the remote URL so
            # the markdown stays usable.
            buf.write(match.group(0))
        last_end = match.end()
    buf.write(md_content[last_end:])
    return buf.getvalue()
488+
386489
387490class SubstackScraper (BaseSubstackScraper ):
388491 def __init__ (self , args , base_substack_url : str , md_save_dir : str , html_save_dir : str ):
@@ -515,6 +618,76 @@ async def get_url_soup(self, url: str):
515618 html = await self .driver .page_source
516619 return BeautifulSoup (html , "html.parser" )
517620
async def download_image_FIXME(
    self,
    url: str,
    save_path: Path,
    pbar: Optional[tqdm] = None
) -> Optional[str]:
    """Download image using selenium_driverless.

    Experimental in-browser alternative to download_image: loads the image
    into an <img>, draws it on a canvas and returns it base64-encoded.
    Returns the saved path on success, None on failure.
    """

    # NOTE for now this works with the default "def download_image"

    # WONTFIX "fetch" fails due to CORS policy

    # WONTFIX "canvas" does not return the original image bytes

    # we could fetch images with CDP Network.getResponseBody
    # but that requires lots of boilerplate code
    # fix: use https://github.com/milahu/aiohttp_chromium

    try:
        # Execute JS fetch inside browser
        result = await self.driver.execute_async_script(
            """
            const url = arguments[0];
            const callback = arguments[arguments.length - 1];

            const img = new Image();
            img.crossOrigin = 'Anonymous'; // try to avoid CORS issues
            img.onload = () => {
              try {
                const canvas = document.createElement('canvas');
                canvas.width = img.width;
                canvas.height = img.height;
                const ctx = canvas.getContext('2d');
                ctx.drawImage(img, 0, 0);
                const dataUrl = canvas.toDataURL('image/png'); // returns "data:image/png;base64,..."
                const base64 = dataUrl.split(',')[1]; // strip prefix
                callback({data: base64});
              } catch (err) {
                callback({error: err.message, stack: err.stack});
              }
            };
            img.onerror = (err) => {
              callback({error: 'Image load error', stack: err.toString()});
            };
            img.src = url;
            """,
            url
        )

        if isinstance(result, dict) and "error" in result:
            raise RuntimeError(f"{result['error']}\nJS stack:\n{result['stack']}")

        # BUG FIX: the JS success callback returns {data: base64}, so the
        # payload must be taken from result["data"]; b64decode() on the
        # whole dict raised TypeError.
        image_bytes = base64.b64decode(result["data"])

        save_path.parent.mkdir(parents=True, exist_ok=True)
        with open(save_path, "wb") as f:
            f.write(image_bytes)

        if pbar:
            pbar.update(1)

        return str(save_path)

    except Exception as exc:
        if pbar:
            pbar.write(f"Error downloading image {url}: {exc}")
        # raise exc # debug
        return None
690+
518691
519692def parse_args () -> argparse .Namespace :
520693 parser = argparse .ArgumentParser (description = "Scrape a Substack site." )
@@ -588,6 +761,17 @@ def parse_args() -> argparse.Namespace:
588761 default = BASE_HTML_DIR ,
589762 help = f"The directory to save scraped posts as HTML files. Default: { BASE_HTML_DIR !r} " ,
590763 )
764+ parser .add_argument (
765+ "--image-directory" , # args.image_directory
766+ type = str ,
767+ default = BASE_IMAGE_DIR ,
768+ help = f"The directory to save scraped image files. Default: { BASE_IMAGE_DIR !r} " ,
769+ )
770+ parser .add_argument (
771+ "--no-images" , # args.no_images
772+ action = "store_true" ,
773+ help = f"Do not download images." ,
774+ )
591775
592776 return parser .parse_args ()
593777
0 commit comments