Initial commit.

27  .gitignore  vendored  Normal file
@@ -0,0 +1,27 @@
# ---> macOS
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

25  README.md
@@ -1,3 +1,24 @@
-# Selenium-Image-Scraper
+# Selenium Image Scraper (sis.py)

-A small script that automates the downloading of images through a Selenium controlled Firefox.
+The Selenium Image Scraper (sis.py) is a Python script that helps you download images from various websites. It uses a Firefox browser controlled by Selenium to automate what a user would otherwise do by hand. The script finds the largest image on the page and checks whether it is part of a lightbox; if it is, it grabs the larger image from the lightbox, otherwise it downloads the largest image on the page.

The script itself is designed to be run from a terminal on macOS but could be modified for other operating systems. On macOS, images are downloaded to a folder in the current user's Downloads directory. Images from subreddits get some special handling: a subdirectory is created based on the name of the subreddit, as illustrated below.
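
For illustration, a run against the sample urls.txt (all posts from r/ImaginarySliceOfLife) should end up with a layout roughly like this sketch; the image filename is hypothetical, since the script names each file after the final path segment of the image URL it downloads:

```
~/Downloads/image_scraper/
└── ImaginarySliceOfLife/        # one subfolder per subreddit
    ├── abc123xyz.jpg            # named after the image URL's final path segment
    └── ...
```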

The script works pretty well on single-image posts on Tumblr, but it's not always accurate given the chaotic nature of Tumblr themes.

## Setting Up

This information is for setting up on macOS. You may need to make some changes based on your operating system. (A consolidated sketch of these commands follows the list.)

1. Clone the repo into whatever directory you like.
2. This script relies on a Firefox browser controlled by Selenium, so you'll want Firefox installed and available on your system.
3. Selenium talks to Firefox through geckodriver, which can be installed through Homebrew: ```brew install geckodriver```
4. Within your working directory, create a virtual environment: ```python3 -m venv .venv```
5. Activate the virtual environment: ```source .venv/bin/activate```
6. Install any necessary dependencies through pip. The script uses os, sys, time, selenium, requests, and tqdm. Chances are you'll only need to pip install selenium, requests, and tqdm, but your setup may vary.
7. sis.py reads a list of URLs from a creatively named text file called urls.txt. Add the URLs of the web pages you want to scrape there. I've included a basic set of subreddit URLs in the repo if you want to have a quick play and see what happens.
8. Run the script with ```python3 sis.py```
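
Putting the steps together, a typical first run from the terminal might look like the sketch below. It's only a rough outline: it assumes Homebrew is already installed, and the clone URL is a placeholder for wherever this repo actually lives.

```sh
# Sketch of the setup steps above. The clone URL is a placeholder,
# and Homebrew is assumed to be installed already.
git clone <your-clone-url> Selenium-Image-Scraper
cd Selenium-Image-Scraper

# geckodriver lets Selenium drive Firefox
brew install geckodriver

# Create and activate a virtual environment, then install the third-party dependencies
python3 -m venv .venv
source .venv/bin/activate
pip install selenium requests tqdm

# Add the pages you want to scrape to urls.txt, then run the script
python3 sis.py
```

If you keep the default LOGIN_DELAY of 60 seconds, the Firefox window opens on Reddit first so you can log in before the downloads start.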

### AI Disclosure

I used AI to help me create this script, specifically Claude using the Sonnet 4 model. I tested the script for a few weeks before adding it to this repo, but be aware there could be dumbass bugs due to Claude being just as fallible as any other AI.

309  sis.py  Normal file
@@ -0,0 +1,309 @@
import os
import sys
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import requests
from tqdm import tqdm

# Configuration
DOWNLOAD_DIR = os.path.expanduser("~/Downloads/image_scraper")
WAIT_TIME = 3  # seconds to wait for lightbox/image to load
DELAY_BETWEEN_PAGES = 2  # seconds to wait between processing pages (be nice to servers!)
LOGIN_DELAY = 60  # seconds to wait for manual login before starting (0 to skip)
URL_FILE = "urls.txt"  # Default URL file

# Create download directory if it doesn't exist
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

def load_urls_from_file(filename):
    """Load URLs from a text file, one per line"""
    try:
        with open(filename, 'r') as f:
            urls = [line.strip() for line in f if line.strip() and not line.strip().startswith('#')]
        return urls
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found")
        return []
    except Exception as e:
        print(f"Error reading file: {e}")
        return []

def get_largest_image(driver):
    """Find the largest image on the page by dimensions, excluding headers/banners"""
    script = """
    // First, check for Reddit post image specifically
    const postImage = document.getElementById('post-image');
    if (postImage && postImage.naturalWidth > 0 && postImage.naturalHeight > 0) {
        return {
            src: postImage.src,
            width: postImage.naturalWidth,
            height: postImage.naturalHeight,
            isPostImage: true
        };
    }

    // Otherwise, find largest non-header image
    const images = Array.from(document.querySelectorAll('img'));
    const imageData = images.map(img => {
        // Get the element and its parents to check context
        let element = img;
        let isInHeader = false;
        let isInBanner = false;
        let depth = 0;

        // Check if image is in header, nav, or banner elements
        while (element && depth < 10) {
            const tagName = element.tagName.toLowerCase();
            const className = element.className.toLowerCase();
            const id = element.id.toLowerCase();

            if (tagName === 'header' || tagName === 'nav' ||
                className.includes('header') || className.includes('banner') ||
                className.includes('nav') || className.includes('toolbar') ||
                id.includes('header') || id.includes('banner')) {
                isInHeader = true;
                break;
            }

            element = element.parentElement;
            depth++;
        }

        return {
            element: img,
            area: img.naturalWidth * img.naturalHeight,
            width: img.naturalWidth,
            height: img.naturalHeight,
            src: img.src,
            isInHeader: isInHeader
        };
    }).filter(data => data.area > 0 && !data.isInHeader); // Filter out header images

    imageData.sort((a, b) => b.area - a.area);

    if (imageData.length > 0) {
        return {
            src: imageData[0].src,
            width: imageData[0].width,
            height: imageData[0].height
        };
    }
    return null;
    """
    return driver.execute_script(script)

def click_largest_image(driver):
    """Find and click the largest clickable image, excluding headers/banners"""
    script = """
    // First, check for Reddit post image specifically
    const postImage = document.getElementById('post-image');
    if (postImage && postImage.naturalWidth > 0 && postImage.naturalHeight > 0) {
        postImage.click();
        return true;
    }

    // Otherwise, find and click largest non-header image
    const images = Array.from(document.querySelectorAll('img'));
    const imageData = images.map(img => {
        // Check if image is in header, nav, or banner elements
        let element = img;
        let isInHeader = false;
        let depth = 0;

        while (element && depth < 10) {
            const tagName = element.tagName.toLowerCase();
            const className = element.className.toLowerCase();
            const id = element.id.toLowerCase();

            if (tagName === 'header' || tagName === 'nav' ||
                className.includes('header') || className.includes('banner') ||
                className.includes('nav') || className.includes('toolbar') ||
                id.includes('header') || id.includes('banner')) {
                isInHeader = true;
                break;
            }

            element = element.parentElement;
            depth++;
        }

        return {
            element: img,
            area: img.naturalWidth * img.naturalHeight,
            isInHeader: isInHeader
        };
    }).filter(data => data.area > 0 && !data.isInHeader);

    imageData.sort((a, b) => b.area - a.area);

    if (imageData.length > 0) {
        imageData[0].element.click();
        return true;
    }
    return false;
    """
    return driver.execute_script(script)

def get_subreddit_from_url(url):
    """Extract subreddit name from Reddit URL"""
    import re
    # Match patterns like reddit.com/r/subredditname or old.reddit.com/r/subredditname
    match = re.search(r'reddit\.com/r/([^/]+)', url)
    if match:
        return match.group(1)
    return None

def download_image(url, filename, subfolder=None):
    """Download image from URL, optionally to a subfolder"""
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            # Determine the save directory
            if subfolder:
                save_dir = os.path.join(DOWNLOAD_DIR, subfolder)
                os.makedirs(save_dir, exist_ok=True)
            else:
                save_dir = DOWNLOAD_DIR

            filepath = os.path.join(save_dir, filename)
            with open(filepath, 'wb') as f:
                f.write(response.content)
            return True
    except Exception as e:
        return False
    return False

def process_tab(driver, url, pbar):
    """Process a single tab to find and download the largest image"""
    try:
        pbar.set_description(f"Loading {url[:50]}...")

        # Check if this is a Reddit URL and extract subreddit
        subreddit = get_subreddit_from_url(url)

        # Navigate to the URL
        driver.get(url)
        time.sleep(2)  # Wait for page to load

        pbar.set_description(f"Processing {url[:50]}...")

        # First, try to get the largest image directly
        initial_image = get_largest_image(driver)

        if initial_image:
            pbar.write(f" ✓ Found image: {initial_image['width']}x{initial_image['height']}px")

            # Try clicking the image to see if a lightbox appears
            clicked = click_largest_image(driver)

            if clicked:
                pbar.write(f" → Clicked image, checking for lightbox...")
                time.sleep(WAIT_TIME)  # Wait for lightbox to appear

                # Check if a larger image appeared after clicking
                lightbox_image = get_largest_image(driver)

                if lightbox_image and lightbox_image['src'] != initial_image['src']:
                    pbar.write(f" ✓ Lightbox image: {lightbox_image['width']}x{lightbox_image['height']}px")
                    initial_image = lightbox_image

            # Download the image
            if initial_image:
                # Generate filename from URL
                url_parts = initial_image['src'].split('/')
                filename = url_parts[-1].split('?')[0]  # Remove query params

                # If filename is generic or missing extension, create one
                if not filename or '.' not in filename:
                    from urllib.parse import urlparse
                    domain = urlparse(driver.current_url).netloc.replace('www.', '')
                    timestamp = int(time.time())
                    filename = f"{domain}_{timestamp}.jpg"

                success = download_image(initial_image['src'], filename, subfolder=subreddit)
                if success:
                    if subreddit:
                        pbar.write(f" ✓ Saved to r/{subreddit}/: {filename}")
                    else:
                        pbar.write(f" ✓ Saved: {filename}")
        else:
            pbar.write(f" ✗ No images found on this page")

    except Exception as e:
        pbar.write(f" ✗ Error: {e}")

def main():
    print("=== Image Tab Downloader ===")
    print(f"Download directory: {DOWNLOAD_DIR}\n")

    # Check if a custom URL file was provided as argument
    url_file = sys.argv[1] if len(sys.argv) > 1 else URL_FILE

    # Load URLs from file
    urls = load_urls_from_file(url_file)

    if not urls:
        print(f"\nNo URLs found in '{url_file}'")
        print("\nUsage:")
        print(f" python {sys.argv[0]} [url_file.txt]")
        print(f"\nCreate a text file with one URL per line:")
        print(" https://example.com/image1")
        print(" https://example.com/image2")
        print(" # Lines starting with # are ignored")
        return

    print(f"Loaded {len(urls)} URL(s) from '{url_file}'")
    print(f"Delay between pages: {DELAY_BETWEEN_PAGES}s")
    print()

    # Set up Firefox options
    options = webdriver.FirefoxOptions()
    # Uncomment the next line to run Firefox headless (hides the browser window,
    # but you won't be able to log in manually during the LOGIN_DELAY window)
    # options.add_argument('--headless')

    # Initialize the driver
    driver = webdriver.Firefox(options=options)

    try:
        # Login delay if configured
        if LOGIN_DELAY > 0:
            print(f"⏱️ Login window: You have {LOGIN_DELAY} seconds to log in to any sites you need...")
            print(" (The browser window is now open - go log in!)")
            print()

            # Open Reddit in the first tab to make login easy
            driver.get("https://www.reddit.com")

            # Countdown timer
            for remaining in range(LOGIN_DELAY, 0, -5):
                if remaining <= 10:
                    print(f" Starting in {remaining} seconds...")
                    time.sleep(1)
                else:
                    print(f" Starting in {remaining} seconds...")
                    time.sleep(5)

        print("\n🚀 Starting download process...\n")

        # Process each URL with progress bar
        with tqdm(total=len(urls), desc="Overall Progress", unit="page") as pbar:
            for i, url in enumerate(urls):
                if i > 0:  # Don't delay before the first page
                    time.sleep(DELAY_BETWEEN_PAGES)

                process_tab(driver, url, pbar)
                pbar.update(1)

        print(f"\n=== Complete! ===")
        print(f"Check {DOWNLOAD_DIR} for downloaded images")

    finally:
        # Close the browser
        driver.quit()

if __name__ == "__main__":
    main()

9  urls.txt  Normal file
@@ -0,0 +1,9 @@
https://www.reddit.com/r/ImaginarySliceOfLife/comments/1octs3z/coffee_date_by_fztt/
https://www.reddit.com/r/ImaginarySliceOfLife/comments/1ocg29d/a_letter_to_my_younger_self_dangiuz/
https://www.reddit.com/r/ImaginarySliceOfLife/comments/1obrp2k/grown_ass_women_by_veyonis/
https://www.reddit.com/r/ImaginarySliceOfLife/comments/1obfzgc/friend_by_guweiz/
https://www.reddit.com/r/ImaginarySliceOfLife/comments/1ob82uq/a_yummy_dinner_by_ig_arangchii/
https://www.reddit.com/r/ImaginarySliceOfLife/comments/1oawtvy/dream_by_artist_dave_greco/
https://www.reddit.com/r/ImaginarySliceOfLife/comments/1oawm88/studio_workspace_commission_by_artist_vizireanu/
https://www.reddit.com/r/ImaginarySliceOfLife/comments/1oavmu8/night_in_the_town_by_cyclecircle/
https://www.reddit.com/r/ImaginarySliceOfLife/comments/1oajf5c/reze_denji_by_kouchanthree/