# Selenium-Image-Scraper/sis.py

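"""Selenium Image Scraper.

Opens each URL listed in a text file in Firefox, finds the largest
non-header/banner image on the page (with special handling for Reddit
posts), clicks it in case a higher-resolution lightbox appears, and
downloads the winning image to DOWNLOAD_DIR. Reddit images are sorted
into per-subreddit subfolders.
"""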
import os
import re
import sys
import time
from urllib.parse import urlparse

import requests
from selenium import webdriver
from tqdm import tqdm

# Configuration
DOWNLOAD_DIR = os.path.expanduser("~/Downloads/image_scraper")
WAIT_TIME = 3  # seconds to wait for the lightbox/image to load
DELAY_BETWEEN_PAGES = 2  # seconds to wait between pages (be nice to servers!)
LOGIN_DELAY = 60  # seconds to wait for manual login before starting (0 to skip)
URL_FILE = "urls.txt"  # default URL file

# Create the download directory if it doesn't exist
os.makedirs(DOWNLOAD_DIR, exist_ok=True)


def load_urls_from_file(filename):
    """Load URLs from a text file, one per line."""
    try:
        with open(filename, 'r') as f:
            # Keep non-blank lines that aren't comments
            urls = [line.strip() for line in f
                    if line.strip() and not line.strip().startswith('#')]
        return urls
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found")
        return []
    except Exception as e:
        print(f"Error reading file: {e}")
        return []
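
# Example urls.txt (illustrative contents):
#   # one URL per line; '#' lines are skipped
#   https://www.reddit.com/r/EarthPorn/comments/abc123/
#   https://example.com/gallery/42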


def get_largest_image(driver):
    """Find the largest image on the page by dimensions, excluding headers/banners."""
    script = """
        // First, check for Reddit post image specifically
        const postImage = document.getElementById('post-image');
        if (postImage && postImage.naturalWidth > 0 && postImage.naturalHeight > 0) {
            return {
                src: postImage.src,
                width: postImage.naturalWidth,
                height: postImage.naturalHeight,
                isPostImage: true
            };
        }

        // Otherwise, find the largest non-header image
        const images = Array.from(document.querySelectorAll('img'));
        const imageData = images.map(img => {
            // Walk up the ancestors to check the image's context
            let element = img;
            let isInHeader = false;
            let depth = 0;

            // Check if the image is in a header, nav, or banner element
            while (element && depth < 10) {
                const tagName = element.tagName.toLowerCase();
                const className = element.className.toLowerCase();
                const id = element.id.toLowerCase();

                if (tagName === 'header' || tagName === 'nav' ||
                    className.includes('header') || className.includes('banner') ||
                    className.includes('nav') || className.includes('toolbar') ||
                    id.includes('header') || id.includes('banner')) {
                    isInHeader = true;
                    break;
                }
                element = element.parentElement;
                depth++;
            }

            return {
                element: img,
                area: img.naturalWidth * img.naturalHeight,
                width: img.naturalWidth,
                height: img.naturalHeight,
                src: img.src,
                isInHeader: isInHeader
            };
        }).filter(data => data.area > 0 && !data.isInHeader);  // Drop header images

        imageData.sort((a, b) => b.area - a.area);

        if (imageData.length > 0) {
            return {
                src: imageData[0].src,
                width: imageData[0].width,
                height: imageData[0].height
            };
        }
        return null;
    """
    return driver.execute_script(script)
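
# Note: returns {'src', 'width', 'height'} (plus 'isPostImage' when Reddit's
# #post-image element wins) or None if no visible non-header image exists.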


def click_largest_image(driver):
    """Find and click the largest clickable image, excluding headers/banners."""
    script = """
        // First, check for Reddit post image specifically
        const postImage = document.getElementById('post-image');
        if (postImage && postImage.naturalWidth > 0 && postImage.naturalHeight > 0) {
            postImage.click();
            return true;
        }

        // Otherwise, find and click the largest non-header image
        const images = Array.from(document.querySelectorAll('img'));
        const imageData = images.map(img => {
            // Check if the image is in a header, nav, or banner element
            let element = img;
            let isInHeader = false;
            let depth = 0;

            while (element && depth < 10) {
                const tagName = element.tagName.toLowerCase();
                const className = element.className.toLowerCase();
                const id = element.id.toLowerCase();

                if (tagName === 'header' || tagName === 'nav' ||
                    className.includes('header') || className.includes('banner') ||
                    className.includes('nav') || className.includes('toolbar') ||
                    id.includes('header') || id.includes('banner')) {
                    isInHeader = true;
                    break;
                }
                element = element.parentElement;
                depth++;
            }

            return {
                element: img,
                area: img.naturalWidth * img.naturalHeight,
                isInHeader: isInHeader
            };
        }).filter(data => data.area > 0 && !data.isInHeader);

        imageData.sort((a, b) => b.area - a.area);

        if (imageData.length > 0) {
            imageData[0].element.click();
            return true;
        }
        return false;
    """
    return driver.execute_script(script)
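
# Note: this mirrors the selection logic in get_largest_image but clicks the
# winning <img> instead of returning it. True means a click was dispatched,
# not that a lightbox actually opened.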


def get_subreddit_from_url(url):
    """Extract the subreddit name from a Reddit URL."""
    # Match patterns like reddit.com/r/subredditname or old.reddit.com/r/subredditname
    match = re.search(r'reddit\.com/r/([^/]+)', url)
    if match:
        return match.group(1)
    return None
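
# Illustrative examples:
#   get_subreddit_from_url("https://old.reddit.com/r/EarthPorn/comments/x1")  # -> "EarthPorn"
#   get_subreddit_from_url("https://example.com/photo")                       # -> None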


def download_image(url, filename, subfolder=None):
    """Download an image from a URL, optionally into a subfolder."""
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            # Determine the save directory
            if subfolder:
                save_dir = os.path.join(DOWNLOAD_DIR, subfolder)
                os.makedirs(save_dir, exist_ok=True)
            else:
                save_dir = DOWNLOAD_DIR

            filepath = os.path.join(save_dir, filename)
            with open(filepath, 'wb') as f:
                f.write(response.content)
            return True
    except Exception:
        return False
    return False  # Non-200 response
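
# Illustrative call (hypothetical URL):
#   download_image("https://i.redd.it/abc.jpg", "abc.jpg", subfolder="pics")
# saves the file to <DOWNLOAD_DIR>/pics/abc.jpg.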


def process_tab(driver, url, pbar):
    """Process a single page: find the largest image, try the lightbox, download it."""
    try:
        pbar.set_description(f"Loading {url[:50]}...")
        # Check if this is a Reddit URL and extract the subreddit
        subreddit = get_subreddit_from_url(url)

        # Navigate to the URL
        driver.get(url)
        time.sleep(2)  # Wait for the page to load
        pbar.set_description(f"Processing {url[:50]}...")

        # First, try to get the largest image directly
        initial_image = get_largest_image(driver)
        if initial_image:
            pbar.write(f"  ✓ Found image: {initial_image['width']}x{initial_image['height']}px")
            # Try clicking the image to see if a lightbox appears
            clicked = click_largest_image(driver)
            if clicked:
                pbar.write("  → Clicked image, checking for lightbox...")
                time.sleep(WAIT_TIME)  # Wait for the lightbox to appear
                # Check if a larger image appeared after clicking
                lightbox_image = get_largest_image(driver)
                if lightbox_image and lightbox_image['src'] != initial_image['src']:
                    pbar.write(f"  ✓ Lightbox image: {lightbox_image['width']}x{lightbox_image['height']}px")
                    initial_image = lightbox_image

        # Download the image
        if initial_image:
            # Generate a filename from the image URL
            url_parts = initial_image['src'].split('/')
            filename = url_parts[-1].split('?')[0]  # Remove query params
            # If the filename is generic or missing an extension, create one
            if not filename or '.' not in filename:
                domain = urlparse(driver.current_url).netloc.replace('www.', '')
                timestamp = int(time.time())
                filename = f"{domain}_{timestamp}.jpg"
            success = download_image(initial_image['src'], filename, subfolder=subreddit)
            if success:
                if subreddit:
                    pbar.write(f"  ✓ Saved to r/{subreddit}/: {filename}")
                else:
                    pbar.write(f"  ✓ Saved: {filename}")
            else:
                pbar.write(f"  ✗ Download failed: {initial_image['src'][:60]}")
        else:
            pbar.write("  ✗ No images found on this page")
    except Exception as e:
        pbar.write(f"  ✗ Error: {e}")
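
# Caveat: the lightbox check only compares image 'src' values, so it assumes
# the viewer swaps in a new, larger <img> after the click; sites that zoom via
# CSS or <canvas> fall back to the originally detected image.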


def main():
    print("=== Image Tab Downloader ===")
    print(f"Download directory: {DOWNLOAD_DIR}\n")

    # Check if a custom URL file was provided as an argument
    url_file = sys.argv[1] if len(sys.argv) > 1 else URL_FILE

    # Load URLs from the file
    urls = load_urls_from_file(url_file)
    if not urls:
        print(f"\nNo URLs found in '{url_file}'")
        print("\nUsage:")
        print(f"  python {sys.argv[0]} [url_file.txt]")
        print("\nCreate a text file with one URL per line:")
        print("  https://example.com/image1")
        print("  https://example.com/image2")
        print("  # Lines starting with # are ignored")
        return

    print(f"Loaded {len(urls)} URL(s) from '{url_file}'")
    print(f"Delay between pages: {DELAY_BETWEEN_PAGES}s")
    print()

    # Set up Firefox options
    options = webdriver.FirefoxOptions()
    # Uncomment the next line to hide the browser window
    # options.add_argument('--headless')

    # Initialize the driver
    driver = webdriver.Firefox(options=options)
    try:
        # Login delay if configured
        if LOGIN_DELAY > 0:
            print(f"⏱️  Login window: You have {LOGIN_DELAY} seconds to log in to any sites you need...")
            print("   (The browser window is now open - go log in!)")
            print()
            # Open Reddit in the first tab to make login easy
            driver.get("https://www.reddit.com")

            # Countdown in 5-second steps so the total wait matches LOGIN_DELAY
            remaining = LOGIN_DELAY
            while remaining > 0:
                print(f"  Starting in {remaining} seconds...")
                step = min(5, remaining)
                time.sleep(step)
                remaining -= step

        print("\n🚀 Starting download process...\n")

        # Process each URL with a progress bar
        with tqdm(total=len(urls), desc="Overall Progress", unit="page") as pbar:
            for i, url in enumerate(urls):
                if i > 0:  # Don't delay before the first page
                    time.sleep(DELAY_BETWEEN_PAGES)
                process_tab(driver, url, pbar)
                pbar.update(1)

        print("\n=== Complete! ===")
        print(f"Check {DOWNLOAD_DIR} for downloaded images")
    finally:
        # Close the browser
        driver.quit()
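
# Typical invocation (assumes Firefox is installed; recent Selenium releases
# fetch geckodriver automatically via Selenium Manager):
#   python sis.py urls.txt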


if __name__ == "__main__":
    main()