import os
import re
import sys
import time
from urllib.parse import urlparse

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import requests
from tqdm import tqdm

# Configuration
DOWNLOAD_DIR = os.path.expanduser("~/Downloads/image_scraper")
WAIT_TIME = 3            # seconds to wait for lightbox/image to load
DELAY_BETWEEN_PAGES = 2  # seconds to wait between pages (be nice to servers!)
LOGIN_DELAY = 60         # seconds to wait for manual login before starting (0 to skip)
URL_FILE = "urls.txt"    # default URL file

# Create the download directory if it doesn't exist
os.makedirs(DOWNLOAD_DIR, exist_ok=True)


def load_urls_from_file(filename):
    """Load URLs from a text file, one per line."""
    try:
        with open(filename, 'r') as f:
            urls = [line.strip() for line in f
                    if line.strip() and not line.strip().startswith('#')]
        return urls
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found")
        return []
    except Exception as e:
        print(f"Error reading file: {e}")
        return []


def get_largest_image(driver):
    """Find the largest image on the page by dimensions, excluding headers/banners."""
    script = """
        // First, check for the Reddit post image specifically
        const postImage = document.getElementById('post-image');
        if (postImage && postImage.naturalWidth > 0 && postImage.naturalHeight > 0) {
            return {
                src: postImage.src,
                width: postImage.naturalWidth,
                height: postImage.naturalHeight,
                isPostImage: true
            };
        }

        // Otherwise, find the largest non-header image
        const images = Array.from(document.querySelectorAll('img'));
        const imageData = images.map(img => {
            // Walk up the ancestors to check whether the image sits in a
            // header, nav, or banner element
            let element = img;
            let isInHeader = false;
            let depth = 0;

            while (element && depth < 10) {
                const tagName = element.tagName.toLowerCase();
                // className can be an SVGAnimatedString on <svg>, so coerce defensively
                const className = (element.className || '').toString().toLowerCase();
                const id = (element.id || '').toLowerCase();

                if (tagName === 'header' || tagName === 'nav' ||
                    className.includes('header') || className.includes('banner') ||
                    className.includes('nav') || className.includes('toolbar') ||
                    id.includes('header') || id.includes('banner')) {
                    isInHeader = true;
                    break;
                }
                element = element.parentElement;
                depth++;
            }

            return {
                element: img,
                area: img.naturalWidth * img.naturalHeight,
                width: img.naturalWidth,
                height: img.naturalHeight,
                src: img.src,
                isInHeader: isInHeader
            };
        }).filter(data => data.area > 0 && !data.isInHeader);  // filter out header images

        imageData.sort((a, b) => b.area - a.area);

        if (imageData.length > 0) {
            return {
                src: imageData[0].src,
                width: imageData[0].width,
                height: imageData[0].height
            };
        }
        return null;
    """
    return driver.execute_script(script)
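
# Optional sketch: get_largest_image() reads naturalWidth/naturalHeight, which
# are 0 until an image has actually loaded. Instead of the fixed time.sleep(2)
# in process_tab() below, one could wait explicitly for at least one <img> to
# be present (an assumption about page structure, not something every site
# guarantees). This uses the WebDriverWait / EC / By / TimeoutException
# imports above.
def wait_for_images(driver, timeout=10):
    """Block until at least one <img> is present, or the timeout elapses."""
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, "img"))
        )
    except TimeoutException:
        pass  # no image appeared; the caller can still try get_largest_image()
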
def click_largest_image(driver):
    """Find and click the largest clickable image, excluding headers/banners."""
    script = """
        // First, check for the Reddit post image specifically
        const postImage = document.getElementById('post-image');
        if (postImage && postImage.naturalWidth > 0 && postImage.naturalHeight > 0) {
            postImage.click();
            return true;
        }

        // Otherwise, find and click the largest non-header image
        const images = Array.from(document.querySelectorAll('img'));
        const imageData = images.map(img => {
            // Check whether the image sits in a header, nav, or banner element
            let element = img;
            let isInHeader = false;
            let depth = 0;

            while (element && depth < 10) {
                const tagName = element.tagName.toLowerCase();
                const className = (element.className || '').toString().toLowerCase();
                const id = (element.id || '').toLowerCase();

                if (tagName === 'header' || tagName === 'nav' ||
                    className.includes('header') || className.includes('banner') ||
                    className.includes('nav') || className.includes('toolbar') ||
                    id.includes('header') || id.includes('banner')) {
                    isInHeader = true;
                    break;
                }
                element = element.parentElement;
                depth++;
            }

            return {
                element: img,
                area: img.naturalWidth * img.naturalHeight,
                isInHeader: isInHeader
            };
        }).filter(data => data.area > 0 && !data.isInHeader);

        imageData.sort((a, b) => b.area - a.area);

        if (imageData.length > 0) {
            imageData[0].element.click();
            return true;
        }
        return false;
    """
    return driver.execute_script(script)


def get_subreddit_from_url(url):
    """Extract the subreddit name from a Reddit URL."""
    # Match patterns like reddit.com/r/subredditname or old.reddit.com/r/subredditname
    match = re.search(r'reddit\.com/r/([^/]+)', url)
    if match:
        return match.group(1)
    return None


def download_image(url, filename, subfolder=None):
    """Download an image from a URL, optionally into a subfolder."""
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            # Determine the save directory
            if subfolder:
                save_dir = os.path.join(DOWNLOAD_DIR, subfolder)
                os.makedirs(save_dir, exist_ok=True)
            else:
                save_dir = DOWNLOAD_DIR

            filepath = os.path.join(save_dir, filename)
            with open(filepath, 'wb') as f:
                f.write(response.content)
            return True
    except Exception:
        return False
    return False
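
# Optional variant, a minimal sketch: some image hosts reject requests that
# carry the default python-requests User-Agent. The header string below is an
# illustrative placeholder, not a documented requirement; swap this in for
# download_image() if downloads come back with 403s.
BROWSER_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0"
}


def download_image_as_browser(url, filename, subfolder=None):
    """Like download_image(), but sends a browser-like User-Agent header."""
    try:
        response = requests.get(url, headers=BROWSER_HEADERS, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return False
    save_dir = os.path.join(DOWNLOAD_DIR, subfolder) if subfolder else DOWNLOAD_DIR
    os.makedirs(save_dir, exist_ok=True)
    with open(os.path.join(save_dir, filename), 'wb') as f:
        f.write(response.content)
    return True
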
def process_tab(driver, url, pbar):
    """Process a single page: find the largest image and download it."""
    try:
        pbar.set_description(f"Loading {url[:50]}...")

        # Check whether this is a Reddit URL and extract the subreddit
        subreddit = get_subreddit_from_url(url)

        # Navigate to the URL
        driver.get(url)
        time.sleep(2)  # wait for the page to load

        pbar.set_description(f"Processing {url[:50]}...")

        # First, try to get the largest image directly
        initial_image = get_largest_image(driver)

        if initial_image:
            pbar.write(f"  ✓ Found image: {initial_image['width']}x{initial_image['height']}px")

            # Try clicking the image to see if a lightbox appears
            clicked = click_largest_image(driver)
            if clicked:
                pbar.write("  → Clicked image, checking for lightbox...")
                time.sleep(WAIT_TIME)  # wait for the lightbox to appear

                # Check whether a larger image appeared after clicking
                lightbox_image = get_largest_image(driver)
                if lightbox_image and lightbox_image['src'] != initial_image['src']:
                    pbar.write(f"  ✓ Lightbox image: {lightbox_image['width']}x{lightbox_image['height']}px")
                    initial_image = lightbox_image

        # Download the image
        if initial_image:
            # Generate a filename from the image URL
            url_parts = initial_image['src'].split('/')
            filename = url_parts[-1].split('?')[0]  # strip query params

            # If the filename is missing or has no extension, create one
            if not filename or '.' not in filename:
                domain = urlparse(driver.current_url).netloc.replace('www.', '')
                timestamp = int(time.time())
                filename = f"{domain}_{timestamp}.jpg"

            success = download_image(initial_image['src'], filename, subfolder=subreddit)
            if success:
                if subreddit:
                    pbar.write(f"  ✓ Saved to r/{subreddit}/: {filename}")
                else:
                    pbar.write(f"  ✓ Saved: {filename}")
            else:
                pbar.write(f"  ✗ Download failed: {filename}")
        else:
            pbar.write("  ✗ No images found on this page")

    except Exception as e:
        pbar.write(f"  ✗ Error: {e}")


def main():
    print("=== Image Tab Downloader ===")
    print(f"Download directory: {DOWNLOAD_DIR}\n")

    # Check whether a custom URL file was provided as an argument
    url_file = sys.argv[1] if len(sys.argv) > 1 else URL_FILE

    # Load URLs from the file
    urls = load_urls_from_file(url_file)

    if not urls:
        print(f"\nNo URLs found in '{url_file}'")
        print("\nUsage:")
        print(f"  python {sys.argv[0]} [url_file.txt]")
        print("\nCreate a text file with one URL per line:")
        print("  https://example.com/image1")
        print("  https://example.com/image2")
        print("  # Lines starting with # are ignored")
        return

    print(f"Loaded {len(urls)} URL(s) from '{url_file}'")
    print(f"Delay between pages: {DELAY_BETWEEN_PAGES}s")
    print()

    # Set up Firefox options
    options = webdriver.FirefoxOptions()
    # Uncomment the next line to run headless (note: manual login needs a visible browser)
    # options.add_argument('--headless')

    # Initialize the driver
    driver = webdriver.Firefox(options=options)

    try:
        # Login delay, if configured
        if LOGIN_DELAY > 0:
            print(f"⏱️ Login window: You have {LOGIN_DELAY} seconds to log in to any sites you need...")
            print("   (The browser window is now open - go log in!)")
            print()

            # Open Reddit in the first tab to make login easy
            driver.get("https://www.reddit.com")

            # Countdown timer: tick every 5 seconds, then every second for the last 10
            remaining = LOGIN_DELAY
            while remaining > 0:
                print(f"   Starting in {remaining} seconds...")
                step = 1 if remaining <= 10 else 5
                time.sleep(step)
                remaining -= step

            print("\n🚀 Starting download process...\n")

        # Process each URL with a progress bar
        with tqdm(total=len(urls), desc="Overall Progress", unit="page") as pbar:
            for i, url in enumerate(urls):
                if i > 0:  # don't delay before the first page
                    time.sleep(DELAY_BETWEEN_PAGES)
                process_tab(driver, url, pbar)
                pbar.update(1)

        print("\n=== Complete! ===")
        print(f"Check {DOWNLOAD_DIR} for downloaded images")

    finally:
        # Close the browser
        driver.quit()


if __name__ == "__main__":
    main()
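
# Example urls.txt (format per load_urls_from_file: one URL per line, blank
# lines and lines starting with '#' are skipped). The URLs below are
# illustrative placeholders only:
#
#   # wallpapers batch
#   https://old.reddit.com/r/EarthPorn/comments/<post-id>/<slug>/
#   https://example.com/photos/12345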