# Selenium-Image-Scraper/sis.py

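"""Selenium Image Scraper.

Opens each URL listed in a text file in Firefox, finds the largest
non-header/banner image on the page (with special handling for Reddit
posts), clicks it in case a higher-resolution lightbox appears, and
downloads the winning image to DOWNLOAD_DIR. Reddit images are sorted
into per-subreddit subfolders.
"""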
import os
import re
import sys
import time
from urllib.parse import urlparse

import requests
from selenium import webdriver
from tqdm import tqdm

# Configuration
DOWNLOAD_DIR = os.path.expanduser("~/Downloads/image_scraper")
WAIT_TIME = 3  # seconds to wait for the lightbox/image to load
DELAY_BETWEEN_PAGES = 2  # seconds to wait between pages (be nice to servers!)
LOGIN_DELAY = 60  # seconds to wait for manual login before starting (0 to skip)
URL_FILE = "urls.txt"  # default URL file

# Create the download directory if it doesn't exist
os.makedirs(DOWNLOAD_DIR, exist_ok=True)


def load_urls_from_file(filename):
    """Load URLs from a text file, one per line."""
    try:
        with open(filename, 'r') as f:
            # Keep non-blank lines that aren't comments
            urls = [line.strip() for line in f
                    if line.strip() and not line.strip().startswith('#')]
        return urls
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found")
        return []
    except Exception as e:
        print(f"Error reading file: {e}")
        return []
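
# Example urls.txt (illustrative contents):
#   # one URL per line; '#' lines are skipped
#   https://www.reddit.com/r/EarthPorn/comments/abc123/
#   https://example.com/gallery/42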


def get_largest_image(driver):
    """Find the largest image on the page by dimensions, excluding headers/banners."""
    script = """
        // First, check for Reddit post image specifically
        const postImage = document.getElementById('post-image');
        if (postImage && postImage.naturalWidth > 0 && postImage.naturalHeight > 0) {
            return {
                src: postImage.src,
                width: postImage.naturalWidth,
                height: postImage.naturalHeight,
                isPostImage: true
            };
        }

        // Otherwise, find the largest non-header image
        const images = Array.from(document.querySelectorAll('img'));
        const imageData = images.map(img => {
            // Walk up the ancestors to check the image's context
            let element = img;
            let isInHeader = false;
            let depth = 0;

            // Check if the image is in a header, nav, or banner element
            while (element && depth < 10) {
                const tagName = element.tagName.toLowerCase();
                const className = element.className.toLowerCase();
                const id = element.id.toLowerCase();

                if (tagName === 'header' || tagName === 'nav' ||
                    className.includes('header') || className.includes('banner') ||
                    className.includes('nav') || className.includes('toolbar') ||
                    id.includes('header') || id.includes('banner')) {
                    isInHeader = true;
                    break;
                }
                element = element.parentElement;
                depth++;
            }

            return {
                element: img,
                area: img.naturalWidth * img.naturalHeight,
                width: img.naturalWidth,
                height: img.naturalHeight,
                src: img.src,
                isInHeader: isInHeader
            };
        }).filter(data => data.area > 0 && !data.isInHeader);  // Drop header images

        imageData.sort((a, b) => b.area - a.area);

        if (imageData.length > 0) {
            return {
                src: imageData[0].src,
                width: imageData[0].width,
                height: imageData[0].height
            };
        }
        return null;
    """
    return driver.execute_script(script)
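
# Note: returns {'src', 'width', 'height'} (plus 'isPostImage' when Reddit's
# #post-image element wins) or None if no visible non-header image exists.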


def click_largest_image(driver):
    """Find and click the largest clickable image, excluding headers/banners."""
    script = """
        // First, check for Reddit post image specifically
        const postImage = document.getElementById('post-image');
        if (postImage && postImage.naturalWidth > 0 && postImage.naturalHeight > 0) {
            postImage.click();
            return true;
        }

        // Otherwise, find and click the largest non-header image
        const images = Array.from(document.querySelectorAll('img'));
        const imageData = images.map(img => {
            // Check if the image is in a header, nav, or banner element
            let element = img;
            let isInHeader = false;
            let depth = 0;

            while (element && depth < 10) {
                const tagName = element.tagName.toLowerCase();
                const className = element.className.toLowerCase();
                const id = element.id.toLowerCase();

                if (tagName === 'header' || tagName === 'nav' ||
                    className.includes('header') || className.includes('banner') ||
                    className.includes('nav') || className.includes('toolbar') ||
                    id.includes('header') || id.includes('banner')) {
                    isInHeader = true;
                    break;
                }
                element = element.parentElement;
                depth++;
            }

            return {
                element: img,
                area: img.naturalWidth * img.naturalHeight,
                isInHeader: isInHeader
            };
        }).filter(data => data.area > 0 && !data.isInHeader);

        imageData.sort((a, b) => b.area - a.area);

        if (imageData.length > 0) {
            imageData[0].element.click();
            return true;
        }
        return false;
    """
    return driver.execute_script(script)
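
# Note: this mirrors the selection logic in get_largest_image but clicks the
# winning <img> instead of returning it. True means a click was dispatched,
# not that a lightbox actually opened.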


def get_subreddit_from_url(url):
    """Extract the subreddit name from a Reddit URL."""
    # Match patterns like reddit.com/r/subredditname or old.reddit.com/r/subredditname
    match = re.search(r'reddit\.com/r/([^/]+)', url)
    if match:
        return match.group(1)
    return None
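
# Illustrative examples:
#   get_subreddit_from_url("https://old.reddit.com/r/EarthPorn/comments/x1")  # -> "EarthPorn"
#   get_subreddit_from_url("https://example.com/photo")                       # -> None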


def download_image(url, filename, subfolder=None):
    """Download an image from a URL, optionally into a subfolder."""
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            # Determine the save directory
            if subfolder:
                save_dir = os.path.join(DOWNLOAD_DIR, subfolder)
                os.makedirs(save_dir, exist_ok=True)
            else:
                save_dir = DOWNLOAD_DIR

            filepath = os.path.join(save_dir, filename)
            with open(filepath, 'wb') as f:
                f.write(response.content)
            return True
    except Exception:
        return False
    return False  # Non-200 response
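
# Illustrative call (hypothetical URL):
#   download_image("https://i.redd.it/abc.jpg", "abc.jpg", subfolder="pics")
# saves the file to <DOWNLOAD_DIR>/pics/abc.jpg.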


def process_tab(driver, url, pbar):
    """Process a single page: find the largest image, try the lightbox, download it."""
    try:
        pbar.set_description(f"Loading {url[:50]}...")
        # Check if this is a Reddit URL and extract the subreddit
        subreddit = get_subreddit_from_url(url)

        # Navigate to the URL
        driver.get(url)
        time.sleep(2)  # Wait for the page to load
        pbar.set_description(f"Processing {url[:50]}...")

        # First, try to get the largest image directly
        initial_image = get_largest_image(driver)
        if initial_image:
            pbar.write(f"  ✓ Found image: {initial_image['width']}x{initial_image['height']}px")
            # Try clicking the image to see if a lightbox appears
            clicked = click_largest_image(driver)
            if clicked:
                pbar.write("  → Clicked image, checking for lightbox...")
                time.sleep(WAIT_TIME)  # Wait for the lightbox to appear
                # Check if a larger image appeared after clicking
                lightbox_image = get_largest_image(driver)
                if lightbox_image and lightbox_image['src'] != initial_image['src']:
                    pbar.write(f"  ✓ Lightbox image: {lightbox_image['width']}x{lightbox_image['height']}px")
                    initial_image = lightbox_image

        # Download the image
        if initial_image:
            # Generate a filename from the image URL
            url_parts = initial_image['src'].split('/')
            filename = url_parts[-1].split('?')[0]  # Remove query params
            # If the filename is generic or missing an extension, create one
            if not filename or '.' not in filename:
                domain = urlparse(driver.current_url).netloc.replace('www.', '')
                timestamp = int(time.time())
                filename = f"{domain}_{timestamp}.jpg"
            success = download_image(initial_image['src'], filename, subfolder=subreddit)
            if success:
                if subreddit:
                    pbar.write(f"  ✓ Saved to r/{subreddit}/: {filename}")
                else:
                    pbar.write(f"  ✓ Saved: {filename}")
            else:
                pbar.write(f"  ✗ Download failed: {initial_image['src'][:60]}")
        else:
            pbar.write("  ✗ No images found on this page")
    except Exception as e:
        pbar.write(f"  ✗ Error: {e}")
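
# Caveat: the lightbox check only compares image 'src' values, so it assumes
# the viewer swaps in a new, larger <img> after the click; sites that zoom via
# CSS or <canvas> fall back to the originally detected image.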


def main():
    print("=== Image Tab Downloader ===")
    print(f"Download directory: {DOWNLOAD_DIR}\n")

    # Check if a custom URL file was provided as an argument
    url_file = sys.argv[1] if len(sys.argv) > 1 else URL_FILE

    # Load URLs from the file
    urls = load_urls_from_file(url_file)
    if not urls:
        print(f"\nNo URLs found in '{url_file}'")
        print("\nUsage:")
        print(f"  python {sys.argv[0]} [url_file.txt]")
        print("\nCreate a text file with one URL per line:")
        print("  https://example.com/image1")
        print("  https://example.com/image2")
        print("  # Lines starting with # are ignored")
        return

    print(f"Loaded {len(urls)} URL(s) from '{url_file}'")
    print(f"Delay between pages: {DELAY_BETWEEN_PAGES}s")
    print()

    # Set up Firefox options
    options = webdriver.FirefoxOptions()
    # Uncomment the next line to hide the browser window
    # options.add_argument('--headless')

    # Initialize the driver
    driver = webdriver.Firefox(options=options)
    try:
        # Login delay if configured
        if LOGIN_DELAY > 0:
            print(f"⏱️  Login window: You have {LOGIN_DELAY} seconds to log in to any sites you need...")
            print("   (The browser window is now open - go log in!)")
            print()
            # Open Reddit in the first tab to make login easy
            driver.get("https://www.reddit.com")

            # Countdown in 5-second steps so the total wait matches LOGIN_DELAY
            remaining = LOGIN_DELAY
            while remaining > 0:
                print(f"  Starting in {remaining} seconds...")
                step = min(5, remaining)
                time.sleep(step)
                remaining -= step

        print("\n🚀 Starting download process...\n")

        # Process each URL with a progress bar
        with tqdm(total=len(urls), desc="Overall Progress", unit="page") as pbar:
            for i, url in enumerate(urls):
                if i > 0:  # Don't delay before the first page
                    time.sleep(DELAY_BETWEEN_PAGES)
                process_tab(driver, url, pbar)
                pbar.update(1)

        print("\n=== Complete! ===")
        print(f"Check {DOWNLOAD_DIR} for downloaded images")
    finally:
        # Close the browser
        driver.quit()
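
# Typical invocation (assumes Firefox is installed; recent Selenium releases
# fetch geckodriver automatically via Selenium Manager):
#   python sis.py urls.txt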


if __name__ == "__main__":
    main()