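"""Image Tab Downloader.

Reads a list of page URLs from a text file (one per line; blank lines and
lines starting with '#' are ignored), opens each page in Firefox via
Selenium, finds the largest non-header image on the page (or Reddit's
'post-image' element), clicks it in case a higher-resolution lightbox
version appears, and downloads the result to DOWNLOAD_DIR. Reddit images
are sorted into per-subreddit subfolders.

Usage:
    python <this_script.py> [url_file.txt]
"""
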
import os
import re
import sys
import time
from urllib.parse import urlparse

import requests
from selenium import webdriver
from tqdm import tqdm

# Configuration
DOWNLOAD_DIR = os.path.expanduser("~/Downloads/image_scraper")
WAIT_TIME = 3            # seconds to wait for lightbox/image to load
DELAY_BETWEEN_PAGES = 2  # seconds to wait between processing pages (be nice to servers!)
LOGIN_DELAY = 60         # seconds to wait for manual login before starting (0 to skip)
URL_FILE = "urls.txt"    # default URL file

# Create download directory if it doesn't exist
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

def load_urls_from_file(filename):
    """Load URLs from a text file, one per line"""
    try:
        with open(filename, 'r') as f:
            urls = [line.strip() for line in f
                    if line.strip() and not line.strip().startswith('#')]
        return urls
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found")
        return []
    except Exception as e:
        print(f"Error reading file: {e}")
        return []

def get_largest_image(driver):
    """Find the largest image on the page by dimensions, excluding headers/banners"""
    script = """
    // First, check for Reddit post image specifically
    const postImage = document.getElementById('post-image');
    if (postImage && postImage.naturalWidth > 0 && postImage.naturalHeight > 0) {
        return {
            src: postImage.src,
            width: postImage.naturalWidth,
            height: postImage.naturalHeight,
            isPostImage: true
        };
    }

    // Otherwise, find largest non-header image
    const images = Array.from(document.querySelectorAll('img'));
    const imageData = images.map(img => {
        // Walk up the ancestor chain to check the image's context
        let element = img;
        let isInHeader = false;
        let depth = 0;

        // Check if image is in header, nav, or banner elements
        while (element && depth < 10) {
            const tagName = element.tagName.toLowerCase();
            // className can be an SVGAnimatedString on SVG ancestors, so coerce defensively
            const className = (typeof element.className === 'string' ? element.className : '').toLowerCase();
            const id = element.id.toLowerCase();

            if (tagName === 'header' || tagName === 'nav' ||
                className.includes('header') || className.includes('banner') ||
                className.includes('nav') || className.includes('toolbar') ||
                id.includes('header') || id.includes('banner')) {
                isInHeader = true;
                break;
            }

            element = element.parentElement;
            depth++;
        }

        return {
            area: img.naturalWidth * img.naturalHeight,
            width: img.naturalWidth,
            height: img.naturalHeight,
            src: img.src,
            isInHeader: isInHeader
        };
    }).filter(data => data.area > 0 && !data.isInHeader); // Filter out header images

    imageData.sort((a, b) => b.area - a.area);

    if (imageData.length > 0) {
        return {
            src: imageData[0].src,
            width: imageData[0].width,
            height: imageData[0].height
        };
    }
    return null;
    """
    return driver.execute_script(script)

def click_largest_image(driver):
    """Find and click the largest clickable image, excluding headers/banners"""
    script = """
    // First, check for Reddit post image specifically
    const postImage = document.getElementById('post-image');
    if (postImage && postImage.naturalWidth > 0 && postImage.naturalHeight > 0) {
        postImage.click();
        return true;
    }

    // Otherwise, find and click largest non-header image
    const images = Array.from(document.querySelectorAll('img'));
    const imageData = images.map(img => {
        // Check if image is in header, nav, or banner elements
        let element = img;
        let isInHeader = false;
        let depth = 0;

        while (element && depth < 10) {
            const tagName = element.tagName.toLowerCase();
            // className can be an SVGAnimatedString on SVG ancestors, so coerce defensively
            const className = (typeof element.className === 'string' ? element.className : '').toLowerCase();
            const id = element.id.toLowerCase();

            if (tagName === 'header' || tagName === 'nav' ||
                className.includes('header') || className.includes('banner') ||
                className.includes('nav') || className.includes('toolbar') ||
                id.includes('header') || id.includes('banner')) {
                isInHeader = true;
                break;
            }

            element = element.parentElement;
            depth++;
        }

        return {
            element: img,
            area: img.naturalWidth * img.naturalHeight,
            isInHeader: isInHeader
        };
    }).filter(data => data.area > 0 && !data.isInHeader);

    imageData.sort((a, b) => b.area - a.area);

    if (imageData.length > 0) {
        imageData[0].element.click();
        return true;
    }
    return false;
    """
    return driver.execute_script(script)

def get_subreddit_from_url(url):
    """Extract subreddit name from Reddit URL"""
    # Match patterns like reddit.com/r/subredditname or old.reddit.com/r/subredditname
    match = re.search(r'reddit\.com/r/([^/]+)', url)
    if match:
        return match.group(1)
    return None

def download_image(url, filename, subfolder=None):
    """Download image from URL, optionally to a subfolder"""
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            # Determine the save directory
            if subfolder:
                save_dir = os.path.join(DOWNLOAD_DIR, subfolder)
                os.makedirs(save_dir, exist_ok=True)
            else:
                save_dir = DOWNLOAD_DIR

            filepath = os.path.join(save_dir, filename)
            with open(filepath, 'wb') as f:
                f.write(response.content)
            return True
    except Exception:
        pass
    return False

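# Note: some image hosts reject requests that lack a browser-like User-Agent
# header and return 403. If downloads fail, one common workaround is to pass
# a headers dict to requests.get() above, e.g. headers={"User-Agent": "Mozilla/5.0"}.
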
def process_tab(driver, url, pbar):
    """Process a single tab to find and download the largest image"""
    try:
        pbar.set_description(f"Loading {url[:50]}...")

        # Check if this is a Reddit URL and extract subreddit
        subreddit = get_subreddit_from_url(url)

        # Navigate to the URL
        driver.get(url)
        time.sleep(2)  # Wait for page to load

        pbar.set_description(f"Processing {url[:50]}...")

        # First, try to get the largest image directly
        initial_image = get_largest_image(driver)

        if initial_image:
            pbar.write(f" ✓ Found image: {initial_image['width']}x{initial_image['height']}px")

            # Try clicking the image to see if a lightbox appears
            clicked = click_largest_image(driver)

            if clicked:
                pbar.write(" → Clicked image, checking for lightbox...")
                time.sleep(WAIT_TIME)  # Wait for lightbox to appear

                # Check if a larger image appeared after clicking
                lightbox_image = get_largest_image(driver)

                if lightbox_image and lightbox_image['src'] != initial_image['src']:
                    pbar.write(f" ✓ Lightbox image: {lightbox_image['width']}x{lightbox_image['height']}px")
                    initial_image = lightbox_image

        # Download the image
        if initial_image:
            # Generate filename from URL
            url_parts = initial_image['src'].split('/')
            filename = url_parts[-1].split('?')[0]  # Remove query params

            # If filename is generic or missing extension, create one
            if not filename or '.' not in filename:
                domain = urlparse(driver.current_url).netloc.replace('www.', '')
                timestamp = int(time.time())
                filename = f"{domain}_{timestamp}.jpg"

            success = download_image(initial_image['src'], filename, subfolder=subreddit)
            if success:
                if subreddit:
                    pbar.write(f" ✓ Saved to r/{subreddit}/: {filename}")
                else:
                    pbar.write(f" ✓ Saved: {filename}")
            else:
                pbar.write(f" ✗ Download failed: {initial_image['src'][:60]}")
        else:
            pbar.write(" ✗ No images found on this page")

    except Exception as e:
        pbar.write(f" ✗ Error: {e}")

def main():
    print("=== Image Tab Downloader ===")
    print(f"Download directory: {DOWNLOAD_DIR}\n")

    # Check if a custom URL file was provided as argument
    url_file = sys.argv[1] if len(sys.argv) > 1 else URL_FILE

    # Load URLs from file
    urls = load_urls_from_file(url_file)

    if not urls:
        print(f"\nNo URLs found in '{url_file}'")
        print("\nUsage:")
        print(f" python {sys.argv[0]} [url_file.txt]")
        print("\nCreate a text file with one URL per line:")
        print(" https://example.com/image1")
        print(" https://example.com/image2")
        print(" # Lines starting with # are ignored")
        return

    print(f"Loaded {len(urls)} URL(s) from '{url_file}'")
    print(f"Delay between pages: {DELAY_BETWEEN_PAGES}s")
    print()

    # Set up Firefox options
    options = webdriver.FirefoxOptions()
    # Uncomment the next line to hide the browser window
    # options.add_argument('--headless')

    # Initialize the driver
    driver = webdriver.Firefox(options=options)

    try:
        # Login delay if configured
        if LOGIN_DELAY > 0:
            print(f"⏱️ Login window: You have {LOGIN_DELAY} seconds to log in to any sites you need...")
            print(" (The browser window is now open - go log in!)")
            print()

            # Open Reddit in the first tab to make login easy
            driver.get("https://www.reddit.com")

            # Countdown timer: 5-second steps, then 1-second steps for the last 10 seconds
            remaining = LOGIN_DELAY
            while remaining > 0:
                print(f" Starting in {remaining} seconds...")
                step = 1 if remaining <= 10 else 5
                time.sleep(min(step, remaining))
                remaining -= step

        print("\n🚀 Starting download process...\n")

        # Process each URL with progress bar
        with tqdm(total=len(urls), desc="Overall Progress", unit="page") as pbar:
            for i, url in enumerate(urls):
                if i > 0:  # Don't delay before the first page
                    time.sleep(DELAY_BETWEEN_PAGES)

                process_tab(driver, url, pbar)
                pbar.update(1)

        print("\n=== Complete! ===")
        print(f"Check {DOWNLOAD_DIR} for downloaded images")

    finally:
        # Close the browser
        driver.quit()

if __name__ == "__main__":
    main()