Compare commits


1 commit

Author          SHA1         Message                            Date
Andre Saddler   2f91ea4aa4   Merge ebd7a47ebe into 82e50c64f0   2024-09-26 10:45:52 -04:00
2 changed files with 27 additions and 46 deletions

mkbsd.py

@@ -1,30 +1,28 @@
 import argparse
-import asyncio
 import json
 import multiprocessing as mp
 import os
 import re
 import time
 import zipfile
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
 from urllib.parse import unquote
 
-import aiohttp
 import imagehash
+import requests
 from PIL import Image
 
 
-async def fetch_json_data(url):
-    async with aiohttp.ClientSession() as session:
-        async with session.get(url) as response:
-            if response.status == 200:
-                text = await response.text()
-                try:
-                    return json.loads(text)
-                except json.JSONDecodeError:
-                    raise Exception(f"Failed to parse JSON data from {url}")
-            else:
-                raise Exception(f"Failed to fetch data. Status code: {response.status}")
+# python mkbsd.py [--zip] [--zip-name CUSTOM_NAME] [--remove-duplicates]
+def fetch_json_data(url):
+    response = requests.get(url)
+    if response.status_code == 200:
+        return response.json()
+    else:
+        raise Exception(
+            f"Failed to fetch JSON data. Status code: {response.status_code}"
+        )
 
 
 def extract_urls(element):
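
The synchronous replacement folds parsing into response.json(), which raises its own error on a malformed body rather than the old version's explicit "Failed to parse JSON data" message. A sketch preserving that distinction with the synchronous call (illustrative, not part of the commit; response.json() raises a ValueError subclass on bad JSON):

import requests

def fetch_json_data(url):
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch JSON data. Status code: {response.status_code}")
    try:
        return response.json()
    except ValueError:
        # keep the old async version's distinct parse-failure message
        raise Exception(f"Failed to parse JSON data from {url}")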
@@ -41,27 +39,19 @@ def extract_urls(element):
     return urls
 
 
-async def download_file(session, url):
+def download_file(url):
     file_name = os.path.basename(unquote(url.split("?")[0]))
     file_name = clean_filename(file_name)
     file_path = os.path.join("downloads", file_name)
     if not os.path.exists(file_path):
-        try:
-            async with session.get(url) as response:
-                if response.status == 200:
-                    with open(file_path, "wb") as f:
-                        while True:
-                            chunk = await response.content.read(8192)
-                            if not chunk:
-                                break
-                            f.write(chunk)
-                    return f"Downloaded: {file_name}"
-                else:
-                    return f"Failed to download {file_name}: HTTP {response.status}"
-        except Exception as e:
-            return f"Error downloading {file_name}: {str(e)}"
+        print(f"Downloading {url}")
+        response = requests.get(url, stream=True)
+        with open(file_path, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
     else:
-        return f"Skipped (already exists): {file_name}"
+        print(f"Skipping {url}")
+    return file_path
 
 
 def clean_filename(filename):
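
The new download_file streams with iter_content but, unlike the async version, never checks the response status, so an HTTP error page can end up saved as an image file. A sketch of the same flow with the check kept, as a drop-in variant relying on the module's existing imports and its clean_filename helper (illustrative, not part of the commit; raise_for_status() is standard requests API):

def download_file(url):
    file_name = clean_filename(os.path.basename(unquote(url.split("?")[0])))
    file_path = os.path.join("downloads", file_name)
    if not os.path.exists(file_path):
        print(f"Downloading {url}")
        response = requests.get(url, stream=True)
        response.raise_for_status()  # refuse to write 4xx/5xx bodies to disk
        with open(file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    else:
        print(f"Skipping {url}")
    return file_path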
@@ -130,7 +120,7 @@ def remove_duplicates(duplicates):
         print(f"Error removing duplicate: {e}")
 
 
-async def main():
+def main():
     parser = argparse.ArgumentParser(
         description="Download images from JSON data and remove duplicates."
     )
@@ -151,7 +141,7 @@ async def main():
     json_url = "https://storage.googleapis.com/panels-cdn/data/20240730/all.json"
 
     try:
-        json_data = await fetch_json_data(json_url)
+        json_data = fetch_json_data(json_url)
     except Exception as e:
         print(f"Error: {e}")
         return
@@ -162,16 +152,8 @@ async def main():
     if not os.path.exists("downloads"):
         os.makedirs("downloads")
 
     start_time = time.time()
-    async with aiohttp.ClientSession() as session:
-        tasks = [download_file(session, url) for url in urls]
-        for batch in [tasks[i : i + 50] for i in range(0, len(tasks), 50)]:
-            results = await asyncio.gather(*batch)
-            for result in results:
-                print(result)
-
-    end_time = time.time()
-    print(f"Download completed in {end_time - start_time:.2f} seconds")
-
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        executor.map(download_file, urls)
     if args.remove_duplicates:
         print("Searching for duplicate images...")
@@ -190,4 +172,4 @@ async def main():
 
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    main()

requirements.txt

@@ -1,2 +1 @@
 imagehash
-aiohttp
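
A side note on the dependency file: the new synchronous code imports requests and PIL (Pillow), yet the trimmed requirements.txt lists only imagehash. If those packages are not assumed to be preinstalled, a fuller listing would presumably be (hypothetical, unpinned):

imagehash
requests
Pillow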