# -*- coding: utf-8 -*-
"""
Created on Thu Apr 29 21:40:16 2021

@author: X

This script downloads Penny Arcade comics. It can iterate forwards or
backwards; the default settings start at the first comic and iterate forward.
"""
import requests
from bs4 import BeautifulSoup
import os
import re
import time


# Download url n to filename fn.
# Uses requests, because wget didn't work for some reason.
# Retries up to 3 times, pausing a few seconds between attempts.
def download_img(n, fn):
    print('fn=', fn)
    for i in range(3):
        try:
            r = requests.get(n)
            with open(fn, mode='wb') as fh:
                fh.write(r.content)
            return 0
        except Exception as e:
            print(e)
            time.sleep(5)
    print('DOWNLOAD FAILED! Skipping this image.')
    return 1


# Check whether any of the three candidate filenames already exists.
def already_downloaded(fnv):
    for fn in fnv:
        if os.path.exists(fn):
            return True
    return False


bookmark_file = 'penny-arcade-bookmark.txt'


def load_bookmark():
    with open(bookmark_file, mode='r') as fh:
        return fh.read()


def save_bookmark(n):
    with open(bookmark_file, mode='w') as fh:
        fh.write(n)


# This user agent wasn't actually necessary.
headers = {'User-Agent': 'Mozilla/72.0'}

user_instructions = '''This app crawls either forwards or backwards;
set prev_or_next to the value you want.
It saves a bookmark to a file, so you can run it repeatedly until you
have all the comics.
The default is to start from the beginning, but you can change that by
setting the url variable to either first or latest.
You have a choice of filename formatting options:
    0: {year}\\{date} - {title}
    1: {title}
    2: original webserver filename'''

#print(user_instructions)
#
#title_format = input('Which format? : ')
#title_format = int(title_format)
#if title_format not in [0, 1, 2]:
#    title_format = 0
#count = input('How many pictures do you want to process? : ')
#count = int(count)
#pause = input('How many milliseconds do you want to wait between downloads? : ')
#pause = int(pause) / 1000
#url = input('Enter starting point url : ')
# As an alternative to these console prompts, set the variables below directly
# (the default here). To use the prompts instead, uncomment the input
# statements above and comment out the hard-coded assignments below.
# You can also change the filename formatting by editing the fnv list below.

# Pattern matching characters that are illegal in filenames.
pattern = r"""[\\/:"*?<>|]"""
replacement_character = '_'

savedir = r''
latest = 'https://www.penny-arcade.com/comic'
first = 'https://www.penny-arcade.com/comic/1998/11/18'
url = first
prev_or_next = "btn btnNext"
#prev_or_next = "btn btnPrev"
count = 2000
title_format = 0
pause = 0
failed_urls = []

try:
    if os.path.exists(savedir):
        os.chdir(savedir)
except Exception:
    pass

if os.path.exists(bookmark_file):
    url = load_bookmark()

while url and count > 0:
    print('count =', count, 'url=', url)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')

    # The comic image lives inside <div id="comicFrame">.
    comicframe = soup.find('div', id="comicFrame")
    imgurl = comicframe.img['src']

    # The "buy a print" form's second hidden input carries a value that
    # starts with the comic's date, followed by its title.
    buyprint = soup.find(class_="buyPrint")
    soup2 = buyprint.find_all('input')
    buyprint_title = soup2[1]['value']
    date = buyprint_title[0:10]
    year = date[0:4]

    title = soup.find('h2').text
    title = re.sub(pattern, replacement_character, title)

    serverfn = imgurl.split('/')[-1]
    bn, ext = os.path.splitext(serverfn)
    fnv = [os.path.join(year, '{} - {}{}'.format(date, title, ext)),
           '{}{}'.format(title, ext),
           serverfn]
    if not already_downloaded(fnv):
        if not os.path.exists(year):
            os.mkdir(year)
        rv = download_img(imgurl, fnv[title_format])
        if rv:
            failed_urls.append(url)
        if pause > 0:
            time.sleep(pause)
    count -= 1

    # Follow the next (or previous) link; if it is missing, we have
    # reached the end of the sequence.
    try:
        url = soup.find(class_=prev_or_next)['href']
    except (TypeError, KeyError):
        print("Done. End of sequence.")
        url = None

if url:
    print('Done. To resume, start from this url:')
    print(url)
    save_bookmark(url)
    print('Bookmark saved')

if failed_urls == []:
    print('No failed urls.')
else:
    print('Failed urls:')
    for each in failed_urls:
        print(each)