# -*- coding: utf-8 -*-
"""
Created on Thu Apr 29 21:40:16 2021

@author: X

This script downloads penny-arcade comics. It iterates forwards or
backwards and the default settings will start with the beginning
and iterate forward.
"""

import requests
from bs4 import BeautifulSoup
import os
import re
import time

#this function downloads url n with filename fn
#using requests, because wget didn't work for some reason
#returns 0 on success, 1 if all 3 attempts fail
def download_img(n,fn):
    print('fn=',fn)
    for i in range(3):
        try:
            # timeout so one stalled connection can't hang the whole crawl
            r=requests.get(n,timeout=30)
            # without this check an HTTP error page (e.g. 404 HTML) would be
            # silently saved to disk as if it were the image
            r.raise_for_status()
            with open(fn,mode='wb') as fh:
                fh.write(r.content)
            return 0
        except (requests.RequestException, OSError) as e:
            # only network/HTTP and file-system errors are retryable
            print(e)
            # was time.sleep(5000): that is 5000 *seconds* (~83 minutes);
            # the rest of this file thinks in milliseconds, so 5 s was meant
            time.sleep(5)
    print('DOWNLOAD FAILED!!! I guess we are going to skip this image.')
    return 1
#this checks to see if any of the 3 filenames suggested already exist
def already_downloaded(fnv):
    # True as soon as any candidate filename is present on disk
    return any(os.path.exists(fn) for fn in fnv)
# plain-text file that stores the resume URL between runs
bookmark_file='penny-arcade-bookmark.txt'

def load_bookmark():
    """Return the URL previously stored by save_bookmark()."""
    with open(bookmark_file,mode='r') as fh:
        return fh.read()

def save_bookmark(n):
    """Write URL n to the bookmark file, replacing any previous bookmark."""
    with open(bookmark_file,mode='w') as fh:
        fh.write(n)

#this user agent wasn't actually necessary
headers={'User-Agent': 'Mozilla/72.0'}
user_instructions='''This app crawls either forwards or backwards.
You can choose which value of prev_or_next you want accordingly.
This app saves a bookmark to a file so you can just run it repeatedly until you
get all the comics. Default behavior is to start from the beginning, but you
can change that too by setting the url variable to either first or latest.

You have a choice of filename formating options:
 0: {year}\{date} - {title}
 1: {title}
 2: original webserver filename'''
# Interactive configuration, currently disabled in favor of the hard-coded
# assignments further down.  Uncomment these (and comment out the hard-coded
# values) to be prompted on each run instead.
#print(user_instructions)
#
#title_format=input('Which format? : ')
#title_format=int(title_format)
#if title_format not in [0,1,2]: title_format=0
#count=input('How many pictures do you want to process? : ')
#count=int(count)
#pause=input('How many milliseconds do you want to wait between downloads? : ')
#pause=int(pause)/1000
#url=input('Enter starting point url : ')

# Alternatively to this console interactivity, you can just uncomment these
# and type in your desired values
# and comment out the input statements too don't forget that
# Also, you can change the formatting by editing the fnv list in the main loop.

#pattern to remove illegal filename characters (Windows-reserved set)
pattern=r"""[\\/:"*?<>|]"""
replacement_character='_'

savedir=r''   # directory to chdir into before downloading; '' = stay in CWD
latest='https://www.penny-arcade.com/comic'
first='https://www.penny-arcade.com/comic/1998/11/18'
url=first     # starting point: either `first` or `latest`
prev_or_next = "btn btnNext"    # CSS class of the link to follow: forward in time
#prev_or_next = "btn btnPrev"   # ... or backward in time
count = 2000          # maximum number of pages to process this run
title_format = 0      # index into fnv: 0=year/date - title, 1=title, 2=server filename
pause = 0             # seconds to sleep between downloads (0 = no pause)

failed_urls=[]        # pages whose image download failed after all retries

# Move into the save directory when one is configured; a bad or missing
# savedir is non-fatal (downloads just land in the current directory).
try:
    if os.path.exists(savedir):
        os.chdir(savedir)
except OSError:
    # was a bare `except:` which also swallowed KeyboardInterrupt/SystemExit;
    # only file-system errors from chdir should be ignored here
    pass
# Resume from a previously saved bookmark when one exists.
if os.path.exists(bookmark_file):
    url=load_bookmark()
# Main crawl loop: fetch each comic page, download its image under one of the
# three candidate filenames, then follow the prev/next link until the end of
# the archive or until `count` pages have been processed.
while url and count>0:
    print('count =',count,'url=',url)
    # BUG FIX: requests.get's second positional argument is `params`, not
    # headers -- the User-Agent was previously sent as a query string and
    # never as a header.  Pass it by keyword.
    r=requests.get(url,headers=headers)
    soup=BeautifulSoup(r.text,'lxml')
    comicframe = soup.find('div', id = "comicFrame")
    imgurl=comicframe.img['src']
    # the "buy a print" form's second hidden input value begins with the
    # comic's YYYY-MM-DD date, which we reuse for the filename
    buyprint = soup.find(class_ = "buyPrint")
    soup2 = buyprint.find_all('input')
    buyprint_title = soup2[1]['value']
    date = buyprint_title[0:10]
    year = date[0:4]
    title=soup.find('h2').text
    title=re.sub(pattern,replacement_character,title)  # strip illegal filename chars
    serverfn=imgurl.split('/')[-1]
    bn,ext=os.path.splitext(serverfn)
    # the three candidate filenames, indexed by title_format
    fnv=[os.path.join(year,'{} - {}{}'.format(date,title,ext)),
    '{}{}'.format(title,ext),
    serverfn]

    if not already_downloaded(fnv):
        if not os.path.exists(year): os.mkdir(year)
        rv=download_img(imgurl,fnv[title_format])
        if rv: failed_urls.append(url)
        if pause>0: time.sleep(pause)

    count-=1
    try:
        url = soup.find(class_ = prev_or_next)['href']
    except (TypeError, KeyError):
        # find() returned None (no prev/next button on the page) or the
        # element lacked an href: we've reached the end of the sequence.
        # Narrowed from a bare `except:` which hid unrelated errors.
        print("Done. End of sequence.")
        url=None
# Only save a bookmark when the crawl stopped mid-sequence (count exhausted);
# url is None when we reached the end of the archive.
if url:
    print('Done. To resume start from this url:')
    print(url)
    save_bookmark(url)
    print('Bookmark saved')
# BUG FIX: the failure report was nested under `if url:` above, so failed
# downloads were never reported when the crawl ran to the end of the
# sequence.  Report them unconditionally.
if failed_urls==[]:
    print('No failed urls.')
else:
    print('Failed urls:')
    for each in failed_urls:
        print(each)