Skip to content
This repository was archived by the owner on Apr 24, 2025. It is now read-only.

Commit 0566267

Browse files
committed
Comic scraped from xkcd website
1 parent fd64c07 commit 0566267

2 files changed

Lines changed: 124 additions & 0 deletions

File tree

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#! python3
# xkcd comic scraper: downloads comic images from https://xkcd.com.
import requests, os, bs4, re

# Base URL of the xkcd site; read by the download helpers below.
url = 'https://xkcd.com'
#create a directory to store all the comics (exist_ok: no error if it is already there)
os.makedirs('xkcd', exist_ok=True)
7+
8+
9+
def imgdownloader(url):
    """Download the single comic image found on the xkcd page at *url*.

    Fetches the page, locates the ``#comic img`` element and saves the image
    into the local ``xkcd`` directory.  Prints a message and returns without
    saving when the page has no downloadable comic (e.g. an interactive page).

    Args:
        url: Full URL of one xkcd comic page, e.g. ``https://xkcd.com/614/``.
    """
    # xkcd uses an href of '#' to mark the end of navigation; skip in that case.
    while not url.endswith('#'):
        res = requests.get(url)
        res.raise_for_status()  # raises requests.HTTPError on a 4xx/5xx response
        # res.text is the response body decoded to unicode (res.content is bytes).
        soup = bs4.BeautifulSoup(res.text, "lxml")
        # The comic image lives inside <div id="comic"><img .../></div>.
        comic = soup.select('#comic img')
        if not comic:
            # The page did not contain a comic image; nothing to download.
            print("No comic was found..")
        else:
            try:
                # The src attribute is protocol-relative (//imgs.xkcd.com/...),
                # so prefix a scheme to get a fetchable URL.
                comicimg = 'http:' + comic[0].get('src')
                # Real comic images live under /comics/; anything else is an
                # interactive/special page with no static image.
                if "/comics/" in comicimg:
                    print('Download image %s' % comicimg)
                    res = requests.get(comicimg)
                    res.raise_for_status()
                    # 'with' guarantees the file is closed even if a write fails.
                    with open(os.path.join('xkcd', os.path.basename(comicimg)), 'wb') as image:
                        for chunk in res.iter_content(10000):  # stream in 10 kB chunks
                            image.write(chunk)
                    print('Finished')
                else:
                    print("No comic was found..")
            except requests.exceptions.MissingSchema:
                # Malformed src attribute produced an invalid URL.
                print("Error in downloading img!!")
        # Every branch of the original ended the loop; a single comic is
        # downloaded per call, so stop after one iteration.
        break
44+
45+
46+
def getLatestComicNumber(url):
    """Return the number of the most recent xkcd comic.

    Loads the front page (which shows the newest comic), reads the "prev"
    navigation link — which points at latest-1 — extracts that number and
    adds one.

    Args:
        url: Base site URL, normally ``https://xkcd.com``.

    Returns:
        The latest comic number as an ``int``.
    """
    res = requests.get(url)
    res.raise_for_status()  # raises requests.HTTPError on a 4xx/5xx response
    soup = bs4.BeautifulSoup(res.text, "lxml")
    # The front page's "prev" link points at the second-newest comic.
    prev_link = soup.select('a[rel="prev"]')[0]
    prev_url = 'https://xkcd.com' + prev_link.get('href')
    # Raw string avoids the invalid-escape-sequence warning for '\d'.
    digits = re.findall(r'\d+', prev_url)
    return int(digits[0]) + 1
56+
57+
58+
# This crawler traverses backwards: it starts from the most recent comic and
# follows the "prev" link page by page, since there is no "prev" before #1.
def getNextComic(soup):
    """Return the URL of the comic preceding the page parsed into *soup*.

    Args:
        soup: ``BeautifulSoup`` document of the current comic page.

    Returns:
        Absolute URL string of the previous (older) comic page.
    """
    prev_link = soup.select('a[rel="prev"]')[0]
    # The href is site-relative (e.g. '/613/'); prefix the domain to make
    # it fetchable — basic crawling off the "prev" button.
    return 'https://xkcd.com' + prev_link.get('href')
64+
65+
def getSpecificComic(comic_number):
    """Download a single comic identified by its number.

    Args:
        comic_number: The comic number, e.g. ``614`` or ``"614"``.
            ``str()`` is applied, so both ints and numeric strings work
            (the original crashed with TypeError on an int).
    """
    page = url + '/' + str(comic_number) + '/'
    try:
        imgdownloader(page)
    except Exception as e:  # best-effort: report the failure, don't crash
        print(str(e))
71+
72+
73+
def batchDownloader():
    """Download every xkcd comic, newest first, into the ``xkcd`` folder.

    Starts at the front page and repeatedly follows each page's "prev"
    link until it reaches the terminal ``#`` href used on comic #1.
    """
    url = 'https://xkcd.com'
    # On the oldest comic the "prev" href is '#', which ends the crawl.
    while not url.endswith('#'):
        print('Current page: %s' % url)
        res = requests.get(url)
        res.raise_for_status()  # raises requests.HTTPError on a 4xx/5xx response
        # res.text is the response body decoded to unicode (res.content is bytes).
        soup = bs4.BeautifulSoup(res.text, "lxml")
        # The comic image lives inside <div id="comic"><img .../></div>.
        comic = soup.select('#comic img')
        if not comic:
            # Page without a comic image (interactive/special): just move on.
            print("No comic was found..")
        else:
            try:
                # The src attribute is protocol-relative (//imgs.xkcd.com/...).
                comicimg = 'http:' + comic[0].get('src')
                # Only /comics/ paths are real, downloadable comic images.
                if "/comics/" in comicimg:
                    print('Download image %s' % comicimg)
                    res = requests.get(comicimg)
                    res.raise_for_status()
                    # 'with' guarantees the file is closed even if a write fails.
                    with open(os.path.join('xkcd', os.path.basename(comicimg)), 'wb') as image:
                        for chunk in res.iter_content(10000):  # stream in 10 kB chunks
                            image.write(chunk)
                else:
                    print("No comic was found..")
            except requests.exceptions.MissingSchema:
                # Malformed src attribute: skip this page.  The advance to
                # the previous comic happens below in all cases, so the
                # original's duplicated getNextComic + continue is folded in.
                pass
        url = getNextComic(soup)  # follow the "prev" link to the older comic
    # All comics have been downloaded.
    print('Finished')
110+
111+
def main():
    """Interactive entry point: bulk-download all comics or fetch one by number."""
    x = int(input("Choose your option: \n1.Download all images\t2.Download Specific image\n"))
    if x == 1:
        batchDownloader()
    if x == 2:
        # input() already returns a str; the original's str() wrapper was redundant.
        y = input("Enter any comic number between 1-" + str(getLatestComicNumber(url)))
        try:
            getSpecificComic(y)
        except Exception as e:  # report the failure instead of crashing
            print(str(e))
121+
122+
# Run the interactive menu only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
bs4
lxml
requests

0 commit comments

Comments
 (0)