You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
{{ message }}
This repository was archived by the owner on Apr 24, 2025. It is now read-only.
res.raise_for_status() #returns None as the request received is 200 which is fine, if received status is 404 there is an exception for bad request
14
+
soup=bs4.BeautifulSoup(res.text,"lxml") #r.text is the content of the response in unicode, and r.content is the content of the response in bytes.
15
+
#find the comic image on the current page
16
+
comic=soup.select('#comic img') #finds tag with comic and its sub tag img
17
+
#print(comic)
18
+
ifcomic== []:
19
+
#the page did not contain a comic.. move on
20
+
print("No comic was found..")
21
+
break
22
+
else:
23
+
try:
24
+
#get the full url to the comic
25
+
comicimg='http:'+comic[0].get('src') #finds url from the list comic|| basically comic[0] is used as there is just single one element in list!! try print(comic) && print(comic[0]) to see for yourself..
26
+
#check that it is actually a comic and not an interactive page
forchunkinres.iter_content(10000): #default way to write requested content basically chunk is byte by byte writing
34
+
image.write(chunk)
35
+
image.close()
36
+
print('Finished')
37
+
break
38
+
else:
39
+
print("No comic was found..")
40
+
break
41
+
exceptrequests.exceptions.MissingSchema:
42
+
print("Error in downloading img!!")
43
+
break
44
+
45
+
46
+
def getLatestComicNumber(url):
    """Return the number of the latest xkcd comic.

    Fetches *url* (expected to be the xkcd front page), reads the
    "prev" navigation link to get the previous comic's number, and
    returns that number plus one — i.e. the latest comic's number.

    Raises:
        requests.exceptions.HTTPError: if the response status is 4xx/5xx.
        IndexError: if the page has no ``a[rel="prev"]`` link.
    """
    res = requests.get(url)
    # Returns None on a 200; raises HTTPError for a bad status.
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "lxml")
    # The front page's "prev" button points at /<latest - 1>/.
    prev_link = soup.select('a[rel="prev"]')[0]
    # Don't clobber the `url` parameter; build the prev-comic URL separately.
    prev_url = 'https://xkcd.com' + prev_link.get('href')
    # Raw string for the regex avoids the invalid-escape-sequence warning
    # that '\d+' triggers on modern Python.
    numbers = re.findall(r'\d+', prev_url)
    return int(numbers[0]) + 1
56
+
57
+
58
+
#this function is basically traversing backwards, it starts from the most recent comic and goes back until n-1 n being number of pages
59
+
#as there are no prev before 1 ( :p quite obvious)
60
+
def getNextComic(soup):
    """Return the absolute URL of the comic preceding the one in *soup*.

    xkcd pages expose navigation through an ``<a rel="prev">`` element
    whose href is a site-relative path such as ``/2000/``; prepending
    the host yields the full URL.  Traversal therefore runs from the
    newest comic back toward the oldest (basic crawling).
    """
    anchor = soup.select('a[rel="prev"]')[0]
    return 'https://xkcd.com' + anchor.get('href')
64
+
65
+
def getSpecificComic(comic_number):
    """Download a single xkcd comic identified by its number.

    Builds the comic's page URL from the module-level ``url`` base
    (assumed to be ``https://xkcd.com`` — confirm against the caller)
    and hands it to ``imgdownloader``.  Failures are reported, not
    propagated, so one bad comic does not abort the caller.

    comic_number: the comic's number; accepts an int or a str.
    """
    # str() lets callers pass an int; the original raised TypeError on
    # concatenation here, *outside* the try, defeating the handler below.
    page_url = url + '/' + str(comic_number) + '/'
    try:
        imgdownloader(page_url)
    except Exception as e:
        # Best-effort: report the error and return rather than crash.
        print(str(e))
71
+
72
+
73
+
defbatchDownloader():
74
+
url='https://xkcd.com'
75
+
#check to make sure it's not the first page
76
+
whilenoturl.endswith('#'):
77
+
#print out the current page
78
+
print('Current page: %s'%url)
79
+
res=requests.get(url)
80
+
res.raise_for_status() #returns None as the request received is 200 which is fine, if received status is 400
81
+
soup=bs4.BeautifulSoup(res.text,"lxml") #r.text is the content of the response in unicode, and r.content is the content of the response in bytes.
82
+
#find the comic image on the current page
83
+
comic=soup.select('#comic img') #finds tag with comic and its sub tag img
84
+
#print(comic)
85
+
ifcomic== []:
86
+
#the page did not contain a comic.. move on
87
+
print("No comic was found..")
88
+
else:
89
+
try:
90
+
#get the full url to the comic
91
+
comicimg='http:'+comic[0].get('src') #finds url from the list comic|| basically comic[0] is used as there is just single one element in list!! try print(comic) && print(comic[0]) to see for yourself..
92
+
#check that it is actually a comic and not an interactive page
0 commit comments