Skip to content
This repository was archived by the owner on Apr 24, 2025. It is now read-only.

Commit 25b323d

Browse files
Merge pull request #695 from ibra-kdbra/ibra-kdbra
Big Comeback with load of script.
2 parents 650f1aa + f71f185 commit 25b323d

34 files changed

Lines changed: 808 additions & 0 deletions

File tree

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#! python3
2+
import requests, os, bs4, re
3+
4+
# Base address of the xkcd site, shared by the download helpers below.
url = 'https://xkcd.com'
# Ensure the output directory for downloaded comics exists.
os.makedirs('xkcd', exist_ok=True)
7+
8+
9+
def imgdownloader(url):
    """Download the single xkcd comic found at *url* into the ``xkcd`` folder.

    Args:
        url (str): full URL of an xkcd comic page, e.g. ``https://xkcd.com/614/``.

    Returns:
        None
    """
    # On the very first comic, xkcd's "prev" button links to '#'; such a URL
    # has no page to fetch, so skip it entirely.
    while not url.endswith('#'):
        res = requests.get(url)
        res.raise_for_status()  # no-op for 200; raises for 4xx/5xx responses
        # res.text is the response body decoded to unicode (res.content is bytes).
        soup = bs4.BeautifulSoup(res.text, "lxml")
        # The comic image is the <img> inside the element with id="comic".
        comic = soup.select('#comic img')
        if comic == []:
            # The page did not contain a comic (e.g. an interactive page).
            print("No comic was found..")
            break
        else:
            try:
                # comic[0] is the only match; its src is protocol-relative,
                # so prepend a scheme to make it fetchable.
                comicimg = 'http:' + comic[0].get('src')
                # Real comic images are served from /comics/; anything else is
                # an interactive/special page with no downloadable image.
                if "/comics/" in comicimg:
                    print('Download image %s' % comicimg)
                    res = requests.get(comicimg)
                    res.raise_for_status()
                    # 'with' guarantees the handle is closed even if a write
                    # fails (the original leaked the file object on errors).
                    with open(os.path.join('xkcd', os.path.basename(comicimg)), 'wb') as image:
                        # Stream the body in 10 kB chunks to avoid holding the
                        # whole image in memory at once.
                        for chunk in res.iter_content(10000):
                            image.write(chunk)
                    print('Finished')
                    break
                else:
                    print("No comic was found..")
                    break
            except requests.exceptions.MissingSchema:
                # comic[0].get('src') was malformed; nothing to download.
                print("Error in downloading img!!")
                break
44+
45+
46+
def getLatestComicNumber(url):
    """Return the number of the newest xkcd comic.

    Fetches the front page (which always shows the latest comic), reads the
    "prev" link to learn the previous comic's number, and adds 1.

    Args:
        url (str): base xkcd URL, e.g. ``https://xkcd.com``.

    Returns:
        int: the number of the most recent comic.
    """
    res = requests.get(url)
    res.raise_for_status()  # no-op for 200; raises for 4xx/5xx responses
    soup = bs4.BeautifulSoup(res.text, "lxml")
    # The front page's "prev" anchor points at comic (latest - 1).
    prevLink = soup.select('a[rel="prev"]')[0]
    url = 'https://xkcd.com' + prevLink.get('href')
    # Raw string: '\d' in a plain string is an invalid escape sequence and
    # warns on modern Python.
    x = re.findall(r'\d+', url)
    x = int(x[0]) + 1  # previous comic's number + 1 == latest
    return x
56+
57+
58+
#this function is basically traversing backwards, it starts from the most recent comic and goes back until n-1 n being number of pages
59+
#as there are no prev before 1 ( :p quite obvious)
60+
def getNextComic(soup):
    """Return the URL of the comic preceding the one shown in *soup*.

    The crawler walks backwards from the newest comic, so "next" here means
    the page reached through the current page's "prev" button.
    """
    prev_anchor = soup.select('a[rel="prev"]')[0]
    return 'https://xkcd.com' + prev_anchor.get('href')
64+
65+
def getSpecificComic(comic_number):
    """Download a single comic identified by its number.

    Args:
        comic_number (int | str): xkcd comic number, e.g. 614 or "614".

    Returns:
        None
    """
    # str() lets callers pass the number as an int as well as a string
    # (the original crashed with TypeError on ints).
    page = url + '/' + str(comic_number) + '/'
    try:
        imgdownloader(page)
    except Exception as e:
        # Best effort: report the failure instead of crashing the caller.
        print(str(e))
71+
72+
73+
def batchDownloader():
    """Download every xkcd comic, newest first, into the ``xkcd`` folder.

    Starts at the front page and repeatedly follows the "prev" link until the
    first comic is reached, whose "prev" href ends with '#'.

    Returns:
        None
    """
    url = 'https://xkcd.com'
    # The very first comic's "prev" link points at '#': that ends the crawl.
    while not url.endswith('#'):
        print('Current page: %s' % url)
        res = requests.get(url)
        res.raise_for_status()  # no-op for 200; raises for 4xx/5xx responses
        # res.text is the response body decoded to unicode (res.content is bytes).
        soup = bs4.BeautifulSoup(res.text, "lxml")
        # The comic image is the <img> inside the element with id="comic".
        comic = soup.select('#comic img')
        if comic == []:
            # Interactive/special pages carry no downloadable image; move on.
            print("No comic was found..")
        else:
            try:
                # src is protocol-relative, so prepend a scheme.
                comicimg = 'http:' + comic[0].get('src')
                # Real comic images are served from /comics/.
                if "/comics/" in comicimg:
                    print('Download image %s' % comicimg)
                    res = requests.get(comicimg)
                    res.raise_for_status()
                    # 'with' guarantees the handle is closed even if a write
                    # fails (the original leaked the file object on errors).
                    with open(os.path.join('xkcd', os.path.basename(comicimg)), 'wb') as image:
                        # Stream in 10 kB chunks to bound memory use.
                        for chunk in res.iter_content(10000):
                            image.write(chunk)
                else:
                    print("No comic was found..")
            except requests.exceptions.MissingSchema:
                # Malformed src URL: skip this page and keep crawling.
                url = getNextComic(soup)
                continue
        url = getNextComic(soup)  # step back one comic
    # All comics have been downloaded.
    print('Finished')
110+
111+
def main():
    """Interactive entry point: bulk-download all comics or fetch one by number."""
    x = int(input("Choose your option: \n1.Download all images\t2.Download Specific image\n"))
    if x == 1:
        batchDownloader()
    elif x == 2:
        # input() already returns a str; the original's str() wrapper was redundant.
        y = input("Enter any comic number between 1-" + str(getLatestComicNumber(url)))
        try:
            getSpecificComic(y)
        except Exception as e:
            print(str(e))
    else:
        # The original silently ignored any option other than 1 or 2.
        print("Invalid option")
121+
122+
# Run the interactive downloader only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
bs4
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
openpyxl
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
text1
2+
text1, 1
3+
text1, 2
4+
text1, 3
5+
text1, 4
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
text2
2+
text2, 1
3+
text2, 2
4+
text2, 3
5+
text2, 4
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
text3
2+
text3, 1
3+
text3, 2
4+
text3, 3
5+
text3, 4
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import os
2+
import openpyxl
3+
4+
5+
def textToSheet(directory, filename):
    """Convert text files to columns in an Excel worksheet.

    Each ``.txt`` file in *directory* becomes one column; each of its lines
    becomes one cell in that column.

    Args:
        directory (str): folder containing the text files to read.
        filename (str): name of the Excel file to create.

    Returns:
        None
    """
    wb = openpyxl.Workbook()
    wb.create_sheet(index=0, title='result')
    sheet = wb.active

    colIndex = 1

    # Write text files as columns in the worksheet.
    # Bug fix: the original ignored *directory* and always scanned/opened
    # files relative to the current working directory.
    for file in os.listdir(directory):
        if file.endswith('.txt'):
            rowIndex = 1
            # 'with' closes the file even if a cell assignment raises.
            with open(os.path.join(directory, file)) as f:
                for line in f:
                    sheet.cell(row=rowIndex, column=colIndex).value = line
                    rowIndex += 1
            colIndex += 1

    wb.save(filename)
30+
31+
# When run as a script, convert the text files in the current directory.
if __name__ == "__main__":
    textToSheet('.', 'text-to-cols.xlsx')
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
openpyxl
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import os
2+
import openpyxl
3+
4+
5+
def toTextFiles(filename):
    """Write the column data of a worksheet into text files.

    Column N of the active sheet is written to ``text-N.txt`` in the current
    directory.

    Args:
        filename (str): name of the Excel workbook to read from.

    Returns:
        None
    """
    wb = openpyxl.load_workbook(filename)
    sheet = wb.active
    count = 1

    for colObj in sheet.columns:

        # 'with' closes the file even if a write raises.
        with open('text-' + str(count) + '.txt', 'w') as file:
            for cellObj in colObj:
                # Empty cells hold None and numeric cells hold non-strings;
                # the original crashed on both (file.write requires a str).
                if cellObj.value is not None:
                    file.write(str(cellObj.value))

        count += 1
23+
24+
25+
# When run as a script, split 'worksheet.xlsx' back into text files.
if __name__ == "__main__":
    toTextFiles('worksheet.xlsx')
5.02 KB
Binary file not shown.

0 commit comments

Comments
 (0)