Skip to content
This repository was archived by the owner on Apr 24, 2025. It is now read-only.

Commit 25b323d

Browse files
Merge pull request #695 from ibra-kdbra/ibra-kdbra
Big Comeback with load of script.
2 parents 650f1aa + f71f185 commit 25b323d

34 files changed

Lines changed: 808 additions & 0 deletions

File tree

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#! python3
2+
import requests, os, bs4, re
3+
4+
# Base address of the xkcd site, shared by the download helpers below.
url = 'https://xkcd.com'
# Ensure the output directory for downloaded comics exists.
os.makedirs('xkcd', exist_ok=True)
7+
8+
9+
def imgdownloader(url):
    """Download the single xkcd comic found at *url* into the ``xkcd`` folder.

    Args:
        url (str): full URL of an xkcd comic page, e.g. ``https://xkcd.com/614/``.

    Returns:
        None
    """
    # On the very first comic, xkcd's "prev" button links to '#'; such a URL
    # has no page to fetch, so skip it entirely.
    while not url.endswith('#'):
        res = requests.get(url)
        res.raise_for_status()  # no-op for 200; raises for 4xx/5xx responses
        # res.text is the response body decoded to unicode (res.content is bytes).
        soup = bs4.BeautifulSoup(res.text, "lxml")
        # The comic image is the <img> inside the element with id="comic".
        comic = soup.select('#comic img')
        if comic == []:
            # The page did not contain a comic (e.g. an interactive page).
            print("No comic was found..")
            break
        else:
            try:
                # comic[0] is the only match; its src is protocol-relative,
                # so prepend a scheme to make it fetchable.
                comicimg = 'http:' + comic[0].get('src')
                # Real comic images are served from /comics/; anything else is
                # an interactive/special page with no downloadable image.
                if "/comics/" in comicimg:
                    print('Download image %s' % comicimg)
                    res = requests.get(comicimg)
                    res.raise_for_status()
                    # 'with' guarantees the handle is closed even if a write
                    # fails (the original leaked the file object on errors).
                    with open(os.path.join('xkcd', os.path.basename(comicimg)), 'wb') as image:
                        # Stream the body in 10 kB chunks to avoid holding the
                        # whole image in memory at once.
                        for chunk in res.iter_content(10000):
                            image.write(chunk)
                    print('Finished')
                    break
                else:
                    print("No comic was found..")
                    break
            except requests.exceptions.MissingSchema:
                # comic[0].get('src') was malformed; nothing to download.
                print("Error in downloading img!!")
                break
44+
45+
46+
def getLatestComicNumber(url):
    """Return the number of the newest xkcd comic.

    Fetches the front page (which always shows the latest comic), reads the
    "prev" link to learn the previous comic's number, and adds 1.

    Args:
        url (str): base xkcd URL, e.g. ``https://xkcd.com``.

    Returns:
        int: the number of the most recent comic.
    """
    res = requests.get(url)
    res.raise_for_status()  # no-op for 200; raises for 4xx/5xx responses
    soup = bs4.BeautifulSoup(res.text, "lxml")
    # The front page's "prev" anchor points at comic (latest - 1).
    prevLink = soup.select('a[rel="prev"]')[0]
    url = 'https://xkcd.com' + prevLink.get('href')
    # Raw string: '\d' in a plain string is an invalid escape sequence and
    # warns on modern Python.
    x = re.findall(r'\d+', url)
    x = int(x[0]) + 1  # previous comic's number + 1 == latest
    return x
56+
57+
58+
#this function is basically traversing backwards, it starts from the most recent comic and goes back until n-1 n being number of pages
59+
#as there are no prev before 1 ( :p quite obvious)
60+
def getNextComic(soup):
    """Return the URL of the comic preceding the one shown in *soup*.

    The crawler walks backwards from the newest comic, so "next" here means
    the page reached through the current page's "prev" button.
    """
    prev_anchor = soup.select('a[rel="prev"]')[0]
    return 'https://xkcd.com' + prev_anchor.get('href')
64+
65+
def getSpecificComic(comic_number):
    """Download a single comic identified by its number.

    Args:
        comic_number (int | str): xkcd comic number, e.g. 614 or "614".

    Returns:
        None
    """
    # str() lets callers pass the number as an int as well as a string
    # (the original crashed with TypeError on ints).
    page = url + '/' + str(comic_number) + '/'
    try:
        imgdownloader(page)
    except Exception as e:
        # Best effort: report the failure instead of crashing the caller.
        print(str(e))
71+
72+
73+
def batchDownloader():
    """Download every xkcd comic, newest first, into the ``xkcd`` folder.

    Starts at the front page and repeatedly follows the "prev" link until the
    first comic is reached, whose "prev" href ends with '#'.

    Returns:
        None
    """
    url = 'https://xkcd.com'
    # The very first comic's "prev" link points at '#': that ends the crawl.
    while not url.endswith('#'):
        print('Current page: %s' % url)
        res = requests.get(url)
        res.raise_for_status()  # no-op for 200; raises for 4xx/5xx responses
        # res.text is the response body decoded to unicode (res.content is bytes).
        soup = bs4.BeautifulSoup(res.text, "lxml")
        # The comic image is the <img> inside the element with id="comic".
        comic = soup.select('#comic img')
        if comic == []:
            # Interactive/special pages carry no downloadable image; move on.
            print("No comic was found..")
        else:
            try:
                # src is protocol-relative, so prepend a scheme.
                comicimg = 'http:' + comic[0].get('src')
                # Real comic images are served from /comics/.
                if "/comics/" in comicimg:
                    print('Download image %s' % comicimg)
                    res = requests.get(comicimg)
                    res.raise_for_status()
                    # 'with' guarantees the handle is closed even if a write
                    # fails (the original leaked the file object on errors).
                    with open(os.path.join('xkcd', os.path.basename(comicimg)), 'wb') as image:
                        # Stream in 10 kB chunks to bound memory use.
                        for chunk in res.iter_content(10000):
                            image.write(chunk)
                else:
                    print("No comic was found..")
            except requests.exceptions.MissingSchema:
                # Malformed src URL: skip this page and keep crawling.
                url = getNextComic(soup)
                continue
        url = getNextComic(soup)  # step back one comic
    # All comics have been downloaded.
    print('Finished')
110+
111+
def main():
    """Interactive entry point: bulk-download all comics or fetch one by number."""
    x = int(input("Choose your option: \n1.Download all images\t2.Download Specific image\n"))
    if x == 1:
        batchDownloader()
    elif x == 2:
        # input() already returns a str; the original's str() wrapper was redundant.
        y = input("Enter any comic number between 1-" + str(getLatestComicNumber(url)))
        try:
            getSpecificComic(y)
        except Exception as e:
            print(str(e))
    else:
        # The original silently ignored any option other than 1 or 2.
        print("Invalid option")
121+
122+
# Run the interactive downloader only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
bs4
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
openpyxl
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
text1
2+
text1, 1
3+
text1, 2
4+
text1, 3
5+
text1, 4
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
text2
2+
text2, 1
3+
text2, 2
4+
text2, 3
5+
text2, 4
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
text3
2+
text3, 1
3+
text3, 2
4+
text3, 3
5+
text3, 4
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import os
2+
import openpyxl
3+
4+
5+
def textToSheet(directory, filename):
    """Convert text files to columns in an Excel worksheet.

    Each ``.txt`` file in *directory* becomes one column; each of its lines
    becomes one cell in that column.

    Args:
        directory (str): folder containing the text files to read.
        filename (str): name of the Excel file to create.

    Returns:
        None
    """
    wb = openpyxl.Workbook()
    wb.create_sheet(index=0, title='result')
    sheet = wb.active

    colIndex = 1

    # Write text files as columns in the worksheet.
    # Bug fix: the original ignored *directory* and always scanned/opened
    # files relative to the current working directory.
    for file in os.listdir(directory):
        if file.endswith('.txt'):
            rowIndex = 1
            # 'with' closes the file even if a cell assignment raises.
            with open(os.path.join(directory, file)) as f:
                for line in f:
                    sheet.cell(row=rowIndex, column=colIndex).value = line
                    rowIndex += 1
            colIndex += 1

    wb.save(filename)
30+
31+
# When run as a script, convert the text files in the current directory.
if __name__ == "__main__":
    textToSheet('.', 'text-to-cols.xlsx')
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
openpyxl
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import os
2+
import openpyxl
3+
4+
5+
def toTextFiles(filename):
    """Write the column data of a worksheet into text files.

    Column N of the active sheet is written to ``text-N.txt`` in the current
    directory.

    Args:
        filename (str): name of the Excel workbook to read from.

    Returns:
        None
    """
    wb = openpyxl.load_workbook(filename)
    sheet = wb.active
    count = 1

    for colObj in sheet.columns:

        # 'with' closes the file even if a write raises.
        with open('text-' + str(count) + '.txt', 'w') as file:
            for cellObj in colObj:
                # Empty cells hold None and numeric cells hold non-strings;
                # the original crashed on both (file.write requires a str).
                if cellObj.value is not None:
                    file.write(str(cellObj.value))

        count += 1
23+
24+
25+
# When run as a script, split 'worksheet.xlsx' back into text files.
if __name__ == "__main__":
    toTextFiles('worksheet.xlsx')
5.02 KB
Binary file not shown.

0 commit comments

Comments
 (0)