diff --git a/Forums/AbyssForum/crawler_selenium.py b/Forums/AbyssForum/crawler_selenium.py index 32678dc..4d2ad99 100644 --- a/Forums/AbyssForum/crawler_selenium.py +++ b/Forums/AbyssForum/crawler_selenium.py @@ -30,19 +30,19 @@ baseURL = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion # Opens Tor Browser, crawls the website def startCrawling(): - opentor() - # forumName = getForumName() - driver = getAccess() + # opentor() + forumName = getForumName() + # driver = getAccess() + # + # if driver != 'down': + # try: + # login(driver) + # crawlForum(driver) + # except Exception as e: + # print(driver.current_url, e) + # closetor(driver) - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closetor(driver) - - # new_parse(forumName, baseURL, False) + new_parse(forumName, baseURL, True) # Opens Tor Browser @@ -260,7 +260,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling AbyssForum forum done sucessfully. 
Press ENTER to continue\n") + print("Crawling the AbyssForum forum done.") # Returns 'True' if the link is Topic link diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py index 1ae6d8a..6eb813e 100644 --- a/Forums/Altenens/crawler_selenium.py +++ b/Forums/Altenens/crawler_selenium.py @@ -30,19 +30,19 @@ baseURL = 'https://altenens.is/' # Opens Tor Browser, crawls the website def startCrawling(): - opentor() + # opentor() forumName = getForumName() - driver = getAccess() + # driver = getAccess() + # + # if driver != 'down': + # try: + # login(driver) + # crawlForum(driver) + # except Exception as e: + # print(driver.current_url, e) + # closetor(driver) - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closetor(driver) - - new_parse(forumName, baseURL, False) + new_parse(forumName, baseURL, True) # Opens Tor Browser diff --git a/Forums/Altenens/parser.py b/Forums/Altenens/parser.py index 71fb34d..19155d5 100644 --- a/Forums/Altenens/parser.py +++ b/Forums/Altenens/parser.py @@ -9,9 +9,8 @@ import re # Here, we are importing BeautifulSoup to search through the HTML tree from bs4 import BeautifulSoup -# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) - +# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) def altenens_description_parser(soup): topic = "-1" # 0 *topic name @@ -58,7 +57,7 @@ def altenens_description_parser(soup): if inner is not None: inner = inner.strip() else: - inner = "-1" + inner = "" # cannot use -1 because the post is hidden unless you reply post.append(cleanString(inner)) feedback.append("-1") @@ -75,8 +74,8 @@ def altenens_description_parser(soup): return row -# This is the method to parse the Listing Pages (one page with many posts) +# This is the method to parse the Listing Pages (one page with many posts) def altenens_listing_parser(soup): nm = 
0 # *this variable should receive the number of topics diff --git a/Forums/BestCardingWorld/crawler_selenium.py b/Forums/BestCardingWorld/crawler_selenium.py index 8eaa261..7e35381 100644 --- a/Forums/BestCardingWorld/crawler_selenium.py +++ b/Forums/BestCardingWorld/crawler_selenium.py @@ -38,7 +38,7 @@ def startCrawling(): # print(driver.current_url, e) # closetor(driver) - new_parse(forumName, False) + new_parse(forumName, baseURL, True) # Opens Tor Browser diff --git a/Forums/Cardingleaks/crawler_selenium.py b/Forums/Cardingleaks/crawler_selenium.py index 83a9ef3..de8dd0b 100644 --- a/Forums/Cardingleaks/crawler_selenium.py +++ b/Forums/Cardingleaks/crawler_selenium.py @@ -4,7 +4,7 @@ __author__ = 'DarkWeb' Cardingleaks Forum Crawler (Selenium) Crawler updated and fixed -The site has this thing sometime whereyou'll have to look at a new post everyday. makes sure +The site has this thing sometime where you'll have to look at a new post everyday. makes sure you login first before crawling. 
''' @@ -27,7 +27,7 @@ from Forums.Cardingleaks.parser import cardingleaks_links_parser from Forums.Utilities.utilities import cleanHTML counter = 1 -baseURL = 'https://cardingleaks.ws/' +baseURL = 'https://leaks.ws/' # Opens Tor Browser, crawls the website @@ -35,7 +35,7 @@ def startCrawling(): # opentor() forumName = getForumName() # driver = getAccess() - + # # if driver != 'down': # try: # login(driver) @@ -44,7 +44,7 @@ def startCrawling(): # print(driver.current_url, e) # closetor(driver) - new_parse(forumName, baseURL, False) + new_parse(forumName, baseURL, True) # Opens Tor Browser @@ -96,7 +96,7 @@ def getForumName() -> str: # Return the link of the website def getFixedURL(): - url = 'https://cardingleaks.ws/' + url = 'https://leaks.ws/' return url @@ -194,23 +194,23 @@ def getInterestedLinks(): links = [] # # carding methods - links.append('https://cardingleaks.ws/forums/carding-methods.82/') + links.append('https://leaks.ws/forums/carding-methods.82/') # # carding schools - # links.append('https://cardingleaks.ws/forums/help-desk-carding-school.35/') + # links.append('https://leaks.ws/forums/help-desk-carding-school.35/') # # carding discussion - # links.append('https://cardingleaks.ws/forums/carding-discussion-desk.58/') + # links.append('https://leaks.ws/forums/carding-discussion-desk.58/') # # carding tutorials - # links.append('https://cardingleaks.ws/forums/carding-tutorials.13/') + # links.append('https://leaks.ws/forums/carding-tutorials.13/') # # carding tools and software - # links.append('https://cardingleaks.ws/forums/carding-tools-softwares.10/') + # links.append('https://leaks.ws/forums/carding-tools-softwares.10/') # # exploits and cracking tools - # links.append('https://cardingleaks.ws/forums/exploits-cracking-tools.22/') + # links.append('https://leaks.ws/forums/exploits-cracking-tools.22/') return links def crawlForum(driver): - print("Crawling the Cardinglinks forum") + print("Crawling the Cardingleaks forum") linksToCrawl = 
getInterestedLinks() @@ -245,7 +245,7 @@ def crawlForum(driver): savePage(driver.page_source, topic + f"page{counter}") # very important # comment out - if counter == 5: + if counter == 2: break try: @@ -264,7 +264,7 @@ def crawlForum(driver): break # comment out - if count == 10: + if count == 1: break try: @@ -280,7 +280,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling Cardingleaksforum done successfully. Press ENTER to continue\n") + print("Crawling the Cardingleaks forum done.") # Returns 'True' if the link is Topic link, may need to change for every website diff --git a/Forums/CryptBB/crawler_selenium.py b/Forums/CryptBB/crawler_selenium.py index a0ad16d..bdc964c 100644 --- a/Forums/CryptBB/crawler_selenium.py +++ b/Forums/CryptBB/crawler_selenium.py @@ -28,19 +28,19 @@ baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion # Opens Tor Browser, crawls the website def startCrawling(): - opentor() + # opentor() forumName = getForumName() - driver = getAccess() - - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closetor(driver) + # driver = getAccess() + # + # if driver != 'down': + # try: + # login(driver) + # crawlForum(driver) + # except Exception as e: + # print(driver.current_url, e) + # closetor(driver) - new_parse(forumName, baseURL, False) + new_parse(forumName, baseURL, True) # Opens Tor Browser @@ -238,8 +238,6 @@ def getInterestedLinks(): def crawlForum(driver): print("Crawling the CryptBB forum") - print("Crawling the CryptBB forum") - linksToCrawl = getInterestedLinks() i = 0 @@ -273,7 +271,7 @@ def crawlForum(driver): savePage(driver.page_source, topic + f"page{counter}") # very important # comment out - if counter == 10: + if counter == 2: break try: @@ -291,10 +289,10 @@ def crawlForum(driver): driver.back() # comment out - # break + break # comment out - if count == 20: + if count == 1: break try: @@ -312,7 +310,7 @@ def 
crawlForum(driver): print(link, e) i += 1 - input("Crawling CrypttBB done successfully. Press ENTER to continue\n") + print("Crawling the CryptBB forum done.") # Returns 'True' if the link is Topic link, may need to change for every website diff --git a/Forums/HiddenAnswers/crawler_selenium.py b/Forums/HiddenAnswers/crawler_selenium.py index bb73764..46e445e 100644 --- a/Forums/HiddenAnswers/crawler_selenium.py +++ b/Forums/HiddenAnswers/crawler_selenium.py @@ -30,19 +30,19 @@ baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion # Opens Tor Browser, crawls the website def startCrawling(): - opentor() - # forumName = getForumName() - driver: webdriver.Firefox = getAccess() + # opentor() + forumName = getForumName() + # driver: webdriver.Firefox = getAccess() + # + # if driver != 'down': + # try: + # login(driver) + # crawlForum(driver) + # except Exception as e: + # print(driver.current_url, e) + # closetor(driver) - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closetor(driver) - - # new_parse(forumName, baseURL, False) + new_parse(forumName, baseURL, True) # Opens Tor Browser @@ -219,8 +219,8 @@ def crawlForum(driver: webdriver.Firefox): savePage(driver.page_source, topic + f"page{counter}") # very important # comment out - # if counter == 2: - # break + if counter == 2: + break try: page = "" # no next page so far may have some later on @@ -235,11 +235,11 @@ def crawlForum(driver: webdriver.Firefox): driver.back() # comment out - # break + break # comment out - # if count == 1: - # break + if count == 1: + break try: link = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href') @@ -255,7 +255,7 @@ def crawlForum(driver: webdriver.Firefox): print(link, e) i += 1 - input("Crawling HiddenAnswers done successfully. 
Press ENTER to continue\n") + print("Crawling the HiddenAnswers forum done.") # Returns 'True' if the link is Topic link diff --git a/Forums/Initialization/forumsList.txt b/Forums/Initialization/forumsList.txt index 304b5c0..3526771 100644 --- a/Forums/Initialization/forumsList.txt +++ b/Forums/Initialization/forumsList.txt @@ -1 +1,9 @@ -HiddenAnswers \ No newline at end of file +AbyssForum +Altenens +BestCardingWorld +Cardingleaks +CryptBB +HiddenAnswers +Libre +OnniForums +Procrax \ No newline at end of file diff --git a/Forums/Initialization/forums_mining.py b/Forums/Initialization/forums_mining.py index 883ac34..38a52c8 100644 --- a/Forums/Initialization/forums_mining.py +++ b/Forums/Initialization/forums_mining.py @@ -4,7 +4,6 @@ __author__ = 'DarkWeb' Starting point of the Darkweb Forums Mining ''' -import os from datetime import * from Forums.BestCardingWorld.crawler_selenium import crawler as crawlerBestCardingWorld from Forums.CryptBB.crawler_selenium import crawler as crawlerCryptBB @@ -17,7 +16,8 @@ from Forums.Altenens.crawler_selenium import crawler as crawlerAltenens from Forums.Libre.crawler_selenium import crawler as crawlerLibre import configparser -import time +import os +import subprocess config = configparser.ConfigParser() config.read('../../setup.ini') @@ -88,9 +88,22 @@ def createSubdirectories(pagesDir): os.mkdir(descReadDir) +# Opens Tor Browser +def opentor(): + global pid + print("Connecting Tor...") + pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) + pid = pro.pid + # time.sleep(7.5) + input('Press ENTER when Tor is connected to continue') + return + + # main method if __name__ == '__main__': + # opentor() + # assignment from forumsList.txt forumsList = getForums() @@ -98,10 +111,10 @@ if __name__ == '__main__': for forum in forumsList: forum = forum.replace('\n','') - print("Creating listing and description directories ... for " + forum) + print("\nCreating listing and description directories ... 
for " + forum) createDirectory(forum) - time.sleep(5) # wait for directories to be created - print("Directories created successfully.") + # time.sleep(5) # wait for directories to be created + print("Directories created.") if forum == "BestCardingWorld": crawlerBestCardingWorld() @@ -122,7 +135,7 @@ if __name__ == '__main__': elif forum == 'Libre': crawlerLibre() - print("Scraping process completed successfully!") + print("Scraping process completed!") diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index c9a50ae..10d5f0d 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -73,13 +73,11 @@ def persist_data(url, row, cur): forum = create_forum(cur, row, url) - board = create_board(cur, row, forum) + author = create_author(cur, row, forum) - author = create_user(cur, row, forum, 0) + topic = create_topic(cur, forum, row, author) - topic = create_topic(cur, row, forum, board, author) - - create_posts(cur, row, forum, board, topic) + create_posts(cur, row, forum, topic) def incrementError(): @@ -191,8 +189,9 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript if trace.find("already exists") == -1: incrementError() print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!") + traceback.print_exc() if createLog: - logFile.write(str(nError) + f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n") + logFile.write(str(nError) + f". 
There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n") return False else: return True @@ -223,7 +222,7 @@ def new_parse(forum, url, createLog): from Forums.Initialization.forums_mining import config, CURRENT_DATE - print("Parsing The " + forum + " Forum and conduct data classification to store the information in the database.") + print("Parsing the " + forum + " forum and conduct data classification to store the information in the database.") # Connecting to the database con = connectDataBase() @@ -261,6 +260,7 @@ def new_parse(forum, url, createLog): parseDescriptionError = False persistDescriptionError = False moveDescriptionError = False + findDescriptionError = False rw = [] @@ -272,6 +272,8 @@ def new_parse(forum, url, createLog): if doDescription: + nFound = 0 + for rec in rw: rec = rec.split(',') @@ -280,6 +282,9 @@ def new_parse(forum, url, createLog): # Reading the associated description Html Pages descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern)) + + nFound += len(descriptions) + for descriptionIndex, descriptionFile in enumerate(descriptions): print("Reading description folder of '" + forum + "', file '" + os.path.basename( @@ -331,7 +336,19 @@ def new_parse(forum, url, createLog): else: moveDescriptionError = True - if not (readDescriptionError or parseDescriptionError or persistDescriptionError or moveDescriptionError): + if not (nFound > 0): + + findDescriptionError = True + + incrementError() + print(f"There was a problem to locate the file(s) for {listingFile} in the Description section!") + if createLog: + logFile.write( + str(nError) + f". 
There was a problem to locate the file(s) for {listingFile}" + f" in the Description section!\n") + + if not (readDescriptionError or parseDescriptionError or persistDescriptionError + or moveDescriptionError or findDescriptionError): # move listing files of completed folder move_file(listingFile, createLog, logFile) diff --git a/Forums/Libre/crawler_selenium.py b/Forums/Libre/crawler_selenium.py index a2ba332..4697cda 100644 --- a/Forums/Libre/crawler_selenium.py +++ b/Forums/Libre/crawler_selenium.py @@ -31,7 +31,7 @@ def startCrawling(): # opentor() forumName = getForumName() # driver = getAccess() - + # # if driver != 'down': # try: # login(driver) @@ -40,7 +40,7 @@ def startCrawling(): # print(driver.current_url, e) # closetor(driver) - new_parse(forumName, baseURL, False) + new_parse(forumName, baseURL, True) # Opens Tor Browser @@ -275,7 +275,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling Libre done successfully. Press ENTER to continue\n") + print("Crawling the Libre forum done.") # Returns 'True' if the link is Topic link, may need to change for every website diff --git a/Forums/Libre/parser.py b/Forums/Libre/parser.py index c951ad5..7783661 100644 --- a/Forums/Libre/parser.py +++ b/Forums/Libre/parser.py @@ -9,11 +9,8 @@ import re # Here, we are importing BeautifulSoup to search through the HTML tree from bs4 import BeautifulSoup, ResultSet, Tag -# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) - - - +# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) def libre_description_parser(soup: Tag): # Fields to be parsed @@ -88,7 +85,6 @@ def libre_description_parser(soup: Tag): date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text date_time_cleaned = date_posted.replace(user_name, "")[3:-12] - print(date_time_cleaned) datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT") 
addDate.append(datetime_append) @@ -130,7 +126,6 @@ def libre_description_parser(soup: Tag): # This is the method to parse the Listing Pages (one page with many posts) - def libre_listing_parser(soup): nm = 0 # *this variable should receive the number of topics forum = "Libre" # 0 *forum name @@ -217,6 +212,7 @@ def libre_listing_parser(soup): addDate=addDate ) + def libre_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] diff --git a/Forums/OnniForums/crawler_selenium.py b/Forums/OnniForums/crawler_selenium.py index 447dd2e..a7d0c15 100644 --- a/Forums/OnniForums/crawler_selenium.py +++ b/Forums/OnniForums/crawler_selenium.py @@ -31,19 +31,19 @@ baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion # Opens Tor Browser, crawls the website def startCrawling(): - opentor() + # opentor() forumName = getForumName() - driver = getAccess() + # driver = getAccess() + # + # if driver != 'down': + # try: + # login(driver) + # crawlForum(driver) + # except Exception as e: + # print(driver.current_url, e) + # closetor(driver) - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closetor(driver) - - # new_parse(forum=forumName, url=baseURL, createLog=False) + new_parse(forum=forumName, url=baseURL, createLog=True) # Opens Tor Browser @@ -214,7 +214,7 @@ def getInterestedLinks(): def crawlForum(driver): - print("Crawling the OnniForums") + print("Crawling the OnniForums forum") linksToCrawl = getInterestedLinks() @@ -288,7 +288,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling OnniForums done successfully. 
Press ENTER to continue\n") + print("Crawling the OnniForums forum done.") # Returns 'True' if the link is Topic link diff --git a/Forums/Procrax/crawler_selenium.py b/Forums/Procrax/crawler_selenium.py index 9d37eae..fc54a30 100644 --- a/Forums/Procrax/crawler_selenium.py +++ b/Forums/Procrax/crawler_selenium.py @@ -34,7 +34,7 @@ FORUM_NAME = 'Procrax' def startCrawling(): # opentor() # driver = getAccess() - + # # if driver != 'down': # try: # login(driver) @@ -46,7 +46,7 @@ def startCrawling(): new_parse( forum=FORUM_NAME, url=BASE_URL, - createLog=False + createLog=True ) @@ -190,9 +190,9 @@ def getInterestedLinks(): # # general hacking links.append('https://procrax.cx/forums/general-hacking.24/') # # hacking security tools - links.append('https://procrax.cx/forums/hacking-security-tools.20/') + # links.append('https://procrax.cx/forums/hacking-security-tools.20/') # # hacktube - links.append('https://procrax.cx/forums/hacktube.22/') + # links.append('https://procrax.cx/forums/hacktube.22/') # # cardable # links.append('https://procrax.cx/forums/cardable-websites.28/') # # tools @@ -205,7 +205,7 @@ def getInterestedLinks(): def crawlForum(driver): - print("Crawling the Procrax") + print("Crawling the Procrax forum") linksToCrawl = getInterestedLinks() @@ -240,8 +240,8 @@ def crawlForum(driver): savePage(driver.page_source, topic + f"page{counter}") # very important # comment out - # if counter == 2: - # break + if counter == 2: + break try: page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') @@ -257,10 +257,10 @@ def crawlForum(driver): driver.back() # comment out - # break + break # comment out - if count == 20: + if count == 1: break try: @@ -278,7 +278,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling Procrax done successfully. Press ENTER to continue\n") + print("Crawling the Procrax forum done.") # Returns 'True' if the link is Topic link, may need to change for every website