From a3ebc920bb3fc5ab4545759c13c8aad4c07dbf24 Mon Sep 17 00:00:00 2001 From: westernmeadow Date: Wed, 20 Sep 2023 23:25:39 -0700 Subject: [PATCH] finished fully running completed AbyssForum and Procrax --- Forums/AbyssForum/crawler_selenium.py | 49 ++++++++++--------- Forums/Initialization/prepare_parser.py | 5 ++ Forums/Procrax/crawler_selenium.py | 63 +++++++++++++++++-------- Forums/Procrax/parser.py | 25 ++++++---- 4 files changed, 91 insertions(+), 51 deletions(-) diff --git a/Forums/AbyssForum/crawler_selenium.py b/Forums/AbyssForum/crawler_selenium.py index 071abb0..03ceb06 100644 --- a/Forums/AbyssForum/crawler_selenium.py +++ b/Forums/AbyssForum/crawler_selenium.py @@ -158,19 +158,19 @@ def getInterestedLinks(): links = [] # Hacked Database - # links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=26') + links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=26') # Hire a Hacker links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=27') - # # Hacking Tools - # links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=28') - # # Carding Forums - # links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=30') - # # Social Media Hacking - # links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=32') - # # Hacking Tutorials - # links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=12') - # # Cracking Tutorials - # links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=13') + # Hacking Tools + links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=28') + # Carding Forums + links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=30') + # Social Media Hacking + links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=32') + # Hacking Tutorials + links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=12') + # Cracking Tutorials + links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=13') return links @@ -210,13 +210,14 @@ def crawlForum(driver): driver.refresh() savePage(driver, driver.page_source, topic + f"page{counter}") - # comment out - if counter == 2: - break + # # comment out + # if counter == 2: + # break try: - temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/div[3]') - page = temp.find_element(by=By.CLASS_NAME, value='button button-icon-only').get_attribute('href') + temp = driver.find_element(By.CLASS_NAME, 'pagination') + temp = temp.find_element(by=By.CLASS_NAME, value='next') + page = temp.find_element(by=By.CLASS_NAME, value='button').get_attribute('href') if page == "": raise NoSuchElementException counter += 1 @@ -228,15 +229,17 @@ def crawlForum(driver): for j in range(counter): driver.back() - # comment out - # break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - link = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href') + temp = driver.find_element(By.CLASS_NAME, 'pagination') + temp = temp.find_element(by=By.CLASS_NAME, value='next') + link = temp.find_element(by=By.CLASS_NAME, value='button').get_attribute('href') if link == "": raise NoSuchElementException count += 1 diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index 3d2f388..79b79a7 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -15,6 +15,7 @@ from Forums.Altenens.parser import * from Forums.Procrax.parser import * from Forums.Libre.parser import * from Forums.HiddenAnswers.parser import * +from Forums.AbyssForum.parser import * from Forums.Classifier.classify_product import predict # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi @@ -137,6 +138,8 @@ def parse_listing(forum, listingFile, soup, createLog, logFile): rw = libre_listing_parser(soup) elif forum == "HiddenAnswers": rw = HiddenAnswers_listing_parser(soup) + elif forum == "AbyssForum": + rw = abyssForums_listing_parser(soup) else: print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!") raise Exception @@ -174,6 +177,8 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile): rmm = libre_description_parser(soup) elif forum == "HiddenAnswers": rmm = HiddenAnswers_description_parser(soup) + elif forum == "AbyssForum": + rmm = abyssForums_description_parser(soup) else: print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!") raise Exception diff --git a/Forums/Procrax/crawler_selenium.py b/Forums/Procrax/crawler_selenium.py index 6a9474b..71eeb3d 100644 --- a/Forums/Procrax/crawler_selenium.py +++ b/Forums/Procrax/crawler_selenium.py @@ -171,19 +171,42 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # # general hacking + # verified sales + links.append('https://procrax.cx/forums/verified-sales-market.10/') + # unverified sales + links.append('https://procrax.cx/forums/unverified-sales-market.12/') + # combos + links.append('https://procrax.cx/forums/bases.79/') + # tools + links.append('https://procrax.cx/forums/tools.81/') + # configs + links.append('https://procrax.cx/forums/configs.82/') + # craxtube + links.append('https://procrax.cx/forums/craxtube.83/') + # general hacking links.append('https://procrax.cx/forums/general-hacking.24/') - # # hacking security tools - # links.append('https://procrax.cx/forums/hacking-security-tools.20/') - # # hacktube - # links.append('https://procrax.cx/forums/hacktube.22/') - # # cardable - # links.append('https://procrax.cx/forums/cardable-websites.28/') - # # tools - # links.append('https://procrax.cx/forums/tools-bots-validators.73/') + # hacking security tools + links.append('https://procrax.cx/forums/hacking-security-tools.20/') + # hacktube + links.append('https://procrax.cx/forums/hacktube.22/') + # cardingtube + links.append('https://procrax.cx/forums/cardingtube.26/') + # cardable + links.append('https://procrax.cx/forums/cardable-websites.28/') + # spam software + links.append('https://procrax.cx/forums/mailing.72/') + # spam tools + links.append('https://procrax.cx/forums/tools-bots-validators.73/') + # darknet news + links.append('https://procrax.cx/forums/darknet-news-articles.42/') + # links + links.append('https://procrax.cx/forums/darknet-markets-deep-onion-links.43/') + # courses + links.append('https://procrax.cx/forums/courses.59/') + # software + links.append('https://procrax.cx/forums/software.76/') # general forum - # links.append('https://procrax.cx/forums/forum-discussions-updates.7/') - + links.append('https://procrax.cx/forums/forum-discussions-updates.7/') return links @@ -223,9 +246,9 @@ def crawlForum(driver): driver.refresh() savePage(driver, driver.page_source, topic + f"page{counter}") # very important - # comment out - if counter == 2: - break + # # comment out + # if counter == 2: + # break try: page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') @@ -240,12 +263,12 @@ def crawlForum(driver): for j in range(counter): driver.back() - # comment out - # break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: diff --git a/Forums/Procrax/parser.py b/Forums/Procrax/parser.py index cb16271..dda0090 100644 --- a/Forums/Procrax/parser.py +++ b/Forums/Procrax/parser.py @@ -48,8 +48,12 @@ def procrax_description_parser(soup: Tag): user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text status.append(cleanString(user_status.strip())) - user_lvl = ipost.find("div", {"class": "afAwardLevel"}).text - reputation.append(cleanString(user_lvl.strip())) + user_lvl = ipost.find("div", {"class": "afAwardLevel"}) + if user_lvl is not None: + user_lvl = user_lvl.text + reputation.append(cleanString(user_lvl.strip())) + else: + reputation.append('-1') sign.append("-1") @@ -110,7 +114,11 @@ def procrax_listing_parser(soup: Tag): li = soup.find("h1", {"class": "p-title-value"}) board = cleanString(li.text.strip()) - threads_list: ResultSet[Tag] = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True}) + threads_list = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True}) + + sticky = soup.find("div", {"class": "structItemContainer-group structItemContainer-group--sticky"}) + if sticky is not None: + threads_list = sticky.find_all("div", {"data-author": True}) + threads_list nm = len(threads_list) @@ -134,19 +142,20 @@ def procrax_listing_parser(soup: Tag): author.append(cleanString(thread_author)) thread_views = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find('dd').text - thread_views = thread_views.lower().replace("k","000") - views.append(cleanString(thread_views.strip())) + thread_views = thread_views.lower().replace("k", "000") + thread_views = thread_views.lower().replace("m", "000000") + views.append(thread_views.strip()) thread_replies = thread.find("dl", {"class": "pairs pairs--justified"}).find('dd').text # All threads contain one topic post and reply posts - thread_total_posts = str(1 + int(thread_replies)) - posts.append(thread_total_posts) + thread_total_posts = thread_replies.lower().replace("k", "000") + posts.append(thread_total_posts.strip()) thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime") datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z") addDate.append(datetime_obj) - thread_link: str = thread.find("div", {"class": "structItem-title"}).find('a').get('href') + thread_link: str = thread.find("div", {"class": "structItem-title"}).find('a', {'class': ''}).get('href') href.append(thread_link)