From ec6530bfb272f47ba6ddf5920834066e93bbdffb Mon Sep 17 00:00:00 2001 From: westernmeadow Date: Tue, 26 Sep 2023 15:48:51 -0700 Subject: [PATCH] forum crawler logic updates --- Forums/AbyssForum/crawler_selenium.py | 16 ++++++++---- Forums/Altenens/crawler_selenium.py | 8 ++++-- Forums/BestCardingWorld/crawler_selenium.py | 16 ++++++++---- Forums/Cardingleaks/crawler_selenium.py | 29 ++++++++++++--------- Forums/CryptBB/crawler_selenium.py | 13 ++++++--- Forums/HiddenAnswers/crawler_selenium.py | 11 ++++++-- Forums/Libre/crawler_selenium.py | 13 ++++++--- Forums/OnniForums/crawler_selenium.py | 6 ++++- Forums/Procrax/crawler_selenium.py | 13 ++++++--- 9 files changed, 88 insertions(+), 37 deletions(-) diff --git a/Forums/AbyssForum/crawler_selenium.py b/Forums/AbyssForum/crawler_selenium.py index 03ceb06..27135f2 100644 --- a/Forums/AbyssForum/crawler_selenium.py +++ b/Forums/AbyssForum/crawler_selenium.py @@ -208,7 +208,11 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() - savePage(driver, driver.page_source, topic + f"page{counter}") + + if isListingLink(driver.current_url): + break + + savePage(driver, driver.page_source, topic + f"page{counter}") # very important # # comment out # if counter == 2: @@ -225,9 +229,11 @@ def crawlForum(driver): except NoSuchElementException: has_next_topic_page = False - # end of loop - for j in range(counter): - driver.back() + # making sure we go back to the listing page (browser back button simulation) + try: + driver.get(link) + except: + driver.refresh() # # comment out # break @@ -263,7 +269,7 @@ def isDescriptionLink(url): # Returns True if the link is a listingPage link def isListingLink(url): - if 'viewforum' in url: + if '.onion/viewforum' in url: return True return False diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py index c9c4907..4dfa963 100644 --- a/Forums/Altenens/crawler_selenium.py +++ b/Forums/Altenens/crawler_selenium.py @@ -225,7 +225,11 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() - savePage(driver, driver.page_source, topic + f"page{counter}") # very important + + if isListingLink(driver.current_url): + break + + savePage(driver, driver.page_source, topic + f"page{counter}") # very important # # comment out # if counter == 2: @@ -277,7 +281,7 @@ def isDescriptionLink(url): # Returns True if the link is a listingPage link, may need to change for every website def isListingLink(url): - if 'forums' in url: + if '.is/forums' in url: return True return False diff --git a/Forums/BestCardingWorld/crawler_selenium.py b/Forums/BestCardingWorld/crawler_selenium.py index 6c3bdc9..dd61e3b 100644 --- a/Forums/BestCardingWorld/crawler_selenium.py +++ b/Forums/BestCardingWorld/crawler_selenium.py @@ -217,7 +217,11 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() - savePage(driver, driver.page_source, topic + f"page{counter}") + + if isListingLink(driver.current_url): + break + + savePage(driver, driver.page_source, topic + f"page{counter}") # very important # comment out if counter == 2: @@ -234,9 +238,11 @@ def crawlForum(driver): except NoSuchElementException: has_next_topic_page = False - # end of loop - for j in range(counter): - driver.back() + # making sure we go back to the listing page (browser back button simulation) + try: + driver.get(link) + except: + driver.refresh() # comment out # break @@ -276,7 +282,7 @@ def isDescriptionLink(url): #@param: url of any url crawled #return: true if is a Listing page, false if not def isListingLink(url): - if 'forum' in url: + if '.onion/viewforum' in url: return True return False diff --git a/Forums/Cardingleaks/crawler_selenium.py b/Forums/Cardingleaks/crawler_selenium.py index 97296ca..0712956 100644 --- a/Forums/Cardingleaks/crawler_selenium.py +++ b/Forums/Cardingleaks/crawler_selenium.py @@ -183,16 +183,16 @@ def getInterestedLinks(): # carding methods links.append('https://leaks.ws/forums/carding-methods.82/') - # carding schools - links.append('https://leaks.ws/forums/help-desk-carding-school.35/') - # carding discussion - links.append('https://leaks.ws/forums/carding-discussion-desk.58/') - # carding tutorials - links.append('https://leaks.ws/forums/carding-tutorials.13/') - # carding tools and software - links.append('https://leaks.ws/forums/carding-tools-softwares.10/') - # exploits and cracking tools - links.append('https://leaks.ws/forums/exploits-cracking-tools.22/') + # # carding schools + # links.append('https://leaks.ws/forums/help-desk-carding-school.35/') + # # carding discussion + # links.append('https://leaks.ws/forums/carding-discussion-desk.58/') + # # carding tutorials + # links.append('https://leaks.ws/forums/carding-tutorials.13/') + # # carding tools and software + # links.append('https://leaks.ws/forums/carding-tools-softwares.10/') + # # exploits and cracking tools + # links.append('https://leaks.ws/forums/exploits-cracking-tools.22/') return links @@ -249,8 +249,11 @@ def crawlForum(driver): except NoSuchElementException: has_next_topic_page = False - for j in range(counter): - driver.back() + # making sure we go back to the listing page (browser back button simulation) + try: + driver.get(link) + except: + driver.refresh() # # comment out # break @@ -284,7 +287,7 @@ def isDescriptionLink(url): # Returns True if the link is a listingPage link, may need to change for every website def isListingLink(url): - if 'forums' in url: + if '.ws/forums' in url: return True return False diff --git a/Forums/CryptBB/crawler_selenium.py b/Forums/CryptBB/crawler_selenium.py index 40255ce..e48b193 100644 --- a/Forums/CryptBB/crawler_selenium.py +++ b/Forums/CryptBB/crawler_selenium.py @@ -254,6 +254,10 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() + + if isListingLink(driver.current_url): + break + savePage(driver, driver.page_source, topic + f"page{counter}") # very important # # comment out @@ -271,8 +275,11 @@ def crawlForum(driver): except NoSuchElementException: has_next_topic_page = False - for j in range(counter): - driver.back() + # making sure we go back to the listing page (browser back button simulation) + try: + driver.get(link) + except: + driver.refresh() # # comment out # break @@ -308,7 +315,7 @@ def isDescriptionLink(url): # Returns True if the link is a listingPage link, may need to change for every website def isListingLink(url): - if 'forum' in url: + if '.onion/forumdisplay' in url: return True return False diff --git a/Forums/HiddenAnswers/crawler_selenium.py b/Forums/HiddenAnswers/crawler_selenium.py index f369347..f972861 100644 --- a/Forums/HiddenAnswers/crawler_selenium.py +++ b/Forums/HiddenAnswers/crawler_selenium.py @@ -208,6 +208,10 @@ def crawlForum(driver: webdriver.Firefox): driver.get(itemURL) except: driver.refresh() + + if isListingLink(driver.current_url): + break + savePage(driver, driver.page_source, topic + f"page{counter}") # very important # # comment out @@ -223,8 +227,11 @@ def crawlForum(driver: webdriver.Firefox): except NoSuchElementException: has_next_topic_page = False - for j in range(counter): - driver.back() + # making sure we go back to the listing page (browser back button simulation) + try: + driver.get(link) + except: + driver.refresh() # # comment out # break diff --git a/Forums/Libre/crawler_selenium.py b/Forums/Libre/crawler_selenium.py index 6e8f43b..19a05aa 100644 --- a/Forums/Libre/crawler_selenium.py +++ b/Forums/Libre/crawler_selenium.py @@ -228,6 +228,10 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() + + if isListingLink(driver.current_url): + break + savePage(driver, driver.page_source, topic + f"page{counter}") # very important # # comment out @@ -243,8 +247,11 @@ def crawlForum(driver): except NoSuchElementException: has_next_topic_page = False - for j in range(counter): - driver.back() + # making sure we go back to the listing page (browser back button simulation) + try: + driver.get(link) + except: + driver.refresh() # # comment out # break @@ -279,7 +286,7 @@ def isDescriptionLink(url): # Returns True if the link is a listingPage link, may need to change for every website def isListingLink(url): - if '/c/' in url: + if '.onion/c' in url: return True return False diff --git a/Forums/OnniForums/crawler_selenium.py b/Forums/OnniForums/crawler_selenium.py index 84aeb3e..d801d29 100644 --- a/Forums/OnniForums/crawler_selenium.py +++ b/Forums/OnniForums/crawler_selenium.py @@ -232,6 +232,10 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() + + if isListingLink(driver.current_url): + break + savePage(driver, driver.page_source, topic + f"page{counter}") # very important # # comment out @@ -289,7 +293,7 @@ def isDescriptionLink(url): # Returns True if the link is a listingPage link def isListingLink(url): - if 'Forum' in url: + if '.onion/Forum' in url: return True return False diff --git a/Forums/Procrax/crawler_selenium.py b/Forums/Procrax/crawler_selenium.py index 71eeb3d..c12088a 100644 --- a/Forums/Procrax/crawler_selenium.py +++ b/Forums/Procrax/crawler_selenium.py @@ -244,6 +244,10 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() + + if isListingLink(driver.current_url): + break + savePage(driver, driver.page_source, topic + f"page{counter}") # very important # # comment out @@ -260,8 +264,11 @@ def crawlForum(driver): except NoSuchElementException: has_next_topic_page = False - for j in range(counter): - driver.back() + # making sure we go back to the listing page (browser back button simulation) + try: + driver.get(link) + except: + driver.refresh() # # comment out # break @@ -297,7 +304,7 @@ def isDescriptionLink(url): # Returns True if the link is a listingPage link, may need to change for every website def isListingLink(url): - if 'forums' in url: + if '.cx/forums' in url: return True return False