Browse Source

forum crawler logic updates

main
westernmeadow 1 year ago
parent
commit
ec6530bfb2
9 changed files with 88 additions and 37 deletions
  1. +11
    -5
      Forums/AbyssForum/crawler_selenium.py
  2. +6
    -2
      Forums/Altenens/crawler_selenium.py
  3. +11
    -5
      Forums/BestCardingWorld/crawler_selenium.py
  4. +16
    -13
      Forums/Cardingleaks/crawler_selenium.py
  5. +10
    -3
      Forums/CryptBB/crawler_selenium.py
  6. +9
    -2
      Forums/HiddenAnswers/crawler_selenium.py
  7. +10
    -3
      Forums/Libre/crawler_selenium.py
  8. +5
    -1
      Forums/OnniForums/crawler_selenium.py
  9. +10
    -3
      Forums/Procrax/crawler_selenium.py

+ 11
- 5
Forums/AbyssForum/crawler_selenium.py View File

@ -208,7 +208,11 @@ def crawlForum(driver):
driver.get(itemURL) driver.get(itemURL)
except: except:
driver.refresh() driver.refresh()
savePage(driver, driver.page_source, topic + f"page{counter}")
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# # comment out # # comment out
# if counter == 2: # if counter == 2:
@ -225,9 +229,11 @@ def crawlForum(driver):
except NoSuchElementException: except NoSuchElementException:
has_next_topic_page = False has_next_topic_page = False
# end of loop
for j in range(counter):
driver.back()
# making sure we go back to the listing page (browser back button simulation)
try:
driver.get(link)
except:
driver.refresh()
# # comment out # # comment out
# break # break
@ -263,7 +269,7 @@ def isDescriptionLink(url):
# Returns True if the link is a listingPage link # Returns True if the link is a listingPage link
def isListingLink(url): def isListingLink(url):
if 'viewforum' in url:
if '.onion/viewforum' in url:
return True return True
return False return False


+ 6
- 2
Forums/Altenens/crawler_selenium.py View File

@ -225,7 +225,11 @@ def crawlForum(driver):
driver.get(itemURL) driver.get(itemURL)
except: except:
driver.refresh() driver.refresh()
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# # comment out # # comment out
# if counter == 2: # if counter == 2:
@ -277,7 +281,7 @@ def isDescriptionLink(url):
# Returns True if the link is a listingPage link, may need to change for every website # Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url): def isListingLink(url):
if 'forums' in url:
if '.is/forums' in url:
return True return True
return False return False


+ 11
- 5
Forums/BestCardingWorld/crawler_selenium.py View File

@ -217,7 +217,11 @@ def crawlForum(driver):
driver.get(itemURL) driver.get(itemURL)
except: except:
driver.refresh() driver.refresh()
savePage(driver, driver.page_source, topic + f"page{counter}")
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# comment out # comment out
if counter == 2: if counter == 2:
@ -234,9 +238,11 @@ def crawlForum(driver):
except NoSuchElementException: except NoSuchElementException:
has_next_topic_page = False has_next_topic_page = False
# end of loop
for j in range(counter):
driver.back()
# making sure we go back to the listing page (browser back button simulation)
try:
driver.get(link)
except:
driver.refresh()
# comment out # comment out
# break # break
@ -276,7 +282,7 @@ def isDescriptionLink(url):
#@param: url of any url crawled #@param: url of any url crawled
#return: true if is a Listing page, false if not #return: true if is a Listing page, false if not
def isListingLink(url): def isListingLink(url):
if 'forum' in url:
if '.onion/viewforum' in url:
return True return True
return False return False


+ 16
- 13
Forums/Cardingleaks/crawler_selenium.py View File

@ -183,16 +183,16 @@ def getInterestedLinks():
# carding methods # carding methods
links.append('https://leaks.ws/forums/carding-methods.82/') links.append('https://leaks.ws/forums/carding-methods.82/')
# carding schools
links.append('https://leaks.ws/forums/help-desk-carding-school.35/')
# carding discussion
links.append('https://leaks.ws/forums/carding-discussion-desk.58/')
# carding tutorials
links.append('https://leaks.ws/forums/carding-tutorials.13/')
# carding tools and software
links.append('https://leaks.ws/forums/carding-tools-softwares.10/')
# exploits and cracking tools
links.append('https://leaks.ws/forums/exploits-cracking-tools.22/')
# # carding schools
# links.append('https://leaks.ws/forums/help-desk-carding-school.35/')
# # carding discussion
# links.append('https://leaks.ws/forums/carding-discussion-desk.58/')
# # carding tutorials
# links.append('https://leaks.ws/forums/carding-tutorials.13/')
# # carding tools and software
# links.append('https://leaks.ws/forums/carding-tools-softwares.10/')
# # exploits and cracking tools
# links.append('https://leaks.ws/forums/exploits-cracking-tools.22/')
return links return links
@ -249,8 +249,11 @@ def crawlForum(driver):
except NoSuchElementException: except NoSuchElementException:
has_next_topic_page = False has_next_topic_page = False
for j in range(counter):
driver.back()
# making sure we go back to the listing page (browser back button simulation)
try:
driver.get(link)
except:
driver.refresh()
# # comment out # # comment out
# break # break
@ -284,7 +287,7 @@ def isDescriptionLink(url):
# Returns True if the link is a listingPage link, may need to change for every website # Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url): def isListingLink(url):
if 'forums' in url:
if '.ws/forums' in url:
return True return True
return False return False


+ 10
- 3
Forums/CryptBB/crawler_selenium.py View File

@ -254,6 +254,10 @@ def crawlForum(driver):
driver.get(itemURL) driver.get(itemURL)
except: except:
driver.refresh() driver.refresh()
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# # comment out # # comment out
@ -271,8 +275,11 @@ def crawlForum(driver):
except NoSuchElementException: except NoSuchElementException:
has_next_topic_page = False has_next_topic_page = False
for j in range(counter):
driver.back()
# making sure we go back to the listing page (browser back button simulation)
try:
driver.get(link)
except:
driver.refresh()
# # comment out # # comment out
# break # break
@ -308,7 +315,7 @@ def isDescriptionLink(url):
# Returns True if the link is a listingPage link, may need to change for every website # Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url): def isListingLink(url):
if 'forum' in url:
if '.onion/forumdisplay' in url:
return True return True
return False return False


+ 9
- 2
Forums/HiddenAnswers/crawler_selenium.py View File

@ -208,6 +208,10 @@ def crawlForum(driver: webdriver.Firefox):
driver.get(itemURL) driver.get(itemURL)
except: except:
driver.refresh() driver.refresh()
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# # comment out # # comment out
@ -223,8 +227,11 @@ def crawlForum(driver: webdriver.Firefox):
except NoSuchElementException: except NoSuchElementException:
has_next_topic_page = False has_next_topic_page = False
for j in range(counter):
driver.back()
# making sure we go back to the listing page (browser back button simulation)
try:
driver.get(link)
except:
driver.refresh()
# # comment out # # comment out
# break # break


+ 10
- 3
Forums/Libre/crawler_selenium.py View File

@ -228,6 +228,10 @@ def crawlForum(driver):
driver.get(itemURL) driver.get(itemURL)
except: except:
driver.refresh() driver.refresh()
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# # comment out # # comment out
@ -243,8 +247,11 @@ def crawlForum(driver):
except NoSuchElementException: except NoSuchElementException:
has_next_topic_page = False has_next_topic_page = False
for j in range(counter):
driver.back()
# making sure we go back to the listing page (browser back button simulation)
try:
driver.get(link)
except:
driver.refresh()
# # comment out # # comment out
# break # break
@ -279,7 +286,7 @@ def isDescriptionLink(url):
# Returns True if the link is a listingPage link, may need to change for every website # Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url): def isListingLink(url):
if '/c/' in url:
if '.onion/c' in url:
return True return True
return False return False


+ 5
- 1
Forums/OnniForums/crawler_selenium.py View File

@ -232,6 +232,10 @@ def crawlForum(driver):
driver.get(itemURL) driver.get(itemURL)
except: except:
driver.refresh() driver.refresh()
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# # comment out # # comment out
@ -289,7 +293,7 @@ def isDescriptionLink(url):
# Returns True if the link is a listingPage link # Returns True if the link is a listingPage link
def isListingLink(url): def isListingLink(url):
if 'Forum' in url:
if '.onion/Forum' in url:
return True return True
return False return False


+ 10
- 3
Forums/Procrax/crawler_selenium.py View File

@ -244,6 +244,10 @@ def crawlForum(driver):
driver.get(itemURL) driver.get(itemURL)
except: except:
driver.refresh() driver.refresh()
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# # comment out # # comment out
@ -260,8 +264,11 @@ def crawlForum(driver):
except NoSuchElementException: except NoSuchElementException:
has_next_topic_page = False has_next_topic_page = False
for j in range(counter):
driver.back()
# making sure we go back to the listing page (browser back button simulation)
try:
driver.get(link)
except:
driver.refresh()
# # comment out # # comment out
# break # break
@ -297,7 +304,7 @@ def isDescriptionLink(url):
# Returns True if the link is a listingPage link, may need to change for every website # Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url): def isListingLink(url):
if 'forums' in url:
if '.cx/forums' in url:
return True return True
return False return False


Loading…
Cancel
Save