Browse Source

Fixed pagination issues with the HiddenAnswers crawler

main
Khoi 1 year ago
parent
commit
89fb117115
2 changed files with 18 additions and 10 deletions
  1. +17
    -9
      Forums/HiddenAnswers/crawler_selenium.py
  2. +1
    -1
      Forums/Initialization/forumsList.txt

+ 17
- 9
Forums/HiddenAnswers/crawler_selenium.py View File

@ -32,7 +32,7 @@ baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion
def startCrawling():
opentor()
# forumName = getForumName()
driver = getAccess()
driver: webdriver.Firefox = getAccess()
if driver != 'down':
try:
@ -170,12 +170,20 @@ def getInterestedLinks():
links = []
# Hacks
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/hacking')
# links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/hacking')
# links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/darknet-and-tor')
# links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/internet')
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/links')
return links
def crawlForum(driver):
def crawlForum(driver: webdriver.Firefox):
print("Crawling the HiddenAnswers forum")
linksToCrawl = getInterestedLinks()
@ -211,8 +219,8 @@ def crawlForum(driver):
savePage(driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
break
# if counter == 2:
# break
try:
page = "" # no next page so far may have some later on
@ -227,14 +235,14 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:
break
# if count == 1:
# break
try:
link = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div/div[3]/div[3]/ul/li[7]/a').get_attribute('href')
link = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href')
if link == "":
raise NoSuchElementException


+ 1
- 1
Forums/Initialization/forumsList.txt View File

@ -1 +1 @@
Libre
HiddenAnswers

Loading…
Cancel
Save