From 89fb117115634202a209bc14635a2c4d46d0073d Mon Sep 17 00:00:00 2001 From: Khoi Date: Mon, 24 Jul 2023 11:04:01 -0700 Subject: [PATCH] fixed pagination issues with hiddenanswers crawler --- Forums/HiddenAnswers/crawler_selenium.py | 26 ++++++++++++++++-------- Forums/Initialization/forumsList.txt | 2 +- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/Forums/HiddenAnswers/crawler_selenium.py b/Forums/HiddenAnswers/crawler_selenium.py index 54e4a05..bb73764 100644 --- a/Forums/HiddenAnswers/crawler_selenium.py +++ b/Forums/HiddenAnswers/crawler_selenium.py @@ -32,7 +32,7 @@ baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion def startCrawling(): opentor() # forumName = getForumName() - driver = getAccess() + driver: webdriver.Firefox = getAccess() if driver != 'down': try: @@ -170,12 +170,20 @@ def getInterestedLinks(): links = [] # Hacks - links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/hacking') + # links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/hacking') + + # links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/darknet-and-tor') + + # links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/internet') + + links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/links') + + return links -def crawlForum(driver): +def crawlForum(driver: webdriver.Firefox): print("Crawling the HiddenAnswers forum") linksToCrawl = getInterestedLinks() @@ -211,8 +219,8 @@ def crawlForum(driver): savePage(driver.page_source, topic + f"page{counter}") # very important # comment out - if counter == 2: - break + # if counter == 2: + # break try: page = "" # no next page so far may have some later on @@ -227,14 +235,14 @@ def crawlForum(driver): driver.back() # comment out - break + # break # comment out - if count == 1: - break + # if count == 1: + # break try: - link = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div/div[3]/div[3]/ul/li[7]/a').get_attribute('href') + link = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href') if link == "": raise NoSuchElementException diff --git a/Forums/Initialization/forumsList.txt b/Forums/Initialization/forumsList.txt index 3010d1b..304b5c0 100644 --- a/Forums/Initialization/forumsList.txt +++ b/Forums/Initialization/forumsList.txt @@ -1 +1 @@ -Libre \ No newline at end of file +HiddenAnswers \ No newline at end of file