|
|
@ -32,7 +32,7 @@ baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion |
|
|
|
def startCrawling(): |
|
|
|
opentor() |
|
|
|
# forumName = getForumName() |
|
|
|
driver = getAccess() |
|
|
|
driver: webdriver.Firefox = getAccess() |
|
|
|
|
|
|
|
if driver != 'down': |
|
|
|
try: |
|
|
@ -170,12 +170,20 @@ def getInterestedLinks(): |
|
|
|
links = [] |
|
|
|
|
|
|
|
# Hacks |
|
|
|
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/hacking') |
|
|
|
# links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/hacking') |
|
|
|
|
|
|
|
# links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/darknet-and-tor') |
|
|
|
|
|
|
|
# links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/internet') |
|
|
|
|
|
|
|
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/links') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return links |
|
|
|
|
|
|
|
|
|
|
|
def crawlForum(driver): |
|
|
|
def crawlForum(driver: webdriver.Firefox): |
|
|
|
print("Crawling the HiddenAnswers forum") |
|
|
|
|
|
|
|
linksToCrawl = getInterestedLinks() |
|
|
@ -211,8 +219,8 @@ def crawlForum(driver): |
|
|
|
savePage(driver.page_source, topic + f"page{counter}") # very important |
|
|
|
|
|
|
|
# comment out |
|
|
|
if counter == 2: |
|
|
|
break |
|
|
|
# if counter == 2: |
|
|
|
# break |
|
|
|
|
|
|
|
try: |
|
|
|
page = "" # no next page so far may have some later on |
|
|
@ -227,14 +235,14 @@ def crawlForum(driver): |
|
|
|
driver.back() |
|
|
|
|
|
|
|
# comment out |
|
|
|
break |
|
|
|
# break |
|
|
|
|
|
|
|
# comment out |
|
|
|
if count == 1: |
|
|
|
break |
|
|
|
# if count == 1: |
|
|
|
# break |
|
|
|
|
|
|
|
try: |
|
|
|
link = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div/div[3]/div[3]/ul/li[7]/a').get_attribute('href') |
|
|
|
link = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href') |
|
|
|
|
|
|
|
if link == "": |
|
|
|
raise NoSuchElementException |
|
|
|