|
|
@@ -1,8 +1,7 @@ |
|
|
|
__author__ = 'Helium' |
|
|
|
|
|
|
|
''' |
|
|
|
Altenens Forum Crawler (Selenium); |
|
|
|
Untested due to CAPTCHAs and network blocking |
|
|
|
Altenens Forum Crawler (Selenium) |
|
|
|
''' |
|
|
|
|
|
|
|
from selenium import webdriver |
|
|
@@ -31,18 +30,18 @@ baseURL = 'https://altenens.is/' |
|
|
|
|
|
|
|
# Opens Tor Browser, crawls the website |
|
|
|
def startCrawling(): |
|
|
|
opentor() |
|
|
|
# opentor() |
|
|
|
forumName = getForumName() |
|
|
|
driver = getAccess() |
|
|
|
|
|
|
|
if driver != 'down': |
|
|
|
try: |
|
|
|
login(driver) |
|
|
|
crawlForum(driver) |
|
|
|
except Exception as e: |
|
|
|
print(driver.current_url, e) |
|
|
|
closetor(driver) |
|
|
|
|
|
|
|
# driver = getAccess() |
|
|
|
# |
|
|
|
# if driver != 'down': |
|
|
|
# try: |
|
|
|
# login(driver) |
|
|
|
# crawlForum(driver) |
|
|
|
# except Exception as e: |
|
|
|
# print(driver.current_url, e) |
|
|
|
# closetor(driver) |
|
|
|
# |
|
|
|
new_parse(forumName, baseURL, False) |
|
|
|
|
|
|
|
|
|
|
@@ -73,12 +72,12 @@ def login(driver): |
|
|
|
#Password here |
|
|
|
passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox |
|
|
|
|
|
|
|
input("Press ENTER when you complete the CAPTCHA and press login\n") |
|
|
|
input("Press ENTER when CAPTCHA is completed\n") |
|
|
|
|
|
|
|
# wait for listing page show up (This Xpath may need to change based on different seed url) |
|
|
|
# wait for 50 sec until id = tab_content is found, then cont |
|
|
|
# WebDriverWait(driver, 50).until(EC.visibility_of_element_located( |
|
|
|
# (By.XPATH, '/html/body/div[1]/div[4]/div/div/div[3]/div/div/div[4]/div/div/div[1]/div/div[1]'))) |
|
|
|
WebDriverWait(driver, 50).until(EC.visibility_of_element_located( |
|
|
|
(By.XPATH, '/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]'))) |
|
|
|
|
|
|
|
|
|
|
|
# Returns the name of the website |
|
|
@@ -205,76 +204,64 @@ def crawlForum(driver): |
|
|
|
print("Crawling the Altenens forum") |
|
|
|
|
|
|
|
linksToCrawl = getInterestedLinks() |
|
|
|
visited = set(linksToCrawl) |
|
|
|
initialTime = time.time() |
|
|
|
|
|
|
|
i = 0 |
|
|
|
count = 0 |
|
|
|
while i < len(linksToCrawl): |
|
|
|
link = linksToCrawl[i] |
|
|
|
print('Crawling :', link) |
|
|
|
try: |
|
|
|
try: |
|
|
|
driver.get(link)# open |
|
|
|
except: |
|
|
|
driver.refresh() |
|
|
|
html = driver.page_source |
|
|
|
savePage(html, link) |
|
|
|
|
|
|
|
has_next_page = True |
|
|
|
count = 0 |
|
|
|
|
|
|
|
#loop through the topics |
|
|
|
while has_next_page: |
|
|
|
list = topicPages(html)# for multiple pages |
|
|
|
for item in list: |
|
|
|
#variable to check if there is a next page for the topic |
|
|
|
try: |
|
|
|
driver.get(link) |
|
|
|
except: |
|
|
|
driver.refresh() |
|
|
|
html = driver.page_source |
|
|
|
savePage(html, link) |
|
|
|
|
|
|
|
topics = topicPages(html) |
|
|
|
for topic in topics: |
|
|
|
has_next_topic_page = True |
|
|
|
counter = 1 |
|
|
|
page = topic |
|
|
|
|
|
|
|
# check if there is a next page for the topics |
|
|
|
while has_next_topic_page: |
|
|
|
# try to access next page of th topic |
|
|
|
itemURL = urlparse.urljoin(baseURL, str(item)) |
|
|
|
itemURL = urlparse.urljoin(baseURL, str(page)) |
|
|
|
try: |
|
|
|
driver.get(itemURL) |
|
|
|
except: |
|
|
|
driver.refresh() |
|
|
|
savePage(driver.page_source, item) |
|
|
|
savePage(driver.page_source, topic + f"page{counter}") |
|
|
|
|
|
|
|
# if there is a next page then go and save.... |
|
|
|
# specific |
|
|
|
try: |
|
|
|
item = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') |
|
|
|
# comment out |
|
|
|
if counter == 2: |
|
|
|
break |
|
|
|
|
|
|
|
if item == "": |
|
|
|
try: |
|
|
|
page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') |
|
|
|
if page == "": |
|
|
|
raise NoSuchElementException |
|
|
|
has_next_topic_page = False |
|
|
|
else: |
|
|
|
counter += 1 |
|
|
|
counter += 1 |
|
|
|
|
|
|
|
except NoSuchElementException: |
|
|
|
has_next_topic_page = False |
|
|
|
|
|
|
|
#end of loop |
|
|
|
for i in range(counter): |
|
|
|
driver.back() |
|
|
|
|
|
|
|
# comment out |
|
|
|
break |
|
|
|
|
|
|
|
# comment out |
|
|
|
if count == 1: |
|
|
|
count = 0 |
|
|
|
break |
|
|
|
|
|
|
|
try:# change depending on web page, #next page |
|
|
|
try: |
|
|
|
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') |
|
|
|
if link == "": |
|
|
|
raise NoSuchElementException |
|
|
|
try: |
|
|
|
driver.get(link) |
|
|
|
except: |
|
|
|
driver.refresh() |
|
|
|
html = driver.page_source |
|
|
|
savePage(html, link) |
|
|
|
count += 1 |
|
|
|
|
|
|
|
except NoSuchElementException: |
|
|
@@ -284,9 +271,6 @@ def crawlForum(driver): |
|
|
|
print(link, e) |
|
|
|
i += 1 |
|
|
|
|
|
|
|
# finalTime = time.time() |
|
|
|
# print finalTime - initialTime |
|
|
|
|
|
|
|
input("Crawling Altenens forum done successfully. Press ENTER to continue\n") |
|
|
|
|
|
|
|
|
|
|
|