|
|
@ -191,86 +191,66 @@ def crawlForum(driver): |
|
|
|
print("Crawling the AbyssForum forum") |
|
|
|
|
|
|
|
linksToCrawl = getInterestedLinks() |
|
|
|
visited = set(linksToCrawl) |
|
|
|
initialTime = time.time() |
|
|
|
|
|
|
|
i = 0 |
|
|
|
count = 0 |
|
|
|
while i < len(linksToCrawl): |
|
|
|
link = linksToCrawl[i] |
|
|
|
print('Crawling :', link) |
|
|
|
try: |
|
|
|
try: |
|
|
|
driver.get(link) |
|
|
|
except: |
|
|
|
driver.refresh() |
|
|
|
html = driver.page_source |
|
|
|
savePage(html, link) |
|
|
|
|
|
|
|
has_next_page = True |
|
|
|
count = 0 |
|
|
|
|
|
|
|
while has_next_page: |
|
|
|
list = topicPages(html) |
|
|
|
for item in list: |
|
|
|
itemURL = urlparse.urljoin(baseURL, str(item)) |
|
|
|
try: |
|
|
|
driver.get(itemURL) |
|
|
|
except: |
|
|
|
driver.refresh() |
|
|
|
savePage(driver.page_source, item) |
|
|
|
driver.back() |
|
|
|
''' |
|
|
|
#variable to check if there is a next page for the topic |
|
|
|
try: |
|
|
|
driver.get(link) |
|
|
|
except: |
|
|
|
driver.refresh() |
|
|
|
html = driver.page_source |
|
|
|
savePage(html, link) |
|
|
|
|
|
|
|
topics = topicPages(html) |
|
|
|
for topic in topics: |
|
|
|
has_next_topic_page = True |
|
|
|
counter = 1 |
|
|
|
page = topic |
|
|
|
|
|
|
|
# check if there is a next page for the topics |
|
|
|
while has_next_topic_page: |
|
|
|
| # try to access next page of the topic |
|
|
|
itemURL = urlparse.urljoin(baseURL, str(item)) |
|
|
|
itemURL = urlparse.urljoin(baseURL, str(page)) |
|
|
|
try: |
|
|
|
driver.get(itemURL) |
|
|
|
except: |
|
|
|
driver.refresh() |
|
|
|
savePage(driver.page_source, item) |
|
|
|
savePage(driver.page_source, topic + f"page{counter}") |
|
|
|
|
|
|
|
# if there is a next page then go and save.... |
|
|
|
# next page in the topic? |
|
|
|
try: |
|
|
|
temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') # /html/body/div/div[2]/div/div[2]/div/ |
|
|
|
item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div |
|
|
|
# comment out |
|
|
|
if counter == 2: |
|
|
|
break |
|
|
|
|
|
|
|
try: |
|
|
|
temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/div[3]') |
|
|
|
item = temp.find_element(by=By.CLASS_NAME, value='button button-icon-only').get_attribute('href') |
|
|
|
if item == "": |
|
|
|
raise NoSuchElementException |
|
|
|
has_next_topic_page = False |
|
|
|
else: |
|
|
|
counter += 1 |
|
|
|
counter += 1 |
|
|
|
|
|
|
|
except NoSuchElementException: |
|
|
|
has_next_topic_page = False |
|
|
|
|
|
|
|
# end of loop |
|
|
|
for i in range(counter): |
|
|
|
driver.back() |
|
|
|
''' |
|
|
|
|
|
|
|
# comment out |
|
|
|
break |
|
|
|
|
|
|
|
# comment out |
|
|
|
if count == 1: |
|
|
|
count = 0 |
|
|
|
break |
|
|
|
|
|
|
|
try: |
|
|
|
link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href') |
|
|
|
|
|
|
|
if link == "": |
|
|
|
raise NoSuchElementException |
|
|
|
try: |
|
|
|
driver.get(link) |
|
|
|
except: |
|
|
|
driver.refresh() |
|
|
|
html = driver.page_source |
|
|
|
savePage(html, link) |
|
|
|
count += 1 |
|
|
|
|
|
|
|
except NoSuchElementException: |
|
|
@ -280,10 +260,6 @@ def crawlForum(driver): |
|
|
|
print(link, e) |
|
|
|
i += 1 |
|
|
|
|
|
|
|
# finalTime = time.time() |
|
|
|
# print finalTime - initialTime |
|
|
|
|
|
|
|
|
|
|
|
input("Crawling AbyssForum forum done sucessfully. Press ENTER to continue\n") |
|
|
|
|
|
|
|
|
|
|
|