From b3188f47167123dd7af9334ac5c4c261d0def8d6 Mon Sep 17 00:00:00 2001 From: Helium Date: Fri, 14 Jul 2023 13:28:25 -0700 Subject: [PATCH] updated all forum crawlers --- Forums/Altenens/crawler_selenium.py | 4 +- Forums/Cardingleaks/crawler_selenium.py | 72 ++---- Forums/CryptBB/crawler_selenium.py | 70 ++---- Forums/HiddenAnswers/crawler_selenium.py | 73 ++---- Forums/Initialization/forumsList.txt | 2 +- Forums/Initialization/forums_mining.py | 3 + Forums/Initialization/geckodriver.log | 247 ++++++++++++++++++++ Forums/Libre/crawler_selenium.py | 107 ++++----- Forums/OnniForums/crawler_selenium.py | 81 +++---- Forums/Procrax/crawler_selenium.py | 82 +++---- MarketPlaces/Initialization/marketsList.txt | 2 +- MarketPlaces/M00nkeyMarket/parser.py | 135 +++++------ 12 files changed, 479 insertions(+), 399 deletions(-) diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py index 736022c..c9edd9d 100644 --- a/Forums/Altenens/crawler_selenium.py +++ b/Forums/Altenens/crawler_selenium.py @@ -199,7 +199,7 @@ def getInterestedLinks(): return links - +# newest version of crawling def crawlForum(driver): print("Crawling the Altenens forum") @@ -233,7 +233,7 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, topic + f"page{counter}") + savePage(driver.page_source, topic + f"page{counter}") # very important # comment out if counter == 2: diff --git a/Forums/Cardingleaks/crawler_selenium.py b/Forums/Cardingleaks/crawler_selenium.py index 4d41368..835024e 100644 --- a/Forums/Cardingleaks/crawler_selenium.py +++ b/Forums/Cardingleaks/crawler_selenium.py @@ -2,7 +2,7 @@ __author__ = 'DarkWeb' ''' Cardingleaks Forum Crawler (Selenium) -FIXED +Crawler updated and fixed ''' from selenium import webdriver @@ -207,67 +207,53 @@ def getInterestedLinks(): def crawlForum(driver): - print("Crawling the Cardingleaks forum") + print("Crawling the Cardinglinks forum") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() i = 0 - count = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 while has_next_page: - list = topicPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver.page_source, item) - driver.back() - - #variable to check if there is a next page for the topic + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + topics = topicPages(html) + for topic in topics: has_next_topic_page = True counter = 1 + page = topic - # check if there is a next page for the topics while has_next_topic_page: - # try to access next page of th topic - itemURL = urlparse.urljoin(baseURL, str(item)) + itemURL = urlparse.urljoin(baseURL, str(page)) try: driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, item) + savePage(driver.page_source, topic + f"page{counter}") # very important - # if there is a next page then go and save.... - # Spec - try: - # temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div[1]/div[1]/div/nav/div[1]') # /html/body/div/div[2]/div/div[2]/div/ - item = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div + # comment out + if counter == 2: + break - if item == "": + try: + page = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') + if page == "": raise NoSuchElementException - else: - counter += 1 + counter += 1 except NoSuchElementException: has_next_topic_page = False - # end of loop for i in range(counter): driver.back() @@ -276,21 +262,12 @@ def crawlForum(driver): # comment out if count == 1: - count = 0 break try: - # temp = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div/div[1]/div/nav/div[1]') link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') - if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -300,10 +277,7 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - - input("Crawling Cardingleaks forum done successfully. Press ENTER to continue\n") + input("Crawling Cardingleaksforum done successfully. Press ENTER to continue\n") # Returns 'True' if the link is Topic link, may need to change for every website diff --git a/Forums/CryptBB/crawler_selenium.py b/Forums/CryptBB/crawler_selenium.py index 11c44de..c69bd6a 100644 --- a/Forums/CryptBB/crawler_selenium.py +++ b/Forums/CryptBB/crawler_selenium.py @@ -238,65 +238,55 @@ def getInterestedLinks(): def crawlForum(driver): print("Crawling the CryptBB forum") + print("Crawling the CryptBB forum") + linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() i = 0 - count = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 while has_next_page: - list = topicPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver.page_source, item) - driver.back() - - #variable to check if there is a next page for the topic + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + topics = topicPages(html) + for topic in topics: has_next_topic_page = True counter = 1 + page = topic - # check if there is a next page for the topics while has_next_topic_page: - # try to access next page of th topic - itemURL = urlparse.urljoin(baseURL, str(item)) + itemURL = urlparse.urljoin(baseURL, str(page)) try: driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, item) + savePage(driver.page_source, topic + f"page{counter}") # very important + + # comment out + if counter == 2: + break - # if there is a next page then go and save.... - # next page in the topic? try: - temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') # /html/body/div/div[2]/div/div[2]/div/ - item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div + temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') + page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') - if item == "": + if page == "": raise NoSuchElementException - else: - counter += 1 + counter += 1 except NoSuchElementException: has_next_topic_page = False - # end of loop for i in range(counter): driver.back() @@ -305,21 +295,14 @@ def crawlForum(driver): # comment out if count == 1: - count = 0 break try: - temp = driver.find_element(by=By.XPATH, value = '/html/body/div/div[2]/div/div[2]/div') + temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div') link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -329,10 +312,7 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - - input("Crawling CryptBB forum done successfully. Press ENTER to continue\n") + input("Crawling CrypttBB done successfully. Press ENTER to continue\n") # Returns 'True' if the link is Topic link, may need to change for every website diff --git a/Forums/HiddenAnswers/crawler_selenium.py b/Forums/HiddenAnswers/crawler_selenium.py index 66085a3..54e4a05 100644 --- a/Forums/HiddenAnswers/crawler_selenium.py +++ b/Forums/HiddenAnswers/crawler_selenium.py @@ -179,86 +179,65 @@ def crawlForum(driver): print("Crawling the HiddenAnswers forum") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() i = 0 - count = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 while has_next_page: - list = topicPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver.page_source, item) - driver.back() - ''' - #variable to check if there is a next page for the topic + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + topics = topicPages(html) + for topic in topics: has_next_topic_page = True counter = 1 + page = topic - # check if there is a next page for the topics while has_next_topic_page: - # try to access next page of th topic - itemURL = urlparse.urljoin(baseURL, str(item)) + itemURL = urlparse.urljoin(baseURL, str(page)) try: driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, item) + savePage(driver.page_source, topic + f"page{counter}") # very important - # if there is a next page then go and save.... - # next page in the topic? - try: - temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') # /html/body/div/div[2]/div/div[2]/div/ - item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div + # comment out + if counter == 2: + break - if item == "": + try: + page = "" # no next page so far may have some later on + if page == "": raise NoSuchElementException - has_next_topic_page = False - else: - counter += 1 + counter += 1 + except NoSuchElementException: has_next_topic_page = False - # end of loop for i in range(counter): driver.back() - ''' + # comment out break # comment out if count == 1: - count = 0 break try: - link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div/div[3]/div[3]/ul/li[7]/a').get_attribute('href') + link = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div/div[3]/div[3]/ul/li[7]/a').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -268,11 +247,7 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - - - input("Crawling HiddenAnswers forum done sucessfully. Press ENTER to continue\n") + input("Crawling HiddenAnswers done successfully. Press ENTER to continue\n") # Returns 'True' if the link is Topic link diff --git a/Forums/Initialization/forumsList.txt b/Forums/Initialization/forumsList.txt index 801a104..d3ba91a 100644 --- a/Forums/Initialization/forumsList.txt +++ b/Forums/Initialization/forumsList.txt @@ -1 +1 @@ -Altenens \ No newline at end of file +Procrax \ No newline at end of file diff --git a/Forums/Initialization/forums_mining.py b/Forums/Initialization/forums_mining.py index 53e27d4..5fcf17e 100644 --- a/Forums/Initialization/forums_mining.py +++ b/Forums/Initialization/forums_mining.py @@ -14,6 +14,7 @@ from Forums.Procrax.crawler_selenium import crawler as crawlerProcraxForum from Forums.HiddenAnswers.crawler_selenium import crawler as crawlerHiddenAnswers from Forums.Cardingleaks.crawler_selenium import crawler as crawlerCardingleaks from Forums.Altenens.crawler_selenium import crawler as crawlerAltenens +from Forums.Libre.crawler_selenium import crawler as crawlerLibre import configparser import time @@ -119,6 +120,8 @@ if __name__ == '__main__': crawlerCardingleaks() elif forum == 'Altenens': crawlerAltenens() + elif forum == 'Libre': + crawlerLibre() diff --git a/Forums/Initialization/geckodriver.log b/Forums/Initialization/geckodriver.log index 80a0c5a..bfb039a 100644 --- a/Forums/Initialization/geckodriver.log +++ b/Forums/Initialization/geckodriver.log @@ -10951,3 +10951,250 @@ unwatchForTargets()@TargetList.jsm:37 destructor()@TargetList.jsm:109 stop()@CDP.jsm:104 close()@RemoteAgent.jsm:138 +1689363209615 geckodriver INFO Listening on 127.0.0.1:60532 +1689363216981 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "60533" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofile278pEs" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689363219049 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:60533/devtools/browser/8c990d4b-44eb-425d-b226-b8d4c1cffc2d +1689363224682 Marionette INFO Listening on port 60540 +1689363225068 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: , line 0: NotFoundError: No such JSWindowActor 'MarionetteEvents' +JavaScript error: , line 0: NotFoundError: No such JSWindowActor 'MarionetteEvents' +1689363820376 Marionette INFO Stopped listening on port 60540 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofile278pEs\thumbnails) because it does not exist +[Parent 5080, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167 +1689363820593 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 + resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:60789/devtools/browser/8539d316-2b33-4477-9e35-2f9e6eab09b6 +1689363569998 Marionette INFO Listening on port 60796 +1689363570244 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/, line 2: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 2: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 5: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 9: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86, line 3: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 6: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 19: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 25: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628&page=2, line 6: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628&page=2, line 19: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628&page=2, line 25: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 6: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 19: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 25: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86, line 3: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86&page=2, line 3: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=16778, line 6: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=16778, line 19: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=16778, line 25: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86&page=2, line 3: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86, line 3: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined +1689363752505 Marionette INFO Stopped listening on port 60796 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilecgBCTA\thumbnails) because it does not exist + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +Crash Annotation GraphicsCriticalError: |[C0][GFX1-]: Receive IPC close with reason=AbnormalShutdown (t=1346.28) +###!!! [Child][MessageChannel] Error: (msgtype=0x3900E5,name=PContent::Msg_GraphicsError) Channel closing: too late to send/recv, messages will be lost + +[GFX1-]: Receive IPC close with reason=AbnormalShutdown +1689363753315 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689364130030 geckodriver INFO Listening on 127.0.0.1:61129 +1689364135033 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "61130" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileZXcPSi" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689364136375 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:61130/devtools/browser/d0a00e7f-efab-4092-ba43-3afb5ec55bcc +1689364140122 Marionette INFO Listening on port 61138 +1689364140225 RemoteAgent WARN TLS certificate errors will be ignored for this session +1689364164357 Marionette INFO Stopped listening on port 61138 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileZXcPSi\thumbnails) because it does not exist +[Parent 5336, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167 +[Parent 5336, IPC I/O Parent] WARNING: pipe error: 232: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/chrome/common/ipc_channel_win.cc:544 +1689364165253 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689364952139 geckodriver INFO Listening on 127.0.0.1:61327 +1689364958550 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "61328" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileeX31Bg" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689364960322 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:61328/devtools/browser/d98ca77f-1ca8-49c2-b3d0-7c98e39d55e8 +1689364964835 Marionette INFO Listening on port 61336 +1689364965449 RemoteAgent WARN TLS certificate errors will be ignored for this session +1689365065931 Marionette INFO Stopped listening on port 61336 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileeX31Bg\thumbnails) because it does not exist + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1689365066887 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689365596202 geckodriver INFO Listening on 127.0.0.1:61665 +1689365603047 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "61666" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofilegVxGn8" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689365604946 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:61666/devtools/browser/3f945d28-11cd-436c-832e-2085f8bb57e1 +1689365609901 Marionette INFO Listening on port 61676 +1689365610315 RemoteAgent WARN TLS certificate errors will be ignored for this session +1689365827541 Marionette INFO Stopped listening on port 61676 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished +[Parent 7204, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167 +1689365828066 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689366358424 geckodriver INFO Listening on 127.0.0.1:62059 +1689366363521 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "62060" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileSRNF4S" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689366364862 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:62060/devtools/browser/38410e90-6408-4c6e-a78a-4d8c6dabe5f5 +1689366368448 Marionette INFO Listening on port 62067 + +###!!! [Child][MessageChannel] Error: (msgtype=0x390097,name=PContent::Msg_InitBackground) Channel closing: too late to send/recv, messages will be lost + + +###!!! [Child][MessageChannel] Error: (msgtype=0x390097,name=PContent::Msg_InitBackground) Channel closing: too late to send/recv, messages will be lost + +1689366368939 RemoteAgent WARN TLS certificate errors will be ignored for this session +1689366462907 Marionette INFO Stopped listening on port 62067 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] + +###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished +1689366464131 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 diff --git a/Forums/Libre/crawler_selenium.py b/Forums/Libre/crawler_selenium.py index 59cea94..dfef8db 100644 --- a/Forums/Libre/crawler_selenium.py +++ b/Forums/Libre/crawler_selenium.py @@ -62,16 +62,14 @@ def login(driver): input('Press enter when CAPTCHA is completed, and you\'re at the login page') #entering username and password into input boxes - usernameBox = driver.find_element(by=By.NAME, value='login') + usernameBox = driver.find_element(by=By.NAME, value='username') #Username here usernameBox.send_keys('ct1234')#sends string to the username box passwordBox = driver.find_element(by=By.NAME, value='password') #Password here passwordBox.send_keys('r5o0wqmw')# sends string to passwordBox - login = driver.find_element(by=By.CLASS_NAME, value='block-container') - login_link = login.find_element(by=By.TAG_NAME, value='button') - login_link.click() + input("Press the login button and solve the CAPTCHA then press enter\n") # input('input') @@ -209,87 +207,65 @@ def crawlForum(driver): print("Crawling the Libre forum") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() i = 0 - count = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 while has_next_page: - list = topicPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver.page_source, item) - driver.back() - - #variable to check if there is a next page for the topic - # has_next_topic_page = True - # counter = 1 - - # # check if there is a next page for the topics - # while has_next_topic_page: - # # try to access next page of th topic - # itemURL = urlparse.urljoin(baseURL, str(item)) - # try: - # driver.get(itemURL) - # except: - # driver.refresh() - # savePage(driver.page_source, item) - # - # # if there is a next page then go and save.... - # # Spec - # try: - # # temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div[1]/div[1]/div/nav/div[1]') # /html/body/div/div[2]/div/div[2]/div/ - # item = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div - # - # if item == "": - # raise NoSuchElementException - # else: - # counter += 1 - # - # except NoSuchElementException: - # has_next_topic_page = False - # - # # end of loop - # for i in range(counter): - # driver.back() + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + topics = topicPages(html) + for topic in topics: + has_next_topic_page = True + counter = 1 + page = topic + + while has_next_topic_page: + itemURL = urlparse.urljoin(baseURL, str(page)) + try: + driver.get(itemURL) + except: + driver.refresh() + savePage(driver.page_source, topic + f"page{counter}") # very important + + # comment out + if counter == 2: + break + + try: + page = "" # no next page so far may have some later on + if page == "": + raise NoSuchElementException + counter += 1 + + except NoSuchElementException: + has_next_topic_page = False + + for i in range(counter): + driver.back() # comment out break # comment out if count == 1: - count = 0 break try: - # temp = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div/div[1]/div/nav/div[1]') link = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -299,10 +275,7 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - - input("Crawling Libre forum done successfully. Press ENTER to continue\n") + input("Crawling Libre done successfully. Press ENTER to continue\n") # Returns 'True' if the link is Topic link, may need to change for every website diff --git a/Forums/OnniForums/crawler_selenium.py b/Forums/OnniForums/crawler_selenium.py index 35824a2..447dd2e 100644 --- a/Forums/OnniForums/crawler_selenium.py +++ b/Forums/OnniForums/crawler_selenium.py @@ -214,92 +214,71 @@ def getInterestedLinks(): def crawlForum(driver): - print("Crawling the OnniForums forum") + print("Crawling the OnniForums") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() i = 0 - count = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: - list = topicPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver.page_source, item) - - #next page for topic - # variable to check if there is a next page for the topic + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + topics = topicPages(html) + for topic in topics: has_next_topic_page = True counter = 1 + page = topic - # check if there is a next page for the topics while has_next_topic_page: - # try to access next page of th topic - itemURL = urlparse.urljoin(baseURL, str(item)) + itemURL = urlparse.urljoin(baseURL, str(page)) try: driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, item) + savePage(driver.page_source, topic + f"page{counter}") # very important + + # comment out + if counter == 2: + break - # if there is a next page then go and save.... - # next page in the topic? try: - temp = driver.find_element(By.XPATH, - '/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[2]/div/ - item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute( - 'href') # /html/body/div/div[2]/div/div[2]/div + temp = driver.find_element(By.XPATH,'/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[2]/div/ + page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') # /html/body/div/div[2]/div/div[2]/div - if item == "": + if page == "": raise NoSuchElementException - has_next_topic_page = False - else: - counter += 1 + counter += 1 + except NoSuchElementException: has_next_topic_page = False - # end of loop for i in range(counter): driver.back() - # comment out, one topic per page + # comment out break - # comment out, go through all pages + # comment out if count == 1: - count = 0 break try: - temp = driver.find_element(by=By.XPATH, value= - '/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[3]/div + temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[3]/div link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -309,11 +288,7 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - - - input("Crawling OnniForums forum done sucessfully. Press ENTER to continue\n") + input("Crawling OnniForums done successfully. Press ENTER to continue\n") # Returns 'True' if the link is Topic link diff --git a/Forums/Procrax/crawler_selenium.py b/Forums/Procrax/crawler_selenium.py index 5bb8a7a..99985b5 100644 --- a/Forums/Procrax/crawler_selenium.py +++ b/Forums/Procrax/crawler_selenium.py @@ -202,83 +202,70 @@ def getInterestedLinks(): def crawlForum(driver): - print("Crawling the Procrax forum") + print("Crawling the Procrax") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() i = 0 - count = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link)# open - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 - #loop through the topics while has_next_page: - list = topicPages(html)# for multiple pages - for item in list: - #variable to check if there is a next page for the topic + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + topics = topicPages(html) + for topic in topics: has_next_topic_page = True counter = 1 + page = topic - # check if there is a next page for the topics while has_next_topic_page: - # try to access next page of th topic - itemURL = urlparse.urljoin(baseURL, str(item)) + itemURL = urlparse.urljoin(baseURL, str(page)) try: driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, item) + savePage(driver.page_source, topic + f"page{counter}") # very important + + # comment out + if counter == 2: + break - # if there is a next page then go and save.... - # specific try: - # temp = driver.find_element(By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div/div') - item = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') + page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') - if item == "": + if page == "": raise NoSuchElementException - has_next_topic_page = False - else: - counter += 1 + counter += 1 + except NoSuchElementException: has_next_topic_page = False - #end of loop for i in range(counter): driver.back() - # # comment out - # break - # - # # comment out - # if count == 1: - # count = 0 - # break - - try:# change depending on web page, #general - # /html/body/div[1]/div[3]/div[2]/div[3]/div/div/div/div[1]/div/nav/div[1] - # temp = driver.find_element(By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div/div/div/div[1]/div/nav/div[1]') + + # comment out + break + + # comment out + if count == 1: + break + + try: + link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -288,10 +275,7 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - - input("Crawling Procrax forum done successfully. Press ENTER to continue\n") + input("Crawling Procrax done successfully. Press ENTER to continue\n") # Returns 'True' if the link is Topic link, may need to change for every website diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt index ebb406e..19c6d10 100644 --- a/MarketPlaces/Initialization/marketsList.txt +++ b/MarketPlaces/Initialization/marketsList.txt @@ -1 +1 @@ -AnonymousMarketplace +M00nkeyMarket diff --git a/MarketPlaces/M00nkeyMarket/parser.py b/MarketPlaces/M00nkeyMarket/parser.py index 8417dd5..db54c4b 100644 --- a/MarketPlaces/M00nkeyMarket/parser.py +++ b/MarketPlaces/M00nkeyMarket/parser.py @@ -35,50 +35,33 @@ def m00nkey_description_parser(soup): shipTo = "-1" # 18 Product_ShippedTo #vendor name - try: - temp = soup.find('div', {'class': 'box rounded mb-0'}).find('a').text - vendor = (cleanString(temp.strip())) - except: - print("Error in vendor") + temp = soup.find('div', {'class': 'box rounded mb-0'}).find('a').text + vendor = (cleanString(temp.strip())) #successful transaction - try: - temp = soup.findAll('div', {'class','text-center text-truncate column-flex ml-1 mr-1'}) #card sidebar-menu mb-4 card sidebar-menu mb-4 - temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'}) - temp = temp2[1].text - success = (cleanString(temp.strip())) - except: - print("Error in successful") - sucess = "-1" + temp = soup.findAll('div', {'class','text-center text-truncate column-flex ml-1 mr-1'}) #card sidebar-menu mb-4 card sidebar-menu mb-4 + temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'}) + temp = temp2[1].text + success = (cleanString(temp.strip())) + #vendor rating 5 - try: - temp = soup.findAll('div', {'class', 'text-center text-truncate column-flex ml-1 mr-1'}) # card sidebar-menu mb-4 card sidebar-menu mb-4 - temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'}) - temp = temp2[5].text - rating_vendor = (cleanString(temp.strip())) - except: - print("Error in vendor rating") - rating_vendor = "-1" + temp = soup.findAll('div', {'class', 'text-center text-truncate column-flex ml-1 mr-1'}) # card sidebar-menu mb-4 card sidebar-menu mb-4 + temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'}) + temp = temp2[5].text + rating_vendor = (cleanString(temp.strip())) # product name - try: - temp = soup.find('h3', {'class', 'h3 rounded card-title'}).find('span').text - name = (cleanString(temp.strip())) - except: - print("Error in product name") - name = "-1" + temp = soup.find('h3', {'class', 'h3 rounded card-title'}).find('span').text + name = (cleanString(temp.strip())) + # product description - try: - describe = soup.find('div', {'class': "box rounded flex-fill"}).find('pre').text - if "\n" in describe: - describe = describe.replace("\n", " ") - describe = describe.replace("\r", " ") - describe = cleanString(describe.strip()) - except: - print("Product description") - describe = "-1" + describe = soup.find('div', {'class': "box rounded flex-fill"}).find('pre').text + if "\n" in describe: + describe = describe.replace("\n", " ") + describe = describe.replace("\r", " ") + describe = cleanString(describe.strip()) CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much @@ -90,14 +73,10 @@ def m00nkey_description_parser(soup): temp = temp2[1].text category = cleanString(temp.strip()) except: - try: - temp = soup.find('table', {'class', 'table table-hover'}) - temp2 = temp.find('tbody').find('tr').findAll('td') - temp = temp2[1].text - category = cleanString(temp.strip()) - except: - print('Product category') - category = "-1" + temp = soup.find('table', {'class', 'table table-hover'}) + temp2 = temp.find('tbody').find('tr').findAll('td') + temp = temp2[1].text + category = cleanString(temp.strip()) # product number of view try: @@ -107,47 +86,35 @@ def m00nkey_description_parser(soup): views = cleanString((temp.strip())) except: print('Product number of view') - views = "-1" + # views = "-1" reviews = "-1" # 9 Product_Number_Of_Reviews rating_item = "-1" # 10 Product_Rating addDate = "-1" # 11 Product_AddedDate #BTC selling price box box-rounded mt-2 - try: - temp = soup.find('div', {'class', 'box box-rounded mt-2'}) - temp2 = temp.findAll('i', {'class', 'float-right color-prices'}) - temp = temp2[1].text - BTC = cleanString((temp.strip())) - except: - print('Product BTC') - BTC = "-1" + temp = soup.find('div', {'class', 'box box-rounded mt-2'}) + temp2 = temp.findAll('i', {'class', 'float-right color-prices'}) + temp = temp2[1].text + BTC = cleanString((temp.strip())) # USD selling price - try: - temp = soup.find('div', {'class', 'box box-rounded mt-2'}) - temp2 = temp.findAll('center') - temp = temp2[1].find('i').text - if "$" in temp: - temp = temp.replace("$", "") - USD = cleanString((temp.strip())) - except: - print('Product USD') - USD = "-1" + temp = soup.find('div', {'class', 'box box-rounded mt-2'}) + temp2 = temp.findAll('center') + temp = temp2[1].find('i').text + if "$" in temp: + temp = temp.replace("$", "") + USD = cleanString((temp.strip())) EURO = "-1" # 14 Product_EURO_SellingPrice # product sold - try: - temp = soup.find('div', {'class', 'box rounded mb-0'}) # card sidebar-menu mb-4 card sidebar-menu mb-4 - temp2 = temp.find('i') - temp = temp2.text - sold = (cleanString(temp.strip())) - - except: - print("Error in successful") - sold = "-1" + temp = soup.find('div', {'class', 'box rounded mb-0'}) # card sidebar-menu mb-4 card sidebar-menu mb-4 + temp2 = temp.find('i') + temp = temp2.text + sold = (cleanString(temp.strip())) + # sold = "-1" # product quantatiy left ###ERRROR try: @@ -157,15 +124,12 @@ def m00nkey_description_parser(soup): temp = temp3[1].text left = cleanString(temp.strip()) except: - try: - temp = soup.find('table', {'class', 'table table-hover'}) - temp2 = temp.findAll('tr') - temp3 = temp2[1].findAll('td') - temp = temp3[1].text - left = cleanString(temp.strip()) - except: - print('Product quantity') - left = "-1" + temp = soup.find('table', {'class', 'table table-hover'}) + temp2 = temp.findAll('tr') + temp3 = temp2[1].findAll('td') + temp = temp3[1].text + left = cleanString(temp.strip()) + shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo @@ -229,20 +193,25 @@ def m00nkey_listing_parser(soup): temp = a.find('col-5 justify-content-between mx-auto').find('div').text success.append(cleanString(temp.strip())) except: - print('vendor') + print('successful transactions') # product name try: temp = a.find('card-title rounded text-truncate').find('a').text name.append(cleanString(temp.strip())) except: - print('vendor') + print('product name') CVE.append('-1') MS.append('-1') rating_vendor.append("-1") + try: + temp = a.findAll('btn btn-block btn-primary') + except: + print("Error in product category") + category = [] # 7 Product_Category y describe = [] # 8 Product_Description views = [] # 9 Product_Number_Of_Views