diff --git a/.gitignore b/.gitignore index fe0eeb1..dbe1559 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ /shelf/ .idea/workspace.xml selenium/geckodriver.exe +__pycache__ setup.ini *.html *.log diff --git a/Forums/AbyssForum/__pycache__/crawler_selenium.cpython-310.pyc b/Forums/AbyssForum/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index 5512e92..0000000 Binary files a/Forums/AbyssForum/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/Forums/AbyssForum/__pycache__/crawler_selenium.cpython-311.pyc b/Forums/AbyssForum/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index d08a6e5..0000000 Binary files a/Forums/AbyssForum/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/Forums/AbyssForum/__pycache__/parser.cpython-310.pyc b/Forums/AbyssForum/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index 7115900..0000000 Binary files a/Forums/AbyssForum/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/Forums/AbyssForum/__pycache__/parser.cpython-311.pyc b/Forums/AbyssForum/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index 55a8281..0000000 Binary files a/Forums/AbyssForum/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/Forums/AbyssForum/crawler_selenium.py b/Forums/AbyssForum/crawler_selenium.py index 573cd13..32678dc 100644 --- a/Forums/AbyssForum/crawler_selenium.py +++ b/Forums/AbyssForum/crawler_selenium.py @@ -191,86 +191,66 @@ def crawlForum(driver): print("Crawling the AbyssForum forum") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() i = 0 - count = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 while has_next_page: - list = topicPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver.page_source, item) - driver.back() - ''' - #variable to check if there is a next page for the topic + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + topics = topicPages(html) + for topic in topics: has_next_topic_page = True counter = 1 + page = topic - # check if there is a next page for the topics while has_next_topic_page: - # try to access next page of th topic - itemURL = urlparse.urljoin(baseURL, str(item)) + itemURL = urlparse.urljoin(baseURL, str(page)) try: driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, item) + savePage(driver.page_source, topic + f"page{counter}") - # if there is a next page then go and save.... - # next page in the topic? 
- try: - temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') # /html/body/div/div[2]/div/div[2]/div/ - item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div + # comment out + if counter == 2: + break + try: + temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/div[3]') + item = temp.find_element(by=By.CLASS_NAME, value='button button-icon-only').get_attribute('href') if item == "": raise NoSuchElementException - has_next_topic_page = False - else: - counter += 1 + counter += 1 + except NoSuchElementException: has_next_topic_page = False # end of loop for i in range(counter): driver.back() - ''' + # comment out break # comment out if count == 1: - count = 0 break try: link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href') - if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -280,10 +260,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - - input("Crawling AbyssForum forum done sucessfully. Press ENTER to continue\n") diff --git a/Forums/Altenens/__pycache__/crawler_selenium.cpython-310.pyc b/Forums/Altenens/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index 8cdd037..0000000 Binary files a/Forums/Altenens/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/Forums/Altenens/__pycache__/crawler_selenium.cpython-311.pyc b/Forums/Altenens/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index 38b5dbc..0000000 Binary files a/Forums/Altenens/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/Forums/Altenens/__pycache__/parser.cpython-310.pyc b/Forums/Altenens/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index b95b89c..0000000 Binary files a/Forums/Altenens/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/Forums/Altenens/__pycache__/parser.cpython-311.pyc b/Forums/Altenens/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index 3f1c5d5..0000000 Binary files a/Forums/Altenens/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py index 736022c..c9edd9d 100644 --- a/Forums/Altenens/crawler_selenium.py +++ b/Forums/Altenens/crawler_selenium.py @@ -199,7 +199,7 @@ def getInterestedLinks(): return links - +# newest version of crawling def crawlForum(driver): print("Crawling the Altenens forum") @@ -233,7 +233,7 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, topic + f"page{counter}") + savePage(driver.page_source, topic + f"page{counter}") # very important # comment out if counter == 2: diff --git a/Forums/Cardingleaks/crawler_selenium.py b/Forums/Cardingleaks/crawler_selenium.py index 4d41368..835024e 100644 --- a/Forums/Cardingleaks/crawler_selenium.py +++ b/Forums/Cardingleaks/crawler_selenium.py @@ -2,7 +2,7 @@ __author__ = 'DarkWeb' ''' Cardingleaks Forum Crawler (Selenium) -FIXED +Crawler updated and fixed ''' from selenium import webdriver @@ -207,67 +207,53 @@ def getInterestedLinks(): def crawlForum(driver): - print("Crawling the Cardingleaks forum") + print("Crawling the Cardinglinks forum") linksToCrawl = getInterestedLinks() 
- visited = set(linksToCrawl) - initialTime = time.time() i = 0 - count = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 while has_next_page: - list = topicPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver.page_source, item) - driver.back() - - #variable to check if there is a next page for the topic + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + topics = topicPages(html) + for topic in topics: has_next_topic_page = True counter = 1 + page = topic - # check if there is a next page for the topics while has_next_topic_page: - # try to access next page of th topic - itemURL = urlparse.urljoin(baseURL, str(item)) + itemURL = urlparse.urljoin(baseURL, str(page)) try: driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, item) + savePage(driver.page_source, topic + f"page{counter}") # very important - # if there is a next page then go and save.... - # Spec - try: - # temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div[1]/div[1]/div/nav/div[1]') # /html/body/div/div[2]/div/div[2]/div/ - item = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div + # comment out + if counter == 2: + break - if item == "": + try: + page = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') + if page == "": raise NoSuchElementException - else: - counter += 1 + counter += 1 except NoSuchElementException: has_next_topic_page = False - # end of loop for i in range(counter): driver.back() @@ -276,21 +262,12 @@ def crawlForum(driver): # comment out if count == 1: - count = 0 break try: - # temp = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div/div[1]/div/nav/div[1]') link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') - if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -300,10 +277,7 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - - input("Crawling Cardingleaks forum done successfully. Press ENTER to continue\n") + input("Crawling Cardingleaksforum done successfully. 
Press ENTER to continue\n") # Returns 'True' if the link is Topic link, may need to change for every website diff --git a/Forums/CryptBB/__pycache__/__init__.cpython-311.pyc b/Forums/CryptBB/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index c09fee6..0000000 Binary files a/Forums/CryptBB/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/Forums/CryptBB/__pycache__/crawler_selenium.cpython-310.pyc b/Forums/CryptBB/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index b64d5e7..0000000 Binary files a/Forums/CryptBB/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/Forums/CryptBB/__pycache__/crawler_selenium.cpython-311.pyc b/Forums/CryptBB/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index 3f5473f..0000000 Binary files a/Forums/CryptBB/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/Forums/CryptBB/__pycache__/parser.cpython-310.pyc b/Forums/CryptBB/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index 9086c35..0000000 Binary files a/Forums/CryptBB/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/Forums/CryptBB/__pycache__/parser.cpython-311.pyc b/Forums/CryptBB/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index dc04000..0000000 Binary files a/Forums/CryptBB/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/Forums/CryptBB/crawler_selenium.py b/Forums/CryptBB/crawler_selenium.py index 11c44de..c69bd6a 100644 --- a/Forums/CryptBB/crawler_selenium.py +++ b/Forums/CryptBB/crawler_selenium.py @@ -238,65 +238,55 @@ def getInterestedLinks(): def crawlForum(driver): print("Crawling the CryptBB forum") + print("Crawling the CryptBB forum") + linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() i = 0 - count = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 while has_next_page: - list = topicPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver.page_source, item) - driver.back() - - #variable to check if there is a next page for the topic + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + topics = topicPages(html) + for topic in topics: has_next_topic_page = True counter = 1 + page = topic - # check if there is a next page for the topics while has_next_topic_page: - # try to access next page of th topic - itemURL = urlparse.urljoin(baseURL, str(item)) + itemURL = urlparse.urljoin(baseURL, str(page)) try: driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, item) + savePage(driver.page_source, topic + f"page{counter}") # very important + + # comment out + if counter == 2: + break - # if there is a next page then go and save.... - # next page in the topic? 
try: - temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') # /html/body/div/div[2]/div/div[2]/div/ - item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div + temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') + page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') - if item == "": + if page == "": raise NoSuchElementException - else: - counter += 1 + counter += 1 except NoSuchElementException: has_next_topic_page = False - # end of loop for i in range(counter): driver.back() @@ -305,21 +295,14 @@ def crawlForum(driver): # comment out if count == 1: - count = 0 break try: - temp = driver.find_element(by=By.XPATH, value = '/html/body/div/div[2]/div/div[2]/div') + temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div') link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -329,10 +312,7 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - - input("Crawling CryptBB forum done successfully. Press ENTER to continue\n") + input("Crawling CrypttBB done successfully. Press ENTER to continue\n") # Returns 'True' if the link is Topic link, may need to change for every website diff --git a/Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-310.pyc b/Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index e28f1a2..0000000 Binary files a/Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-311.pyc b/Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index 80d392b..0000000 Binary files a/Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/Forums/HiddenAnswers/__pycache__/parser.cpython-310.pyc b/Forums/HiddenAnswers/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index 1444998..0000000 Binary files a/Forums/HiddenAnswers/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/Forums/HiddenAnswers/__pycache__/parser.cpython-311.pyc b/Forums/HiddenAnswers/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index d9c9fb5..0000000 Binary files a/Forums/HiddenAnswers/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/Forums/HiddenAnswers/crawler_selenium.py b/Forums/HiddenAnswers/crawler_selenium.py index 66085a3..54e4a05 100644 --- a/Forums/HiddenAnswers/crawler_selenium.py +++ b/Forums/HiddenAnswers/crawler_selenium.py @@ -179,86 +179,65 @@ def crawlForum(driver): print("Crawling the HiddenAnswers forum") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() i = 0 - count = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 while has_next_page: - list = topicPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver.page_source, item) - 
driver.back() - ''' - #variable to check if there is a next page for the topic + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + topics = topicPages(html) + for topic in topics: has_next_topic_page = True counter = 1 + page = topic - # check if there is a next page for the topics while has_next_topic_page: - # try to access next page of th topic - itemURL = urlparse.urljoin(baseURL, str(item)) + itemURL = urlparse.urljoin(baseURL, str(page)) try: driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, item) + savePage(driver.page_source, topic + f"page{counter}") # very important - # if there is a next page then go and save.... - # next page in the topic? - try: - temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') # /html/body/div/div[2]/div/div[2]/div/ - item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div + # comment out + if counter == 2: + break - if item == "": + try: + page = "" # no next page so far may have some later on + if page == "": raise NoSuchElementException - has_next_topic_page = False - else: - counter += 1 + counter += 1 + except NoSuchElementException: has_next_topic_page = False - # end of loop for i in range(counter): driver.back() - ''' + # comment out break # comment out if count == 1: - count = 0 break try: - link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div/div[3]/div[3]/ul/li[7]/a').get_attribute('href') + link = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div/div[3]/div[3]/ul/li[7]/a').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -268,11 +247,7 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - - - input("Crawling HiddenAnswers forum done sucessfully. Press ENTER to continue\n") + input("Crawling HiddenAnswers done successfully. 
Press ENTER to continue\n") # Returns 'True' if the link is Topic link diff --git a/Forums/Initialization/__pycache__/__init__.cpython-310.pyc b/Forums/Initialization/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 481f9a7..0000000 Binary files a/Forums/Initialization/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/Forums/Initialization/__pycache__/__init__.cpython-311.pyc b/Forums/Initialization/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index c6c10a4..0000000 Binary files a/Forums/Initialization/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/Forums/Initialization/__pycache__/forums_mining.cpython-310.pyc b/Forums/Initialization/__pycache__/forums_mining.cpython-310.pyc deleted file mode 100644 index f8f47ca..0000000 Binary files a/Forums/Initialization/__pycache__/forums_mining.cpython-310.pyc and /dev/null differ diff --git a/Forums/Initialization/__pycache__/forums_mining.cpython-311.pyc b/Forums/Initialization/__pycache__/forums_mining.cpython-311.pyc deleted file mode 100644 index 294aebc..0000000 Binary files a/Forums/Initialization/__pycache__/forums_mining.cpython-311.pyc and /dev/null differ diff --git a/Forums/Initialization/__pycache__/prepare_parser.cpython-310.pyc b/Forums/Initialization/__pycache__/prepare_parser.cpython-310.pyc deleted file mode 100644 index 158eea1..0000000 Binary files a/Forums/Initialization/__pycache__/prepare_parser.cpython-310.pyc and /dev/null differ diff --git a/Forums/Initialization/__pycache__/prepare_parser.cpython-311.pyc b/Forums/Initialization/__pycache__/prepare_parser.cpython-311.pyc deleted file mode 100644 index 121809c..0000000 Binary files a/Forums/Initialization/__pycache__/prepare_parser.cpython-311.pyc and /dev/null differ diff --git a/Forums/Initialization/forumsList.txt b/Forums/Initialization/forumsList.txt index 801a104..d3ba91a 100644 --- a/Forums/Initialization/forumsList.txt +++ b/Forums/Initialization/forumsList.txt @@ -1 +1 @@ -Altenens \ No newline at end of file +Procrax \ No newline at end of file diff --git a/Forums/Initialization/forums_mining.py b/Forums/Initialization/forums_mining.py index 53e27d4..6c76692 100644 --- a/Forums/Initialization/forums_mining.py +++ b/Forums/Initialization/forums_mining.py @@ -14,6 +14,7 @@ from Forums.Procrax.crawler_selenium import crawler as crawlerProcraxForum from Forums.HiddenAnswers.crawler_selenium import crawler as crawlerHiddenAnswers from Forums.Cardingleaks.crawler_selenium import crawler as crawlerCardingleaks from Forums.Altenens.crawler_selenium import crawler as crawlerAltenens +from Forums.Libre.crawler_selenium import crawler as crawlerLibre import configparser import time @@ -98,9 +99,9 @@ if __name__ == '__main__': forum = forum.replace('\n','') print("Creating listing and description directories ... for " + forum) - createDirectory(forum) - time.sleep(5) # wait for directories to be created - input("Directories created successfully. Press ENTER to continue\n") + # createDirectory(forum) + # time.sleep(5) # wait for directories to be created + # input("Directories created successfully. 
Press ENTER to continue\n") if forum == "BestCardingWorld": @@ -119,6 +120,8 @@ if __name__ == '__main__': crawlerCardingleaks() elif forum == 'Altenens': crawlerAltenens() + elif forum == 'Libre': + crawlerLibre() diff --git a/Forums/Initialization/geckodriver.log b/Forums/Initialization/geckodriver.log index 80a0c5a..8d4ccb3 100644 --- a/Forums/Initialization/geckodriver.log +++ b/Forums/Initialization/geckodriver.log @@ -10951,3 +10951,327 @@ unwatchForTargets()@TargetList.jsm:37 destructor()@TargetList.jsm:109 stop()@CDP.jsm:104 close()@RemoteAgent.jsm:138 +1689363209615 geckodriver INFO Listening on 127.0.0.1:60532 +1689363216981 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "60533" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofile278pEs" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689363219049 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:60533/devtools/browser/8c990d4b-44eb-425d-b226-b8d4c1cffc2d +1689363224682 Marionette INFO Listening on port 60540 +1689363225068 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: , line 0: NotFoundError: No such JSWindowActor 'MarionetteEvents' +JavaScript error: , line 0: NotFoundError: No such JSWindowActor 'MarionetteEvents' +1689363820376 Marionette INFO Stopped listening on port 60540 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofile278pEs\thumbnails) because it does not exist +[Parent 5080, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167 +1689363820593 RemoteAgent ERROR unable to stop listener: [Exception... 
"Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 + resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:60789/devtools/browser/8539d316-2b33-4477-9e35-2f9e6eab09b6 +1689363569998 Marionette INFO Listening on port 60796 +1689363570244 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/, line 2: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 2: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 5: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 9: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86, line 3: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 6: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 19: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 25: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628&page=2, line 6: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: 
http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628&page=2, line 19: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628&page=2, line 25: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 6: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 19: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 25: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86, line 3: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86&page=2, line 3: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=16778, line 6: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=16778, line 19: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=16778, line 25: ReferenceError: $ is not defined +JavaScript error: 
http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86&page=2, line 3: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86, line 3: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined +1689363752505 Marionette INFO Stopped listening on port 60796 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilecgBCTA\thumbnails) because it does not exist + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +Crash Annotation GraphicsCriticalError: |[C0][GFX1-]: Receive IPC close with reason=AbnormalShutdown (t=1346.28) +###!!! [Child][MessageChannel] Error: (msgtype=0x3900E5,name=PContent::Msg_GraphicsError) Channel closing: too late to send/recv, messages will be lost + +[GFX1-]: Receive IPC close with reason=AbnormalShutdown +1689363753315 RemoteAgent ERROR unable to stop listener: [Exception... 
"Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689364130030 geckodriver INFO Listening on 127.0.0.1:61129 +1689364135033 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "61130" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileZXcPSi" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689364136375 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:61130/devtools/browser/d0a00e7f-efab-4092-ba43-3afb5ec55bcc +1689364140122 Marionette INFO Listening on port 61138 +1689364140225 RemoteAgent WARN TLS certificate errors will be ignored for this session +1689364164357 Marionette INFO Stopped listening on port 61138 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileZXcPSi\thumbnails) because it does not exist +[Parent 5336, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167 +[Parent 5336, IPC I/O Parent] WARNING: pipe error: 232: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/chrome/common/ipc_channel_win.cc:544 +1689364165253 RemoteAgent ERROR unable to stop listener: [Exception... 
"Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689364952139 geckodriver INFO Listening on 127.0.0.1:61327 +1689364958550 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "61328" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileeX31Bg" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689364960322 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:61328/devtools/browser/d98ca77f-1ca8-49c2-b3d0-7c98e39d55e8 +1689364964835 Marionette INFO Listening on port 61336 +1689364965449 RemoteAgent WARN TLS certificate errors will be ignored for this session +1689365065931 Marionette INFO Stopped listening on port 61336 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileeX31Bg\thumbnails) because it does not exist + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1689365066887 RemoteAgent ERROR unable to stop listener: [Exception... 
"Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689365596202 geckodriver INFO Listening on 127.0.0.1:61665 +1689365603047 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "61666" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofilegVxGn8" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689365604946 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:61666/devtools/browser/3f945d28-11cd-436c-832e-2085f8bb57e1 +1689365609901 Marionette INFO Listening on port 61676 +1689365610315 RemoteAgent WARN TLS certificate errors will be ignored for this session +1689365827541 Marionette INFO Stopped listening on port 61676 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished +[Parent 7204, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167 +1689365828066 RemoteAgent ERROR unable to stop listener: [Exception... 
"Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689366358424 geckodriver INFO Listening on 127.0.0.1:62059 +1689366363521 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "62060" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileSRNF4S" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689366364862 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:62060/devtools/browser/38410e90-6408-4c6e-a78a-4d8c6dabe5f5 +1689366368448 Marionette INFO Listening on port 62067 + +###!!! [Child][MessageChannel] Error: (msgtype=0x390097,name=PContent::Msg_InitBackground) Channel closing: too late to send/recv, messages will be lost + + +###!!! [Child][MessageChannel] Error: (msgtype=0x390097,name=PContent::Msg_InitBackground) Channel closing: too late to send/recv, messages will be lost + +1689366368939 RemoteAgent WARN TLS certificate errors will be ignored for this session +1689366462907 Marionette INFO Stopped listening on port 62067 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] + +###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. 
+JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished +1689366464131 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689622469580 geckodriver INFO Listening on 127.0.0.1:58866 +1689622474728 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "58867" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofile5gOLDP" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689622475417 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:58867/devtools/browser/9a3a8de2-439e-425e-b415-f975abd86b65 +1689622476941 Marionette INFO Listening on port 58873 +1689622477054 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofile5gOLDP\thumbnails) because it does not exist +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: undefined, line 0: Error: Missing host permission for the tab +JavaScript error: undefined, line 0: Error: Missing host permission for the tab +1689624030995 Marionette INFO Stopped listening on port 58873 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! 
error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofile5gOLDP\thumbnails) because it does not exist + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1689624031467 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689624276336 geckodriver INFO Listening on 127.0.0.1:59792 +1689624280979 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "59793" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofileSTe5EC" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689624281509 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:59793/devtools/browser/222a61fa-a958-4978-8048-bb632f658131 +1689624283001 Marionette INFO Listening on port 59799 +1689624283405 RemoteAgent WARN TLS certificate errors will be ignored for this session +1689624692072 Marionette INFO Stopped listening on port 59799 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! 
Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofileSTe5EC\thumbnails) because it does not exist + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1689624692916 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index fe8be28..4c6a407 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -9,6 +9,7 @@ from Forums.BestCardingWorld.parser import * from Forums.CryptBB.parser import * from Forums.OnniForums.parser import * from Forums.Altenens.parser import * +from Forums.Procrax.parser import * from Forums.Classifier.classify_product import predict # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi @@ -154,6 +155,8 @@ def new_parse(forum, url, createLog): rmm = onniForums_description_parser(soup) elif forum == "Altenens": rmm = altenens_description_parser(soup) + elif forum == "Procrax": + rmm = procrax_description_parser(soup) # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip() key = u"Url:" + os.path.basename(line2).replace(".html", "") @@ -233,6 +236,8 @@ def new_parse(forum, url, createLog): rw = onniForums_listing_parser(soup) elif forum == "Altenens": rw = altenens_listing_parser(soup) + elif forum == "Procrax": + rw = procrax_listing_parser(soup) except: diff --git a/Forums/Libre/crawler_selenium.py b/Forums/Libre/crawler_selenium.py index 59cea94..dfef8db 100644 --- a/Forums/Libre/crawler_selenium.py +++ b/Forums/Libre/crawler_selenium.py @@ -62,16 +62,14 @@ def login(driver): input('Press enter when CAPTCHA is completed, and you\'re at the login page') #entering username and password into input boxes - usernameBox = driver.find_element(by=By.NAME, value='login') + usernameBox = driver.find_element(by=By.NAME, value='username') #Username here usernameBox.send_keys('ct1234')#sends string to the username box passwordBox = driver.find_element(by=By.NAME, value='password') #Password here passwordBox.send_keys('r5o0wqmw')# sends string to passwordBox - login = driver.find_element(by=By.CLASS_NAME, value='block-container') - login_link = login.find_element(by=By.TAG_NAME, value='button') - login_link.click() + input("Press the login button and solve the CAPTCHA then press enter\n") # input('input') @@ -209,87 +207,65 @@ def crawlForum(driver): print("Crawling the Libre forum") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() i = 0 - count = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 while has_next_page: - list = topicPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - 
driver.get(itemURL) - except: - driver.refresh() - savePage(driver.page_source, item) - driver.back() - - #variable to check if there is a next page for the topic - # has_next_topic_page = True - # counter = 1 - - # # check if there is a next page for the topics - # while has_next_topic_page: - # # try to access next page of th topic - # itemURL = urlparse.urljoin(baseURL, str(item)) - # try: - # driver.get(itemURL) - # except: - # driver.refresh() - # savePage(driver.page_source, item) - # - # # if there is a next page then go and save.... - # # Spec - # try: - # # temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div[1]/div[1]/div/nav/div[1]') # /html/body/div/div[2]/div/div[2]/div/ - # item = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div - # - # if item == "": - # raise NoSuchElementException - # else: - # counter += 1 - # - # except NoSuchElementException: - # has_next_topic_page = False - # - # # end of loop - # for i in range(counter): - # driver.back() + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + topics = topicPages(html) + for topic in topics: + has_next_topic_page = True + counter = 1 + page = topic + + while has_next_topic_page: + itemURL = urlparse.urljoin(baseURL, str(page)) + try: + driver.get(itemURL) + except: + driver.refresh() + savePage(driver.page_source, topic + f"page{counter}") # very important + + # comment out + if counter == 2: + break + + try: + page = "" # no next page so far may have some later on + if page == "": + raise NoSuchElementException + counter += 1 + + except NoSuchElementException: + has_next_topic_page = False + + for i in range(counter): + driver.back() # comment out break # comment out if count == 1: - count = 0 break try: - # temp = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div/div[1]/div/nav/div[1]') link = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -299,10 +275,7 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - - input("Crawling Libre forum done successfully. Press ENTER to continue\n") + input("Crawling Libre done successfully. 
Press ENTER to continue\n") # Returns 'True' if the link is Topic link, may need to change for every website diff --git a/Forums/OnniForums/__pycache__/crawler_selenium.cpython-310.pyc b/Forums/OnniForums/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index a7ffacc..0000000 Binary files a/Forums/OnniForums/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/Forums/OnniForums/__pycache__/crawler_selenium.cpython-311.pyc b/Forums/OnniForums/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index 8278bd9..0000000 Binary files a/Forums/OnniForums/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/Forums/OnniForums/__pycache__/parser.cpython-310.pyc b/Forums/OnniForums/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index f08bc35..0000000 Binary files a/Forums/OnniForums/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/Forums/OnniForums/__pycache__/parser.cpython-311.pyc b/Forums/OnniForums/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index cdefc99..0000000 Binary files a/Forums/OnniForums/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/Forums/OnniForums/__pycache__/parser_script.cpython-311.pyc b/Forums/OnniForums/__pycache__/parser_script.cpython-311.pyc deleted file mode 100644 index d03ffa2..0000000 Binary files a/Forums/OnniForums/__pycache__/parser_script.cpython-311.pyc and /dev/null differ diff --git a/Forums/OnniForums/crawler_selenium.py b/Forums/OnniForums/crawler_selenium.py index 35824a2..447dd2e 100644 --- a/Forums/OnniForums/crawler_selenium.py +++ b/Forums/OnniForums/crawler_selenium.py @@ -214,92 +214,71 @@ def getInterestedLinks(): def crawlForum(driver): - print("Crawling the OnniForums forum") + print("Crawling the OnniForums") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() i = 0 - count = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: - list = topicPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver.page_source, item) - - #next page for topic - # variable to check if there is a next page for the topic + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + topics = topicPages(html) + for topic in topics: has_next_topic_page = True counter = 1 + page = topic - # check if there is a next page for the topics while has_next_topic_page: - # try to access next page of th topic - itemURL = urlparse.urljoin(baseURL, str(item)) + itemURL = urlparse.urljoin(baseURL, str(page)) try: driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, item) + savePage(driver.page_source, topic + f"page{counter}") # very important + + # comment out + if counter == 2: + break - # if there is a next page then go and save.... - # next page in the topic? 
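# --- Illustrative sketch (not part of the patch). The OnniForums hunk here, like the
# Altenens and AbyssForum ones, replaces the old single-save loop with a per-topic
# pagination pattern: save every page of a topic as "<topic>page<counter>", cap the
# depth while testing, then back out of the topic. Names such as save_page and
# find_next_page_url are hypothetical stand-ins for the real savePage() and the
# per-forum "next page" lookup.
from urllib.parse import urljoin
from selenium.common.exceptions import NoSuchElementException

def crawl_topic_pages(driver, base_url, topic, save_page, find_next_page_url, max_pages=2):
    page = topic                      # the first page of the topic is the topic link itself
    counter = 1
    has_next_topic_page = True
    while has_next_topic_page:
        item_url = urljoin(base_url, str(page))
        try:
            driver.get(item_url)
        except Exception:
            driver.refresh()
        save_page(driver.page_source, topic + f"page{counter}")  # one file per topic page

        if counter == max_pages:      # test cap, mirrors the "if counter == 2: break" lines
            break
        try:
            page = find_next_page_url(driver)   # e.g. the 'pagination_next' href
            if page == "":
                raise NoSuchElementException
            counter += 1
        except NoSuchElementException:
            has_next_topic_page = False

    for _ in range(counter):          # walk back to the listing page
        driver.back()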
try: - temp = driver.find_element(By.XPATH, - '/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[2]/div/ - item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute( - 'href') # /html/body/div/div[2]/div/div[2]/div + temp = driver.find_element(By.XPATH,'/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[2]/div/ + page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') # /html/body/div/div[2]/div/div[2]/div - if item == "": + if page == "": raise NoSuchElementException - has_next_topic_page = False - else: - counter += 1 + counter += 1 + except NoSuchElementException: has_next_topic_page = False - # end of loop for i in range(counter): driver.back() - # comment out, one topic per page + # comment out break - # comment out, go through all pages + # comment out if count == 1: - count = 0 break try: - temp = driver.find_element(by=By.XPATH, value= - '/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[3]/div + temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[3]/div link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -309,11 +288,7 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - - - input("Crawling OnniForums forum done sucessfully. Press ENTER to continue\n") + input("Crawling OnniForums done successfully. Press ENTER to continue\n") # Returns 'True' if the link is Topic link diff --git a/Forums/Procrax/crawler_selenium.py b/Forums/Procrax/crawler_selenium.py index 5bb8a7a..9d37eae 100644 --- a/Forums/Procrax/crawler_selenium.py +++ b/Forums/Procrax/crawler_selenium.py @@ -26,24 +26,28 @@ from Forums.Procrax.parser import procrax_links_parser from Forums.Utilities.utilities import cleanHTML counter = 1 -baseURL = 'https://procrax.cx/' +BASE_URL = 'https://procrax.cx/' +FORUM_NAME = 'Procrax' # Opens Tor Browser, crawls the website def startCrawling(): - opentor() - # forumName = getForumName() - driver = getAccess() + # opentor() + # driver = getAccess() - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closetor(driver) + # if driver != 'down': + # try: + # login(driver) + # crawlForum(driver) + # except Exception as e: + # print(driver.current_url, e) + # closetor(driver) - # new_parse(forumName, False) + new_parse( + forum=FORUM_NAME, + url=BASE_URL, + createLog=False + ) # Opens Tor Browser @@ -139,10 +143,9 @@ def createFFDriver(): return driver def getAccess(): - url = getFixedURL() driver = createFFDriver() try: - driver.get(url)# open url in browser + driver.get(BASE_URL)# open url in browser return driver except: driver.close()# close tab @@ -162,7 +165,7 @@ def savePage(page, url): def getFullPathName(url): from Forums.Initialization.forums_mining import config, CURRENT_DATE - mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") + mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + FORUM_NAME + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') @@ 
-185,100 +188,87 @@ def getInterestedLinks(): links = [] # # general hacking - # links.append('https://procrax.cx/forums/general-hacking.24/') + links.append('https://procrax.cx/forums/general-hacking.24/') # # hacking security tools - # links.append('https://procrax.cx/forums/hacking-security-tools.20/') + links.append('https://procrax.cx/forums/hacking-security-tools.20/') # # hacktube - # links.append('https://procrax.cx/forums/hacktube.22/') + links.append('https://procrax.cx/forums/hacktube.22/') # # cardable # links.append('https://procrax.cx/forums/cardable-websites.28/') # # tools # links.append('https://procrax.cx/forums/tools-bots-validators.73/') # general forum - links.append('https://procrax.cx/forums/forum-discussions-updates.7/') + # links.append('https://procrax.cx/forums/forum-discussions-updates.7/') return links def crawlForum(driver): - print("Crawling the Procrax forum") + print("Crawling the Procrax") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() i = 0 - count = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link)# open - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 - #loop through the topics while has_next_page: - list = topicPages(html)# for multiple pages - for item in list: - #variable to check if there is a next page for the topic + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + topics = topicPages(html) + for topic in topics: has_next_topic_page = True counter = 1 + page = topic - # check if there is a next page for the topics while has_next_topic_page: - # try to access next page of th topic - itemURL = urlparse.urljoin(baseURL, str(item)) + itemURL = urlparse.urljoin(BASE_URL, str(page)) try: driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, item) + savePage(driver.page_source, topic + f"page{counter}") # very important + + # comment out + # if counter == 2: + # break - # if there is a next page then go and save.... - # specific try: - # temp = driver.find_element(By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div/div') - item = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') + page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') - if item == "": + if page == "": raise NoSuchElementException - has_next_topic_page = False - else: - counter += 1 + counter += 1 + except NoSuchElementException: has_next_topic_page = False - #end of loop for i in range(counter): driver.back() - # # comment out - # break - # - # # comment out - # if count == 1: - # count = 0 - # break - - try:# change depending on web page, #general - # /html/body/div[1]/div[3]/div[2]/div[3]/div/div/div/div[1]/div/nav/div[1] - # temp = driver.find_element(By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div/div/div/div[1]/div/nav/div[1]') + + # comment out + # break + + # comment out + if count == 20: + break + + try: + link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -288,10 +278,7 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - - input("Crawling Procrax forum done successfully. 
Press ENTER to continue\n") + input("Crawling Procrax done successfully. Press ENTER to continue\n") # Returns 'True' if the link is Topic link, may need to change for every website diff --git a/Forums/Procrax/parser.py b/Forums/Procrax/parser.py index 30cc2e8..7c9c463 100644 --- a/Forums/Procrax/parser.py +++ b/Forums/Procrax/parser.py @@ -7,11 +7,12 @@ from datetime import timedelta import re # Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, ResultSet, Tag # This is the method to parse the Description Pages (one page to each topic in the Listing Pages) -def cryptBB_description_parser(soup): + +def procrax_description_parser(soup: Tag): # Fields to be parsed @@ -27,146 +28,36 @@ def cryptBB_description_parser(soup): # Finding the topic (should be just one coming from the Listing Page) - li = soup.find("td", {"class": "thead"}).find('strong') + li = soup.find("h1", {"class": "p-title-value"}) topic = li.text - topic = re.sub("\[\w*\]", '', topic) - - topic = topic.replace(",","") - topic = topic.replace("\n","") - topic = cleanString(topic.strip()) - - # Finding the repeated tag that corresponds to the listing of posts - - # try: - posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all( - 'div', {"class": "post"}) - - # For each message (post), get all the fields we are interested to: - - for ipost in posts: - - # Finding a first level of the HTML page - - post_wrapper = ipost.find('span', {"class": "largetext"}) - - # Finding the author (user) of the post - - author = post_wrapper.text.strip() - user.append(cleanString(author)) # Remember to clean the problematic characters - - # Finding the status of the author - - smalltext = ipost.find('div', {"class": "post_author"}) - - ''' - # Testing here two possibilities to find this status and combine them - if ipost.find('div', {"class": "deleted_post_author"}): - status.append(-1) - interest.append(-1) - reputation.append(-1) - addDate.append(-1) - post.append("THIS POST HAS BEEN REMOVED!") - sign.append(-1) - feedback.append(-1) - continue - ''' - - # CryptBB does have membergroup and postgroup - - membergroup = smalltext.find('div', {"class": "profile-rank"}) - postgroup = smalltext.find('div', {"class": "postgroup"}) - if membergroup != None: - membergroup = membergroup.text.strip() - if postgroup != None: - postgroup = postgroup.text.strip() - membergroup = membergroup + " - " + postgroup - else: - if postgroup != None: - membergroup = postgroup.text.strip() - else: - membergroup = "-1" - status.append(cleanString(membergroup)) - - # Finding the interest of the author - # CryptBB does not have blurb - blurb = smalltext.find('li', {"class": "blurb"}) - if blurb != None: - blurb = blurb.text.strip() - else: - blurb = "-1" - interest.append(cleanString(blurb)) - - # Finding the reputation of the user - # CryptBB does have reputation - author_stats = smalltext.find('div', {"class": "author_statistics"}) - karma = author_stats.find('strong') - if karma != None: - karma = karma.text - karma = karma.replace("Community Rating: ", "") - karma = karma.replace("Karma: ", "") - karma = karma.strip() - else: - karma = "-1" - reputation.append(cleanString(karma)) - - # Getting here another good tag to find the post date, post content and users' signature - - postarea = ipost.find('div', {"class": "post_content"}) - - dt = postarea.find('span', {"class": "post_date"}).text - # dt = dt.strip().split() - dt = dt.strip() 
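# --- Illustrative sketch (not part of the patch). The MyBB-style post_date handling
# being removed in this hunk has to special-case relative strings such as "Yesterday"
# and "hours ago"; the new Procrax parser further down instead reads the ISO-8601
# value from the <time> tag's datetime attribute, which strptime parses directly,
# timezone offset included. The HTML snippet below is hypothetical.
from datetime import datetime
from bs4 import BeautifulSoup

html = '<time datetime="2023-07-17T14:05:00+0000">Jul 17, 2023</time>'
tag = BeautifulSoup(html, "html.parser").find("time")
posted = datetime.strptime(tag.get("datetime"), "%Y-%m-%dT%H:%M:%S%z")
print(posted)   # 2023-07-17 14:05:00+00:00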
- day=date.today() - if "Yesterday" in dt: - yesterday = day - timedelta(days=1) - yesterday = yesterday.strftime('%m-%d-%Y') - stime = dt.replace('Yesterday,','').strip() - date_time_obj = yesterday+ ', '+stime - date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p') - elif "hours ago" in dt: - day = day.strftime('%m-%d-%Y') - date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title'] - date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p') - else: - date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p') - stime = date_time_obj.strftime('%b %d, %Y') - sdate = date_time_obj.strftime('%I:%M %p') - addDate.append(date_time_obj) - - # Finding the post - - inner = postarea.find('div', {"class": "post_body scaleimages"}) - inner = inner.text.strip() - post.append(cleanString(inner)) - - # Finding the user's signature - - # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"}) - signature = ipost.find('div', {"class": "signature scaleimages"}) - if signature != None: - signature = signature.text.strip() - # print(signature) - else: - signature = "-1" - sign.append(cleanString(signature)) - - # As no information about user's feedback was found, just assign "-1" to the variable - + + thread: ResultSet[Tag] = soup.find("div", {"class": "block-body js-replyNewMessageContainer"}).find_all("article", {"data-author": True}) + + for ipost in thread: + username = ipost.find("h4", {"class": "message-name"}).text + user.append(cleanString(username.strip())) + + date_posted = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime") + datetime_obj = datetime.strptime(date_posted, "%Y-%m-%dT%H:%M:%S%z") + addDate.append(datetime_obj) + + feedback.append("-1") - - ''' - except: - if soup.find('td', {"class": "trow1"}).text == " You do not have permission to access this page. ": - user.append("-1") - status.append(-1) - interest.append(-1) - reputation.append(-1) - addDate.append(-1) - post.append("NO ACCESS TO THIS PAGE!") - sign.append(-1) - feedback.append(-1) - ''' - + + user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text + status.append(cleanString(user_status.strip())) + + user_lvl = ipost.find("div", {"class": "afAwardLevel"}).text + reputation.append(cleanString(user_lvl.strip())) + + sign.append("-1") + + user_post = ipost.find("article", {"class": "message-body js-selectToQuote"}).text + post.append(cleanString(user_post.strip())) + + interest.append("-1") + + # Populate the final variable (this should be a list with all fields scraped) @@ -178,7 +69,7 @@ def cryptBB_description_parser(soup): # This is the method to parse the Listing Pages (one page with many posts) -def cryptBB_listing_parser(soup): +def procrax_listing_parser(soup: Tag): board = "-1" # board name (the previous level of the topic in the Forum categorization tree. # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) @@ -193,59 +84,47 @@ def cryptBB_listing_parser(soup): # Listing and Description pages) # Finding the board (should be just one) - - board = soup.find('span', {"class": "active"}).text - board = cleanString(board.strip()) - - # Finding the repeated tag that corresponds to the listing of topics - - itopics = soup.find_all('tr',{"class": "inline_row"}) - - for itopic in itopics: - - # For each topic found, the structure to get the rest of the information can be of two types. 
Testing all of them - # to don't miss any topic - - # Adding the topic to the topic list - try: - topics = itopic.find('span', {"class": "subject_old"}).find('a').text - except: - topics = itopic.find('span', {"class": "subject_new"}).find('a').text - topics = re.sub("\[\w*\]", '', topics) - topic.append(cleanString(topics)) - - # Counting how many topics we have found so far - - nm = len(topic) - - # Adding the url to the list of urls - try: - link = itopic.find('span', {"class": "subject_old"}).find('a').get('href') - except: - link = itopic.find('span',{"class": "subject_new"}).find('a').get('href') - link = cleanLink(link) - href.append(link) - - # Finding the author of the topic - ps = itopic.find('div', {"class":"author smalltext"}).find('a').text - user = ps.strip() - author.append(cleanString(user)) - - # Finding the number of replies - columns = itopic.findChildren('td',recursive=False) - replies = columns[3].text - - posts.append(cleanString(replies)) - - # Finding the number of Views - tview = columns[4].text - views.append(cleanString(tview)) - - # If no information about when the topic was added, just assign "-1" to the variable - - addDate.append("-1") - - return organizeTopics("CryptBB", nm, topic, board, author, views, posts, href, addDate) + li = soup.find("h1", {"class": "p-title-value"}) + board = cleanString(li.text.strip()) + + threads_list: ResultSet[Tag] = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True}) + + nm = len(threads_list) + + for thread in threads_list: + thread_title = thread.find("div", {"class": "structItem-title"}).text + topic.append(cleanString(thread_title.strip())) + + thread_author = thread.get("data-author") + author.append(cleanString(thread_author)) + + thread_views = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find('dd').text + views.append(cleanString(thread_views.strip())) + + thread_replies = thread.find("dl", {"class": "pairs pairs--justified"}).find('dd').text + # All threads contain one topic post and reply posts + thread_total_posts = str(1 + int(thread_replies)) + posts.append(thread_total_posts) + + thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime") + datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z") + addDate.append(datetime_obj) + + thread_link = thread.find("div", {"class": "structItem-title"}).find('a').get('href') + href.append(thread_link) + + + return organizeTopics( + forum="Procrax", + nm=nm, + board=board, + author=author, + topic=topic, + views=views, + posts=posts, + addDate=addDate, + href=href + ) def procrax_links_parser(soup): diff --git a/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-310.pyc b/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index dc895d5..0000000 Binary files a/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc b/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index a6674c4..0000000 Binary files a/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-310.pyc b/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index 
1dde171..0000000 Binary files a/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc b/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index ef25fdf..0000000 Binary files a/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/AnonymousMarketplace/crawler_selenium.py b/MarketPlaces/AnonymousMarketplace/crawler_selenium.py index 9642dfb..d09469f 100644 --- a/MarketPlaces/AnonymousMarketplace/crawler_selenium.py +++ b/MarketPlaces/AnonymousMarketplace/crawler_selenium.py @@ -32,19 +32,19 @@ baseURL = 'http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion # Opens Tor Browser, crawls the website, then parses, then closes tor #acts like the main method for the crawler, another function at the end of this code calls this function later def startCrawling(): - # opentor() + opentor() mktName = getMKTName() - # driver = getAccess() + driver = getAccess() - # if driver != 'down': - # try: - # login(driver) - # crawlForum(driver) - # except Exception as e: - # print(driver.current_url, e) - # closetor(driver) + if driver != 'down': + try: + login(driver) + crawlForum(driver) + except Exception as e: + print(driver.current_url, e) + closetor(driver) - new_parse(mktName, baseURL, False) + # new_parse(mktName, baseURL, False) # Opens Tor Browser @@ -188,9 +188,9 @@ def getInterestedLinks(): # carding links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/carding/') # # hacked paypal - # links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacked-paypal-accounts/') + links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacked-paypal-accounts/') # # hacking services - # links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacking-services/') + links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacking-services/') return links @@ -202,24 +202,23 @@ def crawlForum(driver): print("Crawling the AnonymousMarketplace market") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() - count = 0 i = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) @@ -231,23 +230,17 @@ def crawlForum(driver): driver.back() # comment out - # break + break # comment out - # if count == 20: - # count = 0 - # break + if count == 1: + break + #left in in case site changes try: link = "" if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -257,9 +250,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling AnonymousMarketplace forum done sucessfully. 
Press ENTER to continue\n") @@ -267,7 +257,7 @@ def crawlForum(driver): #@param: url of any url crawled #return: true if is a description page, false if not def isDescriptionLink(url): - if 'product/' in url: + if '/product/' in url: return True return False @@ -276,7 +266,7 @@ def isDescriptionLink(url): #@param: url of any url crawled #return: true if is a Listing page, false if not def isListingLink(url): - if 'product-' in url: + if 'category' in url: return True return False diff --git a/MarketPlaces/AnonymousMarketplace/parser.py b/MarketPlaces/AnonymousMarketplace/parser.py index a0880b7..da11774 100644 --- a/MarketPlaces/AnonymousMarketplace/parser.py +++ b/MarketPlaces/AnonymousMarketplace/parser.py @@ -43,14 +43,14 @@ def anonymousMarketplace_description_parser(soup: Tag): product_ratings: Tag = soup.find("div", {"class": "star-rating"}) - product_reviews = product_ratings.find("strong", {"class": "rating"}).text + product_reviews = product_ratings.find("div", {"class": "woocommerce-product-rating"}).find("strong", {"class": "rating"}).text reviews = cleanString(product_reviews.strip()) product_star_rating = product_ratings.find("span", {"class": "rating"}).text rating_item = cleanString(product_star_rating.strip()) - product_price = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text.replace("$", "") - USD = cleanString(product_price.strip()) + product_price = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text + USD = cleanString(product_price.replace("$", "").strip()) # Populating the final variable (this should be a list with all fields scraped) @@ -88,26 +88,29 @@ def anonymousMarketplace_listing_parser(soup: Tag): href = [] # 20 Product_Links - product_list: ResultSet[Tag] = soup.find("ul", {"class": "product_list_widget"}).find_all("li") + product_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-4"}).find_all("li") for item in product_list: item_href = item.find("a").get("href") href.append(item_href) - item_name = item.find("span", {"class": "product-title"}).text - name.append((item_name.strip())) + item_name = item.find("h2", {"class": "woocommerce-loop-product__title"}).text + name.append(cleanString('item_name'.strip())) item_rating = item.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text - rating_item.append(cleanNumbers(item_rating.strip())) + rating_item.append(cleanString(item_rating.strip())) - item_price = item.find("span", {"class": "woocommerce-Price-amount amount"}).text - if not item_price: + try: + item_price = item.find("span", {"class": "woocommerce-Price-amount amount"}).text + item_price = item_price.replace("$", "").strip() + USD.append(item_price) + except AttributeError: USD.append("-1") - else: - USD.append(cleanNumbers(item_price.replace("$", "").strip())) + + - vendor.append("-1") + vendor.append("Anonymous") rating_vendor.append("-1") success.append("-1") CVE.append("-1") @@ -126,9 +129,30 @@ def anonymousMarketplace_listing_parser(soup: Tag): nm += 1 - return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) - + return organizeProducts( + marketplace=mktName, + nm=nm, + vendor=vendor, + rating_vendor=rating_vendor, + success_vendor=success, + nombre=name, + CVE=CVE, + MS=MS, + category=category, + describe=describe, + views=views, + reviews=reviews, + rating_item=rating_item, + addDate=addDate, + BTC=BTC, + USD=USD, + EURO=EURO, 
+ sold=sold, + qLeft=qLeft, + shipFrom=shipFrom, + shipTo=shipTo, + href=href + ) @@ -147,6 +171,7 @@ def anonymous_links_parser(soup): for a in listing: bae = a.find('a', {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}, href=True) + link = bae['href'] href.append(link) diff --git a/MarketPlaces/Apocalypse/crawler_selenium.py b/MarketPlaces/Apocalypse/crawler_selenium.py index 28d6a0f..134f4d8 100644 --- a/MarketPlaces/Apocalypse/crawler_selenium.py +++ b/MarketPlaces/Apocalypse/crawler_selenium.py @@ -216,24 +216,23 @@ def crawlForum(driver): print("Crawling the Apocalypse market") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() - count = 0 i = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) @@ -245,11 +244,10 @@ def crawlForum(driver): driver.back() # comment out - # break + break # comment out - if count == 20: - count = 0 + if count == 1: break try: @@ -257,12 +255,6 @@ def crawlForum(driver): '/html/body/div[1]/div/div[2]/nav/ul/li[5]/a').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -272,9 +264,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling Apocalypse forum done sucessfully. 
Press ENTER to continue\n") diff --git a/MarketPlaces/BlackPyramid/crawler_selenium.py b/MarketPlaces/BlackPyramid/crawler_selenium.py index af1623a..5ce0101 100644 --- a/MarketPlaces/BlackPyramid/crawler_selenium.py +++ b/MarketPlaces/BlackPyramid/crawler_selenium.py @@ -220,26 +220,25 @@ def crawlForum(driver): print("Crawling the BlackPyramid market") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() - count = 0 i = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - clicker = driver.find_element(by=By.XPATH, value='/html/body/div[2]/form/nav/nav/ul/li[2]/div/a') - clicker.click() # open tab with url - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: + try: + clicker = driver.find_element(by=By.XPATH, value='/html/body/div[2]/form/nav/nav/ul/li[2]/div/a') + clicker.click() # open tab with url + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) @@ -255,7 +254,6 @@ def crawlForum(driver): # comment out if count == 1: - count = 0 break try: @@ -263,12 +261,6 @@ def crawlForum(driver): '/html/body/center/div[4]/div/div[3]/div[23]/div[2]/input[1]') if clicker == "": raise NoSuchElementException - try: - clicker.click() - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -278,9 +270,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling BlackPyramid forum done sucessfully. Press ENTER to continue\n") diff --git a/MarketPlaces/CityMarket/crawler_selenium.py b/MarketPlaces/CityMarket/crawler_selenium.py index 1384c18..ff30bf0 100644 --- a/MarketPlaces/CityMarket/crawler_selenium.py +++ b/MarketPlaces/CityMarket/crawler_selenium.py @@ -221,24 +221,23 @@ def crawlForum(driver): print("Crawling the CityMarket market") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() - count = 0 i = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) @@ -254,7 +253,6 @@ def crawlForum(driver): # comment out if count == 1: - count = 0 break try: @@ -262,12 +260,6 @@ def crawlForum(driver): '/html/body/div[1]/div/div[2]/nav/ul/li[5]/a').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -277,9 +269,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling CityMarket forum done sucessfully. 
Press ENTER to continue\n") diff --git a/MarketPlaces/CypherMarketplace/crawler_selenium.py b/MarketPlaces/CypherMarketplace/crawler_selenium.py index aa587c4..120ed32 100644 --- a/MarketPlaces/CypherMarketplace/crawler_selenium.py +++ b/MarketPlaces/CypherMarketplace/crawler_selenium.py @@ -214,24 +214,23 @@ def crawlForum(driver): print("Crawling the CypherMarketplace market") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() - count = 0 i = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) @@ -247,7 +246,6 @@ def crawlForum(driver): # comment out if count == 1: - count = 0 break try: @@ -256,12 +254,6 @@ def crawlForum(driver): link = temp.find_element(by=By.TAG_NAME, value='page-link').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -271,9 +263,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling CypherMarketplace forum done sucessfully. Press ENTER to continue\n") diff --git a/MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc b/MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc index cd3f879..bb4ff6e 100644 Binary files a/MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc and b/MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc differ diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py index a1b27ff..d75d2e2 100644 --- a/MarketPlaces/DB_Connection/db_connection.py +++ b/MarketPlaces/DB_Connection/db_connection.py @@ -139,6 +139,7 @@ def create_vendor(cur, row, marketId): def create_items(cur, row, marketId, vendorId): + print(row) sql = "Insert into items (market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \ "views_item, reviews_item, rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \ diff --git a/MarketPlaces/DarkFox/crawler_selenium.py b/MarketPlaces/DarkFox/crawler_selenium.py index 3967567..61927d6 100644 --- a/MarketPlaces/DarkFox/crawler_selenium.py +++ b/MarketPlaces/DarkFox/crawler_selenium.py @@ -239,46 +239,47 @@ def crawlForum(driver): print("Crawling the DarkFox market") linksToCrawl = getInterestedLinks() - # visited = set(linksToCrawl) - # initialTime = time.time() - count = 0 i = 0 while i < len(linksToCrawl): - if count >= 500: - break link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: - itemURL = str(item) + itemURL = urlparse.urljoin(baseURL, str(item)) try: driver.get(itemURL) except: driver.refresh() savePage(driver.page_source, item) driver.back() - count += 1 + + # comment out + break + 
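# --- Illustrative sketch (not part of the patch). The DarkFox hunk above switches
# from itemURL = str(item) to urlparse.urljoin(baseURL, str(item)), matching the other
# market crawlers: urljoin resolves relative product hrefs against the base URL and
# leaves absolute ones untouched, so both kinds of link work. The base URL below is a
# hypothetical stand-in for baseURL.
from urllib.parse import urljoin

base_url = 'http://example.onion/'
print(urljoin(base_url, '/product/123'))                 # http://example.onion/product/123
print(urljoin(base_url, 'http://other.onion/item/9'))    # absolute hrefs pass through unchanged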
+ # comment out + if count == 0: + break try: link = driver.find_element(by=By.XPATH, value= '/html/body/main/div/div[2]/div/div[2]/div/div/div/nav/a[2]').get_attribute('href') - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) + if link == "": + raise NoSuchElementException + count += 1 + except NoSuchElementException: has_next_page = False @@ -286,9 +287,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling BestCardingWorld forum done sucessfully. Press ENTER to continue\n") diff --git a/MarketPlaces/DarkMatter/crawler_selenium.py b/MarketPlaces/DarkMatter/crawler_selenium.py index 21f6035..6d6986a 100644 --- a/MarketPlaces/DarkMatter/crawler_selenium.py +++ b/MarketPlaces/DarkMatter/crawler_selenium.py @@ -205,26 +205,24 @@ def crawlForum(driver): print("Crawling the DarkMatter market") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() - count = 0 i = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: - list = productPages(html) + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) try: @@ -239,7 +237,6 @@ def crawlForum(driver): # comment out if count == 1: - count = 0 break try: @@ -248,12 +245,6 @@ def crawlForum(driver): link = a.get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -263,9 +254,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling DarkMatter forum done sucessfully. Press ENTER to continue\n") diff --git a/MarketPlaces/DarkTor/crawler_selenium.py b/MarketPlaces/DarkTor/crawler_selenium.py index 3c44dc7..d84de5c 100644 --- a/MarketPlaces/DarkTor/crawler_selenium.py +++ b/MarketPlaces/DarkTor/crawler_selenium.py @@ -201,24 +201,23 @@ def crawlForum(driver): print("Crawling the DarkTor market") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() - count = 0 i = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) @@ -230,23 +229,16 @@ def crawlForum(driver): driver.back() # comment out - # break + break # comment out - if count == 30: - count = 0 + if count == 1: break try: link = "" if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -256,9 +248,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling DarkTor forum done sucessfully. 
Press ENTER to continue\n") diff --git a/MarketPlaces/DigitalThriftShop/crawler_selenium.py b/MarketPlaces/DigitalThriftShop/crawler_selenium.py index e602a4f..88b460f 100644 --- a/MarketPlaces/DigitalThriftShop/crawler_selenium.py +++ b/MarketPlaces/DigitalThriftShop/crawler_selenium.py @@ -204,24 +204,23 @@ def crawlForum(driver): print("Crawling the DigitalThriftShop market") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() - count = 0 i = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) @@ -237,7 +236,6 @@ def crawlForum(driver): # comment out if count == 1: - count = 0 break try: @@ -245,12 +243,6 @@ def crawlForum(driver): '/html/body/div[1]/div[2]/div/div[2]/main/div[1]/nav/ul/li[5]/a').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -260,9 +252,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling DigitalThriftShop forum done sucessfully. Press ENTER to continue\n") diff --git a/MarketPlaces/Initialization/__pycache__/__init__.cpython-310.pyc b/MarketPlaces/Initialization/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 495b447..0000000 Binary files a/MarketPlaces/Initialization/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/Initialization/__pycache__/__init__.cpython-311.pyc b/MarketPlaces/Initialization/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index 1309640..0000000 Binary files a/MarketPlaces/Initialization/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/Initialization/__pycache__/markets_mining.cpython-310.pyc b/MarketPlaces/Initialization/__pycache__/markets_mining.cpython-310.pyc deleted file mode 100644 index 5189e47..0000000 Binary files a/MarketPlaces/Initialization/__pycache__/markets_mining.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/Initialization/__pycache__/markets_mining.cpython-311.pyc b/MarketPlaces/Initialization/__pycache__/markets_mining.cpython-311.pyc deleted file mode 100644 index d2bbff8..0000000 Binary files a/MarketPlaces/Initialization/__pycache__/markets_mining.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-310.pyc b/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-310.pyc deleted file mode 100644 index 2da4f71..0000000 Binary files a/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc b/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc deleted file mode 100644 index b669316..0000000 Binary files a/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index 5739f30..cd467f2 100644 --- 
a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -11,6 +11,8 @@ from MarketPlaces.Apocalypse.parser import * from MarketPlaces.ThiefWorld.parser import * from MarketPlaces.AnonymousMarketplace.parser import * from MarketPlaces.ViceCity.parser import * +from MarketPlaces.TorBay.parser import * +from MarketPlaces.M00nkeyMarket.parser import * from MarketPlaces.Classifier.classify_product import predict @@ -151,7 +153,11 @@ def new_parse(marketPlace, url, createLog): rmm = anonymousMarketplace_description_parser(soup) elif marketPlace == "ViceCity": rmm = vicecity_description_parser(soup) - + elif marketPlace == "TorBay": + rmm = torbay_description_parser(soup) + elif marketPlace == "M00nkeyMarket": + rmm = m00nkey_description_parser(soup) + # key = u"Pr:" + rmm[0].upper()[:desc_lim1] + u" Vendor:" + rmm[13].upper()[:desc_lim2] key = u"Url:" + os.path.basename(line2).replace(".html", "") @@ -190,7 +196,7 @@ def new_parse(marketPlace, url, createLog): readError = True if not readError: - print("Hello!") + parseError = False try: @@ -206,11 +212,14 @@ def new_parse(marketPlace, url, createLog): rw = anonymousMarketplace_listing_parser(soup) elif marketPlace == "ViceCity": rw = vicecity_listing_parser(soup) + elif marketPlace == "TorBay": + rw = torbay_listing_parser(soup) + elif marketPlace == "M00nkeyMarket": + rw = m00nkey_listing_parser(soup) else: parseError = True - except Exception as e: - raise e + except: nError += 1 print("There was a problem to parse the file " + line1 + " in the listing section!") @@ -229,7 +238,6 @@ def new_parse(marketPlace, url, createLog): for rec in rw: rec = rec.split(',') - print(rec) # if len(detPage) > 0: #It was created here just because Zeroday Market does not have Description Pages # key = rec[23] @@ -237,7 +245,6 @@ def new_parse(marketPlace, url, createLog): # key = u"Pr:" + rec[1].upper()[:list_lim1] + u" Vendor:" + rec[18].upper()[:list_lim2] key = u"Url:" + cleanLink(rec[20]) - print(key) # if the associated description page is parsed if key in detPage: @@ -255,7 +262,8 @@ def new_parse(marketPlace, url, createLog): try: persist_data(url, tuple(rec), cur) con.commit() - except: + except Exception as e: + raise e trace = traceback.format_exc() diff --git a/MarketPlaces/LionMarketplace/crawler_selenium.py b/MarketPlaces/LionMarketplace/crawler_selenium.py index 3310aca..d969235 100644 --- a/MarketPlaces/LionMarketplace/crawler_selenium.py +++ b/MarketPlaces/LionMarketplace/crawler_selenium.py @@ -212,24 +212,23 @@ def crawlForum(driver): print("Crawling the LionMarketplace market") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() - count = 0 i = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) @@ -245,7 +244,6 @@ def crawlForum(driver): # comment out if count == 1: - count = 0 break try: @@ -253,12 +251,6 @@ def crawlForum(driver): '/html/body/div[2]/div[2]/div/div[2]/nav/ul/li[5]/a').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 
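# --- Illustrative sketch (not part of the patch). new_parse() dispatches on the
# marketplace name through an if/elif chain, extended above with TorBay and
# M00nkeyMarket. The same table can also be written as a dict so each new market adds
# a single entry; this is an alternative shown for illustration only, using the parser
# names the patch itself imports and calls.
from MarketPlaces.AnonymousMarketplace.parser import anonymousMarketplace_description_parser
from MarketPlaces.ViceCity.parser import vicecity_description_parser
from MarketPlaces.TorBay.parser import torbay_description_parser
from MarketPlaces.M00nkeyMarket.parser import m00nkey_description_parser

DESCRIPTION_PARSERS = {
    "AnonymousMarketplace": anonymousMarketplace_description_parser,
    "ViceCity": vicecity_description_parser,
    "TorBay": torbay_description_parser,
    "M00nkeyMarket": m00nkey_description_parser,
}

def parse_description(marketPlace, soup):
    # returns the same 'rmm' row the elif chain builds, or raises for unknown markets
    try:
        return DESCRIPTION_PARSERS[marketPlace](soup)
    except KeyError:
        raise ValueError(f"no description parser registered for {marketPlace}")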
except NoSuchElementException: @@ -268,9 +260,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling LionMarketplace forum done sucessfully. Press ENTER to continue\n") diff --git a/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-310.pyc b/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index e8ecfba..0000000 Binary files a/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc b/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index 1ea14d0..0000000 Binary files a/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-310.pyc b/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index 7ede7a6..0000000 Binary files a/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc b/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index 77a5388..0000000 Binary files a/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/M00nkeyMarket/crawler_selenium.py b/MarketPlaces/M00nkeyMarket/crawler_selenium.py index dd422ce..a4191e5 100644 --- a/MarketPlaces/M00nkeyMarket/crawler_selenium.py +++ b/MarketPlaces/M00nkeyMarket/crawler_selenium.py @@ -27,16 +27,15 @@ from MarketPlaces.M00nkeyMarket.parser import m00nkey_links_parser from MarketPlaces.Utilities.utilities import cleanHTML counter = 1 -baseURL = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/' - +BASE_URL = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/' +MARKET_NAME = 'M00nkeyMarket' # Opens Tor Browser, crawls the website, then parses, then closes tor #acts like the main method for the crawler, another function at the end of this code calls this function later def startCrawling(): opentor() - # mktName = getMKTName() driver = getAccess() - + if driver != 'down': try: login(driver) @@ -45,7 +44,7 @@ def startCrawling(): print(driver.current_url, e) closetor(driver) - # new_parse(forumName, baseURL, False) + new_parse(MARKET_NAME, BASE_URL, False) # Opens Tor Browser @@ -64,16 +63,16 @@ def opentor(): # Returns the name of the website #return: name of site in string type -def getMKTName(): - name = 'M00nkeyMarket' - return name +# def getMKTName(): +# name = 'M00nkeyMarket' +# return name # Return the base link of the website #return: url of base site in string type -def getFixedURL(): - url = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/' - return url +# def getFixedURL(): +# url = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/' +# return url # Closes Tor Browser @@ -127,10 +126,9 @@ def createFFDriver(): #the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' #return: return the selenium driver or string 'down' def getAccess(): - url = getFixedURL() driver = createFFDriver() try: - driver.get(url) + driver.get(BASE_URL) return driver except: driver.close() @@ -175,7 +173,7 @@ def savePage(page, url): def getFullPathName(url): from MarketPlaces.Initialization.markets_mining 
import config, CURRENT_DATE - mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + MARKET_NAME + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') @@ -217,27 +215,26 @@ def crawlForum(driver): print("Crawling the M00nkeyMarket market") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() - count = 0 i = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) + itemURL = urlparse.urljoin(BASE_URL, str(item)) try: driver.get(itemURL) except: @@ -249,21 +246,13 @@ def crawlForum(driver): break # comment out - # if count == 1: - # count = 0 - # break + if count == 1: + break try: link = driver.find_element(by=By.LINK_TEXT, value='Next ›').get_attribute('href') - if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -273,9 +262,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling M00nkeyMarket done sucessfully. Press ENTER to continue\n") diff --git a/MarketPlaces/M00nkeyMarket/parser.py b/MarketPlaces/M00nkeyMarket/parser.py index 9faf795..db54c4b 100644 --- a/MarketPlaces/M00nkeyMarket/parser.py +++ b/MarketPlaces/M00nkeyMarket/parser.py @@ -1,4 +1,4 @@ -__author__ = 'DarkWeb' +__author__ = 'Helium' # Here, we are importing the auxiliary functions to clean or convert data from MarketPlaces.Utilities.utilities import * @@ -11,133 +11,132 @@ from bs4 import BeautifulSoup #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of description page #return: 'row' that contains a variety of lists that each hold info on the description page -def darkfox_description_parser(soup): +def m00nkey_description_parser(soup): # Fields to be parsed - - name = "-1" # 0 Product_Name - describe = "-1" # 1 Product_Description - lastSeen = "-1" # 2 Product_LastViewDate - rules = "-1" # 3 NOT USED ... - CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 5 Product_MS_Classification (Microsoft Security) - review = "-1" # 6 Product_Number_Of_Reviews - category = "-1" # 7 Product_Category - shipFrom = "-1" # 8 Product_ShippedFrom - shipTo = "-1" # 9 Product_ShippedTo - left = "-1" # 10 Product_QuantityLeft - escrow = "-1" # 11 Vendor_Warranty - terms = "-1" # 12 Vendor_TermsAndConditions - vendor = "-1" # 13 Vendor_Name - sold = "-1" # 14 Product_QuantitySold - addDate = "-1" # 15 Product_AddedDate - available = "-1" # 16 NOT USED ... - endDate = "-1" # 17 NOT USED ... 
- BTC = "-1" # 18 Product_BTC_SellingPrice - USD = "-1" # 19 Product_USD_SellingPrice - rating = "-1" # 20 Vendor_Rating - success = "-1" # 21 Vendor_Successful_Transactions - EURO = "-1" # 22 Product_EURO_SellingPrice - - # Finding Product Name - name = soup.find('h1').text - name = name.replace('\n', ' ') - name = name.replace(",", "") - name = name.strip() - - # Finding Vendor - vendor = soup.find('h3').find('a').text.strip() - - # Finding Vendor Rating - rating = soup.find('span', {'class': "tag is-dark"}).text.strip() - - # Finding Successful Transactions - success = soup.find('h3').text - success = success.replace("Vendor: ", "") - success = success.replace(vendor, "") - success = success.replace("(", "") - success = success.replace(")", "") - success = success.strip() - - bae = soup.find('div', {'class': "box"}).find_all('ul') - - # Finding Prices - USD = bae[1].find('strong').text.strip() - - li = bae[2].find_all('li') - - # Finding Escrow - escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip() - - # Finding the Product Category - category = li[1].find('span', {'class': "tag is-dark"}).text.strip() - - # Finding the Product Quantity Available - left = li[3].find('span', {'class': "tag is-dark"}).text.strip() - - # Finding Number Sold - sold = li[4].find('span', {'class': "tag is-dark"}).text.strip() - - li = bae[3].find_all('li') - - # Finding Shipment Information (Origin) - if "Ships from:" in li[-2].text: - shipFrom = li[-2].text - shipFrom = shipFrom.replace("Ships from: ", "") - # shipFrom = shipFrom.replace(",", "") - shipFrom = shipFrom.strip() - - # Finding Shipment Information (Destination) - shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text - shipTo = shipTo.replace("Ships to: ", "") - shipTo = shipTo.strip() - if "certain countries" in shipTo: - countries = "" - tags = li[-1].find_all('span', {'class': "tag"}) - for tag in tags: - country = tag.text.strip() - countries += country + ", " - shipTo = countries.strip(", ") - - # Finding the Product description - describe = soup.find('div', {'class': "pre-line"}).text - describe = describe.replace("\n", " ") - describe = describe.strip() - - '''# Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1"''' - - # Searching for CVE and MS categories - cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if cve: - CVE = " " - for idx in cve: - CVE += (idx) - CVE += " " - CVE = CVE.replace(',', ' ') - CVE = CVE.replace('\n', '') - ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}')) - if ms: - MS = " " - for im in ms: - MS += (im) - MS += " " - MS = MS.replace(',', ' ') - MS = MS.replace('\n', '') + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 
Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + + #vendor name + temp = soup.find('div', {'class': 'box rounded mb-0'}).find('a').text + vendor = (cleanString(temp.strip())) + + #successful transaction + temp = soup.findAll('div', {'class','text-center text-truncate column-flex ml-1 mr-1'}) #card sidebar-menu mb-4 card sidebar-menu mb-4 + temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'}) + temp = temp2[1].text + success = (cleanString(temp.strip())) + + + #vendor rating 5 + temp = soup.findAll('div', {'class', 'text-center text-truncate column-flex ml-1 mr-1'}) # card sidebar-menu mb-4 card sidebar-menu mb-4 + temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'}) + temp = temp2[5].text + rating_vendor = (cleanString(temp.strip())) + + # product name + temp = soup.find('h3', {'class', 'h3 rounded card-title'}).find('span').text + name = (cleanString(temp.strip())) + + + # product description + describe = soup.find('div', {'class': "box rounded flex-fill"}).find('pre').text + if "\n" in describe: + describe = describe.replace("\n", " ") + describe = describe.replace("\r", " ") + describe = cleanString(describe.strip()) + + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much + + # product category + try: + temp = soup.findAll('table', {'class', 'table table-hover'}) + temp2 = temp[1].find('tr').findAll('td') + temp = temp2[1].text + category = cleanString(temp.strip()) + except: + temp = soup.find('table', {'class', 'table table-hover'}) + temp2 = temp.find('tbody').find('tr').findAll('td') + temp = temp2[1].text + category = cleanString(temp.strip()) + + # product number of view + try: + temp = soup.find('div', {'class', 'box rounded mb-0'}) + temp2 = temp.findAll('i') + temp = temp2[2].text + views = cleanString((temp.strip())) + except: + print('Product number of view') + # views = "-1" + + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + + #BTC selling price box box-rounded mt-2 + temp = soup.find('div', {'class', 'box box-rounded mt-2'}) + temp2 = temp.findAll('i', {'class', 'float-right color-prices'}) + temp = temp2[1].text + BTC = cleanString((temp.strip())) + + # USD selling price + temp = soup.find('div', {'class', 'box box-rounded mt-2'}) + temp2 = temp.findAll('center') + temp = temp2[1].find('i').text + if "$" in temp: + temp = temp.replace("$", "") + USD = cleanString((temp.strip())) + + EURO = "-1" # 14 Product_EURO_SellingPrice + + + # product sold + temp = soup.find('div', {'class', 'box rounded mb-0'}) # card sidebar-menu mb-4 card sidebar-menu mb-4 + temp2 = temp.find('i') + temp = temp2.text + sold = (cleanString(temp.strip())) + # sold = "-1" + + # product quantatiy left ###ERRROR + try: + temp = soup.findAll('table', {'class', 'table table-hover'}) + temp2 = temp[1].findAll('tr') + temp3 = temp2[1].findAll('td') + temp = temp3[1].text + left = cleanString(temp.strip()) + except: + temp = soup.find('table', {'class', 'table table-hover'}) + temp2 = temp.findAll('tr') + temp3 = temp2[1].findAll('td') + temp = temp3[1].text + left = 
cleanString(temp.strip()) + + + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo # Populating the final variable (this should be a list with all fields scraped) - row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor, - sold, addDate, available, endDate, BTC, USD, rating, success, EURO) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo) # Sending the results return row @@ -147,131 +146,91 @@ def darkfox_description_parser(soup): #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of listing page #return: 'row' that contains a variety of lists that each hold info on the listing page -def darkfox_listing_parser(soup): - +def m00nkey_listing_parser(soup): # Fields to be parsed - nm = 0 # Total_Products (Should be Integer) - mktName = "DarkFox" # 0 Marketplace_Name - name = [] # 1 Product_Name - CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 3 Product_MS_Classification (Microsoft Security) - category = [] # 4 Product_Category - describe = [] # 5 Product_Description - escrow = [] # 6 Vendor_Warranty - views = [] # 7 Product_Number_Of_Views - reviews = [] # 8 Product_Number_Of_Reviews - addDate = [] # 9 Product_AddDate - lastSeen = [] # 10 Product_LastViewDate - BTC = [] # 11 Product_BTC_SellingPrice - USD = [] # 12 Product_USD_SellingPrice - EURO = [] # 13 Product_EURO_SellingPrice - sold = [] # 14 Product_QuantitySold - qLeft =[] # 15 Product_QuantityLeft - shipFrom = [] # 16 Product_ShippedFrom - shipTo = [] # 17 Product_ShippedTo - vendor = [] # 18 Vendor - rating = [] # 19 Vendor_Rating - success = [] # 20 Vendor_Successful_Transactions - href = [] # 23 Product_Links (Urls) - - listing = soup.findAll('div', {"class": "card"}) + nm = 0 # *Total_Products (Should be Integer) + mktName = "M00nkeyMarket" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + href = [] # 20 Product_Links + + listing = soup.findAll('div', {"class": "card mt-1"}) # Populating the Number of Products nm = len(listing) for a in listing: - bae = a.findAll('a', href=True) - # Adding the url to the list of urls - link = bae[0].get('href') - link = cleanLink(link) - href.append(link) + # vendor + try: + temp = a.find('col-5 justify-content-between mx-auto').find('a').text + vendor.append(cleanString(temp.strip())) + except: + print('vendor') + + #vendor rating + + + #successful transactions + try: + temp = a.find('col-5 justify-content-between mx-auto').find('div').text + 
success.append(cleanString(temp.strip())) + except: + print('successful transactions') + + # product name + try: + temp = a.find('card-title rounded text-truncate').find('a').text + name.append(cleanString(temp.strip())) + except: + print('product name') + + + CVE.append('-1') + MS.append('-1') + rating_vendor.append("-1") + + try: + temp = a.findAll('btn btn-block btn-primary') + except: + print("Error in product category") + + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + href = [] # 20 Product_Links - # Finding the Product - product = bae[1].find('p').text - product = product.replace('\n', ' ') - product = product.replace(",", "") - product = product.replace("...", "") - product = product.strip() - name.append(product) - - bae = a.find('div', {'class': "media-content"}).find('div').find_all('div') - - if len(bae) >= 5: - # Finding Prices - price = bae[0].text - ud = price.replace(" USD", " ") - # u = ud.replace("$","") - u = ud.replace(",", "") - u = u.strip() - USD.append(u) - # bc = (prc[1]).strip(' BTC') - # BTC.append(bc) - - # Finding the Vendor - vendor_name = bae[1].find('a').text - vendor_name = vendor_name.replace(",", "") - vendor_name = vendor_name.strip() - vendor.append(vendor_name) - - # Finding the Category - cat = bae[2].find('small').text - cat = cat.replace("Category: ", "") - cat = cat.replace(",", "") - cat = cat.strip() - category.append(cat) - - # Finding Number Sold and Quantity Left - num = bae[3].text - num = num.replace("Sold: ", "") - num = num.strip() - sold.append(num) - - quant = bae[4].find('small').text - quant = quant.replace("In stock: ", "") - quant = quant.strip() - qLeft.append(quant) - - # Finding Successful Transactions - freq = bae[1].text - freq = freq.replace(vendor_name, "") - freq = re.sub(r'Vendor Level \d+', "", freq) - freq = freq.replace("(", "") - freq = freq.replace(")", "") - freq = freq.strip() - success.append(freq) - - # Searching for CVE and MS categories - cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if not cve: - cveValue="-1" - else: - cee = " " - for idx in cve: - cee += (idx) - cee += " " - cee = cee.replace(',', ' ') - cee = cee.replace('\n', '') - cveValue=cee - CVE.append(cveValue) - - ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) - if not ms: - MSValue="-1" - else: - me = " " - for im in ms: - me += (im) - me += " " - me = me.replace(',', ' ') - me = me.replace('\n', '') - MSValue=me - MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen, - BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href) - + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) #called by the crawler to get description links on a listing page #@param: beautifulsoup object that is using the correct html page (listing page) diff --git 
a/MarketPlaces/MikesGrandStore/crawler_selenium.py b/MarketPlaces/MikesGrandStore/crawler_selenium.py index f24dce0..bb7d1f8 100644 --- a/MarketPlaces/MikesGrandStore/crawler_selenium.py +++ b/MarketPlaces/MikesGrandStore/crawler_selenium.py @@ -227,24 +227,23 @@ def crawlForum(driver): print("Crawling the MikesGrandStore market") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() - count = 0 i = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) @@ -256,24 +255,17 @@ def crawlForum(driver): driver.back() # comment out - # break + break # comment out - # if count == 1: - # count = 0 - # break + if count == 1: + break try: link = driver.find_element(by=By.XPATH, value= '/html/body/div[1]/main/div/div[1]/div/div[3]/nav/ul/li[6]/a').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -283,9 +275,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling MikesGrandStore forum done sucessfully. Press ENTER to continue\n") diff --git a/MarketPlaces/ThiefWorld/__pycache__/crawler_selenium.cpython-311.pyc b/MarketPlaces/ThiefWorld/__pycache__/crawler_selenium.cpython-311.pyc index 3dc0317..0ee63ec 100644 Binary files a/MarketPlaces/ThiefWorld/__pycache__/crawler_selenium.cpython-311.pyc and b/MarketPlaces/ThiefWorld/__pycache__/crawler_selenium.cpython-311.pyc differ diff --git a/MarketPlaces/ThiefWorld/__pycache__/parser.cpython-311.pyc b/MarketPlaces/ThiefWorld/__pycache__/parser.cpython-311.pyc index 1e8dc5a..da3d193 100644 Binary files a/MarketPlaces/ThiefWorld/__pycache__/parser.cpython-311.pyc and b/MarketPlaces/ThiefWorld/__pycache__/parser.cpython-311.pyc differ diff --git a/MarketPlaces/ThiefWorld/crawler_selenium.py b/MarketPlaces/ThiefWorld/crawler_selenium.py index 52e8f89..1111c4d 100644 --- a/MarketPlaces/ThiefWorld/crawler_selenium.py +++ b/MarketPlaces/ThiefWorld/crawler_selenium.py @@ -211,24 +211,23 @@ def crawlForum(driver): print("Crawling the ThiefWorld market") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() - count = 0 i = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) @@ -243,8 +242,7 @@ def crawlForum(driver): break # comment out - if count == 20: - count = 0 + if count == 1: break try: @@ -252,12 +250,6 @@ def crawlForum(driver): '/html/body/div/div[1]/div/div/div[2]/div[3]/div/ul/li[13]/a').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 
1 except NoSuchElementException: @@ -267,9 +259,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling ThiefWorld forum done sucessfully. Press ENTER to continue\n") diff --git a/MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-310.pyc b/MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index 4bb3771..0000000 Binary files a/MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-311.pyc b/MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index 46ed74e..0000000 Binary files a/MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/Tor2door/__pycache__/parser.cpython-310.pyc b/MarketPlaces/Tor2door/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index 5b284b2..0000000 Binary files a/MarketPlaces/Tor2door/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/Tor2door/__pycache__/parser.cpython-311.pyc b/MarketPlaces/Tor2door/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index a6b28ec..0000000 Binary files a/MarketPlaces/Tor2door/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/Tor2door/crawler_selenium.py b/MarketPlaces/Tor2door/crawler_selenium.py index a299c71..964c574 100644 --- a/MarketPlaces/Tor2door/crawler_selenium.py +++ b/MarketPlaces/Tor2door/crawler_selenium.py @@ -228,25 +228,23 @@ def crawlForum(driver): print("Crawling the Tor2door market") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() i = 0 - count = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) - try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) @@ -256,12 +254,12 @@ def crawlForum(driver): driver.refresh() savePage(driver.page_source, item) driver.back() + # comment out break # comment out if count == 1: - count = 0 break try: @@ -269,15 +267,8 @@ def crawlForum(driver): '/html/body/main/div/div/div[2]/div[11]/div/nav') a = nav.find_element(by=By.LINK_TEXT, value="›") link = a.get_attribute('href') - if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -287,9 +278,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling Tor2door market done sucessfully. 
Press ENTER to continue\n") diff --git a/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-310.pyc b/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index d72e9e9..0000000 Binary files a/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc b/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index 5d4bbfc..0000000 Binary files a/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/TorBay/__pycache__/parser.cpython-310.pyc b/MarketPlaces/TorBay/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index 2ff9034..0000000 Binary files a/MarketPlaces/TorBay/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc b/MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index 990e55e..0000000 Binary files a/MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/TorBay/crawler_selenium.py b/MarketPlaces/TorBay/crawler_selenium.py index ee2bd94..0861e82 100644 --- a/MarketPlaces/TorBay/crawler_selenium.py +++ b/MarketPlaces/TorBay/crawler_selenium.py @@ -32,19 +32,19 @@ baseURL = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion # Opens Tor Browser, crawls the website, then parses, then closes tor #acts like the main method for the crawler, another function at the end of this code calls this function later def startCrawling(): - opentor() + # opentor() mktName = getMKTName() - driver = getAccess() - - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closetor(driver) - - # new_parse(forumName, baseURL, False) + # driver = getAccess() + # + # if driver != 'down': + # try: + # login(driver) + # crawlForum(driver) + # except Exception as e: + # print(driver.current_url, e) + # closetor(driver) + # + new_parse(mktName, baseURL, False) # Opens Tor Browser @@ -198,24 +198,23 @@ def crawlForum(driver): print("Crawling the TorBay Market") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() - count = 0 i = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) @@ -226,25 +225,18 @@ def crawlForum(driver): savePage(driver.page_source, item) driver.back() - # #comment out - # break - # - # # # comment out - # if count == 1: - # count = 0 - # break + # comment out + break + + # comment out + if count == 1: + break try: link = driver.find_element(by=By.XPATH, value= '/html/body/section/div/div/div[2]/div/div[2]/ul/li[3]/a').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -254,9 +246,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling TorBay forum done 
sucessfully. Press ENTER to continue\n") diff --git a/MarketPlaces/TorBay/parser.py b/MarketPlaces/TorBay/parser.py index 2e6aeea..3c9725f 100644 --- a/MarketPlaces/TorBay/parser.py +++ b/MarketPlaces/TorBay/parser.py @@ -35,88 +35,51 @@ def torbay_description_parser(soup): shipTo = "-1" # 18 Product_ShippedTo # Finding Product Name - name = soup.find('div', {'class': 'product-information'}).find('h1').text.strip() - - # Finding Vendor - vendor = soup.find('div', {"class": "profile-info"}).find('a').text.strip() - - # Finding Vendor Rating - rating_vendor.append(-1) - - # Finding Successful Transactions - success.append(-1) - - bae = soup.find('div', {'class': "box"}).find_all('ul') + try: + product_name = soup.find('div', {'class': 'product-information'}).find('h1').text + name = cleanString(product_name.strip()) + except: + try: + product_name = soup.find('div', {'class': 'profile-info'}).find('h2').text + name = cleanString(product_name.strip()) + except: + # print(e) + print("product name") + + # Finding Vendor FIx + try: + vendor_name = soup.find('div', {"class": "profile-info"}).find('h2').text + vendor = cleanString(vendor_name.strip()) + except: + print("description vendor name failed\n") # Finding Prices - USD = soup.find('div', {'class': "total-price"}).find('span').text.strip() - + try: + USD = soup.find('div', {'class': "total-price"}).find('span').text.strip() + except: + print("description price failed\n") # Finding the Product Category - category = soup.find('div', {'class': "profile-info"}).find('p').find('a').text.strip() - - # Finding the Product Quantity Available - left.append(-1) - - # Finding Number Sold - sold.append(-1) - - li = bae[3].find_all('li') - - # Finding Shipment Information (Origin) - if "Ships from:" in li[-2].text: - shipFrom = li[-2].text - shipFrom = shipFrom.replace("Ships from: ", "") - # shipFrom = shipFrom.replace(",", "") - shipFrom = shipFrom.strip() - - # Finding Shipment Information (Destination) - shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text - shipTo = shipTo.replace("Ships to: ", "") - shipTo = shipTo.strip() - if "certain countries" in shipTo: - countries = "" - tags = li[-1].find_all('span', {'class': "tag"}) - for tag in tags: - country = tag.text.strip() - countries += country + ", " - shipTo = countries.strip(", ") + try: + cat = soup.find('div', {'class': "profile-info"}).find('p').text + category = cleanString(cat.strip()) + except: + print("description product category failed") # Finding the Product description - describe = soup.find('div', {'class': "pre-line"}).text - describe = describe.replace("\n", " ") - describe = describe.strip() - - '''# Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1"''' - - # Searching for CVE and MS categories - cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if cve: - CVE = " " - for idx in cve: - CVE += (idx) - CVE += " " - CVE = CVE.replace(',', ' ') - CVE = CVE.replace('\n', '') - ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}')) - if ms: - MS = " " - for im in ms: - MS += (im) - MS += " " - MS = MS.replace(',', ' ') - MS = MS.replace('\n', '') + try: + describe = soup.find('div', {'class': "info"}).find('p').text + if "\n" in describe: + describe = 
describe.replace("\n", " ") + describe = describe.replace("\r", " ") + describe = cleanString(describe.strip()) + except: + # print("product desc") + try: + describe = soup.find('div', {'class': 'info'}).text + describe = cleanString(describe.strip()) + except: + print("Product description") # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, @@ -162,93 +125,48 @@ def torbay_listing_parser(soup): nm = len(listing) for a in listing: - bae = a.findAll('a', href=True) - - # Adding the url to the list of urls - link = bae[0].get('href') - link = cleanLink(link) - href.append(link) - - # Finding the Product - product = bae[1].find('p').text - product = product.replace('\n', ' ') - product = product.replace(",", "") - product = product.replace("...", "") - product = product.strip() - name.append(product) - - bae = a.find('div', {'class': "media-content"}).find('div').find_all('div') - - if len(bae) >= 5: - # Finding Prices - price = bae[0].text - ud = price.replace(" USD", " ") - # u = ud.replace("$","") - u = ud.replace(",", "") - u = u.strip() - USD.append(u) - # bc = (prc[1]).strip(' BTC') - # BTC.append(bc) - - # Finding the Vendor - vendor_name = bae[1].find('a').text - vendor_name = vendor_name.replace(",", "") - vendor_name = vendor_name.strip() - vendor.append(vendor_name) - - # Finding the Category - cat = bae[2].find('small').text - cat = cat.replace("Category: ", "") - cat = cat.replace(",", "") - cat = cat.strip() - category.append(cat) - - # Finding Number Sold and Quantity Left - num = bae[3].text - num = num.replace("Sold: ", "") - num = num.strip() - sold.append(num) - - quant = bae[4].find('small').text - quant = quant.replace("In stock: ", "") - quant = quant.strip() - qLeft.append(quant) - - # Finding Successful Transactions - freq = bae[1].text - freq = freq.replace(vendor_name, "") - freq = re.sub(r'Vendor Level \d+', "", freq) - freq = freq.replace("(", "") - freq = freq.replace(")", "") - freq = freq.strip() - success.append(freq) - # Searching for CVE and MS categories - cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if not cve: - cveValue="-1" - else: - cee = " " - for idx in cve: - cee += (idx) - cee += " " - cee = cee.replace(',', ' ') - cee = cee.replace('\n', '') - cveValue=cee - CVE.append(cveValue) - - ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) - if not ms: - MSValue="-1" - else: - me = " " - for im in ms: - me += (im) - me += " " - me = me.replace(',', ' ') - me = me.replace('\n', '') - MSValue=me - MS.append(MSValue) + try: + product_name = a.find('p', {'class': 'name'}).text + name.append(cleanString(product_name.strip())) + except: + print("product name") + + try: + prod = a.find('p', {'class': 'price'}).text # price + USD.append(cleanString(prod.strip())) + except: + print("USD") + + try: + ven = a.find('div', {'class': 'pc-footer'}).find('div').find('a').text # pc-footer + vendor.append(cleanString(ven.strip())) + # print(ven) + except: + print("vendor") + + try: + h = a.find('p', {'class': 'name'}).find('a').get('href') + href.append(h) + except: + print("in href") + + CVE.append("-1") + MS.append("-1") + rating_vendor.append("-1") + success.append("-1") + describe.append("-1") + views.append("-1") + reviews.append("-1") + rating_item.append("-1") + addDate.append("-1") + BTC.append("-1") + EURO.append("-1") + sold.append("-1") + qLeft.append("-1") + shipFrom.append("-1") + shipTo.append("-1") + 
category.append("Hacking") # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, diff --git a/MarketPlaces/TorMarket/crawler_selenium.py b/MarketPlaces/TorMarket/crawler_selenium.py index 35be864..0528a05 100644 --- a/MarketPlaces/TorMarket/crawler_selenium.py +++ b/MarketPlaces/TorMarket/crawler_selenium.py @@ -201,24 +201,23 @@ def crawlForum(driver): print("Crawling the TorMarket market") linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() - count = 0 i = 0 while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) try: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) - has_next_page = True + count = 0 + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + list = productPages(html) for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) @@ -234,7 +233,6 @@ def crawlForum(driver): # comment out if count == 1: - count = 0 break try: @@ -242,12 +240,6 @@ def crawlForum(driver): '/html/body/div[2]/div/div/div[1]/main/nav/ul/li[5]/a').get_attribute('href') if link == "": raise NoSuchElementException - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(html, link) count += 1 except NoSuchElementException: @@ -257,9 +249,6 @@ def crawlForum(driver): print(link, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - input("Crawling TorMarket forum done sucessfully. Press ENTER to continue\n") diff --git a/setup.ini b/setup.ini index c87990a..641d3f1 100644 --- a/setup.ini +++ b/setup.ini @@ -1,14 +1,15 @@ + [TOR] -firefox_binary_path = C:\Users\John Wick\Desktop\Tor Browser\Browser\firefox.exe -firefox_profile_path = C:\Users\John Wick\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default -geckodriver_path = C:\Users\John Wick\PycharmProjects\dw_pipeline_test\selenium\geckodriver.exe +firefox_binary_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe +firefox_profile_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\TorBrowser\\Data\\Browser\\profile.default +geckodriver_path = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test\\selenium\\geckodriver.exe [Project] -project_directory = C:\Users\John Wick\PycharmProjects\dw_pipeline_test -shared_folder = Z:\\VBoxSvr\\VM_Files_ (shared) +project_directory = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test +shared_folder = \\VBoxSvr\\Shared [PostgreSQL] ip = localhost username = postgres -password = postgres +password = password database = darkweb_markets_forums \ No newline at end of file