Browse Source

updated all forum crawlers

main
Helium 1 year ago
parent
commit
b3188f4716
12 changed files with 479 additions and 399 deletions
  1. +2
    -2
      Forums/Altenens/crawler_selenium.py
  2. +23
    -49
      Forums/Cardingleaks/crawler_selenium.py
  3. +25
    -45
      Forums/CryptBB/crawler_selenium.py
  4. +24
    -49
      Forums/HiddenAnswers/crawler_selenium.py
  5. +1
    -1
      Forums/Initialization/forumsList.txt
  6. +3
    -0
      Forums/Initialization/forums_mining.py
  7. +247
    -0
      Forums/Initialization/geckodriver.log
  8. +40
    -67
      Forums/Libre/crawler_selenium.py
  9. +28
    -53
      Forums/OnniForums/crawler_selenium.py
  10. +33
    -49
      Forums/Procrax/crawler_selenium.py
  11. +1
    -1
      MarketPlaces/Initialization/marketsList.txt
  12. +52
    -83
      MarketPlaces/M00nkeyMarket/parser.py

+ 2
- 2
Forums/Altenens/crawler_selenium.py View File

@ -199,7 +199,7 @@ def getInterestedLinks():
return links
# newest version of crawling
def crawlForum(driver):
print("Crawling the Altenens forum")
@ -233,7 +233,7 @@ def crawlForum(driver):
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, topic + f"page{counter}")
savePage(driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:


+ 23
- 49
Forums/Cardingleaks/crawler_selenium.py View File

@ -2,7 +2,7 @@ __author__ = 'DarkWeb'
'''
Cardingleaks Forum Crawler (Selenium)
FIXED
Crawler updated and fixed
'''
from selenium import webdriver
@ -207,67 +207,53 @@ def getInterestedLinks():
def crawlForum(driver):
print("Crawling the Cardingleaks forum")
print("Crawling the Cardingleaks forum")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
list = topicPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
#variable to check if there is a next page for the topic
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
# check if there is a next page for the topics
while has_next_topic_page:
# try to access next page of the topic
itemURL = urlparse.urljoin(baseURL, str(item))
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
savePage(driver.page_source, topic + f"page{counter}") # very important
# if there is a next page then go and save....
# Spec
try:
# temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div[1]/div[1]/div/nav/div[1]') # /html/body/div/div[2]/div/div[2]/div/
item = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div
# comment out
if counter == 2:
break
if item == "":
try:
page = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
if page == "":
raise NoSuchElementException
else:
counter += 1
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# end of loop
for i in range(counter):
driver.back()
@ -276,21 +262,12 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
# temp = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div/div[1]/div/nav/div[1]')
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -300,10 +277,7 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling Cardingleaks forum done successfully. Press ENTER to continue\n")
input("Crawling Cardingleaks forum done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link, may need to change for every website


+ 25
- 45
Forums/CryptBB/crawler_selenium.py View File

@ -238,65 +238,55 @@ def getInterestedLinks():
def crawlForum(driver):
print("Crawling the CryptBB forum")
print("Crawling the CryptBB forum")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
list = topicPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
#variable to check if there is a next page for the topic
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
# check if there is a next page for the topics
while has_next_topic_page:
# try to access next page of the topic
itemURL = urlparse.urljoin(baseURL, str(item))
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
savePage(driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
break
# if there is a next page then go and save....
# next page in the topic?
try:
temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') # /html/body/div/div[2]/div/div[2]/div/
item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div
temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div')
page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
if item == "":
if page == "":
raise NoSuchElementException
else:
counter += 1
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# end of loop
for i in range(counter):
driver.back()
@ -305,21 +295,14 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
temp = driver.find_element(by=By.XPATH, value = '/html/body/div/div[2]/div/div[2]/div')
temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div')
link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -329,10 +312,7 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling CryptBB forum done successfully. Press ENTER to continue\n")
input("Crawling CryptBB done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link, may need to change for every website


+ 24
- 49
Forums/HiddenAnswers/crawler_selenium.py View File

@ -179,86 +179,65 @@ def crawlForum(driver):
print("Crawling the HiddenAnswers forum")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
list = topicPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
'''
#variable to check if there is a next page for the topic
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
# check if there is a next page for the topics
while has_next_topic_page:
# try to access next page of the topic
itemURL = urlparse.urljoin(baseURL, str(item))
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
savePage(driver.page_source, topic + f"page{counter}") # very important
# if there is a next page then go and save....
# next page in the topic?
try:
temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') # /html/body/div/div[2]/div/div[2]/div/
item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div
# comment out
if counter == 2:
break
if item == "":
try:
page = ""  # no next page so far; the site may add pagination later on
if page == "":
raise NoSuchElementException
has_next_topic_page = False
else:
counter += 1
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# end of loop
for i in range(counter):
driver.back()
'''
# comment out
break
# comment out
if count == 1:
count = 0
break
try:
link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div/div[3]/div[3]/ul/li[7]/a').get_attribute('href')
link = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div/div[3]/div[3]/ul/li[7]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -268,11 +247,7 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling HiddenAnswers forum done sucessfully. Press ENTER to continue\n")
input("Crawling HiddenAnswers done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link


+ 1
- 1
Forums/Initialization/forumsList.txt View File

@ -1 +1 @@
Altenens
Procrax

+ 3
- 0
Forums/Initialization/forums_mining.py View File

@ -14,6 +14,7 @@ from Forums.Procrax.crawler_selenium import crawler as crawlerProcraxForum
from Forums.HiddenAnswers.crawler_selenium import crawler as crawlerHiddenAnswers
from Forums.Cardingleaks.crawler_selenium import crawler as crawlerCardingleaks
from Forums.Altenens.crawler_selenium import crawler as crawlerAltenens
from Forums.Libre.crawler_selenium import crawler as crawlerLibre
import configparser
import time
@ -119,6 +120,8 @@ if __name__ == '__main__':
crawlerCardingleaks()
elif forum == 'Altenens':
crawlerAltenens()
elif forum == 'Libre':
crawlerLibre()


+ 247
- 0
Forums/Initialization/geckodriver.log View File

@ -10951,3 +10951,250 @@ unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689363209615 geckodriver INFO Listening on 127.0.0.1:60532
1689363216981 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "60533" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofile278pEs"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689363219049 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:60533/devtools/browser/8c990d4b-44eb-425d-b226-b8d4c1cffc2d
1689363224682 Marionette INFO Listening on port 60540
1689363225068 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: , line 0: NotFoundError: No such JSWindowActor 'MarionetteEvents'
JavaScript error: , line 0: NotFoundError: No such JSWindowActor 'MarionetteEvents'
1689363820376 Marionette INFO Stopped listening on port 60540
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofile278pEs\thumbnails) because it does not exist
[Parent 5080, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
1689363820593 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:60789/devtools/browser/8539d316-2b33-4477-9e35-2f9e6eab09b6
1689363569998 Marionette INFO Listening on port 60796
1689363570244 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/, line 2: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 2: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 5: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 9: ReferenceError: use_xmlhttprequest is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86, line 3: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 6: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 19: ReferenceError: use_xmlhttprequest is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 25: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628&page=2, line 6: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628&page=2, line 19: ReferenceError: use_xmlhttprequest is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628&page=2, line 25: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 6: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 19: ReferenceError: use_xmlhttprequest is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 25: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86, line 3: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86&page=2, line 3: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=16778, line 6: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=16778, line 19: ReferenceError: use_xmlhttprequest is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=16778, line 25: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86&page=2, line 3: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86, line 3: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined
1689363752505 Marionette INFO Stopped listening on port 60796
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilecgBCTA\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
Crash Annotation GraphicsCriticalError: |[C0][GFX1-]: Receive IPC close with reason=AbnormalShutdown (t=1346.28)
###!!! [Child][MessageChannel] Error: (msgtype=0x3900E5,name=PContent::Msg_GraphicsError) Channel closing: too late to send/recv, messages will be lost
[GFX1-]: Receive IPC close with reason=AbnormalShutdown
1689363753315 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689364130030 geckodriver INFO Listening on 127.0.0.1:61129
1689364135033 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "61130" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileZXcPSi"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689364136375 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:61130/devtools/browser/d0a00e7f-efab-4092-ba43-3afb5ec55bcc
1689364140122 Marionette INFO Listening on port 61138
1689364140225 RemoteAgent WARN TLS certificate errors will be ignored for this session
1689364164357 Marionette INFO Stopped listening on port 61138
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileZXcPSi\thumbnails) because it does not exist
[Parent 5336, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
[Parent 5336, IPC I/O Parent] WARNING: pipe error: 232: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/chrome/common/ipc_channel_win.cc:544
1689364165253 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689364952139 geckodriver INFO Listening on 127.0.0.1:61327
1689364958550 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "61328" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileeX31Bg"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689364960322 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:61328/devtools/browser/d98ca77f-1ca8-49c2-b3d0-7c98e39d55e8
1689364964835 Marionette INFO Listening on port 61336
1689364965449 RemoteAgent WARN TLS certificate errors will be ignored for this session
1689365065931 Marionette INFO Stopped listening on port 61336
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileeX31Bg\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1689365066887 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689365596202 geckodriver INFO Listening on 127.0.0.1:61665
1689365603047 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "61666" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofilegVxGn8"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689365604946 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:61666/devtools/browser/3f945d28-11cd-436c-832e-2085f8bb57e1
1689365609901 Marionette INFO Listening on port 61676
1689365610315 RemoteAgent WARN TLS certificate errors will be ignored for this session
1689365827541 Marionette INFO Stopped listening on port 61676
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
[Parent 7204, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
1689365828066 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689366358424 geckodriver INFO Listening on 127.0.0.1:62059
1689366363521 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "62060" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileSRNF4S"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689366364862 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:62060/devtools/browser/38410e90-6408-4c6e-a78a-4d8c6dabe5f5
1689366368448 Marionette INFO Listening on port 62067
###!!! [Child][MessageChannel] Error: (msgtype=0x390097,name=PContent::Msg_InitBackground) Channel closing: too late to send/recv, messages will be lost
###!!! [Child][MessageChannel] Error: (msgtype=0x390097,name=PContent::Msg_InitBackground) Channel closing: too late to send/recv, messages will be lost
1689366368939 RemoteAgent WARN TLS certificate errors will be ignored for this session
1689366462907 Marionette INFO Stopped listening on port 62067
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
1689366464131 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138

+ 40
- 67
Forums/Libre/crawler_selenium.py View File

@ -62,16 +62,14 @@ def login(driver):
input('Press enter when CAPTCHA is completed, and you\'re at the login page')
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.NAME, value='login')
usernameBox = driver.find_element(by=By.NAME, value='username')
#Username here
usernameBox.send_keys('ct1234')#sends string to the username box
passwordBox = driver.find_element(by=By.NAME, value='password')
#Password here
passwordBox.send_keys('r5o0wqmw')# sends string to passwordBox
login = driver.find_element(by=By.CLASS_NAME, value='block-container')
login_link = login.find_element(by=By.TAG_NAME, value='button')
login_link.click()
input("Press the login button and solve the CAPTCHA then press enter\n")
# input('input')
@ -209,87 +207,65 @@ def crawlForum(driver):
print("Crawling the Libre forum")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
list = topicPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
#variable to check if there is a next page for the topic
# has_next_topic_page = True
# counter = 1
# # check if there is a next page for the topics
# while has_next_topic_page:
# # try to access next page of th topic
# itemURL = urlparse.urljoin(baseURL, str(item))
# try:
# driver.get(itemURL)
# except:
# driver.refresh()
# savePage(driver.page_source, item)
#
# # if there is a next page then go and save....
# # Spec
# try:
# # temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div[1]/div[1]/div/nav/div[1]') # /html/body/div/div[2]/div/div[2]/div/
# item = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div
#
# if item == "":
# raise NoSuchElementException
# else:
# counter += 1
#
# except NoSuchElementException:
# has_next_topic_page = False
#
# # end of loop
# for i in range(counter):
# driver.back()
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
while has_next_topic_page:
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
break
try:
page = "" # no next page so far may have some later on
if page == "":
raise NoSuchElementException
counter += 1
except NoSuchElementException:
has_next_topic_page = False
for i in range(counter):
driver.back()
# comment out
break
# comment out
if count == 1:
count = 0
break
try:
# temp = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div/div[1]/div/nav/div[1]')
link = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -299,10 +275,7 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling Libre forum done successfully. Press ENTER to continue\n")
input("Crawling Libre done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link, may need to change for every website


+ 28
- 53
Forums/OnniForums/crawler_selenium.py View File

@ -214,92 +214,71 @@ def getInterestedLinks():
def crawlForum(driver):
print("Crawling the OnniForums forum")
print("Crawling the OnniForums")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
list = topicPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
#next page for topic
# variable to check if there is a next page for the topic
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
# check if there is a next page for the topics
while has_next_topic_page:
# try to access next page of th topic
itemURL = urlparse.urljoin(baseURL, str(item))
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
savePage(driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
break
# if there is a next page then go and save....
# next page in the topic?
try:
temp = driver.find_element(By.XPATH,
'/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[2]/div/
item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute(
'href') # /html/body/div/div[2]/div/div[2]/div
temp = driver.find_element(By.XPATH,'/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[2]/div/
page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') # /html/body/div/div[2]/div/div[2]/div
if item == "":
if page == "":
raise NoSuchElementException
has_next_topic_page = False
else:
counter += 1
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# end of loop
for i in range(counter):
driver.back()
# comment out, one topic per page
# comment out
break
# comment out, go through all pages
# comment out
if count == 1:
count = 0
break
try:
temp = driver.find_element(by=By.XPATH, value=
'/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[3]/div
temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[3]/div
link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -309,11 +288,7 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling OnniForums forum done sucessfully. Press ENTER to continue\n")
input("Crawling OnniForums done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link


+ 33
- 49
Forums/Procrax/crawler_selenium.py View File

@ -202,83 +202,70 @@ def getInterestedLinks():
def crawlForum(driver):
print("Crawling the Procrax forum")
print("Crawling the Procrax")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)# open
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
#loop through the topics
while has_next_page:
list = topicPages(html)# for multiple pages
for item in list:
#variable to check if there is a next page for the topic
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
# check if there is a next page for the topics
while has_next_topic_page:
# try to access next page of th topic
itemURL = urlparse.urljoin(baseURL, str(item))
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
savePage(driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
break
# if there is a next page then go and save....
# specific
try:
# temp = driver.find_element(By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div/div')
item = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
if item == "":
if page == "":
raise NoSuchElementException
has_next_topic_page = False
else:
counter += 1
counter += 1
except NoSuchElementException:
has_next_topic_page = False
#end of loop
for i in range(counter):
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# count = 0
# break
try:# change depending on web page, #general
# /html/body/div[1]/div[3]/div[2]/div[3]/div/div/div/div[1]/div/nav/div[1]
# temp = driver.find_element(By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div/div/div/div[1]/div/nav/div[1]')
# comment out
break
# comment out
if count == 1:
break
try:
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -288,10 +275,7 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling Procrax forum done successfully. Press ENTER to continue\n")
input("Crawling Procrax done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link, may need to change for every website


+ 1
- 1
MarketPlaces/Initialization/marketsList.txt View File

@ -1 +1 @@
AnonymousMarketplace
M00nkeyMarket

+ 52
- 83
MarketPlaces/M00nkeyMarket/parser.py View File

@ -35,50 +35,33 @@ def m00nkey_description_parser(soup):
shipTo = "-1" # 18 Product_ShippedTo
#vendor name
try:
temp = soup.find('div', {'class': 'box rounded mb-0'}).find('a').text
vendor = (cleanString(temp.strip()))
except:
print("Error in vendor")
temp = soup.find('div', {'class': 'box rounded mb-0'}).find('a').text
vendor = (cleanString(temp.strip()))
#successful transaction
try:
temp = soup.findAll('div', {'class','text-center text-truncate column-flex ml-1 mr-1'}) #card sidebar-menu mb-4 card sidebar-menu mb-4
temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'})
temp = temp2[1].text
success = (cleanString(temp.strip()))
except:
print("Error in successful")
sucess = "-1"
temp = soup.findAll('div', {'class','text-center text-truncate column-flex ml-1 mr-1'}) #card sidebar-menu mb-4 card sidebar-menu mb-4
temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'})
temp = temp2[1].text
success = (cleanString(temp.strip()))
#vendor rating 5
try:
temp = soup.findAll('div', {'class', 'text-center text-truncate column-flex ml-1 mr-1'}) # card sidebar-menu mb-4 card sidebar-menu mb-4
temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'})
temp = temp2[5].text
rating_vendor = (cleanString(temp.strip()))
except:
print("Error in vendor rating")
rating_vendor = "-1"
temp = soup.findAll('div', {'class', 'text-center text-truncate column-flex ml-1 mr-1'}) # card sidebar-menu mb-4 card sidebar-menu mb-4
temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'})
temp = temp2[5].text
rating_vendor = (cleanString(temp.strip()))
# product name
try:
temp = soup.find('h3', {'class', 'h3 rounded card-title'}).find('span').text
name = (cleanString(temp.strip()))
except:
print("Error in product name")
name = "-1"
temp = soup.find('h3', {'class', 'h3 rounded card-title'}).find('span').text
name = (cleanString(temp.strip()))
# product description
try:
describe = soup.find('div', {'class': "box rounded flex-fill"}).find('pre').text
if "\n" in describe:
describe = describe.replace("\n", " ")
describe = describe.replace("\r", " ")
describe = cleanString(describe.strip())
except:
print("Product description")
describe = "-1"
describe = soup.find('div', {'class': "box rounded flex-fill"}).find('pre').text
if "\n" in describe:
describe = describe.replace("\n", " ")
describe = describe.replace("\r", " ")
describe = cleanString(describe.strip())
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much
@ -90,14 +73,10 @@ def m00nkey_description_parser(soup):
temp = temp2[1].text
category = cleanString(temp.strip())
except:
try:
temp = soup.find('table', {'class', 'table table-hover'})
temp2 = temp.find('tbody').find('tr').findAll('td')
temp = temp2[1].text
category = cleanString(temp.strip())
except:
print('Product category')
category = "-1"
temp = soup.find('table', {'class', 'table table-hover'})
temp2 = temp.find('tbody').find('tr').findAll('td')
temp = temp2[1].text
category = cleanString(temp.strip())
# product number of view
try:
@ -107,47 +86,35 @@ def m00nkey_description_parser(soup):
views = cleanString((temp.strip()))
except:
print('Product number of view')
views = "-1"
# views = "-1"
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
#BTC selling price box box-rounded mt-2
try:
temp = soup.find('div', {'class', 'box box-rounded mt-2'})
temp2 = temp.findAll('i', {'class', 'float-right color-prices'})
temp = temp2[1].text
BTC = cleanString((temp.strip()))
except:
print('Product BTC')
BTC = "-1"
temp = soup.find('div', {'class', 'box box-rounded mt-2'})
temp2 = temp.findAll('i', {'class', 'float-right color-prices'})
temp = temp2[1].text
BTC = cleanString((temp.strip()))
# USD selling price
try:
temp = soup.find('div', {'class', 'box box-rounded mt-2'})
temp2 = temp.findAll('center')
temp = temp2[1].find('i').text
if "$" in temp:
temp = temp.replace("$", "")
USD = cleanString((temp.strip()))
except:
print('Product USD')
USD = "-1"
temp = soup.find('div', {'class', 'box box-rounded mt-2'})
temp2 = temp.findAll('center')
temp = temp2[1].find('i').text
if "$" in temp:
temp = temp.replace("$", "")
USD = cleanString((temp.strip()))
EURO = "-1" # 14 Product_EURO_SellingPrice
# product sold
try:
temp = soup.find('div', {'class', 'box rounded mb-0'}) # card sidebar-menu mb-4 card sidebar-menu mb-4
temp2 = temp.find('i')
temp = temp2.text
sold = (cleanString(temp.strip()))
except:
print("Error in successful")
sold = "-1"
temp = soup.find('div', {'class', 'box rounded mb-0'}) # card sidebar-menu mb-4 card sidebar-menu mb-4
temp2 = temp.find('i')
temp = temp2.text
sold = (cleanString(temp.strip()))
# sold = "-1"
# product quantatiy left ###ERRROR
try:
@ -157,15 +124,12 @@ def m00nkey_description_parser(soup):
temp = temp3[1].text
left = cleanString(temp.strip())
except:
try:
temp = soup.find('table', {'class', 'table table-hover'})
temp2 = temp.findAll('tr')
temp3 = temp2[1].findAll('td')
temp = temp3[1].text
left = cleanString(temp.strip())
except:
print('Product quantity')
left = "-1"
temp = soup.find('table', {'class', 'table table-hover'})
temp2 = temp.findAll('tr')
temp3 = temp2[1].findAll('td')
temp = temp3[1].text
left = cleanString(temp.strip())
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
@ -229,20 +193,25 @@ def m00nkey_listing_parser(soup):
temp = a.find('col-5 justify-content-between mx-auto').find('div').text
success.append(cleanString(temp.strip()))
except:
print('vendor')
print('successful transactions')
# product name
try:
temp = a.find('card-title rounded text-truncate').find('a').text
name.append(cleanString(temp.strip()))
except:
print('vendor')
print('product name')
CVE.append('-1')
MS.append('-1')
rating_vendor.append("-1")
try:
temp = a.findAll('btn btn-block btn-primary')
except:
print("Error in product category")
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views


Loading…
Cancel
Save