
Merge branch 'main' into josh-development

# Conflicts:
#	MarketPlaces/Initialization/geckodriver.log
#	MarketPlaces/Initialization/marketsList.txt
#	MarketPlaces/Initialization/prepare_parser.py
#	setup.ini
main
Joshua 1 year ago
parent commit 60f15c7b13
89 changed files with 1188 additions and 1395 deletions
  1. +1 -0      .gitignore
  2. BIN        Forums/AbyssForum/__pycache__/crawler_selenium.cpython-310.pyc
  3. BIN        Forums/AbyssForum/__pycache__/crawler_selenium.cpython-311.pyc
  4. BIN        Forums/AbyssForum/__pycache__/parser.cpython-310.pyc
  5. BIN        Forums/AbyssForum/__pycache__/parser.cpython-311.pyc
  6. +22 -46    Forums/AbyssForum/crawler_selenium.py
  7. BIN        Forums/Altenens/__pycache__/crawler_selenium.cpython-310.pyc
  8. BIN        Forums/Altenens/__pycache__/crawler_selenium.cpython-311.pyc
  9. BIN        Forums/Altenens/__pycache__/parser.cpython-310.pyc
  10. BIN       Forums/Altenens/__pycache__/parser.cpython-311.pyc
  11. +2 -2     Forums/Altenens/crawler_selenium.py
  12. +23 -49   Forums/Cardingleaks/crawler_selenium.py
  13. BIN       Forums/CryptBB/__pycache__/__init__.cpython-311.pyc
  14. BIN       Forums/CryptBB/__pycache__/crawler_selenium.cpython-310.pyc
  15. BIN       Forums/CryptBB/__pycache__/crawler_selenium.cpython-311.pyc
  16. BIN       Forums/CryptBB/__pycache__/parser.cpython-310.pyc
  17. BIN       Forums/CryptBB/__pycache__/parser.cpython-311.pyc
  18. +25 -45   Forums/CryptBB/crawler_selenium.py
  19. BIN       Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-310.pyc
  20. BIN       Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-311.pyc
  21. BIN       Forums/HiddenAnswers/__pycache__/parser.cpython-310.pyc
  22. BIN       Forums/HiddenAnswers/__pycache__/parser.cpython-311.pyc
  23. +24 -49   Forums/HiddenAnswers/crawler_selenium.py
  24. BIN       Forums/Initialization/__pycache__/__init__.cpython-310.pyc
  25. BIN       Forums/Initialization/__pycache__/__init__.cpython-311.pyc
  26. BIN       Forums/Initialization/__pycache__/forums_mining.cpython-310.pyc
  27. BIN       Forums/Initialization/__pycache__/forums_mining.cpython-311.pyc
  28. BIN       Forums/Initialization/__pycache__/prepare_parser.cpython-310.pyc
  29. BIN       Forums/Initialization/__pycache__/prepare_parser.cpython-311.pyc
  30. +1 -1     Forums/Initialization/forumsList.txt
  31. +6 -3     Forums/Initialization/forums_mining.py
  32. +324 -0   Forums/Initialization/geckodriver.log
  33. +5 -0     Forums/Initialization/prepare_parser.py
  34. +40 -67   Forums/Libre/crawler_selenium.py
  35. BIN       Forums/OnniForums/__pycache__/crawler_selenium.cpython-310.pyc
  36. BIN       Forums/OnniForums/__pycache__/crawler_selenium.cpython-311.pyc
  37. BIN       Forums/OnniForums/__pycache__/parser.cpython-310.pyc
  38. BIN       Forums/OnniForums/__pycache__/parser.cpython-311.pyc
  39. BIN       Forums/OnniForums/__pycache__/parser_script.cpython-311.pyc
  40. +28 -53   Forums/OnniForums/crawler_selenium.py
  41. +55 -68   Forums/Procrax/crawler_selenium.py
  42. +73 -194  Forums/Procrax/parser.py
  43. BIN       MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-310.pyc
  44. BIN       MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc
  45. BIN       MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-310.pyc
  46. BIN       MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc
  47. +27 -37   MarketPlaces/AnonymousMarketplace/crawler_selenium.py
  48. +40 -15   MarketPlaces/AnonymousMarketplace/parser.py
  49. +11 -22   MarketPlaces/Apocalypse/crawler_selenium.py
  50. +11 -22   MarketPlaces/BlackPyramid/crawler_selenium.py
  51. +9 -20    MarketPlaces/CityMarket/crawler_selenium.py
  52. +9 -20    MarketPlaces/CypherMarketplace/crawler_selenium.py
  53. BIN       MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc
  54. +1 -0     MarketPlaces/DB_Connection/db_connection.py
  55. +21 -23   MarketPlaces/DarkFox/crawler_selenium.py
  56. +9 -21    MarketPlaces/DarkMatter/crawler_selenium.py
  57. +11 -22   MarketPlaces/DarkTor/crawler_selenium.py
  58. +9 -20    MarketPlaces/DigitalThriftShop/crawler_selenium.py
  59. BIN       MarketPlaces/Initialization/__pycache__/__init__.cpython-310.pyc
  60. BIN       MarketPlaces/Initialization/__pycache__/__init__.cpython-311.pyc
  61. BIN       MarketPlaces/Initialization/__pycache__/markets_mining.cpython-310.pyc
  62. BIN       MarketPlaces/Initialization/__pycache__/markets_mining.cpython-311.pyc
  63. BIN       MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-310.pyc
  64. BIN       MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc
  65. +15 -7    MarketPlaces/Initialization/prepare_parser.py
  66. +9 -20    MarketPlaces/LionMarketplace/crawler_selenium.py
  67. BIN       MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-310.pyc
  68. BIN       MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc
  69. BIN       MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-310.pyc
  70. BIN       MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc
  71. +24 -38   MarketPlaces/M00nkeyMarket/crawler_selenium.py
  72. +198 -239 MarketPlaces/M00nkeyMarket/parser.py
  73. +12 -23   MarketPlaces/MikesGrandStore/crawler_selenium.py
  74. BIN       MarketPlaces/ThiefWorld/__pycache__/crawler_selenium.cpython-311.pyc
  75. BIN       MarketPlaces/ThiefWorld/__pycache__/parser.cpython-311.pyc
  76. +10 -21   MarketPlaces/ThiefWorld/crawler_selenium.py
  77. BIN       MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-310.pyc
  78. BIN       MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-311.pyc
  79. BIN       MarketPlaces/Tor2door/__pycache__/parser.cpython-310.pyc
  80. BIN       MarketPlaces/Tor2door/__pycache__/parser.cpython-311.pyc
  81. +10 -22   MarketPlaces/Tor2door/crawler_selenium.py
  82. BIN       MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-310.pyc
  83. BIN       MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc
  84. BIN       MarketPlaces/TorBay/__pycache__/parser.cpython-310.pyc
  85. BIN       MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc
  86. +27 -38   MarketPlaces/TorBay/crawler_selenium.py
  87. +80 -162  MarketPlaces/TorBay/parser.py
  88. +9 -20    MarketPlaces/TorMarket/crawler_selenium.py
  89. +7 -6     setup.ini

+ 1
- 0
.gitignore View File

@@ -2,6 +2,7 @@
/shelf/
.idea/workspace.xml
selenium/geckodriver.exe
__pycache__
setup.ini
*.html
*.log

BIN
Forums/AbyssForum/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
Forums/AbyssForum/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
Forums/AbyssForum/__pycache__/parser.cpython-310.pyc View File


BIN
Forums/AbyssForum/__pycache__/parser.cpython-311.pyc View File


+ 22
- 46
Forums/AbyssForum/crawler_selenium.py View File

@@ -191,86 +191,66 @@ def crawlForum(driver):
print("Crawling the AbyssForum forum")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
list = topicPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
'''
#variable to check if there is a next page for the topic
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
# check if there is a next page for the topics
while has_next_topic_page:
# try to access next page of th topic
itemURL = urlparse.urljoin(baseURL, str(item))
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
savePage(driver.page_source, topic + f"page{counter}")
# if there is a next page then go and save....
# next page in the topic?
try:
temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') # /html/body/div/div[2]/div/div[2]/div/
item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div
# comment out
if counter == 2:
break
try:
temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/div[3]')
item = temp.find_element(by=By.CLASS_NAME, value='button button-icon-only').get_attribute('href')
if item == "":
raise NoSuchElementException
has_next_topic_page = False
else:
counter += 1
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# end of loop
for i in range(counter):
driver.back()
'''
# comment out
break
# comment out
if count == 1:
count = 0
break
try:
link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -280,10 +260,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling AbyssForum forum done sucessfully. Press ENTER to continue\n")


BIN
Forums/Altenens/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
Forums/Altenens/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
Forums/Altenens/__pycache__/parser.cpython-310.pyc View File


BIN
Forums/Altenens/__pycache__/parser.cpython-311.pyc View File


+ 2
- 2
Forums/Altenens/crawler_selenium.py View File

@@ -199,7 +199,7 @@ def getInterestedLinks():
return links
# newest version of crawling
def crawlForum(driver):
print("Crawling the Altenens forum")
@@ -233,7 +233,7 @@ def crawlForum(driver):
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, topic + f"page{counter}")
savePage(driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:


+ 23
- 49
Forums/Cardingleaks/crawler_selenium.py View File

@@ -2,7 +2,7 @@ __author__ = 'DarkWeb'
'''
Cardingleaks Forum Crawler (Selenium)
FIXED
Crawler updated and fixed
'''
from selenium import webdriver
@@ -207,67 +207,53 @@
def crawlForum(driver):
print("Crawling the Cardingleaks forum")
print("Crawling the Cardinglinks forum")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
list = topicPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
#variable to check if there is a next page for the topic
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
# check if there is a next page for the topics
while has_next_topic_page:
# try to access next page of th topic
itemURL = urlparse.urljoin(baseURL, str(item))
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
savePage(driver.page_source, topic + f"page{counter}") # very important
# if there is a next page then go and save....
# Spec
try:
# temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div[1]/div[1]/div/nav/div[1]') # /html/body/div/div[2]/div/div[2]/div/
item = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div
# comment out
if counter == 2:
break
if item == "":
try:
page = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
if page == "":
raise NoSuchElementException
else:
counter += 1
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# end of loop
for i in range(counter):
driver.back()
@@ -276,21 +262,12 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
# temp = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div/div[1]/div/nav/div[1]')
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -300,10 +277,7 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling Cardingleaks forum done successfully. Press ENTER to continue\n")
input("Crawling Cardingleaksforum done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link, may need to change for every website
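
In the Cardingleaks hunk the hard-coded XPath lookups are replaced by Selenium's LINK_TEXT locator, for both the per-topic pager and the listing pager. A minimal, hedged sketch of that probe (next_page_href is an illustrative name, not a function in the repository):

from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def next_page_href(driver, link_text='Next'):
    # Return the href of the 'Next' pager link, or None when the forum has no next page.
    try:
        href = driver.find_element(By.LINK_TEXT, link_text).get_attribute('href')
        return href or None
    except NoSuchElementException:
        return None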


BIN
Forums/CryptBB/__pycache__/__init__.cpython-311.pyc View File


BIN
Forums/CryptBB/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
Forums/CryptBB/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
Forums/CryptBB/__pycache__/parser.cpython-310.pyc View File


BIN
Forums/CryptBB/__pycache__/parser.cpython-311.pyc View File


+ 25
- 45
Forums/CryptBB/crawler_selenium.py View File

@@ -238,65 +238,55 @@
def crawlForum(driver):
print("Crawling the CryptBB forum")
print("Crawling the CryptBB forum")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
list = topicPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
#variable to check if there is a next page for the topic
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
# check if there is a next page for the topics
while has_next_topic_page:
# try to access next page of th topic
itemURL = urlparse.urljoin(baseURL, str(item))
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
savePage(driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
break
# if there is a next page then go and save....
# next page in the topic?
try:
temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') # /html/body/div/div[2]/div/div[2]/div/
item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div
temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div')
page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
if item == "":
if page == "":
raise NoSuchElementException
else:
counter += 1
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# end of loop
for i in range(counter):
driver.back()
@@ -305,21 +295,14 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
temp = driver.find_element(by=By.XPATH, value = '/html/body/div/div[2]/div/div[2]/div')
temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div')
link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -329,10 +312,7 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling CryptBB forum done successfully. Press ENTER to continue\n")
input("Crawling CrypttBB done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link, may need to change for every website
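
CryptBB keeps its container-XPath plus CLASS_NAME lookup; the hunk mostly renames item to page so the topic loop and the listing loop use the same variable. A sketch of that lookup, reusing the pager XPath shown in the diff (the helper name is illustrative):

from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

PAGER_XPATH = '/html/body/div/div[2]/div/div[2]/div'  # pager container used in the diff

def cryptbb_next_href(driver):
    # Return the href of the pagination_next link inside the pager, or None if absent/empty.
    try:
        pager = driver.find_element(By.XPATH, PAGER_XPATH)
        href = pager.find_element(By.CLASS_NAME, 'pagination_next').get_attribute('href')
        return href or None
    except NoSuchElementException:
        return None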


BIN
Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
Forums/HiddenAnswers/__pycache__/parser.cpython-310.pyc View File


BIN
Forums/HiddenAnswers/__pycache__/parser.cpython-311.pyc View File


+ 24
- 49
Forums/HiddenAnswers/crawler_selenium.py View File

@@ -179,86 +179,65 @@ def crawlForum(driver):
print("Crawling the HiddenAnswers forum")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
list = topicPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
'''
#variable to check if there is a next page for the topic
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
# check if there is a next page for the topics
while has_next_topic_page:
# try to access next page of th topic
itemURL = urlparse.urljoin(baseURL, str(item))
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
savePage(driver.page_source, topic + f"page{counter}") # very important
# if there is a next page then go and save....
# next page in the topic?
try:
temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') # /html/body/div/div[2]/div/div[2]/div/
item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div
# comment out
if counter == 2:
break
if item == "":
try:
page = "" # no next page so far may have some later on
if page == "":
raise NoSuchElementException
has_next_topic_page = False
else:
counter += 1
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# end of loop
for i in range(counter):
driver.back()
'''
# comment out
break
# comment out
if count == 1:
count = 0
break
try:
link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div/div[3]/div[3]/ul/li[7]/a').get_attribute('href')
link = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div/div[3]/div[3]/ul/li[7]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -268,11 +247,7 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling HiddenAnswers forum done sucessfully. Press ENTER to continue\n")
input("Crawling HiddenAnswers done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link


BIN
Forums/Initialization/__pycache__/__init__.cpython-310.pyc View File


BIN
Forums/Initialization/__pycache__/__init__.cpython-311.pyc View File


BIN
Forums/Initialization/__pycache__/forums_mining.cpython-310.pyc View File


BIN
Forums/Initialization/__pycache__/forums_mining.cpython-311.pyc View File


BIN
Forums/Initialization/__pycache__/prepare_parser.cpython-310.pyc View File


BIN
Forums/Initialization/__pycache__/prepare_parser.cpython-311.pyc View File


+ 1
- 1
Forums/Initialization/forumsList.txt View File

@@ -1 +1 @@
Altenens
Procrax

+ 6
- 3
Forums/Initialization/forums_mining.py View File

@@ -14,6 +14,7 @@ from Forums.Procrax.crawler_selenium import crawler as crawlerProcraxForum
from Forums.HiddenAnswers.crawler_selenium import crawler as crawlerHiddenAnswers
from Forums.Cardingleaks.crawler_selenium import crawler as crawlerCardingleaks
from Forums.Altenens.crawler_selenium import crawler as crawlerAltenens
from Forums.Libre.crawler_selenium import crawler as crawlerLibre
import configparser
import time
@@ -98,9 +99,9 @@ if __name__ == '__main__':
forum = forum.replace('\n','')
print("Creating listing and description directories ... for " + forum)
createDirectory(forum)
time.sleep(5) # wait for directories to be created
input("Directories created successfully. Press ENTER to continue\n")
# createDirectory(forum)
# time.sleep(5) # wait for directories to be created
# input("Directories created successfully. Press ENTER to continue\n")
if forum == "BestCardingWorld":
@@ -119,6 +120,8 @@
crawlerCardingleaks()
elif forum == 'Altenens':
crawlerAltenens()
elif forum == 'Libre':
crawlerLibre()
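
The forums_mining.py hunk imports the Libre crawler and adds it to the same name-based dispatch used for the other forums, while the directory-creation calls are commented out. A rough sketch of how forumsList.txt drives that dispatch; the dict form below is only an illustration of the elif chain, and the file path is assumed from the repository layout.

from Forums.Cardingleaks.crawler_selenium import crawler as crawlerCardingleaks
from Forums.Altenens.crawler_selenium import crawler as crawlerAltenens
from Forums.Libre.crawler_selenium import crawler as crawlerLibre  # added by this merge

CRAWLERS = {
    'Cardingleaks': crawlerCardingleaks,
    'Altenens': crawlerAltenens,
    'Libre': crawlerLibre,
}

if __name__ == '__main__':
    # assumed path; forums_mining.py reads the list from its own Initialization directory
    with open('Forums/Initialization/forumsList.txt') as f:
        for forum in (line.replace('\n', '') for line in f):
            if forum in CRAWLERS:
                CRAWLERS[forum]()

With forumsList.txt now containing "Procrax" (see the hunk above), whichever crawler matches that name is the one launched on the next run.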


+ 324
- 0
Forums/Initialization/geckodriver.log View File

@@ -10951,3 +10951,327 @@ unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689363209615 geckodriver INFO Listening on 127.0.0.1:60532
1689363216981 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "60533" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofile278pEs"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689363219049 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:60533/devtools/browser/8c990d4b-44eb-425d-b226-b8d4c1cffc2d
1689363224682 Marionette INFO Listening on port 60540
1689363225068 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: , line 0: NotFoundError: No such JSWindowActor 'MarionetteEvents'
JavaScript error: , line 0: NotFoundError: No such JSWindowActor 'MarionetteEvents'
1689363820376 Marionette INFO Stopped listening on port 60540
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofile278pEs\thumbnails) because it does not exist
[Parent 5080, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
1689363820593 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:60789/devtools/browser/8539d316-2b33-4477-9e35-2f9e6eab09b6
1689363569998 Marionette INFO Listening on port 60796
1689363570244 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/, line 2: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 2: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 5: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 9: ReferenceError: use_xmlhttprequest is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86, line 3: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 6: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 19: ReferenceError: use_xmlhttprequest is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 25: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628&page=2, line 6: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628&page=2, line 19: ReferenceError: use_xmlhttprequest is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628&page=2, line 25: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 6: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 19: ReferenceError: use_xmlhttprequest is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 25: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86, line 3: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86&page=2, line 3: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=16778, line 6: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=16778, line 19: ReferenceError: use_xmlhttprequest is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=16778, line 25: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86&page=2, line 3: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86, line 3: ReferenceError: lang is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined
1689363752505 Marionette INFO Stopped listening on port 60796
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilecgBCTA\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
Crash Annotation GraphicsCriticalError: |[C0][GFX1-]: Receive IPC close with reason=AbnormalShutdown (t=1346.28)
###!!! [Child][MessageChannel] Error: (msgtype=0x3900E5,name=PContent::Msg_GraphicsError) Channel closing: too late to send/recv, messages will be lost
[GFX1-]: Receive IPC close with reason=AbnormalShutdown
1689363753315 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689364130030 geckodriver INFO Listening on 127.0.0.1:61129
1689364135033 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "61130" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileZXcPSi"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689364136375 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:61130/devtools/browser/d0a00e7f-efab-4092-ba43-3afb5ec55bcc
1689364140122 Marionette INFO Listening on port 61138
1689364140225 RemoteAgent WARN TLS certificate errors will be ignored for this session
1689364164357 Marionette INFO Stopped listening on port 61138
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileZXcPSi\thumbnails) because it does not exist
[Parent 5336, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
[Parent 5336, IPC I/O Parent] WARNING: pipe error: 232: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/chrome/common/ipc_channel_win.cc:544
1689364165253 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689364952139 geckodriver INFO Listening on 127.0.0.1:61327
1689364958550 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "61328" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileeX31Bg"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689364960322 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:61328/devtools/browser/d98ca77f-1ca8-49c2-b3d0-7c98e39d55e8
1689364964835 Marionette INFO Listening on port 61336
1689364965449 RemoteAgent WARN TLS certificate errors will be ignored for this session
1689365065931 Marionette INFO Stopped listening on port 61336
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileeX31Bg\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1689365066887 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689365596202 geckodriver INFO Listening on 127.0.0.1:61665
1689365603047 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "61666" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofilegVxGn8"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689365604946 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:61666/devtools/browser/3f945d28-11cd-436c-832e-2085f8bb57e1
1689365609901 Marionette INFO Listening on port 61676
1689365610315 RemoteAgent WARN TLS certificate errors will be ignored for this session
1689365827541 Marionette INFO Stopped listening on port 61676
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
[Parent 7204, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
1689365828066 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689366358424 geckodriver INFO Listening on 127.0.0.1:62059
1689366363521 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "62060" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileSRNF4S"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689366364862 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:62060/devtools/browser/38410e90-6408-4c6e-a78a-4d8c6dabe5f5
1689366368448 Marionette INFO Listening on port 62067
###!!! [Child][MessageChannel] Error: (msgtype=0x390097,name=PContent::Msg_InitBackground) Channel closing: too late to send/recv, messages will be lost
###!!! [Child][MessageChannel] Error: (msgtype=0x390097,name=PContent::Msg_InitBackground) Channel closing: too late to send/recv, messages will be lost
1689366368939 RemoteAgent WARN TLS certificate errors will be ignored for this session
1689366462907 Marionette INFO Stopped listening on port 62067
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
1689366464131 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689622469580 geckodriver INFO Listening on 127.0.0.1:58866
1689622474728 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "58867" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofile5gOLDP"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689622475417 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:58867/devtools/browser/9a3a8de2-439e-425e-b415-f975abd86b65
1689622476941 Marionette INFO Listening on port 58873
1689622477054 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofile5gOLDP\thumbnails) because it does not exist
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: undefined, line 0: Error: Missing host permission for the tab
JavaScript error: undefined, line 0: Error: Missing host permission for the tab
1689624030995 Marionette INFO Stopped listening on port 58873
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofile5gOLDP\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1689624031467 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689624276336 geckodriver INFO Listening on 127.0.0.1:59792
1689624280979 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "59793" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofileSTe5EC"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689624281509 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:59793/devtools/browser/222a61fa-a958-4978-8048-bb632f658131
1689624283001 Marionette INFO Listening on port 59799
1689624283405 RemoteAgent WARN TLS certificate errors will be ignored for this session
1689624692072 Marionette INFO Stopped listening on port 59799
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofileSTe5EC\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1689624692916 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138

+ 5
- 0
Forums/Initialization/prepare_parser.py View File

@@ -9,6 +9,7 @@ from Forums.BestCardingWorld.parser import *
from Forums.CryptBB.parser import *
from Forums.OnniForums.parser import *
from Forums.Altenens.parser import *
from Forums.Procrax.parser import *
from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@@ -154,6 +155,8 @@ def new_parse(forum, url, createLog):
rmm = onniForums_description_parser(soup)
elif forum == "Altenens":
rmm = altenens_description_parser(soup)
elif forum == "Procrax":
rmm = procrax_description_parser(soup)
# key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
key = u"Url:" + os.path.basename(line2).replace(".html", "")
@@ -233,6 +236,8 @@ def new_parse(forum, url, createLog):
rw = onniForums_listing_parser(soup)
elif forum == "Altenens":
rw = altenens_listing_parser(soup)
elif forum == "Procrax":
rw = procrax_listing_parser(soup)
except:
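
prepare_parser.py gains the matching Procrax branches: one for description pages (rmm) and one for listing pages (rw). A minimal sketch of that per-forum selection; soup is a BeautifulSoup of a saved page, the parser names come from the star-imports in the hunk, and importing them by name here is an assumption.

from Forums.Altenens.parser import altenens_description_parser, altenens_listing_parser
from Forums.Procrax.parser import procrax_description_parser, procrax_listing_parser

def parse_page(forum, soup, listing):
    # Mirrors the elif chains in new_parse(): pick the per-forum parser for this page type.
    if forum == "Altenens":
        return altenens_listing_parser(soup) if listing else altenens_description_parser(soup)
    elif forum == "Procrax":
        return procrax_listing_parser(soup) if listing else procrax_description_parser(soup)
    raise ValueError(f"no parser registered for forum {forum}")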


+ 40
- 67
Forums/Libre/crawler_selenium.py View File

@@ -62,16 +62,14 @@ def login(driver):
input('Press enter when CAPTCHA is completed, and you\'re at the login page')
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.NAME, value='login')
usernameBox = driver.find_element(by=By.NAME, value='username')
#Username here
usernameBox.send_keys('ct1234')#sends string to the username box
passwordBox = driver.find_element(by=By.NAME, value='password')
#Password here
passwordBox.send_keys('r5o0wqmw')# sends string to passwordBox
login = driver.find_element(by=By.CLASS_NAME, value='block-container')
login_link = login.find_element(by=By.TAG_NAME, value='button')
login_link.click()
input("Press the login button and solve the CAPTCHA then press enter\n")
# input('input')
@@ -209,87 +207,65 @@ def crawlForum(driver):
print("Crawling the Libre forum")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
list = topicPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
#variable to check if there is a next page for the topic
# has_next_topic_page = True
# counter = 1
# # check if there is a next page for the topics
# while has_next_topic_page:
# # try to access next page of th topic
# itemURL = urlparse.urljoin(baseURL, str(item))
# try:
# driver.get(itemURL)
# except:
# driver.refresh()
# savePage(driver.page_source, item)
#
# # if there is a next page then go and save....
# # Spec
# try:
# # temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div[1]/div[1]/div/nav/div[1]') # /html/body/div/div[2]/div/div[2]/div/
# item = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div
#
# if item == "":
# raise NoSuchElementException
# else:
# counter += 1
#
# except NoSuchElementException:
# has_next_topic_page = False
#
# # end of loop
# for i in range(counter):
# driver.back()
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
while has_next_topic_page:
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
break
try:
page = "" # no next page so far may have some later on
if page == "":
raise NoSuchElementException
counter += 1
except NoSuchElementException:
has_next_topic_page = False
for i in range(counter):
driver.back()
# comment out
break
# comment out
if count == 1:
count = 0
break
try:
# temp = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div/div[1]/div/nav/div[1]')
link = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -299,10 +275,7 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling Libre forum done successfully. Press ENTER to continue\n")
input("Crawling Libre done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link, may need to change for every website


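For readability, here is a minimal sketch of the per-topic pagination loop this rewrite introduces (OnniForums and Procrax below follow the same shape); it assumes an already-logged-in Selenium driver and a savePage-style callback, and the next-page selector differs per forum (Libre still has a placeholder, OnniForums uses the pagination_next class, Procrax uses the 'Next' link text), so it is a parameter here. Each page of a topic is saved as "<topic>page<counter>" and the next-page link is followed until it disappears.

    from urllib.parse import urljoin
    from selenium.webdriver.common.by import By
    from selenium.common.exceptions import NoSuchElementException

    def crawl_topic(driver, base_url, topic, save_page, next_link_text='Next'):
        counter = 1
        page = topic
        while True:
            driver.get(urljoin(base_url, str(page)))
            save_page(driver.page_source, f"{topic}page{counter}")   # one saved file per topic page
            try:
                page = driver.find_element(By.LINK_TEXT, next_link_text).get_attribute('href')
                if not page:                      # treat an empty href as "no next page"
                    raise NoSuchElementException
                counter += 1
            except NoSuchElementException:
                break
        for _ in range(counter):                  # walk back out to the listing page
            driver.back()
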
BIN
Forums/OnniForums/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
Forums/OnniForums/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
Forums/OnniForums/__pycache__/parser.cpython-310.pyc View File


BIN
Forums/OnniForums/__pycache__/parser.cpython-311.pyc View File


BIN
Forums/OnniForums/__pycache__/parser_script.cpython-311.pyc View File


+ 28
- 53
Forums/OnniForums/crawler_selenium.py View File

@ -214,92 +214,71 @@ def getInterestedLinks():
def crawlForum(driver):
print("Crawling the OnniForums forum")
print("Crawling the OnniForums")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
list = topicPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
#next page for topic
# variable to check if there is a next page for the topic
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
# check if there is a next page for the topics
while has_next_topic_page:
# try to access the next page of the topic
itemURL = urlparse.urljoin(baseURL, str(item))
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
savePage(driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
break
# if there is a next page then go and save....
# next page in the topic?
try:
temp = driver.find_element(By.XPATH,
'/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[2]/div/
item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute(
'href') # /html/body/div/div[2]/div/div[2]/div
temp = driver.find_element(By.XPATH,'/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[2]/div/
page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') # /html/body/div/div[2]/div/div[2]/div
if item == "":
if page == "":
raise NoSuchElementException
has_next_topic_page = False
else:
counter += 1
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# end of loop
for i in range(counter):
driver.back()
# comment out, one topic per page
# comment out
break
# comment out, go through all pages
# comment out
if count == 1:
count = 0
break
try:
temp = driver.find_element(by=By.XPATH, value=
'/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[3]/div
temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[3]/div
link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -309,11 +288,7 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling OnniForums forum done sucessfully. Press ENTER to continue\n")
input("Crawling OnniForums done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link


+ 55
- 68
Forums/Procrax/crawler_selenium.py View File

@ -26,24 +26,28 @@ from Forums.Procrax.parser import procrax_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'https://procrax.cx/'
BASE_URL = 'https://procrax.cx/'
FORUM_NAME = 'Procrax'
# Opens Tor Browser, crawls the website
def startCrawling():
opentor()
# forumName = getForumName()
driver = getAccess()
# opentor()
# driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
# new_parse(forumName, False)
new_parse(
forum=FORUM_NAME,
url=BASE_URL,
createLog=False
)
# Opens Tor Browser
@ -139,10 +143,9 @@ def createFFDriver():
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)# open url in browser
driver.get(BASE_URL)# open url in browser
return driver
except:
driver.close()# close tab
@ -162,7 +165,7 @@ def savePage(page, url):
def getFullPathName(url):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + FORUM_NAME + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
@ -185,100 +188,87 @@ def getInterestedLinks():
links = []
# # general hacking
# links.append('https://procrax.cx/forums/general-hacking.24/')
links.append('https://procrax.cx/forums/general-hacking.24/')
# # hacking security tools
# links.append('https://procrax.cx/forums/hacking-security-tools.20/')
links.append('https://procrax.cx/forums/hacking-security-tools.20/')
# # hacktube
# links.append('https://procrax.cx/forums/hacktube.22/')
links.append('https://procrax.cx/forums/hacktube.22/')
# # cardable
# links.append('https://procrax.cx/forums/cardable-websites.28/')
# # tools
# links.append('https://procrax.cx/forums/tools-bots-validators.73/')
# general forum
links.append('https://procrax.cx/forums/forum-discussions-updates.7/')
# links.append('https://procrax.cx/forums/forum-discussions-updates.7/')
return links
def crawlForum(driver):
print("Crawling the Procrax forum")
print("Crawling the Procrax")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)# open
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
#loop through the topics
while has_next_page:
list = topicPages(html)# for multiple pages
for item in list:
#variable to check if there is a next page for the topic
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
# check if there is a next page for the topics
while has_next_topic_page:
# try to access the next page of the topic
itemURL = urlparse.urljoin(baseURL, str(item))
itemURL = urlparse.urljoin(BASE_URL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
savePage(driver.page_source, topic + f"page{counter}") # very important
# comment out
# if counter == 2:
# break
# if there is a next page then go and save....
# specific
try:
# temp = driver.find_element(By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div/div')
item = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
if item == "":
if page == "":
raise NoSuchElementException
has_next_topic_page = False
else:
counter += 1
counter += 1
except NoSuchElementException:
has_next_topic_page = False
#end of loop
for i in range(counter):
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# count = 0
# break
try:# change depending on web page, #general
# /html/body/div[1]/div[3]/div[2]/div[3]/div/div/div/div[1]/div/nav/div[1]
# temp = driver.find_element(By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div/div/div/div[1]/div/nav/div[1]')
# comment out
# break
# comment out
if count == 20:
break
try:
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -288,10 +278,7 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling Procrax forum done successfully. Press ENTER to continue\n")
input("Crawling Procrax done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link, may need to change for every website


+ 73
- 194
Forums/Procrax/parser.py View File

@ -7,11 +7,12 @@ from datetime import timedelta
import re
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, ResultSet, Tag
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def cryptBB_description_parser(soup):
def procrax_description_parser(soup: Tag):
# Fields to be parsed
@ -27,146 +28,36 @@ def cryptBB_description_parser(soup):
# Finding the topic (should be just one coming from the Listing Page)
li = soup.find("td", {"class": "thead"}).find('strong')
li = soup.find("h1", {"class": "p-title-value"})
topic = li.text
topic = re.sub("\[\w*\]", '', topic)
topic = topic.replace(",","")
topic = topic.replace("\n","")
topic = cleanString(topic.strip())
# Finding the repeated tag that corresponds to the listing of posts
# try:
posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
'div', {"class": "post"})
# For each message (post), get all the fields we are interested to:
for ipost in posts:
# Finding a first level of the HTML page
post_wrapper = ipost.find('span', {"class": "largetext"})
# Finding the author (user) of the post
author = post_wrapper.text.strip()
user.append(cleanString(author)) # Remember to clean the problematic characters
# Finding the status of the author
smalltext = ipost.find('div', {"class": "post_author"})
'''
# Testing here two possibilities to find this status and combine them
if ipost.find('div', {"class": "deleted_post_author"}):
status.append(-1)
interest.append(-1)
reputation.append(-1)
addDate.append(-1)
post.append("THIS POST HAS BEEN REMOVED!")
sign.append(-1)
feedback.append(-1)
continue
'''
# CryptBB does have membergroup and postgroup
membergroup = smalltext.find('div', {"class": "profile-rank"})
postgroup = smalltext.find('div', {"class": "postgroup"})
if membergroup != None:
membergroup = membergroup.text.strip()
if postgroup != None:
postgroup = postgroup.text.strip()
membergroup = membergroup + " - " + postgroup
else:
if postgroup != None:
membergroup = postgroup.text.strip()
else:
membergroup = "-1"
status.append(cleanString(membergroup))
# Finding the interest of the author
# CryptBB does not have blurb
blurb = smalltext.find('li', {"class": "blurb"})
if blurb != None:
blurb = blurb.text.strip()
else:
blurb = "-1"
interest.append(cleanString(blurb))
# Finding the reputation of the user
# CryptBB does have reputation
author_stats = smalltext.find('div', {"class": "author_statistics"})
karma = author_stats.find('strong')
if karma != None:
karma = karma.text
karma = karma.replace("Community Rating: ", "")
karma = karma.replace("Karma: ", "")
karma = karma.strip()
else:
karma = "-1"
reputation.append(cleanString(karma))
# Getting here another good tag to find the post date, post content and users' signature
postarea = ipost.find('div', {"class": "post_content"})
dt = postarea.find('span', {"class": "post_date"}).text
# dt = dt.strip().split()
dt = dt.strip()
day=date.today()
if "Yesterday" in dt:
yesterday = day - timedelta(days=1)
yesterday = yesterday.strftime('%m-%d-%Y')
stime = dt.replace('Yesterday,','').strip()
date_time_obj = yesterday+ ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
elif "hours ago" in dt:
day = day.strftime('%m-%d-%Y')
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
else:
date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
stime = date_time_obj.strftime('%b %d, %Y')
sdate = date_time_obj.strftime('%I:%M %p')
addDate.append(date_time_obj)
# Finding the post
inner = postarea.find('div', {"class": "post_body scaleimages"})
inner = inner.text.strip()
post.append(cleanString(inner))
# Finding the user's signature
# signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
signature = ipost.find('div', {"class": "signature scaleimages"})
if signature != None:
signature = signature.text.strip()
# print(signature)
else:
signature = "-1"
sign.append(cleanString(signature))
# As no information about user's feedback was found, just assign "-1" to the variable
thread: ResultSet[Tag] = soup.find("div", {"class": "block-body js-replyNewMessageContainer"}).find_all("article", {"data-author": True})
for ipost in thread:
username = ipost.find("h4", {"class": "message-name"}).text
user.append(cleanString(username.strip()))
date_posted = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime")
datetime_obj = datetime.strptime(date_posted, "%Y-%m-%dT%H:%M:%S%z")
addDate.append(datetime_obj)
feedback.append("-1")
'''
except:
if soup.find('td', {"class": "trow1"}).text == " You do not have permission to access this page. ":
user.append("-1")
status.append(-1)
interest.append(-1)
reputation.append(-1)
addDate.append(-1)
post.append("NO ACCESS TO THIS PAGE!")
sign.append(-1)
feedback.append(-1)
'''
user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text
status.append(cleanString(user_status.strip()))
user_lvl = ipost.find("div", {"class": "afAwardLevel"}).text
reputation.append(cleanString(user_lvl.strip()))
sign.append("-1")
user_post = ipost.find("article", {"class": "message-body js-selectToQuote"}).text
post.append(cleanString(user_post.strip()))
interest.append("-1")
# Populate the final variable (this should be a list with all fields scraped)
@ -178,7 +69,7 @@ def cryptBB_description_parser(soup):
# This is the method to parse the Listing Pages (one page with many posts)
def cryptBB_listing_parser(soup):
def procrax_listing_parser(soup: Tag):
board = "-1" # board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
@ -193,59 +84,47 @@ def cryptBB_listing_parser(soup):
# Listing and Description pages)
# Finding the board (should be just one)
board = soup.find('span', {"class": "active"}).text
board = cleanString(board.strip())
# Finding the repeated tag that corresponds to the listing of topics
itopics = soup.find_all('tr',{"class": "inline_row"})
for itopic in itopics:
# For each topic found, the structure to get the rest of the information can be of two types. Testing all of them
# to don't miss any topic
# Adding the topic to the topic list
try:
topics = itopic.find('span', {"class": "subject_old"}).find('a').text
except:
topics = itopic.find('span', {"class": "subject_new"}).find('a').text
topics = re.sub("\[\w*\]", '', topics)
topic.append(cleanString(topics))
# Counting how many topics we have found so far
nm = len(topic)
# Adding the url to the list of urls
try:
link = itopic.find('span', {"class": "subject_old"}).find('a').get('href')
except:
link = itopic.find('span',{"class": "subject_new"}).find('a').get('href')
link = cleanLink(link)
href.append(link)
# Finding the author of the topic
ps = itopic.find('div', {"class":"author smalltext"}).find('a').text
user = ps.strip()
author.append(cleanString(user))
# Finding the number of replies
columns = itopic.findChildren('td',recursive=False)
replies = columns[3].text
posts.append(cleanString(replies))
# Finding the number of Views
tview = columns[4].text
views.append(cleanString(tview))
# If no information about when the topic was added, just assign "-1" to the variable
addDate.append("-1")
return organizeTopics("CryptBB", nm, topic, board, author, views, posts, href, addDate)
li = soup.find("h1", {"class": "p-title-value"})
board = cleanString(li.text.strip())
threads_list: ResultSet[Tag] = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True})
nm = len(threads_list)
for thread in threads_list:
thread_title = thread.find("div", {"class": "structItem-title"}).text
topic.append(cleanString(thread_title.strip()))
thread_author = thread.get("data-author")
author.append(cleanString(thread_author))
thread_views = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find('dd').text
views.append(cleanString(thread_views.strip()))
thread_replies = thread.find("dl", {"class": "pairs pairs--justified"}).find('dd').text
# All threads contain one topic post and reply posts
thread_total_posts = str(1 + int(thread_replies))
posts.append(thread_total_posts)
thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime")
datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z")
addDate.append(datetime_obj)
thread_link = thread.find("div", {"class": "structItem-title"}).find('a').get('href')
href.append(thread_link)
return organizeTopics(
forum="Procrax",
nm=nm,
board=board,
author=author,
topic=topic,
views=views,
posts=posts,
addDate=addDate,
href=href
)
def procrax_links_parser(soup):


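Both new Procrax parsers read XenForo's <time datetime="..."> attribute and hand it to datetime.strptime with the "%Y-%m-%dT%H:%M:%S%z" format shown above. A quick self-contained check of that format (the sample timestamp is made up; on Python 3.7+ the %z directive accepts offsets with or without a colon):

    from datetime import datetime

    sample = "2023-07-17T15:14:02-0700"           # hypothetical value of a datetime="..." attribute
    parsed = datetime.strptime(sample, "%Y-%m-%dT%H:%M:%S%z")
    print(parsed.isoformat())                     # 2023-07-17T15:14:02-07:00
    print(parsed.tzinfo)                          # UTC-07:00
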
BIN
MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-310.pyc View File


BIN
MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc View File


+ 27
- 37
MarketPlaces/AnonymousMarketplace/crawler_selenium.py View File

@ -32,19 +32,19 @@ baseURL = 'http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
opentor()
mktName = getMKTName()
# driver = getAccess()
driver = getAccess()
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
new_parse(mktName, baseURL, False)
# new_parse(mktName, baseURL, False)
# Opens Tor Browser
@ -188,9 +188,9 @@ def getInterestedLinks():
# carding
links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/carding/')
# # hacked paypal
# links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacked-paypal-accounts/')
links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacked-paypal-accounts/')
# # hacking services
# links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacking-services/')
links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacking-services/')
return links
@ -202,24 +202,23 @@ def crawlForum(driver):
print("Crawling the AnonymousMarketplace market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@ -231,23 +230,17 @@ def crawlForum(driver):
driver.back()
# comment out
# break
break
# comment out
# if count == 20:
# count = 0
# break
if count == 1:
break
#left in in case site changes
try:
link = ""
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -257,9 +250,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling AnonymousMarketplace forum done sucessfully. Press ENTER to continue\n")
@ -267,7 +257,7 @@ def crawlForum(driver):
#@param: url of any url crawled
#return: true if is a description page, false if not
def isDescriptionLink(url):
if 'product/' in url:
if '/product/' in url:
return True
return False
@ -276,7 +266,7 @@ def isDescriptionLink(url):
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
if 'product-' in url:
if 'category' in url:
return True
return False

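The two predicates above are plain substring tests, so a quick sanity check shows the tightened rules no longer classify a page as both kinds; the listing URL is taken from getInterestedLinks above, while the description URL is a hypothetical example of the market's /product/ pages.

    def isDescriptionLink(url):
        return '/product/' in url

    def isListingLink(url):
        return 'category' in url

    listing = 'http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/carding/'
    description = 'http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product/example-item/'   # hypothetical

    print(isDescriptionLink(listing), isListingLink(listing))            # False True
    print(isDescriptionLink(description), isListingLink(description))    # True False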

+ 40
- 15
MarketPlaces/AnonymousMarketplace/parser.py View File

@ -43,14 +43,14 @@ def anonymousMarketplace_description_parser(soup: Tag):
product_ratings: Tag = soup.find("div", {"class": "star-rating"})
product_reviews = product_ratings.find("strong", {"class": "rating"}).text
product_reviews = product_ratings.find("div", {"class": "woocommerce-product-rating"}).find("strong", {"class": "rating"}).text
reviews = cleanString(product_reviews.strip())
product_star_rating = product_ratings.find("span", {"class": "rating"}).text
rating_item = cleanString(product_star_rating.strip())
product_price = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text.replace("$", "")
USD = cleanString(product_price.strip())
product_price = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text
USD = cleanString(product_price.replace("$", "").strip())
# Populating the final variable (this should be a list with all fields scraped)
@ -88,26 +88,29 @@ def anonymousMarketplace_listing_parser(soup: Tag):
href = [] # 20 Product_Links
product_list: ResultSet[Tag] = soup.find("ul", {"class": "product_list_widget"}).find_all("li")
product_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-4"}).find_all("li")
for item in product_list:
item_href = item.find("a").get("href")
href.append(item_href)
item_name = item.find("span", {"class": "product-title"}).text
name.append((item_name.strip()))
item_name = item.find("h2", {"class": "woocommerce-loop-product__title"}).text
name.append(cleanString(item_name.strip()))
item_rating = item.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text
rating_item.append(cleanNumbers(item_rating.strip()))
rating_item.append(cleanString(item_rating.strip()))
item_price = item.find("span", {"class": "woocommerce-Price-amount amount"}).text
if not item_price:
try:
item_price = item.find("span", {"class": "woocommerce-Price-amount amount"}).text
item_price = item_price.replace("$", "").strip()
USD.append(item_price)
except AttributeError:
USD.append("-1")
else:
USD.append(cleanNumbers(item_price.replace("$", "").strip()))
vendor.append("-1")
vendor.append("Anonymous")
rating_vendor.append("-1")
success.append("-1")
CVE.append("-1")
@ -126,9 +129,30 @@ def anonymousMarketplace_listing_parser(soup: Tag):
nm += 1
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
return organizeProducts(
marketplace=mktName,
nm=nm,
vendor=vendor,
rating_vendor=rating_vendor,
success_vendor=success,
nombre=name,
CVE=CVE,
MS=MS,
category=category,
describe=describe,
views=views,
reviews=reviews,
rating_item=rating_item,
addDate=addDate,
BTC=BTC,
USD=USD,
EURO=EURO,
sold=sold,
qLeft=qLeft,
shipFrom=shipFrom,
shipTo=shipTo,
href=href
)
@ -147,6 +171,7 @@ def anonymous_links_parser(soup):
for a in listing:
bae = a.find('a', {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}, href=True)
link = bae['href']
href.append(link)

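A minimal stand-alone sketch (with made-up WooCommerce-style markup) of the defensive price lookup introduced above: when the price span is missing, find() returns None, the .text access raises AttributeError, and the parser records "-1" instead of crashing.

    from bs4 import BeautifulSoup

    def extract_usd(item_html):
        item = BeautifulSoup(item_html, "html.parser")
        try:
            price = item.find("span", {"class": "woocommerce-Price-amount amount"}).text
            return price.replace("$", "").strip()
        except AttributeError:                    # no price span on this listing item
            return "-1"

    print(extract_usd('<li><span class="woocommerce-Price-amount amount">$25.00</span></li>'))   # 25.00
    print(extract_usd('<li>no price shown</li>'))                                                # -1
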
+ 11
- 22
MarketPlaces/Apocalypse/crawler_selenium.py View File

@ -216,24 +216,23 @@ def crawlForum(driver):
print("Crawling the Apocalypse market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@ -245,11 +244,10 @@ def crawlForum(driver):
driver.back()
# comment out
# break
break
# comment out
if count == 20:
count = 0
if count == 1:
break
try:
@ -257,12 +255,6 @@ def crawlForum(driver):
'/html/body/div[1]/div/div[2]/nav/ul/li[5]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -272,9 +264,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling Apocalypse forum done sucessfully. Press ENTER to continue\n")


+ 11
- 22
MarketPlaces/BlackPyramid/crawler_selenium.py View File

@ -220,26 +220,25 @@ def crawlForum(driver):
print("Crawling the BlackPyramid market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
clicker = driver.find_element(by=By.XPATH, value='/html/body/div[2]/form/nav/nav/ul/li[2]/div/a')
clicker.click() # open tab with url
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
clicker = driver.find_element(by=By.XPATH, value='/html/body/div[2]/form/nav/nav/ul/li[2]/div/a')
clicker.click() # open tab with url
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@ -255,7 +254,6 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
@ -263,12 +261,6 @@ def crawlForum(driver):
'/html/body/center/div[4]/div/div[3]/div[23]/div[2]/input[1]')
if clicker == "":
raise NoSuchElementException
try:
clicker.click()
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -278,9 +270,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling BlackPyramid forum done sucessfully. Press ENTER to continue\n")


+ 9
- 20
MarketPlaces/CityMarket/crawler_selenium.py View File

@ -221,24 +221,23 @@ def crawlForum(driver):
print("Crawling the CityMarket market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@ -254,7 +253,6 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
@ -262,12 +260,6 @@ def crawlForum(driver):
'/html/body/div[1]/div/div[2]/nav/ul/li[5]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -277,9 +269,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling CityMarket forum done sucessfully. Press ENTER to continue\n")


+ 9
- 20
MarketPlaces/CypherMarketplace/crawler_selenium.py View File

@ -214,24 +214,23 @@ def crawlForum(driver):
print("Crawling the CypherMarketplace market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@ -247,7 +246,6 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
@ -256,12 +254,6 @@ def crawlForum(driver):
link = temp.find_element(by=By.TAG_NAME, value='page-link').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -271,9 +263,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling CypherMarketplace forum done sucessfully. Press ENTER to continue\n")


BIN
MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc View File


+ 1
- 0
MarketPlaces/DB_Connection/db_connection.py View File

@ -139,6 +139,7 @@ def create_vendor(cur, row, marketId):
def create_items(cur, row, marketId, vendorId):
print(row)
sql = "Insert into items (market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \
"views_item, reviews_item, rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \


+ 21
- 23
MarketPlaces/DarkFox/crawler_selenium.py View File

@ -239,46 +239,47 @@ def crawlForum(driver):
print("Crawling the DarkFox market")
linksToCrawl = getInterestedLinks()
# visited = set(linksToCrawl)
# initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
if count >= 500:
break
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = str(item)
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
count += 1
# comment out
break
# comment out
if count == 0:
break
try:
link = driver.find_element(by=By.XPATH, value=
'/html/body/main/div/div[2]/div/div[2]/div/div/div/nav/a[2]').get_attribute('href')
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
@ -286,9 +287,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling BestCardingWorld forum done sucessfully. Press ENTER to continue\n")

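The loop above now routes every scraped item link through urlparse.urljoin before driver.get, so relative hrefs resolve against the market's base URL while absolute ones pass through unchanged. A small illustration with made-up paths:

    from urllib.parse import urljoin

    baseURL = 'http://example.onion/'                      # stand-in for the market's base URL
    print(urljoin(baseURL, 'product/abc123'))              # http://example.onion/product/abc123
    print(urljoin(baseURL, 'http://other.onion/item/9'))   # http://other.onion/item/9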

+ 9
- 21
MarketPlaces/DarkMatter/crawler_selenium.py View File

@ -205,26 +205,24 @@ def crawlForum(driver):
print("Crawling the DarkMatter market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
list = productPages(html)
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
@ -239,7 +237,6 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
@ -248,12 +245,6 @@ def crawlForum(driver):
link = a.get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -263,9 +254,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling DarkMatter forum done sucessfully. Press ENTER to continue\n")


+ 11
- 22
MarketPlaces/DarkTor/crawler_selenium.py View File

@ -201,24 +201,23 @@ def crawlForum(driver):
print("Crawling the DarkTor market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@ -230,23 +229,16 @@ def crawlForum(driver):
driver.back()
# comment out
# break
break
# comment out
if count == 30:
count = 0
if count == 1:
break
try:
link = ""
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -256,9 +248,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling DarkTor forum done sucessfully. Press ENTER to continue\n")


+ 9
- 20
MarketPlaces/DigitalThriftShop/crawler_selenium.py View File

@ -204,24 +204,23 @@ def crawlForum(driver):
print("Crawling the DigitalThriftShop market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@ -237,7 +236,6 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
@ -245,12 +243,6 @@ def crawlForum(driver):
'/html/body/div[1]/div[2]/div/div[2]/main/div[1]/nav/ul/li[5]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -260,9 +252,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling DigitalThriftShop forum done sucessfully. Press ENTER to continue\n")


BIN
MarketPlaces/Initialization/__pycache__/__init__.cpython-310.pyc View File


BIN
MarketPlaces/Initialization/__pycache__/__init__.cpython-311.pyc View File


BIN
MarketPlaces/Initialization/__pycache__/markets_mining.cpython-310.pyc View File


BIN
MarketPlaces/Initialization/__pycache__/markets_mining.cpython-311.pyc View File


BIN
MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-310.pyc View File


BIN
MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc View File


+ 15
- 7
MarketPlaces/Initialization/prepare_parser.py View File

@ -11,6 +11,8 @@ from MarketPlaces.Apocalypse.parser import *
from MarketPlaces.ThiefWorld.parser import *
from MarketPlaces.AnonymousMarketplace.parser import *
from MarketPlaces.ViceCity.parser import *
from MarketPlaces.TorBay.parser import *
from MarketPlaces.M00nkeyMarket.parser import *
from MarketPlaces.Classifier.classify_product import predict
@ -151,7 +153,11 @@ def new_parse(marketPlace, url, createLog):
rmm = anonymousMarketplace_description_parser(soup)
elif marketPlace == "ViceCity":
rmm = vicecity_description_parser(soup)
elif marketPlace == "TorBay":
rmm = torbay_description_parser(soup)
elif marketPlace == "M00nkeyMarket":
rmm = m00nkey_description_parser(soup)
# key = u"Pr:" + rmm[0].upper()[:desc_lim1] + u" Vendor:" + rmm[13].upper()[:desc_lim2]
key = u"Url:" + os.path.basename(line2).replace(".html", "")
@ -190,7 +196,7 @@ def new_parse(marketPlace, url, createLog):
readError = True
if not readError:
print("Hello!")
parseError = False
try:
@ -206,11 +212,14 @@ def new_parse(marketPlace, url, createLog):
rw = anonymousMarketplace_listing_parser(soup)
elif marketPlace == "ViceCity":
rw = vicecity_listing_parser(soup)
elif marketPlace == "TorBay":
rw = torbay_listing_parser(soup)
elif marketPlace == "M00nkeyMarket":
rw = m00nkey_listing_parser(soup)
else:
parseError = True
except Exception as e:
raise e
except:
nError += 1
print("There was a problem to parse the file " + line1 + " in the listing section!")
@ -229,7 +238,6 @@ def new_parse(marketPlace, url, createLog):
for rec in rw:
rec = rec.split(',')
print(rec)
# if len(detPage) > 0: #It was created here just because Zeroday Market does not have Description Pages
# key = rec[23]
@ -237,7 +245,6 @@ def new_parse(marketPlace, url, createLog):
# key = u"Pr:" + rec[1].upper()[:list_lim1] + u" Vendor:" + rec[18].upper()[:list_lim2]
key = u"Url:" + cleanLink(rec[20])
print(key)
# if the associated description page is parsed
if key in detPage:
@ -255,7 +262,8 @@ def new_parse(marketPlace, url, createLog):
try:
persist_data(url, tuple(rec), cur)
con.commit()
except:
except Exception as e:
raise e
trace = traceback.format_exc()

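For context on the key lines kept in this hunk: the description pass and the listing pass derive the same "Url:<basename>" key from a product's URL, and a listing record only picks up description fields when that key is present in detPage. A minimal sketch with made-up data (the real records are positional comma-split rows, not dicts):

    import os

    def make_key(url):
        return "Url:" + os.path.basename(url).replace(".html", "")

    # parsed description pages, keyed by file basename
    detPage = {make_key("example-item.html"): {"describe": "example description"}}

    listing_rec = {"href": "example-item.html", "name": "example item"}
    key = make_key(listing_rec["href"])
    if key in detPage:                            # only merge when the description page was parsed
        listing_rec.update(detPage[key])
    print(listing_rec)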

+ 9
- 20
MarketPlaces/LionMarketplace/crawler_selenium.py View File

@ -212,24 +212,23 @@ def crawlForum(driver):
print("Crawling the LionMarketplace market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@ -245,7 +244,6 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
@ -253,12 +251,6 @@ def crawlForum(driver):
'/html/body/div[2]/div[2]/div/div[2]/nav/ul/li[5]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -268,9 +260,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling LionMarketplace forum done sucessfully. Press ENTER to continue\n")


BIN
MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-310.pyc View File


BIN
MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc View File


+ 24
- 38
MarketPlaces/M00nkeyMarket/crawler_selenium.py View File

@ -27,16 +27,15 @@ from MarketPlaces.M00nkeyMarket.parser import m00nkey_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/'
BASE_URL = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/'
MARKET_NAME = 'M00nkeyMarket'
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
opentor()
# mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
@ -45,7 +44,7 @@ def startCrawling():
print(driver.current_url, e)
closetor(driver)
# new_parse(forumName, baseURL, False)
new_parse(MARKET_NAME, BASE_URL, False)
# Opens Tor Browser
@ -64,16 +63,16 @@ def opentor():
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'M00nkeyMarket'
return name
# def getMKTName():
# name = 'M00nkeyMarket'
# return name
# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
url = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/'
return url
# def getFixedURL():
# url = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/'
# return url
# Closes Tor Browser
@ -127,10 +126,9 @@ def createFFDriver():
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
driver.get(BASE_URL)
return driver
except:
driver.close()
@ -175,7 +173,7 @@ def savePage(page, url):
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + MARKET_NAME + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
@ -217,27 +215,26 @@ def crawlForum(driver):
print("Crawling the M00nkeyMarket market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
itemURL = urlparse.urljoin(BASE_URL, str(item))
try:
driver.get(itemURL)
except:
@ -249,21 +246,13 @@ def crawlForum(driver):
break
# comment out
# if count == 1:
# count = 0
# break
if count == 1:
break
try:
link = driver.find_element(by=By.LINK_TEXT, value='Next ›').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -273,9 +262,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling M00nkeyMarket done sucessfully. Press ENTER to continue\n")


+ 198
- 239
MarketPlaces/M00nkeyMarket/parser.py View File

@ -1,4 +1,4 @@
__author__ = 'DarkWeb'
__author__ = 'Helium'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
@ -11,133 +11,132 @@ from bs4 import BeautifulSoup
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of description page
#return: 'row' that contains a variety of lists that each hold info on the description page
def darkfox_description_parser(soup):
def m00nkey_description_parser(soup):
# Fields to be parsed
name = "-1" # 0 Product_Name
describe = "-1" # 1 Product_Description
lastSeen = "-1" # 2 Product_LastViewDate
rules = "-1" # 3 NOT USED ...
CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 5 Product_MS_Classification (Microsoft Security)
review = "-1" # 6 Product_Number_Of_Reviews
category = "-1" # 7 Product_Category
shipFrom = "-1" # 8 Product_ShippedFrom
shipTo = "-1" # 9 Product_ShippedTo
left = "-1" # 10 Product_QuantityLeft
escrow = "-1" # 11 Vendor_Warranty
terms = "-1" # 12 Vendor_TermsAndConditions
vendor = "-1" # 13 Vendor_Name
sold = "-1" # 14 Product_QuantitySold
addDate = "-1" # 15 Product_AddedDate
available = "-1" # 16 NOT USED ...
endDate = "-1" # 17 NOT USED ...
BTC = "-1" # 18 Product_BTC_SellingPrice
USD = "-1" # 19 Product_USD_SellingPrice
rating = "-1" # 20 Vendor_Rating
success = "-1" # 21 Vendor_Successful_Transactions
EURO = "-1" # 22 Product_EURO_SellingPrice
# Finding Product Name
name = soup.find('h1').text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
# Finding Vendor
vendor = soup.find('h3').find('a').text.strip()
# Finding Vendor Rating
rating = soup.find('span', {'class': "tag is-dark"}).text.strip()
# Finding Successful Transactions
success = soup.find('h3').text
success = success.replace("Vendor: ", "")
success = success.replace(vendor, "")
success = success.replace("(", "")
success = success.replace(")", "")
success = success.strip()
bae = soup.find('div', {'class': "box"}).find_all('ul')
# Finding Prices
USD = bae[1].find('strong').text.strip()
li = bae[2].find_all('li')
# Finding Escrow
escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Category
category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Quantity Available
left = li[3].find('span', {'class': "tag is-dark"}).text.strip()
# Finding Number Sold
sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()
li = bae[3].find_all('li')
# Finding Shipment Information (Origin)
if "Ships from:" in li[-2].text:
shipFrom = li[-2].text
shipFrom = shipFrom.replace("Ships from: ", "")
# shipFrom = shipFrom.replace(",", "")
shipFrom = shipFrom.strip()
# Finding Shipment Information (Destination)
shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
shipTo = shipTo.replace("Ships to: ", "")
shipTo = shipTo.strip()
if "certain countries" in shipTo:
countries = ""
tags = li[-1].find_all('span', {'class': "tag"})
for tag in tags:
country = tag.text.strip()
countries += country + ", "
shipTo = countries.strip(", ")
# Finding the Product description
describe = soup.find('div', {'class': "pre-line"}).text
describe = describe.replace("\n", " ")
describe = describe.strip()
'''# Finding the Number of Product Reviews
tag = soup.findAll(text=re.compile('Reviews'))
for index in tag:
reviews = index
par = reviews.find('(')
if par >=0:
reviews = reviews.replace("Reviews (","")
reviews = reviews.replace(")","")
reviews = reviews.split(",")
review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
else :
review = "-1"'''
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
#vendor name
temp = soup.find('div', {'class': 'box rounded mb-0'}).find('a').text
vendor = (cleanString(temp.strip()))
#successful transaction
temp = soup.findAll('div', {'class','text-center text-truncate column-flex ml-1 mr-1'}) #card sidebar-menu mb-4 card sidebar-menu mb-4
temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'})
temp = temp2[1].text
success = (cleanString(temp.strip()))
#vendor rating 5
temp = soup.findAll('div', {'class', 'text-center text-truncate column-flex ml-1 mr-1'}) # card sidebar-menu mb-4 card sidebar-menu mb-4
temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'})
temp = temp2[5].text
rating_vendor = (cleanString(temp.strip()))
# product name
temp = soup.find('h3', {'class', 'h3 rounded card-title'}).find('span').text
name = (cleanString(temp.strip()))
# product description
describe = soup.find('div', {'class': "box rounded flex-fill"}).find('pre').text
if "\n" in describe:
describe = describe.replace("\n", " ")
describe = describe.replace("\r", " ")
describe = cleanString(describe.strip())
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much
# product category
try:
temp = soup.findAll('table', {'class', 'table table-hover'})
temp2 = temp[1].find('tr').findAll('td')
temp = temp2[1].text
category = cleanString(temp.strip())
except:
temp = soup.find('table', {'class', 'table table-hover'})
temp2 = temp.find('tbody').find('tr').findAll('td')
temp = temp2[1].text
category = cleanString(temp.strip())
# product number of view
try:
temp = soup.find('div', {'class', 'box rounded mb-0'})
temp2 = temp.findAll('i')
temp = temp2[2].text
views = cleanString((temp.strip()))
except:
print('Product number of view')
# views = "-1"
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
#BTC selling price box box-rounded mt-2
temp = soup.find('div', {'class', 'box box-rounded mt-2'})
temp2 = temp.findAll('i', {'class', 'float-right color-prices'})
temp = temp2[1].text
BTC = cleanString((temp.strip()))
# USD selling price
temp = soup.find('div', {'class', 'box box-rounded mt-2'})
temp2 = temp.findAll('center')
temp = temp2[1].find('i').text
if "$" in temp:
temp = temp.replace("$", "")
USD = cleanString((temp.strip()))
EURO = "-1" # 14 Product_EURO_SellingPrice
# product sold
temp = soup.find('div', {'class', 'box rounded mb-0'}) # card sidebar-menu mb-4 card sidebar-menu mb-4
temp2 = temp.find('i')
temp = temp2.text
sold = (cleanString(temp.strip()))
# sold = "-1"
# product quantatiy left ###ERRROR
try:
temp = soup.findAll('table', {'class', 'table table-hover'})
temp2 = temp[1].findAll('tr')
temp3 = temp2[1].findAll('td')
temp = temp3[1].text
left = cleanString(temp.strip())
except:
temp = soup.find('table', {'class', 'table table-hover'})
temp2 = temp.findAll('tr')
temp3 = temp2[1].findAll('td')
temp = temp3[1].text
left = cleanString(temp.strip())
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
# Populating the final variable (this should be a list with all fields scraped)
row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
sold, addDate, available, endDate, BTC, USD, rating, success, EURO)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo)
# Sending the results
return row
@ -147,131 +146,91 @@ def darkfox_description_parser(soup):
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def darkfox_listing_parser(soup):
def m00nkey_listing_parser(soup):
# Fields to be parsed
nm = 0 # Total_Products (Should be Integer)
mktName = "DarkFox" # 0 Marketplace_Name
name = [] # 1 Product_Name
CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 3 Product_MS_Classification (Microsoft Security)
category = [] # 4 Product_Category
describe = [] # 5 Product_Description
escrow = [] # 6 Vendor_Warranty
views = [] # 7 Product_Number_Of_Views
reviews = [] # 8 Product_Number_Of_Reviews
addDate = [] # 9 Product_AddDate
lastSeen = [] # 10 Product_LastViewDate
BTC = [] # 11 Product_BTC_SellingPrice
USD = [] # 12 Product_USD_SellingPrice
EURO = [] # 13 Product_EURO_SellingPrice
sold = [] # 14 Product_QuantitySold
qLeft =[] # 15 Product_QuantityLeft
shipFrom = [] # 16 Product_ShippedFrom
shipTo = [] # 17 Product_ShippedTo
vendor = [] # 18 Vendor
rating = [] # 19 Vendor_Rating
success = [] # 20 Vendor_Successful_Transactions
href = [] # 23 Product_Links (Urls)
listing = soup.findAll('div', {"class": "card"})
nm = 0 # *Total_Products (Should be Integer)
mktName = "M00nkeyMarket" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft = [] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
listing = soup.findAll('div', {"class": "card mt-1"})
# Populating the Number of Products
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# vendor
try:
temp = a.find('col-5 justify-content-between mx-auto').find('a').text
vendor.append(cleanString(temp.strip()))
except:
print('vendor')
#vendor rating
#successful transactions
try:
temp = a.find(class_='col-5 justify-content-between mx-auto').find('div').text
success.append(cleanString(temp.strip()))
except:
print('successful transactions')
# product name
try:
temp = a.find(class_='card-title rounded text-truncate').find('a').text
name.append(cleanString(temp.strip()))
except:
print('product name')
CVE.append('-1')
MS.append('-1')
rating_vendor.append("-1")
try:
temp = a.findAll(class_='btn btn-block btn-primary')
except:
print("Error in product category")
# Finding the Product
product = bae[1].find('p').text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.replace("...", "")
product = product.strip()
name.append(product)
bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
if len(bae) >= 5:
# Finding Prices
price = bae[0].text
ud = price.replace(" USD", " ")
# u = ud.replace("$","")
u = ud.replace(",", "")
u = u.strip()
USD.append(u)
# bc = (prc[1]).strip(' BTC')
# BTC.append(bc)
# Finding the Vendor
vendor_name = bae[1].find('a').text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
# Finding the Category
cat = bae[2].find('small').text
cat = cat.replace("Category: ", "")
cat = cat.replace(",", "")
cat = cat.strip()
category.append(cat)
# Finding Number Sold and Quantity Left
num = bae[3].text
num = num.replace("Sold: ", "")
num = num.strip()
sold.append(num)
quant = bae[4].find('small').text
quant = quant.replace("In stock: ", "")
quant = quant.strip()
qLeft.append(quant)
# Finding Successful Transactions
freq = bae[1].text
freq = freq.replace(vendor_name, "")
freq = re.sub(r'Vendor Level \d+', "", freq)
freq = freq.replace("(", "")
freq = freq.replace(")", "")
freq = freq.strip()
success.append(freq)
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if not cve:
cveValue="-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue=cee
CVE.append(cveValue)
ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
if not ms:
MSValue="-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue=me
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
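For context, a minimal sketch of how the renamed m00nkey parsers above are typically driven over a page saved by the crawler. The import path, function name for reading the file, and the HTML file location are assumptions for illustration, not taken from this commit.

# Minimal usage sketch (assumed import path and file location; not part of this commit).
from bs4 import BeautifulSoup
from MarketPlaces.M00nkeyMarket.parser import m00nkey_listing_parser  # assumed path

def parse_saved_listing(html_path):
    # Build a soup from a page previously written out by savePage(), then hand it to the
    # listing parser, which returns the product rows assembled by organizeProducts().
    with open(html_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    return m00nkey_listing_parser(soup)

# rows = parse_saved_listing('path/to/saved_listing.html')  # hypothetical file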


+ 12
- 23
MarketPlaces/MikesGrandStore/crawler_selenium.py View File

@ -227,24 +227,23 @@ def crawlForum(driver):
print("Crawling the MikesGrandStore market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@ -256,24 +255,17 @@ def crawlForum(driver):
driver.back()
# comment out
# break
break
# comment out
# if count == 1:
# count = 0
# break
if count == 1:
break
try:
link = driver.find_element(by=By.XPATH, value=
'/html/body/div[1]/main/div/div[1]/div/div[3]/nav/ul/li[6]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -283,9 +275,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling MikesGrandStore market done successfully. Press ENTER to continue\n")
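The ThiefWorld, Tor2door, TorBay, and TorMarket crawlers below receive the same restructuring as this one: the page fetch moves inside the while has_next_page loop, the duplicated driver.get/savePage after the next-link lookup is dropped, and the timing code is removed. A condensed sketch of the resulting loop follows; savePage, productPages, and baseURL are the crawler modules' own helpers and constants, while next_page_xpath and max_pages are placeholder parameters introduced here for illustration only.

# Condensed sketch of the reorganized crawl loop shared by these market crawlers.
import urllib.parse as urlparse
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

def crawl_category(driver, link, next_page_xpath, max_pages=1):
    count = 0
    has_next_page = True
    while has_next_page:
        try:
            driver.get(link)                      # (re)load the current listing page
        except Exception:
            driver.refresh()
        html = driver.page_source
        savePage(html, link)                      # persist the listing page
        for item in productPages(html):           # visit and save every product page
            itemURL = urlparse.urljoin(baseURL, str(item))
            try:
                driver.get(itemURL)
            except Exception:
                driver.refresh()
            savePage(driver.page_source, item)
            driver.back()
        if count == max_pages:                    # debug stop, mirrors "if count == 1: break"
            break
        try:
            link = driver.find_element(by=By.XPATH, value=next_page_xpath).get_attribute('href')
            if link == "":
                raise NoSuchElementException
            count += 1
        except NoSuchElementException:
            has_next_page = False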


BIN
MarketPlaces/ThiefWorld/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
MarketPlaces/ThiefWorld/__pycache__/parser.cpython-311.pyc View File


+ 10
- 21
MarketPlaces/ThiefWorld/crawler_selenium.py View File

@ -211,24 +211,23 @@ def crawlForum(driver):
print("Crawling the ThiefWorld market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@ -243,8 +242,7 @@ def crawlForum(driver):
break
# comment out
if count == 20:
count = 0
if count == 1:
break
try:
@ -252,12 +250,6 @@ def crawlForum(driver):
'/html/body/div/div[1]/div/div/div[2]/div[3]/div/ul/li[13]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -267,9 +259,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling ThiefWorld market done successfully. Press ENTER to continue\n")


BIN
MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
MarketPlaces/Tor2door/__pycache__/parser.cpython-310.pyc View File


BIN
MarketPlaces/Tor2door/__pycache__/parser.cpython-311.pyc View File


+ 10
- 22
MarketPlaces/Tor2door/crawler_selenium.py View File

@ -228,25 +228,23 @@ def crawlForum(driver):
print("Crawling the Tor2door market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@ -256,12 +254,12 @@ def crawlForum(driver):
driver.refresh()
savePage(driver.page_source, item)
driver.back()
# comment out
break
# comment out
if count == 1:
count = 0
break
try:
@ -269,15 +267,8 @@ def crawlForum(driver):
'/html/body/main/div/div/div[2]/div[11]/div/nav')
a = nav.find_element(by=By.LINK_TEXT, value="")
link = a.get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -287,9 +278,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling Tor2door market done successfully. Press ENTER to continue\n")


BIN
MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
MarketPlaces/TorBay/__pycache__/parser.cpython-310.pyc View File


BIN
MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc View File


+ 27
- 38
MarketPlaces/TorBay/crawler_selenium.py View File

@ -32,19 +32,19 @@ baseURL = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
opentor()
# opentor()
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
# new_parse(forumName, baseURL, False)
# driver = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
#
new_parse(mktName, baseURL, False)
# Opens Tor Browser
@ -198,24 +198,23 @@ def crawlForum(driver):
print("Crawling the TorBay Market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@ -226,25 +225,18 @@ def crawlForum(driver):
savePage(driver.page_source, item)
driver.back()
# #comment out
# break
#
# # # comment out
# if count == 1:
# count = 0
# break
# comment out
break
# comment out
if count == 1:
break
try:
link = driver.find_element(by=By.XPATH, value=
'/html/body/section/div/div/div[2]/div/div[2]/ul/li[3]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -254,9 +246,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling TorBay market done successfully. Press ENTER to continue\n")


+ 80
- 162
MarketPlaces/TorBay/parser.py View File

@ -35,88 +35,51 @@ def torbay_description_parser(soup):
shipTo = "-1" # 18 Product_ShippedTo
# Finding Product Name
name = soup.find('div', {'class': 'product-information'}).find('h1').text.strip()
# Finding Vendor
vendor = soup.find('div', {"class": "profile-info"}).find('a').text.strip()
# Finding Vendor Rating
rating_vendor.append(-1)
# Finding Successful Transactions
success.append(-1)
bae = soup.find('div', {'class': "box"}).find_all('ul')
try:
product_name = soup.find('div', {'class': 'product-information'}).find('h1').text
name = cleanString(product_name.strip())
except:
try:
product_name = soup.find('div', {'class': 'profile-info'}).find('h2').text
name = cleanString(product_name.strip())
except:
# print(e)
print("product name")
# Finding Vendor FIx
try:
vendor_name = soup.find('div', {"class": "profile-info"}).find('h2').text
vendor = cleanString(vendor_name.strip())
except:
print("description vendor name failed\n")
# Finding Prices
USD = soup.find('div', {'class': "total-price"}).find('span').text.strip()
try:
USD = soup.find('div', {'class': "total-price"}).find('span').text.strip()
except:
print("description price failed\n")
# Finding the Product Category
category = soup.find('div', {'class': "profile-info"}).find('p').find('a').text.strip()
# Finding the Product Quantity Available
left.append(-1)
# Finding Number Sold
sold.append(-1)
li = bae[3].find_all('li')
# Finding Shipment Information (Origin)
if "Ships from:" in li[-2].text:
shipFrom = li[-2].text
shipFrom = shipFrom.replace("Ships from: ", "")
# shipFrom = shipFrom.replace(",", "")
shipFrom = shipFrom.strip()
# Finding Shipment Information (Destination)
shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
shipTo = shipTo.replace("Ships to: ", "")
shipTo = shipTo.strip()
if "certain countries" in shipTo:
countries = ""
tags = li[-1].find_all('span', {'class': "tag"})
for tag in tags:
country = tag.text.strip()
countries += country + ", "
shipTo = countries.strip(", ")
try:
cat = soup.find('div', {'class': "profile-info"}).find('p').text
category = cleanString(cat.strip())
except:
print("description product category failed")
# Finding the Product description
describe = soup.find('div', {'class': "pre-line"}).text
describe = describe.replace("\n", " ")
describe = describe.strip()
'''# Finding the Number of Product Reviews
tag = soup.findAll(text=re.compile('Reviews'))
for index in tag:
reviews = index
par = reviews.find('(')
if par >=0:
reviews = reviews.replace("Reviews (","")
reviews = reviews.replace(")","")
reviews = reviews.split(",")
review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
else :
review = "-1"'''
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
try:
describe = soup.find('div', {'class': "info"}).find('p').text
if "\n" in describe:
describe = describe.replace("\n", " ")
describe = describe.replace("\r", " ")
describe = cleanString(describe.strip())
except:
# print("product desc")
try:
describe = soup.find('div', {'class': 'info'}).text
describe = cleanString(describe.strip())
except:
print("Product description")
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
@ -162,93 +125,48 @@ def torbay_listing_parser(soup):
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# Finding the Product
product = bae[1].find('p').text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.replace("...", "")
product = product.strip()
name.append(product)
bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
if len(bae) >= 5:
# Finding Prices
price = bae[0].text
ud = price.replace(" USD", " ")
# u = ud.replace("$","")
u = ud.replace(",", "")
u = u.strip()
USD.append(u)
# bc = (prc[1]).strip(' BTC')
# BTC.append(bc)
# Finding the Vendor
vendor_name = bae[1].find('a').text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
# Finding the Category
cat = bae[2].find('small').text
cat = cat.replace("Category: ", "")
cat = cat.replace(",", "")
cat = cat.strip()
category.append(cat)
# Finding Number Sold and Quantity Left
num = bae[3].text
num = num.replace("Sold: ", "")
num = num.strip()
sold.append(num)
quant = bae[4].find('small').text
quant = quant.replace("In stock: ", "")
quant = quant.strip()
qLeft.append(quant)
# Finding Successful Transactions
freq = bae[1].text
freq = freq.replace(vendor_name, "")
freq = re.sub(r'Vendor Level \d+', "", freq)
freq = freq.replace("(", "")
freq = freq.replace(")", "")
freq = freq.strip()
success.append(freq)
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if not cve:
cveValue="-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue=cee
CVE.append(cveValue)
ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
if not ms:
MSValue="-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue=me
MS.append(MSValue)
try:
product_name = a.find('p', {'class': 'name'}).text
name.append(cleanString(product_name.strip()))
except:
print("product name")
try:
prod = a.find('p', {'class': 'price'}).text # price
USD.append(cleanString(prod.strip()))
except:
print("USD")
try:
ven = a.find('div', {'class': 'pc-footer'}).find('div').find('a').text # pc-footer
vendor.append(cleanString(ven.strip()))
# print(ven)
except:
print("vendor")
try:
h = a.find('p', {'class': 'name'}).find('a').get('href')
href.append(h)
except:
print("in href")
CVE.append("-1")
MS.append("-1")
rating_vendor.append("-1")
success.append("-1")
describe.append("-1")
views.append("-1")
reviews.append("-1")
rating_item.append("-1")
addDate.append("-1")
BTC.append("-1")
EURO.append("-1")
sold.append("-1")
qLeft.append("-1")
shipFrom.append("-1")
shipTo.append("-1")
category.append("Hacking")
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
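The rewritten TorBay listing parser follows the convention visible above: each field is pulled in its own try/except, and anything missing falls back to the "-1" sentinel so every per-product list stays the same length for organizeProducts. A small helper capturing that convention is sketched below, purely as an illustration; the code in this commit inlines the try/except blocks instead, and cleanString comes from the project's parsing utilities.

# Illustrative helper only; the parsers in this commit inline the equivalent try/except.
def text_or_default(tag, name=None, attrs=None, default="-1"):
    # Return cleaned text for a bs4 find() hit, or the "-1" sentinel expected by
    # organizeProducts(), so the per-product lists always stay aligned with nm.
    try:
        return cleanString(tag.find(name, attrs or {}).text.strip())
    except Exception:
        return default

# e.g. name.append(text_or_default(a, 'p', {'class': 'name'}))
#      USD.append(text_or_default(a, 'p', {'class': 'price'}))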


+ 9
- 20
MarketPlaces/TorMarket/crawler_selenium.py View File

@ -201,24 +201,23 @@ def crawlForum(driver):
print("Crawling the TorMarket market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@ -234,7 +233,6 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
@ -242,12 +240,6 @@ def crawlForum(driver):
'/html/body/div[2]/div/div/div[1]/main/nav/ul/li[5]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@ -257,9 +249,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling TorMarket market done successfully. Press ENTER to continue\n")


+ 7
- 6
setup.ini View File

@ -1,14 +1,15 @@
[TOR]
firefox_binary_path = C:\Users\John Wick\Desktop\Tor Browser\Browser\firefox.exe
firefox_profile_path = C:\Users\John Wick\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
geckodriver_path = C:\Users\John Wick\PycharmProjects\dw_pipeline_test\selenium\geckodriver.exe
firefox_binary_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe
firefox_profile_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\TorBrowser\\Data\\Browser\\profile.default
geckodriver_path = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test\\selenium\\geckodriver.exe
[Project]
project_directory = C:\Users\John Wick\PycharmProjects\dw_pipeline_test
shared_folder = Z:\\VBoxSvr\\VM_Files_ (shared)
project_directory = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test
shared_folder = \\VBoxSvr\\Shared
[PostgreSQL]
ip = localhost
username = postgres
password = postgres
password = password
database = darkweb_markets_forums
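The rewritten setup.ini keeps the same three sections, so it can be read with the standard library. A hedged sketch, assuming the crawlers load these keys via configparser; the key names come from the file above, but how the project actually wraps this may differ.

# Hedged sketch: reading the [TOR], [Project], and [PostgreSQL] sections of setup.ini.
from configparser import ConfigParser

config = ConfigParser()
config.read('setup.ini')

firefox_binary_path = config['TOR']['firefox_binary_path']
firefox_profile_path = config['TOR']['firefox_profile_path']
geckodriver_path = config['TOR']['geckodriver_path']

project_directory = config['Project']['project_directory']
shared_folder = config['Project']['shared_folder']

db_settings = dict(config['PostgreSQL'])   # ip, username, password, database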
