diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml
index c554b68..08a5719 100644
--- a/.idea/DW_Pipeline_Test.iml
+++ b/.idea/DW_Pipeline_Test.iml
@@ -10,10 +10,23 @@
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 91bfa41..dc9ea49 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,4 +1,4 @@
-
+
\ No newline at end of file
diff --git a/Forums/AbyssForum/crawler_selenium.py b/Forums/AbyssForum/crawler_selenium.py
index 129e6dc..4823db2 100644
--- a/Forums/AbyssForum/crawler_selenium.py
+++ b/Forums/AbyssForum/crawler_selenium.py
@@ -32,15 +32,15 @@ baseURL = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion
def startCrawling():
# opentor()
forumName = getForumName()
- # driver = getAccess()
- #
- # if driver != 'down':
- # try:
- # login(driver)
- # crawlForum(driver)
- # except Exception as e:
- # print(driver.current_url, e)
- # closetor(driver)
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
new_parse(forumName, baseURL, True)
@@ -121,6 +121,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
def getAccess():
@@ -241,14 +243,14 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
break
try:
- link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href')
+ link = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py
index 0f14223..e996150 100644
--- a/Forums/Altenens/crawler_selenium.py
+++ b/Forums/Altenens/crawler_selenium.py
@@ -42,7 +42,7 @@ def startCrawling():
print(driver.current_url, e)
closetor(driver)
- # new_parse(forumName, baseURL, True)
+ new_parse(forumName, baseURL, True)
# Opens Tor Browser
@@ -118,8 +118,8 @@ def createFFDriver():
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
+ # ff_prof.set_preference("network.dns.disablePrefetch", True)
+ # ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -136,6 +136,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
@@ -253,7 +255,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
@@ -272,7 +274,7 @@ def crawlForum(driver):
print(link, e)
i += 1
- input("Crawling Altenens forum done successfully. Press ENTER to continue\n")
+ print("Crawling the Altenens forum done.")
# Returns 'True' if the link is Topic link, may need to change for every website
diff --git a/Forums/BestCardingWorld/crawler_selenium.py b/Forums/BestCardingWorld/crawler_selenium.py
index 96821cd..22f6077 100644
--- a/Forums/BestCardingWorld/crawler_selenium.py
+++ b/Forums/BestCardingWorld/crawler_selenium.py
@@ -114,6 +114,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
@@ -238,8 +240,8 @@ def crawlForum(driver):
try:
nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/div[4]/ul')
- li = nav.find_element_by_class_name('next')
- page = li.find_element_by_tag_name('a').get_attribute('href')
+ li = nav.find_element(by=By.CLASS_NAME, value='next')
+ page = li.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
if page == "":
raise NoSuchElementException
counter += 1
@@ -252,7 +254,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
@@ -260,8 +262,8 @@ def crawlForum(driver):
try:
bar = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/div[3]/ul')
- next = bar.find_element_by_class_name('next')
- link = next.find_element_by_tag_name('a').get_attribute('href')
+ next = bar.find_element(by=By.CLASS_NAME, value='next')
+ link = next.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
@@ -273,7 +275,7 @@ def crawlForum(driver):
print(link, e)
i += 1
- input("Crawling BestCardingWorld forum done sucessfully. Press ENTER to continue\n")
+ print("Crawling the BestCardingWorld forum done.")
# Returns 'True' if the link is a description link
diff --git a/Forums/BestCardingWorld/parser.py b/Forums/BestCardingWorld/parser.py
index 540435a..c4ca6e0 100644
--- a/Forums/BestCardingWorld/parser.py
+++ b/Forums/BestCardingWorld/parser.py
@@ -152,7 +152,7 @@ def bestcardingworld_description_parser(soup):
# Populate the final variable (this should be a list with all fields scraped)
- row = (topic, post, user, addDate, feedback, status, reputation, sign, interest)
+ row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
# Sending the results
@@ -166,15 +166,17 @@ def bestcardingworld_description_parser(soup):
#return: 'row' that contains a variety of lists that each hold info on the listing page
def bestcardingworld_listing_parser(soup):
- nm = 0 # this variable should receive the number of topics
- topic = [] # 1 all topics
- board = "-1" # 2 board name (the previous level of the topic in the Forum categorization tree.
+ nm = 0 # *this variable should receive the number of topics
+ forum = "BestCardingWorld" # 0 *forum name
+ board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
- view = [] # 3 number of views of each topic
- post = [] # 4 number of posts of each topic
- user = [] # 5 all users of each topic
- addDate = [] # 6 when the topic was created (difficult to find)
- href = [] # 16 this variable should receive all cleaned urls (we will use this to do the marge between Listing and Description pages)
+ author = [] # 2 *all authors of each topic
+ topic = [] # 3 *all topics
+ views = [] # 4 number of views of each topic
+ posts = [] # 5 number of posts of each topic
+    href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
+ # Listing and Description pages)
+ addDate = [] # 7 when the topic was created (difficult to find)
# Finding the board (should be just one)
@@ -187,7 +189,12 @@ def bestcardingworld_listing_parser(soup):
itopics = soup.find('ul', {"class": "topiclist topics"}).findAll('div',{"class": "list-inner"})
replies = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "posts"})
- views = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "views"})
+ view = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "views"})
+
+ # Counting how many topics we have found so far
+
+ nm = len(itopics)
+
index = 0
for itopic in itopics:
@@ -213,10 +220,6 @@ def bestcardingworld_listing_parser(soup):
topics = itopic.find('a', {"class": "topictitle"}).text
topic.append(cleanString(topics))
- # Counting how many topics we have found so far
-
- nm = len(topic)
-
# Adding the url to the list of urls
link = itopic.find('a', {"class": "topictitle"}).get('href')
link = cleanLink(link)
@@ -224,18 +227,18 @@ def bestcardingworld_listing_parser(soup):
# Finding the author of the topic
ps = itopic.find('div', {"class":"responsive-hide"}).find('a', {"class": "username-coloured"}).text
- author = ps.strip()
- user.append(cleanString(author))
+ user = ps.strip()
+ author.append(cleanString(user))
# Finding the number of replies
- posts = replies[index].text.split()[0]
- posts = posts.strip()
- post.append(cleanString(posts))
+ post = replies[index].text.split()[0]
+ post = post.strip()
+ posts.append(cleanString(post))
# Finding the number of Views
- tview = views[index].text.split()[0]
+ tview = view[index].text.split()[0]
tview = tview.strip()
- view.append(cleanString(tview))
+ views.append(cleanString(tview))
# If no information about when the topic was added, just assign "-1" to the variable
#CryptBB doesn't show when topic was first posted on listing page
@@ -245,10 +248,9 @@ def bestcardingworld_listing_parser(soup):
addDate.append(date_time_obj)
#addDate.append("-1")
-
-
index += 1
- return organizeTopics("BestCardingWorld", nm, topic, board, view, post, user, addDate, href)
+
+ return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
#called by the crawler to get description links on a listing page
diff --git a/Forums/Cardingleaks/crawler_selenium.py b/Forums/Cardingleaks/crawler_selenium.py
index 85538fd..70200ff 100644
--- a/Forums/Cardingleaks/crawler_selenium.py
+++ b/Forums/Cardingleaks/crawler_selenium.py
@@ -34,15 +34,15 @@ baseURL = 'https://leaks.ws/'
def startCrawling():
# opentor()
forumName = getForumName()
- # driver = getAccess()
- #
- # if driver != 'down':
- # try:
- # login(driver)
- # crawlForum(driver)
- # except Exception as e:
- # print(driver.current_url, e)
- # closetor(driver)
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
new_parse(forumName, baseURL, True)
@@ -144,6 +144,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
@@ -159,7 +161,7 @@ def getAccess():
# Saves the crawled html page
-def savePage(page, url):
+def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
@@ -242,7 +244,7 @@ def crawlForum(driver):
driver.get(itemURL)
except:
driver.refresh()
- savePage(driver.page_source, topic + f"page{counter}") # very important
+ savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
@@ -261,7 +263,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/Forums/CryptBB/crawler_selenium.py b/Forums/CryptBB/crawler_selenium.py
index 5e98a7d..a462a65 100644
--- a/Forums/CryptBB/crawler_selenium.py
+++ b/Forums/CryptBB/crawler_selenium.py
@@ -30,15 +30,15 @@ baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion
def startCrawling():
# opentor()
forumName = getForumName()
- # driver = getAccess()
- #
- # if driver != 'down':
- # try:
- # login(driver)
- # crawlForum(driver)
- # except Exception as e:
- # print(driver.current_url, e)
- # closetor(driver)
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
new_parse(forumName, baseURL, True)
@@ -162,6 +162,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
@@ -289,7 +291,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/Forums/CryptBB/parser.py b/Forums/CryptBB/parser.py
index 7fbd56d..bcef5f8 100644
--- a/Forums/CryptBB/parser.py
+++ b/Forums/CryptBB/parser.py
@@ -124,7 +124,7 @@ def cryptBB_description_parser(soup):
stime = dt.replace('Yesterday,','').strip()
date_time_obj = yesterday+ ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
- elif "hours ago" in dt:
+ elif "hour ago" in dt or "hours ago" in dt:
day = day.strftime('%m-%d-%Y')
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py
index fc29c07..8bb03bb 100644
--- a/Forums/DB_Connection/db_connection.py
+++ b/Forums/DB_Connection/db_connection.py
@@ -2,7 +2,6 @@ __author__ = 'DarkWeb'
import psycopg2
import traceback
-import configparser
def connectDataBase():
diff --git a/Forums/HiddenAnswers/crawler_selenium.py b/Forums/HiddenAnswers/crawler_selenium.py
index 6641b81..392c90f 100644
--- a/Forums/HiddenAnswers/crawler_selenium.py
+++ b/Forums/HiddenAnswers/crawler_selenium.py
@@ -32,15 +32,15 @@ baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion
def startCrawling():
# opentor()
forumName = getForumName()
- # driver: webdriver.Firefox = getAccess()
- #
- # if driver != 'down':
- # try:
- # login(driver)
- # crawlForum(driver)
- # except Exception as e:
- # print(driver.current_url, e)
- # closetor(driver)
+ driver: webdriver.Firefox = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
new_parse(forumName, baseURL, True)
@@ -121,6 +121,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
def getAccess():
@@ -235,7 +237,7 @@ def crawlForum(driver: webdriver.Firefox):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/Forums/HiddenAnswers/parser.py b/Forums/HiddenAnswers/parser.py
index 1a3ee2d..16b56cb 100644
--- a/Forums/HiddenAnswers/parser.py
+++ b/Forums/HiddenAnswers/parser.py
@@ -127,15 +127,18 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
if date_posted.find("day") > 0:
datetime_obj = datetime.now() - timedelta(days=1)
else:
- datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
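+            # some listings show the date without a year (e.g. "Jun 5"), others with one (e.g. "Jun 5, 2023");
+            # try the year-less format first and fall back to the full one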
+ try:
+ datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
+ except ValueError:
+ datetime_obj = datetime.strptime(f"{date_posted}", "%b %d, %Y")
addDate.append(datetime_obj)
#this link will be cleaned
listing_href = queries.find("div", {"class": "qa-q-item-title"}).find("a").get("href")
href.append(listing_href)
-#need to change this method
nm = len(topic)
+
return organizeTopics(forum, nm, board, user, topic, view, post, href, addDate)
#need to change this method
diff --git a/Forums/Initialization/forumsList.txt b/Forums/Initialization/forumsList.txt
index 3526771..efa9686 100644
--- a/Forums/Initialization/forumsList.txt
+++ b/Forums/Initialization/forumsList.txt
@@ -1,9 +1 @@
-AbyssForum
-Altenens
-BestCardingWorld
-Cardingleaks
-CryptBB
-HiddenAnswers
-Libre
-OnniForums
-Procrax
\ No newline at end of file
+BestCardingWorld
\ No newline at end of file
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index 272cb44..1f089e6 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -12,6 +12,7 @@ from Forums.OnniForums.parser import *
from Forums.Altenens.parser import *
from Forums.Procrax.parser import *
from Forums.Libre.parser import *
+from Forums.HiddenAnswers.parser import *
from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@@ -126,6 +127,8 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
rw = procrax_listing_parser(soup)
elif forum == "Libre":
rw = libre_listing_parser(soup)
+ elif forum == "HiddenAnswers":
+ rw = HiddenAnswers_listing_parser(soup)
else:
print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
raise Exception
@@ -160,6 +163,8 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
rmm = procrax_description_parser(soup)
elif forum == "Libre":
rmm = libre_description_parser(soup)
+ elif forum == "HiddenAnswers":
+ rmm = HiddenAnswers_description_parser(soup)
else:
print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
raise Exception
diff --git a/Forums/Libre/crawler_selenium.py b/Forums/Libre/crawler_selenium.py
index d06cd83..1033474 100644
--- a/Forums/Libre/crawler_selenium.py
+++ b/Forums/Libre/crawler_selenium.py
@@ -30,15 +30,15 @@ baseURL = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion
def startCrawling():
# opentor()
forumName = getForumName()
- # driver = getAccess()
- #
- # if driver != 'down':
- # try:
- # login(driver)
- # crawlForum(driver)
- # except Exception as e:
- # print(driver.current_url, e)
- # closetor(driver)
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
new_parse(forumName, baseURL, True)
@@ -144,6 +144,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
@@ -255,7 +257,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
@@ -275,7 +277,7 @@ def crawlForum(driver):
print(link, e)
i += 1
- input("Crawling the Libre forum done.")
+ print("Crawling the Libre forum done.")
# Returns 'True' if the link is Topic link, may need to change for every website
diff --git a/Forums/OnniForums/crawler_selenium.py b/Forums/OnniForums/crawler_selenium.py
index 58b1313..0888d14 100644
--- a/Forums/OnniForums/crawler_selenium.py
+++ b/Forums/OnniForums/crawler_selenium.py
@@ -33,15 +33,15 @@ baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion
def startCrawling():
# opentor()
forumName = getForumName()
- # driver = getAccess()
- #
- # if driver != 'down':
- # try:
- # login(driver)
- # crawlForum(driver)
- # except Exception as e:
- # print(driver.current_url, e)
- # closetor(driver)
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
new_parse(forum=forumName, url=baseURL, createLog=True)
@@ -139,6 +139,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
@@ -267,7 +269,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/Forums/OnniForums/parser.py b/Forums/OnniForums/parser.py
index 393e6b3..3854141 100644
--- a/Forums/OnniForums/parser.py
+++ b/Forums/OnniForums/parser.py
@@ -139,12 +139,14 @@ def onniForums_listing_parser(soup: BeautifulSoup):
nm = len(thread_arrays)
for thread in thread_arrays: #getting the information from the posts and sorting them into the arrays defined above
-
- try:
- post_subject: str = thread.find("span",{"class": "subject_new"}).text #getting the topic
+
+ body = thread.find("span",{"class": "subject_new"})
+ try:
+ post_subject: str = body.text #getting the topic
except AttributeError:
- post_subject: str = thread.find("span",{"class": "subject_old"}).text
+ body = thread.find("span",{"class": "subject_old"})
+ post_subject: str = body.text
post_subject_cleaned = cleanString(post_subject.strip())
topic.append(post_subject_cleaned)
@@ -163,9 +165,8 @@ def onniForums_listing_parser(soup: BeautifulSoup):
author = thread.find("span",{"class" : "author smalltext"}).text
author_cleaned = cleanString(author.strip())
user.append(author_cleaned)
-
- reply_anchor = thread.find_all("td", {"align": "center"})[2].find('a')
- thread_link = reply_anchor.get('href')
+
+ thread_link = body.find('a').get('href')
href.append(thread_link)
return organizeTopics(
diff --git a/Forums/Procrax/crawler_selenium.py b/Forums/Procrax/crawler_selenium.py
index f2ed372..2a8be96 100644
--- a/Forums/Procrax/crawler_selenium.py
+++ b/Forums/Procrax/crawler_selenium.py
@@ -33,15 +33,15 @@ FORUM_NAME = 'Procrax'
# Opens Tor Browser, crawls the website
def startCrawling():
# opentor()
- # driver = getAccess()
- #
- # if driver != 'down':
- # try:
- # login(driver)
- # crawlForum(driver)
- # except Exception as e:
- # print(driver.current_url, e)
- # closetor(driver)
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
new_parse(
forum=FORUM_NAME,
@@ -140,6 +140,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+ driver.maximize_window()
+
return driver
def getAccess():
@@ -257,7 +259,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/MarketPlaces/AnonymousMarketplace/crawler_selenium.py b/MarketPlaces/AnonymousMarketplace/crawler_selenium.py
index 99b4431..f7f9f17 100644
--- a/MarketPlaces/AnonymousMarketplace/crawler_selenium.py
+++ b/MarketPlaces/AnonymousMarketplace/crawler_selenium.py
@@ -187,12 +187,8 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
- # # carding
- # links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/carding/')
- # # hacked paypal
- # links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacked-paypal-accounts/')
- # hacking services
- links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacking-services/')
+ # home
+ links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/')
return links
@@ -232,7 +228,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
@@ -240,7 +236,7 @@ def crawlForum(driver):
#left in in case site changes
try:
- link = ""
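+                    # the "→" arrow anchor is assumed to be the next-page control for this listing layout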
+ link = driver.find_element(by=By.LINK_TEXT, value="→").get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
diff --git a/MarketPlaces/AnonymousMarketplace/parser.py b/MarketPlaces/AnonymousMarketplace/parser.py
index 08dbaa3..a85934c 100644
--- a/MarketPlaces/AnonymousMarketplace/parser.py
+++ b/MarketPlaces/AnonymousMarketplace/parser.py
@@ -41,12 +41,12 @@ def anonymousMarketplace_description_parser(soup: Tag):
describe_output += div.text
describe = cleanString(describe_output.strip())
- product_ratings: Tag = soup.find("div", {"class": "star-rating"})
+ product_ratings: Tag = soup.find("div", {"class": "woocommerce-product-rating"})
- product_reviews = product_ratings.find("div", {"class": "woocommerce-product-rating"}).find("strong", {"class": "rating"}).text
+ product_reviews = product_ratings.find("span", {"class": "rating"}).text
reviews = cleanString(product_reviews.strip())
- product_star_rating = product_ratings.find("span", {"class": "rating"}).text
+ product_star_rating = product_ratings.find("strong", {"class": "rating"}).text
rating_item = cleanString(product_star_rating.strip())
product_price = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text
@@ -86,15 +86,16 @@ def anonymousMarketplace_listing_parser(soup: Tag):
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
-
-
- product_list: ResultSet[Tag] = soup.find("ul", {"class": "product_list_widget"}).find_all("li")
+
+ woo = soup.find('div', {"class": "woocommerce"})
+
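+    # recursive=False keeps the lookup at the top-level product grid rather than nested widget lists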
+ product_list = woo.find('ul', {"class": "products columns-4"}, recursive=False).find_all('li')
for item in product_list:
- item_href = item.find("a").get("href")
+ item_href = item.find("a", recursive=False).get("href")
href.append(item_href)
- item_name = item.find("span", {"class": "product-title"}).text
+ item_name = item.find("h2").text
name.append(cleanString(item_name.strip()))
item_rating = item.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text
@@ -103,14 +104,11 @@ def anonymousMarketplace_listing_parser(soup: Tag):
try:
item_price = item.find("span", {"class": "woocommerce-Price-amount amount"}).text
item_price = item_price.replace("$", "").strip()
- USD.append(item_price)
+ USD.append(cleanNumbers(item_price))
except AttributeError:
USD.append("-1")
-
-
-
-
- vendor.append("Anonymous")
+
+ vendor.append("AnonymousMarketplace")
rating_vendor.append("-1")
success.append("-1")
CVE.append("-1")
@@ -153,10 +151,6 @@ def anonymousMarketplace_listing_parser(soup: Tag):
shipTo=shipTo,
href=href
)
-
-
-
-
#called by the crawler to get description links on a listing page
@@ -167,10 +161,13 @@ def anonymous_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
- listing = soup.find('ul', {"class": "product_list_widget"}).findAll('li')
+
+ woo = soup.find('div', {"class": "woocommerce"})
+
+ listing = woo.find('ul', {"class": "products columns-4"}, recursive=False).find_all('li')
for a in listing:
- bae = a.find('a', href=True)
+ bae = a.find('a', href=True, recursive=False)
link = bae['href']
href.append(link)
diff --git a/MarketPlaces/Apocalypse/crawler_selenium.py b/MarketPlaces/Apocalypse/crawler_selenium.py
index c1f579d..4885c19 100644
--- a/MarketPlaces/Apocalypse/crawler_selenium.py
+++ b/MarketPlaces/Apocalypse/crawler_selenium.py
@@ -203,8 +203,12 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
- # # Hacking Services
- # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/19')
+ # # Digital Goods
+ # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/74')
+ # # Fraud
+ # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/75')
+ # # Services
+ # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/76')
# software and malware
links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/30')
@@ -243,7 +247,11 @@ def crawlForum(driver):
except:
driver.refresh()
savePage(driver, driver.page_source, item)
- driver.back()
+ # driver.back()
+ try:
+ driver.get(link)
+ except:
+ driver.refresh()
# comment out
# break
@@ -282,7 +290,7 @@ def isDescriptionLink(url):
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
- if 'subcategory' in url:
+ if 'category' in url:
return True
return False
diff --git a/MarketPlaces/DarkBazar/crawler_selenium.py b/MarketPlaces/DarkBazar/crawler_selenium.py
new file mode 100644
index 0000000..1f80aec
--- /dev/null
+++ b/MarketPlaces/DarkBazar/crawler_selenium.py
@@ -0,0 +1,276 @@
+__author__ = 'DarkWeb'
+
+'''
+DarkBazar Marketplace Crawler (Selenium)
+'''
+
+from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
+from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
+from selenium.webdriver.firefox.service import Service
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support.ui import Select
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+
+from PIL import Image
+import urllib.parse as urlparse
+import os, re, time
+from datetime import date
+import subprocess
+import configparser
+from bs4 import BeautifulSoup
+from MarketPlaces.Initialization.prepare_parser import new_parse
+from MarketPlaces.DarkBazar.parser import darkbazar_links_parser
+from MarketPlaces.Utilities.utilities import cleanHTML
+
+counter = 1
+baseURL = 'http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/'
+
+
+def startCrawling():
+ # opentor()
+ mktName = getMKTName()
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
+
+ new_parse(mktName, baseURL, True)
+
+
+# Opens Tor Browser
+def opentor():
+ from MarketPlaces.Initialization.markets_mining import config
+
+ global pid
+ print("Connecting Tor...")
+ pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
+ pid = pro.pid
+ time.sleep(7.5)
+ input('Tor Connected. Press ENTER to continue\n')
+ return
+
+
+# Returns the name of the website
+def getMKTName():
+ name = 'DarkBazar'
+ return name
+
+
+# Return the base link of the website
+def getFixedURL():
+ url = 'http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/'
+ return url
+
+
+# Closes Tor Browser
+def closetor(driver):
+ # global pid
+ # os.system("taskkill /pid " + str(pro.pid))
+ # os.system("taskkill /t /f /im tor.exe")
+ print('Closing Tor...')
+ driver.close()
+ time.sleep(3)
+ return
+
+
+# Creates FireFox 'driver' and configure its 'Profile'
+# to use Tor proxy and socket
+def createFFDriver():
+ from MarketPlaces.Initialization.markets_mining import config
+
+ ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
+
+ ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
+ ff_prof.set_preference("places.history.enabled", False)
+ ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
+ ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
+ ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
+ ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
+ ff_prof.set_preference("signon.rememberSignons", False)
+ ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
+ # ff_prof.set_preference("network.dns.disablePrefetch", True)
+ # ff_prof.set_preference("network.http.sendRefererHeader", 0)
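+    # permissions.default.image = 1 loads all images (presumably so the login CAPTCHA can render);
+    # several other crawlers in this repo set this to 3 instead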
+ ff_prof.set_preference("permissions.default.image", 1)
+ ff_prof.set_preference("browser.download.folderList", 2)
+ ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
+ ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
+ ff_prof.set_preference('network.proxy.type', 1)
+ ff_prof.set_preference("network.proxy.socks_version", 5)
+ ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
+ ff_prof.set_preference('network.proxy.socks_port', 9150)
+ ff_prof.set_preference('network.proxy.socks_remote_dns', True)
+ ff_prof.set_preference("javascript.enabled", False)
+ ff_prof.update_preferences()
+
+ service = Service(config.get('TOR', 'geckodriver_path'))
+
+ driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+
+ driver.maximize_window()
+
+ return driver
+
+
+# the driver 'gets' the url, attempting to access the site; if it can't, it returns 'down'
+def getAccess():
+ url = getFixedURL()
+ driver = createFFDriver()
+ try:
+ driver.get(url)
+ return driver
+ except:
+ driver.close()
+ return 'down'
+
+
+def login(driver):
+ input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
+
+ # entering username and password into input boxes
+ usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="username"]')
+ # Username here
+ usernameBox.send_keys('aliciamykeys')
+ passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="password"]')
+ # Password here
+ passwordBox.send_keys('aliciawherearemykey$')
+ # session time
+ session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[4]/div/div[2]/select'))
+ session_select.select_by_visible_text('Session 60min')
+
+ input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n")
+
+    # wait for the listing page to show up (this XPath may need to change based on the seed url)
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, '//*[@id="submit"]')))
+
+
+def savePage(driver, page, url):
+ cleanPage = cleanHTML(driver, page)
+ filePath = getFullPathName(url)
+ os.makedirs(os.path.dirname(filePath), exist_ok=True)
+ open(filePath, 'wb').write(cleanPage.encode('utf-8'))
+ return
+
+
+def getFullPathName(url):
+ from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
+
+ mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
+ fileName = getNameFromURL(url)
+ if isDescriptionLink(url):
+ fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
+ else:
+ fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
+ return fullPath
+
+
+def getMKTName() -> str:
+ name = 'DarkBazar'
+ return name
+
+
+def getNameFromURL(url):
+ global counter
+ name = ''.join(e for e in url if e.isalnum())
+ if name == '':
+ name = str(counter)
+ counter = counter + 1
+ return name
+
+
+def getInterestedLinks():
+ links = []
+
+ # # Digital Goods
+ # links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=4')
+ # Services
+ links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=5')
+
+ return links
+
+
+def crawlForum(driver):
+
+ print("Crawling the DarkBazar market")
+
+ linksToCrawl = getInterestedLinks()
+
+ i = 0
+ while i < len(linksToCrawl):
+ link = linksToCrawl[i]
+ print('Crawling :', link)
+ try:
+ has_next_page = True
+ count = 0
+
+ while has_next_page:
+ try:
+ driver.get(link)
+ except:
+ driver.refresh()
+ html = driver.page_source
+ savePage(driver, html, link)
+
+ list = productPages(html)
+
+ for item in list:
+ itemURL = urlparse.urljoin(baseURL, str(item))
+ try:
+ driver.get(itemURL)
+ except:
+ driver.refresh()
+ savePage(driver, driver.page_source, item)
+ driver.back()
+
+ # comment out
+ # break
+
+ # comment out
+ if count == 1:
+ break
+
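+                # pagination: an anchor whose text contains "Next" is assumed to point at the next listing page;
+                # an empty href is treated as the last page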
+ try:
+ link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
+ if link == "":
+ raise NoSuchElementException
+ count += 1
+
+ except NoSuchElementException:
+ has_next_page = False
+
+ except Exception as e:
+ print(link, e)
+ i += 1
+
+ print("Crawling the DarkBazar market done.")
+
+
+# Returns 'True' if the link is Topic link, may need to change for every website
+def isDescriptionLink(url):
+ if 'item' in url:
+ return True
+ return False
+
+
+# Returns True if the link is a listingPage link, may need to change for every website
+def isListingLink(url):
+ if 'category=' in url:
+ return True
+ return False
+
+
+def productPages(html):
+ soup = BeautifulSoup(html, "html.parser")
+ return darkbazar_links_parser(soup)
+
+
+def crawler():
+ startCrawling()
diff --git a/MarketPlaces/DarkBazar/parser.py b/MarketPlaces/DarkBazar/parser.py
new file mode 100644
index 0000000..ccb7266
--- /dev/null
+++ b/MarketPlaces/DarkBazar/parser.py
@@ -0,0 +1,267 @@
+__author__ = 'DarkWeb'
+
+# Here, we are importing the auxiliary functions to clean or convert data
+from MarketPlaces.Utilities.utilities import *
+
+# Here, we are importing BeautifulSoup to search through the HTML tree
+from bs4 import BeautifulSoup
+
+
+# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
+# stores info it needs in different lists, these lists are returned after being organized
+# @param: soup object looking at html page of description page
+# return: 'row' that contains a variety of lists that each hold info on the description page
+def darkbazar_description_parser(soup):
+ # Fields to be parsed
+
+ vendor = "-1" # 0 *Vendor_Name
+ success = "-1" # 1 Vendor_Successful_Transactions
+ rating_vendor = "-1" # 2 Vendor_Rating
+ name = "-1" # 3 *Product_Name
+ describe = "-1" # 4 Product_Description
+ CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+ MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
+ category = "-1" # 7 Product_Category
+ views = "-1" # 8 Product_Number_Of_Views
+ reviews = "-1" # 9 Product_Number_Of_Reviews
+ rating_item = "-1" # 10 Product_Rating
+ addDate = "-1" # 11 Product_AddedDate
+ BTC = "-1" # 12 Product_BTC_SellingPrice
+ USD = "-1" # 13 Product_USD_SellingPrice
+ EURO = "-1" # 14 Product_EURO_SellingPrice
+ sold = "-1" # 15 Product_QuantitySold
+ left = "-1" # 16 Product_QuantityLeft
+ shipFrom = "-1" # 17 Product_ShippedFrom
+ shipTo = "-1" # 18 Product_ShippedTo
+
+ # Finding Product Name
+ divmb = soup.findAll('div', {'class': "mb-1"})
+
+ name = divmb[0].text
+ name = name.replace('\n', ' ')
+ name = name.replace(",", "")
+ name = name.strip()
+
+ # Finding Vendor
+ vendor = divmb[1].find('a').text.strip()
+
+ # Finding Vendor Rating
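+    # the rating text is assumed to look roughly like
+    # "Vendor's Review : <pct>% (<count>) Product Review : <pct>% (<count> review)"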
+ temp = soup.find('div', {'class': ""}).text
+ temp = temp.split('(')
+ rating = temp[0].replace("Vendor's Review : ", "")
+ rating = rating.replace("%", "")
+ rating_vendor = rating.strip()
+
+ # Finding the Product Rating and Number of Product Reviews
+ reviews = temp[2].replace(" review)", "")
+ reviews = reviews.strip()
+
+ temp = temp[1].split(")")
+ rating = temp[1].replace("Product Review : ", "")
+ rating = rating.replace("%", "")
+ rating_item = rating.strip()
+
+ # Finding Prices
+ USD = soup.find('div', {'class': "h3 text-primary"}).text.strip()
+
+ # Finding the Product Category
+ pmb = soup.findAll('p', {'class': "mb-1"})
+
+ category = pmb[-1].text
+ category = category.replace("Category: ", "").strip()
+
+ # Finding the Product Quantity Available
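+    # the stock line is assumed to read like "<sold> sold, <left> in stock"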
+ left = divmb[-1].text
+ left = left.split(",", 1)[1]
+ left = left.replace("in stock", "")
+ left = left.strip()
+
+ # Finding Number Sold
+ sold = divmb[-1].text
+ sold = sold.split(",", 1)[0]
+ sold = sold.replace("sold", "")
+ sold = sold.strip()
+
+ # Finding Shipment Information (Origin)
+    shipFrom = pmb[0].text
+    shipFrom = shipFrom.replace("Ships from: ", "").strip()
+
+ # Finding Shipment Information (Destination)
+    shipTo = pmb[1].text
+    shipTo = shipTo.replace("Ships to: ", "").strip()
+
+ # Finding the Product description
+ cardbody = soup.findAll('div', {'class': "card-body"})
+ describe = cardbody[1].text.strip()
+
+ # Searching for CVE and MS categories
+ cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+ if cve:
+ CVE = " "
+ for idx in cve:
+ CVE += (idx)
+ CVE += " "
+ CVE = CVE.replace(',', ' ')
+ CVE = CVE.replace('\n', '')
+ ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
+ if ms:
+ MS = " "
+ for im in ms:
+ MS += (im)
+ MS += " "
+ MS = MS.replace(',', ' ')
+ MS = MS.replace('\n', '')
+
+ # Populating the final variable (this should be a list with all fields scraped)
+ row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
+ BTC, USD, EURO, sold, left, shipFrom, shipTo)
+
+ # Sending the results
+ return row
+
+
+# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
+# stores info it needs in different lists, these lists are returned after being organized
+# @param: soup object looking at html page of listing page
+# return: 'row' that contains a variety of lists that each hold info on the listing page
+def darkbazar_listing_parser(soup):
+
+ # Fields to be parsed
+ nm = 0 # *Total_Products (Should be Integer)
+ mktName = "DarkBazar" # 0 *Marketplace_Name
+ vendor = [] # 1 *Vendor y
+ rating_vendor = [] # 2 Vendor_Rating
+ success = [] # 3 Vendor_Successful_Transactions
+ name = [] # 4 *Product_Name y
+ CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
+ MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
+ category = [] # 7 Product_Category y
+ describe = [] # 8 Product_Description
+ views = [] # 9 Product_Number_Of_Views
+ reviews = [] # 10 Product_Number_Of_Reviews
+ rating_item = [] # 11 Product_Rating
+ addDate = [] # 12 Product_AddDate
+ BTC = [] # 13 Product_BTC_SellingPrice
+ USD = [] # 14 Product_USD_SellingPrice y
+ EURO = [] # 15 Product_EURO_SellingPrice
+ sold = [] # 16 Product_QuantitySold
+ qLeft = [] # 17 Product_QuantityLeft
+ shipFrom = [] # 18 Product_ShippedFrom
+ shipTo = [] # 19 Product_ShippedTo
+ href = [] # 20 Product_Links
+
+ listing = soup.findAll('div', {"id": "itembox"})
+
+ # Populating the Number of Products
+ nm = len(listing)
+
+ for a in listing:
+ bae = a.findAll('a', href=True)
+ lb = a.findAll('div', {"id": "littlebox"})
+
+ # Adding the url to the list of urls
+ link = bae[0].get('href')
+ link = cleanLink(link)
+ href.append(link)
+
+ # Finding the Product
+ product = lb[1].find('a').text
+ product = product.replace('\n', ' ')
+ product = product.replace(",", "")
+ product = product.replace("...", "")
+ product = product.strip()
+ name.append(product)
+
+ # Finding Prices
+ price = lb[-1].find('div', {"class": "mb-1"}).text
+ price = price.replace("$","")
+ price = price.strip()
+ USD.append(price)
+
+ # Finding the Vendor
+ vendor_name = lb[-1].find("a").text
+ vendor_name = vendor_name.replace(",", "")
+ vendor_name = vendor_name.strip()
+ vendor.append(vendor_name)
+
+ # Finding the Category
+ cat = lb[-1].find("span").text
+ cat = cat.replace("class:", "")
+ cat = cat.strip()
+ category.append(cat)
+
+ span = lb[0].findAll("span")
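+        # the spans inside the first "littlebox" block are assumed to be ordered [views, stock, sold]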
+
+ # Finding Number of Views
+ num = span[0].text
+ num = num.replace("views:", "")
+ num = num.strip()
+        views.append(num)
+
+ # Finding Number Sold
+ num = span[2].text
+ num = num.replace("Sold:", "")
+ num = num.strip()
+ sold.append(num)
+
+ # Finding Quantity Left
+ quant = span[1].text
+ quant = quant.replace("stock:", "")
+ quant = quant.strip()
+ qLeft.append(quant)
+
+
+ # Searching for CVE and MS categories
+ cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+ if not cve:
+ cveValue = "-1"
+ else:
+ cee = " "
+ for idx in cve:
+ cee += (idx)
+ cee += " "
+ cee = cee.replace(',', ' ')
+ cee = cee.replace('\n', '')
+ cveValue = cee
+ CVE.append(cveValue)
+
+ ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
+ if not ms:
+ MSValue = "-1"
+ else:
+ me = " "
+ for im in ms:
+ me += (im)
+ me += " "
+ me = me.replace(',', ' ')
+ me = me.replace('\n', '')
+ MSValue = me
+ MS.append(MSValue)
+
+ # Populate the final variable (this should be a list with all fields scraped)
+ return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
+ reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+
+
+# called by the crawler to get description links on a listing page
+# @param: beautifulsoup object that is using the correct html page (listing page)
+# return: list of description links from a listing page
+def darkbazar_links_parser(soup):
+ # Returning all links that should be visited by the Crawler
+
+ href = []
+ listing = soup.findAll('div', {"id": "itembox"})
+
+ # for a in listing:
+ # bae = a.find('a', {"class": "text-info"}, href=True)
+ # link = bae['href']
+ # href.append(link)
+
+ for a in listing:
+ bae = a.findAll('a', href=True)
+
+ # Adding the url to the list of urls
+ link = bae[0].get('href')
+ href.append(link)
+
+ return href
\ No newline at end of file
diff --git a/MarketPlaces/DarkMatter/crawler_selenium.py b/MarketPlaces/DarkMatter/crawler_selenium.py
index e0babcb..67183e5 100644
--- a/MarketPlaces/DarkMatter/crawler_selenium.py
+++ b/MarketPlaces/DarkMatter/crawler_selenium.py
@@ -229,17 +229,16 @@ def crawlForum(driver):
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
- time.sleep(1.5) # to keep from detecting click speed
+ time.sleep(3) # to keep from detecting click speed
driver.get(itemURL)
except:
driver.refresh()
savePage(driver, driver.page_source, item)
- time.sleep(1.5)
+ time.sleep(3) # to keep from detecting click speed
driver.back()
- # to keep from detecting click speed
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/MarketPlaces/DigitalThriftShop/crawler_selenium.py b/MarketPlaces/DigitalThriftShop/crawler_selenium.py
index 132d2af..39d9f5d 100644
--- a/MarketPlaces/DigitalThriftShop/crawler_selenium.py
+++ b/MarketPlaces/DigitalThriftShop/crawler_selenium.py
@@ -235,7 +235,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/MarketPlaces/HiddenMarket/crawler_selenium.py b/MarketPlaces/HiddenMarket/crawler_selenium.py
index 3813c76..b8cc323 100644
--- a/MarketPlaces/HiddenMarket/crawler_selenium.py
+++ b/MarketPlaces/HiddenMarket/crawler_selenium.py
@@ -277,7 +277,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt
index fe4ac4a..540b444 100644
--- a/MarketPlaces/Initialization/marketsList.txt
+++ b/MarketPlaces/Initialization/marketsList.txt
@@ -1,8 +1,12 @@
Apocalypse
+DarkBazar
DarkMatter
DigitalThriftShop
HiddenMarket
+LionMarketplace
Nexus
Robinhood
+ThiefWorld
TorBay
+TorMarket
ViceCity
\ No newline at end of file
diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py
index 4b9c02e..b93ef96 100644
--- a/MarketPlaces/Initialization/markets_mining.py
+++ b/MarketPlaces/Initialization/markets_mining.py
@@ -24,6 +24,7 @@ from MarketPlaces.HiddenMarket.crawler_selenium import crawler as crawlerHiddenM
from MarketPlaces.RobinhoodMarket.crawler_selenium import crawler as crawlerRobinhoodMarket
from MarketPlaces.Nexus.crawler_selenium import crawler as crawlerNexus
from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher
+from MarketPlaces.DarkBazar.crawler_selenium import crawler as crawlerDarkBazar
import configparser
import os
@@ -137,5 +138,7 @@ if __name__ == '__main__':
crawlerNexus()
elif mkt == "CypherMarketplace":
crawlerCypher()
+ elif mkt == "DarkBazar":
+ crawlerDarkBazar()
print("\nScraping process completed!")
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index a460a18..68ba3b9 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -1,4 +1,4 @@
-__author__ = 'Helium'
+__author__ = 'DarkWeb'
import glob
import os
@@ -21,6 +21,7 @@ from MarketPlaces.HiddenMarket.parser import *
from MarketPlaces.RobinhoodMarket.parser import *
from MarketPlaces.Nexus.parser import *
from MarketPlaces.MikesGrandStore.parser import *
+from MarketPlaces.DarkBazar.parser import *
from MarketPlaces.Classifier.classify_product import predict
@@ -152,6 +153,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
rw = nexus_listing_parser(soup)
elif marketPlace == "MikesGrandStore":
rw = mikesGrandStore_listing_parser(soup)
+ elif marketPlace == "DarkBazar":
+ rw = darkbazar_listing_parser(soup)
else:
print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
raise Exception
@@ -203,6 +206,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
rmm = nexus_description_parser(soup)
elif marketPlace == "MikesGrandStore":
rmm = mikesGrandStore_description_parser(soup)
+ elif marketPlace == "DarkBazar":
+ rmm = darkbazar_description_parser(soup)
else:
print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
raise Exception
diff --git a/MarketPlaces/LionMarketplace/crawler_selenium.py b/MarketPlaces/LionMarketplace/crawler_selenium.py
index 237838f..ab8e41d 100644
--- a/MarketPlaces/LionMarketplace/crawler_selenium.py
+++ b/MarketPlaces/LionMarketplace/crawler_selenium.py
@@ -234,7 +234,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/MarketPlaces/LionMarketplace/parser.py b/MarketPlaces/LionMarketplace/parser.py
index 726cc63..06a87e3 100644
--- a/MarketPlaces/LionMarketplace/parser.py
+++ b/MarketPlaces/LionMarketplace/parser.py
@@ -34,15 +34,11 @@ def lionmarketplace_description_parser(soup):
shipTo = "-1" # 18 Product_ShippedTo
# vendor name
- try:
- temp = soup.find('div', {'class': 'btn-group'}).find('a').text
- vendor = (cleanString(temp.strip()))
- except:
- print('vendor')
- vendor = "-1"
+ temp = soup.find('div', {'class': 'btn-group'}).find('a').text
+ vendor = (cleanString(temp.strip()))
# table with info
- table = soup.find('table', {'class', 'table border-0 text-left table-borderless'})
+ table = soup.find('table')
rows = table.findAll('tr')
# successful transaction
@@ -51,37 +47,20 @@ def lionmarketplace_description_parser(soup):
# vendor rating 5
rating_vendor = '-1'
-
# product name
- try:
- temp = soup.find('div', {'class', 'row'}).find('h2').text
- name = (cleanString(temp.strip()))
- except:
- name = '-1'
- print('product name')
+ temp = soup.find('div', {'class', 'row'}).find('h2').text
+ name = (cleanString(temp.strip()))
# product description
- try:
- temp = soup.find('div', {'class': "mt-4"}).findAll('p')
- temp = temp[1].text
- if "\n" in temp:
- temp = temp.replace("\n", " ")
- temp = temp.replace("\r", " ")
- describe = cleanString(temp.strip())
- except:
- describe="-1"
- print('describe')
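+    # find(text=True, recursive=False) grabs only the div's own text node, skipping text inside nested tags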
+ temp = soup.find('div', {'class': "mt-4"}).find(text=True, recursive=False)
+ describe = cleanString(temp.strip())
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much
# product category
- try:
- temp = rows[1].find('strong').text
- category = cleanString(temp.strip())
- except:
- category = "-1"
- print('category')
+ temp = rows[1].find('strong').text
+ category = cleanString(temp.strip())
# product number of views
views = "-1"
@@ -92,54 +71,38 @@ def lionmarketplace_description_parser(soup):
BTC = "-1"
# USD selling price
- try:
- temp = rows[2].find('strong').text
- if " $" in temp:
- temp = temp.replace(" $", "")
- elif "$" in temp:
- temp = temp.replace("$", "")
- USD = cleanString((temp.strip()))
- except:
- try:
- temp = soup.find('li').find('strong').text
- if " $" in temp:
- temp = temp.replace(" $", "")
- elif "$" in temp:
- temp = temp.replace("$", "")
- USD = cleanString((temp.strip()))
- except:
- print("USD")
+ temp = rows[2].find('strong').text
+ if " $" in temp:
+ temp = temp.replace(" $", "")
+ elif "$" in temp:
+ temp = temp.replace("$", "")
+ USD = cleanString((temp.strip()))
EURO = "-1" # 14 Product_EURO_SellingPrice
# product sold
- try:
- if (len(rows) <= 5):
- temp = rows[4].find('td').text
- string = cleanString(temp)
- if (string == 'Left/Sold'):
- temp = rows[4].findAll('td')
- temp = temp[1].findAll('span')
-
- # left
- temp2 = temp[1].text
- temp3 = temp[1].text
-
- if(" items" in temp2):
- temp2 = temp2.replace(" items", "")
- if(" items" in temp3):
- temp3 = temp3.replace(" items", "")
-
- sold = (cleanString(temp2.strip()))
- left = cleanString(temp3.strip())
- else:
- sold = '-1'
- left = "-1"
+ if (len(rows) <= 5):
+ temp = rows[4].find('td').text
+ string = cleanString(temp)
+ if (string == 'Left/Sold'):
+ temp = rows[4].findAll('td')
+ temp = temp[1].findAll('span')
+
+ # left
+ temp2 = temp[1].text
+ temp3 = temp[1].text
+
+ if(" items" in temp2):
+ temp2 = temp2.replace(" items", "")
+ if(" items" in temp3):
+ temp3 = temp3.replace(" items", "")
+
+ sold = (cleanString(temp2.strip()))
+ left = cleanString(temp3.strip())
else:
sold = '-1'
left = "-1"
- except:
- print("success")
+ else:
sold = '-1'
left = "-1"
@@ -161,7 +124,7 @@ def lionmarketplace_description_parser(soup):
def lionmarketplace_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
- mktName = "M00nkeyMarket" # 0 *Marketplace_Name
+ mktName = "LionMarketplace" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
@@ -183,21 +146,20 @@ def lionmarketplace_listing_parser(soup):
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
- listing = soup.findAll('div', {"class": "card-body"})
+ listings = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"})
# Populating the Number of Products
- nm = len(listing)
+ nm = len(listings)
- for a in listing:
+ for listing in listings:
+ a = listing.find('div', {"class": "card-body"})
row = a.findAll('p')
+
# vendor
- try:
- temp = row[3].text
- vendor.append(cleanString(temp.strip()))
- except:
- vendor.append("-1")
- print('vendor')
+ temp = row[3].text
+ temp = temp.replace("Vendor:", "")
+ vendor.append(cleanString(temp.strip()))
# vendor rating
rating_vendor.append("-1")
@@ -206,25 +168,16 @@ def lionmarketplace_listing_parser(soup):
success.append("-1")
# product name
- try:
- temp = a.find('a').text
- name.append(cleanString(temp.strip()))
- except:
- name.append("-1")
- print('product name')
+ temp = a.find('a').text
+ name.append(cleanString(temp.strip()))
CVE.append('-1')
MS.append('-1')
# product category
- try:
- temp = row[2].text
- if "Category: " in temp:
- temp = temp.replace("Category: ", "")
- category.append(cleanString(temp.strip()))
-
- except:
- print("Error in product category")
+ temp = row[2].text
+ temp = temp.replace("Category: ", "")
+ category.append(cleanString(temp.strip()))
describe.append('-1')
@@ -238,14 +191,10 @@ def lionmarketplace_listing_parser(soup):
BTC.append('-1')
# USD
- try:
- temp = row[0].find('strong').text
- if ' $' in temp:
- temp = temp.replace(" $", "")
- USD.append(cleanString(temp.strip())) # 14 Product_USD_SellingPrice
- except:
- print("USD")
- USD.append("-1")
+ temp = row[0].find('strong').text
+ if ' $' in temp:
+ temp = temp.replace(" $", "")
+ USD.append(cleanString(temp.strip())) # 14 Product_USD_SellingPrice
EURO.append("-1") # 15 Product_EURO_SellingPrice
@@ -257,11 +206,8 @@ def lionmarketplace_listing_parser(soup):
shipTo.append('-1') # 19 Product_ShippedTo
# href
- try:
- temp = a.find('a').get('href')
- href.append(temp)
- except:
- print('product name')
+ temp = a.find('a').get('href')
+ href.append(temp)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
@@ -276,9 +222,10 @@ def lionmarketplace_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
- listing = soup.findAll('div', {"class": "container d-flex justify-content-center"})
+ listings = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"})
- for a in listing:
+ for listing in listings:
+ a = listing.find('div', {"class": "card-body"})
bae = a.find('a', href=True)
link = bae['href']
href.append(link)
diff --git a/MarketPlaces/M00nkeyMarket/crawler_selenium.py b/MarketPlaces/M00nkeyMarket/crawler_selenium.py
index ccd8f11..690913e 100644
--- a/MarketPlaces/M00nkeyMarket/crawler_selenium.py
+++ b/MarketPlaces/M00nkeyMarket/crawler_selenium.py
@@ -159,7 +159,7 @@ def login(driver):
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/div/div[1]/div/div/div[2]/div[3]/div")))
+ (By.XPATH, "/html/body/div/div[2]/div/div/div/div/div/div[1]/a/img")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):
diff --git a/MarketPlaces/MetaVerseMarket/crawler_selenium.py b/MarketPlaces/MetaVerseMarket/crawler_selenium.py
new file mode 100644
index 0000000..e624d51
--- /dev/null
+++ b/MarketPlaces/MetaVerseMarket/crawler_selenium.py
@@ -0,0 +1,306 @@
+__author__ = 'Helium'
+
+'''
+MetaVerseMarket Marketplace Crawler (Selenium)
+not complete
+need to go through multiple pages...
+'''
+
+from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
+from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
+from selenium.webdriver.firefox.service import Service
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+
+from PIL import Image
+import urllib.parse as urlparse
+import os, re, time
+from datetime import date
+import subprocess
+import configparser
+from bs4 import BeautifulSoup
+from MarketPlaces.Initialization.prepare_parser import new_parse
+from MarketPlaces.MetaVerseMarket.parser import metaversemarket_links_parser
+from MarketPlaces.Utilities.utilities import cleanHTML
+
+counter = 1
+baseURL = 'http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/login'
+
+
+# Opens Tor Browser, crawls the website, then parses, then closes tor
+# acts as the main method for the crawler; the crawler() function at the end of this file calls it later
+def startCrawling():
+ # opentor()
+ mktName = getMKTName()
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
+
+ new_parse(mktName, baseURL, True)
+
+
+# Opens Tor Browser
+#prompts for ENTER input to continue
+def opentor():
+ from MarketPlaces.Initialization.markets_mining import config
+
+ global pid
+ print("Connecting Tor...")
+ pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
+ pid = pro.pid
+ time.sleep(7.5)
+ input('Tor Connected. Press ENTER to continue\n')
+ return
+
+
+# Returns the name of the website
+#return: name of site in string type
+def getMKTName():
+ name = 'MetaVerseMarket'
+ return name
+
+
+# Return the base link of the website
+#return: url of base site in string type
+def getFixedURL():
+ url = 'http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/login'
+ return url
+
+
+# Closes Tor Browser
+#@param: current selenium driver
+def closetor(driver):
+ # global pid
+ # os.system("taskkill /pid " + str(pro.pid))
+ # os.system("taskkill /t /f /im tor.exe")
+ print('Closing Tor...')
+ driver.close()
+ time.sleep(3)
+ return
+
+
+# Creates FireFox 'driver' and configures its 'Profile'
+# to use Tor proxy and socket
+def createFFDriver():
+ from MarketPlaces.Initialization.markets_mining import config
+
+ ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
+
+ ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
+ ff_prof.set_preference("places.history.enabled", False)
+ ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
+ ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
+ ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
+ ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
+ ff_prof.set_preference("signon.rememberSignons", False)
+ ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
+ ff_prof.set_preference("network.dns.disablePrefetch", True)
+ ff_prof.set_preference("network.http.sendRefererHeader", 0)
+ ff_prof.set_preference("permissions.default.image", 1)##
+ ff_prof.set_preference("browser.download.folderList", 2)
+ ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
+ ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
+ ff_prof.set_preference('network.proxy.type', 1)
+ ff_prof.set_preference("network.proxy.socks_version", 5)
+ ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
+ ff_prof.set_preference('network.proxy.socks_port', 9150)
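+ # 9150 is the SOCKS port exposed by the Tor Browser bundle; a standalone tor daemon usually listens on 9050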
+ ff_prof.set_preference('network.proxy.socks_remote_dns', True)
+ ff_prof.set_preference("javascript.enabled", False)
+ ff_prof.update_preferences()
+
+ service = Service(config.get('TOR', 'geckodriver_path'))
+
+ driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+
+ driver.maximize_window()
+
+ return driver
+
+
+# the driver 'gets' the url, attempting to reach the site; if the site can't be accessed, 'down' is returned
+#return: the selenium driver, or the string 'down'
+def getAccess():
+ url = getFixedURL()
+ driver = createFFDriver()
+ try:
+ driver.get(url)
+ return driver
+ except:
+ driver.close()
+ return 'down'
+
+
+# Logs in with premade account credentials; the CAPTCHA is solved manually in the browser while the
+# script pauses on an ENTER prompt in the terminal
+#@param: current selenium web driver
+def login(driver):
+
+ # entering username and password into input boxes
+ usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
+ # Username here
+ usernameBox.send_keys('metotomoto')
+ passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
+ # Password here
+ passwordBox.send_keys('lionking_kumba1ya')
+
+ input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n")
+
+ # wait for the listing page to show up (this XPath may need to change based on a different seed url)
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, '//*[@id="searchq"]')))
+
+# Saves the crawled html page, makes the directory path for html pages if not made
+def savePage(driver, page, url):
+ cleanPage = cleanHTML(driver, page)
+ filePath = getFullPathName(url)
+ os.makedirs(os.path.dirname(filePath), exist_ok=True)
+ open(filePath, 'wb').write(cleanPage.encode('utf-8'))
+ return
+
+
+# Gets the full path of the page to be saved along with its appropriate file name
+#@param: raw url as crawler crawls through every site
+def getFullPathName(url):
+ from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
+
+ mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
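+ # pages are saved under <shared_folder>/MarketPlaces/MetaVerseMarket/HTML_Pages/<CURRENT_DATE>/ in separate Listing and Description folders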
+ fileName = getNameFromURL(url)
+ if isDescriptionLink(url):
+ fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
+ else:
+ fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
+ return fullPath
+
+
+# Creates the file name from the passed URL; falls back to a global counter if nothing is left after cleaning
+#@param: raw url as crawler crawls through every site
+def getNameFromURL(url):
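+ # e.g. all non-alphanumeric characters of the url are stripped, so '/products/hacking' contributes 'productshacking' to the name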
+ global counter
+ name = ''.join(e for e in url if e.isalnum())
+ if (name == ''):
+ name = str(counter)
+ counter = counter + 1
+ return name
+
+
+# returns the list of urls of interest; the crawler runs through this list
+# in this crawler the links cover a few product categories:
+# Hacking, Hosting, and Hacking Guides and Tutorials
+def getInterestedLinks():
+ links = []
+
+ # hacking
+ links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/hacking')
+ # hosting
+ links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/hosting')
+ # hacking guides and tutorials
+ links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/hacking-guides-and-tutorials')
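+ # additional categories can be appended here using the same '/products/<category>' url pattern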
+
+ return links
+
+
+# iterates through the links of interest; each listing page is visited and crawled
+# listing and description pages are both crawled here, and both types of pages are saved
+#@param: selenium driver
+def crawlForum(driver):
+ print("Crawling the MetaVerse market")
+
+ linksToCrawl = getInterestedLinks()
+
+ i = 0
+ while i < len(linksToCrawl):
+ link = linksToCrawl[i]
+ print('Crawling :', link)
+ try:
+ has_next_page = True
+ count = 0
+
+ while has_next_page:
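+ # save the listing page itself, then visit and save every product description page it links to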
+ try:
+ driver.get(link)
+ except:
+ driver.refresh()
+ html = driver.page_source
+ savePage(driver, html, link)
+
+ list = productPages(html)
+ for item in list:
+ itemURL = urlparse.urljoin(baseURL, str(item))
+ try:
+ driver.get(itemURL)
+ except:
+ driver.refresh()
+ savePage(driver, driver.page_source, item)
+ driver.back()
+
+ # comment out
+ break
+
+ # comment out
+ if count == 1:
+ break
+
+ try:
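+ # look for the "next page" link; when it is missing or empty, this category is done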
+ link = driver.find_element(by=By.XPATH, value='//a[@class="page-link-next"]').get_attribute('href')
+ if link == "":
+ raise NoSuchElementException
+ count += 1
+
+ except NoSuchElementException:
+ has_next_page = False
+
+ except Exception as e:
+ print(link, e)
+ i += 1
+
+ print("Crawling the MetaVerse market done.")
+
+
+# Returns 'True' if the link is a description link
+#@param: url of any url crawled
+#return: true if is a description page, false if not
+def isDescriptionLink(url):
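+ # on this market, product description urls contain 'PR' while listing/category urls contain 'products'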
+ if 'PR' in url:
+ return True
+ return False
+
+
+# Returns True if the link is a listingPage link
+#@param: url of any url crawled
+#return: true if is a Listing page, false if not
+def isListingLink(url):
+ if 'products' in url:
+ return True
+ return False
+
+
+# calls the parser to extract description links from a listing page
+#@param: html source of a page reached from the interested link list i.e. getInterestedLinks()
+#return: list of description links that should be crawled through
+def productPages(html):
+ soup = BeautifulSoup(html, "html.parser")
+ return metaversemarket_links_parser(soup)
+
+
+# Drop links that "signout"
+# def isSignOut(url):
+# #absURL = urlparse.urljoin(url.base_url, url.url)
+# if 'signout' in url.lower() or 'logout' in url.lower():
+# return True
+#
+# return False
+
+
+def crawler():
+ startCrawling()
+ # print("Crawling and Parsing MetaVerseMarket .... DONE!")
diff --git a/MarketPlaces/MetaVerseMarket/parser.py b/MarketPlaces/MetaVerseMarket/parser.py
new file mode 100644
index 0000000..8c83293
--- /dev/null
+++ b/MarketPlaces/MetaVerseMarket/parser.py
@@ -0,0 +1,285 @@
+__author__ = 'DarkWeb'
+
+# Here, we are importing the auxiliary functions to clean or convert data
+from MarketPlaces.Utilities.utilities import *
+
+# Here, we are importing BeautifulSoup to search through the HTML tree
+from bs4 import BeautifulSoup
+import re  # the CVE / MS regexes below use re (it is likely also pulled in by the utilities wildcard import)
+
+
+# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
+# stores info it needs in different lists, these lists are returned after being organized
+# @param: soup object looking at html page of description page
+# return: 'row' that contains a variety of lists that each hold info on the description page
+def darkfox_description_parser(soup):
+ # Fields to be parsed
+
+ name = "-1" # 0 Product_Name
+ describe = "-1" # 1 Product_Description
+ lastSeen = "-1" # 2 Product_LastViewDate
+ CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+ MS = "-1" # 5 Product_MS_Classification (Microsoft Security)
+ review = "-1" # 6 Product_Number_Of_Reviews
+ category = "-1" # 7 Product_Category
+ shipFrom = "-1" # 8 Product_ShippedFrom
+ shipTo = "-1" # 9 Product_ShippedTo
+ left = "-1" # 10 Product_QuantityLeft
+ escrow = "-1" # 11 Vendor_Warranty
+ terms = "-1" # 12 Vendor_TermsAndConditions
+ vendor = "-1" # 13 Vendor_Name
+ sold = "-1" # 14 Product_QuantitySold
+ addDate = "-1" # 15 Product_AddedDate
+ BTC = "-1" # 18 Product_BTC_SellingPrice
+ USD = "-1" # 19 Product_USD_SellingPrice
+ rating = "-1" # 20 Vendor_Rating
+ success = "-1" # 21 Vendor_Successful_Transactions
+ EURO = "-1" # 22 Product_EURO_SellingPrice
+
+ # Finding Product Name
+ name = soup.find('h1').text
+ name = name.replace('\n', ' ')
+ name = name.replace(",", "")
+ name = name.strip()
+
+ # Finding Vendor
+ vendor = soup.find('h3').find('a').text.strip()
+
+ # Finding Vendor Rating
+ rating = soup.find('span', {'class': "tag is-dark"}).text.strip()
+
+ # Finding Successful Transactions
+ success = soup.find('h3').text
+ success = success.replace("Vendor: ", "")
+ success = success.replace(vendor, "")
+ success = success.replace("(", "")
+ success = success.replace(")", "")
+ success = success.strip()
+
+ bae = soup.find('div', {'class': "box"}).find_all('ul')
+
+ # Finding Prices
+ USD = bae[1].find('strong').text.strip()
+
+ li = bae[2].find_all('li')
+
+ # Finding Escrow
+ escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()
+
+ # Finding the Product Category
+ category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
+
+ # Finding the Product Quantity Available
+ left = li[3].find('span', {'class': "tag is-dark"}).text.strip()
+
+ # Finding Number Sold
+ sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()
+
+ li = bae[3].find_all('li')
+
+ # Finding Shipment Information (Origin)
+ if "Ships from:" in li[-2].text:
+ shipFrom = li[-2].text
+ shipFrom = shipFrom.replace("Ships from: ", "")
+ # shipFrom = shipFrom.replace(",", "")
+ shipFrom = shipFrom.strip()
+
+ # Finding Shipment Information (Destination)
+ shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
+ shipTo = shipTo.replace("Ships to: ", "")
+ shipTo = shipTo.strip()
+ if "certain countries" in shipTo:
+ countries = ""
+ tags = li[-1].find_all('span', {'class': "tag"})
+ for tag in tags:
+ country = tag.text.strip()
+ countries += country + ", "
+ shipTo = countries.strip(", ")
+
+ # Finding the Product description
+ describe = soup.find('div', {'class': "pre-line"}).text
+ describe = describe.replace("\n", " ")
+ describe = describe.strip()
+
+ '''# Finding the Number of Product Reviews
+ tag = soup.findAll(text=re.compile('Reviews'))
+ for index in tag:
+ reviews = index
+ par = reviews.find('(')
+ if par >=0:
+ reviews = reviews.replace("Reviews (","")
+ reviews = reviews.replace(")","")
+ reviews = reviews.split(",")
+ review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
+ else :
+ review = "-1"'''
+
+ # Searching for CVE and MS categories
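+ # e.g. identifiers like 'CVE-2017-0144' or 'MS17-010' anywhere in the page text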
+ cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+ if cve:
+ CVE = " "
+ for idx in cve:
+ CVE += (idx)
+ CVE += " "
+ CVE = CVE.replace(',', ' ')
+ CVE = CVE.replace('\n', '')
+ ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
+ if ms:
+ MS = " "
+ for im in ms:
+ MS += (im)
+ MS += " "
+ MS = MS.replace(',', ' ')
+ MS = MS.replace('\n', '')
+
+ # Populating the final variable (this should be a list with all fields scraped)
+ row = (name, describe, lastSeen, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
+ sold, addDate, BTC, USD, rating, success, EURO)
+
+ # Sending the results
+ return row
+
+
+# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
+# stores info it needs in different lists, these lists are returned after being organized
+# @param: soup object looking at html page of listing page
+# return: 'row' that contains a variety of lists that each hold info on the listing page
+def darkfox_listing_parser(soup):
+ # Fields to be parsed
+ nm = 0 # Total_Products (Should be Integer)
+ mktName = "DarkFox" # 0 Marketplace_Name
+ name = [] # 1 Product_Name
+ CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+ MS = [] # 3 Product_MS_Classification (Microsoft Security)
+ category = [] # 4 Product_Category
+ describe = [] # 5 Product_Description
+ escrow = [] # 6 Vendor_Warranty
+ views = [] # 7 Product_Number_Of_Views
+ reviews = [] # 8 Product_Number_Of_Reviews
+ addDate = [] # 9 Product_AddDate
+ lastSeen = [] # 10 Product_LastViewDate
+ BTC = [] # 11 Product_BTC_SellingPrice
+ USD = [] # 12 Product_USD_SellingPrice
+ EURO = [] # 13 Product_EURO_SellingPrice
+ sold = [] # 14 Product_QuantitySold
+ qLeft = [] # 15 Product_QuantityLeft
+ shipFrom = [] # 16 Product_ShippedFrom
+ shipTo = [] # 17 Product_ShippedTo
+ vendor = [] # 18 Vendor
+ rating = [] # 19 Vendor_Rating
+ success = [] # 20 Vendor_Successful_Transactions
+ href = [] # 23 Product_Links (Urls)
+
+ listing = soup.findAll('div', {"class": "card"})
+
+ # Populating the Number of Products
+ nm = len(listing)
+
+ for a in listing:
+ bae = a.findAll('a', href=True)
+
+ # Adding the url to the list of urls
+ link = bae[0].get('href')
+ link = cleanLink(link)
+ href.append(link)
+
+ # Finding the Product
+ product = bae[1].find('p').text
+ product = product.replace('\n', ' ')
+ product = product.replace(",", "")
+ product = product.replace("...", "")
+ product = product.strip()
+ name.append(product)
+
+ bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
+
+ if len(bae) >= 5:
+ # Finding Prices
+ price = bae[0].text
+ ud = price.replace(" USD", " ")
+ # u = ud.replace("$","")
+ u = ud.replace(",", "")
+ u = u.strip()
+ USD.append(u)
+ # bc = (prc[1]).strip(' BTC')
+ # BTC.append(bc)
+
+ # Finding the Vendor
+ vendor_name = bae[1].find('a').text
+ vendor_name = vendor_name.replace(",", "")
+ vendor_name = vendor_name.strip()
+ vendor.append(vendor_name)
+
+ # Finding the Category
+ cat = bae[2].find('small').text
+ cat = cat.replace("Category: ", "")
+ cat = cat.replace(",", "")
+ cat = cat.strip()
+ category.append(cat)
+
+ # Finding Number Sold and Quantity Left
+ num = bae[3].text
+ num = num.replace("Sold: ", "")
+ num = num.strip()
+ sold.append(num)
+
+ quant = bae[4].find('small').text
+ quant = quant.replace("In stock: ", "")
+ quant = quant.strip()
+ qLeft.append(quant)
+
+ # Finding Successful Transactions
+ freq = bae[1].text
+ freq = freq.replace(vendor_name, "")
+ freq = re.sub(r'Vendor Level \d+', "", freq)
+ freq = freq.replace("(", "")
+ freq = freq.replace(")", "")
+ freq = freq.strip()
+ success.append(freq)
+
+ # Searching for CVE and MS categories
+ cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+ if not cve:
+ cveValue = "-1"
+ else:
+ cee = " "
+ for idx in cve:
+ cee += (idx)
+ cee += " "
+ cee = cee.replace(',', ' ')
+ cee = cee.replace('\n', '')
+ cveValue = cee
+ CVE.append(cveValue)
+
+ ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
+ if not ms:
+ MSValue = "-1"
+ else:
+ me = " "
+ for im in ms:
+ me += (im)
+ me += " "
+ me = me.replace(',', ' ')
+ me = me.replace('\n', '')
+ MSValue = me
+ MS.append(MSValue)
+
+ # Populate the final variable (this should be a list with all fields scraped)
+ return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
+ BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
+
+
+# called by the crawler to get description links on a listing page
+# @param: beautifulsoup object that is using the correct html page (listing page)
+# return: list of description links from a listing page
+def metaversemarket_links_parser(soup):
+ # Returning all links that should be visited by the Crawler
+
+ href = []
+ listing = soup.findAll('div', {"class": "col-12 p-0"})
+
+ for a in listing:
+ bae = a.find('a', href=True)
+ link = bae['href']
+ href.append(link)
+
+ return href
\ No newline at end of file
diff --git a/MarketPlaces/Nexus/crawler_selenium.py b/MarketPlaces/Nexus/crawler_selenium.py
index 70e1480..1962d79 100644
--- a/MarketPlaces/Nexus/crawler_selenium.py
+++ b/MarketPlaces/Nexus/crawler_selenium.py
@@ -173,16 +173,24 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
- # Bot nets
- links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/botnets/')
- # # Rats
- # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/rats/')
- # # Ransomware
- # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/ransomware/')
- # # Other Malware
- # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/outros-malware/')
- # # Hacking Tools & Scripting
- # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/hacking-spam/ferramentas-de-hacking-scripts/')
+ # malware
+ links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/')
+ # # hacking-spam
+ # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/hacking-spam/')
+ # # hacking services
+ # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/hacking/')
+ # # programming services
+ # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/programacao/')
+ # # remote admin services
+ # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/administracao-remota/')
+ # # hacking guides
+ # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-hacking/')
+ # # malware guides
+ # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-malware/')
+ # # fraud guides
+ # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-fraudes/')
+ # # fraud software
+ # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/fraudes/software-de-fraude/')
return links
@@ -222,7 +230,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/MarketPlaces/PabloEscobarMarket/crawler_selenium.py b/MarketPlaces/PabloEscobarMarket/crawler_selenium.py
new file mode 100644
index 0000000..cc9b890
--- /dev/null
+++ b/MarketPlaces/PabloEscobarMarket/crawler_selenium.py
@@ -0,0 +1,268 @@
+__author__ = 'DarkWeb'
+
+'''
+PabloEscobarMarket Marketplace Crawler (Selenium)
+'''
+
+from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
+from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
+from selenium.webdriver.firefox.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+from PIL import Image
+import urllib.parse as urlparse
+import os, re, time
+import subprocess
+from bs4 import BeautifulSoup
+from MarketPlaces.Initialization.prepare_parser import new_parse
+from MarketPlaces.PabloEscobarMarket.parser import pabloescobarmarket_links_parser
+from MarketPlaces.Utilities.utilities import cleanHTML
+
+counter = 1
+baseURL = 'http://niejmptjzwhlfywruoab4pbuxg7kp2mtcr4c6mgpeykju5matewg36yd.onion/'
+
+
+# Opens Tor Browser, crawls the website
+def startCrawling():
+ # opentor()
+ mktName = getMKTName()
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
+
+ new_parse(mktName, baseURL, True)
+
+
+# Opens Tor Browser
+def opentor():
+ from MarketPlaces.Initialization.markets_mining import config
+
+ global pid
+ print("Connecting Tor...")
+ pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
+ pid = pro.pid
+ time.sleep(7.5)
+ input('Tor Connected. Press ENTER to continue\n')
+ return
+
+
+# Logs in using premade account credentials; the login captcha is solved manually
+def login(driver):
+ input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
+
+ # entering username and password into input boxes
+ usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
+ # Username here
+ usernameBox.send_keys('snorlaxrights')
+ passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="inputPassword3"]')
+ # Password here
+ passwordBox.send_keys('$noringAllday')
+
+ input("Press ENTER when CAPTCHA is completed\n")
+
+ # wait for the listing page to show up (this XPath may need to change based on a different seed url)
+ # wait up to 50 sec until the element with id 'collapse3' is visible, then continue
+ WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
+ (By.XPATH, '//*[@id="collapse3"]')))
+
+
+# Returns the name of the website
+def getMKTName() -> str:
+ name = 'PabloEscobarMarket'
+ return name
+
+
+# Return the link of the website
+def getFixedURL():
+ url = 'http://niejmptjzwhlfywruoab4pbuxg7kp2mtcr4c6mgpeykju5matewg36yd.onion/'
+ return url
+
+
+# Closes Tor Browser
+def closetor(driver):
+ # global pid
+ # os.system("taskkill /pid " + str(pro.pid))
+ # os.system("taskkill /t /f /im tor.exe")
+ print('Closing Tor...')
+ driver.close() #close tab
+ time.sleep(3)
+ return
+
+
+# Creates FireFox 'driver' and configures its 'Profile'
+# to use Tor proxy and socket
+def createFFDriver():
+ from MarketPlaces.Initialization.markets_mining import config
+
+ ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
+
+ ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
+ ff_prof.set_preference("places.history.enabled", False)
+ ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
+ ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
+ ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
+ ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
+ ff_prof.set_preference("signon.rememberSignons", False)
+ ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
+ ff_prof.set_preference("network.dns.disablePrefetch", True)
+ ff_prof.set_preference("network.http.sendRefererHeader", 0)
+ ff_prof.set_preference("permissions.default.image", 3)
+ ff_prof.set_preference("browser.download.folderList", 2)
+ ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
+ ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
+ ff_prof.set_preference('network.proxy.type', 1)
+ ff_prof.set_preference("network.proxy.socks_version", 5)
+ ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
+ ff_prof.set_preference('network.proxy.socks_port', 9150)
+ ff_prof.set_preference('network.proxy.socks_remote_dns', True)
+ ff_prof.set_preference("javascript.enabled", True)
+ ff_prof.update_preferences()
+
+ service = Service(config.get('TOR', 'geckodriver_path'))
+
+ driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+
+ return driver
+
+
+def getAccess():
+ url = getFixedURL()
+ driver = createFFDriver()
+ try:
+ driver.get(url)
+ return driver
+ except:
+ driver.close()
+ return 'down'
+
+
+# Saves the crawled html page
+def savePage(driver, page, url):
+ cleanPage = cleanHTML(driver, page)
+ filePath = getFullPathName(url)
+ os.makedirs(os.path.dirname(filePath), exist_ok=True)
+ open(filePath, 'wb').write(cleanPage.encode('utf-8'))
+ return
+
+
+# Gets the full path of the page to be saved along with its appropriate file name
+def getFullPathName(url):
+ from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
+
+ mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
+ fileName = getNameFromURL(url)
+ if isDescriptionLink(url):
+ fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
+ else:
+ fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
+ return fullPath
+
+
+# Creates the file name from passed URL
+def getNameFromURL(url):
+ global counter
+ name = ''.join(e for e in url if e.isalnum())
+ if name == '':
+ name = str(counter)
+ counter = counter + 1
+ return name
+
+# FIX
+def getInterestedLinks():
+ links = []
+
+ # hire hacker
+ links.append('http://niejmptjzwhlfywruoab4pbuxg7kp2mtcr4c6mgpeykju5matewg36yd.onion/?sub_id=36')
+ # hacker
+ links.append('http://niejmptjzwhlfywruoab4pbuxg7kp2mtcr4c6mgpeykju5matewg36yd.onion/?sub_id=34')
+
+ return links
+
+
+def crawlForum(driver):
+ print("Crawling the PabloEscobarMarket market")
+
+ linksToCrawl = getInterestedLinks()
+
+ i = 0
+ while i < len(linksToCrawl):
+ link = linksToCrawl[i]
+ print('Crawling :', link)
+ try:
+ has_next_page = True
+ count = 0
+
+ while has_next_page:
+ try:
+ driver.get(link)
+ except:
+ driver.refresh()
+ html = driver.page_source
+ savePage(driver, html, link)
+
+ list = productPages(html)
+ for item in list:
+ itemURL = urlparse.urljoin(baseURL, str(item))
+ try:
+ driver.get(itemURL)
+ except:
+ driver.refresh()
+ savePage(driver, driver.page_source, item)
+ driver.back()
+
+ # comment out
+ break
+
+ # comment out
+ if count == 1:
+ break
+
+ try:
+ link = driver.find_element(by=By.XPATH, value='//a[@rel="next"]').get_attribute('href')
+ if link == "":
+ raise NoSuchElementException
+ count += 1
+
+ except NoSuchElementException:
+ has_next_page = False
+
+ except Exception as e:
+ print(link, e)
+ i += 1
+
+ print("Crawling the PabloEscobarMarket market done.")
+
+
+# Returns 'True' if the link is a product description link, may need to change for every website
+def isDescriptionLink(url):
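+ # product description urls on this market contain 'single_product'; listing urls carry a 'sub_id' query parameter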
+ if 'single_product' in url:
+ return True
+ return False
+
+
+# Returns True if the link is a listing page link, may need to change for every website
+def isListingLink(url):
+ if 'sub_id' in url:
+ return True
+ return False
+
+
+# calls the parser to extract description links from a listing page
+def productPages(html):
+ soup = BeautifulSoup(html, "html.parser")
+ return pabloescobarmarket_links_parser(soup)
+
+
+def crawler():
+ startCrawling()
+ # print("Crawling and Parsing PabloEscobarMarket .... DONE!")
diff --git a/MarketPlaces/PabloEscobarMarket/parser.py b/MarketPlaces/PabloEscobarMarket/parser.py
new file mode 100644
index 0000000..ecdd086
--- /dev/null
+++ b/MarketPlaces/PabloEscobarMarket/parser.py
@@ -0,0 +1,241 @@
+__author__ = 'DarkWeb'
+
+# Here, we are importing the auxiliary functions to clean or convert data
+from MarketPlaces.Utilities.utilities import *
+
+# Here, we are importing BeautifulSoup to search through the HTML tree
+from bs4 import BeautifulSoup
+import re  # used by the CVE / MS pattern matching below (likely also available via the utilities wildcard import)
+
+
+# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
+# stores info it needs in different lists, these lists are returned after being organized
+# @param: soup object looking at html page of description page
+# return: 'row' that contains a variety of lists that each hold info on the description page
+def pabloescobarmarket_description_parser(soup):
+ # Fields to be parsed
+
+ name = "-1" # 0 Product_Name
+ describe = "-1" # 1 Product_Description
+ lastSeen = "-1" # 2 Product_LastViewDate
+ CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+ MS = "-1" # 5 Product_MS_Classification (Microsoft Security)
+ review = "-1" # 6 Product_Number_Of_Reviews
+ category = "-1" # 7 Product_Category
+ shipFrom = "-1" # 8 Product_ShippedFrom
+ shipTo = "-1" # 9 Product_ShippedTo
+ left = "-1" # 10 Product_QuantityLeft
+ escrow = "-1" # 11 Vendor_Warranty
+ terms = "-1" # 12 Vendor_TermsAndConditions
+ vendor = "-1" # 13 Vendor_Name
+ sold = "-1" # 14 Product_QuantitySold
+ addDate = "-1" # 15 Product_AddedDate
+ BTC = "-1" # 18 Product_BTC_SellingPrice
+ USD = "-1" # 19 Product_USD_SellingPrice
+ rating = "-1" # 20 Vendor_Rating
+ success = "-1" # 21 Vendor_Successful_Transactions
+ EURO = "-1" # 22 Product_EURO_SellingPrice
+
+ # Finding Product Name
+ # NA
+
+ divmd7 = soup.find('div', {'class': "col-md-7"})
+ ptag = soup.findAll('p')
+ # Finding Vendor
+ vendor = divmd7.find('a').text.strip()
+
+ # Finding Vendor Rating
+ # NA
+
+ # Finding Successful Transactions
+ success = soup.find('span', {'class': "badge-primary"}).text.strip()  # badge text, not the Tag object
+
+ # Finding Prices
+ USD = soup.find('span', {'class': "total"}).text.strip()
+
+ BTC = soup.find('div', {'class': "text-center"}).text.strip()
+
+ # Finding Escrow
+ escrow = ptag[-1].text.strip()
+
+ # Finding the Product Category
+ category = ptag[-2].text.strip()
+
+ # Finding the Product Quantity Available
+ # NA
+
+ # Finding Number Sold
+ # NA
+
+ # Finding Shipment Information (Origin)
+ # NA
+
+ # Finding Shipment Information (Destination)
+ # NA
+
+ # Finding the Product description
+ describe = soup.find('div', {'class': "text-white"}).text
+ describe = describe.replace("\n", " ")
+ describe = describe.strip()
+
+ '''# Finding the Number of Product Reviews
+ tag = soup.findAll(text=re.compile('Reviews'))
+ for index in tag:
+ reviews = index
+ par = reviews.find('(')
+ if par >=0:
+ reviews = reviews.replace("Reviews (","")
+ reviews = reviews.replace(")","")
+ reviews = reviews.split(",")
+ review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
+ else :
+ review = "-1"'''
+
+ # Searching for CVE and MS categories
+ cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+ if cve:
+ CVE = " "
+ for idx in cve:
+ CVE += (idx)
+ CVE += " "
+ CVE = CVE.replace(',', ' ')
+ CVE = CVE.replace('\n', '')
+ ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
+ if ms:
+ MS = " "
+ for im in ms:
+ MS += (im)
+ MS += " "
+ MS = MS.replace(',', ' ')
+ MS = MS.replace('\n', '')
+
+ # Populating the final variable (this should be a list with all fields scraped)
+ row = (name, describe, lastSeen, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
+ sold, addDate, BTC, USD, rating, success, EURO)
+
+ # Sending the results
+ return row
+
+
+# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
+# stores info it needs in different lists, these lists are returned after being organized
+# @param: soup object looking at html page of listing page
+# return: 'row' that contains a variety of lists that each hold info on the listing page
+def pabloescobarmarket_listing_parser(soup):
+ # Fields to be parsed
+ nm = 0 # Total_Products (Should be Integer)
+ mktName = "PabloEscobarMarket" # 0 Marketplace_Name
+ name = [] # 1 Product_Name
+ CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+ MS = [] # 3 Product_MS_Classification (Microsoft Security)
+ category = [] # 4 Product_Category
+ describe = [] # 5 Product_Description
+ escrow = [] # 6 Vendor_Warranty
+ views = [] # 7 Product_Number_Of_Views
+ reviews = [] # 8 Product_Number_Of_Reviews
+ addDate = [] # 9 Product_AddDate
+ lastSeen = [] # 10 Product_LastViewDate
+ BTC = [] # 11 Product_BTC_SellingPrice
+ USD = [] # 12 Product_USD_SellingPrice
+ EURO = [] # 13 Product_EURO_SellingPrice
+ sold = [] # 14 Product_QuantitySold
+ qLeft = [] # 15 Product_QuantityLeft
+ shipFrom = [] # 16 Product_ShippedFrom
+ shipTo = [] # 17 Product_ShippedTo
+ vendor = [] # 18 Vendor
+ rating = [] # 19 Vendor_Rating
+ success = [] # 20 Vendor_Successful_Transactions
+ href = [] # 23 Product_Links (Urls)
+
+ listing = soup.findAll('div', {"class": "p-4"})
+
+ # Populating the Number of Products
+ nm = len(listing)
+
+ for a in listing:
+ bae = a.findAll('a', href=True)
+
+ # Adding the url to the list of urls
+ link = bae[0].get('href')
+ link = cleanLink(link)
+ href.append(link)
+
+ # Finding the Product
+ product = a.find('h4').text
+ product = product.replace('\n', ' ')
+ product = product.replace(",", "")
+ product = product.replace("...", "")
+ product = product.strip()
+ name.append(product)
+
+
+ # Finding Prices
+ price = a.find('div', {"class": "price"}).text
+ tempUSD = price.split("~")[0]
+ tempUSD = tempUSD.replace("$", "")
+ tempUSD = tempUSD.strip()
+ USD.append(tempUSD)
+
+ tempBTC = price.split("~")[1]
+ tempBTC = tempBTC.replace("BTC", "")
+ tempBTC = tempBTC.strip()
+ BTC.append(tempBTC)
+
+ # Finding the Vendor
+ #NA
+
+ # Finding the Category
+ # NA
+
+ # Finding Number Sold and Quantity Left
+ # NA
+
+ # Finding Successful Transactions
+ # NA
+
+ # Searching for CVE and MS categories
+ cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+ if not cve:
+ cveValue = "-1"
+ else:
+ cee = " "
+ for idx in cve:
+ cee += (idx)
+ cee += " "
+ cee = cee.replace(',', ' ')
+ cee = cee.replace('\n', '')
+ cveValue = cee
+ CVE.append(cveValue)
+
+ ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
+ if not ms:
+ MSValue = "-1"
+ else:
+ me = " "
+ for im in ms:
+ me += (im)
+ me += " "
+ me = me.replace(',', ' ')
+ me = me.replace('\n', '')
+ MSValue = me
+ MS.append(MSValue)
+
+ # Populate the final variable (this should be a list with all fields scraped)
+ return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
+ BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
+
+
+# called by the crawler to get description links on a listing page
+# @param: beautifulsoup object that is using the correct html page (listing page)
+# return: list of description links from a listing page FIX
+def pabloescobarmarket_links_parser(soup):
+ # Returning all links that should be visited by the Crawler
+
+ href = []
+ listing = soup.findAll('div', {"class": "p-4"})
+
+ for a in listing:
+ bae = a.find('a', href=True)
+ link = bae['href']
+ href.append(link)
+
+ return href
\ No newline at end of file
diff --git a/MarketPlaces/RobinhoodMarket/crawler_selenium.py b/MarketPlaces/RobinhoodMarket/crawler_selenium.py
index 9124a8f..4ce1cfe 100644
--- a/MarketPlaces/RobinhoodMarket/crawler_selenium.py
+++ b/MarketPlaces/RobinhoodMarket/crawler_selenium.py
@@ -218,7 +218,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
@@ -266,5 +266,6 @@ def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")
+
if __name__ == '__main__':
- startCrawling()
\ No newline at end of file
+ startCrawling()
diff --git a/MarketPlaces/ThiefWorld/crawler_selenium.py b/MarketPlaces/ThiefWorld/crawler_selenium.py
index 345bdbe..e532006 100644
--- a/MarketPlaces/ThiefWorld/crawler_selenium.py
+++ b/MarketPlaces/ThiefWorld/crawler_selenium.py
@@ -144,7 +144,7 @@ def getAccess():
def login(driver):
# wait for page to show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/div/div[1]/div/div[1]/div[1]/ul")))
+ (By.XPATH, "/html/body/div/header/div[2]/div/nav/div[2]/a[1]")))
temp = driver.find_element(By.XPATH, '/html/body/div/header/div[2]/div/nav/div[2]/a[1]').get_attribute(
'href') # /html/body/div/div[2]/div/div[2]/div
@@ -242,7 +242,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/MarketPlaces/ThiefWorld/parser.py b/MarketPlaces/ThiefWorld/parser.py
index ee15092..dbf7584 100644
--- a/MarketPlaces/ThiefWorld/parser.py
+++ b/MarketPlaces/ThiefWorld/parser.py
@@ -53,7 +53,7 @@ def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple:
USD = cleanString(usdText.replace("USD", "").strip())
ratingDiv = soup.find('div', {'class': 'rating_star'})
- rating_vendor = ratingDiv.get('title').strip(' ')[1]
+ rating_vendor = ratingDiv.get('title').split(' ')[1]
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
diff --git a/MarketPlaces/TorBay/crawler_selenium.py b/MarketPlaces/TorBay/crawler_selenium.py
index 5035999..4c8de5b 100644
--- a/MarketPlaces/TorBay/crawler_selenium.py
+++ b/MarketPlaces/TorBay/crawler_selenium.py
@@ -228,7 +228,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/MarketPlaces/TorMarket/crawler_selenium.py b/MarketPlaces/TorMarket/crawler_selenium.py
index b76fb1c..e1612ea 100644
--- a/MarketPlaces/TorMarket/crawler_selenium.py
+++ b/MarketPlaces/TorMarket/crawler_selenium.py
@@ -37,7 +37,6 @@ def startCrawling():
if driver != 'down':
try:
- # login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
@@ -101,8 +100,8 @@ def createFFDriver():
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
+ # ff_prof.set_preference("network.dns.disablePrefetch", True)
+ # ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 1)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -186,12 +185,12 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
- # # Hacking Tutorials
- # links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/guides-tutorials/hacking/')
+ # # Tutorials
+ # links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/guides-tutorials/')
# Malware
links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/malware/')
- # # Hacking Services
- # links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/hacking-services/')
+ # # Services
+ # links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/')
return links
@@ -238,8 +237,7 @@ def crawlForum(driver):
break
try:
- link = driver.find_element(by=By.XPATH, value=
- '/html/body/div[2]/div/div/div[1]/main/nav/ul/li[5]/a').get_attribute('href')
+ link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
diff --git a/MarketPlaces/TorMarket/parser.py b/MarketPlaces/TorMarket/parser.py
index 69d680c..e6e14b9 100644
--- a/MarketPlaces/TorMarket/parser.py
+++ b/MarketPlaces/TorMarket/parser.py
@@ -104,61 +104,58 @@ def tormarket_listing_parser(soup):
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
- products_list = soup.find_all('li')
- nm = 0
+
+ products_list = soup.find('ul', {"class": "products columns-3 tablet-columns-2 mobile-columns-1"}).find_all('li')
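+ # each <li> under the WooCommerce products <ul> is one product card on the listing page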
+ nm = len(products_list)
+
for product in products_list:
+ # Finding the name of the product
+ name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
+ name_of_product_cleaned = cleanString(name_of_product.strip())
+ # print(name_of_product_cleaned)
+ name.append(name_of_product_cleaned)
+ #finding the URL
try:
- # Finding the name of the product
- name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
- name_of_product_cleaned = cleanString(name_of_product.strip())
- print(name_of_product_cleaned)
- name.append(name_of_product_cleaned)
- #finding the URL
- try:
- url = product.find("div", {"class": "product-loop-content text-center"}).find("a").get("href")
- print(url)
- href.append(url)
- except AttributeError as e:
- print("I can't find the link")
- raise e
-
- #finding the rating of the product
- rating_score_of_product = product.find("div", {"class": "product-loop-content text-center"}).find("div").find("span").text
- rating_item.append(cleanString(rating_score_of_product.strip()))
- print("done")
- #finding the rating of the vendors
- rating_score_of_vendor = product.find("div", {"class": "wcfmmp-store-rating"}).find("strong").text
- rating_vendor.append(cleanString(rating_score_of_vendor.strip()))
- print("done")
- #finding the cost in USD
- cost = product.find("span", {"class": "woocommerce-Price-amount amount"}).text
- USD.append(cost)
- print("done")
- #finding the name of the vendor
- vendor_name = product.find("div", {"class": "wcfmmp_sold_by_wrapper"}).find("a").text
- vendor.append(cleanString(vendor_name.strip()))
- print("done")
- #everything else appends a -1
- success.append("-1")
- CVE.append("-1")
- MS.append("-1")
- category.append("-1")
- describe.append("-1")
- views.append("-1")
- reviews.append("-1")
- addDate.append("-1")
- BTC.append("-1")
- EURO.append("-1")
- sold.append("-1")
- qLeft.append("-1")
- shipFrom.append("-1")
- shipTo.append("-1")
- print("Done! moving onto the next product!")
- print(len(shipTo))
- nm += 1
+ url = product.find("div", {"class": "product-loop-content text-center"}).find("a").get("href")
+ # print(url)
+ href.append(url)
except AttributeError as e:
- print("I'm somewhere I don't belong. I'm going to leave")
- continue
+ print("I can't find the link")
+ raise e
+
+ #finding the rating of the product
+ rating_score_of_product = product.find("div", {"class": "product-loop-content text-center"}).find("div").find("span").text
+ rating_item.append(cleanString(rating_score_of_product.strip()))
+ # print("done")
+ #finding the rating of the vendors
+ rating_score_of_vendor = product.find("div", {"class": "wcfmmp-store-rating"}).find("strong").text
+ rating_vendor.append(cleanString(rating_score_of_vendor.strip()))
+ # print("done")
+ #finding the cost in USD
+ cost = product.find("span", {"class": "woocommerce-Price-amount amount"}).text
+ USD.append(cost)
+ # print("done")
+ #finding the name of the vendor
+ vendor_name = product.find("div", {"class": "wcfmmp_sold_by_wrapper"}).find("a").text
+ vendor.append(cleanString(vendor_name.strip()))
+ # print("done")
+ #everything else appends a -1
+ success.append("-1")
+ CVE.append("-1")
+ MS.append("-1")
+ category.append("-1")
+ describe.append("-1")
+ views.append("-1")
+ reviews.append("-1")
+ addDate.append("-1")
+ BTC.append("-1")
+ EURO.append("-1")
+ sold.append("-1")
+ qLeft.append("-1")
+ shipFrom.append("-1")
+ shipTo.append("-1")
+ # print("Done! moving onto the next product!")
+ # print(len(shipTo))
# Populate the final variable (this should be a list with all fields scraped)
diff --git a/MarketPlaces/ViceCity/crawler_selenium.py b/MarketPlaces/ViceCity/crawler_selenium.py
index cf7ea82..05250e9 100644
--- a/MarketPlaces/ViceCity/crawler_selenium.py
+++ b/MarketPlaces/ViceCity/crawler_selenium.py
@@ -271,7 +271,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1: