diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml
index 5a5ac36..8489f64 100644
--- a/.idea/DW_Pipeline_Test.iml
+++ b/.idea/DW_Pipeline_Test.iml
@@ -5,33 +5,4 @@
-
-
-
\ No newline at end of file
diff --git a/Forums/AbyssForum/parser.py b/Forums/AbyssForum/parser.py
deleted file mode 100644
index 635c494..0000000
--- a/Forums/AbyssForum/parser.py
+++ /dev/null
@@ -1,166 +0,0 @@
-__author__ = 'Helium'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from Forums.Utilities.utilities import *
-from datetime import date
-from datetime import timedelta
-import re
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup
-
-# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
-
-def abyssForums_description_parser(soup):
-
- # Fields to be parsed
-
- topic = "-1" # 0 topic name
- user = [] # 1 all users of each post
-    addDate = []  # 2 all dates of each post
- feedback = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format)
- status = [] # 4 all user's authority in each post such as (adm, member, dangerous)
-    reputation = []  # 5 all user's karma in each post (usually found as a number)
- sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post)
- post = [] # 7 all messages of each post
- interest = [] # 8 all user's interest in each post
- image_user = [] # 9 all user avatars of each post
- image_post = [] # 10 all first images of each post
-
- # Finding the topic (should be just one coming from the Listing Page)
-
- li = soup.find("div", {"class": "page-body"}).find("h2", {"class": "topic-title"})
- topic = li.text.replace(",","")
- topic = topic.replace("\n","")
- topic = cleanString(topic.strip())
-
- regex = re.compile('post has-profile.*')
- posts = soup.find_all('div', {"class": regex})
- # print(len(posts))
-
-    # For each message (post), get all the fields we are interested in:
-
- for ipost in posts:
-
- # Finding the author (user) of the post
- author = ipost.find('a', {"class": "username"}).text
- user.append(cleanString(author)) # Remember to clean the problematic characters
-
- status.append("-1")
- reputation.append("-1")
- interest.append("-1")
- sign.append("-1")
- feedback.append("-1")
- image_post.append("-1")
-
- img = ipost.find('dl', {"class": "postprofile"}).find('img')
- if img is not None:
- img = img.get('src').split('base64,')[-1]
- else:
- img = "-1"
- image_user.append(img)
-
-
- date_time_obj = ipost.find('time').attrs
- date = date_time_obj['datetime'][0:10]
- time = date_time_obj['datetime'][11:19]
- date_time_obj = datetime.strptime(date + " " + time, '%Y-%m-%d %H:%M:%S')
- addDate.append(date_time_obj)
-
- # Finding the post
-
- inner = ipost.find('div', {"class": "content"})
- inner = inner.text.strip()
- post.append(cleanString(inner))
-
- # Populate the final variable (this should be a list with all fields scraped)
-
- row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
-
- # Sending the results
-
- return row
-
-# This is the method to parse the Listing Pages (one page with many posts)
-def abyssForums_listing_parser(soup: BeautifulSoup):
-
-
- nm = 0 # this variable should receive the number of topics
- forum = "AbyssForum" # 0 *forum name
- board = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree.
- # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
- author = [] # 2 all authors of each topic
- topic = [] # 3 all topics
- views = [] # 4 number of views of each topic
- posts = [] # 5 number of posts of each topic
-    href = []  # 6 this variable should receive all cleaned urls (we will use this to do the merge between
-    # Listing and Description pages)
-    addDate = []  # 7 when the topic was created (difficult to find)
-    image_author = []  # 8 all author avatars used in each topic
-
-    # Finding the board
-
- board = soup.find("h2", {"class": "forum-title"}).text
- board = cleanString(board.strip())
-
- type_of_posts = soup.find_all("li", {"class": re.compile("row bg\d")} )
- for literature in type_of_posts:
- title_of_post = literature.find("a", {"class": "topictitle"}).text
- title_of_post = cleanString(title_of_post)
- topic.append(title_of_post)
- user = literature.find("div", {"class": "topic-poster responsive-hide left-box"}).find("a", {"class": "username"}).text
- author.append(user)
- num_post = literature.find("dd", {"class": "posts"}).text.replace("Replies","").strip()
- posts.append(num_post)
- num_view = literature.find("dd", {"class": "views"}).text.replace("Views","").strip()
- views.append(num_view)
- #if int(num_post) != 0: join the last user who posted with the author?
- # reply = literature.find("dd", {"class": "lastpost"}).find("a", {"class": "username"}).text
- # user.append(reply)
-
- date_time_obj = literature.find('time').attrs
- date = date_time_obj['datetime'][0:10]
- time = date_time_obj['datetime'][11:19]
- date_added = datetime.strptime(date + " " + time, '%Y-%m-%d %H:%M:%S')
-
- addDate.append(date_added)
-
- listing_href = literature.find("a", {"class": "topictitle"}).get("href")
- href.append(listing_href)
-
- image_author.append("-1")
-
- nm = len(topic)
-
- return organizeTopics(
- forum=forum,
- nm=nm,
- board=board,
- author=author,
- topic=topic,
- views=views,
- posts=posts,
- href=href,
- addDate=addDate,
- image_author=image_author
- )
-
-
-
-
-def abyssForum_links_parser(soup):
-
- # Returning all links that should be visited by the Crawler
-
- href = []
- #print(soup.find('table', {"class": "tborder clear"}).find(
- # 'tbody').find_all('tr', {"class": "inline_row"}))
- listing = soup.find_all('dl', {"class": "row-item topic_read"})
-
- for a in listing:
- link = a.find('div', {"class": "list-inner"}).find('a').get('href')
-
- href.append(link)
-
- return href
\ No newline at end of file
diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py
deleted file mode 100644
index 4dfa963..0000000
--- a/Forums/Altenens/crawler_selenium.py
+++ /dev/null
@@ -1,298 +0,0 @@
-__author__ = 'Helium'
-
-'''
-Altenens Forum Crawler (Selenium)
-'''
-
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
-from PIL import Image
-
-import urllib.parse as urlparse
-import os, re, time
-from datetime import date
-import configparser
-import subprocess
-from bs4 import BeautifulSoup
-from Forums.Initialization.prepare_parser import new_parse
-from Forums.Altenens.parser import altenens_links_parser
-from Forums.Utilities.utilities import cleanHTML
-
-counter = 1
-baseURL = 'https://altenens.is/'
-
-
-# Opens Tor Browser, crawls the website
-def startCrawling():
- forumName = getForumName()
- driver = getAccess()
-
- if driver != 'down':
- try:
- login(driver)
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closeDriver(driver)
-
- new_parse(forumName, baseURL, True)
-
-
-# Login using premade account credentials and do login captcha manually
-def login(driver):
- #click login button
- login_link = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]').get_attribute('href')
- driver.get(login_link) # open tab with url
-
- #entering username and password into input boxes
- usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[1]/dd/input')
- #Username here
- usernameBox.send_keys('mylittlepony45')#sends string to the username box
- passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[2]/dd/div/div/input')
- #Password here
- passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox
-
- input("Press ENTER when CAPTCHA is completed\n")
-
-    # wait for the listing page to show up (this XPath may need to change based on the seed url)
-    # wait up to 50 seconds until the element below is visible, then continue
- WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
- (By.XPATH, '/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]')))
-
-
-# Returns the name of the website
-def getForumName():
- name = 'Altenens'
- return name
-
-
-# Return the link of the website
-def getFixedURL():
- url = 'https://altenens.is/'
- return url
-
-
-# Closes Tor Browser
-def closeDriver(driver):
- # global pid
- # os.system("taskkill /pid " + str(pro.pid))
- # os.system("taskkill /t /f /im tor.exe")
- print('Closing Tor...')
- driver.close() #close tab
- time.sleep(3)
- return
-
-
-# Creates the FireFox 'driver' and configures its 'Profile'
-# to use the Tor proxy and socket
-def createFFDriver():
- from Forums.Initialization.forums_mining import config
-
- ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
- ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- # ff_prof.set_preference("network.dns.disablePrefetch", True)
- # ff_prof.set_preference("network.http.sendRefererHeader", 0)
- ff_prof.set_preference("permissions.default.image", 3)
- ff_prof.set_preference("browser.download.folderList", 2)
- ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
- ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
- ff_prof.set_preference('network.proxy.type', 1)
- ff_prof.set_preference("network.proxy.socks_version", 5)
- ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
- ff_prof.set_preference('network.proxy.socks_port', 9150)
- ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", True)
- ff_prof.update_preferences()
-
- service = Service(config.get('TOR', 'geckodriver_path'))
-
- driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
- driver.maximize_window()
-
- return driver
-
-
-def getAccess():
- url = getFixedURL()
- driver = createFFDriver()
- try:
- driver.get(url)# open url in browser
- return driver
- except:
- driver.close()# close tab
- return 'down'
-
-
-# Saves the crawled html page
-def savePage(driver, html, url):
- cleanPage = cleanHTML(driver, html)
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- open(filePath, 'wb').write(cleanPage.encode('utf-8'))
- return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-def getFullPathName(url):
- from Forums.Initialization.forums_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
- else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
- return fullPath
-
-
-# Creates the file name from passed URL
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if (name == ''):
- name = str(counter)
- counter = counter + 1
- return name
-
-
-def getInterestedLinks():
- links = []
-
- # Hacking
- links.append('https://altenens.is/forums/hacking.469162/')
- # Hacking showoff
- links.append('https://altenens.is/forums/hacking-showoff.469232/')
- # Remote administration
- links.append('https://altenens.is/forums/remote-administration.469161/')
- # Cracking tools
- links.append('https://altenens.is/forums/cracking-tools.469204/')
- # Cracking tutorials
- links.append('https://altenens.is/forums/cracking-tutorials-other-methods.469205/')
- # Combo lists and configs
- links.append('https://altenens.is/forums/combolists-and-configs.469206/')
- # Programming
- links.append('https://altenens.is/forums/programming.469239/')
-
- return links
-
-
-# newest version of crawling
-def crawlForum(driver):
- print("Crawling the Altenens forum")
-
- linksToCrawl = getInterestedLinks()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
- try:
- has_next_page = True
- count = 0
-
- while has_next_page:
- try:
- driver.get(link)
- except:
- driver.refresh()
- html = driver.page_source
- savePage(driver, html, link)
-
- topics = topicPages(html)
- for topic in topics:
- has_next_topic_page = True
- counter = 1
- page = topic
-
- while has_next_topic_page:
- itemURL = urlparse.urljoin(baseURL, str(page))
- try:
- driver.get(itemURL)
- except:
- driver.refresh()
-
- if isListingLink(driver.current_url):
- break
-
- savePage(driver, driver.page_source, topic + f"page{counter}") # very important
-
- # # comment out
- # if counter == 2:
- # break
-
- try:
- page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
- if page == "":
- raise NoSuchElementException
- counter += 1
-
- except NoSuchElementException:
- has_next_topic_page = False
-
- try:
- driver.get(link)
- except:
- driver.refresh()
-
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
-
- try:
- link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
- if link == "":
- raise NoSuchElementException
- count += 1
-
- except NoSuchElementException:
- has_next_page = False
-
- except Exception as e:
- print(link, e)
- i += 1
-
- print("Crawling the Altenens forum done.")
-
-
-# Returns 'True' if the link is a Topic link, may need to change for every website
-def isDescriptionLink(url):
- if 'threads' in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link, may need to change for every website
-def isListingLink(url):
- if '.is/forums' in url:
- return True
- return False
-
-
-# calling the parser to define the links
-def topicPages(html):
- soup = BeautifulSoup(html, "html.parser")
- #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
- return altenens_links_parser(soup)
-
-
-def crawler():
- startCrawling()
- # print("Crawling and Parsing BestCardingWorld .... DONE!")
diff --git a/Forums/Altenens/parser.py b/Forums/Altenens/parser.py
deleted file mode 100644
index e056cb2..0000000
--- a/Forums/Altenens/parser.py
+++ /dev/null
@@ -1,165 +0,0 @@
-__author__ = 'DarkWeb'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from Forums.Utilities.utilities import *
-from datetime import date
-from datetime import timedelta
-import re
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup
-
-
-# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
-def altenens_description_parser(soup):
-
- topic = "-1" # 0 *topic name
- user = [] # 1 *all users of each post
- status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
- reputation = [] # 3 all user's karma in each post (usually found as a number)
- interest = [] # 4 all user's interest in each post
- sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
- post = [] # 6 all messages of each post
- feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
- addDate = [] # 8 all dates of each post
- image_user = [] # 9 all user avatars of each post
- image_post = [] # 10 all first images of each post
-
- etopic = soup.find("h1", {"class": "p-title-value"})
- if etopic is not None:
- topic = etopic.text
- topic = cleanString(topic.strip())
-
- body = soup.find('div', {"class": "block-container lbContainer"})
- iposts = body.find_all('article', {"class": "message message--post js-post js-inlineModContainer"})
-
- for ipost in iposts:
-
- author = ipost.find('h4', {"class": "message-name"}).text
- user.append(cleanString(author.strip()))
-
- stat = ipost.find('h5', {"class": "userTitle message-userTitle"}).text
- status.append(cleanString(stat.strip()))
-
- bar = ipost.find('div', {"class": "xtr-progress-bar"})
- if bar is not None:
- rep = bar.find('p').get('data-value')
- else:
- rep = "-1"
- reputation.append(cleanString(rep))
-
- interest.append("-1")
-
- signature = ipost.find('aside', {"class": "message-signature"})
- if signature is not None:
- signature = signature.text.strip()
- else:
- signature = "-1"
- sign.append(cleanString(signature))
-
- inner = ipost.find('div', {"class": "bbWrapper"}).find(text=True, recursive=False)
- if inner is not None:
- inner = inner.strip()
- else:
- inner = "" # cannot use -1 because the post is hidden unless you reply
- post.append(cleanString(inner))
-
- feedback.append("-1")
-
- dt = ipost.find('time', {"class": "u-dt"}).get('datetime')
- date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
- addDate.append(date_time_obj)
-
- img = ipost.find('div', {"class": "message-avatar-wrapper"}).find('img')
- if img is not None:
- img = img.get('src').split('base64,')[-1]
- else:
- img = "-1"
- image_user.append(img)
-
- image_post.append("-1")
-
- # Populate the final variable (this should be a list with all fields scraped)
-
- row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
-
- # Sending the results
-
- return row
-
-
-# This is the method to parse the Listing Pages (one page with many posts)
-def altenens_listing_parser(soup):
-
- nm = 0 # *this variable should receive the number of topics
- forum = "Altenens" # 0 *forum name
- board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
- # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
- author = [] # 2 *all authors of each topic
- topic = [] # 3 *all topics
- views = [] # 4 number of views of each topic
- posts = [] # 5 number of posts of each topic
-    href = []  # 6 this variable should receive all cleaned urls (we will use this to do the merge between
- # Listing and Description pages)
- addDate = [] # 7 when the topic was created (difficult to find)
- image_author = [] # 8 all author avatars used in each topic
-
- board = soup.find('h1', {"class": "p-title-value"}).text
- board = cleanString(board.strip())
-
- regex = re.compile('structItem structItem--thread.*')
- itopics = soup.find_all('div', {"class": regex})
-
- nm = len(itopics)
-
- for itopic in itopics:
-
- topics = itopic.find('div', {"class": "structItem-title"}).text
- topic.append(cleanString(topics.strip()))
-
- author_icon = itopic.find('a', {"class": "avatar avatar--s"})
- if author_icon != None:
- author_icon = author_icon.find('img')
- author_icon = author_icon.get('src')
- author_icon = author_icon.split('base64,')[-1]
- else:
- author_icon = "-1"
- image_author.append(author_icon)
-
- link = itopic.find('div', {"class": "structItem-title"}).find('a').get('href')
- href.append(link)
-
- user = itopic.find('ul', {"class": "structItem-parts"}).find('a').text
- author.append(cleanString(user.strip()))
-
- dt = itopic.find('time', {"class": "u-dt"}).get('datetime')
- date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
- addDate.append(date_time_obj)
-
- nposts = itopic.find('dl', {"class": "pairs pairs--justified"}).text
- nposts = nposts.replace('Replies', '')
- nposts = nposts.replace('K', '000')
- posts.append(cleanString(nposts))
-
- nviews = itopic.find('dl', {"class": "pairs pairs--justified structItem-minor"}).text
- nviews = nviews.replace('Views', '')
- nviews = nviews.replace('K', '000')
- views.append(cleanString(nviews))
-
- return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)
-
-
-def altenens_links_parser(soup):
-
- # Returning all links that should be visited by the Crawler
-
- href = []
-
- listing = soup.find_all('div', {"class": "structItem-cell structItem-cell--main"})
-
- for a in listing:
- link = a.find('a', {"class": ""}).get('href')
-
- href.append(link)
-
- return href
\ No newline at end of file
diff --git a/Forums/Cardingleaks/crawler_selenium.py b/Forums/Cardingleaks/crawler_selenium.py
deleted file mode 100644
index 0712956..0000000
--- a/Forums/Cardingleaks/crawler_selenium.py
+++ /dev/null
@@ -1,303 +0,0 @@
-__author__ = 'DarkWeb'
-
-'''
-Cardingleaks Forum Crawler (Selenium)
-Crawler updated and fixed
-
-The site sometimes requires you to look at a new post every day, so make sure
-you log in before crawling.
-'''
-
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
-
-from PIL import Image
-import urllib.parse as urlparse
-import os, re, time
-import subprocess
-from bs4 import BeautifulSoup
-from Forums.Initialization.prepare_parser import new_parse
-from Forums.Cardingleaks.parser import cardingleaks_links_parser
-from Forums.Utilities.utilities import cleanHTML
-
-counter = 1
-baseURL = 'https://leaks.ws/'
-
-
-# Opens Tor Browser, crawls the website
-def startCrawling():
- forumName = getForumName()
- driver = getAccess()
-
- if driver != 'down':
- try:
- login(driver)
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closeDriver(driver)
-
- new_parse(forumName, baseURL, True)
-
-
-# Login using premade account credentials and do login captcha manually
-def login(driver):
- #click login button
- login_link = driver.find_element(
- by=By.XPATH, value='/html/body/div[2]/div[1]/nav/div/div[3]/div[1]/a[1]').\
- get_attribute('href')
- driver.get(login_link)# open tab with url
-
- #entering username and password into input boxes
- usernameBox = driver.find_element(by=By.NAME, value='login')
- #Username here
- usernameBox.send_keys('somanyfrogs')#sends string to the username box
- passwordBox = driver.find_element(by=By.NAME, value='password')
- #Password here
- passwordBox.send_keys('therearewaytoomanyherehowwhy')# sends string to passwordBox
-
- login = driver.find_element(by=By.CLASS_NAME, value='block-container')
- login_link = login.find_element(by=By.TAG_NAME, value='button')
- login_link.click()
-
- # input('input')
-
-    # wait for the listing page to show up (this XPath may need to change based on the seed url)
-    # wait up to 50 seconds until the element below is visible, then continue
- WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
- (By.CLASS_NAME, 'p-body-pageContent')))
-
-
-# Returns the name of the website
-def getForumName() -> str:
- name = 'Cardingleaks'
- return name
-
-
-# Return the link of the website
-def getFixedURL():
- url = 'https://leaks.ws/'
- return url
-
-
-# Closes Tor Browser
-def closeDriver(driver):
- # global pid
- # os.system("taskkill /pid " + str(pro.pid))
- # os.system("taskkill /t /f /im tor.exe")
- print('Closing Tor...')
- driver.close() #close tab
- time.sleep(3)
- return
-
-
-# Creates the FireFox 'driver' and configures its 'Profile'
-# to use the Tor proxy and socket
-def createFFDriver():
- from Forums.Initialization.forums_mining import config
-
- ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
- ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
- ff_prof.set_preference("permissions.default.image", 3)
- ff_prof.set_preference("browser.download.folderList", 2)
- ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
- ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
- ff_prof.set_preference('network.proxy.type', 1)
- ff_prof.set_preference("network.proxy.socks_version", 5)
- ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
- ff_prof.set_preference('network.proxy.socks_port', 9150)
- ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", True)
- ff_prof.update_preferences()
-
- service = Service(config.get('TOR', 'geckodriver_path'))
-
- driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
- driver.maximize_window()
-
- return driver
-
-
-def getAccess():
- url = getFixedURL()
- driver = createFFDriver()
- try:
- driver.get(url)
- return driver
- except:
- driver.close()
- return 'down'
-
-
-# Saves the crawled html page
-def savePage(driver, page, url):
- cleanPage = cleanHTML(driver, page)
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- open(filePath, 'wb').write(cleanPage.encode('utf-8'))
- return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-def getFullPathName(url):
- from Forums.Initialization.forums_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
- else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
- return fullPath
-
-
-# Creates the file name from passed URL
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if name == '':
- name = str(counter)
- counter = counter + 1
- return name
-
-
-def getInterestedLinks():
- links = []
-
- # carding methods
- links.append('https://leaks.ws/forums/carding-methods.82/')
- # # carding schools
- # links.append('https://leaks.ws/forums/help-desk-carding-school.35/')
- # # carding discussion
- # links.append('https://leaks.ws/forums/carding-discussion-desk.58/')
- # # carding tutorials
- # links.append('https://leaks.ws/forums/carding-tutorials.13/')
- # # carding tools and software
- # links.append('https://leaks.ws/forums/carding-tools-softwares.10/')
- # # exploits and cracking tools
- # links.append('https://leaks.ws/forums/exploits-cracking-tools.22/')
-
- return links
-
-
-def crawlForum(driver):
- print("Crawling the Cardingleaks forum")
-
- linksToCrawl = getInterestedLinks()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
- try:
- has_next_page = True
- count = 0
-
- while has_next_page:
- try:
- driver.get(link)
- except:
- driver.refresh()
- html = driver.page_source
- savePage(driver, html, link)
-
- topics = topicPages(html)
- for topic in topics:
- has_next_topic_page = True
- counter = 1
- page = topic
-
- while has_next_topic_page:
- itemURL = urlparse.urljoin(baseURL, str(page))
- try:
- driver.get(itemURL)
- except:
- driver.refresh()
-
- if isListingLink(driver.current_url):
- break
-
- savePage(driver, driver.page_source, topic + f"page{counter}") # very important
-
- # # comment out
- # if counter == 2:
- # break
-
- try:
- page = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
- if page == "":
- raise NoSuchElementException
- counter += 1
-
- except NoSuchElementException:
- has_next_topic_page = False
-
- # making sure we go back to the listing page (browser back button simulation)
- try:
- driver.get(link)
- except:
- driver.refresh()
-
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
-
- try:
- link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
- if link == "":
- raise NoSuchElementException
- count += 1
-
- except NoSuchElementException:
- has_next_page = False
-
- except Exception as e:
- print(link, e)
- i += 1
-
- print("Crawling the Cardingleaks forum done.")
-
-
-# Returns 'True' if the link is a Topic link, may need to change for every website
-def isDescriptionLink(url):
- if 'threads' in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link, may need to change for every website
-def isListingLink(url):
- if '.ws/forums' in url:
- return True
- return False
-
-
-# calling the parser to define the links
-def topicPages(html):
- soup = BeautifulSoup(html, "html.parser")
- return cardingleaks_links_parser(soup)
-
-
-def crawler():
- startCrawling()
- # print("Crawling and Parsing BestCardingWorld .... DONE!")
diff --git a/Forums/Cardingleaks/parser.py b/Forums/Cardingleaks/parser.py
deleted file mode 100644
index a2da87b..0000000
--- a/Forums/Cardingleaks/parser.py
+++ /dev/null
@@ -1,167 +0,0 @@
-__author__ = 'DarkWeb'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from Forums.Utilities.utilities import *
-from datetime import date
-from datetime import timedelta
-import re
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup, ResultSet, Tag
-
-# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
-
-
-def cardingleaks_description_parser(soup: Tag):
-
- # Fields to be parsed
-
- topic = "-1" # 0 *topic name
- user = [] # 1 *all users of each post
- status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
- reputation = [] # 3 all user's karma in each post (usually found as a number)
- interest = [] # 4 all user's interest in each post
- sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
- post = [] # 6 all messages of each post
- feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
- addDate = [] # 8 all dates of each post
- image_user = [] # 9 all user avatars of each post
- image_post = [] # 10 all first images of each post
-
- li = soup.find("h1", {"class": "p-title-value"})
- topic = cleanString(li.text.strip())
-
- post_list: ResultSet[Tag] = soup.find("div", {"class": "block-body js-replyNewMessageContainer"}).find_all("article", {"data-author": True})
-
- for ipost in post_list:
- username = ipost.get('data-author')
- user.append(username)
-
- user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text
- status.append(cleanString(user_status.strip()))
-
- user_statistics: ResultSet[Tag] = ipost.find("div", {"class": "message-userExtras"}).find_all("dl", {"class": "pairs pairs--justified"})
-
- user_reputation = "-1"
-
- for stat in user_statistics:
- data_type = stat.find("span").get("data-original-title")
- if data_type == "Points":
- user_reputation = stat.find("dd").text
- break
-
- reputation.append(cleanString(user_reputation.strip()))
-
- interest.append("-1")
-
- sign.append("-1")
-
- user_post = ipost.find("div", {"class": "message-content js-messageContent"}).text
- post.append(cleanString(user_post.strip()))
-
- feedback.append("-1")
-
- datetime_text = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime")
- datetime_obj = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S%z")
- addDate.append(datetime_obj)
-
- img = ipost.find('div', {"class": "message-content js-messageContent"}).find('img')
- if img is not None:
- img = img.get('src').split('base64,')[-1]
- else:
- img = "-1"
- image_post.append(img)
-
- img = ipost.find('div', {"class": "message-avatar"}).find('img')
- if img is not None:
- img = img.get('src').split('base64,')[-1]
- else:
- img = "-1"
- image_user.append(img)
-
- # Populate the final variable (this should be a list with all fields scraped)
-
- row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
-
- # Sending the results
-
- return row
-
-# This is the method to parse the Listing Pages (one page with many posts)
-
-def cardingleaks_listing_parser(soup: Tag):
-
- nm = 0 # *this variable should receive the number of topics
- forum = "Cardingleaks" # 0 *forum name
- board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
- # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
- author = [] # 2 *all authors of each topic
- topic = [] # 3 *all topics
- views = [] # 4 number of views of each topic
- posts = [] # 5 number of posts of each topic
-    href = []  # 6 this variable should receive all cleaned urls (we will use this to do the merge between
- # Listing and Description pages)
- addDate = [] # 7 when the topic was created (difficult to find)
- image_user = [] # 8 all user avatars used in each topic
-
- # Finding the board (should be just one)
-
- li = soup.find("h1", {"class": "p-title-value"})
- board = cleanString(li.text.strip())
-
- thread_list = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True})
-
- sticky = soup.find('div', {"class": "structItemContainer-group structItemContainer-group--sticky"})
- if sticky is not None:
- thread_list = sticky.find_all("div", {"data-author": True}) + thread_list
-
- nm = len(thread_list)
-
- for thread in thread_list:
- thread_author = thread.get("data-author")
- author.append(thread_author)
-
- thread_topic = thread.find("div", {"class": "structItem-title"}).text
- topic.append(cleanString(thread_topic.strip()))
-
- author_icon = thread.find("a", {"class": "avatar avatar--s"})
- if author_icon is not None:
- author_icon = author_icon.find('img')
- if author_icon is not None:
- author_icon = author_icon.get('src').split('base64,')[-1]
- image_user.append(author_icon)
- else:
- image_user.append('-1')
- else:
- image_user.append('-1')
-
- thread_view = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find("dd").text
-        # Convert the text view count (e.g., 8.8K) to a numerical value (e.g., 8800)
- if thread_view.find("K") > 0:
- thread_view = str(int(float(thread_view.replace("K", "")) * 1000))
- views.append(thread_view)
-
- thread_posts = thread.find("dl", {"class": "pairs pairs--justified"}).find("dd").text
- posts.append(cleanString(thread_posts.strip()))
-
- thread_href = thread.find("div", {"class": "structItem-title"}).find("a").get("href")
- href.append(thread_href)
-
- thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime")
- datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z")
- addDate.append(datetime_obj)
-
- return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_user)
-
-
-def cardingleaks_links_parser(soup):
- # Returning all links that should be visited by the Crawler
- href = []
- listing = soup.find_all('div', {"class": "structItem-title"})
-
- for a in listing:
- link = a.find('a').get('href')
-
- href.append(link)
-
-    return href
diff --git a/Forums/CryptBB/crawler_mechanize.py b/Forums/CryptBB/crawler_mechanize.py
deleted file mode 100644
index 7a763c6..0000000
--- a/Forums/CryptBB/crawler_mechanize.py
+++ /dev/null
@@ -1,257 +0,0 @@
-__author__ = '91Shadows'
-
-'''
-CryptBB Crawler (Mechanize)
-'''
-
-import codecs, os, re
-import socks, socket, time
-from datetime import date
-
-import urllib.parse as urlparse
-import http.client as httplib
-import mechanize
-import subprocess
-from bs4 import BeautifulSoup
-from Forums.Initialization.prepare_parser import new_parse
-from Forums.BestCardingWorld.parser import bestcardingworld_links_parser
-
-counter = 1
-httplib.HTTPConnection._http_vsn = 10
-httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
-baseURL = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'
-socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)
-
-
-# Opens Tor Browser, crawls the website
-def startCrawling():
- opentor()
- getUrl()
- forumName = getForumName()
- br = getAccess()
-
- if br != 'down':
- crawlForum(br)
- new_parse(forumName, False)
-
- # new_parse(forumName, False)
-
- closetor()
-
-
-# Opens Tor Browser
-def opentor():
- global pid
- print("Connecting Tor...")
- path = open('../../path.txt').readline()
- pro = subprocess.Popen(path)
- pid = pro.pid
- time.sleep(7.5)
- input("Tor Connected. Press ENTER to continue\n")
- return
-
-
-# Creates a connection through Tor Port
-def getUrl(timeout=None):
- socket.socket = socks.socksocket
- socket.create_connection = create_connection
- return
-
-
-# Makes the onion address request
-def create_connection(address, timeout=None, source_address=None):
- sock = socks.socksocket()
- sock.connect(address)
- return sock
-
-
-# Returns the name of website
-def getForumName():
- name = 'CryptBB'
- return name
-
-
-# Return the link of website
-def getFixedURL():
- url = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'
-
- return url
-
-
-# Closes Tor Browser
-def closetor():
- global pid
- os.system("taskkill /pid " + str(pid))
- print('Closing Tor...')
- time.sleep(3)
- return
-
-
-# Creates a Mechanize browser and initializes its options
-def createBrowser():
- br = mechanize.Browser()
- cj = mechanize.CookieJar()
- br.set_cookiejar(cj)
-
- # Browser options
- br.set_handle_equiv(True)
- br.set_handle_redirect(True)
- br.set_handle_referer(True)
- br.set_handle_robots(False)
- br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
- br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
- ('Accept', '*/*')]
-
- return br
-
-
-def getAccess():
- url = getFixedURL()
- br = createBrowser()
-
- try:
-
- br.open(url)
- return br
-
- except:
-
- return 'down'
-
-
-# Saves the crawled html page
-def savePage(page, url):
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- a = page.read()
- open(filePath, "wb").write(a)
- return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-def getFullPathName(url):
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/ThiefWorld/HTML_Pages/' + str(
- "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
- "%04d" % date.today().year) + '/' + 'Description/' + fileName + '.html'
- else:
- fullPath = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/ThiefWorld/HTML_Pages/' + str(
- "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
- "%04d" % date.today().year) + '/' + 'Listing/' + fileName + '.html'
- return fullPath
-
-
-# Creates the name of the file based on URL
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if (name == ''):
- name = str(counter)
- counter = counter + 1
- return name
-
-
-# Hacking and Markets related topics
-def getInterestedLinks():
- links = []
-
- links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=43&sid=e12864ffccc5df877b03b573534955be')
-
- return links
-
-
-# Start crawling Forum pages
-def crawlForum(br):
- print("Crawling CryptBB forum")
-
- linksToCrawl = getInterestedLinks()
- visited = set(linksToCrawl)
- initialTime = time.time()
-
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
- try:
- page = br.open(link)
- savePage(page, link)
-
- res = br.response().read()
- soup = BeautifulSoup(res, 'html.parser')
-
- next_link = soup.find("a", {"rel": "next"})
- if next_link != None:
- full_url = urlparse.urljoin(linksToCrawl[i], next_link['href'])
- linksToCrawl.insert(i + 1, full_url)
-
- listOfTopics = findDescriptionPages(link)
- for topic in listOfTopics:
- itemPage = br.open(str(topic))
- savePage(itemPage, topic)
-
- except Exception as e:
- print('Error getting link: ', link, e)
- i += 1
-
- # finalTime = time.time()
- # print finalTime - initialTime
-
-    input("CryptBB forum done successfully. Press ENTER to continue\n")
-
- return
-
-
-# Returns True if the link is a 'Topic' link, may need to change for diff websites
-def isDescriptionLink(url):
- if 'topic' in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link, may need to change for diff websites
-def isListingLink(url):
- '''
- reg = 'board=[0-9]+.[0-9]+\Z'
- if len(re.findall(reg, url)) == 0:
- return False
- return True
- '''
- if 'forum' in url:
- return True
- return False
-
-
-# calling the parser to define the links
-def findDescriptionPages(url):
- soup = ""
-
- error = False
- try:
- html = codecs.open(
- "C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\BestCardingWorld\\HTML_Pages\\" + str(
- "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
- "%04d" % date.today().year) + "\\Listing\\" + getNameFromURL(url) + ".html", encoding='utf8')
- soup = BeautifulSoup(html, "html.parser")
- except:
- try:
- html = open(
- "C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\BestCardingWorld\\HTML_Pages\\" + str(
- "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
- "%04d" % date.today().year) + "\\Listing\\" + getNameFromURL(url) + ".html")
- soup = BeautifulSoup(html, "html.parser")
- except:
- error = True
-            print("There was a problem reading the file " + getNameFromURL(url) + " in the listing section.")
-
- if not error:
- return bestcardingworld_links_parser(soup)
-
- else:
- return []
-
-
-def crawler():
- startCrawling()
- print("Crawling and Parsing CryptBB .... DONE!")
diff --git a/Forums/CryptBB/crawler_selenium.py b/Forums/CryptBB/crawler_selenium.py
deleted file mode 100644
index e48b193..0000000
--- a/Forums/CryptBB/crawler_selenium.py
+++ /dev/null
@@ -1,331 +0,0 @@
-__author__ = 'DarkWeb'
-
-'''
-CryptBB Forum Crawler (Selenium)
-'''
-
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
-
-from PIL import Image
-import urllib.parse as urlparse
-import os, re, time
-import subprocess
-from bs4 import BeautifulSoup
-from Forums.Initialization.prepare_parser import new_parse
-from Forums.CryptBB.parser import cryptBB_links_parser
-from Forums.Utilities.utilities import cleanHTML
-
-counter = 1
-baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/'
-
-
-# Opens Tor Browser, crawls the website
-def startCrawling():
- forumName = getForumName()
- driver = getAccess()
-
- if driver != 'down':
- try:
- login(driver)
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closeDriver(driver)
-
- new_parse(forumName, baseURL, True)
-
-
-# Login using premade account credentials and do login captcha manually
-def login(driver):
- #click login button
- login_link = driver.find_element(
- by=By.XPATH, value='/html/body/div/div[2]/div/table/tbody/tr[2]/td/center/pre/strong/a[1]').\
- get_attribute('href')
- driver.get(login_link)# open tab with url
-
- #entering username and password into input boxes
- usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input')
- #Username here
- usernameBox.send_keys('holyre')#sends string to the username box
- passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input')
- #Password here
- passwordBox.send_keys('PlatinumBorn2')# sends string to passwordBox
-
- '''
- # wait for captcha page show up
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/div/div[2]/div/form/div/input")))
-
- # save captcha to local
- driver.find_element(by=By.XPATH, value='//*[@id="captcha_img"]').screenshot(r'..\CryptBB\captcha.png')
-
- # This method will show image in any image viewer
- im = Image.open(r'..\CryptBB\captcha.png')
-
- im.show()
-
- # wait until input space show up
- inputBox = driver.find_element(by=By.XPATH, value='//*[@id="imagestring"]')
-
-
- # ask user input captcha solution in terminal
- userIn = input("Enter solution: ")
-
- # send user solution into the input space
- inputBox.send_keys(userIn)
-
- # click the verify(submit) button
- driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div/form/div/input").click()
- '''
- input("Press ENTER when CAPTCHA is completed\n")
-
-    # wait for the listing page to show up (this XPath may need to change based on the seed url)
-    # wait up to 50 seconds until id = tab_content is found, then continue
- WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
- (By.XPATH, '//*[@id="tab_content"]')))
-
-
-# Returns the name of the website
-def getForumName() -> str:
- name = 'CryptBB'
- return name
-
-
-# Return the link of the website
-def getFixedURL():
- url = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/'
- return url
-
-
-# Closes Tor Browser
-def closeDriver(driver):
- # global pid
- # os.system("taskkill /pid " + str(pro.pid))
- # os.system("taskkill /t /f /im tor.exe")
- print('Closing Tor...')
- driver.close() #close tab
- time.sleep(3)
- return
-
-
-# Creates the FireFox 'driver' and configures its 'Profile'
-# to use the Tor proxy and socket
-def createFFDriver():
- from Forums.Initialization.forums_mining import config
-
- ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
- ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
- ff_prof.set_preference("permissions.default.image", 3)
- ff_prof.set_preference("browser.download.folderList", 2)
- ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
- ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
- ff_prof.set_preference('network.proxy.type', 1)
- ff_prof.set_preference("network.proxy.socks_version", 5)
- ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
- ff_prof.set_preference('network.proxy.socks_port', 9150)
- ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", True)
- ff_prof.update_preferences()
-
- service = Service(config.get('TOR', 'geckodriver_path'))
-
- driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
- driver.maximize_window()
-
- return driver
-
-
-def getAccess():
- url = getFixedURL()
- driver = createFFDriver()
- try:
- driver.get(url)
- return driver
- except:
- driver.close()
- return 'down'
-
-
-# Saves the crawled html page
-def savePage(driver, page, url):
- cleanPage = cleanHTML(driver, page)
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- open(filePath, 'wb').write(cleanPage.encode('utf-8'))
- return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-def getFullPathName(url):
- from Forums.Initialization.forums_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
- else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
- return fullPath
-
-
-# Creates the file name from passed URL
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if name == '':
- name = str(counter)
- counter = counter + 1
- return name
-
-
-def getInterestedLinks():
- links = []
-
- # Beginner Programming
- links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86')
- # Beginner Carding and Fraud
- links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=91')
- # Beginner Hacking
- links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=87')
- # Newbie
- links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=84')
- # Beginner Hardware
- links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=89')
- # Training Challenges
- links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=96')
- # Darknet Discussions
- links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=88')
- # Public Leaks and Warez
- links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=97')
- # Sell
- links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=44')
-
- return links
-
-
-def crawlForum(driver):
- print("Crawling the CryptBB forum")
-
- linksToCrawl = getInterestedLinks()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
- try:
- has_next_page = True
- count = 0
-
- while has_next_page:
- try:
- driver.get(link)
- except:
- driver.refresh()
- html = driver.page_source
- savePage(driver, html, link)
-
- topics = topicPages(html)
- for topic in topics:
- has_next_topic_page = True
- counter = 1
- page = topic
-
- while has_next_topic_page:
- itemURL = urlparse.urljoin(baseURL, str(page))
- try:
- driver.get(itemURL)
- except:
- driver.refresh()
-
- if isListingLink(driver.current_url):
- break
-
- savePage(driver, driver.page_source, topic + f"page{counter}") # very important
-
- # # comment out
- # if counter == 2:
- # break
-
- try:
- temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div')
- page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
-
- if page == "":
- raise NoSuchElementException
- counter += 1
-
- except NoSuchElementException:
- has_next_topic_page = False
-
- # making sure we go back to the listing page (browser back button simulation)
- try:
- driver.get(link)
- except:
- driver.refresh()
-
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
-
- try:
- temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div')
- link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
-
- if link == "":
- raise NoSuchElementException
- count += 1
-
- except NoSuchElementException:
- has_next_page = False
-
- except Exception as e:
- print(link, e)
- i += 1
-
-    print("Crawling the CryptBB forum done.")
-
-
-# Returns 'True' if the link is a Topic link, may need to change for every website
-def isDescriptionLink(url):
- if 'thread' in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link, may need to change for every website
-def isListingLink(url):
- if '.onion/forumdisplay' in url:
- return True
- return False
-
-
-# calling the parser to define the links
-def topicPages(html):
- soup = BeautifulSoup(html, "html.parser")
- return cryptBB_links_parser(soup)
-
-
-def crawler():
- startCrawling()
- # print("Crawling and Parsing BestCardingWorld .... DONE!")
diff --git a/Forums/CryptBB/parser.py b/Forums/CryptBB/parser.py
deleted file mode 100644
index 1ac7bc6..0000000
--- a/Forums/CryptBB/parser.py
+++ /dev/null
@@ -1,282 +0,0 @@
-__author__ = 'DarkWeb'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from Forums.Utilities.utilities import *
-from datetime import date
-from datetime import timedelta
-import re
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup
-
-# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
-
-
-def cryptBB_description_parser(soup):
-
- # Fields to be parsed
-
- topic = "-1" # 0 *topic name
- user = [] # 1 *all users of each post
- status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
- reputation = [] # 3 all user's karma in each post (usually found as a number)
- interest = [] # 4 all user's interest in each post
- sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
- post = [] # 6 all messages of each post
- feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
- addDate = [] # 8 all dates of each post
- image_user = [] # 9 all user avatars of each post
- image_post = [] # 10 all first images of each post
-
- # Finding the topic (should be just one coming from the Listing Page)
-
- li = soup.find("td", {"class": "thead"}).find('strong')
- topic = li.text
- topic = re.sub("\[\w*\]", '', topic)
-
- topic = topic.replace(",","")
- topic = topic.replace("\n","")
- topic = cleanString(topic.strip())
-
- # Finding the repeated tag that corresponds to the listing of posts
-
- posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
- 'div', {"class": "post"})
-
-    # For each message (post), get all the fields we are interested in:
-
- for ipost in posts:
-
- if ipost.find('div', {"class": "deleted_post_author"}):
- continue
-
- # Finding a first level of the HTML page
-
- post_wrapper = ipost.find('span', {"class": "largetext"})
-
- # Finding the author (user) of the post
-
- author = post_wrapper.text.strip()
- user.append(cleanString(author)) # Remember to clean the problematic characters
-
- # Finding the status of the author
-
- smalltext = ipost.find('div', {"class": "post_author"})
-
- if smalltext is not None:
-
- # CryptBB does have membergroup and postgroup
- membergroup = smalltext.find('div', {"class": "profile-rank"})
- postgroup = smalltext.find('div', {"class": "postgroup"})
- if membergroup != None:
- membergroup = membergroup.text.strip()
- if postgroup != None:
- postgroup = postgroup.text.strip()
- membergroup = membergroup + " - " + postgroup
- else:
- if postgroup != None:
- membergroup = postgroup.text.strip()
- else:
- membergroup = "-1"
- status.append(cleanString(membergroup))
-
- # Finding the interest of the author
- # CryptBB does not have blurb
- blurb = smalltext.find('li', {"class": "blurb"})
- if blurb != None:
- blurb = blurb.text.strip()
- else:
- blurb = "-1"
- interest.append(cleanString(blurb))
-
- # Finding the reputation of the user
- # CryptBB does have reputation
- author_stats = smalltext.find('div', {"class": "author_statistics"})
- karma = author_stats.find('strong')
- if karma != None:
- karma = karma.text
- karma = karma.replace("Community Rating: ", "")
- karma = karma.replace("Karma: ", "")
- karma = karma.strip()
- else:
- karma = "-1"
- reputation.append(cleanString(karma))
-
- else:
- status.append('-1')
- interest.append('-1')
- reputation.append('-1')
-
- # Getting here another good tag to find the post date, post content and users' signature
-
- postarea = ipost.find('div', {"class": "post_content"})
-
- dt = postarea.find('span', {"class": "post_date"}).text
- # dt = dt.strip().split()
- dt = dt.strip()
- day=date.today()
- if "Today" in dt:
- today = day.strftime('%m-%d-%Y')
- stime = dt.replace('Today,','').strip()
- date_time_obj = today + ', '+stime
- date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
- elif "Yesterday" in dt:
- yesterday = day - timedelta(days=1)
- yesterday = yesterday.strftime('%m-%d-%Y')
- stime = dt.replace('Yesterday,','').strip()
- date_time_obj = yesterday + ', '+stime
- date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
- elif "ago" in dt:
- date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
- date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
- else:
- date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
- addDate.append(date_time_obj)
-
- # Finding the post
-
- inner = postarea.find('div', {"class": "post_body scaleimages"})
- quote = inner.find('blockquote')
- if quote is not None:
- quote.decompose()
- inner = inner.text.strip()
- post.append(cleanString(inner))
-
- # Finding the user's signature
-
- # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
- signature = ipost.find('div', {"class": "signature scaleimages"})
- if signature != None:
- signature = signature.text.strip()
- # print(signature)
- else:
- signature = "-1"
- sign.append(cleanString(signature))
-
- # As no information about user's feedback was found, just assign "-1" to the variable
-
- feedback.append("-1")
-
- img = ipost.find('div', {"class": "post_body scaleimages"}).find('img')
- if img is not None:
- img = img.get('src').split('base64,')[-1]
- else:
- img = "-1"
- image_post.append(img)
-
- avatar = ipost.find('div', {"class": "author_avatar"})
- if avatar is not None:
- img = avatar.find('img')
- if img is not None:
- img = img.get('src').split('base64,')[-1]
- else:
- img = "-1"
- else:
- img = "-1"
- image_user.append(img)
-
- # Populate the final variable (this should be a list with all fields scraped)
-
- row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
-
- # Sending the results
-
- return row
-
-# This is the method to parse the Listing Pages (one page with many posts)
-
-def cryptBB_listing_parser(soup):
-
- nm = 0 # *this variable should receive the number of topics
- forum = "CryptBB" # 0 *forum name
- board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
- # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
- author = [] # 2 *all authors of each topic
- topic = [] # 3 *all topics
- views = [] # 4 number of views of each topic
- posts = [] # 5 number of posts of each topic
- href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between
- # Listing and Description pages)
- addDate = [] # 7 when the topic was created (difficult to find)
- image_author = [] # 8 all author avatars used in each topic
-
-
- # Finding the board (should be just one)
-
- board = soup.find('span', {"class": "active"}).text
- board = cleanString(board.strip())
-
- # Finding the repeated tag that corresponds to the listing of topics
-
- itopics = soup.find_all('tr',{"class": "inline_row"})
-
- # Counting how many topics
-
- nm = len(itopics)
-
- for itopic in itopics:
-
- # For each topic found, the structure to get the rest of the information can be of two types. Testing all of them
- # to don't miss any topic
-
- # Adding the topic to the topic list
- try:
- topics = itopic.find('span', {"class": "subject_old"}).find('a').text
- except:
- topics = itopic.find('span', {"class": "subject_new"}).find('a').text
- topics = re.sub("\[\w*\]", '', topics)
- topic.append(cleanString(topics))
-
- image_author.append(-1)
-
- # Adding the url to the list of urls
- try:
- link = itopic.find('span', {"class": "subject_old"}).find('a').get('href')
- except:
- link = itopic.find('span',{"class": "subject_new"}).find('a').get('href')
- href.append(link)
-
- # Finding the author of the topic
- ps = itopic.find('div', {"class":"author smalltext"}).text
- user = ps.strip()
- author.append(cleanString(user))
-
- # Finding the number of replies
- columns = itopic.findChildren('td',recursive=False)
- replies = columns[3].text
- if replies == '-':
- posts.append('-1')
- else:
- posts.append(cleanString(replies))
-
- # Finding the number of Views
- tview = columns[4].text
- if tview == '-':
- views.append('-1')
- else:
- views.append(cleanString(tview))
-
- # If no information about when the topic was added, just assign "-1" to the variable
-
- addDate.append("-1")
-
- return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)
-
-
-def cryptBB_links_parser(soup):
-
- # Returning all links that should be visited by the Crawler
-
- href = []
-
- listing = soup.find('table', {"class": "tborder clear"}).find('tbody').find_all('tr', {"class": "inline_row"})
-
- for a in listing:
- try:
- link = a.find('span', {"class": "subject_old"}).find('a').get('href')
- except:
- link = a.find('span', {"class": "subject_new"}).find('a').get('href')
-
- href.append(link)
-
- return href
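The listing and links parsers above repeat the same try/except over the "subject_old" and "subject_new" spans that MyBB uses for read and unread threads. A minimal sketch of that fallback as a single helper; the function name is illustrative and not part of the codebase:

def find_subject_anchor(row):
    # MyBB marks read threads with "subject_old" and unread ones with "subject_new";
    # whichever span is present holds the <a> carrying the topic title and its href.
    for cls in ("subject_old", "subject_new"):
        span = row.find('span', {"class": cls})
        if span is not None and span.find('a') is not None:
            return span.find('a')
    return None

# e.g. anchor = find_subject_anchor(itopic); topics, link = anchor.text, anchor.get('href')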
diff --git a/Forums/HiddenAnswers/crawler_selenium.py b/Forums/DWForums/crawler_selenium.py
similarity index 67%
rename from Forums/HiddenAnswers/crawler_selenium.py
rename to Forums/DWForums/crawler_selenium.py
index f972861..d1e1a21 100644
--- a/Forums/HiddenAnswers/crawler_selenium.py
+++ b/Forums/DWForums/crawler_selenium.py
@@ -1,7 +1,7 @@
-__author__ = 'Helium'
+__author__ = 'DarkWeb'
'''
-HiddenAnswers Crawler (Selenium)
+DWForums Forum Crawler (Selenium)
'''
from selenium import webdriver
@@ -12,26 +12,24 @@ from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
-from PIL import Image
import urllib.parse as urlparse
-import os, re, time
+import os, time
from datetime import date
-import configparser
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
-from Forums.HiddenAnswers.parser import hiddenanswers_links_parser
+from Forums.DWForums.parser import dwForums_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
-baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/'
+baseURL = 'http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/'
# Opens Tor Browser, crawls the website
def startCrawling():
forumName = getForumName()
- driver: webdriver.Firefox = getAccess()
+ driver = getAccess()
if driver != 'down':
try:
@@ -41,25 +39,48 @@ def startCrawling():
print(driver.current_url, e)
closeDriver(driver)
- new_parse(forumName, baseURL, True)
+ new_parse(forumName, baseURL, False)
# Login using premade account credentials and do login captcha manually
def login(driver):
+ #click login button
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.CSS_SELECTOR, ".button--icon--user")))
+ login_link = driver.find_element(by=By.CSS_SELECTOR, value=".button--icon--user")
+ login_link.click()
+
+ #entering username and password into input boxes
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, "/html/body/div[4]/div/div[2]/div/form/div[1]")))
+ container = driver.find_element(by=By.XPATH, value="/html/body/div[4]/div/div[2]/div/form/div[1]")
+ # print(container.get_attribute("outerHTML"))
+ boxes = container.find_elements(by=By.CLASS_NAME, value="input")
+ # print(len(boxes))
+
+ #Username here
+ boxes[0].send_keys('nice_reamer08')
+ #Password here
+ boxes[1].send_keys('tjpv$]Nc}XG@`%LM')
+ # no captcha on this site
+
+ # click the verify(submit) button
+ driver.find_element(by=By.CSS_SELECTOR, value=".button--icon--login").click()
+
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
- (By.XPATH, '/html/body/div[2]/div[2]/div/div[2]/div[4]/div/ul/li[14]/a')))
+ (By.CSS_SELECTOR, '.p-staffBar-inner > div:nth-child(4) > div:nth-child(1) > a:nth-child(1)')))
# Returns the name of the website
def getForumName():
- name = 'HiddenAnswers'
+ name = 'DWForums'
return name
# Return the link of the website
def getFixedURL():
- url = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/'
+ url = 'http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/'
return url
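The login added above types the account name and password in directly, while the same file already reads its Tor paths through config.get('TOR', ...). A hedged sketch of pulling the credentials from the same configparser setup; the section name, key names, and file location below are assumptions, not part of the existing configuration:

from configparser import ConfigParser

config = ConfigParser()
config.read('../../config.ini')  # assumed location of the shared configuration file

def getCredentials():
    # assumed section and keys; adjust to whatever the shared config actually defines
    return config.get('DWForums', 'username'), config.get('DWForums', 'password')

# inside login(): username, password = getCredentials()
#                 boxes[0].send_keys(username); boxes[1].send_keys(password)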
@@ -106,11 +127,12 @@ def createFFDriver():
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
+
driver.maximize_window()
return driver
+
def getAccess():
url = getFixedURL()
driver = createFFDriver()
@@ -157,26 +179,24 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
- # hacking
- links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/hacking')
- # darknet and tor
- links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/darknet-and-tor')
- # internet
- links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/internet')
- # links
- links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/links')
- # programming
- links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/programming')
- # knowledge and information
- links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/knowledge-and-information')
- # other
- links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/other')
+ # Hacking
+ links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/hacking-forum.33/')
+    # # Remote Administration
+ # links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/remote-administration.34/')
+ # # Cracking Tools
+ # links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/cracking-tools.35/')
+    # # Cracking Tutorials and Other Methods - error here about file not existing
+ # links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/cracking-tutorials-other-methods.36/')
+ # # Combolists and Configs
+ # links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/combolists-and-configs.58/')
+ # # Paid Software and Antivirus
+ # links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/paid-softwares-and-antivirus.59/')
return links
-def crawlForum(driver: webdriver.Firefox):
- print("Crawling the HiddenAnswers forum")
+def crawlForum(driver):
+ print("Crawling the DWForums forum")
linksToCrawl = getInterestedLinks()
@@ -208,18 +228,14 @@ def crawlForum(driver: webdriver.Firefox):
driver.get(itemURL)
except:
driver.refresh()
-
- if isListingLink(driver.current_url):
- break
+ savePage(driver, driver.page_source, topic + f"page{counter}")
- savePage(driver, driver.page_source, topic + f"page{counter}") # very important
-
- # # comment out
- # if counter == 2:
- # break
+ # comment out
+ if counter == 2:
+ break
try:
- page = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href')
+ page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
if page == "":
raise NoSuchElementException
counter += 1
@@ -227,21 +243,19 @@ def crawlForum(driver: webdriver.Firefox):
except NoSuchElementException:
has_next_topic_page = False
- # making sure we go back to the listing page (browser back button simulation)
- try:
- driver.get(link)
- except:
- driver.refresh()
+ for i in range(counter):
+ driver.back()
+
+ # comment out
+ break
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
+ # comment out
+ if count == 1:
+ break
try:
- link = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href')
+ temp = driver.find_element(by=By.LINK_TEXT, value="Next")
+ link = temp.get_attribute('href')
if link == "":
raise NoSuchElementException
@@ -254,19 +268,19 @@ def crawlForum(driver: webdriver.Firefox):
print(link, e)
i += 1
- print("Crawling the HiddenAnswers forum done.")
+ input("Crawling DWForums forum done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link
def isDescriptionLink(url):
- if 'http' not in url:
+ if '/threads/' in url:
return True
return False
# Returns True if the link is a listingPage link
def isListingLink(url):
- if 'http' in url:
+ if '/forums/' in url:
return True
return False
@@ -275,9 +289,9 @@ def isListingLink(url):
def topicPages(html):
soup = BeautifulSoup(html, "html.parser")
#print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
- return hiddenanswers_links_parser(soup)
+ return dwForums_links_parser(soup)
def crawler():
startCrawling()
- # print("Crawling and Parsing Abyss .... DONE!")
+ # print("Crawling and Parsing BestCardingWorld .... DONE!")
diff --git a/Forums/DWForums/parser.py b/Forums/DWForums/parser.py
new file mode 100644
index 0000000..e3616e3
--- /dev/null
+++ b/Forums/DWForums/parser.py
@@ -0,0 +1,312 @@
+__author__ = 'DarkWeb'
+
+# Here, we are importing the auxiliary functions to clean or convert data
+from Forums.Utilities.utilities import *
+from datetime import date
+from datetime import timedelta
+import re
+
+# Here, we are importing BeautifulSoup to search through the HTML tree
+from bs4 import BeautifulSoup
+
+# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
+
+def dwForums_description_parser(soup):
+
+ # Fields to be parsed
+
+ topic = "-1" # 0 *topic name
+ user = [] # 1 *all users of each post
+ status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
+ reputation = [] # 3 all user's karma in each post (usually found as a number)
+ interest = [] # 4 all user's interest in each post
+ sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
+ post = [] # 6 all messages of each post
+ feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
+ addDate = [] # 8 all dates of each post
+
+ # Finding the topic (should be just one coming from the Listing Page)
+
+ li = soup.find("h1", {"class": "p-title-value"})
+
+ topic = li.text
+ topic = topic.replace(u'\xa0', ' ')
+ topic = topic.replace(",","")
+ topic = topic.replace("\n","")
+ topic = cleanString(topic.strip())
+ # print(topic)
+ # Finding the repeated tag that corresponds to the listing of posts
+
+ # posts = soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg"}) + \
+ # soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg2"})
+
+ posts = soup.find('div', {"class": "js-replyNewMessageContainer"}).find_all(
+ 'article', {"class": "js-post"}, recursive=False)
+ # print(len(posts))
+
+ # For each message (post), get all the fields we are interested to:
+
+ for ipost in posts:
+
+ # Finding a first level of the HTML page
+
+ # post_wrapper = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "poster"})
+ post_wrapper = ipost.find('h4', {"class": "message-name"})
+ # Finding the author (user) of the post
+
+ # author = post_wrapper.find('h4')
+ author = post_wrapper.text.strip()
+ # print("author " + author)
+ user.append(cleanString(author)) # Remember to clean the problematic characters
+
+ # Finding the status of the author
+
+
+
+ # Testing here two possibilities to find this status and combine them
+ # if ipost.find('h5', {"class": "deleted_post_author"}):
+ # status.append(-1)
+ # interest.append(-1)
+ # reputation.append(-1)
+ # addDate.append(-1)
+ # post.append("THIS POST HAS BEEN REMOVED!")
+ # sign.append(-1)
+ # feedback.append(-1)
+ # continue
+
+        # DWForums has a membergroup (the userTitle element) but, unlike CryptBB, no postgroup
+
+ membergroup = ipost.find('h5', {"class": "userTitle"})
+        # DWForums doesn't have postgroups
+ postgroup = None
+ if membergroup != None:
+ membergroup = membergroup.text.strip()
+ if postgroup != None:
+ postgroup = postgroup.text.strip()
+ membergroup = membergroup + " - " + postgroup
+ else:
+ if postgroup != None:
+ membergroup = postgroup.text.strip()
+ else:
+ membergroup = "-1"
+
+ status.append(cleanString(membergroup))
+ # print("status " + cleanString(membergroup))
+ # Finding the interest of the author
+ # DWForums does not have blurb
+ blurb = ipost.find('li', {"class": "blurb"})
+ if blurb != None:
+ blurb = blurb.text.strip()
+ else:
+ blurb = "-1"
+ interest.append(cleanString(blurb))
+
+ # Finding the reputation of the user
+        # DWForums shows a "Reaction score", which we record as the user's reputation
+ author_stats = ipost.find('div', {"class": "message-userExtras"})
+ if author_stats != None:
+ karma = author_stats.find_all('dl', {"class": "pairs"})[2]
+ else:
+ karma = None
+ if karma != None:
+ karma = karma.text
+ karma = karma.replace("Reaction score","")
+ karma = karma.replace(":", "")
+ karma = karma.strip()
+ else:
+ karma = "-1"
+ reputation.append(cleanString(karma))
+ # print("karma " + cleanString(karma))
+ # Getting here another good tag to find the post date, post content and users' signature
+
+ postarea = ipost.find('div', {"class": "message-attribution-main"})
+
+ dt = postarea.find('time', {"class": "u-dt"})['datetime']
+ # dt = dt.strip().split()
+ dt = dt.strip()[:16]
+ dt = dt.replace("T",", ")
+ day=date.today()
+ if "Yesterday" in dt:
+ yesterday = day - timedelta(days=1)
+ yesterday = yesterday.strftime('%m-%d-%Y')
+ stime = dt.replace('Yesterday,','').strip()
+ date_time_obj = yesterday+ ', '+stime
+ date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %H:%M')
+ elif "hours ago" in dt:
+ day = day.strftime('%m-%d-%Y')
+ date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
+ date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %H:%M')
+ else:
+ date_time_obj = datetime.strptime(dt, '%Y-%m-%d, %H:%M')
+ stime = date_time_obj.strftime('%b %d, %Y')
+ sdate = date_time_obj.strftime('%I:%M %p')
+
+
+ addDate.append(date_time_obj)
+ # print("date " + str(date_time_obj))
+ # Finding the date of the post
+ # date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p')
+ # smalltext = postarea.find('div', {"class": "flow_hidden"}).find('div', {"class": "keyinfo"})\
+ # .find('div', {"class": "smalltext"})
+ # sdatetime = smalltext.text
+ # sdatetime = sdatetime.replace(u"\xab","") # Removing unnecessary characters
+ # sdatetime = sdatetime.replace(u"\xbb","") # Removing unnecessary characters
+ # sdatetime = sdatetime.split("on: ") # Removing unnecessary characters
+ # sdatetime = sdatetime[1].strip()
+ # stime = sdatetime[:-12:-1] # Finding the time of the post
+ # stime = stime[::-1]
+ # sdate = sdatetime.replace(stime,"") # Finding the date of the post
+ # sdate = sdate.replace(",","")
+ # sdate = sdate.strip()
+
+        # Convert the date of the post, which may appear as "12 February 2016", "today", or "yesterday". We need
+        # a date format here such as "mm/dd/yyyy"
+
+ # addDate.append(convertDate(sdate,"english", crawlerDate) + " " + stime)
+
+ # Finding the post
+
+ inner = ipost.find('article', {"class": "message-body"})
+ inner = inner.text.strip()
+ # print(inner)
+ post.append(cleanString(inner))
+
+        # Finding the user's signature
+
+ # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
+ signature = ipost.find('aside', {"class": "message-signature"})
+ if signature != None:
+ signature = signature.text.strip()
+ # print(signature)
+ else:
+ signature = "-1"
+ sign.append(cleanString(signature))
+
+        # As no information about the user's feedback was found, just assign "-1" to the variable
+
+ feedback.append("-1")
+
+ # Populate the final variable (this should be a list with all fields scraped)
+
+ row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
+
+ # Sending the results
+
+ return row
+
+# This is the method to parse the Listing Pages (one page with many posts)
+
+def dwForums_listing_parser(soup):
+
+ nm = 0 # *this variable should receive the number of topics
+ forum = "DWForums" # 0 *forum name
+ board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
+ # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
+ author = [] # 2 *all authors of each topic
+ topic = [] # 3 *all topics
+ views = [] # 4 number of views of each topic
+ posts = [] # 5 number of posts of each topic
+    href = []       # 6 this variable should receive all cleaned urls (we will use this to do the merge between
+ # Listing and Description pages)
+ addDate = [] # 7 when the topic was created (difficult to find)
+
+ # Finding the board (should be just one)
+
+ board = soup.find('h1', {"class": "p-title-value"}).text
+ board = cleanString(board.strip())
+
+ # Finding the repeated tag that corresponds to the listing of topics
+
+ regex = re.compile('.*structItem--thread.*')
+ itopics = soup.find_all("div", {"class": regex})
+
+ for itopic in itopics:
+
+        # For each topic found, the structure holding the rest of the information can be of two types. Test both
+        # so as not to miss any topic
+
+ # tds = itopic.findAll('td', {"class": "subject stickybg2"})
+ #
+ # if len(tds) > 0:
+ # tag.append("strong")
+ # tag.append("subject stickybg2")
+ # tag.append("stats stickybg")
+ # else:
+ # tds = itopic.findAll('td', {"class": "subject windowbg2"})
+ # if len(tds) > 0:
+ # tag.append("span")
+ # tag.append("subject windowbg2")
+ # tag.append("stats windowbg")
+
+ # Adding the topic to the topic list
+ topics = itopic.find("div", {"class": "structItem-title"}).text
+ topics = topics.replace(",", "")
+ topics = topics.replace("\n", "")
+ topic.append(cleanString(topics.strip()))
+
+ # Counting how many topics we have found so far
+
+ nm = len(topic)
+
+ # Adding the url to the list of urls
+ link = itopic.select_one('a[href^="/threads/"]')
+ link = link['href']
+ link = cleanLink(link)
+ href.append(link)
+
+ # Finding the author of the topic
+ minor = itopic.find('div', {"class": "structItem-minor"})
+ ps = minor.find('li').text
+ user = ps.strip()
+ author.append(cleanString(user))
+
+ # Finding the number of replies
+ meta = itopic.find("div", {"class": "structItem-cell--meta"})
+ meta = meta.find_all("dl")
+ post = meta[0].find("dd").text
+ post = post.replace("K", "000")
+ posts.append(cleanString(post))
+
+ # Finding the number of Views
+ tview = meta[1].find("dd").text
+ tview = tview.replace("K", "000")
+ views.append(cleanString(tview))
+
+ # If no information about when the topic was added, just assign "-1" to the variable
+ minor = itopic.find("div", {"class": "structItem-minor"})
+ dt = minor.find('time')['datetime']
+ dt = dt.strip()[:16]
+ dt = dt.replace("T", ", ")
+ day = date.today()
+ if "Yesterday" in dt:
+ yesterday = day - timedelta(days=1)
+ yesterday = yesterday.strftime('%m-%d-%Y')
+ stime = dt.replace('Yesterday,', '').strip()
+ date_time_obj = yesterday + ', ' + stime
+ date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %H:%M')
+ else:
+ date_time_obj = datetime.strptime(dt, '%Y-%m-%d, %H:%M')
+ stime = date_time_obj.strftime('%b %d, %Y')
+ sdate = date_time_obj.strftime('%I:%M %p')
+ addDate.append(date_time_obj)
+
+ return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
+
+
+def dwForums_links_parser(soup):
+
+ # Returning all links that should be visited by the Crawler
+
+ href = []
+ #print(soup.find('table', {"class": "tborder clear"}).find(
+ # 'tbody').find_all('tr', {"class": "inline_row"}))
+ regex = re.compile('.*structItem--thread.*')
+ listing = soup.find_all("div", {"class": regex})
+
+ for a in listing:
+ link = a.select_one('a[href^="/threads/"]')
+ link = link['href']
+
+ href.append(link)
+
+ return href
\ No newline at end of file
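Both parsers above read the XenForo <time datetime="..."> attribute by slicing it to 16 characters and swapping the 'T' separator, and the 'hours ago' branch still looks for a MyBB-style post_date span that XenForo pages may not carry. A hedged alternative, assuming the attribute is an ISO-8601 string such as "2023-08-12T14:35:00+0000" (an assumption, not verified against DWForums pages):

from datetime import datetime

def parse_xenforo_datetime(raw: str) -> datetime:
    # try timezone-aware and naive ISO forms before falling back to the truncation above
    for fmt in ('%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S'):
        try:
            return datetime.strptime(raw.strip(), fmt)
        except ValueError:
            continue
    return datetime.strptime(raw.strip()[:16], '%Y-%m-%dT%H:%M')

A similar caveat applies to the replace("K", "000") normalization in the listing parser, which turns a rounded count like "1.2K" into "1.2000".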
diff --git a/Forums/AbyssForum/crawler_selenium.py b/Forums/Dread/crawler_selenium.py
similarity index 68%
rename from Forums/AbyssForum/crawler_selenium.py
rename to Forums/Dread/crawler_selenium.py
index 27135f2..ce14732 100644
--- a/Forums/AbyssForum/crawler_selenium.py
+++ b/Forums/Dread/crawler_selenium.py
@@ -1,7 +1,7 @@
-__author__ = 'Helium'
+__author__ = 'DarkWeb'
'''
-AbyssForum Crawler (Selenium)
+Dread Forum Crawler (Selenium)
'''
from selenium import webdriver
@@ -12,20 +12,18 @@ from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
-from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
-import configparser
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
-from Forums.AbyssForum.parser import abyssForum_links_parser
+from Forums.Dread.parser import dread_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
-baseURL = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/'
+baseURL = 'http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/'
# Opens Tor Browser, crawls the website
@@ -41,25 +39,45 @@ def startCrawling():
print(driver.current_url, e)
closeDriver(driver)
- new_parse(forumName, baseURL, True)
+ new_parse(forumName, baseURL, False)
# Login using premade account credentials and do login captcha manually
def login(driver):
- # wait for listing page show up (This Xpath may need to change based on different seed url)
- WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
- (By.XPATH, '//*[@id="sn-category-3"]')))
+ '''
+    # code for the captcha; for now it runs too slowly, so the captcha expires
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.CSS_SELECTOR, ".image")))
+
+ inputBoxes = driver.find_elements(by=By.TAG_NAME, value='input')
+ for index, inputBox in enumerate(inputBoxes):
+ driver.find_element(by=By.CSS_SELECTOR, value='.image').screenshot(r'..\Dread\captcha.png')
+ im = Image.open(r'..\Dread\captcha.png')
+
+ im.show()
+ userIn = input("Enter character: ")
+ inputBox.send_keys(userIn)
+ im.close()
+ if index != 5:
+ inputBoxes[index+1].click()
+ driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div/form/div/input").click()
+ '''
+ input("Press ENTER when CAPTCHA is completed\n")
+
+ #entering username and password into input boxes
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, "/html/body/div/div[2]")))
# Returns the name of the website
def getForumName():
- name = 'AbyssForum'
+ name = 'Dread'
return name
# Return the link of the website
def getFixedURL():
- url = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/'
+ url = 'http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/'
return url
@@ -89,8 +107,8 @@ def createFFDriver():
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
+ # ff_prof.set_preference("network.dns.disablePrefetch", True)
+ # ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -101,16 +119,18 @@ def createFFDriver():
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", True)
+ ff_prof.set_preference("xpinstall.signatures.required", False);
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
+
driver.maximize_window()
return driver
+
def getAccess():
url = getFixedURL()
driver = createFFDriver()
@@ -157,26 +177,24 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
- # Hacked Database
- links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=26')
- # Hire a Hacker
- links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=27')
- # Hacking Tools
- links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=28')
- # Carding Forums
- links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=30')
- # Social Media Hacking
- links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=32')
- # Hacking Tutorials
- links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=12')
- # Cracking Tutorials
- links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=13')
+ # # OpSec
+ # links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/OpSec')
+ # Hacking 180
+ links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/hacking')
+ # # Jobs4Crypto
+ # links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/Jobs4Crypto')
+ # # Hacktown
+ # links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/HackTown')
+ # # Malware
+ # links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/malware')
+ # # Programming
+ # links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/programming')
return links
def crawlForum(driver):
- print("Crawling the AbyssForum forum")
+ print("Crawling the Dread forum")
linksToCrawl = getInterestedLinks()
@@ -208,20 +226,14 @@ def crawlForum(driver):
driver.get(itemURL)
except:
driver.refresh()
-
- if isListingLink(driver.current_url):
- break
-
- savePage(driver, driver.page_source, topic + f"page{counter}") # very important
+ savePage(driver, driver.page_source, topic + f"page{counter}")
- # # comment out
- # if counter == 2:
- # break
+ # comment out
+ if counter == 2:
+ break
try:
- temp = driver.find_element(By.CLASS_NAME, 'pagination')
- temp = temp.find_element(by=By.CLASS_NAME, value='next')
- page = temp.find_element(by=By.CLASS_NAME, value='button').get_attribute('href')
+ page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
if page == "":
raise NoSuchElementException
counter += 1
@@ -229,23 +241,20 @@ def crawlForum(driver):
except NoSuchElementException:
has_next_topic_page = False
- # making sure we go back to the listing page (browser back button simulation)
- try:
- driver.get(link)
- except:
- driver.refresh()
+ for i in range(counter):
+ driver.back()
+
+ # comment out
+ break
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
+ # comment out
+ if count == 1:
+ break
try:
- temp = driver.find_element(By.CLASS_NAME, 'pagination')
- temp = temp.find_element(by=By.CLASS_NAME, value='next')
- link = temp.find_element(by=By.CLASS_NAME, value='button').get_attribute('href')
+ temp = driver.find_element(by=By.CLASS_NAME, value="pagination")
+ link = temp.find_element(by=By.CLASS_NAME, value="next").get_attribute('href')
+
if link == "":
raise NoSuchElementException
count += 1
@@ -257,19 +266,19 @@ def crawlForum(driver):
print(link, e)
i += 1
- print("Crawling the AbyssForum forum done.")
+ input("Crawling Dread forum done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link
def isDescriptionLink(url):
- if 'viewtopic' in url:
+ if '/post/' in url:
return True
return False
# Returns True if the link is a listingPage link
def isListingLink(url):
- if '.onion/viewforum' in url:
+ if '/d/' in url:
return True
return False
@@ -278,9 +287,9 @@ def isListingLink(url):
def topicPages(html):
soup = BeautifulSoup(html, "html.parser")
#print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
- return abyssForum_links_parser(soup)
+ return dread_links_parser(soup)
def crawler():
startCrawling()
- # print("Crawling and Parsing Abyss .... DONE!")
+ # print("Crawling and Parsing BestCardingWorld .... DONE!")
diff --git a/Forums/Dread/parser.py b/Forums/Dread/parser.py
new file mode 100644
index 0000000..8de9d0d
--- /dev/null
+++ b/Forums/Dread/parser.py
@@ -0,0 +1,334 @@
+__author__ = 'DarkWeb'
+
+# Here, we are importing the auxiliary functions to clean or convert data
+import datetime
+
+from Forums.Utilities.utilities import *
+from datetime import date
+from datetime import timedelta
+import re
+import traceback
+# Here, we are importing BeautifulSoup to search through the HTML tree
+from bs4 import BeautifulSoup
+
+
+# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
+def dread_description_parser(soup):
+
+ # Fields to be parsed
+
+ topic = "-1" # 0 *topic name
+ user = [] # 1 *all users of each post
+ status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
+ reputation = [] # 3 all user's karma in each post (usually found as a number)
+ interest = [] # 4 all user's interest in each post
+ sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
+ post = [] # 6 all messages of each post
+ feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
+ addDate = [] # 8 all dates of each post
+
+ # Finding the topic (should be just one coming from the Listing Page)
+ container = soup.find('div', {"class": "content"})
+ li = container.find("a", {"class": "title"})
+ if li == None:
+ return None
+ topic = li.text
+ topic = topic.replace(u'\xa0', ' ')
+ topic = topic.replace(",","")
+ topic = topic.replace("\n","")
+ topic = cleanString(topic.strip())
+ # print(topic)
+ # Finding the repeated tag that corresponds to the listing of posts
+
+ # posts = soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg"}) + \
+ # soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg2"})
+
+    # capture the initial post's data first, since it is kept separate from the comments
+ # author name
+ init_post = container.find('div', {"class": "item"})
+ author = init_post.find('div', {"class": "author"}).select_one('a[href^="/u/"]').text
+ flair = init_post.find('div', {"class": "author"}).find("span", {"class": "flair"})
+ try:
+ flair = flair.text.strip()
+ author = author.replace(flair, '')
+ except:
+ pass
+ author = author.strip()
+ user.append(cleanString(author))
+ # status
+ flair = init_post.find("span", {"class": "flair"})
+ if flair != None:
+ flair = flair.text.strip()
+ else:
+ flair = "-1"
+ status.append(cleanString(flair))
+ # no blurb
+ interest.append(-1)
+ # points for post
+ karma = init_post.find("div", {"class": "voteCount"})
+ if karma != None:
+ karma = karma.text
+ karma = karma.replace("points", "")
+ karma = karma.replace(":", "")
+ karma = karma.strip()
+ else:
+ karma = "-1"
+ reputation.append(cleanString(karma))
+ # date
+ spans = init_post.find('div', {"class": "author"}).find('span', recursive=False)
+ dt = spans['title']
+ month = find_month(dt)
+ split_text = dt.split()
+ day = int(re.search(r'\d+', split_text[0]).group())
+ year = int(split_text[2])
+ hm = re.findall(r'\d+', split_text[-1])
+ hm[0] = int(hm[0])
+ hm[1] = int(hm[1])
+ date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1])
+ addDate.append(date_time_obj)
+
+ # content
+ inner = init_post.find("div", {"class": "postContent"})
+ inner = inner.text.strip()
+ post.append(cleanString(inner))
+
+ # no signature
+ sign.append(-1)
+ # no feedback
+ feedback.append(-1)
+
+
+ comments = soup.find('div', {"class": "postComments"})
+ if comments == None:
+ row = (topic, post, user, addDate, feedback, status, reputation, sign, interest)
+ return row
+ else:
+ comments = soup.find('div', {"class": "postComments"}).find_all('div', "comment")
+ # print(len(posts))
+
+ # For each message (post), get all the fields we are interested to:
+
+ for ipost in comments:
+
+ # Finding a first level of the HTML page
+
+ # post_wrapper = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "poster"})
+ cc = ipost.find('div', {"class": "commentContent"})
+
+ post_wrapper = cc.find('a', {"class": "username"}).text
+ flair = cc.find("span", {"class": "flair"})
+ try:
+ flair = flair.text.strip()
+ post_wrapper = post_wrapper.replace(flair, '')
+ except:
+ pass
+ author = post_wrapper.strip()
+ user.append(cleanString(author))
+
+
+ # Finding the status of the author
+
+ # Dread does not have membergroup and postgroup, but it has flair, similar enough
+
+
+ postgroup = None
+ if flair != None:
+ if postgroup != None:
+ postgroup = postgroup.text.strip()
+ flair = flair + " - " + postgroup
+ else:
+ if postgroup != None:
+ flair = postgroup.text.strip()
+ else:
+ flair = "-1"
+
+ status.append(cleanString(flair))
+ # print("status " + cleanString(membergroup))
+ # Finding the interest of the author
+ # Dread does not have blurb
+
+ interest.append(-1)
+
+ # Finding the reputation of the user
+ # Dread doesn't have reputation per user, but instead each post has its own point system
+ karma = cc.find('div', {"class": "votes"})
+
+ if karma != None:
+ karma = karma.text
+ karma = karma.replace("points","")
+ karma = karma.replace(":", "")
+ karma = karma.strip()
+ else:
+ karma = "-1"
+ reputation.append(cleanString(karma))
+ # print("karma " + cleanString(karma))
+ # Getting here another good tag to find the post date, post content and users' signature
+
+ postarea = ipost.find('div', {"class": "timestamp"}).find('span', recursive=False)
+ dt = postarea['title']
+ month = find_month(dt)
+ split_text = dt.split()
+ day = int(re.search(r'\d+', split_text[0]).group())
+ year = int(split_text[2])
+ hm = re.findall(r'\d+', split_text[-1])
+ hm[0] = int(hm[0])
+ hm[1] = int(hm[1])
+ date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1])
+ addDate.append(date_time_obj)
+
+ # Finding the post
+
+ inner = ipost.find('div', {"class": "commentBody"})
+ inner = inner.text.strip()
+ # print(inner)
+ post.append(cleanString(inner))
+
+ # No signature for Dread
+
+ sign.append(-1)
+
+            # As no information about the user's feedback was found, just assign "-1" to the variable
+
+ feedback.append("-1")
+
+ # Populate the final variable (this should be a list with all fields scraped)
+
+ row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
+
+ # Sending the results
+
+ return row
+
+# This is the method to parse the Listing Pages (one page with many posts)
+
+def dread_listing_parser(soup):
+
+ nm = 0 # *this variable should receive the number of topics
+ forum = "Dread" # 0 *forum name
+ board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
+ # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
+ author = [] # 2 *all authors of each topic
+ topic = [] # 3 *all topics
+ views = [] # 4 number of views of each topic
+ posts = [] # 5 number of posts of each topic
+    href = []   # 6 this variable should receive all cleaned urls (we will use this to do the merge between
+ # Listing and Description pages)
+ addDate = [] # 7 when the topic was created (difficult to find)
+
+ # Finding the board (should be just one)
+
+ board = soup.find('a', {"class": "banner-top"}).text
+ board = cleanString(board.strip())
+
+ # Finding the repeated tag that corresponds to the listing of topics
+
+ itopics = soup.find("div", {"class": "postBoard"}).find_all("div", {"class": "item"}, recursive=False)
+
+ for itopic in itopics:
+
+        # For each topic found, the structure holding the rest of the information can be of two types. Test both
+        # so as not to miss any topic
+
+ # Adding the topic to the topic list
+ topic_title = itopic.find("a", {"class": "title"})
+ title_flair = topic_title.find('span', {"class": "flair"})
+ topics = topic_title.text
+ try:
+ title_flair = title_flair.text.strip()
+ topics = topics.replace(title_flair, '')
+ except:
+ pass
+ topics = topics.replace(u'\xa0', ' ')
+ topics = topics.replace(",", "")
+ topics = topics.replace("\n", "")
+ topic.append(cleanString(topics.strip()))
+
+ # Counting how many topics we have found so far
+
+ nm = len(topic)
+
+ # Adding the url to the list of urls
+ link = topic_title['href']
+ link = cleanLink(link)
+ href.append(link)
+
+ # Finding the author of the topic
+ ps = itopic.find('div', {"class": "author"})
+ post_wrapper = ps.select_one('a[href^="/u/"]').text
+ flair = ps.find("span", {"class": "flair"})
+ try:
+ flair = flair.text.strip()
+ post_wrapper = post_wrapper.replace(flair, '')
+ except:
+ pass
+ user = post_wrapper.strip()
+ author.append(cleanString(user))
+
+ # Finding the number of replies
+ meta = itopic.find("div", {"class": "postMain"})
+ post = meta.find("a").text
+ post = post.replace("comments", '').strip()
+ posts.append(cleanString(post))
+
+ # Finding the number of Views - not shown in Dread
+ views.append("-1")
+
+ # If no information about when the topic was added, just assign "-1" to the variable
+ spans = itopic.find('div', {"class": "author"}).find('span', recursive=False)
+ dt = spans['title']
+ month = find_month(dt)
+ split_text = dt.split()
+ day = int(re.search(r'\d+', split_text[0]).group())
+ year = int(split_text[2])
+ hm = re.findall(r'\d+', split_text[-1])
+ hm[0] = int(hm[0])
+ hm[1] = int(hm[1])
+ date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1])
+ addDate.append(date_time_obj)
+
+ return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
+
+
+def dread_links_parser(soup):
+
+ # Returning all links that should be visited by the Crawler
+
+ href = []
+ #print(soup.find('table', {"class": "tborder clear"}).find(
+ # 'tbody').find_all('tr', {"class": "inline_row"}))
+
+ listing = soup.find("div", {"class": "postBoard"}).find_all("div",{"class": "item"}, recursive=False)
+
+ for a in listing:
+ link = a.find("a", {"class": "title"})
+ link = link['href']
+
+ href.append(link)
+
+ return href
+
+def find_month(s):
+ if 'January' in s:
+ return 1
+ elif 'February' in s:
+ return 2
+ elif 'March' in s:
+ return 3
+ elif 'April' in s:
+ return 4
+ elif 'May' in s:
+ return 5
+ elif 'June' in s:
+ return 6
+ elif 'July' in s:
+ return 7
+ elif 'August' in s:
+ return 8
+ elif 'September' in s:
+ return 9
+ elif 'October' in s:
+ return 10
+ elif 'November' in s:
+ return 11
+ elif 'December' in s:
+ return 12
diff --git a/Forums/Helium/crawler_selenium.py b/Forums/Helium/crawler_selenium.py
new file mode 100644
index 0000000..9de4236
--- /dev/null
+++ b/Forums/Helium/crawler_selenium.py
@@ -0,0 +1,328 @@
+__author__ = 'DarkWeb'
+
+'''
+Helium Forum Crawler (Selenium)
+'''
+
+from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
+from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
+from selenium.webdriver.firefox.service import Service
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+
+from PIL import Image
+import urllib.parse as urlparse
+import os, time
+from datetime import date
+import subprocess
+from bs4 import BeautifulSoup
+from Forums.Initialization.prepare_parser import new_parse
+from Forums.Helium.parser import helium_links_parser
+from Forums.Utilities.utilities import cleanHTML
+
+counter = 1
+baseURL = 'http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/'
+
+
+# Opens Tor Browser, crawls the website
+def startCrawling():
+ # opentor()
+ # forumName = getForumName()
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closetor(driver)
+
+ # new_parse(forumName, False)
+
+
+# Opens Tor Browser
+def opentor():
+ global pid
+ print("Connecting Tor...")
+ path = open('../../path.txt').readline().strip()
+ pro = subprocess.Popen(path)
+ pid = pro.pid
+ time.sleep(7.5)
+ input('Tor Connected. Press ENTER to continue\n')
+ return
+
+
+# Login using premade account credentials and do login captcha manually
+def login(driver):
+ #wait for login page
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, "/html/body/div[2]/div/div[1]/div/div/div[2]/form/div[5]/div/button")))
+
+ #entering username and password into input boxes
+ usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
+ #Username here
+ usernameBox.send_keys('holyre')
+ passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
+ #Password here
+ passwordBox.send_keys('PlatinumBorn2')
+
+ '''
+ # wait for captcha page show up
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, '//*[@id="captcha_img"]')))
+
+ # save captcha to local
+ driver.find_element(by=By.XPATH, value='//*[@id="captcha_img"]').screenshot(r'..\Helium\captcha.png')
+
+ # This method will show image in any image viewer
+ im = Image.open(r'..\Helium\captcha.png')
+
+ im.show()
+
+ # wait until input space show up
+ inputBox = driver.find_element(by=By.XPATH, value='//*[@id="captcha"]')
+
+ # ask user input captcha solution in terminal
+ userIn = input("Enter solution: ")
+
+ # send user solution into the input space
+ inputBox.send_keys(userIn)
+
+ # click the verify(submit) button
+ driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div[1]/div/div/div[2]/form/div[5]/div/button").click()
+ '''
+ input("Press ENTER when CAPTCHA is completed\n")
+
+ # wait for listing page show up (This Xpath may need to change based on different seed url)
+ WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
+ (By.XPATH, '/html/body/div[2]/div/p')))
+
+
+# Returns the name of the website
+def getForumName():
+ name = 'Helium'
+ return name
+
+
+# Return the link of the website
+def getFixedURL():
+ url = 'http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/login'
+ return url
+
+
+# Closes Tor Browser
+def closetor(driver):
+ # global pid
+ # os.system("taskkill /pid " + str(pro.pid))
+ # os.system("taskkill /t /f /im tor.exe")
+ print('Closing Tor...')
+ driver.close()
+ time.sleep(3)
+ return
+
+
+# Creates FireFox 'driver' and configure its 'Profile'
+# to use Tor proxy and socket
+def createFFDriver():
+ file = open('../../path.txt', 'r')
+ lines = file.readlines()
+
+ ff_binary = FirefoxBinary(lines[0].strip())
+
+ ff_prof = FirefoxProfile(lines[1].strip())
+ ff_prof.set_preference("places.history.enabled", False)
+ ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
+ ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
+ ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
+ ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
+ ff_prof.set_preference("signon.rememberSignons", False)
+ ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
+ ff_prof.set_preference("network.dns.disablePrefetch", True)
+ ff_prof.set_preference("network.http.sendRefererHeader", 0)
+ # ff_prof.set_preference("permissions.default.image", 2)
+ ff_prof.set_preference("browser.download.folderList", 2)
+ ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
+ ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
+ ff_prof.set_preference('network.proxy.type', 1)
+ ff_prof.set_preference("network.proxy.socks_version", 5)
+ ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
+ ff_prof.set_preference('network.proxy.socks_port', 9150)
+ ff_prof.set_preference('network.proxy.socks_remote_dns', True)
+ ff_prof.set_preference("javascript.enabled", True)
+ ff_prof.update_preferences()
+
+ service = Service(lines[2].strip())
+
+ driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+
+ return driver
+
+
+def getAccess():
+ url = getFixedURL()
+ driver = createFFDriver()
+ try:
+ driver.get(url)
+ return driver
+ except:
+ driver.close()
+ return 'down'
+
+
+# Saves the crawled html page
+def savePage(page, url):
+ cleanPage = cleanHTML(page)
+ filePath = getFullPathName(url)
+ os.makedirs(os.path.dirname(filePath), exist_ok=True)
+ open(filePath, 'wb').write(cleanPage.encode('utf-8'))
+ return
+
+
+# Gets the full path of the page to be saved along with its appropriate file name
+def getFullPathName(url):
+ fileName = getNameFromURL(url)
+ if isDescriptionLink(url):
+ fullPath = r'..\Helium\HTML_Pages\\' + str(
+ "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
+ "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
+ else:
+ fullPath = r'..\Helium\HTML_Pages\\' + str(
+ "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
+ "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
+ return fullPath
+
+
+# Creates the file name from passed URL
+def getNameFromURL(url):
+ global counter
+ name = ''.join(e for e in url if e.isalnum())
+ if name == '':
+ name = str(counter)
+ counter = counter + 1
+ return name
+
+
+def getInterestedLinks():
+ links = []
+
+ # # General Discussion
+ # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/6')
+ # # Anonymity and Security
+ # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/8')
+ # # Programming
+ # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/9')
+ # # Carding Discussions
+ # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/10')
+ # # Hacked Database (free)
+ # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/11')
+ # Hacking tools, exploits and POC
+ links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/17')
+ # # Hacked Database
+ # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/12')
+ # # Hacking and other Services
+ # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/13')
+ # # Selling/Buying Malware, Exploits etc
+ # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/22')
+ # # General Tutorials
+ # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/18')
+ # # Hacking Tutorials
+ # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/19')
+
+ return links
+
+
+def crawlForum(driver):
+ print("Crawling the Helium forum")
+
+ linksToCrawl = getInterestedLinks()
+ # visited = set(linksToCrawl)
+ # initialTime = time.time()
+
+ i = 0
+ count = 0
+ while i < len(linksToCrawl):
+ link = linksToCrawl[i]
+ print('Crawling :', link)
+ try:
+ try:
+ driver.get(link)
+ except:
+ driver.refresh()
+ html = driver.page_source
+ savePage(html, link)
+
+ has_next_page = True
+ while has_next_page:
+ list = topicPages(html)
+ for item in list:
+ itemURL = urlparse.urljoin(baseURL, str(item))
+ try:
+ driver.get(itemURL)
+ except:
+ driver.refresh()
+ savePage(driver.page_source, item)
+ driver.back()
+ # comment out
+ break
+
+ # comment out
+ if count == 1:
+ count = 0
+ break
+
+ try:
+ bar = driver.find_element(by=By.XPATH, value=
+ '/html/body/div[2]/div/div[3]/ul')
+ li = bar.find_elements(By.TAG_NAME, 'li')[-1]
+ link = li.find_element(By.TAG_NAME, 'a').get_attribute('href')
+
+ if link == "":
+ raise NoSuchElementException
+ try:
+ driver.get(link)
+ except:
+ driver.refresh()
+ html = driver.page_source
+ savePage(html, link)
+ count += 1
+
+ except NoSuchElementException:
+ has_next_page = False
+
+ except Exception as e:
+ print(link, e)
+ i += 1
+
+ # finalTime = time.time()
+ # print finalTime - initialTime
+
+ input("Crawling Helium forum done successfully. Press ENTER to continue\n")
+
+
+# Returns 'True' if the link is Topic link
+def isDescriptionLink(url):
+ if 'topic' in url:
+ return True
+ return False
+
+
+# Returns True if the link is a listingPage link
+def isListingLink(url):
+ if 'board' in url:
+ return True
+ return False
+
+
+# calling the parser to define the links
+def topicPages(html):
+ soup = BeautifulSoup(html, "html.parser")
+ return helium_links_parser(soup)
+
+
+def crawler():
+ startCrawling()
+ # print("Crawling and Parsing BestCardingWorld .... DONE!")
diff --git a/Forums/Helium/parser.py b/Forums/Helium/parser.py
new file mode 100644
index 0000000..5a852a8
--- /dev/null
+++ b/Forums/Helium/parser.py
@@ -0,0 +1,248 @@
+__author__ = 'DarkWeb'
+
+# Here, we are importing the auxiliary functions to clean or convert data
+from Forums.Utilities.utilities import *
+
+# Here, we are importing BeautifulSoup to search through the HTML tree
+from bs4 import BeautifulSoup
+
+# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
+def helium_description_parser(soup):
+
+ # Fields to be parsed
+
+ topic = "-1" # topic name
+ user = [] # all users of each post
+    addDate = [] # all dates of each post
+ feedback = [] # all feedbacks of each vendor (this was found in just one Forum and with a number format)
+ status = [] # all user's authority in each post such as (adm, member, dangerous)
+    reputation = [] # all users' karma in each post (usually found as a number)
+ sign = [] # all user's signature in each post (usually a standard message after the content of the post)
+ post = [] # all messages of each post
+ interest = [] # all user's interest in each post
+
+ # Finding the topic (should be just one coming from the Listing Page)
+
+ li = soup.find("h4", {"class": "text-truncated"})
+ topic = li.text
+ topic = topic.replace("Topic:", "")
+ topic = topic.replace("Post Reply", "")
+ topic = topic.replace(",", "")
+ topic = topic.replace("\n", "")
+ topic = cleanString(topic.strip())
+
+ # Finding the repeated tag that corresponds to the listing of posts
+
+ posts = soup.findAll('div', {"id": "a9"})
+
+ # For each message (post), get all the fields we are interested to:
+
+ for ipost in posts:
+
+ # Finding a first level of the HTML page
+
+ # Finding the author (user) of the post
+
+ heading = ipost.find('div', {"class": "panel-heading"})
+ title = heading.find('div', {"class": "panel-title"}).text
+ author = title.replace("User:", "")
+ author = author.strip()
+ user.append(cleanString(author)) # Remember to clean the problematic characters
+
+ # Finding the status of the author
+ # Testing here two possibilities to find this status and combine them
+ # Helium does not have membergroup and postgroup
+
+ membergroup = heading.find('li', {"class": "membergroup"})
+ postgroup = heading.find('li', {"class": "postgroup"})
+ if membergroup != None:
+ membergroup = membergroup.text.strip()
+ if postgroup != None:
+ postgroup = postgroup.text.strip()
+ membergroup = membergroup + " - " + postgroup
+ else:
+ if postgroup != None:
+ membergroup = postgroup.text.strip()
+ else:
+ membergroup = "-1"
+ status.append(cleanString(membergroup))
+
+ # Finding the interest of the author
+ # Helium does not have blurb
+
+ blurb = heading.find('li', {"class": "blurb"})
+ if blurb != None:
+ blurb = blurb.text.strip()
+ else:
+ blurb = "-1"
+ interest.append(cleanString(blurb))
+
+ # Finding the reputation of the user
+ # Helium does not have karma
+
+ karma = heading.find('li', {"class": "karma"})
+ if karma != None:
+ karma = karma.text
+ karma = karma.replace("Community Rating: ","")
+ karma = karma.replace("Karma: ","")
+ karma = karma.strip()
+ else:
+ karma = "-1"
+ reputation.append(cleanString(karma))
+
+ # Getting here another good tag to find the post date, post content and users' signature
+
+ postarea = ipost.find('div', {"class": "content_body"})
+
+ # Finding the date of the post
+ # Helium does not have date
+
+ addDate.append("-1")
+
+ # dt = ipost.find('p', {"class": "author"}).text.split('»')[1]
+ # # dt = dt.strip().split()
+ # dt = dt.strip()
+ # date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p')
+ # stime = date_time_obj.strftime('%a %b %d, %Y')
+ # sdate = date_time_obj.strftime('%I:%M %p')
+ # addDate.append(date_time_obj)
+
+ # date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p')
+ # smalltext = postarea.find('div', {"class": "flow_hidden"}).find('div', {"class": "keyinfo"})\
+ # .find('div', {"class": "smalltext"})
+ # sdatetime = smalltext.text
+ # sdatetime = sdatetime.replace(u"\xab","") # Removing unnecessary characters
+ # sdatetime = sdatetime.replace(u"\xbb","") # Removing unnecessary characters
+ # sdatetime = sdatetime.split("on: ") # Removing unnecessary characters
+ # sdatetime = sdatetime[1].strip()
+ # stime = sdatetime[:-12:-1] # Finding the time of the post
+ # stime = stime[::-1]
+ # sdate = sdatetime.replace(stime,"") # Finding the date of the post
+ # sdate = sdate.replace(",","")
+ # sdate = sdate.strip()
+
+        # Convert the date of the post, which can be given as "12 February 2016", "today" or "yesterday". We need
+        # a date format here as "mm/dd/yyyy"
+
+ #addDate.append(convertDate(sdate,"english", crawlerDate) + " " + stime)
+
+ # Finding the post
+
+ paragraphs = postarea.find_all('p')
+ p = ""
+ for paragraph in paragraphs:
+ p += paragraph.text.strip() + " "
+ quote = postarea.find('div', {"class": "standard_quote"})
+ if quote != None:
+ q = quote.text.strip()
+            p = p.replace(q, "")  # str.replace returns a new string, so keep the result
+ post.append(cleanString(p.strip()))
+
+        # Finding the user's signature
+ # Helium does not have signature
+
+ #signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
+ signature = ipost.find('div', {"class": "post_wrapper"})
+ if signature != None:
+ signature = signature.text.strip()
+ else:
+ signature = "-1"
+ sign.append(cleanString(signature))
+
+        # As no information about users' feedback was found, just assign "-1" to the variable
+
+ feedback.append("-1")
+
+ # Populate the final variable (this should be a list with all fields scraped)
+
+ row = (topic, post, user, addDate, feedback, status, reputation, sign, interest)
+
+ # Sending the results
+
+ return row
+
+
+# This is the method to parse the Listing Pages (one page with many posts)
+def helium_listing_parser(soup):
+
+ board = "-1" # board name (the previous level of the topic in the Forum categorization tree.
+ # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
+
+ nm = 0 # this variable should receive the number of topics
+ topic = [] # all topics
+ user = [] # all users of each topic
+ post = [] # number of posts of each topic
+ view = [] # number of views of each topic
+ addDate = [] # when the topic was created (difficult to find)
+    href = []                   # this variable should receive all cleaned urls (we will use this to do the merge between
+                                # Listing and Description pages)
+
+ # Finding the board (should be just one)
+
+ parents = soup.find('div', {"class": "col-md-12"}).findAll('li')
+ board = parents[1].text + u"->" + parents[2].get('title')
+ board = board.replace("\n", "")
+ board = cleanString(board.strip())
+
+ # Finding the repeated tag that corresponds to the listing of topics
+
+ itopics = soup.find('table', {"class": "table"}).find('tbody').findAll('td', {"class": "col-md-8"})
+ repliesViews = soup.find('table', {"class": "table"}).find('tbody').findAll('td', {"class": "col-md-2"})
+
+ # Counting how many topics we have found so far
+
+ nm = len(itopics)
+
+ index = 0
+ for itopic in itopics:
+
+ # Adding the topic to the topic list
+
+ topics = itopic.find('a').get('title')
+ topics = topics.replace(",", "")
+ topic.append(cleanString(topics.strip()))
+
+ # Adding the url to the list of urls
+ link = itopic.find('a').get('href')
+ link = cleanLink(link)
+ href.append(link)
+
+ # Finding the author of the topic
+ author = itopic.find('strong').text
+ user.append(cleanString(author.strip()))
+
+ rv = repliesViews[index].find('p').text.split()
+
+ # Finding the number of replies
+ posts = rv[0].replace("Replies", "")
+ post.append(cleanString(posts.strip()))
+
+ # Finding the number of Views
+ tview = rv[1].replace("Views", "")
+ view.append(cleanString(tview.strip()))
+
+ # If no information about when the topic was added, just assign "-1" to the variable
+ # dt = itopic.find('div', {"class": "responsive-hide"}).text.split('»')[1]
+ # dt = dt.strip()
+ # date_time_obj = datetime.strptime(dt,'%a %b %d, %Y %I:%M %p')
+ # addDate.append(date_time_obj)
+ addDate.append("-1")
+ index += 1
+
+ return organizeTopics("Helium", nm, topic, board, view, post, user, addDate, href)
+
+
+def helium_links_parser(soup):
+
+ # Returning all links that should be visited by the Crawler
+
+ href = []
+
+ listing = soup.find('table', {"class": "table"}).find('tbody').findAll('td', {"class": "col-md-8"})
+
+ for a in listing:
+ bae = a.find('a', href=True)
+ link = bae['href']
+ href.append(link)
+
+ return href
\ No newline at end of file
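helium_description_parser repeats the same defensive pattern for membergroup, postgroup, blurb and karma: find the tag, take its text if it exists, otherwise fall back to "-1". A small helper, shown here only as a sketch and not part of the repository, captures that pattern in one place:

```python
def text_or_default(parent, name, attrs, default="-1"):
    """Return the stripped text of the first matching tag, or `default` if it is absent."""
    tag = parent.find(name, attrs) if parent is not None else None
    return tag.text.strip() if tag is not None else default

# e.g. reputation.append(cleanString(text_or_default(heading, 'li', {"class": "karma"})))
```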
diff --git a/Forums/HiddenAnswers/parser.py b/Forums/HiddenAnswers/parser.py
deleted file mode 100644
index 0f2647f..0000000
--- a/Forums/HiddenAnswers/parser.py
+++ /dev/null
@@ -1,212 +0,0 @@
-__author__ = 'Helium'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from typing import List
-from Forums.Utilities.utilities import *
-from datetime import date
-from datetime import timedelta
-import re
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup, ResultSet, Tag
-
-# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
-
-def HiddenAnswers_description_parser(soup: BeautifulSoup):
-
- # Fields to be parsed
-
- topic: str = "-1" # 0 topic name
- user: List[str] = [] # 1 all users of each post
- addDate: List[datetime] = [] # 2 all dated of each post
- feedback: List[str] = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format)
- status: List[str] = [] # 4 all user's authority in each post such as (adm, member, dangerous)
- reputation: List[str] = [] # 5 all user's karma in each post (usually found as a number)
- sign: List[str] = [] # 6 all user's signature in each post (usually a standard message after the content of the post)
- post: List[str] = [] # 7 all messages of each post
- interest: List[str] = [] # 8 all user's interest in each post
- image_user = [] # 9 all user avatars of each post
- image_post = [] # 10 all first images of each post
-
- # Finding the topic (should be just one coming from the Listing Page)
- li = soup.find("h1").find("span", {"itemprop": "name"})
- topic = li.text
-
- question: Tag = soup.find("div", {"class": "qa-part-q-view"})
-
- question_user = question.find("span", {"class": "qa-q-view-who-data"}).text
- user.append(cleanString(question_user.strip()))
-
- question_time = question.find("span", {"class": "qa-q-view-when-data"}).find("time").get("datetime")
- datetime_string = question_time.split("+")[0]
- datetime_obj = datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%S")
- addDate.append(datetime_obj)
-
- question_user_status = question.find("span", {"class": "qa-q-view-who-title"})
- if question_user_status is not None:
- question_user_status = question_user_status.text
- status.append(cleanString(question_user_status.strip()))
- else:
- status.append('-1')
-
- question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"})
- if question_user_karma is not None:
- question_user_karma = question_user_karma.text
- # Convert karma to pure numerical string
- if question_user_karma.find("k") > -1:
- question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000)
- reputation.append(cleanString(question_user_karma.strip()))
- else:
- reputation.append('-1')
-
- question_content = question.find("div", {"class": "qa-q-view-content qa-post-content"}).text
- post.append(cleanString(question_content.strip()))
-
- feedback.append("-1")
- sign.append("-1")
- interest.append("-1")
-
- img = question.find('div', {"class": "qa-q-view-content qa-post-content"}).find('img')
- if img is not None:
- img = img.get('src').split('base64,')[-1]
- else:
- img = "-1"
- image_post.append(img)
-
- img = question.find('span', {"class": "qa-q-view-avatar-meta"}).find('img')
- if img is not None:
- img = img.get('src').split('base64,')[-1]
- else:
- img = "-1"
- image_user.append(img)
-
- answer_list: ResultSet[Tag] = soup.find("div", {"class": "qa-a-list"}).find_all("div", {"class": "qa-a-list-item"})
-
-
- for replies in answer_list:
- user_name = replies.find("span", {"class", "qa-a-item-who-data"}).text
- user.append(cleanString(user_name.strip()))
-
- date_added = replies.find("span", {"class": "qa-a-item-when"}).find("time", {"itemprop": "dateCreated"}).get('datetime')
- date_string = date_added.split("+")[0]
- datetime_obj = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S")
- addDate.append(datetime_obj)
-
-
- post_data = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).text
- post.append(cleanString(post_data.strip()))
-
- user_reputations = replies.find("span", {"class", "qa-a-item-who-title"})
- if user_reputations is not None:
- user_reputations = user_reputations.text
- status.append(cleanString(user_reputations.strip()))
- else:
- status.append('-1')
-
- karma = replies.find("span", {"class": "qa-a-item-who-points-data"})
- if karma is not None:
- karma = karma.text
- # Convert karma to pure numerical string
- if karma.find("k") > -1:
- karma = str(float(karma.replace("k", "")) * 1000)
- reputation.append(cleanString(karma.strip()))
- else:
- reputation.append('-1')
-
- feedback.append("-1")
- sign.append("-1")
- interest.append("-1")
-
- img = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).find('img')
- if img is not None:
- img = img.get('src').split('base64,')[-1]
- else:
- img = "-1"
- image_post.append(img)
-
- img = replies.find('span', {"class": "qa-a-item-avatar-meta"}).find('img')
- if img is not None:
- img = img.get('src').split('base64,')[-1]
- else:
- img = "-1"
- image_user.append(img)
-
- # Populate the final variable (this should be a list with all fields scraped)
-
- row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
-
- # Sending the results
- return row
-
-
-def HiddenAnswers_listing_parser(soup: BeautifulSoup):
-
- nm: int = 0 # this variable should receive the number of topics
- forum: str = "HiddenAnswers" # 0 *forum name
- board = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree.
- # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
- user: List[str] = [] # 2 all users of each topic
- topic: List[str] = [] # 3 all topics
- view: List[int] = [] # 4 number of views of each topic
- post: List[int] = [] # 5 number of posts of each topic
- href: List[str] = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
- # Listing and Description pages)
- addDate: List[str] = [] # 7 when the topic was created (difficult to find)
- image_user = [] # 8 all user avatars used in each topic
-
- # Finding the board
- board = soup.find("div", {"class": "qa-main-heading"}).find("h1").text
- board = board.replace('Recent questions in', '')
- board = cleanString(board.strip())
-
- queries_by_user: ResultSet[Tag] = soup.find("div", {"class": "qa-q-list"}).find_all("div", {"class": "qa-q-list-item"})
-
- for queries in queries_by_user:
- topic_of_query = queries.find("div", {"class": "qa-q-item-title"}).find("a").text
- topic.append(cleanString(topic_of_query.strip()))
-
- image_user.append("-1") # qa-q-item-where
-
- author = queries.find("span", {"class": "qa-q-item-who-data"}).text
- user.append(cleanString(author.strip()))
-
- num_answers = queries.find("span", {"class": "qa-a-count-data"}).text
- post.append(cleanString(num_answers.strip()))
-
- view.append("-1")
-
- date_posted = queries.find("span", {"class": "qa-q-item-when-data"}).text
-
- if date_posted.find("day") > 0:
- datetime_obj = datetime.now() - timedelta(days=1)
- else:
- try:
- datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
- except ValueError:
- datetime_obj = datetime.strptime(f"{date_posted}", "%b %d, %Y")
- addDate.append(datetime_obj)
- #this link will be cleaned
-
- listing_href = queries.find("div", {"class": "qa-q-item-title"}).find("a").get("href")
- href.append(listing_href)
-
- nm = len(topic)
-
- return organizeTopics(forum, nm, board, user, topic, view, post, href, addDate, image_user)
-
-#need to change this method
-def hiddenanswers_links_parser(soup):
-
- # Returning all links that should be visited by the Crawler
-
- href = []
- #print(soup.find('table', {"class": "tborder clear"}).find(
- # 'tbody').find_all('tr', {"class": "inline_row"}))
- listing = soup.find_all('div', {"class": "qa-q-item-title"})
-
- for a in listing:
- link = a.find('a').get('href')
-
- href.append(link)
-
- return href
\ No newline at end of file
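The HiddenAnswers parser removed in this diff normalizes abbreviated karma values such as "1.2k" by stripping the suffix and multiplying by 1000 before storing them as strings. A standalone sketch of that conversion, for illustration only:

```python
def normalize_karma(raw: str) -> str:
    """Turn an abbreviated karma string like '1.2k' into a plain numeric string."""
    raw = raw.strip()
    if "k" in raw:
        return str(float(raw.replace("k", "")) * 1000)   # '1.2k' -> '1200.0'
    return raw

assert normalize_karma("1.2k") == "1200.0"
assert normalize_karma("57") == "57"
```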
diff --git a/Forums/Initialization/forums_mining.py b/Forums/Initialization/forums_mining.py
index 4d68840..af7ce47 100644
--- a/Forums/Initialization/forums_mining.py
+++ b/Forums/Initialization/forums_mining.py
@@ -6,14 +6,6 @@ Starting point of the Darkweb Forums Mining
from datetime import *
from Forums.BestCardingWorld.crawler_selenium import crawler as crawlerBestCardingWorld
-from Forums.CryptBB.crawler_selenium import crawler as crawlerCryptBB
-from Forums.OnniForums.crawler_selenium import crawler as crawlerOnniForums
-from Forums.AbyssForum.crawler_selenium import crawler as crawlerAbyssForum
-from Forums.Procrax.crawler_selenium import crawler as crawlerProcraxForum
-from Forums.HiddenAnswers.crawler_selenium import crawler as crawlerHiddenAnswers
-from Forums.Cardingleaks.crawler_selenium import crawler as crawlerCardingleaks
-from Forums.Altenens.crawler_selenium import crawler as crawlerAltenens
-from Forums.Libre.crawler_selenium import crawler as crawlerLibre
import configparser
import os
@@ -118,22 +110,6 @@ if __name__ == '__main__':
if forum == "BestCardingWorld":
crawlerBestCardingWorld()
- elif forum == "CryptBB":
- crawlerCryptBB()
- elif forum == "OnniForums":
- crawlerOnniForums()
- elif forum == "AbyssForum":
- crawlerAbyssForum()
- elif forum == "HiddenAnswers":
- crawlerHiddenAnswers()
- elif forum == 'Procrax':
- crawlerProcraxForum()
- elif forum == 'Cardingleaks':
- crawlerCardingleaks()
- elif forum == 'Altenens':
- crawlerAltenens()
- elif forum == 'Libre':
- crawlerLibre()
print("\nScraping process completed!")
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index 79b79a7..91b662f 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -8,14 +8,6 @@ from psycopg2.extras import RealDictCursor
from Forums.DB_Connection.db_connection import *
from Forums.BestCardingWorld.parser import *
-from Forums.Cardingleaks.parser import *
-from Forums.CryptBB.parser import *
-from Forums.OnniForums.parser import *
-from Forums.Altenens.parser import *
-from Forums.Procrax.parser import *
-from Forums.Libre.parser import *
-from Forums.HiddenAnswers.parser import *
-from Forums.AbyssForum.parser import *
from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@@ -124,22 +116,6 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
if forum == "BestCardingWorld":
rw = bestcardingworld_listing_parser(soup)
- elif forum == "Cardingleaks":
- rw = cardingleaks_listing_parser(soup)
- elif forum == "CryptBB":
- rw = cryptBB_listing_parser(soup)
- elif forum == "OnniForums":
- rw = onniForums_listing_parser(soup)
- elif forum == "Altenens":
- rw = altenens_listing_parser(soup)
- elif forum == "Procrax":
- rw = procrax_listing_parser(soup)
- elif forum == "Libre":
- rw = libre_listing_parser(soup)
- elif forum == "HiddenAnswers":
- rw = HiddenAnswers_listing_parser(soup)
- elif forum == "AbyssForum":
- rw = abyssForums_listing_parser(soup)
else:
print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
raise Exception
@@ -163,22 +139,6 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
if forum == "BestCardingWorld":
rmm = bestcardingworld_description_parser(soup)
- elif forum == "Cardingleaks":
- rmm = cardingleaks_description_parser(soup)
- elif forum == "CryptBB":
- rmm = cryptBB_description_parser(soup)
- elif forum == "OnniForums":
- rmm = onniForums_description_parser(soup)
- elif forum == "Altenens":
- rmm = altenens_description_parser(soup)
- elif forum == "Procrax":
- rmm = procrax_description_parser(soup)
- elif forum == "Libre":
- rmm = libre_description_parser(soup)
- elif forum == "HiddenAnswers":
- rmm = HiddenAnswers_description_parser(soup)
- elif forum == "AbyssForum":
- rmm = abyssForums_description_parser(soup)
else:
print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
raise Exception
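Both trimmed hunks above (forums_mining.py and prepare_parser.py) dispatch on the forum name through if/elif chains that print a message and raise when no match is found. As a design sketch only, not the project's current approach, the same dispatch can live in a lookup table; the parser names below are the ones prepare_parser.py already imports:

```python
from Forums.BestCardingWorld.parser import (
    bestcardingworld_listing_parser,
    bestcardingworld_description_parser,
)

# forum name -> (listing parser, description parser)
PARSERS = {
    "BestCardingWorld": (bestcardingworld_listing_parser, bestcardingworld_description_parser),
}

def dispatch_listing_parser(forum, soup):
    try:
        listing_parser, _ = PARSERS[forum]
    except KeyError:
        raise Exception("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
    return listing_parser(soup)
```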
diff --git a/Forums/Libre/crawler_selenium.py b/Forums/Libre/crawler_selenium.py
deleted file mode 100644
index 19a05aa..0000000
--- a/Forums/Libre/crawler_selenium.py
+++ /dev/null
@@ -1,302 +0,0 @@
-__author__ = 'DarkWeb'
-
-'''
-Libre Forum Crawler (Selenium)
-'''
-
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
-
-from PIL import Image
-import urllib.parse as urlparse
-import os, re, time
-import subprocess
-from bs4 import BeautifulSoup
-from Forums.Initialization.prepare_parser import new_parse
-from Forums.Libre.parser import libre_links_parser
-from Forums.Utilities.utilities import cleanHTML
-
-counter = 1
-baseURL = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/'
-
-
-# Opens Tor Browser, crawls the website
-def startCrawling():
- forumName = getForumName()
- driver = getAccess()
-
- if driver != 'down':
- try:
- login(driver)
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closeDriver(driver)
-
- new_parse(forumName, baseURL, True)
-
-
-# Login using premade account credentials and do login captcha manually
-def login(driver):
-
- input('Press enter when CAPTCHA is completed, and you\'re at the login page')
-
- #entering username and password into input boxes
- usernameBox = driver.find_element(by=By.NAME, value='username')
- #Username here
- usernameBox.send_keys('ct1234')#sends string to the username box
- passwordBox = driver.find_element(by=By.NAME, value='password')
- #Password here
- passwordBox.send_keys('r5o0wqmw')# sends string to passwordBox
-
- input("Press the login button and solve the CAPTCHA then press enter\n")
-
- # input('input')
-
- # wait for listing page show up (This Xpath may need to change based on different seed url)
- # wait for 50 sec until id = tab_content is found, then cont
- WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
- (By.TAG_NAME, 'nav')))
-
- # click link to correct forum board
- login_link = driver.find_element(by=By.XPATH, value='/html/body/nav/div[1]/a[3]').get_attribute('href')
- driver.get(login_link) # open tab with url
-
- # wait for listing page show up (This Xpath may need to change based on different seed url)
- # wait for 50 sec until id = tab_content is found, then cont
- WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
- (By.XPATH, '/html/body/div/div/div[3]/div[5]')))
-
-
-# Returns the name of the website
-def getForumName() -> str:
- name = 'Libre'
- return name
-
-
-# Return the link of the website
-def getFixedURL():
- url = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/'
- return url
-
-
-# Closes Tor Browser
-def closeDriver(driver):
- # global pid
- # os.system("taskkill /pid " + str(pro.pid))
- # os.system("taskkill /t /f /im tor.exe")
- print('Closing Tor...')
- driver.close() #close tab
- time.sleep(3)
- return
-
-
-# Creates FireFox 'driver' and configure its 'Profile'
-# to use Tor proxy and socket
-def createFFDriver():
- from Forums.Initialization.forums_mining import config
-
- ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
- ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
- ff_prof.set_preference("permissions.default.image", 3)
- ff_prof.set_preference("browser.download.folderList", 2)
- ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
- ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
- ff_prof.set_preference('network.proxy.type', 1)
- ff_prof.set_preference("network.proxy.socks_version", 5)
- ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
- ff_prof.set_preference('network.proxy.socks_port', 9150)
- ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", True)
- ff_prof.update_preferences()
-
- service = Service(config.get('TOR', 'geckodriver_path'))
-
- driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
- driver.maximize_window()
-
- return driver
-
-
-def getAccess():
- url = getFixedURL()
- driver = createFFDriver()
- try:
- driver.get(url)
- return driver
- except:
- driver.close()
- return 'down'
-
-
-# Saves the crawled html page
-def savePage(driver, page, url):
- cleanPage = cleanHTML(driver, page)
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- open(filePath, 'wb').write(cleanPage.encode('utf-8'))
- return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-def getFullPathName(url):
- from Forums.Initialization.forums_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
- else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
- return fullPath
-
-
-# Creates the file name from passed URL
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if name == '':
- name = str(counter)
- counter = counter + 1
- return name
-
-
-def getInterestedLinks():
- links = []
-
- # cybersecurity
- links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/CyberSecurity')
- # services
- links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/Services')
- # programming
- links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/Programming')
- # jobs for crypto
- links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/JobsforCypto')
- # darknet markets
- links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/DarkNetMarkets')
-
- return links
-
-
-def crawlForum(driver):
- print("Crawling the Libre forum")
-
- linksToCrawl = getInterestedLinks()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
- try:
- has_next_page = True
- count = 0
-
- while has_next_page:
- try:
- driver.get(link)
- except:
- driver.refresh()
- html = driver.page_source
- savePage(driver, html, link)
-
- topics = topicPages(html)
- for topic in topics:
- has_next_topic_page = True
- counter = 1
- page = topic
-
- while has_next_topic_page:
- itemURL = urlparse.urljoin(baseURL, str(page))
- try:
- driver.get(itemURL)
- except:
- driver.refresh()
-
- if isListingLink(driver.current_url):
- break
-
- savePage(driver, driver.page_source, topic + f"page{counter}") # very important
-
- # # comment out
- # if counter == 2:
- # break
-
- try:
- page = "" # no next page so far may have some later on
- if page == "":
- raise NoSuchElementException
- counter += 1
-
- except NoSuchElementException:
- has_next_topic_page = False
-
- # making sure we go back to the listing page (browser back button simulation)
- try:
- driver.get(link)
- except:
- driver.refresh()
-
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
-
- try:
- link = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href')
-
- if link == "":
- raise NoSuchElementException
- count += 1
-
- except NoSuchElementException:
- has_next_page = False
-
- except Exception as e:
- print(link, e)
- i += 1
-
- print("Crawling the Libre forum done.")
-
-
-# Returns 'True' if the link is Topic link, may need to change for every website
-def isDescriptionLink(url):
- if '/p/' in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link, may need to change for every website
-def isListingLink(url):
- if '.onion/c' in url:
- return True
- return False
-
-
-# calling the parser to define the links
-def topicPages(html):
- soup = BeautifulSoup(html, "html.parser")
- return libre_links_parser(soup)
-
-
-def crawler():
- startCrawling()
- # print("Crawling and Parsing BestCardingWorld .... DONE!")
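Every crawler in this diff builds its save path the way getFullPathName() does above, splicing r'\\Description\\' or r'\\Listing\\' into an os.path.join call, which ties the directory layout to Windows-style separators. A platform-neutral sketch of the same idea (main_dir, current_date and file_name stand in for the values the crawler computes from config, CURRENT_DATE and getNameFromURL):

```python
import os

def build_save_path(main_dir, current_date, file_name, is_description):
    subdir = "Description" if is_description else "Listing"
    return os.path.join(main_dir, current_date, subdir, file_name + ".html")

# e.g. build_save_path(mainDir, CURRENT_DATE, getNameFromURL(url), isDescriptionLink(url))
```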
diff --git a/Forums/Libre/parser.py b/Forums/Libre/parser.py
deleted file mode 100644
index 16113f7..0000000
--- a/Forums/Libre/parser.py
+++ /dev/null
@@ -1,249 +0,0 @@
-__author__ = 'DarkWeb'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from Forums.Utilities.utilities import *
-from datetime import date
-from datetime import timedelta
-import re
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup, ResultSet, Tag
-
-
-# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
-def libre_description_parser(soup: Tag):
- # Fields to be parsed
-
- topic = "-1" # 0 *topic name
- user = [] # 1 *all users of each post
- status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
- reputation = [] # 3 all user's karma in each post (usually found as a number)
- interest = [] # 4 all user's interest in each post
- sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
- post = [] # 6 all messages of each post
- feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
- addDate = [] # 8 all dates of each post
- image_user = [] # 9 all user avatars of each post
- image_post = [] # 10 all first images of each post
-
- # Finding the topic (should be just one coming from the Listing Page)
-
- topic_found = soup.find("a", {"class": "link text-xl text-zinc-300"}).text
- topic = cleanString(topic_found.strip())
-
- original_post: Tag = soup.find("div", {"class": "flex items-start"})
-
- original_user = original_post.find("div", {"class": "info-p"}).find("a", {"class": "link"}).text
- user.append(cleanString(original_user.replace("/u/", "").strip()))
-
- original_user_statistics: ResultSet[Tag] = original_post.find("div", {"class": "info-p"}).find_all("span")
-
- original_time = original_user_statistics[0].text[2:]
- datetime_append = datetime.strptime(original_time, "%Y-%m-%d %H:%M:%S GMT")
- addDate.append(datetime_append)
-
- original_karma = original_user_statistics[1].text[2]
- reputation.append(cleanString(original_karma.strip()))
-
- original_content = soup.find("div", {"class": "content-p"}).text
- post.append(cleanString(original_content.strip()))
-
-
- status.append("-1")
- interest.append("-1")
- sign.append("-1")
- feedback.append("-1")
-
- image_post.append("-1")
-
- img = original_post.find('img')
- if img is not None:
- img = img.get('src').split('base64,')[-1]
- else:
- img = "-1"
- image_user.append(img)
-
- # Finding the repeated tag that corresponds to the listing of posts
-
- # try:
- posts: ResultSet[Tag] = soup.find_all("div", {"class": "flex items-stretch"})
-
- # For each message (post), get all the fields we are interested to:
-
- for ipost in posts:
- # Finding a first level of the HTML page
-
- # Finding the author (user) of the post
-
- user_name = ipost.find("p", {"class": "text-zinc-400 text-justify"}).find("a", {"class": "link"}).text
- user.append(cleanString(user_name.replace("/u/", "").strip())) # Remember to clean the problematic characters
-
- status.append("-1")
-
- # Finding the interest of the author
- # CryptBB does not have blurb
-
- interest.append("-1")
-
- # Finding the reputation of the user
- # CryptBB does have reputation
-
- karma = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
- karma_cleaned = karma.split(" ")[6]
- reputation.append(cleanString(karma_cleaned.strip()))
-
- # Getting here another good tag to find the post date, post content and users' signature
-
- date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
- date_time_cleaned = date_posted.replace(user_name, "")[3:-12]
- datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT")
- addDate.append(datetime_append)
-
- # Finding the post
- user_post = ipost.find("div", {"class": "content-c"}).text
- post.append(cleanString(user_post))
-
- # Finding the user's signature
-
- sign.append("-1")
-
- # As no information about user's feedback was found, just assign "-1" to the variable
-
- feedback.append("-1")
-
- # As no information about post's image was found, just assign "-1" to the variable
-
- image_post.append("-1")
-
- # As no information about user's image was found, just assign "-1" to the variable
-
- image_user.append("-1")
-
- # Populate the final variable (this should be a list with all fields scraped)
- # print(topic)
- # print(user)
- # print(status)
- # print(reputation)
- # print(interest)
- # print(sign)
- # print(post)
- # print(feedback)
- # print(addDate)
- # print(len(user))
- # print(len(status))
- # print(len(reputation))
- # print(len(interest))
- # print(len(sign))
- # print(len(feedback))
- # print(len(addDate))
-
- row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
-
- # Sending the results
-
- return row
-
-
-# This is the method to parse the Listing Pages (one page with many posts)
-def libre_listing_parser(soup):
- nm = 0 # *this variable should receive the number of topics
- forum = "Libre" # 0 *forum name
- board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
- # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
- author = [] # 2 *all authors of each topic
- topic = [] # 3 *all topics
- views = [] # 4 number of views of each topic
- posts = [] # 5 number of posts of each topic
- href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between
- # Listing and Description pages)
- addDate = [] # 7 when the topic was created (difficult to find)
- image_author = [] # 8 all author avatars used in each topic
-
- # Finding the board (should be just one)
-
- board = soup.find('div', {"class": "title"}).find("h1").text
- board = cleanString(board.strip())
-
- # Finding the repeated tag that corresponds to the listing of topics
-
- itopics = soup.find("div", {"class", "space-y-2 mt-4"}).find_all('div', {"class": "flex box"})
-
- nm = 0
- for itopic in itopics:
- nm += 1
- # For each topic found, the structure to get the rest of the information can be of two types. Testing all of them
- # to don't miss any topic
-
- # Adding the topic to the topic list
- topic_string = itopic.find("a", {"class": "link text-xl text-zinc-300"}).text
- cleaned_topic_string = cleanString(topic_string.strip())
- topic.append(cleaned_topic_string)
-
- image_author.append("-1")
-
- # Adding the url to the list of urls
- link_to_clean = itopic.find('div', {'class': 'flex space-x-2 items-center'}).find('a').get('href')
-
- href.append(link_to_clean)
-
- # Finding the author of the topic
- username_not_cleaned = itopic.find('div', {"class": "flex-grow p-2 text-justify"}).find('a').text
- username_cleaned = username_not_cleaned.split("/")[-1]
- author.append(cleanString(username_cleaned))
-
- # Finding the number of views
- num_views = itopic.find_all("div", {"class": "flex items-center"})[0].find("p").text
- views.append(cleanString(num_views))
-
- # Finding the number of replies
- num_replies = itopic.find_all("div", {"class": "flex items-center"})[1].find("p").text
- posts.append(cleanString(num_replies))
-
- # If no information about when the topic was added, just assign "-1" to the variable
-
- date_time_concatenated = itopic.find("p", {"class": "text-sm text-zinc-400 italic"}).text
- date_time_cleaned = date_time_concatenated.replace(username_not_cleaned, "")
- # creating the datetime object
- date_time_array = date_time_cleaned[3:]
- datetime_append = datetime.strptime(date_time_array, "%Y-%m-%d %H:%M:%S GMT")
- addDate.append(datetime_append)
-
- # print(forum)
- # print(nm)
- # print(board)
- # print(author)
- # print(topic)
- # print(views)
- # print(href)
- # print(addDate)
- # print(len(author))
- # print(len(topic))
- # print(len(views))
- # print(len(href))
- # print(len(addDate))
-
- return organizeTopics(
- forum=forum,
- nm=nm,
- board=board,
- author=author,
- topic=topic,
- views=views,
- posts=posts,
- href=href,
- addDate=addDate,
- image_author=image_author
- )
-
-
-def libre_links_parser(soup):
- # Returning all links that should be visited by the Crawler
- href = []
- listing = soup.find("div", {"class", "space-y-2 mt-4"}).find_all('div', {"class": "flex box"})
-
- for a in listing:
- link = a.find('div', {'class': 'flex space-x-2 items-center'}).find('a').get('href')
-
- href.append(link)
-
- return href
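libre_links_parser, like the other *_links_parser functions, returns hrefs exactly as they appear in the page; it is crawlForum() that makes them absolute with urlparse.urljoin before visiting them. A small illustration (the example hrefs are hypothetical) of why that join handles both relative and absolute links:

```python
import urllib.parse as urlparse

base = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/'

for href in ['/p/some-topic', 'c/CyberSecurity', base + 'p/already-absolute']:
    print(urlparse.urljoin(base, href))
# Relative paths are resolved against the base URL; absolute URLs pass through unchanged.
```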
diff --git a/Forums/OnniForums/crawler_selenium.py b/Forums/OnniForums/crawler_selenium.py
deleted file mode 100644
index d801d29..0000000
--- a/Forums/OnniForums/crawler_selenium.py
+++ /dev/null
@@ -1,310 +0,0 @@
-__author__ = 'Helium'
-
-'''
-OnniForums Crawler (Selenium)
-Now goes through multiple topic pages.
-'''
-
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
-from PIL import Image
-
-import urllib.parse as urlparse
-import os, re, time
-import configparser
-from datetime import date
-import subprocess
-from bs4 import BeautifulSoup
-from Forums.Initialization.prepare_parser import new_parse
-from Forums.OnniForums.parser import onniForums_links_parser
-from Forums.Utilities.utilities import cleanHTML
-
-counter = 1
-baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/'
-
-
-# Opens Tor Browser, crawls the website
-def startCrawling():
- forumName = getForumName()
- driver = getAccess()
-
- if driver != 'down':
- try:
- login(driver)
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closeDriver(driver)
-
- new_parse(forum=forumName, url=baseURL, createLog=True)
-
-
-# Login using premade account credentials and do login captcha manually
-def login(driver):
- #click login button
- login_link = driver.find_element(
- by=By.XPATH, value='/html/body/div/div[1]/div[2]/div[1]/div/span/a[1]').get_attribute('href')
- driver.get(login_link)
-
- #entering username and password into input boxes
- usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input')
- #Username here
- usernameBox.send_keys('cabbage_purely')
- passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input')
- #Password here
- passwordBox.send_keys('$ourP@tchK1ds')
-
- clicker = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/div/input')
- clicker.click()
-
- # wait for listing page show up (This Xpath may need to change based on different seed url)
- WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
- (By.XPATH, '//*[@id="content"]')))
-
-
-# Returns the name of the website
-def getForumName():
- name = 'OnniForums'
- return name
-
-
-# Return the link of the website
-def getFixedURL():
- url = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/'
- return url
-
-
-# Closes Tor Browser
-def closeDriver(driver):
- # global pid
- # os.system("taskkill /pid " + str(pro.pid))
- # os.system("taskkill /t /f /im tor.exe")
- print('Closing Tor...')
- driver.close()
- time.sleep(3)
- return
-
-
-# Creates FireFox 'driver' and configure its 'Profile'
-# to use Tor proxy and socket
-def createFFDriver():
- from Forums.Initialization.forums_mining import config
-
- ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
- ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
- ff_prof.set_preference("permissions.default.image", 3)
- ff_prof.set_preference("browser.download.folderList", 2)
- ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
- ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
- ff_prof.set_preference('network.proxy.type', 1)
- ff_prof.set_preference("network.proxy.socks_version", 5)
- ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
- ff_prof.set_preference('network.proxy.socks_port', 9150)
- ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", True)
- ff_prof.update_preferences()
-
- service = Service(config.get('TOR', 'geckodriver_path'))
-
- driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
- driver.maximize_window()
-
- return driver
-
-
-def getAccess():
- url = getFixedURL()
- driver = createFFDriver()
- try:
- driver.get(url)
- return driver
- except:
- driver.close()
- return 'down'
-
-
-# Saves the crawled html page
-def savePage(driver, page, url):
- cleanPage = cleanHTML(driver, page)
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- open(filePath, 'wb').write(cleanPage.encode('utf-8'))
- return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-def getFullPathName(url):
- from Forums.Initialization.forums_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
- else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
- return fullPath
-
-
-# Creates the file name from passed URL
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if (name == ''):
- name = str(counter)
- counter = counter + 1
- return name
-
-
-def getInterestedLinks():
- links = []
-
- # Hacking & Cracking tutorials
- links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-tutorials')
- # # Hacking & Cracking questions
- # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-questions')
- # # Exploit PoCs
- # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Exploit-PoCs')
- # # sellers
- # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Sellers')
- # # buyers questions
- # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Buyers-Questions')
- # # combo lists
- # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Combo-lists')
- # # Malware-development
- # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Malware-development')
- # # coding
- # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Coding')
- # # Carding & Fraud
- # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Carding-Fraud')
- # # OPSEC
- # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-OPSEC--13')
-
- return links
-
-
-def crawlForum(driver):
- print("Crawling the OnniForums forum")
-
- linksToCrawl = getInterestedLinks()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
- try:
- has_next_page = True
- count = 0
-
- while has_next_page:
- try:
- driver.get(link)
- except:
- driver.refresh()
- html = driver.page_source
- savePage(driver, html, link)
-
- topics = topicPages(html)
- for topic in topics:
- has_next_topic_page = True
- counter = 1
- page = topic
-
- while has_next_topic_page:
- itemURL = urlparse.urljoin(baseURL, str(page))
- try:
- driver.get(itemURL)
- except:
- driver.refresh()
-
- if isListingLink(driver.current_url):
- break
-
- savePage(driver, driver.page_source, topic + f"page{counter}") # very important
-
- # # comment out
- # if counter == 2:
- # break
-
- try:
- temp = driver.find_element(by=By.CLASS_NAME, value='float_left')
- page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
-
- if page == "":
- raise NoSuchElementException
- counter += 1
-
- except NoSuchElementException:
- has_next_topic_page = False
-
- # making sure we go back to the listing page (browser back button simulation)
- try:
- driver.get(link)
- except:
- driver.refresh()
-
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
-
- try:
- temp = driver.find_element(by=By.CLASS_NAME, value='float_left')
- link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
-
- if link == "":
- raise NoSuchElementException
- count += 1
-
- except NoSuchElementException:
- has_next_page = False
-
- except Exception as e:
- print(link, e)
- i += 1
-
- print("Crawling the OnniForums forum done.")
-
-
-# Returns 'True' if the link is Topic link
-def isDescriptionLink(url):
- if 'Thread' in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link
-def isListingLink(url):
- if '.onion/Forum' in url:
- return True
- return False
-
-
-# calling the parser to define the links
-def topicPages(html):
- soup = BeautifulSoup(html, "html.parser")
- #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
- return onniForums_links_parser(soup)
-
-
-def crawler():
- startCrawling()
- # print("Crawling and Parsing BestCardingWorld .... DONE!")
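crawlForum() above pages through listings and topics with the same idiom: look for the element with class 'pagination_next' inside 'float_left', follow its href, and stop when Selenium raises NoSuchElementException or the href is empty. Condensed into a generator purely as an illustration, not how the repository structures it:

```python
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

def iter_listing_pages(driver, first_url):
    url = first_url
    while url:                 # an empty href ends the loop, like the original's check
        driver.get(url)
        yield driver.page_source
        try:
            nav = driver.find_element(by=By.CLASS_NAME, value='float_left')
            url = nav.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
        except NoSuchElementException:
            url = None         # no next-page link: stop paging
```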
diff --git a/Forums/OnniForums/parser.py b/Forums/OnniForums/parser.py
deleted file mode 100644
index 72674b1..0000000
--- a/Forums/OnniForums/parser.py
+++ /dev/null
@@ -1,222 +0,0 @@
-__author__ = 'DarkWeb'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from typing import List
-from Forums.Utilities.utilities import *
-from datetime import date
-from datetime import timedelta
-import re
-import string
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup
-
-# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
-
-def onniForums_description_parser(soup: BeautifulSoup) -> tuple:
-
- topicName: str = "-1" # 0 *topic name
- users : List[str] = [] # 1 *all users of each post
- statuses : List[str] = [] # 2 all user's authority in each post such as (adm, member, dangerous)
- reputations : List[str] = [] # 3 all user's karma in each post (usually found as a number)
- interests : List[str] = [] # 4 all user's interest in each post
- signs : List[str] = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
- posts : List[str] = [] # 6 all messages of each post
- feedbacks : List[str] = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
- addDates : List[datetime] = [] # 8 all dates of each post
- image_user : List[str] = [] # 9 all user avatars of each post
- image_post : List[str] = [] # 10 all first images of each post
-
- # Getting the topicName
- topicName = soup.find("table", {"class": "tborder tfixed clear"}) \
- .find("td", {"class": "thead"}) \
- .find_all("div")[-1].text
-
- topicName = cleanString(topicName.strip())
-
- topics_array = soup.find_all("div", {"class": "post"})
-
-
- for topic in topics_array:
- # Extracting and cleaning author information
- author_information: BeautifulSoup = topic.find("div", {"class": "author_information"})
-
- username: str = author_information.find("span", {"class": "largetext"}).text
- username_cleaned = cleanString(username.strip())
- users.append(username_cleaned)
-
- user_status: str = author_information.find("span", {"class": "smalltext"}).text
-
-
- # Banned users often have weird text issues in HTML
- # So we detect banned users and give them a unique string
- if user_status.find("Banned") > 0: user_status_cleaned = "Banned"
-
- elif user_status.find("Unregistered") > 0: user_status_cleaned = "Unregistered"
-
- else: user_status_cleaned = cleanString(user_status.strip()) # Remove excessive spaces in string
-
- # Add cleaned data into array
- statuses.append(user_status_cleaned)
-
- if user_status_cleaned in ['Unregistered', 'Banned']: reputations.append(-1)
- else:
- author_statistics: BeautifulSoup = topic.find("div", {"class": "author_statistics"})
-
- reputation: str = author_statistics.find_all("div", {"class": "float_right"})[-1].text
- reputation_cleaned = cleanString(reputation.strip())
- reputations.append(reputation_cleaned)
-
- # Append a "-1" to `interests` and `signs` array since they don't exist on this forum
- interests.append("-1")
- signs.append("-1")
-
- post_content: str = topic.find("div", {"class": "post_body scaleimages"}).text
- # Clean post content of excessive spaces and characters
- post_content_cleaned = post_content.replace("[You must reply to view this hidden content]", "")
- post_content_cleaned = cleanString(post_content_cleaned.strip())
- posts.append(post_content_cleaned)
-
- # Append a "-1" to `feedbacks` array since they don't exists on this forum
- feedbacks.append("-1")
-
- date_posted = topic.find("span", {"class": "post_date"}).text.strip()
- if 'modified' in date_posted:
- date_posted = date_posted.split('(')[0].strip()
-
- if 'Today' in date_posted or 'Yesterday' in date_posted:
- day = topic.find("span", {"class": "post_date"}).find('span').get('title').strip()
- time = date_posted.split(',')[1].strip()
- date_posted = day + ', ' + time
- date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p")
-
- elif 'hour' in date_posted or 'minute' in date_posted:
- date_posted = topic.find("span", {"class": "post_date"}).find('span').get('title').strip()
- date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p")
-
- else:
- date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p")
-
- addDates.append(date_object)
-
- image_post.append("-1")
-
- avatar = topic.find('div', {"class": "author_avatar"})
- if avatar is not None:
- img = avatar.find('img')
- if img is not None:
- img = img.get('src').split('base64,')[-1]
- else:
- img = '-1'
- else:
- img = "-1"
- image_user.append(img)
-
- # TESTING PURPOSES - DO NOT REMOVE
-
- # Populate the final variable (this should be a list with all fields scraped)
-
- row = (topicName, users, statuses, reputations, interests, signs, posts, feedbacks, addDates, image_user, image_post)
-
- # Sending the results
-
- return row
-
-
-
-def onniForums_listing_parser(soup: BeautifulSoup):
-
- nm = 0 # this variable should receive the number of topics
- forum = "OnniForums" # 0 *forum name
- boardName = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree.
- # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
- user: List[str] = [] # 2 all users of each topic
- topic : List[str] = [] # 3 all topics
- view: List[int] = [] # 4 number of views of each topic
- post : List[int] = [] # 5 number of posts of each topic
- href: List[str] = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between Listing and Description pages)
- addDate : List[str] = [] # 7 when the topic was created (difficult to find)
- image_author : List[str] = [] # 8 all author avatars used in each topic
-
- # Finding the board (should be just one)
- board_metadata: BeautifulSoup = soup.find("table",{"class" : "tborder clear"})
-
- boardName = board_metadata.find_all("div")[1].text
- boardName = cleanString(boardName.strip())
-
- thread_arrays = board_metadata.find_all("tr", {"class":"inline_row"}) # gets the information of posts
-
- nm = len(thread_arrays)
-
- for thread in thread_arrays: #getting the information from the posts and sorting them into the arrays defined above
-
- body = thread.find("span",{"class": "subject_new"})
- try:
- post_subject: str = body.text #getting the topic
- except:
- body = thread.find("span",{"class": "subject_old"})
- post_subject: str = body.text
-
- post_subject_cleaned = cleanString(post_subject.strip())
- topic.append(post_subject_cleaned)
-
- author_icon = thread.find('div', {"class": "lavatar-old lavatar-old-f"})
- if author_icon != None:
- author_icon = author_icon.find('img')
- author_icon = author_icon.get('src')
- author_icon = author_icon.split('base64,')[-1]
- else:
- author_icon = "-1"
- image_author.append(author_icon)
-
- reply_count = thread.find_all("td", {"align": "center"})[2].text
- post.append(cleanNumbers(reply_count))
-
- views = thread.find_all("td", {"align": "center"})[3].text
- view.append(cleanNumbers(views))
-
- # dates_added: str = thread.find("span",{"class" : "thread_start_datetime smalltext"}).text
- # dates_added_cleaned = dates_added.split(',')[0]
- # addDate.append(dates_added_cleaned)
-
- author = thread.find("span",{"class" : "author smalltext"}).text
- author_cleaned = cleanString(author.strip())
- user.append(author_cleaned)
-
- thread_link = body.find('a').get('href')
- href.append(thread_link)
-
- return organizeTopics(
- forum=forum,
- nm=nm,
- board=boardName,
- author=user,
- topic=topic,
- views=view,
- posts=post,
- href=href,
- addDate=addDate,
- image_author=image_author
- )
-
-
-
-
-
-
-# This is the method to parse the Listing Pages (one page with many posts)
-
-def onniForums_links_parser(soup: BeautifulSoup):
-
- href = []
- listing = soup.find_all('tr', {'class': 'inline_row'})
-
- for thread in listing:
- try:
- link = thread.find('span', {"class": "subject_old"}).find('a').get('href')
- except:
- link = thread.find('span', {"class": "subject_new"}).find('a').get('href')
-
- href.append(link)
-
- return href
\ No newline at end of file
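The OnniForums description parser being removed resolves relative timestamps ('Today', 'Yesterday', 'x hours/minutes ago') by reading the absolute date stored in the inner span's title attribute and then parsing "%m-%d-%Y, %I:%M %p". The core of that fallback chain as a standalone sketch (the argument values in the example call are illustrative):

```python
from datetime import datetime

def resolve_post_date(date_text, title_attr):
    date_text = date_text.strip()
    if 'Today' in date_text or 'Yesterday' in date_text:
        # the title holds the absolute day; keep the time shown on the page
        date_text = title_attr.strip() + ', ' + date_text.split(',')[1].strip()
    elif 'hour' in date_text or 'minute' in date_text:
        # the title already holds the full absolute timestamp
        date_text = title_attr.strip()
    return datetime.strptime(date_text, "%m-%d-%Y, %I:%M %p")

print(resolve_post_date("Today, 3:04 PM", "07-11-2023"))   # 2023-07-11 15:04:00
```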
diff --git a/Forums/OnniForums/testing.py b/Forums/OnniForums/testing.py
deleted file mode 100644
index c18cfd4..0000000
--- a/Forums/OnniForums/testing.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import os
-from Forums.OnniForums.parser import onniForums_description_parser
-from Forums.OnniForums.parser import onniForums_listing_parser
-from bs4 import BeautifulSoup
-
-baseUrl = './HTML_Pages/06272023/Listing/httponnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qdonionForumCardingFraud.html'
-
-with open(baseUrl, 'r') as file:
- testHTML = file.read()
-
-soup = BeautifulSoup(testHTML, 'html.parser')
-
-output = onniForums_listing_parser(soup)
-
-print(output)
-
-all_descriptions = os.listdir("./HTML_Pages/06272023/Description/")[1:]
-
-total = len(all_descriptions)
-descriptions_with_unicode_error = 0
-
-print("\nTESTING DESCRIPTION PARSER:\n")
-
-for desc in all_descriptions:
-
- print(f"\nTesting: ./HTML_Pages/06272023/Description/{desc} \n")
-
-
-
- try:
- with open(f"./HTML_Pages/06272023/Description/{desc}", "r") as file:
- test_html = file.read()
-
- soup = BeautifulSoup(test_html, features="html.parser")
-
- description_output = onniForums_description_parser(soup)
-
- print(f"\nTopic name : {description_output[0]}")
- print(f"Contents : {description_output[1]}")
- print(f"Users : {description_output[2]}")
- print(f"Dates posted: {description_output[3]}")
- print(f"Feedbacks : {description_output[4]}")
- print(f"Statuses : {description_output[5]}")
- print(f"Reputations : {description_output[6]}")
- print(f"Signatures : {description_output[7]}")
- print(f"Interests : {description_output[8]}\n")
-
- except UnicodeDecodeError:
- descriptions_with_unicode_error += 1
- print(f"UnicodeDecodeError: the file `{desc}` cannot be decoded by Python!")
-
-print("\nTESTING COMPLETE\n")
-print(f"Number of descriptions : {total}")
-print(f"Descriptions w/ errors : {descriptions_with_unicode_error}")
-print(f"Failure percentage : {round(descriptions_with_unicode_error/total, 4) * 100}%\n")
-
-
diff --git a/Forums/Procrax/crawler_selenium.py b/Forums/Procrax/crawler_selenium.py
deleted file mode 100644
index c12088a..0000000
--- a/Forums/Procrax/crawler_selenium.py
+++ /dev/null
@@ -1,321 +0,0 @@
-__author__ = 'Helium'
-
-'''
-Procrax Forum Crawler (Selenium)
-rechecked and confirmed
-'''
-
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
-from PIL import Image
-
-import urllib.parse as urlparse
-import os, re, time
-from datetime import date
-import configparser
-import subprocess
-from bs4 import BeautifulSoup
-from Forums.Initialization.prepare_parser import new_parse
-from Forums.Procrax.parser import procrax_links_parser
-from Forums.Utilities.utilities import cleanHTML
-
-counter = 1
-BASE_URL = 'https://procrax.cx/'
-FORUM_NAME = 'Procrax'
-
-
-# Opens Tor Browser, crawls the website
-def startCrawling():
- driver = getAccess()
-
- if driver != 'down':
- try:
- login(driver)
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closeDriver(driver)
-
- new_parse(forum=FORUM_NAME, url=BASE_URL, createLog=True)
-
-
-# Login using premade account credentials and do login captcha manually
-def login(driver):
- WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
- (By.XPATH, '/html/body/div[1]/div[3]/div[2]/div[3]/div[2]/div[1]/form/div/div/div/dl[4]/dd/div/div[2]/button/span')))
- #entering username and password into input boxes
- usernameBox = driver.find_element(by=By.NAME, value='login')
- #Username here
- usernameBox.send_keys('cheese_pizza_man')#sends string to the username box
- passwordBox = driver.find_element(by=By.NAME, value='password')
- #Password here
- passwordBox.send_keys('Gr33nSp@m&3ggs')# sends string to passwordBox
-
- clicker = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div[2]/div[1]/form/div/div/div/dl[4]/dd/div/div[2]/button/span')
- clicker.click()
-
- # # wait for listing page show up (This Xpath may need to change based on different seed url)
- # # wait for 50 sec until id = tab_content is found, then cont
- WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
- (By.XPATH, '/html/body/div[1]/div[3]/div[2]/div[3]/div[1]/div/div[1]/div')))
-
-
-# Returns the name of the website
-def getForumName():
- name = 'Procrax'
- return name
-
-
-# Return the link of the website
-def getFixedURL():
- url = 'https://procrax.cx/'
- return url
-
-
-# Closes Tor Browser
-def closeDriver(driver):
- # global pid
- # os.system("taskkill /pid " + str(pro.pid))
- # os.system("taskkill /t /f /im tor.exe")
- print('Closing Tor...')
- driver.close() #close tab
- time.sleep(3)
- return
-
-
-# Creates FireFox 'driver' and configure its 'Profile'
-# to use Tor proxy and socket
-def createFFDriver():
- from Forums.Initialization.forums_mining import config
-
- ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
- ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
- ff_prof.set_preference("permissions.default.image", 3)
- ff_prof.set_preference("browser.download.folderList", 2)
- ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
- ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
- ff_prof.set_preference('network.proxy.type', 1)
- ff_prof.set_preference("network.proxy.socks_version", 5)
- ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
- ff_prof.set_preference('network.proxy.socks_port', 9150)
- ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", True)
- ff_prof.update_preferences()
-
- service = Service(config.get('TOR', 'geckodriver_path'))
-
- driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
- driver.maximize_window()
-
- return driver
-
-def getAccess():
- driver = createFFDriver()
- try:
- driver.get(BASE_URL)# open url in browser
- return driver
- except:
- driver.close()# close tab
- return 'down'
-
-
-# Saves the crawled html page
-def savePage(driver, page, url):
- cleanPage = cleanHTML(driver, page)
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- open(filePath, 'wb').write(cleanPage.encode('utf-8'))
- return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-def getFullPathName(url):
- from Forums.Initialization.forums_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + FORUM_NAME + "/HTML_Pages")
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
- else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
- return fullPath
-
-
-# Creates the file name from passed URL
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if (name == ''):
- name = str(counter)
- counter = counter + 1
- return name
-
-
-def getInterestedLinks():
- links = []
-
- # verified sales
- links.append('https://procrax.cx/forums/verified-sales-market.10/')
- # unverified sales
- links.append('https://procrax.cx/forums/unverified-sales-market.12/')
- # combos
- links.append('https://procrax.cx/forums/bases.79/')
- # tools
- links.append('https://procrax.cx/forums/tools.81/')
- # configs
- links.append('https://procrax.cx/forums/configs.82/')
- # craxtube
- links.append('https://procrax.cx/forums/craxtube.83/')
- # general hacking
- links.append('https://procrax.cx/forums/general-hacking.24/')
- # hacking security tools
- links.append('https://procrax.cx/forums/hacking-security-tools.20/')
- # hacktube
- links.append('https://procrax.cx/forums/hacktube.22/')
- # cardingtube
- links.append('https://procrax.cx/forums/cardingtube.26/')
- # cardable
- links.append('https://procrax.cx/forums/cardable-websites.28/')
- # spam software
- links.append('https://procrax.cx/forums/mailing.72/')
- # spam tools
- links.append('https://procrax.cx/forums/tools-bots-validators.73/')
- # darknet news
- links.append('https://procrax.cx/forums/darknet-news-articles.42/')
- # links
- links.append('https://procrax.cx/forums/darknet-markets-deep-onion-links.43/')
- # courses
- links.append('https://procrax.cx/forums/courses.59/')
- # software
- links.append('https://procrax.cx/forums/software.76/')
- # general forum
- links.append('https://procrax.cx/forums/forum-discussions-updates.7/')
-
- return links
-
-
-def crawlForum(driver):
- print("Crawling the Procrax forum")
-
- linksToCrawl = getInterestedLinks()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
- try:
- has_next_page = True
- count = 0
-
- while has_next_page:
- try:
- driver.get(link)
- except:
- driver.refresh()
- html = driver.page_source
- savePage(driver, html, link)
-
- topics = topicPages(html)
- for topic in topics:
- has_next_topic_page = True
- counter = 1
- page = topic
-
- while has_next_topic_page:
- itemURL = urlparse.urljoin(BASE_URL, str(page))
- try:
- driver.get(itemURL)
- except:
- driver.refresh()
-
- if isListingLink(driver.current_url):
- break
-
- savePage(driver, driver.page_source, topic + f"page{counter}") # very important
-
- # # comment out
- # if counter == 2:
- # break
-
- try:
- page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
-
- if page == "":
- raise NoSuchElementException
- counter += 1
-
- except NoSuchElementException:
- has_next_topic_page = False
-
- # making sure we go back to the listing page (browser back button simulation)
- try:
- driver.get(link)
- except:
- driver.refresh()
-
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
-
- try:
-
- link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
-
- if link == "":
- raise NoSuchElementException
- count += 1
-
- except NoSuchElementException:
- has_next_page = False
-
- except Exception as e:
- print(link, e)
- i += 1
-
- print("Crawling the Procrax forum done.")
-
-
-# Returns 'True' if the link is a Topic link, may need to change for every website
-def isDescriptionLink(url):
- if 'threads' in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link, may need to change for every website
-def isListingLink(url):
- if '.cx/forums' in url:
- return True
- return False
-
-
-# calling the parser to define the links
-def topicPages(html):
- soup = BeautifulSoup(html, "html.parser")
- #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
- return procrax_links_parser(soup)
-
-
-def crawler():
- startCrawling()
- # print("Crawling and Parsing BestCardingWorld .... DONE!")
diff --git a/Forums/Procrax/parser.py b/Forums/Procrax/parser.py
deleted file mode 100644
index dda0090..0000000
--- a/Forums/Procrax/parser.py
+++ /dev/null
@@ -1,189 +0,0 @@
-__author__ = 'Helium'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from Forums.Utilities.utilities import *
-from datetime import date
-from datetime import timedelta
-import re
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup, ResultSet, Tag
-
-# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
-
-
-def procrax_description_parser(soup: Tag):
-
- # Fields to be parsed
-
- topic = "-1" # 0 topic name
- user = [] # 1 all users of each post
- addDate = [] # 2 all dates of each post
- feedback = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format)
- status = [] # 4 all user's authority in each post such as (adm, member, dangerous)
- reputation = [] # 5 all user's karma in each post (usually found as a number)
- sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post)
- post = [] # 7 all messages of each post
- interest = [] # 8 all user's interest in each post
- image_user = [] # 9 all user avatars of each post
- image_post = [] # 10 all first images of each post
-
- # Finding the topic (should be just one coming from the Listing Page)
-
- li = soup.find("h1", {"class": "p-title-value"})
- topic = li.text
-
- thread: ResultSet[Tag] = soup.find("div", {"class": "block-body js-replyNewMessageContainer"}).find_all("article", {"data-author": True})
-
- for ipost in thread:
- username = ipost.find("h4", {"class": "message-name"}).text
- user.append(cleanString(username.strip()))
-
- date_posted = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime")
- datetime_obj = datetime.strptime(date_posted, "%Y-%m-%dT%H:%M:%S%z")
- addDate.append(datetime_obj)
-
- feedback.append("-1")
-
- user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text
- status.append(cleanString(user_status.strip()))
-
- user_lvl = ipost.find("div", {"class": "afAwardLevel"})
- if user_lvl is not None:
- user_lvl = user_lvl.text
- reputation.append(cleanString(user_lvl.strip()))
- else:
- reputation.append('-1')
-
- sign.append("-1")
-
- user_post = ipost.find("article", {"class": "message-body js-selectToQuote"}).text
- post.append(cleanString(user_post.strip()))
-
- interest.append("-1")
-
- bbWrapper = ipost.find('div', {"class": "bbWrapper"})
- if bbWrapper is not None:
- img = bbWrapper.find('img')
- if img is not None:
- img = img.get('src').split('base64,')[-1]
- else:
- img = "-1"
- else:
- img = "-1"
- image_post.append(img)
-
- avatar = ipost.find("a", {"class": "avatar avatar--m"})
- if avatar is not None:
- img = avatar.find('img')
- if img is not None:
- img = img.get('src').split('base64,')[-1]
- else:
- img = "-1"
- else:
- img = "-1"
- image_user.append(img)
-
- # Populate the final variable (this should be a list with all fields scraped)
-
- row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
-
- # Sending the results
-
- return row
-
-# This is the method to parse the Listing Pages (one page with many posts)
-
-def procrax_listing_parser(soup: Tag):
-
- nm = 0 # this variable should receive the number of topics
- forum: str = "Procrax" # 0 *forum name
- board = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree.
- # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
-
- author = [] # 2 all authors of each topic
- topic = [] # 3 all topics
- views = [] # 4 number of views of each topic
- posts = [] # 5 number of posts of each topic
- href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
- # Listing and Description pages)
- addDate = [] # 7 when the topic was created (difficult to find)
- image_author = [] # 8 all author avatars used in each topic
-
- # Finding the board (should be just one)
- li = soup.find("h1", {"class": "p-title-value"})
- board = cleanString(li.text.strip())
-
- threads_list = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True})
-
- sticky = soup.find("div", {"class": "structItemContainer-group structItemContainer-group--sticky"})
- if sticky is not None:
- threads_list = sticky.find_all("div", {"data-author": True}) + threads_list
-
- nm = len(threads_list)
-
- for thread in threads_list:
- thread_title = thread.find("div", {"class": "structItem-title"}).text
- topic.append(cleanString(thread_title.strip()))
-
- author_icon = thread.find('a', {"class": "avatar avatar--s"})
- if author_icon != None:
- author_icon = author_icon.find('img')
- if author_icon != None:
- author_icon = author_icon.get('src')
- author_icon = author_icon.split('base64,')[-1]
- else:
- author_icon = "-1"
- else:
- author_icon = "-1"
- image_author.append(author_icon)
-
- thread_author = thread.get("data-author")
- author.append(cleanString(thread_author))
-
- thread_views = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find('dd').text
- thread_views = thread_views.lower().replace("k", "000")
- thread_views = thread_views.lower().replace("m", "000000")
- views.append(thread_views.strip())
-
- thread_replies = thread.find("dl", {"class": "pairs pairs--justified"}).find('dd').text
- # All threads contain one topic post and reply posts
- thread_total_posts = thread_replies.lower().replace("k", "000")
- posts.append(thread_total_posts.strip())
-
- thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime")
- datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z")
- addDate.append(datetime_obj)
-
- thread_link: str = thread.find("div", {"class": "structItem-title"}).find('a', {'class': ''}).get('href')
- href.append(thread_link)
-
-
- return organizeTopics(
- forum=forum,
- nm=nm,
- board=board,
- author=author,
- topic=topic,
- views=views,
- posts=posts,
- addDate=addDate,
- href=href,
- image_author=image_author
- )
-
-
-def procrax_links_parser(soup):
-
- # Returning all links that should be visited by the Crawler
-
- href = []
-
- listing = soup.find_all('div', {"class": "structItem-title"})
-
- for a in listing:
- link = a.find('a', {'class': ''}).get('href')
-
- href.append(link)
-
- return href
\ No newline at end of file
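# --- Editor's sketch (not part of the original diff) --------------------------------------
# procrax_listing_parser above expands "K"/"M" suffixes by string replacement, which turns
# "1.2K" into "1.2000" instead of 1200. A numeric conversion avoids that edge case; this
# helper is an assumption about the intended behaviour, not code from the original repo:

def expand_count(text: str) -> int:
    """Convert counts such as '387', '1.2K' or '3M' into integers."""
    text = text.strip().lower().replace(",", "")
    multiplier = 1
    if text.endswith("k"):
        multiplier, text = 1_000, text[:-1]
    elif text.endswith("m"):
        multiplier, text = 1_000_000, text[:-1]
    return int(float(text) * multiplier)

# e.g. expand_count("1.2K") == 1200, expand_count("387") == 387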
diff --git a/MarketPlaces/AnonMarket/crawler_selenium.py b/MarketPlaces/AnonMarket/crawler_selenium.py
deleted file mode 100644
index eab9ea0..0000000
--- a/MarketPlaces/AnonMarket/crawler_selenium.py
+++ /dev/null
@@ -1,293 +0,0 @@
-__author__ = 'Helium'
-
-'''
-Anon Market Crawler (Selenium)
-'''
-
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.common.by import By
-
-from PIL import Image
-import urllib.parse as urlparse
-import os, re, time
-from datetime import date
-import subprocess
-import configparser
-from bs4 import BeautifulSoup
-from MarketPlaces.Initialization.prepare_parser import new_parse
-from MarketPlaces.AnonMarket.parser import AnonMarket_links_parser
-from MarketPlaces.Utilities.utilities import cleanHTML
-
-counter = 1
-baseURL = 'http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion'
-
-
-# Opens Tor Browser, crawls the website, then parses, then closes tor
-#acts like the main method for the crawler, another function at the end of this code calls this function later
-def startCrawling():
- mktName = getMKTName()
- driver = getAccess()
-
- if driver != 'down':
- try:
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closeDriver(driver)
-
- new_parse(mktName, baseURL, True)
-
-
-# Returns the name of the website
-#return: name of site in string type
-def getMKTName():
- name = 'AnonMarket'
- return name
-
-
-# Return the base link of the website
-#return: url of base site in string type
-def getFixedURL():
- url = 'http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion'
- return url
-
-
-# Closes Tor Browser
-#@param: current selenium driver
-def closeDriver(driver):
- # global pid
- # os.system("taskkill /pid " + str(pro.pid))
- # os.system("taskkill /t /f /im tor.exe")
- print('Closing Tor...')
- driver.close()
- time.sleep(3)
- return
-
-
-# Creates FireFox 'driver' and configure its 'Profile'
-# to use Tor proxy and socket
-def createFFDriver():
- from MarketPlaces.Initialization.markets_mining import config
-
- ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
- ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
- ff_prof.set_preference("permissions.default.image", 3)
- ff_prof.set_preference("browser.download.folderList", 2)
- ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
- ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
- ff_prof.set_preference('network.proxy.type', 1)
- ff_prof.set_preference("network.proxy.socks_version", 5)
- ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
- ff_prof.set_preference('network.proxy.socks_port', 9150)
- ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", False)
- ff_prof.update_preferences()
-
- service = Service(config.get('TOR', 'geckodriver_path'))
-
- driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
- driver.maximize_window()
-
- return driver
-
-
-#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
-#return: return the selenium driver or string 'down'
-def getAccess():
- url = getFixedURL()
- driver = createFFDriver()
- try:
- driver.get(url)
- return driver
- except:
- driver.close()
- return 'down'
-
-def savePage(driver, page, url):
- cleanPage = cleanHTML(driver, page)
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- open(filePath, 'wb').write(cleanPage.encode('utf-8'))
- return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-#@param: raw url as crawler crawls through every site
-def getFullPathName(url):
- from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
- else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
- return fullPath
-
-
-# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
-#@param: raw url as crawler crawls through every site
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if (name == ''):
- name = str(counter)
- counter = counter + 1
- return name
-
-
-# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
-#in this example, there are a couple of categories some threads fall under such as
-# Guides and Tutorials, Digital Products, and Software and Malware
-#as you can see they are categories of products
-def getInterestedLinks():
- links = []
-
- # Malware
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/malware')
- # Bootkits
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/bootkits')
- # Backdoors
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/backdoors')
- # Keyloggers
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/keyloggers')
- # Wireless Trackers
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wireless_trackers')
- # Screen Scrapers
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/screen_scrapers')
- # Mobile Forensic Tools
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/mobile_forensics_tools')
- # Wifi Jammers
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wifi_jammers')
- # Carding
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/carding')
- # Worms
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/worms')
- # Viruses
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/viruses')
- # Trojans
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/trojans')
- # Botnets
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/botnets')
- # Security Technology
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security_technology')
- # Hacks
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/hacks')
- # Exploit kits
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/exploit_kit')
- # Security
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security')
- # Ransomware
- links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/ransomware')
-
- return links
-
-
-# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
-#topic and description pages are crawled through here, where both types of pages are saved
-#@param: selenium driver
-def crawlForum(driver):
- print("Crawling the Anon Market")
-
- linksToCrawl = getInterestedLinks()
-
- for link in linksToCrawl:
- print('Crawling :', link)
-
- try:
- has_next_page = True
- count = 0
-
- while has_next_page:
- try:
- driver.get(link)
- except:
- driver.refresh()
-
- html = driver.page_source
- savePage(driver, html, link)
-
- # Get all product links on the current page
- products_list = productPages(html)
- for item in products_list:
- itemURL = urlparse.urljoin(baseURL, str(item))
- try:
- driver.get(itemURL)
- except:
- driver.refresh()
- savePage(driver, driver.page_source, item)
- driver.back() # Go back to listing after visiting each product
-
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
-
- # Locate the next page link
- try:
- # Find the active page number
- active_page_element = driver.find_element(By.XPATH, '//div[@class="page activepage"]')
- # current_page = int(active_page_element.text)
-
- next_page_element = active_page_element.find_element(By.XPATH, 'following-sibling::a[1]')
- link = next_page_element.get_attribute('href')
- if link == "":
- raise NoSuchElementException
- count += 1
-
- except NoSuchElementException:
- has_next_page = False
-
- except Exception as e:
- print(link, e)
-
- print("Crawling the Anon Market done.")
-
-# Returns 'True' if the link is a description link
-#@param: url of any url crawled
-#return: true if is a description page, false if not
-def isDescriptionLink(url):
- if 'product' in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link
-#@param: url of any url crawled
-#return: true if is a Listing page, false if not
-def isListingLink(url):
- if 'category' in url:
- return True
- return False
-
-
-# calling the parser to define the links, the html is the url of a link from the list of interested link list
-#@param: link from interested link list ie. getInterestingLinks()
-#return: list of description links that should be crawled through
-def productPages(html):
- soup = BeautifulSoup(html, "html.parser")
- return AnonMarket_links_parser(soup)
-
-def crawler():
- startCrawling()
- # print("Crawling and Parsing Nexus .... DONE!")
-
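# --- Editor's sketch (not part of the original diff) --------------------------------------
# getFullPathName in the crawlers above joins CURRENT_DATE with hard-coded '\\Description\\'
# segments, so the saved paths mix separators and only behave as intended on Windows. A
# portable variant (the parameter names here are hypothetical, not from the repository)
# lets os.path.join supply every separator:

import os

def get_full_path_name(url, main_dir, current_date, is_description, name_from_url):
    sub_dir = 'Description' if is_description(url) else 'Listing'
    return os.path.join(main_dir, current_date, sub_dir, name_from_url(url) + '.html')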
diff --git a/MarketPlaces/AnonMarket/parser.py b/MarketPlaces/AnonMarket/parser.py
deleted file mode 100644
index 997d43e..0000000
--- a/MarketPlaces/AnonMarket/parser.py
+++ /dev/null
@@ -1,195 +0,0 @@
-__author__ = 'DarkWeb'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from MarketPlaces.Utilities.utilities import *
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup
-
-import re
-
-#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
-#stores info it needs in different lists, these lists are returned after being organized
-#@param: soup object looking at html page of description page
-#return: 'row' that contains a variety of lists that each hold info on the description page
-def AnonMarket_description_parser(soup):
-
- # Fields to be parsed
- vendor = "-1" # 0 *Vendor_Name
- success = "-1" # 1 Vendor_Successful_Transactions
- rating_vendor = "-1" # 2 Vendor_Rating
- name = "-1" # 3 *Product_Name
- describe = "-1" # 4 Product_Description
- CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
- category = "-1" # 7 Product_Category
- views = "-1" # 8 Product_Number_Of_Views
- reviews = "-1" # 9 Product_Number_Of_Reviews
- rating_item = "-1" # 10 Product_Rating
- addDate = "-1" # 11 Product_AddedDate
- BTC = "-1" # 12 Product_BTC_SellingPrice
- USD = "-1" # 13 Product_USD_SellingPrice
- EURO = "-1" # 14 Product_EURO_SellingPrice
- sold = "-1" # 15 Product_QuantitySold
- left = "-1" # 16 Product_QuantityLeft
- shipFrom = "-1" # 17 Product_ShippedFrom
- shipTo = "-1" # 18 Product_ShippedTo
- image = "-1" # 19 Product_Image
- vendor_image = "-1" # 20 Vendor_Image
-
- name_of_product = soup.find("div", {"class": "heading"}).text
- name = cleanString(name_of_product.strip())
-
- description_div = soup.find("div", {"class": "tab1"})
- if description_div is None:
- describe = "-1"
- else:
- describe = cleanString(description_div.text.strip())
-
- info_div = soup.find('div', {'class': 'information'})
- table = info_div.find('table') if info_div else None
-
- # Find all table rows
- rows = table.find_all('tr') if table is not None else []  # guard: the table may be missing
-
- # Parse each row to get relevant data
- data = {}
- for row in rows:
- columns = row.find_all('td')
- if len(columns) == 3:
- key = columns[0].text.strip()
- value = columns[2].text.strip()
- data[key] = value
-
- # Extract specific data from the dictionary and assign them to individual variables
- vendor = data.get('Vendor', '-1')
- shipFrom = data.get('Location', '-1')
- shipTo = data.get('Ships to', '-1')
- category = data.get('Category', '-1')
- USD = data.get('Price', '-1').split()[0]
- left = data.get('Stock', '-1')
-
- # image
- image = soup.find('img', {"class": "bigthumbnail"})
- image = image.get('src').split('base64,')[-1]
-
- # Populating the final variable (this should be a list with all fields scraped)
- row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
-
- # Sending the results
- return row
-
-
-#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
-#stores info it needs in different lists, these lists are returned after being organized
-#@param: soup object looking at html page of listing page
-#return: 'row' that contains a variety of lists that each hold info on the listing page
-def AnonMarket_listing_parser(soup):
-
- # Fields to be parsed
- nm = 0 # *Total_Products (Should be Integer)
- mktName = "AnonMarket" # 0 *Marketplace_Name
- vendor = [] # 1 *Vendor y
- rating_vendor = [] # 2 Vendor_Rating
- success = [] # 3 Vendor_Successful_Transactions
- name = [] # 4 *Product_Name y
- CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
- MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
- category = [] # 7 Product_Category y
- describe = [] # 8 Product_Description
- views = [] # 9 Product_Number_Of_Views
- reviews = [] # 10 Product_Number_Of_Reviews
- rating_item = [] # 11 Product_Rating
- addDate = [] # 12 Product_AddDate
- BTC = [] # 13 Product_BTC_SellingPrice
- USD = [] # 14 Product_USD_SellingPrice y
- EURO = [] # 15 Product_EURO_SellingPrice
- sold = [] # 16 Product_QuantitySold
- qLeft = [] # 17 Product_QuantityLeft
- shipFrom = [] # 18 Product_ShippedFrom
- shipTo = [] # 19 Product_ShippedTo
- image = [] # 20 Product_Image
- image_vendor = [] # 21 Vendor_Image
- href = [] # 22 Product_Links
- base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion"
-
- cat = soup.find("div", {'class': 'heading'}).text
-
- products_list = soup.find_all('div', {'class': 'item'})
- nm = 0
- for product in products_list:
- name_of_product = product.find("div", {"class": "title"}).text.strip()
- name.append(name_of_product)
-
- name_of_vendor = product.find("a", {'class': 'seller'}).text.strip()
- vendor.append(name_of_vendor)
-
- category.append(cat)
-
- tbody = product.find('div', {"class": "info"}).find('tbody')
-
- # rating_item
- width = tbody.find('div', {"class": "stars2"}).get('style')
- rating_item.append(cleanNumbers(width.strip()))
-
- tr = tbody.findAll('tr', recursive=False)
- td = tr[2].findAll('td')
-
- # sold
- sold.append(td[0].text.strip())
-
- # reviews
- reviews.append(td[1].text.strip())
-
- product_link_element = product.find("div", {"class": "title"}).find_parent('a')
- link = product_link_element['href']
- full_link = base_url + link
- href.append(full_link)
-
- # Append '-1' for unavailable data
- rating_vendor.append("-1")
- success.append("-1")
- CVE.append("-1")
- MS.append("-1")
- describe.append("-1")
- views.append("-1")
- addDate.append("-1")
- BTC.append("-1")
- USD.append("-1")
- EURO.append("-1")
- qLeft.append("-1")
- shipFrom.append("-1")
- shipTo.append("-1")
-
- nm += 1
-
- # Populate the final variable (this should be a list with all fields scraped)
- return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
- reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
-
-
-#called by the crawler to get description links on a listing page
-#@param: beautifulsoup object that is using the correct html page (listing page)
-#return: list of description links from a listing page
-def AnonMarket_links_parser(soup):
- # Base URL to prepend to each product link
- base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion"
-
- # Returning all links that should be visited by the Crawler
- href = []
-
- # Using a shorter, but still unique, class name
- listing = soup.find('div', {'class': 'items'}).find_all('a', href=True, attrs={'href': lambda x: "/product/" in x})
-
- for a in listing:
- link = a.get('href')
- if link: # Checks if 'href' attribute is not None
- # Prepending the base URL to the scraped link
- full_link = base_url + link
- href.append(full_link)
-
- # Filtering out any links that might not have '/product/' in them
- product_links = [link for link in href if '/product/' in link]
-
- return product_links
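# --- Editor's sketch (not part of the original diff) --------------------------------------
# AnonMarket_description_parser above reads the 'information' table into a dict and then
# pulls individual fields with data.get(...). Wrapping that step in a small helper with an
# explicit guard for a missing table keeps the parser from crashing on malformed pages
# (the helper name is hypothetical, not from the repository):

def table_to_dict(info_div):
    """Map first-column labels to third-column values; empty dict if the table is absent."""
    table = info_div.find('table') if info_div else None
    if table is None:
        return {}
    data = {}
    for row in table.find_all('tr'):
        columns = row.find_all('td')
        if len(columns) == 3:
            data[columns[0].text.strip()] = columns[2].text.strip()
    return data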
diff --git a/MarketPlaces/Apocalypse/parser.py b/MarketPlaces/Apocalypse/parser.py
deleted file mode 100644
index 6610cc6..0000000
--- a/MarketPlaces/Apocalypse/parser.py
+++ /dev/null
@@ -1,226 +0,0 @@
-__author__ = 'DarkWeb'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from MarketPlaces.Utilities.utilities import *
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup, ResultSet, Tag
-
-
-def apocalypse_description_parser(soup: Tag):
-
- # Fields to be parsed
-
- vendor = "-1" # 0 *Vendor_Name
- success = "-1" # 1 Vendor_Successful_Transactions
- rating_vendor = "-1" # 2 Vendor_Rating
- name = "-1" # 3 *Product_Name
- describe = "-1" # 4 Product_Description
- CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
- category = "-1" # 7 Product_Category
- views = "-1" # 8 Product_Number_Of_Views
- reviews = "-1" # 9 Product_Number_Of_Reviews
- rating_item = "-1" # 10 Product_Rating
- addDate = "-1" # 11 Product_AddedDate
- BTC = "-1" # 12 Product_BTC_SellingPrice
- USD = "-1" # 13 Product_USD_SellingPrice
- EURO = "-1" # 14 Product_EURO_SellingPrice
- sold = "-1" # 15 Product_QuantitySold
- left = "-1" # 16 Product_QuantityLeft
- shipFrom = "-1" # 17 Product_ShippedFrom
- shipTo = "-1" # 18 Product_ShippedTo
- image = "-1" # 19 Product_Image
- vendor_image = "-1" # 20 Vendor_Image
-
- content: Tag = soup.find("div", {'id': "article_page"})
-
- product_name = content.find("p", {"class": "list-group-item text-center mb-0 box"}).text
- name = cleanString(product_name.strip())
-
- product_description = content.find("pre").text
- describe = cleanString(product_description.strip())
-
- # Finding Product Image
- image = soup.find('div', {'class': 'col-md-7 text-center'}).find('img')
- image = image.get('src').split('base64,')[-1]
-
- product_reviews_list: Tag = content.find("table", {"class": "table product_reviews"}) \
- .find_all("li")
-
- reviews = str(len(product_reviews_list))
-
- product_category = content.find("a", {"class": "badge badge-danger"}).text
- category = cleanString(product_category.strip())
-
- product_ships_from = content.find("span", {"class": "badge badge-info"}).text
- shipFrom = cleanString(product_ships_from.strip())
-
- product_success_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-success"})
- product_ships_to = product_success_badge[1].text
- shipTo = cleanString(product_ships_to.strip())
-
- product_supply = content.find("span", {"class": "badge badge-warning"}).text
- left = cleanString(product_supply.strip())
-
- product_primary_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-primary"})
-
- # Product vendor comes in the form of "@ vendor_name"
- product_vendor = product_primary_badge[0].text.replace("@", "")
-
- vendor = cleanString(product_vendor.strip())
- sold = cleanString(product_primary_badge[1].text.strip())
-
- product_prices: Tag = content.find("p", {"style": "border-bottom:1px solid GREY;"})
- USD = product_prices.find("span", {"class": "pr"}).text
- prices_array: ResultSet[Tag] = product_prices.find_all("span", {"class": "pr1"})
- BTC = prices_array[1].text
-
- # Populating the final variable (this should be a list with all fields scraped)
- row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
-
- # Sending the results
- return row
-
-
-def apocalypse_listing_parser(soup: Tag):
-
- # Fields to be parsed
- nm = 0 # Total_Products (Should be Integer)
- mktName = "Apocalypse" # 0 Marketplace_Name
- name = [] # 1 Product_Name
- CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = [] # 3 Product_MS_Classification (Microsoft Security)
- category = [] # 4 Product_Category
- describe = [] # 5 Product_Description
- escrow = [] # 6 Vendor_Warranty
- views = [] # 7 Product_Number_Of_Views
- reviews = [] # 8 Product_Number_Of_Reviews
- addDate = [] # 9 Product_AddDate
- lastSeen = [] # 10 Product_LastViewDate
- BTC = [] # 11 Product_BTC_SellingPrice
- USD = [] # 12 Product_USD_SellingPrice
- EURO = [] # 13 Product_EURO_SellingPrice
- sold = [] # 14 Product_QuantitySold
- qLeft =[] # 15 Product_QuantityLeft
- shipFrom = [] # 16 Product_ShippedFrom
- shipTo = [] # 17 Product_ShippedTo
- vendor = [] # 18 Vendor
- rating = [] # 19 Vendor_Rating
- success = [] # 20 Vendor_Successful_Transactions
- image = [] # 20 Product_Image
- image_vendor = [] # 21 Vendor_Image
- href = [] # 22 Product_Links
-
- table = soup.find("div", {"class": "col-lg-9 my-4"})
- if table is None:
- table = soup.find("div", {"class": "col-lg-9"})
- listings: ResultSet[Tag] = table.find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})
-
- for prod in listings:
-
- product_name = prod.find('h5', {"class": "art_title"}).text
- name.append(cleanString(product_name.strip()))
-
- # Finding Product Image
- product_image = prod.find('img', {'class': 'customHeight'})
- product_image = product_image.get('src').split('base64,')[-1]
- image.append(product_image)
-
- CVE.append("-1")
- MS.append("-1")
- describe.append("-1")
- escrow.append("-1")
- reviews.append("-1")
- addDate.append("-1")
- lastSeen.append("-1")
- BTC.append("-1")
- EURO.append("-1")
- shipTo.append("-1")
- success.append("-1")
- image_vendor.append("-1")
-
- product_price = prod.find("span", {"class": "priceP"}).text
- USD.append(cleanString(product_price.strip()))
-
- product_sold = prod.find("span", {"class": "badge badge-success"}).text
- sold.append(cleanString(product_sold.strip()))
-
- product_statistics: ResultSet[Tag] = prod.find_all("p", {"class": "mb-0 card-text"})
-
- product_category = product_statistics[0].find("a").text
- category.append(cleanString(product_category.strip()))
-
- product_sold = product_statistics[1].find("span").text
- sold.append(cleanString(product_sold.strip()))
-
- product_quantity_left = product_statistics[2].find("span", {"class": "badge bluebadge"}).text
- qLeft.append(cleanString(product_quantity_left.strip()))
-
- product_views = product_statistics[3].find("span").text
- views.append(cleanString(product_views.strip()))
-
- product_ships_from = product_statistics[4].find("span").text
- shipFrom.append(cleanString(product_ships_from.strip()))
-
- product_vendor_tag: Tag = product_statistics[5].find("a").find("span", {"class": "badge badge-primary"})
- # Product vendors & ratings are displayed as "vender_name ★ 5.0"
- # When split by the star (★), it should return a 2-value array
- product_vendor, product_vendor_rating = product_vendor_tag.text.split("★")
-
- try:
- vendor.append(cleanString(product_vendor.strip()))
- rating.append(cleanString(product_vendor_rating.strip()))
- except Exception as e:
- raise e
-
- product_href = prod.find('a').get('href')
- href.append(product_href)
-
- nm += 1
-
- return organizeProducts(
- marketplace=mktName,
- nm=nm,
- vendor=vendor,
- rating_vendor=rating,
- success_vendor=success,
- nombre=name,
- CVE=CVE,
- MS=MS,
- category=category,
- describe=describe,
- views=views,
- reviews=reviews,
- rating_item=["-1" for _ in range(nm)],
- addDate=addDate,
- BTC=BTC,
- USD=USD,
- EURO=EURO,
- sold=sold,
- qLeft=qLeft,
- shipFrom=shipFrom,
- shipTo=shipTo,
- href=href,
- image=image,
- image_vendor=image_vendor
- )
-
-
-#called by the crawler to get description links on a listing page
-#@param: beautifulsoup object that is using the correct html page (listing page)
-#return: list of description links from a listing page
-def apocalypse_links_parser(soup):
-
- # Returning all links that should be visited by the Crawler
-
- href = []
- listing = soup.findAll('div', {"class": "col-lg-4 col-md-6 mb-1"})
-
- for a in listing:
- bae = a.find('a', href=True)
- link = bae['href']
- href.append(link)
-
- return href
\ No newline at end of file
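# --- Editor's sketch (not part of the original diff) --------------------------------------
# apocalypse_listing_parser above hands a set of parallel lists to organizeProducts, which
# only lines up if every per-product list ends with exactly nm entries; appending a field
# twice per product (as the sold count from both the success badge and the statistics
# block nearly was here) breaks that alignment. A cheap guard before returning
# (hypothetical helper, assuming nm and the lists as built above):

def assert_aligned(nm, **columns):
    for field, values in columns.items():
        if len(values) != nm:
            raise ValueError(f"{field}: expected {nm} entries, got {len(values)}")

# e.g. assert_aligned(nm, vendor=vendor, name=name, USD=USD, sold=sold, qLeft=qLeft, href=href)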
diff --git a/MarketPlaces/LionMarketplace/crawler_selenium.py b/MarketPlaces/Ares/crawler_selenium.py
similarity index 59%
rename from MarketPlaces/LionMarketplace/crawler_selenium.py
rename to MarketPlaces/Ares/crawler_selenium.py
index e20f630..fbed2b1 100644
--- a/MarketPlaces/LionMarketplace/crawler_selenium.py
+++ b/MarketPlaces/Ares/crawler_selenium.py
@@ -1,7 +1,7 @@
-__author__ = 'Helium'
+__author__ = 'DarkWeb'
'''
-LionMarketplace Marketplace Crawler (Selenium)
+Ares Market Crawler (Selenium)
'''
from selenium import webdriver
@@ -9,64 +9,107 @@ from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
-
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
+
import urllib.parse as urlparse
-import os, re, time
+import os, time
from datetime import date
import subprocess
-import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
-from MarketPlaces.LionMarketplace.parser import lionmarketplace_links_parser
+from MarketPlaces.Ares.parser import ares_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
-baseURL = 'http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/'
+baseURL = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion'
-# Opens Tor Browser, crawls the website, then parses, then closes tor
-#acts like the main method for the crawler, another function at the end of this code calls this function later
+# Opens Tor Browser, crawls the website
def startCrawling():
- mktName = getMKTName()
+ marketName = getMarketName()
driver = getAccess()
if driver != 'down':
try:
- # login(driver)
+ login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
- new_parse(mktName, baseURL, True)
+ new_parse(marketName, False)
+
+
+# Login using premade account credentials and do login captcha manually
+def login(driver):
+ #wait for login page
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, "/html/body/div[3]/div[3]/div[2]/div/div[2]/div/center")))
+
+ #entering username and password into input boxes
+ usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
+ #Username here
+ usernameBox.send_keys('blabri')
+ passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
+ #Password here
+ passwordBox.send_keys('fishowal')
+
+ '''
+ # wait for captcha page show up
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, "/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/div/div/img")))
+
+ # save captcha to local
+ driver.find_element(by=By.XPATH, value='/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/div/div/img').screenshot(
+ r'..\Ares\captcha.png')
+
+ # This method will show image in any image viewer
+ im = Image.open(r'..\Ares\captcha.png')
+
+ im.show()
+
+ # wait until input space show up
+ inputBox = driver.find_element(by=By.XPATH, value='/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/input')
+
+ # ask user input captcha solution in terminal
+ userIn = input("Enter solution: ")
+
+ # send user solution into the input space
+ inputBox.send_keys(userIn)
+
+ # click the verify(submit) button
+ driver.find_element(by=By.XPATH, value="/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[4]/div/div/button").click()
+ '''
+ input("Press ENTER when CAPTCHA is completed\n")
+
+ # wait for listing page show up (This Xpath may need to change based on different seed url)
+ WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
+ (By.XPATH, '/html/body/div[7]/div[3]/div[2]/div[1]/div[1]')))
# Returns the name of the website
-#return: name of site in string type
-def getMKTName():
- name = 'LionMarketplace'
+def getMarketName():
+ name = 'Ares'
return name
-# Return the base link of the website
-#return: url of base site in string type
+# Return the link of the website
def getFixedURL():
- url = 'http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/'
+ url = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion'
+
return url
# Closes Tor Browser
-#@param: current selenium driver
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
- driver.close()
+ driver.quit()
time.sleep(3)
return
@@ -103,14 +146,12 @@ def createFFDriver():
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
+
driver.maximize_window()
return driver
-#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
-#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
@@ -122,30 +163,7 @@ def getAccess():
return 'down'
-# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha
-# then allows for manual solving of captcha in the terminal
-#@param: current selenium web driver
-def login(driver):
- # wait for page to show up (This Xpath may need to change based on different seed url)
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, '//*[@id="username"]')))
-
- # entering username and password into input boxes
- usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
- # Username here
- usernameBox.send_keys('blabri')
- passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
- # Password here
- passwordBox.send_keys('fishowal')
-
- input("Press ENTER when CAPTCHA is completed\n")
-
- # wait for listing page show up (This Xpath may need to change based on different seed url)
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/nav/div/div/ul[2]/form/button")))
-
-
-# Saves the crawled html page, makes the directory path for html pages if not made
+# Saves the crawled html page
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
@@ -155,7 +173,6 @@ def savePage(driver, page, url):
# Gets the full path of the page to be saved along with its appropriate file name
-#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
@@ -168,37 +185,47 @@ def getFullPathName(url):
return fullPath
-# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
-#@param: raw url as crawler crawls through every site
+# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
- if (name == ''):
+ if name == '':
name = str(counter)
counter = counter + 1
return name
-# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
-#in this example, there are a couple of categories some threads fall under such as
-# Guides and Tutorials, Digital Products, and Software and Malware
-#as you can see they are categories of products
def getInterestedLinks():
links = []
- # Hacking
- links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/ba142ac0-c7e7-11ec-9bd1-fdd89c3d3f91')
- # Digital
- links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/12')
+ # # Digital - Other
+ # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/91ecd5d0-002c-11ec-9b46-ede2378c5d3c')
+ # # Digital - VPN
+ # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9431b830-002b-11ec-86d6-cdaf65cd97f1')
+ # # Digital - Coding
+ # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/948b7400-a939-11ec-adc5-2f775203130c')
+ # Digital - Malware
+ links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/95c37970-002c-11ec-a5dc-1f4432087ed2')
+ # # Digital - Guides
+ # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9a8bea70-002b-11ec-a3db-c90dd329f662')
+ # # Digital - Hacking
+ # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/a81693f0-002b-11ec-9c39-110550ce4921')
+ # # Digital - Malware
+ # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/b3258c50-002b-11ec-b658-876d3d651145')
+ # # Digital - Services
+ # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/bae64840-002b-11ec-bbcc-a93431540099')
+ # # Digital - Software
+ # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/cff75df0-002b-11ec-8d0a-81fddeb36bf1')
+ # # Digital - Exploits
+ # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/ef029550-002f-11ec-8711-675a8b116ba6')
+ # # Digital - Tutorials
+ # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/f6e9c3b0-002b-11ec-85aa-c79a6ac8cfe8')
return links
-# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
-#topic and description pages are crawled through here, where both types of pages are saved
-#@param: selenium driver
def crawlForum(driver):
- print("Crawling the LionMarketplace market")
+ print("Crawling the Ares market")
linksToCrawl = getInterestedLinks()
@@ -228,16 +255,19 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item)
driver.back()
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
+ # comment out
+ break
+
+ # comment out
+ if count == 1:
+ break
try:
- nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/div[2]/nav')
- link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href')
+ nav = driver.find_element(by=By.XPATH, value=
+ '/html/body/div[7]/div[3]/div/div[2]/nav')
+ a = nav.find_element(by=By.LINK_TEXT, value="Next")
+ link = a.get_attribute('href')
+
if link == "":
raise NoSuchElementException
count += 1
@@ -249,12 +279,10 @@ def crawlForum(driver):
print(link, e)
i += 1
- print("Crawling the LionMarketplace market done.")
+ input("Crawling Ares market done sucessfully. Press ENTER to continue\n")
-# Returns 'True' if the link is a description link
-#@param: url of any url crawled
-#return: true if is a description page, false if not
+# Returns 'True' if the link is a Topic link
def isDescriptionLink(url):
if 'product' in url:
return True
@@ -262,29 +290,16 @@ def isDescriptionLink(url):
# Returns True if the link is a listingPage link
-#@param: url of any url crawled
-#return: true if is a Listing page, false if not
def isListingLink(url):
if 'category' in url:
return True
return False
-# calling the parser to define the links, the html is the url of a link from the list of interested link list
-#@param: link from interested link list ie. getInterestingLinks()
-#return: list of description links that should be crawled through
+# calling the parser to define the links
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
- return lionmarketplace_links_parser(soup)
-
-
-# Drop links that "signout"
-# def isSignOut(url):
-# #absURL = urlparse.urljoin(url.base_url, url.url)
-# if 'signout' in url.lower() or 'logout' in url.lower():
-# return True
-#
-# return False
+ return ares_links_parser(soup)
def crawler():
diff --git a/MarketPlaces/Ares/parser.py b/MarketPlaces/Ares/parser.py
new file mode 100644
index 0000000..3232b0c
--- /dev/null
+++ b/MarketPlaces/Ares/parser.py
@@ -0,0 +1,227 @@
+__author__ = 'DarkWeb'
+
+# Here, we are importing the auxiliary functions to clean or convert data
+from MarketPlaces.Utilities.utilities import *
+
+# Here, we are importing BeautifulSoup to search through the HTML tree
+from bs4 import BeautifulSoup
+
+
+# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
+def ares_description_parser(soup):
+
+ # Fields to be parsed
+
+ vendor = "-1" # 0 *Vendor_Name
+ success = "-1" # 1 Vendor_Successful_Transactions
+ rating_vendor = "-1" # 2 Vendor_Rating
+ name = "-1" # 3 *Product_Name
+ describe = "-1" # 4 Product_Description
+ CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+ MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
+ category = "-1" # 7 Product_Category
+ views = "-1" # 8 Product_Number_Of_Views
+ reviews = "-1" # 9 Product_Number_Of_Reviews
+ rating_item = "-1" # 10 Product_Rating
+ addDate = "-1" # 11 Product_AddedDate
+ BTC = "-1" # 12 Product_BTC_SellingPrice
+ USD = "-1" # 13 Product_USD_SellingPrice
+ EURO = "-1" # 14 Product_EURO_SellingPrice
+ sold = "-1" # 15 Product_QuantitySold
+ left = "-1" # 16 Product_QuantityLeft
+ shipFrom = "-1" # 17 Product_ShippedFrom
+ shipTo = "-1" # 18 Product_ShippedTo
+
+ # Finding Product Name
+ name = soup.find('div', {'class': "col-md-12 my-2"}).text
+ name = name.replace('\n', ' ')
+ name = name.replace(",", "")
+ name = name.strip()
+
+ bae = soup.find('div', {'class': "col-md-7"}).find('span').find_all('span')
+
+ # Finding Vendor
+ vendor = bae[0].text
+ vendor = vendor.replace(",", "")
+ vendor = vendor.replace("...", "")
+ vendor = vendor.strip()
+
+ # Finding Vendor Rating
+ full_stars = bae[2].find_all('i', {'class': "fas fa-star"})
+ half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"})
+ rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0)
+
+ # Finding Successful Transactions
+ success = bae[4].text
+ success = success.replace("Sales ", "")
+ success = success.strip()
+
+ bae = soup.find('span', {'class': "text-left"}).find_all('span')
+
+ # Finding Prices
+ USD = bae[0].text
+ USD = USD.replace("\n$", "")
+ USD = USD.strip()
+
+ shipping_info = bae[4].text
+ if "Digital" not in shipping_info:
+ shipping_info = shipping_info.split(" ")
+
+ # Finding Shipment Information (Origin)
+ shipFrom = shipping_info[0].strip()
+
+ # Finding Shipment Information (Destination)
+ shipTo = shipping_info[1].strip()
+
+ bae = soup.find_all('textarea')
+
+ # Finding the Product description
+ describe = bae[0].text
+ describe = describe.replace("\n", " ")
+ describe = describe.replace("\r", " ")
+ describe = describe.strip()
+
+ # Finding the Terms and Conditions
+ terms = bae[1].text
+ terms = terms.replace("\n", " ")
+ terms = terms.strip()
+
+ '''
+ # Finding the Number of Product Reviews
+ tag = soup.findAll(text=re.compile('Reviews'))
+ for index in tag:
+ reviews = index
+ par = reviews.find('(')
+ if par >=0:
+ reviews = reviews.replace("Reviews (","")
+ reviews = reviews.replace(")","")
+ reviews = reviews.split(",")
+ review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
+ else :
+ review = "-1"
+ '''
+
+ # Searching for CVE and MS categories
+ cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+ if cve:
+ CVE = " "
+ for idx in cve:
+ CVE += (idx)
+ CVE += " "
+ CVE = CVE.replace(',', ' ')
+ CVE = CVE.replace('\n', '')
+ ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
+ if ms:
+ MS = " "
+ for im in ms:
+ MS += (im)
+ MS += " "
+ MS = MS.replace(',', ' ')
+ MS = MS.replace('\n', '')
+
+ # Populating the final variable (this should be a list with all fields scraped)
+ row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
+ BTC, USD, EURO, sold, left, shipFrom, shipTo)
+
+ # Sending the results
+ return row
+
+
+# This is the method to parse the Listing Pages
+def ares_listing_parser(soup):
+
+ # Fields to be parsed
+ nm = 0 # *Total_Products (Should be Integer)
+ mktName = "Ares" # 0 *Marketplace_Name
+ vendor = [] # 1 *Vendor
+ rating_vendor = [] # 2 Vendor_Rating
+ success = [] # 3 Vendor_Successful_Transactions
+ name = [] # 4 *Product_Name
+ CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+ MS = [] # 6 Product_MS_Classification (Microsoft Security)
+ category = [] # 7 Product_Category
+ describe = [] # 8 Product_Description
+ views = [] # 9 Product_Number_Of_Views
+ reviews = [] # 10 Product_Number_Of_Reviews
+ rating_item = [] # 11 Product_Rating
+ addDate = [] # 12 Product_AddDate
+ BTC = [] # 13 Product_BTC_SellingPrice
+ USD = [] # 14 Product_USD_SellingPrice
+ EURO = [] # 15 Product_EURO_SellingPrice
+ sold = [] # 16 Product_QuantitySold
+ qLeft = [] # 17 Product_QuantityLeft
+ shipFrom = [] # 18 Product_ShippedFrom
+ shipTo = [] # 19 Product_ShippedTo
+ href = [] # 20 Product_Links
+
+ listing = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"})
+
+ # Populating the Number of Products
+ nm = len(listing)
+
+ for a in listing:
+ bae = a.findAll('a', href=True)
+
+ # Adding the url to the list of urls
+ link = bae[0].get('href')
+ link = cleanLink(link)
+ href.append(link)
+
+ # Finding the Vendor
+ vendor_name = bae[1].text
+ vendor_name = vendor_name.replace(",", "")
+ vendor_name = vendor_name.strip()
+ vendor.append(vendor_name)
+
+ # Finding the Product
+ product = bae[2].find('img').get('alt')
+ product = product.replace('\n', ' ')
+ product = product.replace(",", "")
+ product = product.strip()
+ name.append(product)
+
+ # Searching for CVE and MS categories
+ cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+ if not cve:
+ cveValue = "-1"
+ else:
+ cee = " "
+ for idx in cve:
+ cee += (idx)
+ cee += " "
+ cee = cee.replace(',', ' ')
+ cee = cee.replace('\n', '')
+ cveValue = cee
+ CVE.append(cveValue)
+
+ ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
+ if not ms:
+ MSValue = "-1"
+ else:
+ me = " "
+ for im in ms:
+ me += (im)
+ me += " "
+ me = me.replace(',', ' ')
+ me = me.replace('\n', '')
+ MSValue = me
+ MS.append(MSValue)
+
+ # Populate the final variable (this should be a list with all fields scraped)
+ return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
+ reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+
+
+def ares_links_parser(soup):
+
+ # Returning all links that should be visited by the Crawler
+ href = []
+
+ listing = soup.findAll('a', {"class": "btn btn-success w-100 my-1"})
+
+ for a in listing:
+
+ link = a['href']
+ href.append(link)
+
+ return href
\ No newline at end of file
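For orientation, a minimal usage sketch of the new Ares parsers. The module path and the saved-HTML file name are assumptions for illustration; in the pipeline these functions are normally driven by prepare_parser on pages saved by the crawler.

    from bs4 import BeautifulSoup
    # Assumed module path, mirroring the other markets' layout.
    from MarketPlaces.Ares.parser import ares_listing_parser, ares_links_parser

    # Hypothetical listing page previously saved by the crawler.
    with open('ares_listing.html', 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # Description links the crawler would visit next.
    links = ares_links_parser(soup)

    # Organized product rows for the same page (one entry per product card).
    rows = ares_listing_parser(soup)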
diff --git a/MarketPlaces/ThiefWorld/crawler_selenium.py b/MarketPlaces/Bohemia/crawler_selenium.py
similarity index 51%
rename from MarketPlaces/ThiefWorld/crawler_selenium.py
rename to MarketPlaces/Bohemia/crawler_selenium.py
index 95db8ff..c923f60 100644
--- a/MarketPlaces/ThiefWorld/crawler_selenium.py
+++ b/MarketPlaces/Bohemia/crawler_selenium.py
@@ -1,7 +1,7 @@
-__author__ = 'Helium'
+__author__ = 'DarkWeb'
'''
-ThiefWorld Market Crawler (Selenium)
+Bohemia Market Crawler (Selenium)
'''
from selenium import webdriver
@@ -10,6 +10,7 @@ from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
@@ -18,16 +19,13 @@ import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
-import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
-
-from MarketPlaces.ThiefWorld.parser import thiefworld_links_parser
+from MarketPlaces.Bohemia.parser import bohemia_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
-baseURL = 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/'
-
+baseURL = 'http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/'
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
@@ -37,26 +35,81 @@ def startCrawling():
if driver != 'down':
try:
+ captcha(driver)
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
- new_parse(mktName, baseURL, True)
+ new_parse(mktName, False)
+
+
+def login(driver):
+ #wait for login page
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, "/html/body/div/div[4]/div/div/form/input[1]")))
+
+ #click on login page confirmation
+ driver.find_element(by=By.XPATH, value="/html/body/div/div[4]/div/div/form/input[1]").click()
+
+ #wait until next page shows up
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, "/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[1]/input")))
+
+ #entering username and password into input boxes
+ usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[1]/input')
+ #username here
+ usernameBox.send_keys('ct-1234')
+ passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[2]/input')
+ #password here
+ passwordBox.send_keys('DementedBed123-')
+ #session time
+ session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[3]/select'))
+ session_select.select_by_visible_text('300 Minutes')
+
+ '''
+ #wait for captcha page to show up
+ inputBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[4]/div/input')
+
+ #save captcha to local
+ driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Bohemia\captcha2.png')
+ im = Image.open(r'..\Bohemia\captcha2.png')
+ im.show()
+
+ #ask user to input captcha solution in terminal
+ userIn = input("Enter Solution: ")
+
+ #send user solution into input field
+ inputBox.send_keys(userIn)
+
+ #click the submit button
+ driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[5]/button').click()
+ '''
+ input("Press ENTER when CAPTCHA is completed\n")
+
+ #wait for listing page to show up
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, "/html/body/div[2]/div[2]/div[1]/div")))
# Returns the name of the website
#return: name of site in string type
def getMKTName():
- name = 'ThiefWorld'
+ name = 'Bohemia'
return name
+# Returns credentials needed for the mkt
+def getCredentials():
+ credentials = 'blank blank blank blank cap 0'
+ return credentials
+
+
# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
- url = 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/'
+ url = 'http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/'
return url
@@ -80,13 +133,13 @@ def createFFDriver():
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
+ # ff_prof.set_preference("places.history.enabled", False)
+ # ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
+ # ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
+ # ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
+ # ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
+ # ff_prof.set_preference("signon.rememberSignons", False)
+ # ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
@@ -98,18 +151,17 @@ def createFFDriver():
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", False)
+ ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
+
driver.maximize_window()
return driver
-
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
@@ -126,18 +178,77 @@ def getAccess():
# Manual captcha solver, waits for a specific element so that the whole page loads, finds the input box, gets screenshot of captcha
# then allows for manual solving of captcha in the terminal
#@param: current selenium web driver
-def login(driver):
- # wait for page to show up (This Xpath may need to change based on different seed url)
+def captcha(driver):
+ '''
+ # wait for captcha page to show up (for Bohemia it takes A WHILE)
+ print("Connecting Bohemia...")
+ time.sleep(7.5)
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located((By.XPATH, "/html/body/div/div/form/div")))
+ input('Bohemia Connected. Press ENTER to continue\n')
+
+ # save captcha to local
+ driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div/div").screenshot(r'..\Bohemia\captcha.png')
+
+ # open method used to open different extension image file
+ im = Image.open(r'..\Bohemia\captcha.png')
+
+ # This method will show image in any image viewer
+ im.show()
+
+ # Prints link to console since captcha requires the link
+ print(getFixedURL())
+
+ # wait until the input field shows up
+ inputBox = driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div/div/input")
+
+ # ask user to input captcha solution in terminal
+ userIn = input("Enter solution: ")
+
+ # send user solution into the input space
+ inputBox.send_keys(userIn)
+
+ # click the verify(submit) button
+ driver.find_element(by=By.XPATH, value='/html/body/div/div/form/button[1]').click()
+
+ # im.close()
+ '''
+ input("Press ENTER when CAPTCHA is completed\n")
+
+ # wait for next captcha to show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/div/header/div[2]/div/nav/div[2]/a[1]")))
+ (By.XPATH, "/html/body/div/div/form")))
+
+ '''
+ for square in range(1,7):
+
+ inputBox = driver.find_element(by=By.XPATH, value=f"/html/body/div/div/form/div[1]/input[{square}]")
+ inputBox.click()
+ time.sleep(.5)
+ # userIn = input("Enter Solution: ")
+ # inputBox.send_keys(userIn)
+
+ # Takes a screenshot every iteration because the captcha changes after each input
+ driver.find_element(by=By.XPATH, value="/html/body/div/div/form").screenshot(r'..\Bohemia\captcha1.png')
+
+ # Opens and crops image
+ im = Image.open(r'..\Bohemia\captcha1.png')
+ im = im.crop(((im.width // 2 - 80), (im.height // 2 - 100), (im.width // 2 + 80), (im.height // 2 + 60)))
+ im.show()
+ # im.close()
- temp = driver.find_element(By.XPATH, '/html/body/div/header/div[2]/div/nav/div[2]/a[1]').get_attribute(
- 'href') # /html/body/div/div[2]/div/div[2]/div
- link = urlparse.urljoin(baseURL, str(temp))
- driver.get(link) # open
- # wait for listing page show up (This Xpath may need to change based on different seed url)
+ userIn = input("Enter Solution: ")
+ inputBox.send_keys(userIn)
+
+ #locate and press submit button
+ driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click()
+ # driver.find_element(by=By.XPATH, value='/html/body/div/div/form/button[2]')
+ '''
+
+ input("Press ENTER when CAPTCHA is completed\n")
+
+ #wait for next page to show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.ID, "side-bar")))
+ (By.XPATH, "/html/body/div/div[4]/div/div/form/input[1]")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):
@@ -172,7 +283,6 @@ def getNameFromURL(url):
counter = counter + 1
return name
-
# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
#in this example, there are a couple of categories some threads fall under such as
# Guides and Tutorials, Digital Products, and Software and Malware
@@ -180,30 +290,31 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
- # Hacking and DOSS
- links.append(['Hacking and DOSS', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/35'])
- # Carding Manuals
- links.append(['Carding Manuals', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/20'])
- # Software
- links.append(['Software', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/37'])
- # Database
- links.append(['Database', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/38'])
+ # Malware and Botnets
+ links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?page=1&type=all&catid=95')
+ # #Exploits
+ # links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?page=1&type=all&catid=99')
+ # #Methods
+ # links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?catid=100')
+ # #Exploit kits
+ # links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?catid=101')
+ # #Hacking Software
+ # links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?catid=103')
- return links
+ return links
# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
#topic and description pages are crawled through here, where both types of pages are saved
#@param: selenium driver
def crawlForum(driver):
- print("Crawling the ThiefWorld market")
+ print("Crawling the Bohemia Market")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
- cat = linksToCrawl[i][0]
- link = linksToCrawl[i][1]
+ link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
@@ -215,7 +326,6 @@ def crawlForum(driver):
except:
driver.refresh()
html = driver.page_source
- html += f"{cat}"
savePage(driver, html, link)
list = productPages(html)
@@ -228,17 +338,18 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item)
driver.back()
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
+ # comment out
+ break
+
+ # comment out
+ if count == 1:
+ break
try:
- nav = driver.find_element(by=By.XPATH, value='/html/body/div/div[1]/div/div/div[2]/div[3]')
- right = nav.find_element(by=By.CLASS_NAME, value='pag_right')
- link = right.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
+ nav = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[2]/ul')
+ a = nav.find_element(by=By.PARTIAL_LINK_TEXT, value="Next")
+ link = a.get_attribute('href')
+
if link == "":
raise NoSuchElementException
count += 1
@@ -250,14 +361,14 @@ def crawlForum(driver):
print(link, e)
i += 1
- print("Crawling the ThiefWorld market done.")
+ input("Crawling Bohemia Market done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
def isDescriptionLink(url):
- if 'product' in url:
+ if bool(re.search(r'\blisting\b', url)): # accurate with bohemia
return True
return False
@@ -266,7 +377,7 @@ def isDescriptionLink(url):
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
- if 'catalog' in url:
+ if bool(re.search(r'\blistings\b', url)): # accurate with bohemia
return True
return False
@@ -276,16 +387,16 @@ def isListingLink(url):
#return: list of description links that should be crawled through
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
- return thiefworld_links_parser(soup)
+ return bohemia_links_parser(soup)
# Drop links that "signout"
-# def isSignOut(url):
-# #absURL = urlparse.urljoin(url.base_url, url.url)
-# if 'signout' in url.lower() or 'logout' in url.lower():
-# return True
-#
-# return False
+def isSignOut(url):
+ #absURL = urlparse.urljoin(url.base_url, url.url)
+ if 'signout' in url.lower() or 'logout' in url.lower():
+ return True
+
+ return False
def crawler():
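The switch from substring checks to word-boundary regexes matters here: a Bohemia description URL contains the whole word "listing" while a listing URL contains "listings", so a plain "'listing' in url" test would match both. A small standalone sketch (example URLs are made up):

    import re

    def is_listing(url):
        # Listing pages contain the whole word 'listings'.
        return bool(re.search(r'\blistings\b', url))

    def is_description(url):
        # Description pages contain the whole word 'listing' (no trailing 's').
        return bool(re.search(r'\blisting\b', url))

    print(is_listing('http://example.onion/listings?page=1&catid=95'))   # True
    print(is_description('http://example.onion/listing/abc123'))         # True
    print(is_description('http://example.onion/listings?catid=95'))      # False: no word boundary after 'listing'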
diff --git a/MarketPlaces/DarkBazar/parser.py b/MarketPlaces/Bohemia/parser.py
similarity index 57%
rename from MarketPlaces/DarkBazar/parser.py
rename to MarketPlaces/Bohemia/parser.py
index 9386d18..7157722 100644
--- a/MarketPlaces/DarkBazar/parser.py
+++ b/MarketPlaces/Bohemia/parser.py
@@ -1,5 +1,7 @@
__author__ = 'DarkWeb'
+import re
+
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
@@ -11,7 +13,8 @@ from bs4 import BeautifulSoup
# stores info it needs in different lists, these lists are returned after being organized
# @param: soup object looking at html page of description page
# return: 'row' that contains a variety of lists that each hold info on the description page
-def darkbazar_description_parser(soup):
+def bohemia_description_parser(soup):
+
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
@@ -33,75 +36,83 @@ def darkbazar_description_parser(soup):
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
- image = "-1" # 19 Product_Image
- vendor_image = "-1" # 20 Vendor_Image
# Finding Product Name
- divmb = soup.findAll('div', {'class': "mb-1"})
-
- name = divmb[0].text
+ name = soup.find('h1', {"style": "margin: 0; margin-bottom: 0.5em;"}).text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
# Finding Vendor
- vendor = divmb[1].find('a').text.strip()
+ vendor = soup.find('div', {"class": "user-photo"}).find_next_sibling('a').text
+ vendor = vendor.strip()
# Finding Vendor Rating
- temp = soup.find('div', {'class': ""}).text
- temp = temp.split('(')
- rating = temp[0].replace("Vendor's Review : ", "")
- rating = rating.replace("%", "")
- rating_vendor = rating.strip()
-
- # Finding the Product Rating and Number of Product Reviews
- reviews = temp[2].replace(" review)", "")
- reviews = reviews.strip()
+ rating_vendor = soup.find('span', {'class': "user-percent"}).text.strip()
- temp = temp[1].split(")")
- rating = temp[1].replace("Product Review : ", "")
- rating = rating.replace("%", "")
- rating_item = rating.strip()
+ # Finding Users' Successful Transactions
+ temp = ''
+ success = soup.find('span', {'class': "smalltext shadow-text"}).text
+ temp = success.split("|")
+ success = str(temp[1])
+ success = success.strip()
# Finding Prices
- USD = soup.find('div', {'class': "h3 text-primary"}).text.strip()
+ prices = soup.find('div', {'class': "col-md-3 sidebar-navigation user-details"}
+ ).find('div', {'class': "container text-left"})
+ USD = prices.find('h1').text.strip()
+ BTC = prices.find('h1').find_next_sibling('h3').text
+ BTC = BTC.replace("BTC", "")
+ BTC = BTC.strip()
- # Finding the Product Category
- pmb = soup.findAll('p', {'class': "mb-1"})
+ detail_row = soup.find('div', {'class': "detail-container text-left"}).find_all('strong')
- category = pmb[-1].text
- category = category.replace("Category: ", "").strip()
+ # Finding the Product Category (there isn't a field for it on the page)
+ # category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Quantity Available
- left = divmb[-1].text
- left = left.split(",", 1)[1]
- left = left.replace("in stock", "")
- left = left.strip()
+ left = soup.find('div', {'class': "container detail-container text-left"})
+ left = left.find('div', {'class': "detail-row"}).text.replace('\n', '')
+ left = left.split("Available Stock:")
+ left = left[1].strip()
# Finding Number Sold
- sold = divmb[-1].text
- sold = sold.split(",", 1)[0]
- sold = sold.replace("sold", "")
+ sold = detail_row[0].find_parent()
+ sold = sold.text
+ sold = sold.replace("Total Sold:", "")
sold = sold.strip()
- # Finding Shipment Information (Origin)
- pmb[0].text
- shipFrom = shipFrom.replace("Ships from: ", "").strip()
-
- # Finding Shipment Information (Destination)
- pmb[1].text
- shipTo = shipTo.replace("Ships to: ", "").strip()
+ # Finding Shipment Information (Origin) (There is no shipping information)
+ '''if "Ships from:" in li[-2].text:
+ shipFrom = li[-2].text
+ shipFrom = shipFrom.replace("Ships from: ", "")
+ # shipFrom = shipFrom.replace(",", "")
+ shipFrom = shipFrom.strip()'''
+
+ # Finding Shipment Information (Destination) (no shipping info on the page)
+ '''shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
+ shipTo = shipTo.replace("Ships to: ", "")
+ shipTo = shipTo.strip()
+ if "certain countries" in shipTo:
+ countries = ""
+ tags = li[-1].find_all('span', {'class': "tag"})
+ for tag in tags:
+ country = tag.text.strip()
+ countries += country + ", "
+ shipTo = countries.strip(", ")'''
# Finding the Product description
- cardbody = soup.findAll('div', {'class': "card-body"})
- describe = cardbody[1].text.strip()
+ describe = soup.find('div', {'class': "container feedback-container"})
+ describe = describe.find_next_sibling('div', {'class': "container"}).find('p').text
+ describe = describe.replace("\n", " ")
+ describe = describe.strip()
- # Finding Product Image
- image = soup.find('div', {'class': 'product-primary'}).find('img')
- image = image.get('src')
- image = image.split('base64,')[-1]
+ # Finding the Number of Product Reviews
+ reviews = detail_row[2].find_parent().text
+ reviews = reviews.split("Based on")
+ reviews = reviews[1].replace("ratings)", "").strip()
- # Searching for CVE and MS categories
+ # Searching for CVE and MS categories (can't find them on the page)
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
@@ -121,7 +132,7 @@ def darkbazar_description_parser(soup):
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
+ BTC, USD, EURO, sold, left, shipFrom, shipTo)
# Sending the results
return row
@@ -131,17 +142,17 @@ def darkbazar_description_parser(soup):
# stores info it needs in different lists, these lists are returned after being organized
# @param: soup object looking at html page of listing page
# return: 'row' that contains a variety of lists that each hold info on the listing page
-def darkbazar_listing_parser(soup):
+def bohemia_listing_parser(soup):
# Fields to be parsed
- nm = 0 # *Total_Products (Should be Integer)
- mktName = "DarkBazar" # 0 *Marketplace_Name
+ nm = 0 # *Total_Products (Should be Integer)
+ mktName = "Bohemia" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
- CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
- MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
+ CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+ MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
@@ -152,21 +163,18 @@ def darkbazar_listing_parser(soup):
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
- qLeft = [] # 17 Product_QuantityLeft
+ qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
- image = [] # 20 Product_Image
- image_vendor = [] # 21 Vendor_Image
- href = [] # 22 Product_Links
+ href = [] # 20 Product_Links
- listing = soup.findAll('div', {"id": "itembox"})
+ listing = soup.findAll('div', {"class": "product-link"})
# Populating the Number of Products
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
- lb = a.findAll('div', {"id": "littlebox"})
# Adding the url to the list of urls
link = bae[0].get('href')
@@ -174,64 +182,60 @@ def darkbazar_listing_parser(soup):
href.append(link)
# Finding the Product
- product = lb[1].find('a').text
+ product = bae[0].text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.replace("...", "")
product = product.strip()
name.append(product)
- # Finding Product Image
- product_image = a.find('img')
- product_image = product_image.get('src')
- product_image = product_image.split('base64,')[-1]
- image.append(product_image)
+ bae = a.find('div', {'class': "container"})
# Finding Prices
- price = lb[-1].find('div', {"class": "mb-1"}).text
- price = price.replace("$","")
- price = price.strip()
- USD.append(price)
+ price = bae.find('div', {'class': "product-price"}).find('h2').text
+ ud = price.replace("USD", " ")
+ # u = ud.replace("$","")
+ ud = ud.replace(",", "")
+ ud = ud.strip()
+ USD.append(ud)
+ bc = bae.find('div', {'class': "product-price"}).find('span', {'class': "shadow-text smalltext boldtext"}).text
+ bc = bc.replace("\n", "")
+ bc = bc.split()
+ bc = bc[0].replace("BTC", "").strip()
+ BTC.append(bc)
# Finding the Vendor
- vendor_name = lb[-1].find("a").text
- vendor_name = vendor_name.replace(",", "")
+ vendor_name = bae.find('b').find('a').text
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
- image_vendor.append("-1")
-
# Finding the Category
- cat = lb[-1].find("span").text
- cat = cat.replace("class:", "")
+ cat = bae.find('span', {'class': "shadow-text smalltext"}).find('strong').text
cat = cat.strip()
category.append(cat)
- span = lb[0].findAll("span")
-
- # Finding Number of Views
- num = span[0].text
- num = num.replace("views:", "")
+ # Finding Number Sold and Quantity Left
+ num = bae.find('div', {'class': "product-details-bottom"}).find('span').text
+ num = num.replace("Sold", "")
+ num = num.replace("times in total", "")
num = num.strip()
sold.append(num)
- # Finding Number Sold
- num = span[2].text
- num = num.replace("Sold:", "")
- num = num.strip()
- sold.append(num)
-
- # Finding Quantity Left
- quant = span[1].text
- quant = quant.replace("stock:", "")
- quant = quant.strip()
+ quant = bae.find('div', {'class': "product-price"}).text
+ quant = quant.replace("\n", "")
+ quant = quant.split("Available")
+ quant = quant[0].replace("Autoship", "").strip()
qLeft.append(quant)
- # add shipping information
- ship = lb[2].findAll('small')[1].findAll('span')[1].text.split("->")
- shipFrom.append(ship[0].replace("Ship from ", "").strip())
- shipTo.append(ship[1].replace("to ", "").strip())
+ # Finding Successful Transactions
+ freq = bae.find('div', {'title': "Total Sales"}).find_parent().text.replace("\n", "")
+ freq = freq.strip().split()
+ freq = freq[-1].strip()
+ success.append(freq)
+ # find vendor rating
+ rate = bae.find('b').find('strong').text.strip()
+ rating_vendor.append(rate)
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
@@ -262,28 +266,24 @@ def darkbazar_listing_parser(soup):
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
- reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
+ reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
# called by the crawler to get description links on a listing page
# @param: beautifulsoup object that is using the correct html page (listing page)
# return: list of description links from a listing page
-def darkbazar_links_parser(soup):
+def bohemia_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
- listing = soup.findAll('div', {"id": "itembox"})
- # for a in listing:
- # bae = a.find('a', {"class": "text-info"}, href=True)
- # link = bae['href']
- # href.append(link)
+ temp = soup.find('div', {"class": "col-md-9 sidebar-content-right listing-content"})
+ temp = temp.find('div', {"class": "product-listing"})
+ listing = temp.findAll('div', {"class": "product-heading"})
for a in listing:
- bae = a.findAll('a', href=True)
-
- # Adding the url to the list of urls
- link = bae[0].get('href')
+ bae = a.find('a', href=True)
+ link = bae['href']
href.append(link)
return href
\ No newline at end of file
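On the crawler side, productPages() hands each saved page to bohemia_links_parser and joins the returned hrefs against the base URL before driver.get(). A minimal sketch of that hand-off, with a hypothetical saved listing page standing in for driver.page_source:

    import urllib.parse as urlparse
    from bs4 import BeautifulSoup
    # Assumed import path, matching the rename above.
    from MarketPlaces.Bohemia.parser import bohemia_links_parser

    baseURL = 'http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/'

    with open('bohemia_listing.html', 'r', encoding='utf-8') as f:  # hypothetical saved page
        soup = BeautifulSoup(f.read(), 'html.parser')

    for item in bohemia_links_parser(soup):
        itemURL = urlparse.urljoin(baseURL, str(item))
        print(itemURL)  # what crawlForum() would pass to driver.get()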
diff --git a/MarketPlaces/DarkBazar/crawler_selenium.py b/MarketPlaces/DarkBazar/crawler_selenium.py
deleted file mode 100644
index fdfb640..0000000
--- a/MarketPlaces/DarkBazar/crawler_selenium.py
+++ /dev/null
@@ -1,262 +0,0 @@
-__author__ = 'DarkWeb'
-
-'''
-DarkBazar Marketplace Crawler (Selenium)
-'''
-
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support.ui import Select
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.common.by import By
-
-from PIL import Image
-import urllib.parse as urlparse
-import os, re, time
-from datetime import date
-import subprocess
-import configparser
-from bs4 import BeautifulSoup
-from MarketPlaces.Initialization.prepare_parser import new_parse
-from MarketPlaces.DarkBazar.parser import darkbazar_links_parser
-from MarketPlaces.Utilities.utilities import cleanHTML
-
-counter = 1
-baseURL = 'http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/'
-
-
-def startCrawling():
- mktName = getMKTName()
- driver = getAccess()
-
- if driver != 'down':
- try:
- login(driver)
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closeDriver(driver)
-
- new_parse(mktName, baseURL, True)
-
-
-# Returns the name of the website
-def getMKTName():
- name = 'DarkBazar'
- return name
-
-
-# Return the base link of the website
-def getFixedURL():
- url = 'http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/'
- return url
-
-
-# Closes Tor Browser
-def closeDriver(driver):
- # global pid
- # os.system("taskkill /pid " + str(pro.pid))
- # os.system("taskkill /t /f /im tor.exe")
- print('Closing Tor...')
- driver.close()
- time.sleep(3)
- return
-
-
-# Creates FireFox 'driver' and configure its 'Profile'
-# to use Tor proxy and socket
-def createFFDriver():
- from MarketPlaces.Initialization.markets_mining import config
-
- ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
- ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- # ff_prof.set_preference("network.dns.disablePrefetch", True)
- # ff_prof.set_preference("network.http.sendRefererHeader", 0)
- ff_prof.set_preference("permissions.default.image", 3)
- ff_prof.set_preference("browser.download.folderList", 2)
- ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
- ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
- ff_prof.set_preference('network.proxy.type', 1)
- ff_prof.set_preference("network.proxy.socks_version", 5)
- ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
- ff_prof.set_preference('network.proxy.socks_port', 9150)
- ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", False)
- ff_prof.update_preferences()
-
- service = Service(config.get('TOR', 'geckodriver_path'))
-
- driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
- driver.maximize_window()
-
- return driver
-
-
-#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
-def getAccess():
- url = getFixedURL()
- driver = createFFDriver()
- try:
- driver.get(url)
- return driver
- except:
- driver.close()
- return 'down'
-
-
-def login(driver):
- input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
-
- # entering username and password into input boxes
- usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="username"]')
- # Username here
- usernameBox.send_keys('aliciamykeys')
- passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="password"]')
- # Password here
- passwordBox.send_keys('aliciawherearemykey$')
- # session time
- session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[4]/div/div[2]/select'))
- session_select.select_by_visible_text('Session 60min')
-
- input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n")
-
- # wait for listing page show up (This Xpath may need to change based on different seed url)
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, '//*[@id="submit"]')))
-
-
-def savePage(driver, page, url):
- cleanPage = cleanHTML(driver, page)
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- open(filePath, 'wb').write(cleanPage.encode('utf-8'))
- return
-
-
-def getFullPathName(url):
- from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
- else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
- return fullPath
-
-
-def getMKTName() -> str:
- name = 'DarkBazar'
- return name
-
-
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if name == '':
- name = str(counter)
- counter = counter + 1
- return name
-
-
-def getInterestedLinks():
- links = []
-
- # Digital Goods
- links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=3')
- # Services
- links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=5')
-
- return links
-
-
-def crawlForum(driver):
-
- print("Crawling the DarkBazar market")
-
- linksToCrawl = getInterestedLinks()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
- try:
- has_next_page = True
- count = 0
-
- while has_next_page:
- try:
- driver.get(link)
- except:
- driver.refresh()
- html = driver.page_source
- savePage(driver, html, link)
-
- list = productPages(html)
-
- for item in list:
- itemURL = urlparse.urljoin(baseURL, str(item))
- try:
- driver.get(itemURL)
- except:
- driver.refresh()
- savePage(driver, driver.page_source, item)
- driver.back()
-
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
-
- try:
- link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
- if link == "":
- raise NoSuchElementException
- count += 1
-
- except NoSuchElementException:
- has_next_page = False
-
- except Exception as e:
- print(link, e)
- i += 1
-
- print("Crawling the DarkBazar market done.")
-
-
-# Returns 'True' if the link is Topic link, may need to change for every website
-def isDescriptionLink(url):
- if 'item' in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link, may need to change for every website
-def isListingLink(url):
- if 'category=' in url:
- return True
- return False
-
-
-def productPages(html):
- soup = BeautifulSoup(html, "html.parser")
- return darkbazar_links_parser(soup)
-
-
-def crawler():
- startCrawling()
diff --git a/MarketPlaces/DarkMatter/crawler_selenium.py b/MarketPlaces/DarkMatter/crawler_selenium.py
deleted file mode 100644
index b75eea5..0000000
--- a/MarketPlaces/DarkMatter/crawler_selenium.py
+++ /dev/null
@@ -1,284 +0,0 @@
-__author__ = 'Helium'
-
-'''
-DarkMatter Marketplace Crawler (Selenium)
-Crawler works, but it slow since there is a speed check for clicking
-'''
-
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.common.by import By
-
-from PIL import Image
-import urllib.parse as urlparse
-import os, re, time
-from datetime import date
-import subprocess
-import configparser
-from bs4 import BeautifulSoup
-from MarketPlaces.Initialization.prepare_parser import new_parse
-from MarketPlaces.DarkMatter.parser import darkmatter_links_parser
-from MarketPlaces.Utilities.utilities import cleanHTML
-
-counter = 1
-baseURL = 'http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/'
-
-
-# Opens Tor Browser, crawls the website, then parses, then closes tor
-#acts like the main method for the crawler, another function at the end of this code calls this function later
-def startCrawling():
- mktName = getMKTName()
- driver = getAccess()
-
- if driver != 'down':
- try:
- login(driver)
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closeDriver(driver)
-
- new_parse(mktName, baseURL, True)
-
-
-# Returns the name of the website
-#return: name of site in string type
-def getMKTName():
- name = 'DarkMatter'
- return name
-
-
-# Return the base link of the website
-#return: url of base site in string type
-def getFixedURL():
- url = 'http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/'
- return url
-
-
-# Closes Tor Browser
-#@param: current selenium driver
-def closeDriver(driver):
- # global pid
- # os.system("taskkill /pid " + str(pro.pid))
- # os.system("taskkill /t /f /im tor.exe")
- print('Closing Tor...')
- driver.close()
- time.sleep(3)
- return
-
-
-# Creates FireFox 'driver' and configure its 'Profile'
-# to use Tor proxy and socket
-def createFFDriver():
- from MarketPlaces.Initialization.markets_mining import config
-
- ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
- ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- #ff_prof.set_preference("network.dns.disablePrefetch", True)#connection issue
- #ff_prof.set_preference("network.http.sendRefererHeader", 0)#connection issue
- ff_prof.set_preference("permissions.default.image", 3)
- ff_prof.set_preference("browser.download.folderList", 2)
- ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
- ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
- ff_prof.set_preference('network.proxy.type', 1)
- ff_prof.set_preference("network.proxy.socks_version", 5)
- ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
- ff_prof.set_preference('network.proxy.socks_port', 9150)
- ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", False)
- ff_prof.update_preferences()
-
- service = Service(config.get('TOR', 'geckodriver_path'))
-
- driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
- driver.maximize_window()
-
- return driver
-
-
-#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
-#return: return the selenium driver or string 'down'
-def getAccess():
- url = getFixedURL()
- driver = createFFDriver()
- try:
- driver.get(url)
- return driver
- except:
- driver.close()
- return 'down'
-
-
-# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha
-# then allows for manual solving of captcha in the terminal
-#@param: current selenium web driver
-def login(driver):
- input("Press ENTER when CAPTCHA is completed and page is loaded\n")
- # wait for page to show up (This Xpath may need to change based on different seed url)
-
-# Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(driver, page, url):
- cleanPage = cleanHTML(driver, page)
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- open(filePath, 'wb').write(cleanPage.encode('utf-8'))
- return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-#@param: raw url as crawler crawls through every site
-def getFullPathName(url):
- from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
- else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
- return fullPath
-
-
-# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
-#@param: raw url as crawler crawls through every site
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if (name == ''):
- name = str(counter)
- counter = counter + 1
- return name
-
-
-# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
-#in this example, there are a couple of categories some threads fall under such as
-# Guides and Tutorials, Digital Products, and Software and Malware
-#as you can see they are categories of products
-def getInterestedLinks():
- links = []
-
- # digital fraud software
- links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=76')
- # legit
- links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=78')
- # hack guides
- links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=94')
- # services
- links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=117')
- # software/malware
- links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=121')
-
- return links
-
-
-# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
-#topic and description pages are crawled through here, where both types of pages are saved
-#@param: selenium driver
-def crawlForum(driver):
- print("Crawling the DarkMatter market")
-
- linksToCrawl = getInterestedLinks()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
- try:
- has_next_page = True
- count = 0
-
- while has_next_page:
- try:
- driver.get(link)
- except:
- driver.refresh()
- html = driver.page_source
- savePage(driver, html, link)
-
- list = productPages(html)
- for item in list:
- itemURL = urlparse.urljoin(baseURL, str(item))
- try:
- time.sleep(3) # to keep from detecting click speed
- driver.get(itemURL)
- except:
- driver.refresh()
- savePage(driver, driver.page_source, item)
- time.sleep(3) # to keep from detecting click speed
- driver.back()
-
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
-
- try:
- link = driver.find_element(by=By.LINK_TEXT, value=">").get_attribute('href')
- if link == "":
- raise NoSuchElementException
- count += 1
-
- except NoSuchElementException:
- has_next_page = False
-
- except Exception as e:
- print(link, e)
- i += 1
-
- print("Crawling the DarkMatter market done.")
-
-
-# Returns 'True' if the link is a description link
-#@param: url of any url crawled
-#return: true if is a description page, false if not
-def isDescriptionLink(url):
- if 'products/' in url and '/products/?category' not in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link
-#@param: url of any url crawled
-#return: true if is a Listing page, false if not
-def isListingLink(url):
- if '?category' in url:
- return True
- return False
-
-
-# calling the parser to define the links, the html is the url of a link from the list of interested link list
-#@param: link from interested link list ie. getInterestingLinks()
-#return: list of description links that should be crawled through
-def productPages(html):
- soup = BeautifulSoup(html, "html.parser")
- return darkmatter_links_parser(soup)
-
-
-# Drop links that "signout"
-# def isSignOut(url):
-# #absURL = urlparse.urljoin(url.base_url, url.url)
-# if 'signout' in url.lower() or 'logout' in url.lower():
-# return True
-#
-# return False
-
-
-def crawler():
- startCrawling()
- # print("Crawling and Parsing BestCardingWorld .... DONE!")
diff --git a/MarketPlaces/DarkMatter/parser.py b/MarketPlaces/DarkMatter/parser.py
deleted file mode 100644
index 2a681bc..0000000
--- a/MarketPlaces/DarkMatter/parser.py
+++ /dev/null
@@ -1,261 +0,0 @@
-__author__ = 'DarkWeb'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from MarketPlaces.Utilities.utilities import *
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup
-
-
-#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
-#stores info it needs in different lists, these lists are returned after being organized
-#@param: soup object looking at html page of description page
-#return: 'row' that contains a variety of lists that each hold info on the description page
-def darkmatter_description_parser(soup):
-
- # Fields to be parsed
-
- vendor = "-1" # 0 *Vendor_Name
- success = "-1" # 1 Vendor_Successful_Transactions
- rating_vendor = "-1" # 2 Vendor_Rating
- name = "-1" # 3 *Product_Name
- describe = "-1" # 4 Product_Description
- CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
- category = "-1" # 7 Product_Category
- views = "-1" # 8 Product_Number_Of_Views
- reviews = "-1" # 9 Product_Number_Of_Reviews
- rating_item = "-1" # 10 Product_Rating
- addDate = "-1" # 11 Product_AddedDate
- BTC = "-1" # 12 Product_BTC_SellingPrice
- USD = "-1" # 13 Product_USD_SellingPrice
- EURO = "-1" # 14 Product_EURO_SellingPrice
- sold = "-1" # 15 Product_QuantitySold
- left = "-1" # 16 Product_QuantityLeft
- shipFrom = "-1" # 17 Product_ShippedFrom
- shipTo = "-1" # 18 Product_ShippedTo
- image = "-1" # 19 Product_Image
- vendor_image = "-1" # 20 Vendor_Image
-
- # 0 *Vendor_Name
- try:
- temp = soup.find('table', {'class', 'vtable'})
- temp = temp.findAll('tr')
- temp2 = temp[3].find('a').text
- vendor = cleanString(temp2.strip())
- except:
- temp = soup.find('table', {'class', 'vtable'})
- temp = temp.findAll('tr')
- temp2 = temp[4].find('a').text
- vendor = cleanString(temp2.strip())
-
- # product name
- name = soup.find('div', {'class', 'title-h2'}).text
- name = cleanString(name.strip())
-
- #product description
- temp = soup.find('pre', {'class', 'description'}).text
- temp = temp.replace('\n', ' ')
- describe = cleanString(temp.strip())
-
- #product category
- try:
- temp = soup.find('table', {'class', 'vtable'})
- temp = temp.findAll('tr')
- temp2 = temp[4].find('th').text
- temp2 = cleanString(temp2)
- if (temp2 == "Category"):
- temp2 = temp[4].find('a').text
- category = cleanString(temp2.strip())
- except:
- temp = soup.find('table', {'class', 'vtable'})
- temp = temp.findAll('tr')
- temp2 = temp[5].find('th').text
- temp2 = cleanString(temp2.strip)
- if (temp2 == "Category"):
- temp2 = temp[5].find('a').text
- category = cleanString(temp2.strip())
-
- # usd
- temp = soup.find('table', {'class', 'vtable'})
- temp = temp.findAll('tr')
- temp2 = temp[1].find('td').text
- temp2 = temp2.replace(' USD', '')
- USD = cleanString(temp2)
-
- # 15 Product_QuantitySold
- temp = soup.find('table', {'class', 'vtable'})
- temp = temp.findAll('tr')
- temp2 = temp[5].find('th').text
- temp2 = cleanString(temp2)
- temp3 = temp[6].find('th').text
- temp3 = cleanString(temp3)
- if (temp2 == "Sold"):
- temp2 = temp[5].find('td').text
- sold = cleanString(temp2.strip())
- elif (temp3 == "Sold"):
- temp2 = temp[6].find('td').text
- sold = cleanString(temp2.strip())
-
- # Finding Product Image
- image = soup.find('td', {"class": "vtop"}).find('img')
- if image is not None:
- image = image.get('src').split('base64,')[-1]
- else:
- image = '-1'
-
- # Populating the final variable (this should be a list with all fields scraped)
- row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
-
- # Sending the results
- return row
-
-
-#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
-#stores info it needs in different lists, these lists are returned after being organized
-#@param: soup object looking at html page of listing page
-#return: 'row' that contains a variety of lists that each hold info on the listing page
-def darkmatter_listing_parser(soup):
-
- # Fields to be parsed
-
- nm = 0 # *Total_Products (Should be Integer)
- mktName = "DarkMatter" # 0 *Marketplace_Name
- vendor = [] # 1 *Vendor y
- rating = [] # 2 Vendor_Rating
- success = [] # 3 Vendor_Successful_Transactions
- name = [] # 4 *Product_Name y
- CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = [] # 6 Product_MS_Classification (Microsoft Security)
- category = [] # 7 Product_Category y
- describe = [] # 8 Product_Description
- views = [] # 9 Product_Number_Of_Views
- reviews = [] # 10 Product_Number_Of_Reviews
- rating_item = [] # 11 Product_Rating
- addDate = [] # 12 Product_AddDate
- BTC = [] # 13 Product_BTC_SellingPrice
- USD = [] # 14 Product_USD_SellingPrice y
- EURO = [] # 15 Product_EURO_SellingPrice
- sold = [] # 16 Product_QuantitySold
- qLeft =[] # 17 Product_QuantityLeft
- shipFrom = [] # 18 Product_ShippedFrom
- shipTo = [] # 19 Product_ShippedTo
- image = [] # 20 Product_Image
- image_vendor = [] # 21 Vendor_Image
- href = [] # 22 Product_Links
-
- names = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", "colspan": "3"})
- left = soup.find('div', {"class": "content"}).findAll('table', {"class": "vtable"})
- right = soup.find('div', {"class": "content"}).findAll('td', {"class": "vtop centered"})
- images = soup.find('div', {"class": "content"}).findAll('td', {"class": "vcentered"})
-
- # vtop centered
- count = 0
- # Populating the Number of Products
- nm = len(names)
-
- for a in names:
- # product name
- temp = a.find('a').text
- if ("pcs x " in temp):
- index = temp.index("pcs x ")
- result = temp[index + len("pcs x "):]
- name.append(cleanString(result))
- elif("pks x " in temp):
- index = temp.index("pks x ")
- result = temp[index + len("pks x "):]
- name.append(cleanString(result))
- elif ("job x " in temp):
- index = temp.index("job x ")
- result = temp[index + len("job x "):]
- name.append(cleanString(result))
-
- CVE.append("-1")
- MS.append("-1")
-
- temp2 = left[count].findAll('tr')
-
- length_2 = len(temp2) - 1
-
- # category
- temp = temp2[1].find('td').text
- category.append(cleanString(temp.strip()))
-
- describe.append("-1")
- #escrow.append("-1")
- views.append("-1")
- reviews.append("-1")
- addDate.append("-1")
- #lastSeen.append("-1")
- BTC.append("-1")
- image_vendor.append("-1")
-
- # usd
- temp3 = right[count*2].find('span').text
- temp = temp3.replace(' USD', '')
- USD.append(cleanString(temp))
-
- EURO.append("-1")
-
- # 14 Product_QuantitySold
- temp3 = temp2[length_2].find('th').text
- temp3 = cleanString(temp3)
- if (temp3 == "Sold:"):
- temp = temp2[length_2].find('td').text
- sold.append(cleanString(temp.strip()))
- else:
- sold.append("-1")
-
- qLeft.append("-1")
- shipFrom.append("-1")
-
- # ship to
- temp3 = temp2[length_2].find('th').text
- temp3 = cleanString(temp3)
- if (temp3 == "Ship To:"):
- temp = temp2[length_2].find('td').text
- shipTo.append(cleanString(temp.strip()))
- else:
- shipTo.append("-1")
-
- # vendor
- temp = temp2[0].find('a').text
- vendor.append(cleanString(temp.strip()))
-
- # add product rating (stars)
- rating.append("-1")
- success.append("-1")
-
- temp = a.find('a').get('href')
- href.append(temp)
-
- # Finding Product Image
- image = images[count*2].find('img').get('src')
- image = image.split('base64,')[-1]
-
- count += 1
-
- rating_item.append("-1")
-
- # Populate the final variable (this should be a list with all fields scraped)
- return organizeProducts(mktName, nm, vendor, rating, success, name, CVE, MS, category, describe, views,
- reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
-
-
-#called by the crawler to get description links on a listing page
-#@param: beautifulsoup object that is using the correct html page (listing page)
-#return: list of description links from a listing page
-def darkmatter_links_parser(soup):
-
- # Returning all links that should be visited by the Crawler
-
- href = []
- listing = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", 'colspan': '3'})
-
- for a in listing:
- bae = a.find('a', href=True)
- link = bae['href']
- href.append(link)
-
- return href
\ No newline at end of file
diff --git a/MarketPlaces/DigitalThriftShop/crawler_selenium.py b/MarketPlaces/DigitalThriftShop/crawler_selenium.py
deleted file mode 100644
index 163e135..0000000
--- a/MarketPlaces/DigitalThriftShop/crawler_selenium.py
+++ /dev/null
@@ -1,286 +0,0 @@
-__author__ = 'Helium'
-
-'''
-DigitalThriftShop Marketplace Crawler (Selenium)
-'''
-
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.common.by import By
-
-from PIL import Image
-import urllib.parse as urlparse
-import os, re, time
-from datetime import date
-import subprocess
-import configparser
-from bs4 import BeautifulSoup
-from MarketPlaces.Initialization.prepare_parser import new_parse
-
-from MarketPlaces.DigitalThriftShop.parser import digitalthriftshop_links_parser
-from MarketPlaces.Utilities.utilities import cleanHTML
-
-counter = 1
-baseURL = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/'
-
-
-# Opens Tor Browser, crawls the website, then parses, then closes tor
-#acts like the main method for the crawler, another function at the end of this code calls this function later
-def startCrawling():
- mktName = getMKTName()
- driver = getAccess()
-
- if driver != 'down':
- try:
- login(driver)
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closeDriver(driver)
-
- new_parse(mktName, baseURL, True)
-
-
-# Returns the name of the website
-#return: name of site in string type
-def getMKTName():
- name = 'DigitalThriftShop'
- return name
-
-
-# Return the base link of the website
-#return: url of base site in string type
-def getFixedURL():
- url = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/'
- return url
-
-
-# Closes Tor Browser
-#@param: current selenium driver
-def closeDriver(driver):
- # global pid
- # os.system("taskkill /pid " + str(pro.pid))
- # os.system("taskkill /t /f /im tor.exe")
- print('Closing Tor...')
- driver.close()
- time.sleep(3)
- return
-
-
-# Creates FireFox 'driver' and configure its 'Profile'
-# to use Tor proxy and socket
-def createFFDriver():
- from MarketPlaces.Initialization.markets_mining import config
-
- ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
- ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
- ff_prof.set_preference("permissions.default.image", 3)
- ff_prof.set_preference("browser.download.folderList", 2)
- ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
- ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
- ff_prof.set_preference('network.proxy.type', 1)
- ff_prof.set_preference("network.proxy.socks_version", 5)
- ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
- ff_prof.set_preference('network.proxy.socks_port', 9150)
- ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", False)
- ff_prof.update_preferences()
-
- service = Service(config.get('TOR', 'geckodriver_path'))
-
- driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
- driver.maximize_window()
-
- return driver
-
-
-#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
-#return: return the selenium driver or string 'down'
-def getAccess():
- url = getFixedURL()
- driver = createFFDriver()
- try:
- driver.get(url)
- return driver
- except:
- driver.close()
- return 'down'
-
-
-# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha
-# then allows for manual solving of captcha in the terminal
-#@param: current selenium web driver
-def login(driver):
- # wait for page to show up (This Xpath may need to change based on different seed url)
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.ID, "woocommerce_product_categories-2")))
-
-# Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(driver, page, url):
- cleanPage = cleanHTML(driver, page)
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- open(filePath, 'wb').write(cleanPage.encode('utf-8'))
- return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-#@param: raw url as crawler crawls through every site
-def getFullPathName(url):
- from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
- else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
- return fullPath
-
-
-# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
-#@param: raw url as crawler crawls through every site
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if (name == ''):
- name = str(counter)
- counter = counter + 1
- return name
-
-
-# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
-#in this example, there are a couple of categories some threads fall under such as
-# Guides and Tutorials, Digital Products, and Software and Malware
-#as you can see they are categories of products
-def getInterestedLinks():
- links = []
-
- # Apps
- links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/apps/')
- # Books
- links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/books/')
- # Bot nets
- links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/botnets/')
- # ransomware
- links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/ransomware/')
- # rats
- links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/rats/')
- # scripts
- links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/scripts/')
-
- return links
-
-
-# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
-#topic and description pages are crawled through here, where both types of pages are saved
-#@param: selenium driver
-def crawlForum(driver):
- print("Crawling the DigitalThriftShop market")
-
- linksToCrawl = getInterestedLinks()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
- try:
- has_next_page = True
- count = 0
-
- while has_next_page:
- try:
- driver.get(link)
- except:
- driver.refresh()
- html = driver.page_source
- savePage(driver, html, link)
-
- list = productPages(html)
- for item in list:
- itemURL = urlparse.urljoin(baseURL, str(item))
- try:
- driver.get(itemURL)
- except:
- driver.refresh()
- savePage(driver, driver.page_source, item)
- driver.back()
-
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
-
- try:
- nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/div[2]/main/div[1]/nav')
- link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='→').get_attribute('href')
- if link == "":
- raise NoSuchElementException
- count += 1
-
- except NoSuchElementException:
- has_next_page = False
-
- except Exception as e:
- print(link, e)
- i += 1
-
- print("Crawling the DigitalThriftShop market done.")
-
-
-# Returns 'True' if the link is a description link
-#@param: url of any url crawled
-#return: true if is a description page, false if not
-def isDescriptionLink(url):
- if 'product/' in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link
-#@param: url of any url crawled
-#return: true if is a Listing page, false if not
-def isListingLink(url):
- if 'product-' in url:
- return True
- return False
-
-
-# calling the parser to define the links, the html is the url of a link from the list of interested link list
-#@param: link from interested link list ie. getInterestingLinks()
-#return: list of description links that should be crawled through
-def productPages(html):
- soup = BeautifulSoup(html, "html.parser")
- return digitalthriftshop_links_parser(soup)
-
-
-# Drop links that "signout"
-# def isSignOut(url):
-# #absURL = urlparse.urljoin(url.base_url, url.url)
-# if 'signout' in url.lower() or 'logout' in url.lower():
-# return True
-#
-# return False
-
-
-def crawler():
- startCrawling()
- # print("Crawling and Parsing BestCardingWorld .... DONE!")
diff --git a/MarketPlaces/DigitalThriftShop/parser.py b/MarketPlaces/DigitalThriftShop/parser.py
deleted file mode 100644
index 8a4126c..0000000
--- a/MarketPlaces/DigitalThriftShop/parser.py
+++ /dev/null
@@ -1,173 +0,0 @@
-__author__ = 'DarkWeb'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from MarketPlaces.Utilities.utilities import *
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup, ResultSet, Tag
-
-
-#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
-#stores info it needs in different lists, these lists are returned after being organized
-#@param: soup object looking at html page of description page
-#return: 'row' that contains a variety of lists that each hold info on the description page
-def digitalThriftShop_description_parser(soup: Tag):
-
- # Fields to be parsed
-
- vendor = "-1" # 0 *Vendor_Name
- success = "-1" # 1 Vendor_Successful_Transactions
- rating_vendor = "-1" # 2 Vendor_Rating
- name = "-1" # 3 *Product_Name
- describe = "-1" # 4 Product_Description
- CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
- category = "-1" # 7 Product_Category
- views = "-1" # 8 Product_Number_Of_Views
- reviews = "-1" # 9 Product_Number_Of_Reviews
- rating_item = "-1" # 10 Product_Rating
- addDate = "-1" # 11 Product_AddedDate
- BTC = "-1" # 12 Product_BTC_SellingPrice
- USD = "-1" # 13 Product_USD_SellingPrice
- EURO = "-1" # 14 Product_EURO_SellingPrice
- sold = "-1" # 15 Product_QuantitySold
- left = "-1" # 16 Product_QuantityLeft
- shipFrom = "-1" # 17 Product_ShippedFrom
- shipTo = "-1" # 18 Product_ShippedTo
- image = "-1" # 19 Product_Image
- vendor_image = "-1" # 20 Vendor_Image
-
-
- product_name = soup.find("h1", {"class": "product_title entry-title"}).text
- name = cleanString(product_name.strip())
-
- product_description = soup.find("div", {"id": "tab-description"}).find("p").text
- describe = cleanString(product_description.strip())
-
- # Finding Product Image
- image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img')
- image = image.get('src').split('base64,')[-1]
-
- product_category = soup.find("span", {"class": "posted_in"}).find("a").text
- category = cleanString(product_category.strip())
-
- product_rating: Tag = soup.find("div", {"class": "woocommerce-product-rating"})
- if product_rating is not None:
- rating_item = product_rating.find("strong", {"class": "rating"}).text
- reviews = product_rating.find("span", {"class": "rating"}).text
-
- product_BTC = soup.find("div", {"id": "price-BTC"}).find("span", {"class": "priceinfo cw-noselect"}).text
- BTC = cleanString(product_BTC.strip())
-
- product_USD = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text
- USD = cleanString(product_USD.replace("$", "").strip())
-
- # Populating the final variable (this should be a list with all fields scraped)
- row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
-
- # Sending the results
- return row
-
-
-#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
-#stores info it needs in different lists, these lists are returned after being organized
-#@param: soup object looking at html page of listing page
-#return: 'row' that contains a variety of lists that each hold info on the listing page
-def digitalThriftShop_listing_parser(soup: Tag):
-
- # Fields to be parsed
- nm = 0 # *Total_Products (Should be Integer)
- mktName = "DigitalThriftShop" # 0 *Marketplace_Name
- vendor = [] # 1 *Vendor y
- rating_vendor = [] # 2 Vendor_Rating
- success = [] # 3 Vendor_Successful_Transactions
- name = [] # 4 *Product_Name y
- CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = [] # 6 Product_MS_Classification (Microsoft Security)
- category = [] # 7 Product_Category y
- describe = [] # 8 Product_Description
- views = [] # 9 Product_Number_Of_Views
- reviews = [] # 10 Product_Number_Of_Reviews
- rating_item = [] # 11 Product_Rating
- addDate = [] # 12 Product_AddDate
- BTC = [] # 13 Product_BTC_SellingPrice
- USD = [] # 14 Product_USD_SellingPrice y
- EURO = [] # 15 Product_EURO_SellingPrice
- sold = [] # 16 Product_QuantitySold
- qLeft =[] # 17 Product_QuantityLeft
- shipFrom = [] # 18 Product_ShippedFrom
- shipTo = [] # 19 Product_ShippedTo
- image = [] # 20 Product_Image
- image_vendor = [] # 21 Vendor_Image
- href = [] # 22 Product_Links
-
- product_category = soup.find("h1", {"class": "woocommerce-products-header__title page-title"}).text
-
- products_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-5"}).find_all("li")
-
- for product in products_list:
- nm += 1
- vendor.append(mktName)
- rating_vendor.append("-1")
- success.append("-1")
-
- product_name = product.find("h2", {"class": "woocommerce-loop-product__title"}).text
- name.append(cleanString(product_name.strip()))
-
- # Finding Product Image
- product_image = product.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'})
- product_image = product_image.get('src').split('base64,')[-1]
- image.append(product_image)
-
- CVE.append("-1")
- MS.append("-1")
- category.append(cleanString(product_category.strip()))
- describe.append("-1")
- views.append("-1")
- reviews.append("-1")
- image_vendor.append("-1")
-
- try:
- product_rating = product.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text
- rating_item.append(cleanString(product_rating.strip()))
- except:
- rating_item.append("-1")
-
- addDate.append("-1")
- BTC.append("-1")
-
- product_USD = product.find("span", {"class": "price"}).text
- USD.append(product_USD.replace("$", "").strip())
-
- EURO.append("-1")
- sold.append("-1")
- qLeft.append("-1")
- shipFrom.append("-1")
- shipTo.append("-1")
-
- product_href = product.find("a", {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}).get("href")
- href.append(cleanString(product_href.strip()))
-
-
- # Populate the final variable (this should be a list with all fields scraped)
- return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
- reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
-
-
-#called by the crawler to get description links on a listing page
-#@param: beautifulsoup object that is using the correct html page (listing page)
-#return: list of description links from a listing page
-def digitalthriftshop_links_parser(soup):
-
- # Returning all links that should be visited by the Crawler
-
- href = []
- listing = soup.find('ul', {"class": "products columns-5"}).findAll('li')
-
- for a in listing:
- bae = a.find('a', href=True)
- link = bae['href']
- href.append(link)
-
- return href
\ No newline at end of file
diff --git a/MarketPlaces/HiddenMarket/parser.py b/MarketPlaces/HiddenMarket/parser.py
deleted file mode 100644
index eb36a5b..0000000
--- a/MarketPlaces/HiddenMarket/parser.py
+++ /dev/null
@@ -1,288 +0,0 @@
-__author__ = 'DarkWeb'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from MarketPlaces.Utilities.utilities import *
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup
-
-
-# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
-def hiddenmarket_description_parser(soup):
- # Fields to be parsed
-
- vendor = "-1" # 0 *Vendor_Name
- success = "-1" # 1 Vendor_Successful_Transactions
- rating_vendor = "-1" # 2 Vendor_Rating
- name = "-1" # 3 *Product_Name
- describe = "-1" # 4 Product_Description
- CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
- category = "-1" # 7 Product_Category
- views = "-1" # 8 Product_Number_Of_Views
- reviews = "-1" # 9 Product_Number_Of_Reviews
- rating_item = "-1" # 10 Product_Rating
- addDate = "-1" # 11 Product_AddedDate
- BTC = "-1" # 12 Product_BTC_SellingPrice
- USD = "-1" # 13 Product_USD_SellingPrice
- EURO = "-1" # 14 Product_EURO_SellingPrice
- sold = "-1" # 15 Product_QuantitySold
- left = "-1" # 16 Product_QuantityLeft
- shipFrom = "-1" # 17 Product_ShippedFrom
- shipTo = "-1" # 18 Product_ShippedTo
- image = "-1" # 19 Product_Image
- vendor_image = "-1" # 20 Vendor_Image
-
- bae = soup.find('div', {'class': "main"})
-
- # Finding Product Name
- name = bae.find('div', {'class': "heading"}).text
- name = name.replace('\n', ' ')
- name = name.replace(",", "")
- name = name.strip()
-
- mb = bae.find('div', {'class': "information"}).findAll('tr')
-
- # Finding Vendor
- vendor = mb[1].find('a').text
- vendor = vendor.replace(",", "")
- vendor = vendor.strip()
-
- # # Finding Vendor Rating
- # full_stars = bae[2].find_all('i', {'class': "fas fa-star"})
- # half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"})
- # rating = len(full_stars) + (0.5 if half_star is not None else 0)
-
- # Finding Quantity Left
- temp = mb[-3].text
- left = temp.replace("Quantity in stock:", "")
- left = left.strip()
-
- # Finding USD
- USD = mb[0].text
- USD = USD.replace("Price:", "")
- USD = USD.replace("USD", "")
- USD = USD.strip()
-
- # Finding BTC
- # temp = bae.find('div', {"class": "small"}).text.split("BTC")
-
- # BTC = temp[0].strip()
-
- # Finding Shipment Information (Origin)
- shipFrom = mb[2].text
- shipFrom = shipFrom.replace("Seller location:", "")
- shipFrom = shipFrom.strip()
-
- # Finding Shipment Information (Destination)
- shipTo = mb[3].text
- shipTo = shipTo.replace("Ships to (seller):", "")
- shipTo = shipTo.strip()
-
- # Finding the Product description
- describe = bae.find('div', {"class": "twotabs"}).find('div', {'class': "tab1"}).text
- describe = cleanString(describe.strip())
-
- # Finding Product Image
- image = soup.find('div', {"class": "thumbnails"}).find('img', {"class": "bigthumbnail"})
- image = image.get('src').split('base64,')[-1]
-
- # Finding the Product Category
- category = mb[-4].text
- category = category.replace("Category:", "")
- category = category.strip()
-
- #Finding the number of reviews
- reviews = bae.find_all('div', {'class': "heading"})
- reviews = reviews[-2].text
- reviews = reviews.replace("Comments (", "")
- reviews = reviews.replace(")", "")
-
- # Searching for CVE and MS categories
- cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
- if cve:
- CVE = " "
- for idx in cve:
- CVE += (idx)
- CVE += " "
- CVE = CVE.replace(',', ' ')
- CVE = CVE.replace('\n', '')
- ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
- if ms:
- MS = " "
- for im in ms:
- MS += (im)
- MS += " "
- MS = MS.replace(',', ' ')
- MS = MS.replace('\n', '')
-
- # Populating the final variable (this should be a list with all fields scraped)
- row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
-
- # Sending the results
- return row
-
-
-# This is the method to parse the Listing Pages
-def hiddenmarket_listing_parser(soup):
- # Fields to be parsed
- nm = 0 # *Total_Products (Should be Integer)
- mktName = "HiddenMarket" # 0 *Marketplace_Name
- vendor = [] # 1 *Vendor y
- rating_vendor = [] # 2 Vendor_Rating
- success = [] # 3 Vendor_Successful_Transactions
- name = [] # 4 *Product_Name y
- CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = [] # 6 Product_MS_Classification (Microsoft Security)
- category = [] # 7 Product_Category y
- describe = [] # 8 Product_Description
- views = [] # 9 Product_Number_Of_Views
- reviews = [] # 10 Product_Number_Of_Reviews
- rating_item = [] # 11 Product_Rating
- addDate = [] # 12 Product_AddDate
- BTC = [] # 13 Product_BTC_SellingPrice
- USD = [] # 14 Product_USD_SellingPrice y
- EURO = [] # 15 Product_EURO_SellingPrice
- sold = [] # 16 Product_QuantitySold
- qLeft = [] # 17 Product_QuantityLeft
- shipFrom = [] # 18 Product_ShippedFrom
- shipTo = [] # 19 Product_ShippedTo
- image = [] # 20 Product_Image
- image_vendor = [] # 21 Vendor_Image
- href = [] # 22 Product_Links
-
- listing = soup.findAll('div', {"class": "item"})
-
- # Populating the Number of Products
- nm = len(listing)
-
- # Finding Category
- cat = soup.find("div", {'class': "heading"}).text
- cat = cat.replace(",", "")
- cat = cat.strip()
-
- for card in listing:
-
- category.append(cat)
-
- # Adding the url to the list of urls
- link = card.find_all('a')
- link = link[1].get('href')
-
- href.append(link)
-
- # Finding Product Name
- product = card.find('div', {'class': "title"})
- product = product.text
- product = product.replace('\n', ' ')
- product = product.replace(",", "")
- product = product.strip()
- name.append(product)
-
- # Finding Product Image
- image.append("-1")
-
- # Finding Vendor
- vendor_name = card.find('div', {"class": "seller"}).text
- vendor_name = vendor_name.replace(",", "")
- vendor_name = vendor_name.strip()
- vendor.append(vendor_name)
-
- image_vendor.append("-1")
-
- # Finding USD
- usd = card.find('div', {"class": "buttons"}).find('div', {'class': "price"}).text
- usd = usd.replace("USD", "")
- usd = usd.strip()
- USD.append(usd)
-
- tb = card.find("div", {"class": "stats"})
- tb = tb.find_all('td')
-
- # Finding Reviews
- num = tb[-1].text
- num = num.strip()
- reviews.append(num)
-
- # Finding Views
- view = tb[-3].text.strip()
- views.append(view)
-
- # Finding Num of Sales
- sale = tb[-2].text.strip()
- sold.append(sale)
-
- # Finding Item Rating
- if num == '0':
- item_rating = '-1'
- else:
- item_rating = card.find('div', {'class': 'stats'}).find('div', {'class': "stars2"})
- item_rating = item_rating.get('style')
- item_rating = item_rating.replace("width:", "")
- item_rating = item_rating.replace("%", "")
- rating_item.append(item_rating)
-
-
- # Finding shipping info
- shipping = card.find('div', {'class': "shipping"}).text.split('>')
- # SHip from
- origin = shipping[0].strip()
- shipFrom.append(origin)
- #Ship to
- destination = shipping[1].strip()
- shipTo.append(destination)
-
- # Finding description (site only shows partial description on listing pages)
- # description = card.next_sibling.find('div', {'class': "description"}).text
- # description = description.replace("\n", " ")
- # description = description.replace("\r", " ")
- # description = description.replace("-", " ")
- # description = description.strip()
- # describe.append(description)
-
- # Searching for CVE and MS categories
- cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
- if not cve:
- cveValue = "-1"
- else:
- cee = " "
- for idx in cve:
- cee += (idx)
- cee += " "
- cee = cee.replace(',', ' ')
- cee = cee.replace('\n', '')
- cveValue = cee
- CVE.append(cveValue)
-
- ms = card.findAll(text=re.compile('MS\d{2}-\d{3}'))
- if not ms:
- MSValue = "-1"
- else:
- me = " "
- for im in ms:
- me += (im)
- me += " "
- me = me.replace(',', ' ')
- me = me.replace('\n', '')
- MSValue = me
- MS.append(MSValue)
-
- # Populate the final variable (this should be a list with all fields scraped)
- return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
- reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
-
-
-def hiddenmarket_links_parser(soup):
- # Returning all links that should be visited by the Crawler
- href = []
-
- listing = soup.findAll('div', {"class": "item"})
-
- for div in listing:
- link = div.findAll('a')
- link = link[1]
- link = link['href']
- href.append(link)
-
- return href
diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py
index 5ec07b6..f85b46c 100644
--- a/MarketPlaces/Initialization/markets_mining.py
+++ b/MarketPlaces/Initialization/markets_mining.py
@@ -6,28 +6,14 @@ Starting point of the Darkweb Markets Mining
from datetime import *
from MarketPlaces.DarkFox.crawler_selenium import crawler as crawlerDarkFox
-from MarketPlaces.Tor2door.crawler_selenium import crawler as crawlerTor2door
-from MarketPlaces.ThiefWorld.crawler_selenium import crawler as crawlerThiefWorld
-from MarketPlaces.TorBay.crawler_selenium import crawler as crawlerTorBay
-from MarketPlaces.LionMarketplace.crawler_selenium import crawler as crawlerLionMarketplace
-from MarketPlaces.TorMarket.crawler_selenium import crawler as crawlerTorMarket
from MarketPlaces.MikesGrandStore.crawler_selenium import crawler as crawlerMikesGrandStore
from MarketPlaces.DarkTor.crawler_selenium import crawler as crawlerDarkTor
-from MarketPlaces.DigitalThriftShop.crawler_selenium import crawler as crawlerDigitalThriftShop
from MarketPlaces.AnonymousMarketplace.crawler_selenium import crawler as crawlerAnonymousMarketplace
-from MarketPlaces.Apocalypse.crawler_selenium import crawler as crawlerApocalypseMarketplace
from MarketPlaces.CityMarket.crawler_selenium import crawler as crawlerCityMarket
-from MarketPlaces.DarkMatter.crawler_selenium import crawler as crawlerDarkMatter
from MarketPlaces.M00nkeyMarket.crawler_selenium import crawler as crawlerM00nkeyMarket
from MarketPlaces.ViceCity.crawler_selenium import crawler as crawlerViceCity
-from MarketPlaces.HiddenMarket.crawler_selenium import crawler as crawlerHiddenMarket
-from MarketPlaces.RobinhoodMarket.crawler_selenium import crawler as crawlerRobinhoodMarket
-from MarketPlaces.Nexus.crawler_selenium import crawler as crawlerNexus
from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher
-from MarketPlaces.DarkBazar.crawler_selenium import crawler as crawlerDarkBazar
from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar
-from MarketPlaces.AnonMarket.crawler_selenium import crawler as crawlerAnonMarket
-from MarketPlaces.MetaVerseMarket.crawler_selenium import crawler as crawlerMetaVerse
import configparser
import os
@@ -105,49 +91,21 @@ if __name__ == '__main__':
# if crawlerDarkFox(base["url"], base["categories"]):
# break
crawlerDarkFox()
- elif mkt == 'Tor2door':
- crawlerTor2door()
- elif mkt == "ThiefWorld":
- crawlerThiefWorld()
- elif mkt == "TorBay":
- crawlerTorBay()
- elif mkt == "LionMarketplace":
- crawlerLionMarketplace()
- elif mkt == "TorMarket":
- crawlerTorMarket()
elif mkt == "MikesGrandStore":
crawlerMikesGrandStore()
elif mkt == "DarkTor":
crawlerDarkTor()
- elif mkt == "DigitalThriftShop":
- crawlerDigitalThriftShop()
elif mkt == "AnonymousMarketplace":
crawlerAnonymousMarketplace()
- elif mkt == "Apocalypse":
- crawlerApocalypseMarketplace()
elif mkt == "CityMarket":
crawlerCityMarket()
- elif mkt == "DarkMatter":
- crawlerDarkMatter()
elif mkt == "M00nkeyMarket":
crawlerM00nkeyMarket()
elif mkt == "ViceCity":
crawlerViceCity()
- elif mkt == "HiddenMarket":
- crawlerHiddenMarket()
- elif mkt == "RobinhoodMarket":
- crawlerRobinhoodMarket()
- elif mkt == "Nexus":
- crawlerNexus()
elif mkt == "CypherMarketplace":
crawlerCypher()
- elif mkt == "DarkBazar":
- crawlerDarkBazar()
elif mkt == "PabloEscobarMarket":
crawlerPabloEscobar()
- elif mkt == "AnonMarket":
- crawlerAnonMarket()
- elif mkt == "MetaVerseMarket":
- crawlerMetaVerse()
print("\nScraping process completed!")
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index 9cfe2a9..e075541 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -9,26 +9,12 @@ from psycopg2.extras import RealDictCursor
from MarketPlaces.DB_Connection.db_connection import *
from MarketPlaces.DarkFox.parser import *
-from MarketPlaces.Tor2door.parser import *
-from MarketPlaces.Apocalypse.parser import *
-from MarketPlaces.ThiefWorld.parser import *
from MarketPlaces.AnonymousMarketplace.parser import *
from MarketPlaces.ViceCity.parser import *
-from MarketPlaces.TorBay.parser import *
from MarketPlaces.M00nkeyMarket.parser import *
-from MarketPlaces.DarkMatter.parser import *
-from MarketPlaces.DigitalThriftShop.parser import *
-from MarketPlaces.LionMarketplace.parser import *
-from MarketPlaces.TorMarket.parser import *
-from MarketPlaces.HiddenMarket.parser import *
-from MarketPlaces.RobinhoodMarket.parser import *
-from MarketPlaces.Nexus.parser import *
from MarketPlaces.MikesGrandStore.parser import *
-from MarketPlaces.DarkBazar.parser import *
from MarketPlaces.PabloEscobarMarket.parser import *
-from MarketPlaces.AnonMarket.parser import *
from MarketPlaces.CityMarket.parser import *
-from MarketPlaces.MetaVerseMarket.parser import *
from MarketPlaces.Classifier.classify_product import predict
@@ -132,46 +118,18 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
if marketPlace == "DarkFox":
rw = darkfox_listing_parser(soup)
- elif marketPlace == "Tor2door":
- rw = tor2door_listing_parser(soup)
- elif marketPlace == "Apocalypse":
- rw = apocalypse_listing_parser(soup)
- elif marketPlace == "ThiefWorld":
- rw = thiefWorld_listing_parser(soup)
elif marketPlace == "AnonymousMarketplace":
rw = anonymousMarketplace_listing_parser(soup)
elif marketPlace == "ViceCity":
rw = vicecity_listing_parser(soup)
- elif marketPlace == "TorBay":
- rw = torbay_listing_parser(soup)
elif marketPlace == "M00nkeyMarket":
rw = m00nkey_listing_parser(soup)
- elif marketPlace == "HiddenMarket":
- rw = hiddenmarket_listing_parser(soup)
- elif marketPlace == "DarkMatter":
- rw = darkmatter_listing_parser(soup)
- elif marketPlace == "DigitalThriftShop":
- rw = digitalThriftShop_listing_parser(soup)
- elif marketPlace == "LionMarketplace":
- rw = lionmarketplace_listing_parser(soup)
- elif marketPlace == "TorMarket":
- rw = tormarket_listing_parser(soup)
- elif marketPlace == "RobinhoodMarket":
- rw = Robinhood_listing_parser(soup)
- elif marketPlace == "Nexus":
- rw = nexus_listing_parser(soup)
elif marketPlace == "MikesGrandStore":
rw = mikesGrandStore_listing_parser(soup)
- elif marketPlace == "DarkBazar":
- rw = darkbazar_listing_parser(soup)
elif marketPlace == "PabloEscobarMarket":
rw = pabloescobarmarket_listing_parser(soup)
- elif marketPlace == "AnonMarket":
- rw = AnonMarket_listing_parser(soup)
elif marketPlace == "CityMarket":
rw = city_listing_parser(soup)
- elif marketPlace == "MetaVerseMarket":
- rw = metaversemarket_listing_parser(soup)
else:
print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
raise Exception
@@ -194,46 +152,18 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
if marketPlace == "DarkFox":
rmm = darkfox_description_parser(soup)
- elif marketPlace == "Tor2door":
- rmm = tor2door_description_parser(soup)
- elif marketPlace == "Apocalypse":
- rmm = apocalypse_description_parser(soup)
- elif marketPlace == "ThiefWorld":
- rmm = thiefWorld_description_parser(soup)
elif marketPlace == "AnonymousMarketplace":
rmm = anonymousMarketplace_description_parser(soup)
elif marketPlace == "ViceCity":
rmm = vicecity_description_parser(soup)
- elif marketPlace == "TorBay":
- rmm = torbay_description_parser(soup)
elif marketPlace == "M00nkeyMarket":
rmm = m00nkey_description_parser(soup)
- elif marketPlace == "HiddenMarket":
- rmm = hiddenmarket_description_parser(soup)
- elif marketPlace == "DarkMatter":
- rmm = darkmatter_description_parser(soup)
- elif marketPlace == "DigitalThriftShop":
- rmm = digitalThriftShop_description_parser(soup)
- elif marketPlace == "LionMarketplace":
- rmm = lionmarketplace_description_parser(soup)
- elif marketPlace == "TorMarket":
- rmm = tormarket_description_parser(soup)
- elif marketPlace == "RobinhoodMarket":
- rmm = Robinhood_description_parser(soup)
- elif marketPlace == "Nexus":
- rmm = nexus_description_parser(soup)
elif marketPlace == "MikesGrandStore":
rmm = mikesGrandStore_description_parser(soup)
- elif marketPlace == "DarkBazar":
- rmm = darkbazar_description_parser(soup)
elif marketPlace == "PabloEscobarMarket":
rmm = pabloescobarmarket_description_parser(soup)
- elif marketPlace == "AnonMarket":
- rmm = AnonMarket_description_parser(soup)
elif marketPlace == "CityMarket":
rmm = city_description_parser(soup)
- elif marketPlace == "MetaVerseMarket":
- rmm = metaversemarket_description_parser(soup)
else:
print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
raise Exception
diff --git a/MarketPlaces/Kingdom/crawler_mechanize.py b/MarketPlaces/Kingdom/crawler_mechanize.py
new file mode 100644
index 0000000..9a680a8
--- /dev/null
+++ b/MarketPlaces/Kingdom/crawler_mechanize.py
@@ -0,0 +1,325 @@
+__author__ = '91Shadows'
+
+'''
+DarkFox Marketplace Crawler (Mechanize)
+'''
+
+import codecs
+import socks, socket, time
+from datetime import date
+import urllib.parse as urlparse
+import http.client as httplib
+import mechanize
+import os
+import subprocess
+from bs4 import BeautifulSoup
+from MarketPlaces.Initialization.prepare_parser import new_parse
+from MarketPlaces.DarkFox.parser import darkfox_links_parser
+
+counter = 1
+httplib.HTTPConnection._http_vsn = 10
+httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
+baseURL = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
+socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)
+
+
+# Opens Tor Browser, crawls the mkt
+def startCrawling():
+
+ opentor()
+ getUrl()
+ url = getFixedURL()
+ mktName = getMKTName()
+ credentials = getCredentials()
+ br = getAccess(url, credentials)
+
+ if br != 'down':
+ crawlMkt(url, br)
+ #new_parse(mktName, False)
+
+ closetor()
+
+
+#Opens Tor Browser
+def opentor():
+ global pid
+ print("Connecting Tor...")
+ path = open('../../path.txt').readline()
+ pro = subprocess.Popen(path)
+ pid = pro.pid
+ time.sleep(5)
+ input("Tor Connected. Press ENTER to continue\n")
+ return
+
+
+# Creates a connection through Tor Port
+def getUrl(timeout=None):
+ socket.socket = socks.socksocket
+ socket.create_connection = create_connection
+ return
+
+
+# Makes the onion address request
+def create_connection(address, timeout=None, source_address=None):
+ sock = socks.socksocket()
+ sock.connect(address)
+ return sock
+
+
+# Returns the name of the mkt (DarkFox)
+def getMKTName():
+ name = 'DarkFox'
+ return name
+
+
+# Returns credentials needed for the mkt
+def getCredentials():
+ credentials = 'blank blank blank blank cap 0'
+ return credentials
+
+
+# Return the link of the mkt (DarkFox Link)
+def getFixedURL():
+ url = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
+ return url
+
+
+# Closes Tor Browser
+def closetor():
+ global pid
+ os.system("taskkill /pid " + str(pid))
+ print('Closing Tor...')
+ time.sleep(3)
+ return
+
+
+# Creates a Mechanize browser and initializes its options
+def createBrowser():
+ br = mechanize.Browser()
+ cj = mechanize.CookieJar()
+ br.set_cookiejar(cj)
+
+ # Browser options
+    br.set_handle_equiv(True)
+    br.set_handle_redirect(True)
+    br.set_handle_referer(True)
+    br.set_handle_robots(False)
+    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
+
+ br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
+ ('Accept', '*/*')]
+
+ return br
+
+
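+# Opens the login page, saves the captcha image to captcha.png, asks for the solution in the
+# terminal, and resubmits the form until the browser leaves the login URL; returns 'down' on failure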
+def getAccess(loginPage, credentials):
+
+ logInName = credentials.split()[0]
+ userName = credentials.split()[1]
+ logInPass = credentials.split()[2]
+ password = credentials.split()[3]
+ captchaName = credentials.split()[4]
+ formId = credentials.split()[5]
+
+ br = createBrowser()
+
+ try:
+ keepTrying = True
+ while (keepTrying):
+
+ br.open(loginPage)
+ time.sleep(7)
+ html = br.response()
+            soup = BeautifulSoup(html, "html.parser")
+ image_tags = soup.findAll('div', {"class": "imgWrap"})
+ captchaLink = image_tags[0]
+ imagelink = captchaLink['style'].split('url(')[1][:-1]
+ data = br.open(imagelink).read()
+ br.back()
+ open('captcha.png', "wb").write(data)
+ '''
+ subprocess.Popen("python capt.py", shell=False)
+ time.sleep(61)
+ captchaAnswerFile = open("answer.txt", "r")
+ captchaAnswer = captchaAnswerFile.read().__str__()
+ '''
+ captchaAnswer = input('Please provide me with captcha : ')
+ formIndex = int(formId)
+ br.select_form(nr=formIndex)
+ #br[logInName] = userName
+ #br[logInPass] = password
+ br[captchaName] = captchaAnswer.__str__()
+ br.submit()
+ if br.geturl() != 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/':
+ keepTrying = False
+
+ return br
+
+ except:
+
+ return 'down'
+
+
+# Saves the crawled html page
+def savePage(page, url):
+ filePath = getFullPathName(url)
+ os.makedirs(os.path.dirname(filePath), exist_ok=True)
+ a = page.read()
+ open(filePath, "wb").write(a)
+ return
+
+
+# Gets the full path of the page to be saved along with its appropriate file name
+def getFullPathName(url):
+ fileName = getNameFromURL(url)
+ if isDescriptionLink(url):
+ fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
+ "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
+ "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
+ else:
+ fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
+ "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
+ "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
+ return fullPath
+
+
+# Creates the name of the file based on URL
+def getNameFromURL(url):
+ global counter
+ name = ''.join(e for e in url if e.isalnum())
+ if (name == ''):
+ name = str(counter)
+ counter = counter + 1
+ return name
+
+
+# Hacking and Markets related topics
+def getInterestedLinks():
+ links = []
+
+ # Guides and Tutorials
+ links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/30739153-1fcd-45cd-b919-072b439c6e06')
+ # Digital Products
+ links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
+ # Software and Malware
+ links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
+ # Services
+ links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
+ # Miscellaneous
+ links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb')
+ # Hosting and Security
+ links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14')
+
+ # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing1.html')
+ # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing2.html')
+
+ return links
+
+
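+# Crawls each link of interest, saving the listing page and, as a testing limit, only the first
+# two product (description) pages of each listing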
+def crawlMkt(url, br):
+
+ print("Crawling the DarkFox marketplace")
+
+ linksToCrawl = getInterestedLinks()
+ visited = set(linksToCrawl)
+ initialTime = time.time()
+
+ i = 0
+ while i < len(linksToCrawl):
+ link = linksToCrawl[i]
+ print('Crawling :', link)
+ try :
+ page = br.open(link)
+ savePage(page, link)
+ for l in br.links():
+ absURL = urlparse.urljoin(l.base_url, l.url)
+ if absURL not in visited and not isSignOut(absURL) and isListingLink(absURL):
+ visited.add(absURL)
+
+ #disabling the process of finding other links
+ #linksToCrawl.append(absURL)
+
+ # crawler asks parser to get links of ALL products on ALL listing pages
+ list = productPages(link)
+ j = 0
+ for item in list:
+ if j == 2:
+ break
+ #itemURL = baseURL + str(item)
+ try:
+ #itemPage = br.open(itemURL)
+ itemPage = br.open(item)
+ savePage(itemPage, item)
+ except:
+ #print 'Error in page: ', itemURL
+ print('Error in page: ', item)
+ j+=1
+
+ except Exception as e:
+            print(link, e)
+ i += 1
+
+ #finalTime = time.time()
+ #print finalTime - initialTime
+
+ input("Crawling DarkFox marketplace done sucessfully. Press ENTER to continue\n")
+
+ return
+
+
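+# Returns 'True' if the link is a product (description) page link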
+def isDescriptionLink(url):
+ if 'product' in url:
+ return True
+ return False
+
+
+# Returns True if the link is a listingPage link
+def isListingLink(url):
+ if 'category' in url:
+ return True
+ return False
+
+
+# calling the parser to define the links
+def productPages(url):
+
+ soup = ""
+
+ error = False
+ try:
+ html = codecs.open(
+ r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
+ "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
+ "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html', encoding='utf8')
+ soup = BeautifulSoup(html, "html.parser")
+ except:
+ try:
+ html = open(
+ r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
+ "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
+ "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html')
+ soup = BeautifulSoup(html, "html.parser")
+ except:
+ error = True
+ print("There was a problem to read the file " + getNameFromURL(url) + " in the listing section.")
+
+ if error:
+ return []
+ else:
+ return darkfox_links_parser(soup)
+
+
+# Drop links that "signout"
+def isSignOut(url):
+ #absURL = urlparse.urljoin(url.base_url, url.url)
+ if 'signout' in url.lower() or 'logout' in url.lower():
+ return True
+
+ return False
+
+
+def crawler():
+ startCrawling()
+ #print "Crawling and Parsing Crypto .... DONE!"
diff --git a/MarketPlaces/Kingdom/crawler_selenium.py b/MarketPlaces/Kingdom/crawler_selenium.py
new file mode 100644
index 0000000..e6b489f
--- /dev/null
+++ b/MarketPlaces/Kingdom/crawler_selenium.py
@@ -0,0 +1,342 @@
+__author__ = 'DarkWeb'
+
+'''
+Kingdom Market Crawler (Selenium)
+'''
+
+from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
+from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
+from selenium.webdriver.firefox.service import Service
+from selenium.common.exceptions import TimeoutException
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support.ui import Select
+from PIL import Image
+import base64
+from io import BytesIO
+
+
+import urllib.parse as urlparse
+import os, re, time
+from datetime import date
+import subprocess
+from bs4 import BeautifulSoup
+from MarketPlaces.Initialization.prepare_parser import new_parse
+from MarketPlaces.Kingdom.parser import kingdom_links_parser
+from MarketPlaces.Utilities.utilities import cleanHTML
+
+counter = 1
+baseURL = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion'
+
+
+# Opens Tor Browser, crawls the website
+def startCrawling():
+ # marketName = getMarketName()
+ driver = getAccess()
+
+ if driver != 'down':
+ try:
+ captcha(driver)
+ login(driver)
+ crawlForum(driver)
+ except Exception as e:
+ print(driver.current_url, e)
+ closeDriver(driver)
+
+ # new_parse(marketName, False)
+
+
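+# Waits for the user to solve the site-entry CAPTCHA manually in the browser
+# (the automated screenshot/solve flow below is commented out), then waits for the login form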
+def captcha(driver):
+ '''
+ # wait for captcha page
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, "/html/body/div/div[1]")))
+
+ # save captcha to local
+ driver.find_element(by=By.XPATH, value='/html/body/div/div[2]').screenshot(
+ r'..\Kingdom\captcha1.png')
+
+ # This method will show image in any image viewer
+ im = Image.open(r'..\Kingdom\captcha1.png')
+ im.show()
+
+ iframes = driver.find_elements(by=By.TAG_NAME, value='iframe')
+
+ # ask user input captcha solution in terminal
+ print("Enter squares from smallest to largest (squares are numbered 1-9 left to right)")
+ for order in ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']:
+ id = input(f"{order}: ")
+ iframes[int(id)-1].click()
+ '''
+ input("Press ENTER when CAPTCHA is completed\n")
+
+ # wait for login page
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button")))
+
+
+# Login using premade account credentials and do login captcha manually
+def login(driver):
+ # wait for login page
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button")))
+
+ # entering username and password into input boxes
+ usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-name"]')
+ # Username here
+ usernameBox.send_keys('blabri')
+ passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-passwd"]')
+ # Password here
+ passwordBox.send_keys('fishowal')
+
+ select = Select(driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-sessiontime"]'))
+ select.select_by_visible_text('24 hours')
+
+ '''
+ # wait for captcha page show up
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, '//*[@id="captcha"]')))
+
+ # save captcha to local
+ driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Kingdom\captcha2.png')
+
+ # This method will show image in any image viewer
+ im = Image.open(r'..\Kingdom\captcha2.png')
+ im.show()
+
+ # wait until input space show up
+ inputBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-captcha"]')
+
+ # ask user input captcha solution in terminal
+ userIn = input("Enter solution: ")
+
+ # send user solution into the input space
+ inputBox.send_keys(userIn)
+
+ # click the verify(submit) button
+ driver.find_element(by=By.XPATH, value="/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button").click()
+ '''
+ input("Press ENTER when CAPTCHA is completed\n")
+
+ # wait for listing page show up (This Xpath may need to change based on different seed url)
+ WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
+ (By.XPATH, '/html/body/div/div/div[3]/div[2]')))
+
+
+# Returns the name of the website
+def getMarketName():
+ name = 'Kingdom'
+ return name
+
+
+# Return the link of the website
+def getFixedURL():
+ url = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion'
+
+ return url
+
+
+# Closes Tor Browser
+def closeDriver(driver):
+ # global pid
+ # os.system("taskkill /pid " + str(pro.pid))
+ # os.system("taskkill /t /f /im tor.exe")
+ print('Closing Tor...')
+ driver.close()
+ time.sleep(3)
+ return
+
+
+# Creates FireFox 'driver' and configure its 'Profile'
+# to use Tor proxy and socket
+def createFFDriver():
+ from MarketPlaces.Initialization.markets_mining import config
+
+ ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
+
+ ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
+ ff_prof.set_preference("places.history.enabled", False)
+ ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
+ ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
+ ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
+ ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
+ ff_prof.set_preference("signon.rememberSignons", False)
+ ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
+ ff_prof.set_preference("network.dns.disablePrefetch", True)
+ ff_prof.set_preference("network.http.sendRefererHeader", 0)
+ ff_prof.set_preference("permissions.default.image", 3)
+ ff_prof.set_preference("browser.download.folderList", 2)
+ ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
+ ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
+ ff_prof.set_preference('network.proxy.type', 1)
+ ff_prof.set_preference("network.proxy.socks_version", 5)
+ ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
+ ff_prof.set_preference('network.proxy.socks_port', 9150)
+ ff_prof.set_preference('network.proxy.socks_remote_dns', True)
+ ff_prof.set_preference("javascript.enabled", False)
+ ff_prof.update_preferences()
+
+ service = Service(config.get('TOR', 'geckodriver_path'))
+
+ driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+
+ driver.maximize_window()
+
+ return driver
+
+
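+# the driver 'gets' the url, attempting to access the site; returns the selenium driver or the string 'down'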
+def getAccess():
+ url = getFixedURL()
+ driver = createFFDriver()
+ try:
+ driver.get(url)
+ return driver
+ except:
+ driver.close()
+ return 'down'
+
+
+# Saves the crawled html page
+def savePage(driver, page, url):
+ cleanPage = cleanHTML(driver, page)
+ filePath = getFullPathName(url)
+ os.makedirs(os.path.dirname(filePath), exist_ok=True)
+ open(filePath, 'wb').write(cleanPage.encode('utf-8'))
+ return
+
+
+# Gets the full path of the page to be saved along with its appropriate file name
+def getFullPathName(url):
+ from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
+
+    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMarketName() + "/HTML_Pages")
+ fileName = getNameFromURL(url)
+ if isDescriptionLink(url):
+ fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
+ else:
+ fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
+ return fullPath
+
+
+# Creates the file name from passed URL
+def getNameFromURL(url):
+ global counter
+ name = ''.join(e for e in url if e.isalnum())
+ if (name == ''):
+ name = str(counter)
+ counter = counter + 1
+ return name
+
+
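+# Returns the list of category urls to crawl; only Software and Malware is currently enabled,
+# the remaining categories are commented out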
+def getInterestedLinks():
+ links = []
+
+ # Software and Malware
+ links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=c298a77d9e93ad32')
+ # # Services
+ # links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=c298a77d9e93ad32')
+ # # Exploits
+ # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=45')
+ # # Tools
+ # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=46')
+ # # Malware
+ # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=47')
+ # # Cryptography
+ # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=48')
+ # # Others
+ # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=49')
+ # # Hacking Tutorials
+ # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=50')
+ # # Hacked Accounts and Database Dumps
+ # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30')
+ # # Android Moded pak
+ # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53')
+
+ return links
+
+
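+# Iterates through the links of interest, saving each listing page and its product (description)
+# pages (the test-mode breaks below limit this to the first product), then follows the 'next'
+# pagination link until no further page exists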
+def crawlForum(driver):
+ print("Crawling the Kingdom market")
+
+ linksToCrawl = getInterestedLinks()
+
+ i = 0
+ while i < len(linksToCrawl):
+ link = linksToCrawl[i]
+ print('Crawling :', link)
+ try:
+ has_next_page = True
+ count = 0
+
+ while has_next_page:
+ try:
+ driver.get(link)
+ except:
+ driver.refresh()
+ html = driver.page_source
+ savePage(driver, html, link)
+
+ list = productPages(html)
+ for item in list:
+ itemURL = urlparse.urljoin(baseURL, str(item))
+ try:
+ driver.get(itemURL)
+ except:
+ driver.refresh()
+ savePage(driver, driver.page_source, item)
+ driver.back()
+
+ # comment out
+ break
+
+ # comment out
+ if count == 1:
+ break
+
+ try:
+ temp = driver.find_element(by=By.XPATH, value=
+ '/html/body/div/div/div[3]/div[2]/div[2]/div/div/ul')
+                    next = temp.find_element(by=By.CLASS_NAME, value="next")
+                    link = next.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
+ if link == "":
+ raise NoSuchElementException
+ count += 1
+
+ except NoSuchElementException:
+ has_next_page = False
+
+ except Exception as e:
+ print(link, e)
+ i += 1
+
+ input("Crawling Kingdom Market done sucessfully. Press ENTER to continue\n")
+
+
+# Returns 'True' if the link is Topic link
+def isDescriptionLink(url):
+ if 'view' in url:
+ return True
+ return False
+
+
+# Returns True if the link is a listingPage link
+def isListingLink(url):
+ if 'category' in url:
+ return True
+ return False
+
+
+# calling the parser to define the links
+def productPages(html):
+ soup = BeautifulSoup(html, "html.parser")
+ #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
+ return kingdom_links_parser(soup)
+
+
+def crawler():
+ startCrawling()
+ # print("Crawling and Parsing BestCardingWorld .... DONE!")
diff --git a/MarketPlaces/Kingdom/parser.py b/MarketPlaces/Kingdom/parser.py
new file mode 100644
index 0000000..b1e05d5
--- /dev/null
+++ b/MarketPlaces/Kingdom/parser.py
@@ -0,0 +1,188 @@
+__author__ = 'DarkWeb'
+
+# Here, we are importing the auxiliary functions to clean or convert data
+from MarketPlaces.Utilities.utilities import *
+
+# Here, we are importing BeautifulSoup to search through the HTML tree
+from bs4 import BeautifulSoup
+
+
+# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
+def kingdom_description_parser(soup):
+
+ # Fields to be parsed
+
+ vendor = "-1" # 0 *Vendor_Name
+ success = "-1" # 1 Vendor_Successful_Transactions
+ rating_vendor = "-1" # 2 Vendor_Rating
+ name = "-1" # 3 *Product_Name
+ describe = "-1" # 4 Product_Description
+ CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+ MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
+ category = "-1" # 7 Product_Category
+ views = "-1" # 8 Product_Number_Of_Views
+ reviews = "-1" # 9 Product_Number_Of_Reviews
+ rating_item = "-1" # 10 Product_Rating
+ addDate = "-1" # 11 Product_AddedDate
+ BTC = "-1" # 12 Product_BTC_SellingPrice
+ USD = "-1" # 13 Product_USD_SellingPrice
+ EURO = "-1" # 14 Product_EURO_SellingPrice
+ sold = "-1" # 15 Product_QuantitySold
+ left = "-1" # 16 Product_QuantityLeft
+ shipFrom = "-1" # 17 Product_ShippedFrom
+ shipTo = "-1" # 18 Product_ShippedTo
+
+ # Finding Product Name
+
+ tag = soup.find('div', {"class": "col-md-9"})
+
+ desc = tag.find('div',{"class": "col-md-8"}).find('div', {"class": "box-cont"})
+ name = tag.find('div',{"class": "col-md-8"}).find('div', {"class": "box-head"}).text
+ name = name.replace('\n', ' ')
+ name = name.replace(',', ' ')
+ name = name.strip()
+
+ # Finding Prices
+ # Kingdom prices can be shown in a variety of currencies, not all in USD, so keeping currency
+    rows = desc.find_all('div', {"class": "row"}, recursive=False)
+ price = rows[-1].find('div', {"class": "row"}).find('h3').text
+ price = price.replace(',', '')
+ price = price.strip()
+ # USD = price.replace("USD",'')
+ BTC = rows[-1].find('div', {"class": "row"}).find_next_sibling('div').find('span').text
+
+ # Finding Vendor
+ vendor = rows[0].select_one('a[href^="/user"]').text
+ vendor = vendor.replace(",", " ")
+ vendor = vendor.strip()
+
+    # Finding Shipment Information (Origin)
+ descs = rows[0].find_all('div', {"class": "col-md-3 text-right"})
+ shipFrom = descs[2].text
+ shipFrom = shipFrom.replace(",", "")
+ shipFrom = shipFrom.strip()
+
+    # Finding Shipment Information (Destination)
+ shipTo = rows[-1].find('div', {"class": "col-md-6"}).text
+ shipTo = shipTo.replace("Ship to:","")
+ shipTo = shipTo.replace(",","").strip()
+    if (shipTo == ''):
+        shipTo = "-1"
+
+ # Finding the Product Category
+ category = descs[0].text
+ category = category.replace(",", "")
+ category = category.strip()
+
+ # Finding the Product Quantity Available
+ left = descs[1].text
+ left = left.replace(",", "")
+ left = left.strip()
+
+ # Finding when the Product was Added
+ dt = descs[-1].text.strip()
+ addDate = datetime.strptime(dt, '%d.%m.%Y')
+
+ # Finding the Product description
+ describe = cleanString(soup.find('div', {"id": "descriptionContent"}).text)
+
+ # Finding the Number of Product Reviews
+    reviews = len(soup.find('div', {"id": "feedbackContent"}).find_all(recursive=False))
+
+ # Searching for CVE and MS categories
+ # no cve or ms in Kingdom
+
+ # Populating the final variable (this should be a list with all fields scraped)
+
+ row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
+ BTC, USD, EURO, sold, left, shipFrom, shipTo)
+
+ # Sending the results
+
+ return row
+
+
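+# This is the method to parse the Listing Pages (one page with many Products)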
+def kingdom_listing_parser(soup):
+
+ # Fields to be parsed
+ nm = 0 # *Total_Products (Should be Integer)
+ mktName = "Kingdom" # 0 *Marketplace_Name
+ vendor = [] # 1 *Vendor y
+ rating_vendor = [] # 2 Vendor_Rating
+ success = [] # 3 Vendor_Successful_Transactions
+ name = [] # 4 *Product_Name y
+ CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+ MS = [] # 6 Product_MS_Classification (Microsoft Security)
+ category = [] # 7 Product_Category y
+ describe = [] # 8 Product_Description
+ views = [] # 9 Product_Number_Of_Views
+ reviews = [] # 10 Product_Number_Of_Reviews
+ rating_item = [] # 11 Product_Rating
+ addDate = [] # 12 Product_AddDate
+ BTC = [] # 13 Product_BTC_SellingPrice
+ USD = [] # 14 Product_USD_SellingPrice y
+ EURO = [] # 15 Product_EURO_SellingPrice
+ sold = [] # 16 Product_QuantitySold
+ qLeft =[] # 17 Product_QuantityLeft
+ shipFrom = [] # 18 Product_ShippedFrom
+ shipTo = [] # 19 Product_ShippedTo
+ href = [] # 20 Product_Links
+
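+    # note: the lists above are index-aligned, so name[i], vendor[i], USD[i] and href[i] all describe the same product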
+ listing = soup.find('div', {"id": "p0"}).find('div').find_all('div', {"class": "row"}, recursive=False)
+
+ # Populating the Number of Products
+ nm = len(listing)
+
+ for a in listing:
+
+ # Finding Prices
+        # the USD list may hold prices in other currencies, so the currency marker is kept with each value
+ prices = a.find('div', {"class": "col-md-3"})
+ u = prices.find('h3').text
+ u = u.strip()
+ u = u.replace(',', '')
+ u = u.strip()
+ USD.append(u)
+ bc = prices.find('div').find('span').text
+ BTC.append(bc)
+
+ # Finding the Product
+ product = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]').text
+ product = product.replace('\n', ' ')
+ product = product.replace(","," ")
+ product = product.strip()
+ name.append(product)
+
+ # Finding the Vendor
+ vendor_name = a.select_one('a[href^="/user"]').text
+ vendor_name = vendor_name.replace(",", " ").replace('/', '')
+ vendor_name = vendor_name.strip()
+ vendor.append(vendor_name)
+
+ # Adding the url to the list of urls
+ link = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]')['href']
+ link = cleanLink(link)
+ href.append(link)
+
+ # Searching for CVE and MS categories
+ # cve and ms not in kingdom
+
+ # Populate the final variable (this should be a list with all fields scraped)
+ return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
+ reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+
+
+def kingdom_links_parser(soup):
+
+ # Returning all links that should be visited by the Crawler
+
+ href = []
+
+ listing = soup.findAll('div', {"class": "col-md-7"})
+
+ for a in listing:
+ link = a.select_one('a[href^="/offer/view?"]')
+ link = link['href']
+ href.append(link)
+
+ return href
\ No newline at end of file
diff --git a/MarketPlaces/LionMarketplace/parser.py b/MarketPlaces/LionMarketplace/parser.py
deleted file mode 100644
index 3b5dc27..0000000
--- a/MarketPlaces/LionMarketplace/parser.py
+++ /dev/null
@@ -1,235 +0,0 @@
-__author__ = 'Helium'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from MarketPlaces.Utilities.utilities import *
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup
-
-
-#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
-#stores info it needs in different lists, these lists are returned after being organized
-#@param: soup object looking at html page of description page
-#return: 'row' that contains a variety of lists that each hold info on the description page
-def lionmarketplace_description_parser(soup):
-
- # Fields to be parsed
-
- vendor = "-1" # 0 *Vendor_Name
- success = "-1" # 1 Vendor_Successful_Transactions
- rating_vendor = "-1" # 2 Vendor_Rating
- name = "-1" # 3 *Product_Name
- describe = "-1" # 4 Product_Description
- CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
- category = "-1" # 7 Product_Category
- views = "-1" # 8 Product_Number_Of_Views
- reviews = "-1" # 9 Product_Number_Of_Reviews
- rating_item = "-1" # 10 Product_Rating
- addDate = "-1" # 11 Product_AddedDate
- BTC = "-1" # 12 Product_BTC_SellingPrice
- USD = "-1" # 13 Product_USD_SellingPrice
- EURO = "-1" # 14 Product_EURO_SellingPrice
- sold = "-1" # 15 Product_QuantitySold
- left = "-1" # 16 Product_QuantityLeft
- shipFrom = "-1" # 17 Product_ShippedFrom
- shipTo = "-1" # 18 Product_ShippedTo
- image = "-1" # 19 Product_Image
- vendor_image = "-1" # 20 Vendor_Image
-
- # vendor name
- temp = soup.find('div', {'class': 'btn-group'}).find('a').text
- vendor = (cleanString(temp.strip()))
-
- # table with info
- table = soup.find('table')
- rows = table.findAll('tr')
-
- # vendor rating
- pos = soup.find('span', {"class": "fas fa-plus-circle text-success"}).parent.text
- pos = int(pos.strip())
- neu = soup.find('span', {"class": "fas fa-stop-circle text-secondary"}).parent.text
- neu = int(neu.strip())
- neg = soup.find('span', {"class": "fas fa-minus-circle text-danger"}).parent.text
- neg = int(neg.strip())
- total = pos + neu + neg
- if total > 0:
- rating_vendor = str((pos + 0.5*neu) / total)
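-    # e.g. 8 positive, 2 neutral and 0 negative ratings give a vendor rating of (8 + 0.5*2) / 10 = 0.9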
-
- # product name
- temp = soup.find('div', {'class', 'row'}).find('h2').text
- name = (cleanString(temp.strip()))
-
- # product description
- temp = soup.find('div', {'class': "mt-4"}).contents[-1]
- describe = cleanString(temp.strip())
-
- # Finding Product Image
- image = soup.find('div', {'id': 'slide-1'}).find('img')
- image = image.get('src')
- image = image.split('base64,')[-1]
-
- full = rows[0].findAll('i', {"class": "fas fa-star"})
- half = rows[0].find('i', {"class": "fas fa-star-half-alt"})
- rating_item = len(full)
- if half is not None:
- rating_item += 0.5
- rating_item = str(rating_item)
-
- # USD selling price
- temp = rows[2].find('strong').text
- if " $" in temp:
- temp = temp.replace(" $", "")
- elif "$" in temp:
- temp = temp.replace("$", "")
- USD = cleanString((temp.strip()))
-
- # product sold
- temp = rows[4].find('td')
- if temp is not None and cleanString(temp.text.strip()) == 'Left/Sold':
- temp = rows[4].findAll('td')
- temp = temp[1].findAll('span')
-
- # left
- sold = temp[1].text
- left = temp[0].text
-
- sold = cleanNumbers(sold.strip())
- left = cleanNumbers(left.strip())
- else:
- sold = '-1'
- left = "-1"
-
- # Populating the final variable (this should be a list with all fields scraped)
- row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
-
- # Sending the results
- return row
-
-
-#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
-#stores info it needs in different lists, these lists are returned after being organized
-#@param: soup object looking at html page of listing page
-#return: 'row' that contains a variety of lists that each hold info on the listing page
-def lionmarketplace_listing_parser(soup):
-
- # Fields to be parsed
- nm = 0 # *Total_Products (Should be Integer)
- mktName = "LionMarketplace" # 0 *Marketplace_Name
- vendor = [] # 1 *Vendor y
- rating_vendor = [] # 2 Vendor_Rating
- success = [] # 3 Vendor_Successful_Transactions
- name = [] # 4 *Product_Name y
- CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = [] # 6 Product_MS_Classification (Microsoft Security)
- category = [] # 7 Product_Category y
- describe = [] # 8 Product_Description
- views = [] # 9 Product_Number_Of_Views
- reviews = [] # 10 Product_Number_Of_Reviews
- rating_item = [] # 11 Product_Rating
- addDate = [] # 12 Product_AddDate
- BTC = [] # 13 Product_BTC_SellingPrice
- USD = [] # 14 Product_USD_SellingPrice y
- EURO = [] # 15 Product_EURO_SellingPrice
- sold = [] # 16 Product_QuantitySold
- qLeft =[] # 17 Product_QuantityLeft
- shipFrom = [] # 18 Product_ShippedFrom
- shipTo = [] # 19 Product_ShippedTo
- image = [] # 20 Product_Image
- image_vendor = [] # 21 Vendor_Image
- href = [] # 22 Product_Links
-
- listings = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"})
-
- # Populating the Number of Products
- nm = len(listings)
-
- for listing in listings:
-
- a = listing.find('div', {"class": "card-body"})
- row = a.findAll('p')
-
- # vendor
- temp = row[3].text
- temp = temp.replace("Vendor:", "")
- vendor.append(cleanString(temp.strip()))
-
- image_vendor.append("-1")
-
- # vendor rating
- rating_vendor.append("-1")
-
- # successful transactions CHECK AGAIN HERE
- success.append("-1")
-
- # product name
- temp = a.find('a').text
- name.append(cleanString(temp.strip()))
-
- # Finding Product Image
- product_image = listing.find('img', {'class': 'card-img-top rounded'})
- product_image = product_image.get('src')
- product_image = product_image.split('base64,')[-1]
- image.append(product_image)
-
- CVE.append('-1')
- MS.append('-1')
-
- # product category
- temp = row[1].text
- temp = temp.replace("Category: ", "")
- category.append(cleanString(temp.strip()))
-
- describe.append('-1')
-
- # product views
- vnum = listing.find('p', {"class": "position-absolute bg-primary opacity-60 text-white mt-4 mr-5 pr-1"}).text
- views.append(cleanNumbers(vnum.strip()))
-
- reviews.append('-1') # 10 Product_Number_Of_Reviews
- rating_item.append('-1') # 11 Product_Rating
- addDate.append('-1') # 12 Product_AddDate
-
- # BTC
- BTC.append('-1')
-
- # USD
- temp = row[0].find('strong').text
- USD.append(cleanNumbers(temp.strip())) # 14 Product_USD_SellingPrice
-
- EURO.append("-1") # 15 Product_EURO_SellingPrice
-
- # product sold
- sold.append("-1")
-
- qLeft.append('-1') # 17 Product_QuantityLeft
- shipFrom.append('-1') # 18 Product_ShippedFrom
- shipTo.append('-1') # 19 Product_ShippedTo
-
- # href
- temp = a.find('a').get('href')
- href.append(temp)
-
- # Populate the final variable (this should be a list with all fields scraped)
- return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
- reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
-
-
-#called by the crawler to get description links on a listing page
-#@param: beautifulsoup object that is using the correct html page (listing page)
-#return: list of description links from a listing page
-def lionmarketplace_links_parser(soup):
-
- # Returning all links that should be visited by the Crawler
-
- href = []
- listings = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"})
-
- for listing in listings:
- a = listing.find('div', {"class": "card-body"})
- bae = a.find('a', href=True)
- link = bae['href']
- href.append(link)
-
- return href
\ No newline at end of file
diff --git a/MarketPlaces/MetaVerseMarket/crawler_selenium.py b/MarketPlaces/MetaVerseMarket/crawler_selenium.py
deleted file mode 100644
index 44eb335..0000000
--- a/MarketPlaces/MetaVerseMarket/crawler_selenium.py
+++ /dev/null
@@ -1,291 +0,0 @@
-__author__ = 'Helium'
-
-'''
-MetaVerseMarket Marketplace Crawler (Selenium)
-'''
-
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.common.by import By
-
-from PIL import Image
-import urllib.parse as urlparse
-import os, re, time
-from datetime import date
-import subprocess
-import configparser
-from bs4 import BeautifulSoup
-from MarketPlaces.Initialization.prepare_parser import new_parse
-from MarketPlaces.MetaVerseMarket.parser import metaversemarket_links_parser
-from MarketPlaces.Utilities.utilities import cleanHTML
-
-counter = 1
-baseURL = 'http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/login'
-
-
-# Opens Tor Browser, crawls the website, then parses, then closes tor
-#acts like the main method for the crawler, another function at the end of this code calls this function later
-def startCrawling():
- mktName = getMKTName()
- driver = getAccess()
-
- if driver != 'down':
- try:
- login(driver)
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closeDriver(driver)
-
- new_parse(mktName, baseURL, True)
-
-
-# Returns the name of the website
-#return: name of site in string type
-def getMKTName():
- name = 'MetaVerseMarket'
- return name
-
-
-# Return the base link of the website
-#return: url of base site in string type
-def getFixedURL():
- url = 'http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/login'
- return url
-
-
-# Closes Tor Browser
-#@param: current selenium driver
-def closeDriver(driver):
- # global pid
- # os.system("taskkill /pid " + str(pro.pid))
- # os.system("taskkill /t /f /im tor.exe")
- print('Closing Tor...')
- driver.close()
- time.sleep(3)
- return
-
-
-# Creates FireFox 'driver' and configure its 'Profile'
-# to use Tor proxy and socket
-def createFFDriver():
- from MarketPlaces.Initialization.markets_mining import config
-
- ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
- ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
- ff_prof.set_preference("permissions.default.image", 3)
- ff_prof.set_preference("browser.download.folderList", 2)
- ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
- ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
- ff_prof.set_preference('network.proxy.type', 1)
- ff_prof.set_preference("network.proxy.socks_version", 5)
- ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
- ff_prof.set_preference('network.proxy.socks_port', 9150)
- ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", False)
- ff_prof.update_preferences()
-
- service = Service(config.get('TOR', 'geckodriver_path'))
-
- driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
- driver.maximize_window()
-
- return driver
-
-
-#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
-#return: return the selenium driver or string 'down'
-def getAccess():
- url = getFixedURL()
- driver = createFFDriver()
- try:
- driver.get(url)
- return driver
- except:
- driver.close()
- return 'down'
-
-
-# Manual captcha solver, waits for a specific element so that the whole page loads, finds the input box, gets a screenshot of the captcha
-# then allows for manual solving of captcha in the terminal
-#@param: current selenium web driver
-def login(driver):
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, '//*[@id="username"]')))
-
- # entering username and password into input boxes
- usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
- # Username here
- usernameBox.send_keys('metotomoto')
- passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
- # Password here
- passwordBox.send_keys('lionking_kumba1ya')
-
- input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n")
-
- # wait for listing page show up (This Xpath may need to change based on different seed url)
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, '//*[@id="searchq"]')))
-
-# Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(driver, page, url):
- cleanPage = cleanHTML(driver, page)
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- open(filePath, 'wb').write(cleanPage.encode('utf-8'))
- return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-#@param: raw url as crawler crawls through every site
-def getFullPathName(url):
- from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
- else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
- return fullPath
-
-
-# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
-#@param: raw url as crawler crawls through every site
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if (name == ''):
- name = str(counter)
- counter = counter + 1
- return name
-
-
-# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
-#in this example, there are a couple of categories some threads fall under such as
-# Guides and Tutorials, Digital Products, and Software and Malware
-#as you can see they are categories of products
-def getInterestedLinks():
- links = []
-
- # software and malware
- links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/softwares-and-malwares')
- # guides and tutorials
- links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/guides-and-tutorials')
- # services
- links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/services')
-
- return links
-
-
-# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
-#topic and description pages are crawled through here, where both types of pages are saved
-#@param: selenium driver
-def crawlForum(driver):
- print("Crawling the MetaVerse market")
-
- linksToCrawl = getInterestedLinks()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
- try:
- has_next_page = True
- count = 0
-
- while has_next_page:
- try:
- driver.get(link)
- except:
- driver.refresh()
- html = driver.page_source
- savePage(driver, html, link)
-
- list = productPages(html)
- for item in list:
- itemURL = urlparse.urljoin(baseURL, str(item))
- try:
- driver.get(itemURL)
- except:
- driver.refresh()
- savePage(driver, driver.page_source, item)
- driver.back()
-
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
-
- try:
- link = driver.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href')
- if link.endswith('#') or link == "":
- raise NoSuchElementException
- count += 1
-
- except NoSuchElementException:
- has_next_page = False
-
- except Exception as e:
- print(link, e)
- i += 1
-
- print("Crawling the MetaVerse market done.")
-
-
-# Returns 'True' if the link is a description link
-#@param: url of any url crawled
-#return: true if is a description page, false if not
-def isDescriptionLink(url):
- if 'PR' in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link
-#@param: url of any url crawled
-#return: true if is a Listing page, false if not
-def isListingLink(url):
- if 'products' in url:
- return True
- return False
-
-
-# calling the parser to define the links, the html is the url of a link from the list of interested link list
-#@param: link from interested link list ie. getInterestingLinks()
-#return: list of description links that should be crawled through
-def productPages(html):
- soup = BeautifulSoup(html, "html.parser")
- return metaversemarket_links_parser(soup)
-
-
-# Drop links that "signout"
-# def isSignOut(url):
-# #absURL = urlparse.urljoin(url.base_url, url.url)
-# if 'signout' in url.lower() or 'logout' in url.lower():
-# return True
-#
-# return False
-
-
-def crawler():
- startCrawling()
- # print("Crawling and Parsing MetaVerseMarket .... DONE!")
diff --git a/MarketPlaces/MetaVerseMarket/parser.py b/MarketPlaces/MetaVerseMarket/parser.py
deleted file mode 100644
index 5c12390..0000000
--- a/MarketPlaces/MetaVerseMarket/parser.py
+++ /dev/null
@@ -1,269 +0,0 @@
-__author__ = 'DarkWeb'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from MarketPlaces.Utilities.utilities import *
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup
-
-
-# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
-# stores info it needs in different lists, these lists are returned after being organized
-# @param: soup object looking at html page of description page
-# return: 'row' that contains a variety of lists that each hold info on the description page
-def metaversemarket_description_parser(soup):
- # Fields to be parsed
-
- vendor = "-1" # 0 *Vendor_Name
- success = "-1" # 1 Vendor_Successful_Transactions
- rating_vendor = "-1" # 2 Vendor_Rating
- name = "-1" # 3 *Product_Name
- describe = "-1" # 4 Product_Description
- CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
- category = "-1" # 7 Product_Category
- views = "-1" # 8 Product_Number_Of_Views
- reviews = "-1" # 9 Product_Number_Of_Reviews
- rating_item = "-1" # 10 Product_Rating
- addDate = "-1" # 11 Product_AddedDate
- BTC = "-1" # 12 Product_BTC_SellingPrice
- USD = "-1" # 13 Product_USD_SellingPrice
- EURO = "-1" # 14 Product_EURO_SellingPrice
- sold = "-1" # 15 Product_QuantitySold
- left = "-1" # 16 Product_QuantityLeft
- shipFrom = "-1" # 17 Product_ShippedFrom
- shipTo = "-1" # 18 Product_ShippedTo
- image = "-1" # 19 Product_Image
- vendor_image = "-1" # 20 Vendor_Image
-
- # Finding Product Name
- name = soup.find('div', {'class': "panel-heading"}).text
- name = cleanString(name.strip())
-
- temp = soup.findAll('div', {'class': "col-xs-12 col-sm-6 mt-5"})
-
- # Finding Product Image
- image = temp[0].find('img')
- image = image.get('src')
- image = image.split('base64,')[-1]
-
- # Finding Vendor
- temp = temp[1].findAll('span')
- vendor = temp[1].find('b').text
- vendor = cleanString(vendor.strip())
-
- # Finding Vendor Rating
- pos = soup.find('span', {'class': "badge bg-success fs-12px"}).text
- pos = int(cleanNumbers(pos).strip())
- neg = soup.find('span', {'class': "badge bg-danger fs-12px"}).text
- neg = int(cleanNumbers(neg).strip())
- total = pos + neg
- if total > 0:
- rating_vendor = str(pos / total)
-
- # Finding Prices
- USD = soup.find('h3', {'class': "mb-2"}).text
- USD = cleanNumbers(USD).strip()
-
- # Finding the Product Category
- temp = soup.select('div[class="mt-2"]')[1].text
- temp = temp.replace("Category:", "")
- category = temp.strip()
-
- # Finding Number of Views
- views = soup.find('button', {"class": "btn btn-secondary text-center w-33 fw-bold"}).text
- views = views.strip()
-
- # Finding the Product Quantity Available
- temp = soup.find('button', {"class": "btn btn-success text-center w-33 fw-bold"}).text
- temp = temp.split("/")
- left = temp[1].strip()
-
- # Finding Number Sold
- sold = temp[0].strip()
-
- # Finding Shipment Information (Origin)
- temp = soup.find('div', {'class': "alert alert-info"}).text
- temp = temp.split("to")
- shipFrom = temp[0].replace("Shipping from ", "").strip()
-
- # Finding Shipment Information (Destination)
- shipTo = temp[1].split("for")
- shipTo = shipTo[0].strip()
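-    # e.g. an alert reading "Shipping from Germany to Worldwide for ..." (hypothetical wording) yields
-    # shipFrom = "Germany" and shipTo = "Worldwide"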
-
- # Finding the Product description
- describe = soup.find('p', {'class': "card-text"}).text
- describe = cleanString(describe.strip())
-
- # Searching for CVE and MS categories
- cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
- if cve:
- CVE = " "
- for idx in cve:
- CVE += (idx)
- CVE += " "
- CVE = CVE.replace(',', ' ')
- CVE = CVE.replace('\n', '')
- ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
- if ms:
- MS = " "
- for im in ms:
- MS += (im)
- MS += " "
- MS = MS.replace(',', ' ')
- MS = MS.replace('\n', '')
-
- # Populating the final variable (this should be a list with all fields scraped)
- row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
-
- # Sending the results
- return row
-
-
-# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
-# stores info it needs in different lists, these lists are returned after being organized
-# @param: soup object looking at html page of listing page
-# return: 'row' that contains a variety of lists that each hold info on the listing page
-def metaversemarket_listing_parser(soup):
- # Fields to be parsed
- nm = 0 # *Total_Products (Should be Integer)
- mktName = "MetaVerseMarket" # 0 *Marketplace_Name
- vendor = [] # 1 *Vendor y
- rating_vendor = [] # 2 Vendor_Rating
- success = [] # 3 Vendor_Successful_Transactions
- name = [] # 4 *Product_Name y
- CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
- MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
- category = [] # 7 Product_Category y
- describe = [] # 8 Product_Description
- views = [] # 9 Product_Number_Of_Views
- reviews = [] # 10 Product_Number_Of_Reviews
- rating_item = [] # 11 Product_Rating
- addDate = [] # 12 Product_AddDate
- BTC = [] # 13 Product_BTC_SellingPrice
- USD = [] # 14 Product_USD_SellingPrice y
- EURO = [] # 15 Product_EURO_SellingPrice
- sold = [] # 16 Product_QuantitySold
- qLeft = [] # 17 Product_QuantityLeft
- shipFrom = [] # 18 Product_ShippedFrom
- shipTo = [] # 19 Product_ShippedTo
- image = [] # 20 Product_Image
- image_vendor = [] # 21 Vendor_Image
- href = [] # 22 Product_Links
-
- listing = soup.findAll('div', {"class": "col-12 col-sm-4 col-xl-3 product_item_col p-1"})
-
- # Populating the Number of Products
- nm = len(listing)
-
- for a in listing:
- bae = a.findAll('a', href=True)
-
- # Adding the url to the list of urls
- link = bae[0].get('href')
- link = cleanLink(link)
- href.append(link)
-
- # Finding the Product
- product = bae[1].find('span', {"class": "text-primary"}).text
- name.append(cleanString(product.strip()))
-
- # Finding Prices
- price = a.find('strong').text
- USD.append(cleanNumbers(price).strip())
-
- # Finding the Vendor
- temp = a.find('div', {'class': "mt-1 fs-12px"})
- temp = temp.findAll('span')
- vendor_name = temp[1].find('b').text
- vendor.append(cleanString(vendor_name.strip()))
-
- # Finding the Category
- cat = a.select_one('div[class="fs-12px"]')
- cat = cat.findAll('span')[1].text
- cat = cat.strip()
- category.append(cat)
-
- ul = a.find('ul', {"class": "product-actions"})
-
- # Finding Number Sold and Quantity Left
- temp = ul.find('span', {'class': "badge bg-success"}).text
- temp = temp.split("/")
- num = temp[0]
- num = num.replace('k', '000')
- sold.append(cleanNumbers(num).strip())
-
- quant = temp[1]
- quant = quant.replace('k', '000')
- qLeft.append(cleanNumbers(quant).strip())
-
-        # Finding Description
- # description = a.find('p', {'class': "alert alert-light text-ssbold p-1"}).text
- # description = description.replace("\n", " ")
- # description = description.strip()
- # describe.append(cleanString(description))
-
- # Finding Number of Views
- view = ul.find('span', {'class': "badge bg-primary"}).text
- view = view.replace('.', '')
- view = view.replace('K', '000')
- views.append(view.strip())
-
- # Find where ships from
- ships = a.find('div', {'class': "alert alert-info item_alert fs-12px p-1"})
- ships = ships.findAll('b')
- sFrom = ships[0].text.strip()
- shipFrom.append(sFrom)
-
- # Find where it ships to
- sTo = ships[1].text.strip()
- shipTo.append(sTo)
-
- # Searching for CVE and MS categories
- cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
- if not cve:
- cveValue = "-1"
- else:
- cee = " "
- for idx in cve:
- cee += (idx)
- cee += " "
- cee = cee.replace(',', ' ')
- cee = cee.replace('\n', '')
- cveValue = cee
- CVE.append(cveValue)
-
- ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
- if not ms:
- MSValue = "-1"
- else:
- me = " "
- for im in ms:
- me += (im)
- me += " "
- me = me.replace(',', ' ')
- me = me.replace('\n', '')
- MSValue = me
- MS.append(MSValue)
-
- # Populate the final variable (this should be a list with all fields scraped)
- return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
- reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
-
-
-# called by the crawler to get description links on a listing page
-# @param: beautifulsoup object that is using the correct html page (listing page)
-# return: list of description links from a listing page
-def metaversemarket_links_parser(soup):
- # Returning all links that should be visited by the Crawler
-
- href = []
- listing = soup.findAll('div', {"class": "col-12 col-sm-4 col-xl-3 product_item_col p-1"})
-
- for a in listing:
- bae = a.find('a', href=True)
- link = bae['href']
- href.append(link)
-
- return href
\ No newline at end of file
diff --git a/MarketPlaces/Nexus/crawler_selenium.py b/MarketPlaces/Nexus/crawler_selenium.py
deleted file mode 100644
index bd76f59..0000000
--- a/MarketPlaces/Nexus/crawler_selenium.py
+++ /dev/null
@@ -1,289 +0,0 @@
-__author__ = 'Helium'
-
-'''
-Nexus Market Crawler (Selenium)
-'''
-
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.common.by import By
-
-from PIL import Image
-import urllib.parse as urlparse
-import os, re, time
-from datetime import date
-import subprocess
-import configparser
-from bs4 import BeautifulSoup
-from MarketPlaces.Initialization.prepare_parser import new_parse
-from MarketPlaces.Nexus.parser import nexus_links_parser
-from MarketPlaces.Utilities.utilities import cleanHTML
-
-counter = 1
-baseURL = 'http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion'
-
-
-# Opens Tor Browser, crawls the website, then parses, then closes tor
-#acts like the main method for the crawler, another function at the end of this code calls this function later
-def startCrawling():
- mktName = getMKTName()
- driver = getAccess()
-
- if driver != 'down':
- try:
- input("Press ENTER when page loads after DDOS protection")
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closeDriver(driver)
-
- new_parse(mktName, baseURL, True)
-
-
-# Returns the name of the website
-#return: name of site in string type
-def getMKTName():
- name = 'Nexus'
- return name
-
-
-# Return the base link of the website
-#return: url of base site in string type
-def getFixedURL():
- url = 'http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion'
- return url
-
-
-# Closes Tor Browser
-#@param: current selenium driver
-def closeDriver(driver):
- # global pid
- # os.system("taskkill /pid " + str(pro.pid))
- # os.system("taskkill /t /f /im tor.exe")
- print('Closing Tor...')
- driver.close()
- time.sleep(3)
- return
-
-
-# Creates FireFox 'driver' and configure its 'Profile'
-# to use Tor proxy and socket
-def createFFDriver():
- from MarketPlaces.Initialization.markets_mining import config
-
- ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
- ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- # ff_prof.set_preference("network.dns.disablePrefetch", True)
- # ff_prof.set_preference("network.http.sendRefererHeader", 0)
- ff_prof.set_preference("permissions.default.image", 3)
- ff_prof.set_preference("browser.download.folderList", 2)
- ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
- ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
- ff_prof.set_preference('network.proxy.type', 1)
- ff_prof.set_preference("network.proxy.socks_version", 5)
- ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
- ff_prof.set_preference('network.proxy.socks_port', 9150)
- ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", True)
- ff_prof.update_preferences()
-
- service = Service(config.get('TOR', 'geckodriver_path'))
-
- driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
- driver.maximize_window()
-
- return driver
-
-
-#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
-#return: return the selenium driver or string 'down'
-def getAccess():
- url = getFixedURL()
- driver = createFFDriver()
- try:
- driver.get(url)
- return driver
- except:
- driver.close()
- return 'down'
-
-
-def savePage(driver, page, url):
- cleanPage = cleanHTML(driver, page)
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- open(filePath, 'wb').write(cleanPage.encode('utf-8'))
- return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-#@param: raw url as crawler crawls through every site
-def getFullPathName(url):
- from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
- fileName = getNameFromURL(url)
- if isListingLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
- else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
- return fullPath
-
-
-# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
-#@param: raw url as crawler crawls through every site
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if (name == ''):
- name = str(counter)
- counter = counter + 1
- return name
-
-
-# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
-#in this example, there are a couple of categories some threads fall under such as
-# Guides and Tutorials, Digital Products, and Software and Malware
-#as you can see they are categories of products
-def getInterestedLinks():
- links = []
-
- # malware
- links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/')
- # hacking-spam
- links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/hacking-spam/')
- # hacking services
- links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/hacking/')
- # programming services
- links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/programacao/')
- # remote admin services
- links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/administracao-remota/')
- # hacking guides
- links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-hacking/')
- # malware guides
- links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-malware/')
- # fraud guides
- links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-fraudes/')
- # fraud software
- links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/fraudes/software-de-fraude/')
-
- return links
-
-
-# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
-#topic and description pages are crawled through here, where both types of pages are saved
-#@param: selenium driver
-def crawlForum(driver):
- print("Crawling the Nexus market")
-
- linksToCrawl = getInterestedLinks()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
- try:
- has_next_page = True
- count = 0
-
- while has_next_page:
- try:
- driver.get(link)
- except:
- driver.refresh()
-
- # waiting for btc price to load
- try:
- WebDriverWait(driver, 1).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/div[1]/div[2]/div/div/main/ul/li[1]/div/span/span[3]")))
- time.sleep(5)
- except:
- pass
-
- html = driver.page_source
- savePage(driver, html, link)
-
- list = productPages(html)
- for item in list:
- itemURL = urlparse.urljoin(baseURL, str(item))
- try:
- driver.get(itemURL)
- except:
- driver.refresh()
-
- # waiting for btc price to load
- try:
- WebDriverWait(driver, 1).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/div[1]/div[2]/div/div/main/div[3]/div[2]/p/span[3]")))
- except:
- pass
-
- savePage(driver, driver.page_source, item)
- driver.back()
-
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
-
- try:
- link = driver.find_element(by=By.LINK_TEXT, value='→').get_attribute('href')
- if link == "":
- raise NoSuchElementException
- count += 1
-
- except NoSuchElementException:
- has_next_page = False
-
- except Exception as e:
- print(link, e)
- i += 1
-
- print("Crawling the Nexus market done.")
-
-
-# Returns 'True' if the link is a description link
-#@param: url of any url crawled
-#return: true if is a description page, false if not
-def isDescriptionLink(url):
- if 'produto' in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link
-#@param: url of any url crawled
-#return: true if is a Listing page, false if not
-def isListingLink(url):
- if 'categoria-produto' in url:
- return True
- return False
-
-
-# calling the parser to define the links, the html is the url of a link from the list of interested link list
-#@param: link from interested link list ie. getInterestingLinks()
-#return: list of description links that should be crawled through
-def productPages(html):
- soup = BeautifulSoup(html, "html.parser")
- return nexus_links_parser(soup)
-
-def crawler():
- startCrawling()
- # print("Crawling and Parsing Nexus .... DONE!")
-
diff --git a/MarketPlaces/Nexus/parser.py b/MarketPlaces/Nexus/parser.py
deleted file mode 100644
index 107a80a..0000000
--- a/MarketPlaces/Nexus/parser.py
+++ /dev/null
@@ -1,236 +0,0 @@
-__author__ = 'DarkWeb'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from MarketPlaces.Utilities.utilities import *
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup
-
-import re
-
-usd_to_brl_r = None
-
-
-#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
-#stores info it needs in different lists, these lists are returned after being organized
-#@param: soup object looking at html page of description page
-#return: 'row' that contains a variety of lists that each hold info on the description page
-def nexus_description_parser(soup):
-
- # Fields to be parsed
-
- vendor = "-1" # 0 *Vendor_Name
- success = "-1" # 1 Vendor_Successful_Transactions
- rating_vendor = "-1" # 2 Vendor_Rating
- name = "-1" # 3 *Product_Name
- describe = "-1" # 4 Product_Description
- CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
- category = "-1" # 7 Product_Category
- views = "-1" # 8 Product_Number_Of_Views
- reviews = "-1" # 9 Product_Number_Of_Reviews
- rating_item = "-1" # 10 Product_Rating
- addDate = "-1" # 11 Product_AddedDate
- BTC = "-1" # 12 Product_BTC_SellingPrice
- USD = "-1" # 13 Product_USD_SellingPrice
- EURO = "-1" # 14 Product_EURO_SellingPrice
- sold = "-1" # 15 Product_QuantitySold
- left = "-1" # 16 Product_QuantityLeft
- shipFrom = "-1" # 17 Product_ShippedFrom
- shipTo = "-1" # 18 Product_ShippedTo
- image = "-1" # 19 Product_Image
- vendor_image = "-1" # 20 Vendor_Image
-
-
- #finding the name of the product
- name_of_product = soup.find("h1", {"class": "product_title entry-title"}).text
- name = cleanString(name_of_product.strip())
-
- # Finding USD Price
- real = soup.find('span', {"class": "price"}).find('bdi').text
- real = real.split(',')
- whole = cleanNumbers(real[0]).replace('.', '')
- real = whole + '.' + real[1]
- usd = float(real) / usd_to_brl_r
- USD = str(round(usd, 2))
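-    # e.g. a price shown as "R$ 1.234,56" with usd_to_brl_r = 5.0 becomes 1234.56 / 5.0, i.e. "246.91"
-    # (assuming cleanNumbers drops the currency symbol but keeps the digits and the thousands dot)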
-
- # Find the BTC Price
- prices = soup.find('p', {"class": "price"}).findAll('span', {"class": "cs"})
- if len(prices) > 0:
- BTC = prices[0].text
- BTC = cleanNumbers(BTC.strip())
-
- # finding the description of the product
- description_div = soup.find("div", {"class": "woocommerce-product-details__short-description"})
- if description_div is None:
- describe = "-1"
- else:
- describe = cleanString(description_div.text.strip())
-
- # Finding Product Image
- image = soup.find('div', {'class': 'woocommerce-product-gallery__wrapper'}).find('img')
- image = image.get('src')
- image = image.split('base64,')[-1]
-
- #find the category of the product
- name_of_category = soup.find("span", {"class": "posted_in"}).find("a").text
- category = cleanString(name_of_category.strip())
-
- #finding the name of the vendor
- name_of_vendor = soup.find("div", {"class": "dokan-vendor-name"}).find("h5").text
- vendor = cleanString(name_of_vendor)
-
- #finding the vendor's rating
- vendorRating = soup.find("div", {"class": "dokan-vendor-rating"}).find("p").text
- rating_vendor = cleanString(vendorRating)
- #everything else gets a -1 because they are not found
-
- # Populating the final variable (this should be a list with all fields scraped)
- row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
-
-
- # Sending the results
- return row
-
-
-#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
-#stores info it needs in different lists, these lists are returned after being organized
-#@param: soup object looking at html page of listing page
-#return: 'row' that contains a variety of lists that each hold info on the listing page
-def nexus_listing_parser(soup):
-
- global usd_to_brl_r
- while usd_to_brl_r is None:
- try:
- usd_to_brl_r = float(input("1 US Dollar = (Brazilian Real) "))
- except ValueError:
- pass
-
- # Fields to be parsed
- nm = 0 # *Total_Products (Should be Integer)
- mktName = "Nexus" # 0 *Marketplace_Name
- vendor = [] # 1 *Vendor y
- rating_vendor = [] # 2 Vendor_Rating
- success = [] # 3 Vendor_Successful_Transactions
- name = [] # 4 *Product_Name y
- CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = [] # 6 Product_MS_Classification (Microsoft Security)
- category = [] # 7 Product_Category y
- describe = [] # 8 Product_Description
- views = [] # 9 Product_Number_Of_Views
- reviews = [] # 10 Product_Number_Of_Reviews
- rating_item = [] # 11 Product_Rating
- addDate = [] # 12 Product_AddDate
- BTC = [] # 13 Product_BTC_SellingPrice
- USD = [] # 14 Product_USD_SellingPrice y
- EURO = [] # 15 Product_EURO_SellingPrice
- sold = [] # 16 Product_QuantitySold
- qLeft =[] # 17 Product_QuantityLeft
- shipFrom = [] # 18 Product_ShippedFrom
- shipTo = [] # 19 Product_ShippedTo
- image = [] # 20 Product_Image
- image_vendor = [] # 21 Vendor_Image
- href = [] # 22 Product_Links
-
- main = soup.find('main', {'id': 'main'})
- products_list = main.find('ul', recursive=False).find_all('li', recursive=False)
- nm = len(products_list)
-
- for product in products_list:
- # Finding the name of the product
- name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
- name_of_product_cleaned = cleanString(name_of_product.strip())
- # print(name_of_product_cleaned)
- name.append(name_of_product_cleaned)
- #finding the URL
- try:
- url = product.find("a", class_="woocommerce-loop-product__link").get('href')
- href.append(url)
- except AttributeError as e:
- print("I can't find the link")
- raise e
-
- # Finding Product Image
- product_image = product.find('a', {'class': 'woocommerce-loop-image-link woocommerce-LoopProduct-link woocommerce-loop-product__link'}).find('img')
- product_image = product_image.get('src')
- product_image = product_image.split('base64,')[-1]
- image.append(product_image)
-
- # Finding USD Price
- real = product.find('span', {"class": "price"}).find('bdi').text
- real = real.split(',')
- whole = cleanNumbers(real[0]).replace('.', '')
- real = whole + '.' + real[1]
- usd = float(real) / usd_to_brl_r
- USD.append(str(round(usd, 2)))
-
- # Finding BTC Price
- prices = product.find('span', {"class": "price"}).findAll('span', {"class": "cs"})
- if len(prices) > 0:
- price = prices[0].text
- BTC.append(cleanNumbers(price.strip()))
-
- #everything else appends a -1
- rating_vendor.append("-1")
- vendor.append('-1')
- success.append("-1")
- CVE.append("-1")
- MS.append("-1")
- category.append("-1")
- describe.append("-1")
- views.append("-1")
- reviews.append("-1")
- addDate.append("-1")
- EURO.append("-1")
- sold.append("-1")
- qLeft.append("-1")
- shipFrom.append("-1")
- shipTo.append("-1")
- image_vendor.append("-1")
-
- # Populate the final variable (this should be a list with all fields scraped)
- return organizeProducts(
- marketplace = mktName,
- nm = nm,
- vendor = vendor,
- rating_vendor = rating_vendor,
- success_vendor = success,
- nombre = name,
- CVE = CVE,
- MS = MS,
- category = category,
- describe = describe,
- views = views,
- reviews = reviews,
- rating_item = rating_item,
- addDate = addDate,
- BTC = BTC,
- USD = USD,
- EURO = EURO,
- sold = sold,
- qLeft = qLeft,
- shipFrom = shipFrom,
- shipTo = shipTo,
- href = href,
- image = image,
- image_vendor = image_vendor
- )
-
-
-#called by the crawler to get description links on a listing page
-#@param: beautifulsoup object that is using the correct html page (listing page)
-#return: list of description links from a listing page
-def nexus_links_parser(soup):
- # Returning all links that should be visited by the Crawler
-
- href = []
- # Using a shorter, but still unique, class name
- listing = soup.find_all("a", class_="woocommerce-loop-product__link")
-
- for a in listing:
- link = a.get('href')
- if link: # Checks if 'href' attribute is not None
- href.append(link)
-
- return href
diff --git a/MarketPlaces/Tor2door/crawler_selenium.py b/MarketPlaces/Quest/crawler_selenium.py
similarity index 72%
rename from MarketPlaces/Tor2door/crawler_selenium.py
rename to MarketPlaces/Quest/crawler_selenium.py
index 17988be..69287a9 100644
--- a/MarketPlaces/Tor2door/crawler_selenium.py
+++ b/MarketPlaces/Quest/crawler_selenium.py
@@ -1,7 +1,7 @@
__author__ = 'DarkWeb'
'''
-Tor2door Market Crawler (Selenium)
+Quest Market Crawler (Selenium)
'''
from selenium import webdriver
@@ -16,22 +16,22 @@ from PIL import Image
import urllib.parse as urlparse
import os, re, time
+from datetime import date
import subprocess
-import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
-from MarketPlaces.Tor2door.parser import tor2door_links_parser
+from MarketPlaces.Quest.parser import quest_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
-baseURL = 'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion'
+baseURL = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion'
# Opens Tor Browser, crawls the website
def startCrawling():
- marketName = getMKTName()
+ marketName = getMarketName()
driver = getAccess()
-
+
if driver != 'down':
try:
login(driver)
@@ -39,15 +39,15 @@ def startCrawling():
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
-
- new_parse(marketName, baseURL, True)
+
+ new_parse(marketName, False)
# Login using premade account credentials and do login captcha manually
def login(driver):
#wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, '//*[@id="username"]')))
+ (By.XPATH, "/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button")))
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
@@ -60,19 +60,19 @@ def login(driver):
'''
# wait for captcha page show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/main/div/div/div/div/div/form/div[3]/div/div[1]/label/img")))
+ (By.XPATH, "/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/div/img")))
# save captcha to local
- driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[3]/div/div[1]/label/img').screenshot(
- r'..\Tor2door\captcha.png')
+ driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/div/img').screenshot(
+ r'..\Quest\captcha.png')
# This method will show image in any image viewer
- im = Image.open(r'..\Tor2door\captcha.png')
+ im = Image.open(r'..\Quest\captcha.png')
im.show()
# wait until input space show up
- inputBox = driver.find_element(by=By.XPATH, value='//*[@id="captcha"]')
+ inputBox = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/input')
# ask user input captcha solution in terminal
userIn = input("Enter solution: ")
@@ -81,24 +81,24 @@ def login(driver):
inputBox.send_keys(userIn)
# click the verify(submit) button
- driver.find_element(by=By.XPATH, value="/html/body/main/div/div/div/div/div/form/div[4]/button").click()
+ driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button").click()
'''
input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, '/html/body/main/div/div/div[1]/div/div[1]/div/h5')))
+ WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
+ (By.XPATH, '/html/body/div[5]/div/div/div/span')))
# Returns the name of the website
-def getMKTName():
- name = 'Tor2door'
+def getMarketName():
+ name = 'Quest'
return name
# Return the link of the website
def getFixedURL():
- url = 'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/login'
+ url = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion'
return url
@@ -109,7 +109,7 @@ def closeDriver(driver):
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
- driver.quit()
+ driver.close()
time.sleep(3)
return
@@ -129,8 +129,8 @@ def createFFDriver():
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- # ff_prof.set_preference("network.dns.disablePrefetch", True)
- # ff_prof.set_preference("network.http.sendRefererHeader", 0)
+ ff_prof.set_preference("network.dns.disablePrefetch", True)
+ ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -146,7 +146,7 @@ def createFFDriver():
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
+
driver.maximize_window()
return driver
@@ -198,22 +198,24 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
- # Digital - Guides - Hacking
- links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=55')
- # Digital - Guides - Others
- links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=57')
- # Digital - Software
- links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=60')
- # Software - Malware
- links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=69')
- # Software - Others
- links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=78')
+ # # Digital - Services
+ # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/8ae67900-22ed-11ec-a710-31f963ce8d35')
+ # # Digital - Software
+ # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/92809300-22ed-11ec-b143-af312e1dab77')
+ # # Digital - Tutorials
+ # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/9d1592b0-22ed-11ec-b82d-c3d2878a8716')
+ # # Digital - Malware
+ # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/a35bae90-22ed-11ec-ad2e-410f5a5339b5')
+ # # Digital - Hacking
+ # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/b4252cf0-22ed-11ec-8032-751549438ed5')
+ # Digital - Exploits
+ links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/c0c3ac60-22ed-11ec-9e97-41cd1912fdee')
return links
def crawlForum(driver):
- print("Crawling the Tor2door market")
+ print("Crawling the Quest market")
linksToCrawl = getInterestedLinks()
@@ -243,17 +245,17 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item)
driver.back()
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
+ # comment out
+ break
+
+ # comment out
+ if count == 1:
+ break
try:
- nav = driver.find_element(by=By.XPATH, value=
- '/html/body/main/div/div/div[2]/div[11]/div/nav')
- a = nav.find_element(by=By.LINK_TEXT, value="›")
+ nav = driver.find_element(by=By.XPATH, value='/html/body/div[6]/nav')
+ li = nav.find_elements(By.TAG_NAME, 'li')
+ a = li[-1].find_element(By.TAG_NAME, 'a')
link = a.get_attribute('href')
if link == "":
raise NoSuchElementException
@@ -266,19 +268,19 @@ def crawlForum(driver):
print(link, e)
i += 1
- print("Crawling the Tor2door market done.")
+ input("Crawling Quest market done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link
def isDescriptionLink(url):
- if 'products/' in url:
+ if 'product' in url:
return True
return False
# Returns True if the link is a listingPage link
def isListingLink(url):
- if 'category=' in url:
+ if 'category' in url:
return True
return False
@@ -286,7 +288,7 @@ def isListingLink(url):
# calling the parser to define the links
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
- return tor2door_links_parser(soup)
+ return quest_links_parser(soup)
def crawler():
diff --git a/MarketPlaces/Quest/parser.py b/MarketPlaces/Quest/parser.py
new file mode 100644
index 0000000..6761ed9
--- /dev/null
+++ b/MarketPlaces/Quest/parser.py
@@ -0,0 +1,232 @@
+__author__ = 'DarkWeb'
+
+# Here, we are importing the auxiliary functions to clean or convert data
+from MarketPlaces.Utilities.utilities import *
+import re  # used below for the CVE/MS pattern searches (likely also re-exported by the utilities wildcard import)
+
+# Here, we are importing BeautifulSoup to search through the HTML tree
+from bs4 import BeautifulSoup
+
+
+# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
+def quest_description_parser(soup):
+
+ # Fields to be parsed
+
+ vendor = "-1" # 0 *Vendor_Name
+ success = "-1" # 1 Vendor_Successful_Transactions
+ rating_vendor = "-1" # 2 Vendor_Rating
+ name = "-1" # 3 *Product_Name
+ describe = "-1" # 4 Product_Description
+ CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+ MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
+ category = "-1" # 7 Product_Category
+ views = "-1" # 8 Product_Number_Of_Views
+ reviews = "-1" # 9 Product_Number_Of_Reviews
+ rating_item = "-1" # 10 Product_Rating
+ addDate = "-1" # 11 Product_AddedDate
+ BTC = "-1" # 12 Product_BTC_SellingPrice
+ USD = "-1" # 13 Product_USD_SellingPrice
+ EURO = "-1" # 14 Product_EURO_SellingPrice
+ sold = "-1" # 15 Product_QuantitySold
+ left = "-1" # 16 Product_QuantityLeft
+ shipFrom = "-1" # 17 Product_ShippedFrom
+ shipTo = "-1" # 18 Product_ShippedTo
+
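+ # the description page is laid out in Bootstrap "row" divs: row[1] holds the title, row[2] the price/shipping/description and row[3] the vendor details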
+ row = soup.find_all('div', {'class': "row"})
+
+ # Finding Product Name
+ name = row[1].text
+ name = name.replace('\n', ' ')
+ name = name.replace(",", "")
+ name = name.strip()
+
+ small = row[3].find_all('small')
+
+ # Finding Vendor
+ vendor = small[0].text
+ vendor = vendor.replace("Vendor:", "")
+ vendor = vendor.replace(",", "")
+ vendor = vendor.strip()
+
+ # Finding Vendor Rating
+ full_stars = small[2].find_all('i', {'class': "fas fa-star"})
+ half_star = small[2].find('i', {'class': "fas fa-star-half-alt"})
+ rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0)
+
+ # Finding Successful Transactions
+ success = small[4].text
+ success = success.replace("Total Sales:", "")
+ success = success.strip()
+
+ small = row[2].find('p', {'class': "text-left"}).find_all('small')
+
+ # Finding Prices
+ USD = small[1].text
+ USD = USD.replace("$", "")
+ USD = USD.strip()
+
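+ # items flagged as "Digital" carry no shipping details; otherwise the span holds origin and destination separated by a space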
+ shipping_info = row[2].find('p', {'class': "text-left"}).find('span').text.strip()
+ if "Digital" not in shipping_info:
+ shipping_info = shipping_info.split(" ")
+
+ # Finding Shipment Information (Origin)
+ shipFrom = shipping_info[0].strip()
+
+ # Finding Shipment Information (Destination)
+ shipTo = shipping_info[1].strip()
+
+ textarea = row[2].find_all('textarea')
+
+ # Finding the Product description
+ describe = textarea[0].text
+ describe = describe.replace("\n", " ")
+ describe = describe.replace("\r", " ")
+ describe = describe.strip()
+
+ '''
+ # Finding the Number of Product Reviews
+ tag = soup.findAll(text=re.compile('Reviews'))
+ for index in tag:
+ reviews = index
+ par = reviews.find('(')
+ if par >=0:
+ reviews = reviews.replace("Reviews (","")
+ reviews = reviews.replace(")","")
+ reviews = reviews.split(",")
+ review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
+ else :
+ review = "-1"
+ '''
+
+ # Searching for CVE and MS categories
+ cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+ if cve:
+ CVE = " "
+ for idx in cve:
+ CVE += (idx)
+ CVE += " "
+ CVE = CVE.replace(',', ' ')
+ CVE = CVE.replace('\n', '')
+ ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
+ if ms:
+ MS = " "
+ for im in ms:
+ MS += (im)
+ MS += " "
+ MS = MS.replace(',', ' ')
+ MS = MS.replace('\n', '')
+
+ # Populating the final variable (this should be a list with all fields scraped)
+ row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
+ BTC, USD, EURO, sold, left, shipFrom, shipTo)
+
+ # Sending the results
+ return row
+
+
+# This is the method to parse the Listing Pages
+def quest_listing_parser(soup):
+
+ # Fields to be parsed
+ nm = 0 # *Total_Products (Should be Integer)
+ mktName = "Quest" # 0 *Marketplace_Name
+ vendor = [] # 1 *Vendor y
+ rating_vendor = [] # 2 Vendor_Rating
+ success = [] # 3 Vendor_Successful_Transactions
+ name = [] # 4 *Product_Name y
+ CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+ MS = [] # 6 Product_MS_Classification (Microsoft Security)
+ category = [] # 7 Product_Category y
+ describe = [] # 8 Product_Description
+ views = [] # 9 Product_Number_Of_Views
+ reviews = [] # 10 Product_Number_Of_Reviews
+ rating_item = [] # 11 Product_Rating
+ addDate = [] # 12 Product_AddDate
+ BTC = [] # 13 Product_BTC_SellingPrice
+ USD = [] # 14 Product_USD_SellingPrice y
+ EURO = [] # 15 Product_EURO_SellingPrice
+ sold = [] # 16 Product_QuantitySold
+ qLeft =[] # 17 Product_QuantityLeft
+ shipFrom = [] # 18 Product_ShippedFrom
+ shipTo = [] # 19 Product_ShippedTo
+ href = [] # 20 Product_Links
+
+ # Finding category of listing page
+ cat = soup.find('span', {'class': "btn btn-sm btn-outline-mgray active border-info"}).text
+ cat = cat.replace("Digital -", "")
+ cat = cat.strip()
+
+ listing = soup.find_all('div', {"class": "col-md-2 my-md-0 col-12"})
+
+ # Populating the Number of Products
+ nm = len(listing)
+
+ for a in listing:
+ bae = a.find_all('a', href=True)
+
+ # Adding the category
+ category.append(cat)
+
+ # Adding the url to the list of urls
+ link = bae[0].get('href')
+ link = cleanLink(link)
+ href.append(link)
+
+ # Finding the Vendor
+ vendor_name = bae[2].text
+ vendor_name = vendor_name.replace(",", "")
+ vendor_name = vendor_name.strip()
+ vendor.append(vendor_name)
+
+ # Finding the Product
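+ # the product name is stored in the alt attribute of the listing thumbnail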
+ product = bae[1].find('img').get('alt')
+ product = product.replace('\n', ' ')
+ product = product.replace(",", "")
+ product = product.strip()
+ name.append(product)
+
+ # Searching for CVE and MS categories
+ cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+ if not cve:
+ cveValue="-1"
+ else:
+ cee = " "
+ for idx in cve:
+ cee += (idx)
+ cee += " "
+ cee = cee.replace(',', ' ')
+ cee = cee.replace('\n', '')
+ cveValue=cee
+ CVE.append(cveValue)
+
+ ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
+ if not ms:
+ MSValue="-1"
+ else:
+ me = " "
+ for im in ms:
+ me += (im)
+ me += " "
+ me = me.replace(',', ' ')
+ me = me.replace('\n', '')
+ MSValue=me
+ MS.append(MSValue)
+
+ # Populate the final variable (this should be a list with all fields scraped)
+ return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
+ reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+
+
+def quest_links_parser(soup):
+
+ # Returning all links that should be visited by the Crawler
+ href = []
+
+ listing = soup.findAll('div', {"class": "col-md-2 my-md-0 col-12"})
+
+ for div in listing:
+
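+ # the first anchor inside each listing card points to the product page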
+ link = div.find('a')["href"]
+ href.append(link)
+
+ return href
\ No newline at end of file
diff --git a/MarketPlaces/RobinhoodMarket/crawler_selenium.py b/MarketPlaces/RobinhoodMarket/crawler_selenium.py
deleted file mode 100644
index 232fac7..0000000
--- a/MarketPlaces/RobinhoodMarket/crawler_selenium.py
+++ /dev/null
@@ -1,256 +0,0 @@
-__author__ = 'chris'
-
-'''
-RobinhoodMarket Market Crawler (Selenium)
-'''
-
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
-from PIL import Image
-
-import urllib.parse as urlparse
-import os, re, time
-import subprocess
-import configparser
-from bs4 import BeautifulSoup
-from MarketPlaces.Initialization.prepare_parser import new_parse
-from MarketPlaces.RobinhoodMarket.parser import Robinhood_links_parser
-from MarketPlaces.Utilities.utilities import cleanHTML
-
-counter = 1
-baseURL = 'http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/'
-
-
-# Opens Tor Browser, crawls the website
-def startCrawling():
- marketName = getMKTName()
-
- driver = getAccess()
-
- if driver != 'down':
- try:
- # Captcha
- input("Press ENTER when website has loaded")
- # Robinhood doesn't need login
- # login(driver)
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closeDriver(driver)
-
- new_parse(marketName, baseURL, True)
-
-
-# Login is not needed in Robinhood
-def login(driver):
- pass
-
-
-# Returns the name of the website
-def getMKTName():
- name = 'RobinhoodMarket'
- return name
-
-
-# Return the link of the website
-def getFixedURL():
- url = 'http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/'
-
- return url
-
-
-# Closes Tor Browser
-def closeDriver(driver):
- # global pid
- # os.system("taskkill /pid " + str(pro.pid))
- # os.system("taskkill /t /f /im tor.exe")
- print('Closing Tor...')
- driver.quit()
- time.sleep(3)
- return
-
-
-# Creates FireFox 'driver' and configure its 'Profile'
-# to use Tor proxy and socket
-def createFFDriver():
- from MarketPlaces.Initialization.markets_mining import config
-
- ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
- ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
- ff_prof.set_preference("permissions.default.image", 3)
- ff_prof.set_preference("browser.download.folderList", 2)
- ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
- ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
- ff_prof.set_preference('network.proxy.type', 1)
- ff_prof.set_preference("network.proxy.socks_version", 5)
- ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
- ff_prof.set_preference('network.proxy.socks_port', 9150)
- ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", False)
- ff_prof.update_preferences()
-
- service = Service(config.get('TOR', 'geckodriver_path'))
-
- driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
- driver.maximize_window()
-
- return driver
-
-
-def getAccess():
- url = getFixedURL()
- driver = createFFDriver()
- try:
- driver.get(url)
- return driver
- except:
- driver.close()
- return 'down'
-
-
-# Saves the crawled html page
-def savePage(driver, page, url):
- cleanPage = cleanHTML(driver, page)
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- open(filePath, 'wb').write(cleanPage.encode('utf-8'))
- return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-def getFullPathName(url):
- from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
- else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
- return fullPath
-
-
-# Creates the file name from passed URL
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if name == '':
- name = str(counter)
- counter = counter + 1
- return name
-
-
-def getInterestedLinks():
- links = []
-
- # Hacking
- links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/hacking/')
- # Other Software
- links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/')
-
- return links
-
-
-def crawlForum(driver):
- print("Crawling the Robinhood market")
-
- linksToCrawl = getInterestedLinks()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
-
- try:
- has_next_page = True
- count = 0
-
- while has_next_page:
- try:
- driver.get(link)
- except:
- driver.refresh()
- html = driver.page_source
- savePage(driver, html, link)
-
- list = productPages(html)
- for c, item in enumerate(list):
-
- itemURL = urlparse.urljoin(baseURL, str(item))
- try:
- driver.get(itemURL)
- except:
- driver.refresh()
- savePage(driver, driver.page_source, item)
- driver.back()
-
- # comment out
- # if c == 4:
- # break
-
- # comment out
- # if count == 1:
- # break
-
- # go to next page of market
- try:
- nav = driver.find_element(by=By.XPATH, value="//a[@class='next page-numbers']")
- link = nav.get_attribute('href')
- if link == "":
- raise NoSuchElementException
- count += 1
-
- except NoSuchElementException:
- has_next_page = False
-
- except Exception as e:
- print(link, e)
- i += 1
-
- print("Crawling the Robinhood market done.")
-
-
-# Returns 'True' if the link is Topic link
-def isDescriptionLink(url):
- if 'product' in url and 'category' not in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link
-def isListingLink(url):
- if 'category=' in url:
- return True
- return False
-
-
-# calling the parser to define the links
-def productPages(html):
- soup = BeautifulSoup(html, "html.parser")
- return Robinhood_links_parser(soup)
-
-
-def crawler():
- startCrawling()
- # print("Crawling and Parsing BestCardingWorld .... DONE!")
-
-
-if __name__ == '__main__':
- startCrawling()
diff --git a/MarketPlaces/RobinhoodMarket/parser.py b/MarketPlaces/RobinhoodMarket/parser.py
deleted file mode 100644
index 059d327..0000000
--- a/MarketPlaces/RobinhoodMarket/parser.py
+++ /dev/null
@@ -1,334 +0,0 @@
-__author__ = 'chris'
-
-import re
-import traceback
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from MarketPlaces.Utilities.utilities import *
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup
-
-# Import for test run
-import glob
-import os
-import codecs
-import shutil
-
-# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
-def Robinhood_description_parser(soup):
-
- # Fields to be parsed
-
- vendor = "-1" # 0 *Vendor_Name
- success = "-1" # 1 Vendor_Successful_Transactions
- rating_vendor = "-1" # 2 Vendor_Rating
- name = "-1" # 3 *Product_Name
- describe = "-1" # 4 Product_Description
- CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
- category = "-1" # 7 Product_Category
- views = "-1" # 8 Product_Number_Of_Views
- reviews = "-1" # 9 Product_Number_Of_Reviews
- rating_item = "-1" # 10 Product_Rating
- addDate = "-1" # 11 Product_AddedDate
- BTC = "-1" # 12 Product_BTC_SellingPrice
- USD = "-1" # 13 Product_USD_SellingPrice
- EURO = "-1" # 14 Product_EURO_SellingPrice
- sold = "-1" # 15 Product_QuantitySold
- left = "-1" # 16 Product_QuantityLeft
- shipFrom = "-1" # 17 Product_ShippedFrom
- shipTo = "-1" # 18 Product_ShippedTo
- image = "-1" # 19 Product_Image
- vendor_image = "-1" # 20 Vendor_Image
-
- # Finding Product Name
- name = soup.find('h1').text
- name = name.replace('\n', ' ')
- name = name.replace(",", "")
- name = name.strip()
-
- # Finding description
- desc = ''
- tab = soup.find('div', {"id": "tab-description"})
- if tab is not None:
- for p in tab.findAll('p'):
- desc += p.text
- if desc == '':
- short = soup.find('div', {"class": "woocommerce-product-details__short-description"})
- if short is not None:
- desc = short.text
- describe = cleanString(desc.strip())
-
- # Finding Product Image
- image = soup.find('div', {'class': 'woocommerce-product-gallery__wrapper'}).find('img')
- image = image.get('src')
- image = image.split('base64,')[-1]
-
- # Finding Vendor
- vendor = soup.find('a', {'class': 'wcfm_dashboard_item_title'}).text
- vendor = vendor.replace(",", "")
- vendor = vendor.replace("Sold by:", "")
- vendor = vendor.strip()
-
- # Finding Vendor Image
- vendor_image = soup.find('div', {'class': 'wcfmmp_sold_by_container_left'}).find('img')
- vendor_image = vendor_image.get('src')
- vendor_image = vendor_image.split('base64,')[-1]
-
- # Finding Category
- catSpan = soup.find('span', {'class': 'posted_in'})
- category = catSpan.find('a').text
-
- # Finding USD
- priceText = soup.find('p', {'class': 'price'}).text
- USD = str(priceText).strip()
-
- # Searching for CVE and MS categories
- cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
- if cve:
- CVE = " "
- for idx in cve:
- CVE += (idx)
- CVE += " "
- CVE = CVE.replace(',', ' ')
- CVE = CVE.replace('\n', '')
- ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
- if ms:
- MS = " "
- for im in ms:
- MS += (im)
- MS += " "
- MS = MS.replace(',', ' ')
- MS = MS.replace('\n', '')
-
- # Populating the final variable (this should be a list with all fields scraped)
- row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
-
- # Sending the results
- return row
-
-
-# This is the method to parse the Listing Pages
-def Robinhood_listing_parser(soup):
-
- # Fields to be parsed
- nm = 0 # *Total_Products (Should be Integer)
- mktName = "RobinhoodMarket" # 0 *Marketplace_Name
- vendor = [] # 1 *Vendor y
- rating_vendor = [] # 2 Vendor_Rating
- success = [] # 3 Vendor_Successful_Transactions
- name = [] # 4 *Product_Name y
- CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = [] # 6 Product_MS_Classification (Microsoft Security)
- category = [] # 7 Product_Category y
- describe = [] # 8 Product_Description
- views = [] # 9 Product_Number_Of_Views
- reviews = [] # 10 Product_Number_Of_Reviews
- rating_item = [] # 11 Product_Rating
- addDate = [] # 12 Product_AddDate
- BTC = [] # 13 Product_BTC_SellingPrice
- USD = [] # 14 Product_USD_SellingPrice y
- EURO = [] # 15 Product_EURO_SellingPrice
- sold = [] # 16 Product_QuantitySold
- qLeft =[] # 17 Product_QuantityLeft
- shipFrom = [] # 18 Product_ShippedFrom
- shipTo = [] # 19 Product_ShippedTo
- image = [] # 20 Product_Image
- image_vendor = [] # 21 Vendor_Image
- href = [] # 22 Product_Links
-
- listing = soup.find('ul', {"class": "products columns-4"})
- items = listing.findAll('li')
-
- # Populating the Number of Products
- nm = len(items)
-
- for card in items:
- # Finding Category
- cat = soup.find("h1").text
- cat = cat.replace('\n', ' ')
- cat = cat.replace(",", "")
- cat = cat.strip()
- category.append(cat)
-
- bae = card.findAll('a')
-
- # Adding the url to the list of urls
- link = card.find('a').get('href')
- href.append(link)
-
- # Finding Product Name
- product = card.find("h2").text
- product = product.replace('\n', ' ')
- product = product.replace(",", "")
- product = product.strip()
- name.append(product)
-
- # Finding Product Image
- product_image = card.find('a').find('img')
- product_image = product_image.get('src')
- product_image = product_image.split('base64,')[-1]
- image.append(product_image)
-
- info = card.find('div', {'class': 'wcfmmp_sold_by_container'})
-
- # Finding Vendor
- vendor_name = info.find('a', {'class', 'wcfm_dashboard_item_title'}).text
- vendor_name = vendor_name.replace(",", "")
- vendor_name = vendor_name.strip()
- vendor.append(vendor_name)
-
- # Finding Vendor Image
- vendor_icon = info.find('img', {'class', 'wcfmmp_sold_by_logo'})
- vendor_icon = vendor_icon.get('src')
- vendor_icon = vendor_icon.split('base64,')[-1]
- image_vendor.append(vendor_icon)
-
- # Finding USD
- span = card.find('span', {'class': 'price'})
- if span is not None:
- bdi = span.find('bdi')
- usdText = bdi.find('span').next_sibling
- usdVal = usdText.text
- else:
- usdVal = "0"
- USD.append(usdVal)
-
- # Searching for CVE and MS categories
- cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
- if not cve:
- cveValue="-1"
- else:
- cee = " "
- for idx in cve:
- cee += (idx)
- cee += " "
- cee = cee.replace(',', ' ')
- cee = cee.replace('\n', '')
- cveValue=cee
- CVE.append(cveValue)
-
- ms = card.findAll(text=re.compile('MS\d{2}-\d{3}'))
- if not ms:
- MSValue="-1"
- else:
- me = " "
- for im in ms:
- me += (im)
- me += " "
- me = me.replace(',', ' ')
- me = me.replace('\n', '')
- MSValue=me
- MS.append(MSValue)
-
- #print(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
- # reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
-
- # Populate the final variable (this should be a list with all fields scraped)
- return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
- reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
-
-
-def Robinhood_links_parser(soup):
-
- # Returning all links that should be visited by the Crawler
- href = []
-
- #list = soup.findAll('div', {"class": "woocommerce columns-4"})
- listing = soup.find('ul', {"class": "products columns-4"}).findAll('li')
-
- for item in listing:
-
- link = item.find('a')['href']
- href.append(link)
-
- return href
-
-
-if __name__ == '__main__':
- nError = 0
- marketPlace = 'RobinhoodMarket'
-
- lines = [] # listing pages
- lns = [] # description pages
- detPage = {}
-
- '''
- # reading description pages
- count = 0
- for fileDescription in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\08082023\\Description", '*.html')):
- count += 1
- lns.append(fileDescription)
- # if count > 5:
- # break
-
- for index, line2 in enumerate(lns):
-
- print("Reading description folder of '" + marketPlace + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns)))
-
- try:
- html = codecs.open(line2.strip('\n'), encoding='utf8')
- soup = BeautifulSoup(html, "html.parser")
- html.close()
- except:
-
- try:
- html = open(line2.strip('\n'))
- soup = BeautifulSoup(html, "html.parser")
- html.close()
- except:
-
- nError += 1
- print("There was a problem to read the file " + line2 + " in the Description section!")
- # if createLog:
- # logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section.\n")
- continue
-
- try:
- print(Robinhood_description_parser(soup))
- except:
- traceback.print_exc()
- print("There was a problem to parse the file " + line2 + " in the Description section!")
- '''
-
- # reading listing pages
- count = 0
- for fileListing in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\08082023\\Listing", '*.html')):
- count += 1
- lines.append(fileListing)
- #if count > 1:
- # break
-
- for index, line1 in enumerate(lines):
-
- print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(line1) + "', index= " + str(index + 1) + " ... " + str(len(lines)))
-
- readError = False
- try:
- html = codecs.open(line1.strip('\n'), encoding='utf8')
- soup = BeautifulSoup(html, "html.parser")
- html.close()
- except:
- try:
- html = open(line1.strip('\n'))
- soup = BeautifulSoup(html, "html.parser")
- html.close()
- except:
- print("There was a problem to read the file " + line1 + " in the Listing section!")
- readError = True
-
- if not readError:
-
- parseError = False
- try:
- test = Robinhood_listing_parser(soup)
- print(Robinhood_listing_parser(soup))
- except:
- traceback.print_exc()
- print("There was a problem to parse the file " + line1 + " in the listing section!")
- parseError = True
-
-
- print("DONE")
\ No newline at end of file
diff --git a/MarketPlaces/Apocalypse/crawler_selenium.py b/MarketPlaces/Royal/crawler_selenium.py
similarity index 54%
rename from MarketPlaces/Apocalypse/crawler_selenium.py
rename to MarketPlaces/Royal/crawler_selenium.py
index b91bf0e..857cb27 100644
--- a/MarketPlaces/Apocalypse/crawler_selenium.py
+++ b/MarketPlaces/Royal/crawler_selenium.py
@@ -1,68 +1,171 @@
-__author__ = 'Helium'
+__author__ = 'DarkWeb'
'''
-Apocalypse Forum Crawler (Selenium)
-two captchas. if you get a captcha wrong you have to reload program.
+Royal Marketplace Crawler (Selenium)
'''
from selenium import webdriver
+from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
-
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
+
+
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
-import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
-from MarketPlaces.Apocalypse.parser import apocalypse_links_parser
+from MarketPlaces.Royal.parser import royal_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
-baseURL = 'http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/'
+baseURL = 'http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion'
-# Opens Tor Browser, crawls the website, then parses, then closes tor
-#acts like the main method for the crawler, another function at the end of this code calls this function later
+# Opens Tor Browser, crawls the website
def startCrawling():
- mktName = getMKTName()
+ marketName = getMarketName()
driver = getAccess()
if driver != 'down':
try:
+ captcha(driver)
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
- closetor(driver)
+ closeDriver(driver)
+
+ new_parse(marketName, False)
+
+
+def captcha(driver):
+ '''
+ # wait for captcha page
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, "/html/body/div[2]/div/div/div/div/form/div/div[2]/button")))
+
+ inputChars = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div/div/form/div/div[2]/div[1]/input')
+ inputNum = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div/div/form/div/div[2]/div[2]/input')
+
+ driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div/div/form/div/div[1]/div/div').screenshot(
+ r'..\Royal\captcha1.png')
+
+ im = Image.open(r'..\Royal\captcha1.png')
+ im.show()
+
+ chars = input("Enter characters: ")
+ inputChars.send_keys(chars)
+
+ num = input("Enter number of wrong puzzle pieces: ")
+ inputNum.send_keys(num)
+
+ # click the verify(submit) button
+ driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div/div/div/form/div/div[2]/button").click()
+ '''
+ input("Press ENTER when CAPTCHA is completed\n")
- new_parse(mktName, baseURL, True)
+ # wait for login page
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, "/html/body/div[2]/div/div/div[2]/h1")))
+
+ '''
+ temp = driver.find_element(by=By.XPATH, value='/html/body/div/div/form/div[1]')
+ boxes = temp.find_elements(by=By.TAG_NAME, value='input')
+
+ for box in boxes:
+ # click box to update captcha image
+ box.click()
+
+ # save clock captcha to local
+ time.sleep(1)
+ driver.find_element(by=By.XPATH, value='/html/body/div/div/form/div[1]/div').screenshot(
+ r'..\Royal\captcha1.png')
+
+ im = Image.open(r'..\Royal\captcha1.png')
+ im.show()
+
+ letter = input("Enter letter: ")
+ box.send_keys(letter)
+
+ # click the verify(submit) button
+ driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click()
+
+ # wait for login page
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, "/html/body/div[1]/div/div/div[2]/form/input[3]")))
+ '''
+
+
+# Login using premade account credentials and do login captcha manually
+def login(driver):
+ # wait for login page
+ WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+ (By.XPATH, "/html/body/div[2]/div/div/div[2]/form/div[4]")))
+
+ # entering username and password into input boxes
+ usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
+ # Username here
+ usernameBox.send_keys('blabri')
+ passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
+ # Password here
+ passwordBox.send_keys('fishowal')
+
+ # click "Login"
+ driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div[2]/form/div[4]').click()
+
+ '''
+ # wait for captcha page show up
+ time.sleep(3)
+
+ # save captcha to local
+ driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div[2]/form/div[4]/label/div/div').screenshot(
+ r'..\Royal\captcha2.png')
+
+ # This method will show image in any image viewer
+ im = Image.open(r'..\Royal\captcha2.png')
+ im.show()
+
+ # ask user input captcha solution in terminal
+ userIn = input("Enter location of wrong pieces (squares are numbered 1-24 left to right, # # #): ")
+ squares = userIn.split()
+
+ # send user solution into the input space
+ for id in squares:
+ driver.find_element(by=By.XPATH, value='//*[@id="cl[' + str((int(id)-1)) + ']"]').click()
+
+ # click the verify(submit) button
+ driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div/div[2]/form/div[4]/label/div/div/div/button").click()
+ '''
+ input("Press ENTER when CAPTCHA is completed\n")
+
+ # wait for listing page to show up (This Xpath may need to change based on different seed url)
+ WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
+ (By.XPATH, '/html/body/div[3]/div/div[5]/div[1]')))
# Returns the name of the website
-#return: name of site in string type
-def getMKTName():
- name = 'Apocalypse'
+def getMarketName():
+ name = 'Royal'
return name
-# Return the base link of the website
-#return: url of base site in string type
+# Return the link of the website
def getFixedURL():
- url = 'http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/'
+ url = 'http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion'
+
return url
# Closes Tor Browser
-#@param: current selenium driver
-def closetor(driver):
+def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@@ -87,8 +190,8 @@ def createFFDriver():
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
+ # ff_prof.set_preference("network.dns.disablePrefetch", True)
+ # ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -104,14 +207,12 @@ def createFFDriver():
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
+
driver.maximize_window()
return driver
-#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
-#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
@@ -123,32 +224,7 @@ def getAccess():
return 'down'
-# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha
-# then allows for manual solving of captcha in the terminal
-#@param: current selenium web driver
-def login(driver):
- input("Press ENTER when CAPTCHA is completed\n")
-
- # wait for page to show up (This Xpath may need to change based on different seed url)
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, '//*[@id="name"]')))
-
- # entering username and password into input boxes
- usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="name"]')
- # Username here
- usernameBox.send_keys('shooby')
- passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
- # Password here
- passwordBox.send_keys('perry_1the2_platypu$')
-
- input("Press ENTER when CAPTCHA is completed\n")
-
- # wait for listing page show up (This Xpath may need to change based on different seed url)
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/div[1]/div[2]/div[1]/div[1]/a[13]")))
-
-
-# Saves the crawled html page, makes the directory path for html pages if not made
+# Saves the crawled html page
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
@@ -158,7 +234,6 @@ def savePage(driver, page, url):
# Gets the full path of the page to be saved along with its appropriate file name
-#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
@@ -171,41 +246,33 @@ def getFullPathName(url):
return fullPath
-# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
-#@param: raw url as crawler crawls through every site
+# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
- if (name == ''):
+ if name == '':
name = str(counter)
counter = counter + 1
return name
-# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
-#in this example, there are a couple of categories some threads fall under such as
-# Guides and Tutorials, Digital Products, and Software and Malware
-#as you can see they are categories of products
def getInterestedLinks():
links = []
- # Digital Goods
- links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/74')
- # Fraud
- links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/75')
- # Services
- links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/76')
- # software and malware
- links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/30')
+ # Digital - Fraud Software
+ links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Fraud%20Software')
+ # # Digital - Guides and Tutorials
+ # links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Guides%20&%20Tutorials')
+ # # Digital - Legitimate Software
+ # links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Legitimiate%20Software')
+ # # Services - Carding
+ # links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Carding')
return links
-# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
-#topic and description pages are crawled through here, where both types of pages are saved
-#@param: selenium driver
def crawlForum(driver):
- print("Crawling the Apocalypse market")
+ print("Crawling the Royal market")
linksToCrawl = getInterestedLinks()
@@ -233,22 +300,20 @@ def crawlForum(driver):
except:
driver.refresh()
savePage(driver, driver.page_source, item)
- # driver.back()
- try:
- driver.get(link)
- except:
- driver.refresh()
+ driver.back()
+
+ # comment out
+ break
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
+ # comment out
+ if count == 1:
+ break
try:
- nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/div[2]/nav')
- link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='»').get_attribute('href')
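+ # as on Quest, the final <li> of the pagination <nav> carries the next-page link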
+ nav = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div[1]/div[2]/nav')
+ li = nav.find_elements(by=By.TAG_NAME, value='li')
+ a = li[-1].find_element(by=By.TAG_NAME, value='a')
+ link = a.get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
@@ -260,42 +325,27 @@ def crawlForum(driver):
print(link, e)
i += 1
- print("Crawling the Apocalypse market done.")
+ input("Crawling Royal forum done sucessfully. Press ENTER to continue\n")
-# Returns 'True' if the link is a description link
-#@param: url of any url crawled
-#return: true if is a description page, false if not
+# Returns 'True' if the link is Topic link
def isDescriptionLink(url):
- if 'article' in url:
+ if 'product' in url:
return True
return False
# Returns True if the link is a listingPage link
-#@param: url of any url crawled
-#return: true if is a Listing page, false if not
def isListingLink(url):
if 'category' in url:
return True
return False
-# calling the parser to define the links, the html is the url of a link from the list of interested link list
-#@param: link from interested link list ie. getInterestingLinks()
-#return: list of description links that should be crawled through
+# calling the parser to define the links
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
- return apocalypse_links_parser(soup)
-
-
-# Drop links that "signout"
-# def isSignOut(url):
-# #absURL = urlparse.urljoin(url.base_url, url.url)
-# if 'signout' in url.lower() or 'logout' in url.lower():
-# return True
-#
-# return False
+ return royal_links_parser(soup)
def crawler():
diff --git a/MarketPlaces/Tor2door/parser.py b/MarketPlaces/Royal/parser.py
similarity index 68%
rename from MarketPlaces/Tor2door/parser.py
rename to MarketPlaces/Royal/parser.py
index 49e0a93..dfb2d32 100644
--- a/MarketPlaces/Tor2door/parser.py
+++ b/MarketPlaces/Royal/parser.py
@@ -8,7 +8,7 @@ from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
-def tor2door_description_parser(soup):
+def royal_description_parser(soup):
# Fields to be parsed
@@ -31,23 +31,18 @@ def tor2door_description_parser(soup):
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
- image = "-1" # 19 Product_Image
- vendor_image = "-1" # 20 Vendor_Image
-
- bae = soup.find('div', {'class': "col-9"})
# Finding Product Name
- name = bae.find('h2').text
+ name = soup.find('h5', {'class': "bold"}).text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
- mb = bae.findAll('div', {"class": "mb-1"})
+ bae = soup.find('div', {'class': "card-header bg-light"})
# Finding Vendor
- vendor = mb[0].text
+ vendor = bae.find('a').text
vendor = vendor.replace(",", "")
- vendor = vendor.replace("Sold by:", "")
vendor = vendor.strip()
# # Finding Vendor Rating
@@ -55,24 +50,45 @@ def tor2door_description_parser(soup):
# half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"})
# rating = len(full_stars) + (0.5 if half_star is not None else 0)
- # Finding Quantity Sold and Left
- temp = mb[4].text.split(',')
-
- sold = temp[0].replace("sold", "")
- sold = sold.strip()
-
- left = temp[1].replace("in stock", "")
- left = left.strip()
-
- # Finding USD
- USD = bae.find('div', {"class": "h3 text-secondary"}).text
- USD = USD.replace("$", "")
- USD = USD.strip()
-
- # Finding BTC
- temp = bae.find('div', {"class": "small"}).text.split("BTC")
-
- BTC = temp[0].strip()
+ # Finding Successful Transactions
+ success = bae.find('b').text
+ success = success.replace("(", "")
+ success = success.replace(")", "")
+ success = success.strip()
+
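+ # the second POST form on the page is the order box; its rows hold quantity sold, quantity left and the price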
+ form = soup.find_all('form', {'method': "POST"})
+ bae = form[1].find_all('div', {'class': "row"})
+
+ # Finding Quantity Sold
+ div = bae[2].find_all('div', {'class': "col"})
+ temp = div[1].text
+ temp = temp.split()
+ if len(temp) > 0:
+ sold = temp[0].strip()
+ sold = re.sub(r'[^0-9.]', "", sold)
+ if sold == "":
+ sold = -1
+ else:
+ sold = -1
+
+ # Finding Quantity Left
+ div = bae[3].find_all('div', {'class': "col"})
+ temp = div[1].text
+ temp = temp.split()
+ if len(temp) > 0:
+ left = temp[0].strip()
+ left = re.sub(r'[^0-9.]', "", left)
+ if left == "":
+ left = -1
+ else:
+ left = -1
+
+ # Finding Prices
+ temp = bae[-2].find('strong').text
+ temp = temp.replace("Price:", "")
+ temp = temp.split()
+ USD = temp[0].strip()
+ USD = re.sub(r'[^0-9.]', "", USD)
# shipping_info = bae[4].text
# if "Digital" not in shipping_info:
@@ -85,7 +101,7 @@ def tor2door_description_parser(soup):
# shipTo = shipping_info[1].strip()
# Finding the Product description
- describe = bae.find('div', {"class": "card border-top-0"}).text
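+ # Royal renders the full product description inside an <xmp> tag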
+ describe = soup.find('xmp').text
describe = describe.replace("\n", " ")
describe = describe.replace("\r", " ")
describe = describe.strip()
@@ -108,23 +124,20 @@ def tor2door_description_parser(soup):
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
- image = bae.find('div', {"class": "product-primary"}).find('img')
- image = image.get('src').split('base64,')[-1]
-
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
+ BTC, USD, EURO, sold, left, shipFrom, shipTo)
# Sending the results
return row
# This is the method to parse the Listing Pages
-def tor2door_listing_parser(soup):
+def royal_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
- mktName = "Tor2door" # 0 *Marketplace_Name
+ mktName = "Royal" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
@@ -144,69 +157,43 @@ def tor2door_listing_parser(soup):
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
- image = [] # 20 Product_Image
- image_vendor = [] # 21 Vendor_Image
- href = [] # 22 Product_Links
+ href = [] # 20 Product_Links
+
+ # Finding category of listing page
+ cat = soup.find('li', {'class': "breadcrumb-item active"}).text
+ cat = cat.strip()
- listing = soup.findAll('div', {"class": "card product-card mb-3"})
+ listing = soup.findAll('div', {'class': "card search border shadow-sm mb-3"})
# Populating the Number of Products
nm = len(listing)
- # Finding Category
- cat = soup.find("div", {"class": "col-9"})
- cat = cat.find("h2").text
- cat = cat.replace("Category: ", "")
- cat = cat.replace(",", "")
- cat = cat.strip()
+ for a in listing:
+ bae = a.findAll('a', href=True)
- for card in listing:
+ # Adding the category
category.append(cat)
- bae = card.findAll('a')
-
# Adding the url to the list of urls
- link = bae[0].get('href')
+ link = bae[1].get('href')
+ link = cleanLink(link)
href.append(link)
- # Finding Product Name
- product = bae[1].text
- product = product.replace('\n', ' ')
- product = product.replace(",", "")
- product = product.strip()
- name.append(product)
-
- # Finding Vendor
- vendor_name = bae[2].text
+ # Finding the Vendor
+ vendor_name = bae[0].text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
- # Finding USD
- usd = card.find('div', {"class": "mb-1"}).text
- usd = usd.replace("$", "")
- usd = usd.strip()
- USD.append(usd)
-
- # Finding Rating
- stars = card.find("ul", {"class": "star-list"})
- full = stars.findAll('i', {"class": "fas fa-star star star-active"})
- half = stars.find('i', {"class": "fas fa-star-half star star-active"})
- rating = len(full)
- if half is not None:
- rating += 0.5
- rating_item.append(str(rating))
-
- # Finding Reviews
- num = card.find("span", {"class": "rate-count"}).text
- num = num.replace("(", "")
- num = num.replace("review)", "")
- num = num.replace("reviews)", "")
- num = num.strip()
- reviews.append(num)
+ # Finding the Product
+ product = bae[2].get('title')
+ product = product.replace('\n', ' ')
+ product = product.replace(",", "")
+ product = product.strip()
+ name.append(product)
# Searching for CVE and MS categories
- cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+ cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if not cve:
cveValue="-1"
else:
@@ -219,7 +206,7 @@ def tor2door_listing_parser(soup):
cveValue=cee
CVE.append(cveValue)
- ms = card.findAll(text=re.compile('MS\d{2}-\d{3}'))
+ ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
if not ms:
MSValue="-1"
else:
@@ -232,24 +219,22 @@ def tor2door_listing_parser(soup):
MSValue=me
MS.append(MSValue)
- image = bae[0].find('img')
- image = image.get('src').split('base64,')[-1]
-
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
- reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
+ reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
-def tor2door_links_parser(soup):
+def royal_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
- listing = soup.findAll('div', {"class": "card product-card mb-3"})
+ listing = soup.findAll('div', {"class": "card search border shadow-sm mb-3"})
for div in listing:
- link = div.find('a')['href']
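+ # the second anchor in each card is the product link (the first points to the vendor)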
+ a = div.find_all('a')
+ link = a[1].get('href')
href.append(link)
return href
\ No newline at end of file
diff --git a/MarketPlaces/ThiefWorld/parser.py b/MarketPlaces/ThiefWorld/parser.py
deleted file mode 100644
index ba0f51c..0000000
--- a/MarketPlaces/ThiefWorld/parser.py
+++ /dev/null
@@ -1,190 +0,0 @@
-__author__ = 'DarkWeb'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from typing import List, Tuple
-from MarketPlaces.Utilities.utilities import *
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup, ResultSet, Tag
-
-
-def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple:
-
- # Fields to be parsed
- vendor = "-1" # 0 *Vendor_Name
- success = "-1" # 1 Vendor_Successful_Transactions
- rating_vendor = "-1" # 2 Vendor_Rating
- name = "-1" # 3 *Product_Name
- describe = "-1" # 4 Product_Description
- CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
- category = "-1" # 7 Product_Category
- views = "-1" # 8 Product_Number_Of_Views
- reviews = "-1" # 9 Product_Number_Of_Reviews
- rating_item = "-1" # 10 Product_Rating
- addDate = "-1" # 11 Product_AddedDate
- BTC = "-1" # 12 Product_BTC_SellingPrice
- USD = "-1" # 13 Product_USD_SellingPrice
- EURO = "-1" # 14 Product_EURO_SellingPrice
- sold = "-1" # 15 Product_QuantitySold
- left = "-1" # 16 Product_QuantityLeft
- shipFrom = "-1" # 17 Product_ShippedFrom
- shipTo = "-1" # 18 Product_ShippedTo
- image = "-1" # 19 Product_Image
- vendor_image = "-1" # 20 Vendor_Image
-
- name = soup.find("h1", {'class': 'title'}).text
- name = cleanString(name.strip())
-
- describe = soup.find('div', {'id': 'descriptionContent'}).text
- describe = cleanString(describe.strip())
-
- # Finding Product Image
- image = soup.find('div', {'class': 'product_img_big'}).find('img')
- image = image.get('src')
- image = image.split('base64,')[-1]
-
- commentListTag: Tag = soup.find('ul', {'class': 'comment_list scrollbar'})
- commentList = commentListTag.find_all('li')
- review = str(len(commentList))
-
- citySelection: str = soup.find('ul', {'class': 'meta text-muted i_location'}).text
- shipFrom = cleanString(citySelection.strip())
-
- vendor = soup.find('h1', {'class': 'title over'}).text
- vendor = cleanString(vendor.strip())
-
- usdTag: Tag = soup.find('div', {'class': 'product_price__big'}).find('span')
- usdText = usdTag.text.strip('/')[0]
- # usdText format: " USD " (i.e., "70 000 USD ")
- USD = cleanString(usdText.replace("USD", "").strip())
-
- ratingDiv = soup.find('div', {'class': 'rating_star'})
- rating_vendor = ratingDiv.get('title').split(' ')[1]
-
- rating_item = soup.find('div', {'class': 'product_rate'}).text
- rating_item = rating_item.replace("rating", "")
- rating_item = cleanString(rating_item.strip())
-
- # Populating the final variable (this should be a list with all fields scraped)
- row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
-
- # Sending the results
- return row
-
-
-def thiefWorld_listing_parser(soup: BeautifulSoup):
-
- # Fields to be parsed
- nm = 0 # Total_Products (Should be Integer)
- mktName = "ThiefWorld" # 0 Marketplace_Name
- vendor = [] # 1 *Vendor y
- rating_vendor = [] # 2 Vendor_Rating
- success = [] # 3 Vendor_Successful_Transactions
- name = [] # 4 *Product_Name y
- CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = [] # 6 Product_MS_Classification (Microsoft Security)
- category = [] # 7 Product_Category y
- describe = [] # 8 Product_Description
- views = [] # 9 Product_Number_Of_Views
- reviews = [] # 10 Product_Number_Of_Reviews
- rating_item = [] # 11 Product_Rating
- addDate = [] # 12 Product_AddDate
- BTC = [] # 13 Product_BTC_SellingPrice
- USD = [] # 14 Product_USD_SellingPrice y
- EURO = [] # 15 Product_EURO_SellingPrice
- sold = [] # 16 Product_QuantitySold
- qLeft =[] # 17 Product_QuantityLeft
- shipFrom = [] # 18 Product_ShippedFrom
- shipTo = [] # 19 Product_ShippedTo
- image = [] # 20 Product_Image
- image_vendor = [] # 21 Vendor_Image
- href = [] # 22 Product_Links
-
- productList: ResultSet[Tag] = soup.find_all('div', {'class': 'catalog_item'})
-
- nm = len(productList)
-
- for product in productList:
-
- productTitle: Tag = product.find('div', {'class': 'title'}).find('a')
-
- productName = cleanString(productTitle.text.strip())
- name.append(productName)
-
- # Finding Product Image
- product_image = product.find('noscript').find('img')
- product_image = product_image.get('src')
- product_image = product_image.split('base64,')[-1]
- image.append(product_image)
-
- productHref = productTitle.get('href')
- href.append(productHref)
-
- CVE.append('-1')
- MS.append('-1')
-
- cat = soup.find('calsys-cat').text
- category.append(cat.strip())
-
- productDescription = product.find('div', {'class': 'text'}).text
- productDescription = cleanString(productDescription.strip())
- describe.append(productDescription)
-
- views.append('-1')
- reviews.append('-1')
- addDate.append('-1')
- BTC.append('-1')
-
- priceText = product.find('span', {'class': 'price'}).find('span').text
- priceText = priceText.split('USD')[0]
- priceText = cleanString(priceText.strip())
- USD.append(priceText)
-
- EURO.append('-1')
- sold.append('-1')
- qLeft.append('-1')
- shipFrom.append('-1')
- shipTo.append('-1')
-
- productVendor = product.find('div', {'class': 'market over'}).find('a').text
- productVendor = cleanString(productVendor.strip())
- vendor.append(productVendor)
-
- image_vendor.append('-1')
-
- rating_vendor.append('-1')
- #rating_item.append('-1')
-
- rating = product.find('div', {'class': 'rating_star_yellow'}).attrs.get('style')
- rating = rating.replace("width: ", "")
- rating_item.append(cleanString(rating))
-
- success.append('-1')
-
-
- # Populate the final variable (this should be a list with all fields scraped)
- return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
- reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
-
-
-
-
-
-#called by the crawler to get description links on a listing page
-#@param: beautifulsoup object that is using the correct html page (listing page)
-#return: list of description links from a listing page
-def thiefworld_links_parser(soup):
-
- # Returning all links that should be visited by the Crawler
-
- href = []
- listing = soup.find('div', {"class": "row tile__list tileitems_filter pad15 tileproduct__list"}).findAll('div', {"class": "desc"})
-
- for a in listing:
- bae = a.find('div', {"class": "title"}).find('a', href=True)
- link = bae['href']
- href.append(link)
-
- return href
\ No newline at end of file
diff --git a/MarketPlaces/TorBay/crawler_selenium.py b/MarketPlaces/TorBay/crawler_selenium.py
deleted file mode 100644
index 18a04be..0000000
--- a/MarketPlaces/TorBay/crawler_selenium.py
+++ /dev/null
@@ -1,268 +0,0 @@
-__author__ = 'Helium'
-
-'''
-TorBay Market Forum Crawler (Selenium)
-'''
-
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.common.by import By
-
-from PIL import Image
-import urllib.parse as urlparse
-import os, time
-from datetime import date
-import subprocess
-import configparser
-import subprocess
-from bs4 import BeautifulSoup
-from MarketPlaces.Initialization.prepare_parser import new_parse
-from MarketPlaces.TorBay.parser import torbay_links_parser
-from MarketPlaces.Utilities.utilities import cleanHTML
-
-counter = 1
-baseURL = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion/'
-
-
-# Opens Tor Browser, crawls the website, then parses, then closes tor
-#acts like the main method for the crawler, another function at the end of this code calls this function later
-def startCrawling():
- mktName = getMKTName()
- driver = getAccess()
-
- if driver != 'down':
- try:
- login(driver)
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closeDriver(driver)
-
- new_parse(mktName, baseURL, True)
-
-
-# Returns the name of the website
-#return: name of site in string type
-def getMKTName():
- name = 'TorBay'
- return name
-
-
-# Return the base link of the website
-#return: url of base site in string type
-def getFixedURL():
- url = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion/'
- return url
-
-
-# Closes Tor Browser
-#@param: current selenium driver
-def closeDriver(driver):
- # global pid
- # os.system("taskkill /pid " + str(pro.pid))
- # os.system("taskkill /t /f /im tor.exe")
- print('Closing Tor...')
- driver.close()
- time.sleep(3)
- return
-
-
-# Creates FireFox 'driver' and configure its 'Profile'
-# to use Tor proxy and socket
-def createFFDriver():
- from MarketPlaces.Initialization.markets_mining import config
-
- ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
- ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
- ff_prof.set_preference("permissions.default.image", 3)
- ff_prof.set_preference("browser.download.folderList", 2)
- ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
- ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
- ff_prof.set_preference('network.proxy.type', 1)
- ff_prof.set_preference("network.proxy.socks_version", 5)
- ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
- ff_prof.set_preference('network.proxy.socks_port', 9150)
- ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", True)
- ff_prof.update_preferences()
-
- service = Service(config.get('TOR', 'geckodriver_path'))
-
- driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
- driver.maximize_window()
-
- return driver
-
-
-#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
-#return: return the selenium driver or string 'down'
-def getAccess():
- url = getFixedURL()
- driver = createFFDriver()
- try:
- driver.get(url)
- return driver
- except:
- driver.close()
- return 'down'
-
-
-# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha
-# then allows for manual solving of captcha in the terminal
-#@param: current selenium web driver
-def login(driver):
- # wait for page to show up (This Xpath may need to change based on different seed url)
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/div[2]/div/div/div/ul/li[6]/a")))
-
-
-# Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(driver, page, url):
- cleanPage = cleanHTML(driver, page)
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- open(filePath, 'wb').write(cleanPage.encode('utf-8'))
- return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-#@param: raw url as crawler crawls through every site
-def getFullPathName(url):
- from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
- else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
- return fullPath
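The raw '\\' separators above assume a Windows host; below is a minimal illustrative sketch (not part of the original code) of a platform-neutral variant of the same path logic, assuming the same shared_folder config value, CURRENT_DATE string, and the helpers defined in this file.

# Illustrative sketch only: platform-neutral variant of getFullPathName,
# assuming the helpers defined in this file (isDescriptionLink, getNameFromURL,
# getMKTName) and the same shared_folder / CURRENT_DATE values passed in.
import os

def getFullPathNamePortable(url, shared_folder, current_date):
    subdir = 'Description' if isDescriptionLink(url) else 'Listing'
    return os.path.join(shared_folder, 'MarketPlaces', getMKTName(), 'HTML_Pages',
                        current_date, subdir, getNameFromURL(url) + '.html')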
-
-
-# Creates the file name from the passed URL; falls back to a counter-based name if nothing alphanumeric remains after cleaning
-#@param: raw url as crawler crawls through every site
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if (name == ''):
- name = str(counter)
- counter = counter + 1
- return name
-
-
-# Returns the list of category URLs of interest; the crawler iterates through this list
-# For TorBay, only the Hacking product category is crawled here; add more category URLs as needed
-def getInterestedLinks():
- links = []
-
- # Hacking
- links.append('http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion/category/hacking')
-
- return links
-
-
-# Iterates through the links of interest; each listing page is loaded and saved, every product
-# (description) page on it is visited and saved, and pagination is followed until exhausted
-#@param: selenium driver
-def crawlForum(driver):
- print("Crawling the TorBay Market")
-
- linksToCrawl = getInterestedLinks()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
- try:
- has_next_page = True
- count = 0
-
- while has_next_page:
- try:
- driver.get(link)
- except:
- driver.refresh()
- html = driver.page_source
- savePage(driver, html, link)
-
- list = productPages(html)
- for item in list:
- itemURL = urlparse.urljoin(baseURL, str(item))
- try:
- driver.get(itemURL)
- except:
- driver.refresh()
- savePage(driver, driver.page_source, item)
- driver.back()
-
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
-
- try:
- nav = driver.find_element(by=By.XPATH, value='/html/body/section/div/div/div[2]/div/div[2]/ul')
- link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href')
- if link == "":
- raise NoSuchElementException
- count += 1
-
- except NoSuchElementException:
- has_next_page = False
-
- except Exception as e:
- print(link, e)
- i += 1
-
- print("Crawling the TorBay market done.")
-
-
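crawlForum resolves every product href against the base URL before visiting it; the following small illustration (not part of the crawler; the example.onion URL is hypothetical) shows why urljoin is used.

# Illustration only: urljoin resolves relative product hrefs against the
# market's base URL and leaves absolute hrefs untouched.
import urllib.parse as urlparse

base = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion/'
print(urlparse.urljoin(base, 'product/12345'))               # -> <base>product/12345
print(urlparse.urljoin(base, 'http://example.onion/p/1'))    # absolute href is kept as-is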
-# Returns 'True' if the link is a description link
-#@param: url of any url crawled
-#return: true if is a description page, false if not
-def isDescriptionLink(url):
- if 'product' in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link
-#@param: url of any url crawled
-#return: true if is a Listing page, false if not
-def isListingLink(url):
- if 'category' in url:
- return True
- return False
-
-
-# Calls the parser to extract the description links from a listing page
-#@param: html page source of a listing page reached from getInterestedLinks()
-#return: list of description links that should be crawled through
-def productPages(html):
- soup = BeautifulSoup(html, "html.parser")
- return torbay_links_parser(soup)
-
-
-def crawler():
- startCrawling()
- # print("Crawling and Parsing BestCardingWorld .... DONE!")
diff --git a/MarketPlaces/TorBay/parser.py b/MarketPlaces/TorBay/parser.py
deleted file mode 100644
index 69d2cfb..0000000
--- a/MarketPlaces/TorBay/parser.py
+++ /dev/null
@@ -1,183 +0,0 @@
-__author__ = 'DarkWeb'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from MarketPlaces.Utilities.utilities import *
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup
-
-
-# Parses a description page from its soup object and extracts the fields of interest;
-# the values are collected and returned together as a single row
-#@param: soup object of the html description page
-#return: 'row' tuple holding the scraped fields of the description page
-def torbay_description_parser(soup):
-
- # Fields to be parsed
-
- vendor = "-1" # 0 *Vendor_Name
- success = "-1" # 1 Vendor_Successful_Transactions
- rating_vendor = "-1" # 2 Vendor_Rating
- name = "-1" # 3 *Product_Name
- describe = "-1" # 4 Product_Description
- CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
- category = "-1" # 7 Product_Category
- views = "-1" # 8 Product_Number_Of_Views
- reviews = "-1" # 9 Product_Number_Of_Reviews
- rating_item = "-1" # 10 Product_Rating
- addDate = "-1" # 11 Product_AddedDate
- BTC = "-1" # 12 Product_BTC_SellingPrice
- USD = "-1" # 13 Product_USD_SellingPrice
- EURO = "-1" # 14 Product_EURO_SellingPrice
- sold = "-1" # 15 Product_QuantitySold
- left = "-1" # 16 Product_QuantityLeft
- shipFrom = "-1" # 17 Product_ShippedFrom
- shipTo = "-1" # 18 Product_ShippedTo
- image = "-1" # 19 Product_Image
- vendor_image = "-1" # 20 Vendor_Image
-
- # Finding Product Name
- try:
- product_name = soup.find('div', {'class': 'product-information'}).find('h1').text
- name = cleanString(product_name.strip())
- except:
- product_name = soup.find('div', {'class': 'profile-info'}).find('h2').text
- name = cleanString(product_name.strip())
-
- # Finding the Vendor
- vendor_name = soup.find('div', {"class": "profile-info"}).find('h2').text
- vendor = cleanString(vendor_name.strip())
-
- # Finding Vendor Image
- vendor_image = soup.find('div', {'class': 'avatar'}).find('img')
- vendor_image = vendor_image.get('src')
- vendor_image = vendor_image.split('base64,')[-1]
-
- # Finding Prices
- USD = soup.find('div', {'class': "total-price"}).find('span').text.strip()
-
- # Finding the Product Category
- cat = soup.find('div', {'class': "profile-info"}).find('p').text
- category = cleanString(cat.strip())
-
- # Finding the Product description
- try:
- describe = soup.find('div', {'class': "info"}).find('p').text
- if "\n" in describe:
- describe = describe.replace("\n", " ")
- describe = describe.replace("\r", " ")
- describe = cleanString(describe.strip())
- except:
- # print("product desc")
- describe = soup.find('div', {'class': 'info'}).text
- describe = cleanString(describe.strip())
-
- # Finding Product Image
- image = soup.find('div', {'class': 'image text-center'}).find('img')
- image = image.get('src')
- image = image.split('base64,')[-1]
-
- # Populating the final variable (this should be a list with all fields scraped)
- row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
-
- # Sending the results
- return row
-
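The parser above keeps only the base64 payload of inline product/vendor images (the part after 'base64,'); the helper below is an illustrative sketch, not part of the original parser, of turning such a payload back into image bytes.

# Illustrative helper: decode the stored base64 payload back into raw image
# bytes (e.g. to write a .png/.jpg file); "-1" marks a missing image above.
import base64

def decode_inline_image(payload: str) -> bytes:
    if payload == "-1":
        return b""
    return base64.b64decode(payload)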
-
-# Parses a listing page from its soup object and extracts the fields of interest for every product card;
-# the values are collected into per-field lists and returned after being organized
-#@param: soup object of the html listing page
-#return: organized rows, one per product, holding the info scraped from the listing page
-def torbay_listing_parser(soup):
-
- # Fields to be parsed
- nm = 0 # *Total_Products (Should be Integer)
- mktName = "TorBay" # 0 *Marketplace_Name
- vendor = [] # 1 *Vendor y
- rating_vendor = [] # 2 Vendor_Rating
- success = [] # 3 Vendor_Successful_Transactions
- name = [] # 4 *Product_Name y
- CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = [] # 6 Product_MS_Classification (Microsoft Security)
- category = [] # 7 Product_Category y
- describe = [] # 8 Product_Description
- views = [] # 9 Product_Number_Of_Views
- reviews = [] # 10 Product_Number_Of_Reviews
- rating_item = [] # 11 Product_Rating
- addDate = [] # 12 Product_AddDate
- BTC = [] # 13 Product_BTC_SellingPrice
- USD = [] # 14 Product_USD_SellingPrice y
- EURO = [] # 15 Product_EURO_SellingPrice
- sold = [] # 16 Product_QuantitySold
- qLeft =[] # 17 Product_QuantityLeft
- shipFrom = [] # 18 Product_ShippedFrom
- shipTo = [] # 19 Product_ShippedTo
- image = [] # 20 Product_Image
- image_vendor = [] # 21 Vendor_Image
- href = [] # 22 Product_Links
-
- listing = soup.findAll('div', {"class": "product-card"})
-
- # Populating the Number of Products
- nm = len(listing)
-
- for a in listing:
-
- product_name = a.find('p', {'class': 'name'}).text
- name.append(cleanString(product_name.strip()))
-
- # Finding Product Image
- image.append("-1")
-
- prod = a.find('p', {'class': 'price'}).text # price
- USD.append(cleanString(prod.strip()))
-
- ven = a.find('div', {'class': 'pc-footer'}).find('div').find('a').text # pc-footer
- vendor.append(cleanString(ven.strip()))
- # print(ven)
-
- # Finding Vendor Image
- image_vendor.append("-1")
-
- h = a.find('p', {'class': 'name'}).find('a').get('href')
- href.append(h)
-
- CVE.append("-1")
- MS.append("-1")
- rating_vendor.append("-1")
- success.append("-1")
- describe.append("-1")
- views.append("-1")
- reviews.append("-1")
- rating_item.append("-1")
- addDate.append("-1")
- BTC.append("-1")
- EURO.append("-1")
- sold.append("-1")
- qLeft.append("-1")
- shipFrom.append("-1")
- shipTo.append("-1")
- category.append("Hacking")
-
- # Populate the final variable (this should be a list with all fields scraped)
- return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
- reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
-
-
-#called by the crawler to get description links on a listing page
-#@param: beautifulsoup object that is using the correct html page (listing page)
-#return: list of description links from a listing page
-def torbay_links_parser(soup):
-
- # Returning all links that should be visited by the Crawler
-
- href = []
- listing = soup.find('section', {"id": "content"}).findAll('div', {"class": "product-card"})
-
- for a in listing:
- bae = a.find('div', {"class": "pc-footer"}).find('a', {"class": "btn btn-primary"}, href=True)
- link = bae['href']
- href.append(link)
-
- return href
\ No newline at end of file
diff --git a/MarketPlaces/TorMarket/crawler_selenium.py b/MarketPlaces/TorMarket/crawler_selenium.py
deleted file mode 100644
index 7021abc..0000000
--- a/MarketPlaces/TorMarket/crawler_selenium.py
+++ /dev/null
@@ -1,277 +0,0 @@
-__author__ = 'Helium'
-
-'''
-TorMarket Forum Crawler (Selenium)
-'''
-
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.common.by import By
-
-from PIL import Image
-import urllib.parse as urlparse
-import os, re, time
-from datetime import date
-import subprocess
-import configparser
-from bs4 import BeautifulSoup
-from MarketPlaces.Initialization.prepare_parser import new_parse
-from MarketPlaces.TorMarket.parser import tormarket_links_parser
-from MarketPlaces.Utilities.utilities import cleanHTML
-
-counter = 1
-baseURL = 'http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/'
-
-
-# Opens Tor Browser, crawls the website, parses the results, then closes Tor
-# Acts as the main method for the crawler; the crawler() function at the end of this file calls it
-def startCrawling():
- mktName = getMKTName()
- driver = getAccess()
-
- if driver != 'down':
- try:
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closeDriver(driver)
-
- new_parse(mktName, baseURL, True)
-
-
-# Returns the name of the website
-#return: name of site in string type
-def getMKTName():
- name = 'TorMarket'
- return name
-
-
-# Return the base link of the website
-#return: url of base site in string type
-def getFixedURL():
- url = 'http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/'
- return url
-
-
-# Closes Tor Browser
-#@param: current selenium driver
-def closeDriver(driver):
- # global pid
- # os.system("taskkill /pid " + str(pro.pid))
- # os.system("taskkill /t /f /im tor.exe")
- print('Closing Tor...')
- driver.close()
- time.sleep(3)
- return
-
-
-# Creates a FireFox 'driver' and configures its 'Profile'
-# to use the Tor proxy and socket
-def createFFDriver():
- from MarketPlaces.Initialization.markets_mining import config
-
- ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
-
- ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
- ff_prof.set_preference("places.history.enabled", False)
- ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
- ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
- ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
- ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
- ff_prof.set_preference("signon.rememberSignons", False)
- ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- # ff_prof.set_preference("network.dns.disablePrefetch", True)
- # ff_prof.set_preference("network.http.sendRefererHeader", 0)
- ff_prof.set_preference("permissions.default.image", 3)
- ff_prof.set_preference("browser.download.folderList", 2)
- ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
- ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
- ff_prof.set_preference('network.proxy.type', 1)
- ff_prof.set_preference("network.proxy.socks_version", 5)
- ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
- ff_prof.set_preference('network.proxy.socks_port', 9150)
- ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", False)
- ff_prof.update_preferences()
-
- service = Service(config.get('TOR', 'geckodriver_path'))
-
- driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
- driver.maximize_window()
-
- return driver
-
-
-# The driver attempts to load the base URL; if the site cannot be accessed, the driver is closed
-#return: the selenium driver, or the string 'down' if the site is unreachable
-def getAccess():
- url = getFixedURL()
- driver = createFFDriver()
- try:
- driver.get(url)
- return driver
- except:
- driver.close()
- return 'down'
-
-
-# Waits for a specific element to become visible so that the whole page has loaded;
-# no account login or captcha is required for this market (startCrawling does not call this function)
-#@param: current selenium web driver
-def login(driver):
- # wait for page to show up (This Xpath may need to change based on different seed url)
- WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/div[2]/div/div/div/main/article/div/section[4]/div/div[1]/div/div/div/div/ul/li[15]/ul/li[3]/a")))
-
-# Saves the crawled html page, makes the directory path for html pages if not made
-def savePage(driver, page, url):
- cleanPage = cleanHTML(driver, page)
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
- open(filePath, 'wb').write(cleanPage.encode('utf-8'))
- return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-#@param: raw url as crawler crawls through every site
-def getFullPathName(url):
- from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
-
- mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
- else:
- fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
- return fullPath
-
-
-# Creates the file name from the passed URL; falls back to a counter-based name if nothing alphanumeric remains after cleaning
-#@param: raw url as crawler crawls through every site
-def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
- if (name == ''):
- name = str(counter)
- counter = counter + 1
- return name
-
-
-# Returns the list of category URLs of interest; the crawler iterates through this list
-# For TorMarket, the Guides and Tutorials, Malware, and Services product categories are crawled
-def getInterestedLinks():
- links = []
-
- # Tutorials
- links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/guides-tutorials/')
- # Malware
- links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/malware/')
- # Services
- links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/')
-
- return links
-
-
-# Iterates through the links of interest; each listing page is loaded and saved, every product
-# (description) page on it is visited and saved, and pagination is followed until exhausted
-#@param: selenium driver
-def crawlForum(driver):
- print("Crawling the TorMarket market")
-
- linksToCrawl = getInterestedLinks()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
- try:
- has_next_page = True
- count = 0
-
- while has_next_page:
- try:
- driver.get(link)
- except:
- driver.refresh()
- html = driver.page_source
- savePage(driver, html, link)
-
- list = productPages(html)
- for item in list:
- itemURL = urlparse.urljoin(baseURL, str(item))
- try:
- driver.get(itemURL)
- except:
- driver.refresh()
- savePage(driver, driver.page_source, item)
- driver.back()
-
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
-
- try:
- link = driver.find_element(by=By.LINK_TEXT, value='NEXT').get_attribute('href')
- if link == "":
- raise NoSuchElementException
- count += 1
-
- except NoSuchElementException:
- has_next_page = False
-
- except Exception as e:
- print(link, e)
- i += 1
-
- print("Crawling the TorMarket market done.")
-
-
-# Returns 'True' if the link is a description link
-#@param: url of any url crawled
-#return: true if is a description page, false if not
-def isDescriptionLink(url):
- if 'shop' in url:
- return True
- return False
-
-
-# Returns True if the link is a listingPage link
-#@param: url of any url crawled
-#return: true if is a Listing page, false if not
-def isListingLink(url):
- if 'product-category' in url:
- return True
- return False
-
-
-# Calls the parser to extract the description links from a listing page
-#@param: html page source of a listing page reached from getInterestedLinks()
-#return: list of description links that should be crawled through
-def productPages(html):
- soup = BeautifulSoup(html, "html.parser")
- return tormarket_links_parser(soup)
-
-
-# Drop links that "signout"
-# def isSignOut(url):
-# #absURL = urlparse.urljoin(url.base_url, url.url)
-# if 'signout' in url.lower() or 'logout' in url.lower():
-# return True
-#
-# return False
-
-
-def crawler():
- startCrawling()
- # print("Crawling and Parsing BestCardingWorld .... DONE!")
diff --git a/MarketPlaces/TorMarket/parser.py b/MarketPlaces/TorMarket/parser.py
deleted file mode 100644
index 6a6fac0..0000000
--- a/MarketPlaces/TorMarket/parser.py
+++ /dev/null
@@ -1,189 +0,0 @@
-__author__ = 'DarkWeb'
-
-# Here, we are importing the auxiliary functions to clean or convert data
-from MarketPlaces.Utilities.utilities import *
-
-# Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup
-
-import re
-
-# Parses a description page from its soup object and extracts the fields of interest;
-# the values are collected and returned together as a single row
-#@param: soup object of the html description page
-#return: 'row' tuple holding the scraped fields of the description page
-def tormarket_description_parser(soup):
-
- # Fields to be parsed
-
- vendor = "-1" # 0 *Vendor_Name
- success = "-1" # 1 Vendor_Successful_Transactions
- rating_vendor = "-1" # 2 Vendor_Rating
- name = "-1" # 3 *Product_Name
- describe = "-1" # 4 Product_Description
- CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
- MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
- category = "-1" # 7 Product_Category
- views = "-1" # 8 Product_Number_Of_Views
- reviews = "-1" # 9 Product_Number_Of_Reviews
- rating_item = "-1" # 10 Product_Rating
- addDate = "-1" # 11 Product_AddedDate
- BTC = "-1" # 12 Product_BTC_SellingPrice
- USD = "-1" # 13 Product_USD_SellingPrice
- EURO = "-1" # 14 Product_EURO_SellingPrice
- sold = "-1" # 15 Product_QuantitySold
- left = "-1" # 16 Product_QuantityLeft
- shipFrom = "-1" # 17 Product_ShippedFrom
- shipTo = "-1" # 18 Product_ShippedTo
- image = "-1" # 19 Product_Image
- vendor_image = "-1" # 20 Vendor_Image
-
- #finding the name of the product
- name_of_product = soup.find("h1", {"class": "product_title entry-title"}).find("a").text
- name = cleanString(name_of_product.strip())
-
- #finding the description of the product
- description_of_product = soup.find("div", {"class": "woocommerce-product-details__short-description"}).text
- describe = cleanString(description_of_product.strip())
-
- #finding the name of the vendor
- name_of_vendor = soup.find("div", {"class": "wcfmmp_sold_by_store"})
- if name_of_vendor is not None:
- name_of_vendor = name_of_vendor.find("a").text
- vendor = cleanString(name_of_vendor.strip())
- else:
- vendor = "TorMarket"
-
- #finding the price of the item
- price = soup.find("p", {"class": "price"}).find("bdi").text
- price_cleaned = price[1:]
- USD = price_cleaned.strip()
-
- category = soup.find('span', {"class": "posted_in"}).text
- category = category.split(':')[-1]
- category = category.replace(',', '/')
- category = cleanString(category.strip())
- #everything else gets a -1 because they are not found
-
- # Populating the final variable (this should be a list with all fields scraped)
- row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
-
- # Sending the results
- return row
-
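The price handling above strips a single leading currency symbol with price[1:]; the snippet below is an illustrative, slightly more defensive sketch that keeps only the numeric part of a price string (the helper name is not part of the original code).

# Illustrative helper: extract the numeric portion of a price string such as
# '$1,499.99'; returns '-1' when no number is present.
import re

def extract_price(raw: str) -> str:
    match = re.search(r'\d[\d,]*(?:\.\d+)?', raw)
    return match.group().replace(',', '') if match else '-1'

# extract_price('$1,499.99') -> '1499.99'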
-
-# Parses a listing page from its soup object and extracts the fields of interest for every product card;
-# the values are collected into per-field lists and returned after being organized
-#@param: soup object of the html listing page
-#return: organized rows, one per product, holding the info scraped from the listing page
-def tormarket_listing_parser(soup):
-
- # Fields to be parsed
- nm = 0 # *Total_Products (Should be Integer)
- mktName = "TorMarket" # 0 *Marketplace_Name
- vendor = [] # 1 *Vendor y
- rating_vendor = [] # 2 Vendor_Rating
- success = [] # 3 Vendor_Successful_Transactions
- name = [] # 4 *Product_Name y
- CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
- MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
- category = [] # 7 Product_Category y
- describe = [] # 8 Product_Description
- views = [] # 9 Product_Number_Of_Views
- reviews = [] # 10 Product_Number_Of_Reviews
- rating_item = [] # 11 Product_Rating
- addDate = [] # 12 Product_AddDate
- BTC = [] # 13 Product_BTC_SellingPrice
- USD = [] # 14 Product_USD_SellingPrice y
- EURO = [] # 15 Product_EURO_SellingPrice
- sold = [] # 16 Product_QuantitySold
- qLeft = [] # 17 Product_QuantityLeft
- shipFrom = [] # 18 Product_ShippedFrom
- shipTo = [] # 19 Product_ShippedTo
- image = [] # 20 Product_Image
- image_vendor = [] # 21 Vendor_Image
- href = [] # 22 Product_Links
-
- products_list = soup.find('ul', {"class": "products columns-3 tablet-columns-2 mobile-columns-1"}).find_all('li')
- nm = len(products_list)
-
- for product in products_list:
- # Finding the name of the product
- name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
- name_of_product_cleaned = cleanString(name_of_product.strip())
- # print(name_of_product_cleaned)
- name.append(name_of_product_cleaned)
- #finding the URL
- try:
- url = product.find("div", {"class": "product-loop-content text-center"}).find("a").get("href")
- # print(url)
- href.append(url)
- except AttributeError as e:
- print("I can't find the link")
- raise e
-
- #finding the rating of the product
- rating_score_of_product = product.find("div", {"class": "product-loop-content text-center"}).find("div").find("span").text
- rating_item.append(cleanString(rating_score_of_product.strip()))
- # print("done")
- #finding the rating of the vendors
- rating_score_of_vendor = product.find("div", {"class": "wcfmmp-store-rating"})
- if rating_score_of_vendor is not None:
- rating_score_of_vendor = rating_score_of_vendor.find("strong").text
- rating_vendor.append(cleanString(rating_score_of_vendor.strip()))
- else:
- rating_vendor.append('-1')
- # print("done")
- #finding the cost in USD
- cost = product.find("span", {"class": "woocommerce-Price-amount amount"}).text
- USD.append(cost)
- # print("done")
- #finding the name of the vendor
- vendor_name = product.find("div", {"class": "wcfmmp_sold_by_wrapper"})
- if vendor_name is not None:
- vendor_name = vendor_name.find("a").text
- vendor.append(cleanString(vendor_name.strip()))
- else:
- vendor.append(mktName)
- # print("done")
- #everything else appends a -1
- success.append("-1")
- CVE.append("-1")
- MS.append("-1")
- category.append("-1")
- describe.append("-1")
- views.append("-1")
- reviews.append("-1")
- addDate.append("-1")
- BTC.append("-1")
- EURO.append("-1")
- sold.append("-1")
- qLeft.append("-1")
- shipFrom.append("-1")
- shipTo.append("-1")
- # print("Done! moving onto the next product!")
- # print(len(shipTo))
-
-
- # Populate the final variable (this should be a list with all fields scraped)
- return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
- reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
-
-
-#called by the crawler to get description links on a listing page
-#@param: beautifulsoup object that is using the correct html page (listing page)
-#return: list of description links from a listing page
-def tormarket_links_parser(soup):
-
- # Returning all links that should be visited by the Crawler
-
- href = []
- listing = soup.findAll('div', {"class": "product-loop-content text-center"})
-
- for a in listing:
- bae = a.find('h2', {"class": "woocommerce-loop-product__title"}).find('a', href=True)
- link = bae['href']
- href.append(link)
-
- return href
\ No newline at end of file
diff --git a/MarketPlaces/HiddenMarket/crawler_selenium.py b/MarketPlaces/WeTheNorth/crawler_selenium.py
similarity index 63%
rename from MarketPlaces/HiddenMarket/crawler_selenium.py
rename to MarketPlaces/WeTheNorth/crawler_selenium.py
index 533129a..c6d5b70 100644
--- a/MarketPlaces/HiddenMarket/crawler_selenium.py
+++ b/MarketPlaces/WeTheNorth/crawler_selenium.py
@@ -1,7 +1,7 @@
__author__ = 'DarkWeb'
'''
-HiddenMarket Market Crawler (Selenium)
+WeTheNorth Market Crawler (Selenium)
'''
from selenium import webdriver
@@ -16,20 +16,20 @@ from PIL import Image
import urllib.parse as urlparse
import os, re, time
+from datetime import date
import subprocess
-import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
-from MarketPlaces.HiddenMarket.parser import hiddenmarket_links_parser
+from MarketPlaces.WeTheNorth.parser import wethenorth_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
-baseURL = 'http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/'
+baseURL = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion'
# Opens Tor Browser, crawls the website
def startCrawling():
- marketName = getMKTName()
+ marketName = getMarketName()
driver = getAccess()
if driver != 'down':
@@ -40,39 +40,40 @@ def startCrawling():
print(driver.current_url, e)
closeDriver(driver)
- new_parse(marketName, baseURL, True)
+ new_parse(marketName, False)
# Login using premade account credentials and do login captcha manually
def login(driver):
- # wait for login page
+ time.sleep(3)
+ #wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/div[3]/div[3]")))
+ (By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[1]/input")))
- # entering username and password into input boxes
- # usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
- # Username here
- # usernameBox.send_keys('ct1234')
- # passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
- # Password here
- # passwordBox.send_keys('DementedBed1230')
+ #entering username and password into input boxes
+ usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[1]/input')
+ #Username here
+ usernameBox.send_keys('blabri')
+ passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[2]/input')
+ #Password here
+ passwordBox.send_keys('fishowal')
'''
# wait for captcha page show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- (By.XPATH, "/html/body/main/div/div/div/div/div/form/div[3]/div/div[1]/label/img")))
+ (By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img")))
# save captcha to local
- driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[3]/div/div[1]/label/img').screenshot(
- r'..\captcha.png')
+ driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img').screenshot(
+ r'..\WeTheNorth\captcha.png')
# This method will show image in any image viewer
- im = Image.open(r'..\captcha.png')
+ im = Image.open(r'..\WeTheNorth\captcha.png')
im.show()
# wait until input space show up
- inputBox = driver.find_element(by=By.XPATH, value='//*[@id="captcha"]')
+ inputBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[4]/input')
# ask user input captcha solution in terminal
userIn = input("Enter solution: ")
@@ -81,24 +82,24 @@ def login(driver):
inputBox.send_keys(userIn)
# click the verify(submit) button
- driver.find_element(by=By.XPATH, value="/html/body/main/div/div/div/div/div/form/div[4]/button").click()
+ driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div[2]/div/div[3]/form/div[5]/input").click()
'''
- # input("Press ENTER when CAPTCHA is completed\n")
+ input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
- # WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
- # (By.XPATH, '/html/body/main/div/div/div[1]/div/div[1]/div/h5')))
+ WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
+ (By.XPATH, '//*[@id="information"]')))
# Returns the name of the website
-def getMKTName():
- name = 'HiddenMarket'
+def getMarketName():
+ name = 'WeTheNorth'
return name
# Return the link of the website
def getFixedURL():
- url = 'http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/'
+ url = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion'
return url
@@ -109,7 +110,7 @@ def closeDriver(driver):
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
- driver.quit()
+ driver.close()
time.sleep(3)
return
@@ -140,7 +141,7 @@ def createFFDriver():
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
- ff_prof.set_preference("javascript.enabled", False)
+ ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
@@ -198,34 +199,19 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
- # Civil Software
- links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/civil_softwares')
- # Tutorials - Carding
- links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/carding')
- # Digital - Hacks
- links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/hacks')
- # Digital - Exploit Kit
- links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/exploit_kit')
- # 0Day
- links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/0day')
- # Digital Forensics
- links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/digital_forensics')
- # Tutorials - Mining
- links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/mining')
- # Tutorials - Worms
- links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/worms')
- # Tutorials - Viruses
- links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/viruses')
- # Tutorials - Trojans
- links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/trojans')
- # Tutorials - Botnets
- links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/botnets')
+ # # Fraud Software
+ # links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=5&podcategory=3')
+ # # Guides and Tutorials - Hacking
+ # links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=3&podcategory=3')
+ # Software and Malware
+ links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=10')
+
return links
def crawlForum(driver):
- print("Crawling the HiddenMarket market")
+ print("Crawling the WeTheNorth market")
linksToCrawl = getInterestedLinks()
@@ -233,20 +219,15 @@ def crawlForum(driver):
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
- categoryLink = link
- maxNumPages = 0 # temp value.
+
try:
+ pg_counter = 1
has_next_page = True
count = 0
- pageCount = 1
while has_next_page:
try:
driver.get(link)
- if pageCount == 1:
- maxNumPages = int(driver.find_element(by=By.CLASS_NAME, value='main')
- .find_element(by=By.CLASS_NAME, value='pages')
- .find_elements(By.CLASS_NAME, value='page')[-1].text)
except:
driver.refresh()
html = driver.page_source
@@ -262,19 +243,22 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item)
driver.back()
- # # comment out
- # break
- #
- # # comment out
- # if count == 1:
- # break
+ # comment out
+ break
+
+ # comment out
+ if count == 1:
+ break
try:
- pageCount += 1
- if pageCount > maxNumPages:
+ nav = driver.find_element(by=By.XPATH, value=
+ '/html/body/div[2]/div[3]/div[3]/div[2]/div[7]')
+ pg_counter += 1
+ pg_counter_str = "p=" + str(pg_counter) + "&"
+ a = nav.find_element(by=By.XPATH, value = '//a[contains(@href,"'+pg_counter_str+'")]')
+ link = a.get_attribute('href')
+ if link == "":
raise NoSuchElementException
- pageLink = "/" + str(pageCount) + "/"
- link = categoryLink + pageLink
count += 1
except NoSuchElementException:
@@ -284,7 +268,7 @@ def crawlForum(driver):
print(link, e)
i += 1
- print("Crawling the HiddenMarket market done.")
+ input("Crawling WeTheNorth market done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link
@@ -304,7 +288,7 @@ def isListingLink(url):
# calling the parser to define the links
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
- return hiddenmarket_links_parser(soup)
+ return wethenorth_links_parser(soup)
def crawler():
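The WeTheNorth pagination above locates the anchor whose href carries the next 'p=<n>&' value; the following is an illustrative sketch (not part of the crawler) of building that next-page URL directly from the current one.

# Illustration only: increment the 'p' query parameter to form the next-page URL.
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

def next_page_url(current_url: str) -> str:
    parts = urlparse(current_url)
    query = parse_qs(parts.query)
    page = int(query.get('p', ['1'])[0]) + 1
    query['p'] = [str(page)]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))

# next_page_url('http://<host>/items.php?category=10')     -> ...?category=10&p=2
# next_page_url('http://<host>/items.php?category=10&p=2') -> ...?category=10&p=3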
diff --git a/MarketPlaces/WeTheNorth/parser.py b/MarketPlaces/WeTheNorth/parser.py
new file mode 100644
index 0000000..56a42ec
--- /dev/null
+++ b/MarketPlaces/WeTheNorth/parser.py
@@ -0,0 +1,248 @@
+__author__ = 'DarkWeb'
+
+# Here, we are importing the auxiliary functions to clean or convert data
+from MarketPlaces.Utilities.utilities import *
+
+# Here, we are importing BeautifulSoup to search through the HTML tree
+from bs4 import BeautifulSoup
+
+import re
+
+# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
+def wethenorth_description_parser(soup):
+
+ # Fields to be parsed
+
+ vendor = "-1" # 0 *Vendor_Name
+ success = "-1" # 1 Vendor_Successful_Transactions
+ rating_vendor = "-1" # 2 Vendor_Rating
+ name = "-1" # 3 *Product_Name
+ describe = "-1" # 4 Product_Description
+ CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+ MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
+ category = "-1" # 7 Product_Category
+ views = "-1" # 8 Product_Number_Of_Views
+ reviews = "-1" # 9 Product_Number_Of_Reviews
+ rating_item = "-1" # 10 Product_Rating
+ addDate = "-1" # 11 Product_AddedDate
+ BTC = "-1" # 12 Product_BTC_SellingPrice
+ USD = "-1" # 13 Product_USD_SellingPrice
+ EURO = "-1" # 14 Product_EURO_SellingPrice
+ sold = "-1" # 15 Product_QuantitySold
+ left = "-1" # 16 Product_QuantityLeft
+ shipFrom = "-1" # 17 Product_ShippedFrom
+ shipTo = "-1" # 18 Product_ShippedTo
+
+ # Finding Product Name
+ listDes = soup.find('div', {'class': "listDes"})
+ name = listDes.find('h2').text
+ name = name.replace('\n', ' ')
+ name = name.replace(",", "")
+ name = name.strip()
+
+ # Finding Vendor
+ vendor = listDes.find('b').text
+ vendor = vendor.replace(",", "")
+ vendor = vendor.replace("...", "")
+ vendor = vendor.replace("-", "")
+ vendor = vendor.strip()
+
+ # Finding Vendor Rating
+ # rating = listDes.find('span',{'class':'levelSet'})
+ # rating = rating.text
+ # rating = rating.replace('\n', ' ')
+ # rating = rating.replace(",", "")
+ # rating = rating.strip()
+
+ # Finding Successful Transactions
+ success = listDes.find_all('p')[1]
+ success = success.find('span').text
+ success = success.split()
+ success = success[0].strip()
+
+ # Finding Prices - all prices on WeTheNorth are listed in CAD; the CAD suffix is kept in the string so the currency is explicit
+ padp = listDes.find('p',{'class':'padp'})
+ USD = padp.find('span').text
+ USD = USD.strip()
+
+ # Finding Escrow - no escrow on WTN market
+
+ shipping_info = listDes.find('tbody')
+ if "Digital" not in shipping_info:
+ shipping_info = shipping_info.find_all('tr')
+ row1 = shipping_info[0].find_all('td')
+
+ # Finding Shipment Information (Origin)
+ shipFrom = row1[-1].text
+ shipFrom=shipFrom.strip()
+ if shipFrom=="":
+ shipFrom="-1"
+
+ row2 = shipping_info[1].find_all('td')
+
+ # Finding Shipment Information (Destination)
+ shipTo = row2[-1].text
+ shipTo= shipTo.strip()
+ if shipTo == "":
+ shipTo = "-1"
+
+ # Finding the Product description
+ describe = soup.find("div",{'class':'tabcontent'})
+ describe = describe.find('p').text
+ describe = describe.replace("\n", " ")
+ describe = describe.replace("\r", " ")
+ describe = describe.strip()
+
+ '''
+ # Finding the Number of Product Reviews
+ tag = soup.findAll(text=re.compile('Reviews'))
+ for index in tag:
+ reviews = index
+ par = reviews.find('(')
+ if par >=0:
+ reviews = reviews.replace("Reviews (","")
+ reviews = reviews.replace(")","")
+ reviews = reviews.split(",")
+ review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
+ else :
+ review = "-1"
+ '''
+
+ # Searching for CVE and MS categories
+ # no CVE or MS for WTN market
+
+ # Populating the final variable (this should be a list with all fields scraped)
+ row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
+ BTC, USD, EURO, sold, left, shipFrom, shipTo)
+
+ # Sending the results
+ return row
+
+
+# This is the method to parse the Listing Pages
+def wethenorth_listing_parser(soup):
+
+ # Fields to be parsed
+ nm = 0 # *Total_Products (Should be Integer)
+ mktName = "WeTheNorth" # 0 *Marketplace_Name
+ vendor = [] # 1 *Vendor y
+ rating_vendor = [] # 2 Vendor_Rating
+ success = [] # 3 Vendor_Successful_Transactions
+ name = [] # 4 *Product_Name y
+ CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+ MS = [] # 6 Product_MS_Classification (Microsoft Security)
+ category = [] # 7 Product_Category y
+ describe = [] # 8 Product_Description
+ views = [] # 9 Product_Number_Of_Views
+ reviews = [] # 10 Product_Number_Of_Reviews
+ rating_item = [] # 11 Product_Rating
+ addDate = [] # 12 Product_AddDate
+ BTC = [] # 13 Product_BTC_SellingPrice
+ USD = [] # 14 Product_USD_SellingPrice y
+ EURO = [] # 15 Product_EURO_SellingPrice
+ sold = [] # 16 Product_QuantitySold
+ qLeft =[] # 17 Product_QuantityLeft
+ shipFrom = [] # 18 Product_ShippedFrom
+ shipTo = [] # 19 Product_ShippedTo
+ href = [] # 20 Product_Links
+
+ right_content = soup.find('div', {"class": "right-content"})
+ listing = right_content.findAll('div', {"class": "col-1search"})
+ listing = listing[3:]
+
+ # Populating the Number of Products
+ nm = len(listing)
+
+ for a in listing:
+ bae = a.findAll('a', href=True)
+
+ # Adding the url to the list of urls
+ link = bae[0].get('href')
+ link = cleanLink(link)
+ href.append(link)
+
+ # Finding the Vendor
+ vendor_name = a.find('p', {'class': 'padp'})
+ vendor_name = vendor_name.find('a').text
+ vendor_name = vendor_name.replace(",", "")
+ vendor_name = vendor_name.strip()
+ vendor.append(vendor_name)
+
+ # Finding the Product
+ product = bae[0].text
+ product = product.replace('\n', ' ')
+ product = product.replace(",", "")
+ product = product.strip()
+ name.append(product)
+
+ # Finding the Category
+ category_name = a.find('p', {'class': 'padp'}).text
+ first_dash = category_name.find('-')
+ second_dash = category_name.find('-', first_dash + 1)
+ category_name = category_name[first_dash + 1:second_dash]
+ category_name=category_name.strip()
+ category.append(category_name)
+
+ # Finding Views
+ view_count = a.text
+ view_count = view_count[view_count.find('Views:'): view_count.find('Sales:')]
+ view_count = view_count.replace('Views:', ' ')
+ view_count = view_count.replace('/', ' ')
+ view_count = view_count.strip()
+ views.append(view_count)
+
+ # Finding success sales
+ sold_count = a.text
+ sold_count = sold_count[sold_count.find('Sales:'): sold_count.find('Short')]
+ sold_count = sold_count.replace('Sales:', ' ')
+ sold_count = sold_count.replace('/', ' ')
+ sold_count = sold_count.strip()
+ success.append(sold_count)
+
+ # Searching for CVE and MS categories
+ # no CVE or MS in WTN market
+ cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4,7}'))
+ if not cve:
+ cveValue="-1"
+ else:
+ cee = " "
+ for idx in cve:
+ cee += (idx)
+ cee += " "
+ cee = cee.replace(',', ' ')
+ cee = cee.replace('\n', '')
+ cveValue=cee
+ CVE.append(cveValue)
+
+ ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
+ if not ms:
+ MSValue="-1"
+ else:
+ me = " "
+ for im in ms:
+ me += (im)
+ me += " "
+ me = me.replace(',', ' ')
+ me = me.replace('\n', '')
+ MSValue=me
+ MS.append(MSValue)
+
+ # Populate the final variable (this should be a list with all fields scraped)
+ return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
+ reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+
+
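The following is a compact illustrative sketch (not part of the original parser) of the per-listing CVE/MS extraction done above, using re.findall on a product card's text; the CVE pattern matches the 4-to-7 digit identifiers used since 2014.

# Illustrative helper: extract CVE and MS identifiers from a product card's
# text, returning '-1' when none are found, mirroring the parser above.
import re

def extract_ids(text):
    cves = re.findall(r'CVE-\d{4}-\d{4,7}', text)
    mss = re.findall(r'MS\d{2}-\d{3}', text)
    return (' '.join(cves) if cves else '-1',
            ' '.join(mss) if mss else '-1')

# extract_ids('Exploit for CVE-2021-44228 / MS17-010') -> ('CVE-2021-44228', 'MS17-010')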
+def wethenorth_links_parser(soup):
+
+ # Returning all links that should be visited by the Crawler
+ href = []
+ right_content = soup.find('div',{"class": "right-content"})
+ listing = right_content.findAll('div', {"class": "col-1search"})
+ # Cut out the irrelevant products shown in blue; the first three products on each page are usually unrelated
+ listing = listing[3:]
+ for a in listing:
+
+ link = a.find('a')
+ link = link['href']
+ href.append(link)
+
+ return href
\ No newline at end of file
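A hedged usage sketch of the new WeTheNorth parser on a page previously saved by the crawler; the file path below is hypothetical and only mirrors the layout produced by savePage/getFullPathName.

# Usage sketch only; the path is hypothetical and mirrors the savePage layout.
from bs4 import BeautifulSoup
from MarketPlaces.WeTheNorth.parser import wethenorth_listing_parser, wethenorth_links_parser

with open('MarketPlaces/WeTheNorth/HTML_Pages/2023-08-01/Listing/example.html', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

rows = wethenorth_listing_parser(soup)    # organized product rows for the database
links = wethenorth_links_parser(soup)     # description-page URLs to crawl next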