# ============================ AbyssForum parser ============================
__author__ = 'Helium' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from Forums.Utilities.utilities import * | |||
from datetime import date
from datetime import datetime
from datetime import timedelta
import re | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) | |||
def abyssForums_description_parser(soup): | |||
# Fields to be parsed | |||
topic = "-1" # 0 topic name | |||
user = [] # 1 all users of each post | |||
    addDate = []    # 2 all dates of each post
feedback = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format) | |||
status = [] # 4 all user's authority in each post such as (adm, member, dangerous) | |||
    reputation = []    # 5 all users' karma in each post (usually found as a number)
sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post) | |||
post = [] # 7 all messages of each post | |||
interest = [] # 8 all user's interest in each post | |||
image_user = [] # 9 all user avatars of each post | |||
image_post = [] # 10 all first images of each post | |||
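    # Note: the lists above are parallel arrays -- one entry is appended per post,
    # so every post must push exactly one value (or "-1") into each list to keep
    # the columns of the final row aligned.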
# Finding the topic (should be just one coming from the Listing Page) | |||
li = soup.find("div", {"class": "page-body"}).find("h2", {"class": "topic-title"}) | |||
topic = li.text.replace(",","") | |||
topic = topic.replace("\n","") | |||
topic = cleanString(topic.strip()) | |||
regex = re.compile('post has-profile.*') | |||
posts = soup.find_all('div', {"class": regex}) | |||
# print(len(posts)) | |||
# For each message (post), get all the fields we are interested to: | |||
for ipost in posts: | |||
# Finding the author (user) of the post | |||
author = ipost.find('a', {"class": "username"}).text | |||
user.append(cleanString(author)) # Remember to clean the problematic characters | |||
status.append("-1") | |||
reputation.append("-1") | |||
interest.append("-1") | |||
sign.append("-1") | |||
feedback.append("-1") | |||
image_post.append("-1") | |||
        img = ipost.find('dl', {"class": "postprofile"}).find('img')
        if img is not None:
            img = img.get('src').split('base64,')[-1]
        else:
            img = "-1"
        image_user.append(img)
        date_time_attrs = ipost.find('time').attrs
        date_str = date_time_attrs['datetime'][0:10]
        time_str = date_time_attrs['datetime'][11:19]
        date_time_obj = datetime.strptime(date_str + " " + time_str, '%Y-%m-%d %H:%M:%S')
addDate.append(date_time_obj) | |||
# Finding the post | |||
inner = ipost.find('div', {"class": "content"}) | |||
inner = inner.text.strip() | |||
post.append(cleanString(inner)) | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) | |||
# Sending the results | |||
return row | |||
# This is the method to parse the Listing Pages (one page with many posts) | |||
def abyssForums_listing_parser(soup: BeautifulSoup): | |||
nm = 0 # this variable should receive the number of topics | |||
forum = "AbyssForum" # 0 *forum name | |||
board = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree. | |||
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) | |||
author = [] # 2 all authors of each topic | |||
topic = [] # 3 all topics | |||
views = [] # 4 number of views of each topic | |||
posts = [] # 5 number of posts of each topic | |||
    href = []           # 6 this variable should receive all cleaned urls (we will use this to do the merge between
                        #   Listing and Description pages)
    addDate = []        # 7 when the topic was created (difficult to find)
    image_author = []   # 8 all author avatars used in each topic
#finding the board | |||
board = soup.find("h2", {"class": "forum-title"}).text | |||
board = cleanString(board.strip()) | |||
    type_of_posts = soup.find_all("li", {"class": re.compile(r"row bg\d")})
for literature in type_of_posts: | |||
title_of_post = literature.find("a", {"class": "topictitle"}).text | |||
title_of_post = cleanString(title_of_post) | |||
topic.append(title_of_post) | |||
user = literature.find("div", {"class": "topic-poster responsive-hide left-box"}).find("a", {"class": "username"}).text | |||
author.append(user) | |||
num_post = literature.find("dd", {"class": "posts"}).text.replace("Replies","").strip() | |||
posts.append(num_post) | |||
num_view = literature.find("dd", {"class": "views"}).text.replace("Views","").strip() | |||
views.append(num_view) | |||
#if int(num_post) != 0: join the last user who posted with the author? | |||
# reply = literature.find("dd", {"class": "lastpost"}).find("a", {"class": "username"}).text | |||
# user.append(reply) | |||
        date_time_attrs = literature.find('time').attrs
        date_str = date_time_attrs['datetime'][0:10]
        time_str = date_time_attrs['datetime'][11:19]
        date_added = datetime.strptime(date_str + " " + time_str, '%Y-%m-%d %H:%M:%S')
addDate.append(date_added) | |||
listing_href = literature.find("a", {"class": "topictitle"}).get("href") | |||
href.append(listing_href) | |||
image_author.append("-1") | |||
nm = len(topic) | |||
return organizeTopics( | |||
forum=forum, | |||
nm=nm, | |||
board=board, | |||
author=author, | |||
topic=topic, | |||
views=views, | |||
posts=posts, | |||
href=href, | |||
addDate=addDate, | |||
image_author=image_author | |||
) | |||
def abyssForum_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
#print(soup.find('table', {"class": "tborder clear"}).find( | |||
# 'tbody').find_all('tr', {"class": "inline_row"})) | |||
listing = soup.find_all('dl', {"class": "row-item topic_read"}) | |||
for a in listing: | |||
link = a.find('div', {"class": "list-inner"}).find('a').get('href') | |||
href.append(link) | |||
return href |
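
# A minimal usage sketch (illustrative, not part of the original pipeline): the crawler saves
# raw HTML pages to disk, and each saved page is later handed to these parsers as a
# BeautifulSoup object. The sample file name below is hypothetical.
if __name__ == "__main__":
    with open("abyssforum_description_sample.html", encoding="utf8") as sample:  # hypothetical file
        sample_soup = BeautifulSoup(sample, "html.parser")
    sample_row = abyssForums_description_parser(sample_soup)
    print(sample_row[0])  # the cleaned topic name (field 0 of the returned tuple)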
# ==================== Altenens Forum Crawler (Selenium) ====================
__author__ = 'Helium' | |||
''' | |||
Altenens Forum Crawler (Selenium) | |||
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.common.by import By | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, re, time | |||
from datetime import date | |||
import configparser | |||
import subprocess | |||
from bs4 import BeautifulSoup | |||
from Forums.Initialization.prepare_parser import new_parse | |||
from Forums.Altenens.parser import altenens_links_parser | |||
from Forums.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'https://altenens.is/' | |||
# Opens Tor Browser, crawls the website | |||
def startCrawling(): | |||
forumName = getForumName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
login(driver) | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
new_parse(forumName, baseURL, True) | |||
# Login using premade account credentials and do login captcha manually | |||
def login(driver): | |||
#click login button | |||
login_link = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]').get_attribute('href') | |||
driver.get(login_link) # open tab with url | |||
#entering username and password into input boxes | |||
usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[1]/dd/input') | |||
#Username here | |||
usernameBox.send_keys('mylittlepony45')#sends string to the username box | |||
passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[2]/dd/div/div/input') | |||
#Password here | |||
passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox | |||
input("Press ENTER when CAPTCHA is completed\n") | |||
# wait for listing page show up (This Xpath may need to change based on different seed url) | |||
# wait for 50 sec until id = tab_content is found, then cont | |||
WebDriverWait(driver, 50).until(EC.visibility_of_element_located( | |||
(By.XPATH, '/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]'))) | |||
# Returns the name of the website | |||
def getForumName(): | |||
name = 'Altenens' | |||
return name | |||
# Return the link of the website | |||
def getFixedURL(): | |||
url = 'https://altenens.is/' | |||
return url | |||
# Closes Tor Browser | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() #close tab | |||
time.sleep(3) | |||
return | |||
# Creates FireFox 'driver' and configure its 'Profile' | |||
# to use Tor proxy and socket | |||
def createFFDriver(): | |||
from Forums.Initialization.forums_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
# ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
# ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", True) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
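
# The config.get() calls above assume an INI file (loaded by Forums.Initialization.forums_mining)
# containing at least the keys below. Section and key names are taken from this file; the values
# shown are placeholders, not the project's real paths.
#
#   [TOR]
#   firefox_binary_path = C:\path\to\Tor Browser\Browser\firefox.exe
#   firefox_profile_path = C:\path\to\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
#   geckodriver_path = C:\path\to\geckodriver.exe
#
#   [Project]
#   shared_folder = C:\path\to\shared_folder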
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url)# open url in browser | |||
return driver | |||
except: | |||
driver.close()# close tab | |||
return 'down' | |||
# Saves the crawled html page | |||
def savePage(driver, html, url): | |||
cleanPage = cleanHTML(driver, html) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
def getFullPathName(url): | |||
from Forums.Initialization.forums_mining import config, CURRENT_DATE | |||
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
return fullPath | |||
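
# Example (hypothetical values): a listing URL such as https://altenens.is/forums/hacking.469162/
# is written under <shared_folder>/Forums/Altenens/HTML_Pages/<CURRENT_DATE>/Listing/ using the
# sanitized file name produced by getNameFromURL() below.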
# Creates the file name from passed URL | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if (name == ''): | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
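
# Illustrative example: 'https://altenens.is/forums/hacking.469162/' becomes
# 'httpsaltenensisforumshacking469162'; a URL with no alphanumeric characters falls back to the counter.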
def getInterestedLinks(): | |||
links = [] | |||
# Hacking | |||
links.append('https://altenens.is/forums/hacking.469162/') | |||
# Hacking showoff | |||
links.append('https://altenens.is/forums/hacking-showoff.469232/') | |||
# Remote administration | |||
links.append('https://altenens.is/forums/remote-administration.469161/') | |||
# Cracking tools | |||
links.append('https://altenens.is/forums/cracking-tools.469204/') | |||
# Cracking tutorials | |||
links.append('https://altenens.is/forums/cracking-tutorials-other-methods.469205/') | |||
# Combo lists and configs | |||
links.append('https://altenens.is/forums/combolists-and-configs.469206/') | |||
# Programming | |||
links.append('https://altenens.is/forums/programming.469239/') | |||
return links | |||
# newest version of crawling | |||
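# The loop below walks three levels of pagination:
#   1. each seed (board) URL returned by getInterestedLinks(),
#   2. every listing page of that board, following its 'Next' link until none is found,
#   3. every topic on a listing page, again following 'Next' through the topic's own pages,
# saving each visited page with savePage() along the way.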
def crawlForum(driver): | |||
print("Crawling the Altenens forum") | |||
linksToCrawl = getInterestedLinks() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
topics = topicPages(html) | |||
for topic in topics: | |||
has_next_topic_page = True | |||
counter = 1 | |||
page = topic | |||
while has_next_topic_page: | |||
itemURL = urlparse.urljoin(baseURL, str(page)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
if isListingLink(driver.current_url): | |||
break | |||
savePage(driver, driver.page_source, topic + f"page{counter}") # very important | |||
# # comment out | |||
# if counter == 2: | |||
# break | |||
try: | |||
page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') | |||
if page == "": | |||
raise NoSuchElementException | |||
counter += 1 | |||
except NoSuchElementException: | |||
has_next_topic_page = False | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
# # comment out | |||
# break | |||
# | |||
# # comment out | |||
# if count == 1: | |||
# break | |||
try: | |||
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') | |||
if link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
print("Crawling the Altenens forum done.") | |||
# Returns 'True' if the link is Topic link, may need to change for every website | |||
def isDescriptionLink(url): | |||
if 'threads' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link, may need to change for every website | |||
def isListingLink(url): | |||
if '.is/forums' in url: | |||
return True | |||
return False | |||
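
# Illustrative examples (the thread URL is hypothetical):
#   isDescriptionLink('https://altenens.is/threads/some-topic.12345/')  -> True
#   isListingLink('https://altenens.is/forums/hacking.469162/')         -> True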
# calling the parser to define the links | |||
def topicPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
#print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) | |||
return altenens_links_parser(soup) | |||
def crawler(): | |||
startCrawling() | |||
# print("Crawling and Parsing BestCardingWorld .... DONE!") |
# ============================= Altenens parser =============================
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from Forums.Utilities.utilities import * | |||
from datetime import date
from datetime import datetime
from datetime import timedelta
import re | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) | |||
def altenens_description_parser(soup): | |||
topic = "-1" # 0 *topic name | |||
user = [] # 1 *all users of each post | |||
status = [] # 2 all user's authority in each post such as (adm, member, dangerous) | |||
reputation = [] # 3 all user's karma in each post (usually found as a number) | |||
interest = [] # 4 all user's interest in each post | |||
sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) | |||
post = [] # 6 all messages of each post | |||
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) | |||
addDate = [] # 8 all dates of each post | |||
image_user = [] # 9 all user avatars of each post | |||
image_post = [] # 10 all first images of each post | |||
etopic = soup.find("h1", {"class": "p-title-value"}) | |||
if etopic is not None: | |||
topic = etopic.text | |||
topic = cleanString(topic.strip()) | |||
body = soup.find('div', {"class": "block-container lbContainer"}) | |||
iposts = body.find_all('article', {"class": "message message--post js-post js-inlineModContainer"}) | |||
for ipost in iposts: | |||
author = ipost.find('h4', {"class": "message-name"}).text | |||
user.append(cleanString(author.strip())) | |||
stat = ipost.find('h5', {"class": "userTitle message-userTitle"}).text | |||
status.append(cleanString(stat.strip())) | |||
bar = ipost.find('div', {"class": "xtr-progress-bar"}) | |||
if bar is not None: | |||
rep = bar.find('p').get('data-value') | |||
else: | |||
rep = "-1" | |||
reputation.append(cleanString(rep)) | |||
interest.append("-1") | |||
signature = ipost.find('aside', {"class": "message-signature"}) | |||
if signature is not None: | |||
signature = signature.text.strip() | |||
else: | |||
signature = "-1" | |||
sign.append(cleanString(signature)) | |||
inner = ipost.find('div', {"class": "bbWrapper"}).find(text=True, recursive=False) | |||
if inner is not None: | |||
inner = inner.strip() | |||
else: | |||
inner = "" # cannot use -1 because the post is hidden unless you reply | |||
post.append(cleanString(inner)) | |||
feedback.append("-1") | |||
dt = ipost.find('time', {"class": "u-dt"}).get('datetime') | |||
date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z') | |||
addDate.append(date_time_obj) | |||
img = ipost.find('div', {"class": "message-avatar-wrapper"}).find('img') | |||
if img is not None: | |||
img = img.get('src').split('base64,')[-1] | |||
else: | |||
img = "-1" | |||
image_user.append(img) | |||
image_post.append("-1") | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) | |||
# Sending the results | |||
return row | |||
# This is the method to parse the Listing Pages (one page with many posts) | |||
def altenens_listing_parser(soup): | |||
nm = 0 # *this variable should receive the number of topics | |||
forum = "Altenens" # 0 *forum name | |||
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. | |||
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) | |||
author = [] # 2 *all authors of each topic | |||
topic = [] # 3 *all topics | |||
views = [] # 4 number of views of each topic | |||
posts = [] # 5 number of posts of each topic | |||
    href = []       # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages) | |||
addDate = [] # 7 when the topic was created (difficult to find) | |||
image_author = [] # 8 all author avatars used in each topic | |||
board = soup.find('h1', {"class": "p-title-value"}).text | |||
board = cleanString(board.strip()) | |||
regex = re.compile('structItem structItem--thread.*') | |||
itopics = soup.find_all('div', {"class": regex}) | |||
nm = len(itopics) | |||
for itopic in itopics: | |||
topics = itopic.find('div', {"class": "structItem-title"}).text | |||
topic.append(cleanString(topics.strip())) | |||
author_icon = itopic.find('a', {"class": "avatar avatar--s"}) | |||
if author_icon != None: | |||
author_icon = author_icon.find('img') | |||
author_icon = author_icon.get('src') | |||
author_icon = author_icon.split('base64,')[-1] | |||
else: | |||
author_icon = "-1" | |||
image_author.append(author_icon) | |||
link = itopic.find('div', {"class": "structItem-title"}).find('a').get('href') | |||
href.append(link) | |||
user = itopic.find('ul', {"class": "structItem-parts"}).find('a').text | |||
author.append(cleanString(user.strip())) | |||
dt = itopic.find('time', {"class": "u-dt"}).get('datetime') | |||
date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z') | |||
addDate.append(date_time_obj) | |||
        nposts = itopic.find('dl', {"class": "pairs pairs--justified"}).text
        nposts = nposts.replace('Replies', '').strip()
        # Convert shorthand counts such as "1.2K" to a plain number (e.g., 1200)
        if 'K' in nposts:
            nposts = str(int(float(nposts.replace('K', '')) * 1000))
        posts.append(cleanString(nposts))
        nviews = itopic.find('dl', {"class": "pairs pairs--justified structItem-minor"}).text
        nviews = nviews.replace('Views', '').strip()
        if 'K' in nviews:
            nviews = str(int(float(nviews.replace('K', '')) * 1000))
        views.append(cleanString(nviews))
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author) | |||
def altenens_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.find_all('div', {"class": "structItem-cell structItem-cell--main"}) | |||
for a in listing: | |||
link = a.find('a', {"class": ""}).get('href') | |||
href.append(link) | |||
return href |
# ================= Cardingleaks Forum Crawler (Selenium) =================
__author__ = 'DarkWeb' | |||
''' | |||
Cardingleaks Forum Crawler (Selenium) | |||
Crawler updated and fixed | |||
The site sometimes requires you to view a new post each day, so make sure
you log in before crawling.
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.common.by import By | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, re, time | |||
import subprocess | |||
from bs4 import BeautifulSoup | |||
from Forums.Initialization.prepare_parser import new_parse | |||
from Forums.Cardingleaks.parser import cardingleaks_links_parser | |||
from Forums.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'https://leaks.ws/' | |||
# Opens Tor Browser, crawls the website | |||
def startCrawling(): | |||
forumName = getForumName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
login(driver) | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
new_parse(forumName, baseURL, True) | |||
# Login using premade account credentials and do login captcha manually | |||
def login(driver): | |||
#click login button | |||
login_link = driver.find_element( | |||
by=By.XPATH, value='/html/body/div[2]/div[1]/nav/div/div[3]/div[1]/a[1]').\ | |||
get_attribute('href') | |||
driver.get(login_link)# open tab with url | |||
#entering username and password into input boxes | |||
usernameBox = driver.find_element(by=By.NAME, value='login') | |||
#Username here | |||
usernameBox.send_keys('somanyfrogs')#sends string to the username box | |||
passwordBox = driver.find_element(by=By.NAME, value='password') | |||
#Password here | |||
passwordBox.send_keys('therearewaytoomanyherehowwhy')# sends string to passwordBox | |||
login = driver.find_element(by=By.CLASS_NAME, value='block-container') | |||
login_link = login.find_element(by=By.TAG_NAME, value='button') | |||
login_link.click() | |||
# input('input') | |||
# wait for listing page show up (This Xpath may need to change based on different seed url) | |||
# wait for 50 sec until id = tab_content is found, then cont | |||
WebDriverWait(driver, 50).until(EC.visibility_of_element_located( | |||
(By.CLASS_NAME, 'p-body-pageContent'))) | |||
# Returns the name of the website | |||
def getForumName() -> str: | |||
name = 'Cardingleaks' | |||
return name | |||
# Return the link of the website | |||
def getFixedURL(): | |||
url = 'https://leaks.ws/' | |||
return url | |||
# Closes Tor Browser | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() #close tab | |||
time.sleep(3) | |||
return | |||
# Creates FireFox 'driver' and configure its 'Profile' | |||
# to use Tor proxy and socket | |||
def createFFDriver(): | |||
from Forums.Initialization.forums_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", True) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url) | |||
return driver | |||
except: | |||
driver.close() | |||
return 'down' | |||
# Saves the crawled html page | |||
def savePage(driver, page, url): | |||
cleanPage = cleanHTML(driver, page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
def getFullPathName(url): | |||
from Forums.Initialization.forums_mining import config, CURRENT_DATE | |||
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
return fullPath | |||
# Creates the file name from passed URL | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if name == '': | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
def getInterestedLinks(): | |||
links = [] | |||
# carding methods | |||
links.append('https://leaks.ws/forums/carding-methods.82/') | |||
# # carding schools | |||
# links.append('https://leaks.ws/forums/help-desk-carding-school.35/') | |||
# # carding discussion | |||
# links.append('https://leaks.ws/forums/carding-discussion-desk.58/') | |||
# # carding tutorials | |||
# links.append('https://leaks.ws/forums/carding-tutorials.13/') | |||
# # carding tools and software | |||
# links.append('https://leaks.ws/forums/carding-tools-softwares.10/') | |||
# # exploits and cracking tools | |||
# links.append('https://leaks.ws/forums/exploits-cracking-tools.22/') | |||
return links | |||
def crawlForum(driver): | |||
print("Crawling the Cardingleaks forum") | |||
linksToCrawl = getInterestedLinks() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
topics = topicPages(html) | |||
for topic in topics: | |||
has_next_topic_page = True | |||
counter = 1 | |||
page = topic | |||
while has_next_topic_page: | |||
itemURL = urlparse.urljoin(baseURL, str(page)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
if isListingLink(driver.current_url): | |||
break | |||
savePage(driver, driver.page_source, topic + f"page{counter}") # very important | |||
# # comment out | |||
# if counter == 2: | |||
# break | |||
try: | |||
page = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') | |||
if page == "": | |||
raise NoSuchElementException | |||
counter += 1 | |||
except NoSuchElementException: | |||
has_next_topic_page = False | |||
# making sure we go back to the listing page (browser back button simulation) | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
# # comment out | |||
# break | |||
# | |||
# # comment out | |||
# if count == 1: | |||
# break | |||
try: | |||
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') | |||
if link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
print("Crawling the Cardingleaks forum done.") | |||
# Returns 'True' if the link is Topic link, may need to change for every website | |||
def isDescriptionLink(url): | |||
if 'threads' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link, may need to change for every website | |||
def isListingLink(url): | |||
if '.ws/forums' in url: | |||
return True | |||
return False | |||
# calling the parser to define the links | |||
def topicPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
return cardingleaks_links_parser(soup) | |||
def crawler(): | |||
startCrawling() | |||
# print("Crawling and Parsing BestCardingWorld .... DONE!") |
# =========================== Cardingleaks parser ===========================
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from Forums.Utilities.utilities import * | |||
from datetime import date
from datetime import datetime
from datetime import timedelta
import re | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup, ResultSet, Tag | |||
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) | |||
def cardingleaks_description_parser(soup: Tag): | |||
# Fields to be parsed | |||
topic = "-1" # 0 *topic name | |||
user = [] # 1 *all users of each post | |||
status = [] # 2 all user's authority in each post such as (adm, member, dangerous) | |||
reputation = [] # 3 all user's karma in each post (usually found as a number) | |||
interest = [] # 4 all user's interest in each post | |||
sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) | |||
post = [] # 6 all messages of each post | |||
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) | |||
addDate = [] # 8 all dates of each post | |||
image_user = [] # 9 all user avatars of each post | |||
image_post = [] # 10 all first images of each post | |||
li = soup.find("h1", {"class": "p-title-value"}) | |||
topic = cleanString(li.text.strip()) | |||
post_list: ResultSet[Tag] = soup.find("div", {"class": "block-body js-replyNewMessageContainer"}).find_all("article", {"data-author": True}) | |||
for ipost in post_list: | |||
username = ipost.get('data-author') | |||
user.append(username) | |||
user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text | |||
status.append(cleanString(user_status.strip())) | |||
user_statistics: ResultSet[Tag] = ipost.find("div", {"class": "message-userExtras"}).find_all("dl", {"class": "pairs pairs--justified"}) | |||
user_reputation = "-1" | |||
for stat in user_statistics: | |||
data_type = stat.find("span").get("data-original-title") | |||
if data_type == "Points": | |||
user_reputation = stat.find("dd").text | |||
break | |||
reputation.append(cleanString(user_reputation.strip())) | |||
interest.append("-1") | |||
sign.append("-1") | |||
user_post = ipost.find("div", {"class": "message-content js-messageContent"}).text | |||
post.append(cleanString(user_post.strip())) | |||
feedback.append("-1") | |||
datetime_text = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime") | |||
datetime_obj = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S%z") | |||
addDate.append(datetime_obj) | |||
img = ipost.find('div', {"class": "message-content js-messageContent"}).find('img') | |||
if img is not None: | |||
img = img.get('src').split('base64,')[-1] | |||
else: | |||
img = "-1" | |||
image_post.append(img) | |||
img = ipost.find('div', {"class": "message-avatar"}).find('img') | |||
if img is not None: | |||
img = img.get('src').split('base64,')[-1] | |||
else: | |||
img = "-1" | |||
image_user.append(img) | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) | |||
# Sending the results | |||
return row | |||
# This is the method to parse the Listing Pages (one page with many posts) | |||
def cardingleaks_listing_parser(soup: Tag): | |||
nm = 0 # *this variable should receive the number of topics | |||
forum = "Cardingleaks" # 0 *forum name | |||
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. | |||
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) | |||
author = [] # 2 *all authors of each topic | |||
topic = [] # 3 *all topics | |||
views = [] # 4 number of views of each topic | |||
posts = [] # 5 number of posts of each topic | |||
    href = []      # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages) | |||
addDate = [] # 7 when the topic was created (difficult to find) | |||
image_user = [] # 8 all user avatars used in each topic | |||
# Finding the board (should be just one) | |||
li = soup.find("h1", {"class": "p-title-value"}) | |||
board = cleanString(li.text.strip()) | |||
thread_list = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True}) | |||
sticky = soup.find('div', {"class": "structItemContainer-group structItemContainer-group--sticky"}) | |||
if sticky is not None: | |||
thread_list = sticky.find_all("div", {"data-author": True}) + thread_list | |||
nm = len(thread_list) | |||
for thread in thread_list: | |||
thread_author = thread.get("data-author") | |||
author.append(thread_author) | |||
thread_topic = thread.find("div", {"class": "structItem-title"}).text | |||
topic.append(cleanString(thread_topic.strip())) | |||
author_icon = thread.find("a", {"class": "avatar avatar--s"}) | |||
if author_icon is not None: | |||
author_icon = author_icon.find('img') | |||
if author_icon is not None: | |||
author_icon = author_icon.get('src').split('base64,')[-1] | |||
image_user.append(author_icon) | |||
else: | |||
image_user.append('-1') | |||
else: | |||
image_user.append('-1') | |||
thread_view = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find("dd").text | |||
        # Convert the text view count (e.g., 8.8K) to a numerical value (e.g., 8800)
if thread_view.find("K") > 0: | |||
thread_view = str(int(float(thread_view.replace("K", "")) * 1000)) | |||
views.append(thread_view) | |||
thread_posts = thread.find("dl", {"class": "pairs pairs--justified"}).find("dd").text | |||
posts.append(cleanString(thread_posts.strip())) | |||
thread_href = thread.find("div", {"class": "structItem-title"}).find("a").get("href") | |||
href.append(thread_href) | |||
thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime") | |||
datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z") | |||
addDate.append(datetime_obj) | |||
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_user) | |||
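
# A small, self-contained restatement of the shorthand-count conversion used above (this helper is
# an illustration added here, not part of the original parser):
def _convert_shorthand_count_example(text: str) -> str:
    # e.g. "8.8K" -> "8800", "123" -> "123"
    if "K" in text:
        return str(int(float(text.replace("K", "")) * 1000))
    return text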
def cardingleaks_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.find_all('div', {"class": "structItem-title"}) | |||
for a in listing: | |||
link = a.find('a').get('href') | |||
href.append(link) | |||
    return href
# ======================== CryptBB Crawler (Mechanize) ========================
__author__ = '91Shadows' | |||
''' | |||
CryptBB Crawler (Mechanize) | |||
''' | |||
import codecs, os, re | |||
import socks, socket, time | |||
from datetime import date | |||
import urllib.parse as urlparse | |||
import http.client as httplib | |||
import mechanize | |||
import subprocess | |||
from bs4 import BeautifulSoup | |||
from Forums.Initialization.prepare_parser import new_parse | |||
from Forums.BestCardingWorld.parser import bestcardingworld_links_parser | |||
counter = 1 | |||
httplib.HTTPConnection._http_vsn = 10 | |||
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0' | |||
baseURL = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5' | |||
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150) | |||
# Opens Tor Browser, crawls the website | |||
def startCrawling(): | |||
opentor() | |||
getUrl() | |||
forumName = getForumName() | |||
br = getAccess() | |||
if br != 'down': | |||
crawlForum(br) | |||
new_parse(forumName, False) | |||
# new_parse(forumName, False) | |||
closetor() | |||
# Opens Tor Browser | |||
def opentor(): | |||
global pid | |||
print("Connecting Tor...") | |||
path = open('../../path.txt').readline() | |||
pro = subprocess.Popen(path) | |||
pid = pro.pid | |||
time.sleep(7.5) | |||
input("Tor Connected. Press ENTER to continue\n") | |||
return | |||
# Creates a connection through Tor Port | |||
def getUrl(timeout=None): | |||
socket.socket = socks.socksocket | |||
socket.create_connection = create_connection | |||
return | |||
# Makes the onion address request | |||
def create_connection(address, timeout=None, source_address=None): | |||
sock = socks.socksocket() | |||
sock.connect(address) | |||
return sock | |||
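
# Note: getUrl() above replaces socket.socket with socks.socksocket (configured at the top of this
# file to use the Tor SOCKS5 proxy on 127.0.0.1:9150), so every connection mechanize opens later in
# this file -- including requests to the .onion address -- is routed through Tor.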
# Returns the name of website | |||
def getForumName(): | |||
name = 'CryptBB' | |||
return name | |||
# Return the link of website | |||
def getFixedURL(): | |||
url = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5' | |||
return url | |||
# Closes Tor Browser | |||
def closetor(): | |||
global pid | |||
os.system("taskkill /pid " + str(pid)) | |||
print('Closing Tor...') | |||
time.sleep(3) | |||
return | |||
# Creates a Mechanize browser and initializes its options | |||
def createBrowser(): | |||
br = mechanize.Browser() | |||
cj = mechanize.CookieJar() | |||
br.set_cookiejar(cj) | |||
# Browser options | |||
br.set_handle_equiv(True) | |||
br.set_handle_redirect(True) | |||
br.set_handle_referer(True) | |||
br.set_handle_robots(False) | |||
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1) | |||
br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'), | |||
('Accept', '*/*')] | |||
return br | |||
def getAccess(): | |||
url = getFixedURL() | |||
br = createBrowser() | |||
try: | |||
br.open(url) | |||
return br | |||
except: | |||
return 'down' | |||
# Saves the crawled html page | |||
def savePage(page, url): | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
a = page.read() | |||
open(filePath, "wb").write(a) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
def getFullPathName(url): | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/ThiefWorld/HTML_Pages/' + str( | |||
"%02d" % date.today().month) + str("%02d" % date.today().day) + str( | |||
"%04d" % date.today().year) + '/' + 'Description/' + fileName + '.html' | |||
else: | |||
fullPath = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/ThiefWorld/HTML_Pages/' + str( | |||
"%02d" % date.today().month) + str("%02d" % date.today().day) + str( | |||
"%04d" % date.today().year) + '/' + 'Listing/' + fileName + '.html' | |||
return fullPath | |||
# Creates the name of the file based on URL | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if (name == ''): | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
# Hacking and Markets related topics | |||
def getInterestedLinks(): | |||
links = [] | |||
links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=43&sid=e12864ffccc5df877b03b573534955be') | |||
return links | |||
# Start crawling Forum pages | |||
def crawlForum(br): | |||
print("Crawling CryptBB forum") | |||
linksToCrawl = getInterestedLinks() | |||
visited = set(linksToCrawl) | |||
initialTime = time.time() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
page = br.open(link) | |||
savePage(page, link) | |||
res = br.response().read() | |||
soup = BeautifulSoup(res, 'html.parser') | |||
next_link = soup.find("a", {"rel": "next"}) | |||
if next_link != None: | |||
full_url = urlparse.urljoin(linksToCrawl[i], next_link['href']) | |||
linksToCrawl.insert(i + 1, full_url) | |||
listOfTopics = findDescriptionPages(link) | |||
for topic in listOfTopics: | |||
itemPage = br.open(str(topic)) | |||
savePage(itemPage, topic) | |||
except Exception as e: | |||
print('Error getting link: ', link, e) | |||
i += 1 | |||
# finalTime = time.time() | |||
# print finalTime - initialTime | |||
input("CryptBB forum done sucessfully. Press ENTER to continue\n") | |||
return | |||
# Returns True if the link is 'Topic' Links, may need to change for diff websites | |||
def isDescriptionLink(url): | |||
if 'topic' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link, may need to change for diff websites | |||
def isListingLink(url): | |||
''' | |||
reg = 'board=[0-9]+.[0-9]+\Z' | |||
if len(re.findall(reg, url)) == 0: | |||
return False | |||
return True | |||
''' | |||
if 'forum' in url: | |||
return True | |||
return False | |||
# calling the parser to define the links | |||
def findDescriptionPages(url): | |||
soup = "" | |||
error = False | |||
try: | |||
html = codecs.open( | |||
"C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\BestCardingWorld\\HTML_Pages\\" + str( | |||
"%02d" % date.today().month) + str("%02d" % date.today().day) + str( | |||
"%04d" % date.today().year) + "\\Listing\\" + getNameFromURL(url) + ".html", encoding='utf8') | |||
soup = BeautifulSoup(html, "html.parser") | |||
except: | |||
try: | |||
html = open( | |||
"C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\BestCardingWorld\\HTML_Pages\\" + str( | |||
"%02d" % date.today().month) + str("%02d" % date.today().day) + str( | |||
"%04d" % date.today().year) + "\\Listing\\" + getNameFromURL(url) + ".html") | |||
soup = BeautifulSoup(html, "html.parser") | |||
except: | |||
error = True | |||
print("There was a problem to read the file " + getNameFromURL(url) + " in the listing section.") | |||
if not error: | |||
return bestcardingworld_links_parser(soup) | |||
else: | |||
return [] | |||
def crawler(): | |||
startCrawling() | |||
print("Crawling and Parsing CryptBB .... DONE!") |
# ================= CryptBB Forum Crawler (Selenium) =================
__author__ = 'DarkWeb' | |||
''' | |||
CryptBB Forum Crawler (Selenium) | |||
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.common.by import By | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, re, time | |||
import subprocess | |||
from bs4 import BeautifulSoup | |||
from Forums.Initialization.prepare_parser import new_parse | |||
from Forums.CryptBB.parser import cryptBB_links_parser | |||
from Forums.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/' | |||
# Opens Tor Browser, crawls the website | |||
def startCrawling(): | |||
forumName = getForumName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
login(driver) | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
new_parse(forumName, baseURL, True) | |||
# Login using premade account credentials and do login captcha manually | |||
def login(driver): | |||
#click login button | |||
login_link = driver.find_element( | |||
by=By.XPATH, value='/html/body/div/div[2]/div/table/tbody/tr[2]/td/center/pre/strong/a[1]').\ | |||
get_attribute('href') | |||
driver.get(login_link)# open tab with url | |||
#entering username and password into input boxes | |||
usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input') | |||
#Username here | |||
usernameBox.send_keys('holyre')#sends string to the username box | |||
passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input') | |||
#Password here | |||
passwordBox.send_keys('PlatinumBorn2')# sends string to passwordBox | |||
''' | |||
# wait for captcha page show up | |||
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( | |||
(By.XPATH, "/html/body/div/div[2]/div/form/div/input"))) | |||
# save captcha to local | |||
driver.find_element(by=By.XPATH, value='//*[@id="captcha_img"]').screenshot(r'..\CryptBB\captcha.png') | |||
# This method will show image in any image viewer | |||
im = Image.open(r'..\CryptBB\captcha.png') | |||
im.show() | |||
# wait until input space show up | |||
inputBox = driver.find_element(by=By.XPATH, value='//*[@id="imagestring"]') | |||
# ask user input captcha solution in terminal | |||
userIn = input("Enter solution: ") | |||
# send user solution into the input space | |||
inputBox.send_keys(userIn) | |||
# click the verify(submit) button | |||
driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div/form/div/input").click() | |||
''' | |||
input("Press ENTER when CAPTCHA is completed\n") | |||
# wait for listing page show up (This Xpath may need to change based on different seed url) | |||
# wait for 50 sec until id = tab_content is found, then cont | |||
WebDriverWait(driver, 50).until(EC.visibility_of_element_located( | |||
(By.XPATH, '//*[@id="tab_content"]'))) | |||
# Returns the name of the website | |||
def getForumName() -> str: | |||
name = 'CryptBB' | |||
return name | |||
# Return the link of the website | |||
def getFixedURL(): | |||
url = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/' | |||
return url | |||
# Closes Tor Browser | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() #close tab | |||
time.sleep(3) | |||
return | |||
# Creates FireFox 'driver' and configure its 'Profile' | |||
# to use Tor proxy and socket | |||
def createFFDriver(): | |||
from Forums.Initialization.forums_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", True) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url) | |||
return driver | |||
except: | |||
driver.close() | |||
return 'down' | |||
# Saves the crawled html page | |||
def savePage(driver, page, url): | |||
cleanPage = cleanHTML(driver, page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
def getFullPathName(url): | |||
from Forums.Initialization.forums_mining import config, CURRENT_DATE | |||
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
return fullPath | |||
# Creates the file name from passed URL | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if name == '': | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
def getInterestedLinks(): | |||
links = [] | |||
# Beginner Programming | |||
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86') | |||
# Beginner Carding and Fraud | |||
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=91') | |||
# Beginner Hacking | |||
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=87') | |||
# Newbie | |||
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=84') | |||
# Beginner Hardware | |||
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=89') | |||
# Training Challenges | |||
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=96') | |||
# Darknet Discussions | |||
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=88') | |||
# Public Leaks and Warez | |||
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=97') | |||
# Sell | |||
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=44') | |||
return links | |||
def crawlForum(driver): | |||
print("Crawling the CryptBB forum") | |||
linksToCrawl = getInterestedLinks() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
topics = topicPages(html) | |||
for topic in topics: | |||
has_next_topic_page = True | |||
counter = 1 | |||
page = topic | |||
while has_next_topic_page: | |||
itemURL = urlparse.urljoin(baseURL, str(page)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
if isListingLink(driver.current_url): | |||
break | |||
savePage(driver, driver.page_source, topic + f"page{counter}") # very important | |||
# # comment out | |||
# if counter == 2: | |||
# break | |||
try: | |||
temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') | |||
page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') | |||
if page == "": | |||
raise NoSuchElementException | |||
counter += 1 | |||
except NoSuchElementException: | |||
has_next_topic_page = False | |||
# making sure we go back to the listing page (browser back button simulation) | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
# # comment out | |||
# break | |||
# | |||
# # comment out | |||
# if count == 1: | |||
# break | |||
try: | |||
temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div') | |||
link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') | |||
if link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
print("Crawling the CrypttBB forum done.") | |||
# Returns 'True' if the link is Topic link, may need to change for every website | |||
def isDescriptionLink(url): | |||
if 'thread' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link, may need to change for every website | |||
def isListingLink(url): | |||
if '.onion/forumdisplay' in url: | |||
return True | |||
return False | |||
# calling the parser to define the links | |||
def topicPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
return cryptBB_links_parser(soup) | |||
def crawler(): | |||
startCrawling() | |||
# print("Crawling and Parsing BestCardingWorld .... DONE!") |
# ============================ CryptBB parser ============================
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from Forums.Utilities.utilities import * | |||
from datetime import date | |||
from datetime import timedelta | |||
import re | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) | |||
def cryptBB_description_parser(soup): | |||
# Fields to be parsed | |||
topic = "-1" # 0 *topic name | |||
user = [] # 1 *all users of each post | |||
status = [] # 2 all user's authority in each post such as (adm, member, dangerous) | |||
reputation = [] # 3 all user's karma in each post (usually found as a number) | |||
interest = [] # 4 all user's interest in each post | |||
sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) | |||
post = [] # 6 all messages of each post | |||
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) | |||
addDate = [] # 8 all dates of each post | |||
image_user = [] # 9 all user avatars of each post | |||
image_post = [] # 10 all first images of each post | |||
# Finding the topic (should be just one coming from the Listing Page) | |||
li = soup.find("td", {"class": "thead"}).find('strong') | |||
topic = li.text | |||
topic = re.sub("\[\w*\]", '', topic) | |||
topic = topic.replace(",","") | |||
topic = topic.replace("\n","") | |||
topic = cleanString(topic.strip()) | |||
# Finding the repeated tag that corresponds to the listing of posts | |||
posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all( | |||
'div', {"class": "post"}) | |||
# For each message (post), get all the fields we are interested to: | |||
for ipost in posts: | |||
if ipost.find('div', {"class": "deleted_post_author"}): | |||
continue | |||
# Finding a first level of the HTML page | |||
post_wrapper = ipost.find('span', {"class": "largetext"}) | |||
# Finding the author (user) of the post | |||
author = post_wrapper.text.strip() | |||
user.append(cleanString(author)) # Remember to clean the problematic characters | |||
# Finding the status of the author | |||
smalltext = ipost.find('div', {"class": "post_author"}) | |||
if smalltext is not None: | |||
# CryptBB does have membergroup and postgroup | |||
membergroup = smalltext.find('div', {"class": "profile-rank"}) | |||
postgroup = smalltext.find('div', {"class": "postgroup"}) | |||
if membergroup != None: | |||
membergroup = membergroup.text.strip() | |||
if postgroup != None: | |||
postgroup = postgroup.text.strip() | |||
membergroup = membergroup + " - " + postgroup | |||
else: | |||
if postgroup != None: | |||
membergroup = postgroup.text.strip() | |||
else: | |||
membergroup = "-1" | |||
status.append(cleanString(membergroup)) | |||
# Finding the interest of the author | |||
# CryptBB does not have blurb | |||
blurb = smalltext.find('li', {"class": "blurb"}) | |||
if blurb != None: | |||
blurb = blurb.text.strip() | |||
else: | |||
blurb = "-1" | |||
interest.append(cleanString(blurb)) | |||
# Finding the reputation of the user | |||
# CryptBB does have reputation | |||
author_stats = smalltext.find('div', {"class": "author_statistics"}) | |||
karma = author_stats.find('strong') | |||
if karma != None: | |||
karma = karma.text | |||
karma = karma.replace("Community Rating: ", "") | |||
karma = karma.replace("Karma: ", "") | |||
karma = karma.strip() | |||
else: | |||
karma = "-1" | |||
reputation.append(cleanString(karma)) | |||
else: | |||
status.append('-1') | |||
interest.append('-1') | |||
reputation.append('-1') | |||
# Getting here another good tag to find the post date, post content and users' signature | |||
postarea = ipost.find('div', {"class": "post_content"}) | |||
dt = postarea.find('span', {"class": "post_date"}).text | |||
# dt = dt.strip().split() | |||
dt = dt.strip() | |||
day=date.today() | |||
if "Today" in dt: | |||
today = day.strftime('%m-%d-%Y') | |||
stime = dt.replace('Today,','').strip() | |||
date_time_obj = today + ', '+stime | |||
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p') | |||
elif "Yesterday" in dt: | |||
yesterday = day - timedelta(days=1) | |||
yesterday = yesterday.strftime('%m-%d-%Y') | |||
stime = dt.replace('Yesterday,','').strip() | |||
date_time_obj = yesterday + ', '+stime | |||
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p') | |||
elif "ago" in dt: | |||
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title'] | |||
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p') | |||
else: | |||
date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p') | |||
addDate.append(date_time_obj) | |||
# Finding the post | |||
inner = postarea.find('div', {"class": "post_body scaleimages"}) | |||
quote = inner.find('blockquote') | |||
if quote is not None: | |||
quote.decompose() | |||
inner = inner.text.strip() | |||
post.append(cleanString(inner)) | |||
# Finding the user's signature | |||
# signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"}) | |||
signature = ipost.find('div', {"class": "signature scaleimages"}) | |||
if signature != None: | |||
signature = signature.text.strip() | |||
# print(signature) | |||
else: | |||
signature = "-1" | |||
sign.append(cleanString(signature)) | |||
# As no information about user's feedback was found, just assign "-1" to the variable | |||
feedback.append("-1") | |||
img = ipost.find('div', {"class": "post_body scaleimages"}).find('img') | |||
if img is not None: | |||
img = img.get('src').split('base64,')[-1] | |||
else: | |||
img = "-1" | |||
image_post.append(img) | |||
avatar = ipost.find('div', {"class": "author_avatar"}) | |||
if avatar is not None: | |||
img = avatar.find('img') | |||
if img is not None: | |||
img = img.get('src').split('base64,')[-1] | |||
else: | |||
img = "-1" | |||
else: | |||
img = "-1" | |||
image_user.append(img) | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) | |||
# Sending the results | |||
return row | |||
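# Minimal sketch (illustration only) of the relative-date normalization performed above, factored
# into a standalone helper. It assumes MyBB-style timestamps such as "Today, 03:15 PM",
# "Yesterday, 03:15 PM" or "06-21-2023, 03:15 PM"; the "ago" case needs the tooltip value read
# above and is omitted here. datetime is assumed to be available exactly as in the parser above.
def _example_normalize_post_date(dt):
    day = date.today()
    if "Today" in dt:
        stamp = day.strftime('%m-%d-%Y') + ', ' + dt.replace('Today,', '').strip()
    elif "Yesterday" in dt:
        stamp = (day - timedelta(days=1)).strftime('%m-%d-%Y') + ', ' + dt.replace('Yesterday,', '').strip()
    else:
        stamp = dt
    return datetime.strptime(stamp, '%m-%d-%Y, %I:%M %p')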
# This is the method to parse the Listing Pages (one page with many posts) | |||
def cryptBB_listing_parser(soup): | |||
nm = 0 # *this variable should receive the number of topics | |||
forum = "CryptBB" # 0 *forum name | |||
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. | |||
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) | |||
author = [] # 2 *all authors of each topic | |||
topic = [] # 3 *all topics | |||
views = [] # 4 number of views of each topic | |||
posts = [] # 5 number of posts of each topic | |||
    href = []  # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages) | |||
addDate = [] # 7 when the topic was created (difficult to find) | |||
image_author = [] # 8 all author avatars used in each topic | |||
# Finding the board (should be just one) | |||
board = soup.find('span', {"class": "active"}).text | |||
board = cleanString(board.strip()) | |||
# Finding the repeated tag that corresponds to the listing of topics | |||
itopics = soup.find_all('tr',{"class": "inline_row"}) | |||
# Counting how many topics | |||
nm = len(itopics) | |||
for itopic in itopics: | |||
        # For each topic found, the structure holding the rest of the information can be of two types.
        # We test all of them so that we do not miss any topic
# Adding the topic to the topic list | |||
try: | |||
topics = itopic.find('span', {"class": "subject_old"}).find('a').text | |||
except: | |||
topics = itopic.find('span', {"class": "subject_new"}).find('a').text | |||
topics = re.sub("\[\w*\]", '', topics) | |||
topic.append(cleanString(topics)) | |||
image_author.append(-1) | |||
# Adding the url to the list of urls | |||
try: | |||
link = itopic.find('span', {"class": "subject_old"}).find('a').get('href') | |||
except: | |||
link = itopic.find('span',{"class": "subject_new"}).find('a').get('href') | |||
href.append(link) | |||
# Finding the author of the topic | |||
ps = itopic.find('div', {"class":"author smalltext"}).text | |||
user = ps.strip() | |||
author.append(cleanString(user)) | |||
# Finding the number of replies | |||
columns = itopic.findChildren('td',recursive=False) | |||
replies = columns[3].text | |||
if replies == '-': | |||
posts.append('-1') | |||
else: | |||
posts.append(cleanString(replies)) | |||
# Finding the number of Views | |||
tview = columns[4].text | |||
if tview == '-': | |||
views.append('-1') | |||
else: | |||
views.append(cleanString(tview)) | |||
# If no information about when the topic was added, just assign "-1" to the variable | |||
addDate.append("-1") | |||
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author) | |||
def cryptBB_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.find('table', {"class": "tborder clear"}).find('tbody').find_all('tr', {"class": "inline_row"}) | |||
for a in listing: | |||
try: | |||
link = a.find('span', {"class": "subject_old"}).find('a').get('href') | |||
except: | |||
link = a.find('span', {"class": "subject_new"}).find('a').get('href') | |||
href.append(link) | |||
return href |
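# Sketch (illustrative values only) of the tuple layout returned by cryptBB_description_parser:
# a single topic string followed by per-post parallel lists, in the field order documented at
# the top of that function; missing information is encoded as "-1".
def _example_description_row():
    topic = "Example topic"
    user, status, reputation = ["poster1"], ["Member"], ["10"]
    interest, sign, post = ["-1"], ["-1"], ["Example post body"]
    feedback, addDate = ["-1"], ["-1"]
    image_user, image_post = ["-1"], ["-1"]
    return (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)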
@ -0,0 +1,312 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from Forums.Utilities.utilities import * | |||
from datetime import date | |||
from datetime import timedelta | |||
import re | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) | |||
def dwForums_description_parser(soup): | |||
# Fields to be parsed | |||
topic = "-1" # 0 *topic name | |||
user = [] # 1 *all users of each post | |||
status = [] # 2 all user's authority in each post such as (adm, member, dangerous) | |||
reputation = [] # 3 all user's karma in each post (usually found as a number) | |||
interest = [] # 4 all user's interest in each post | |||
sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) | |||
post = [] # 6 all messages of each post | |||
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) | |||
addDate = [] # 8 all dates of each post | |||
# Finding the topic (should be just one coming from the Listing Page) | |||
li = soup.find("h1", {"class": "p-title-value"}) | |||
topic = li.text | |||
topic = topic.replace(u'\xa0', ' ') | |||
topic = topic.replace(",","") | |||
topic = topic.replace("\n","") | |||
topic = cleanString(topic.strip()) | |||
# print(topic) | |||
# Finding the repeated tag that corresponds to the listing of posts | |||
# posts = soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg"}) + \ | |||
# soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg2"}) | |||
posts = soup.find('div', {"class": "js-replyNewMessageContainer"}).find_all( | |||
'article', {"class": "js-post"}, recursive=False) | |||
# print(len(posts)) | |||
# For each message (post), get all the fields we are interested to: | |||
for ipost in posts: | |||
# Finding a first level of the HTML page | |||
# post_wrapper = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "poster"}) | |||
post_wrapper = ipost.find('h4', {"class": "message-name"}) | |||
# Finding the author (user) of the post | |||
# author = post_wrapper.find('h4') | |||
author = post_wrapper.text.strip() | |||
# print("author " + author) | |||
user.append(cleanString(author)) # Remember to clean the problematic characters | |||
# Finding the status of the author | |||
# Testing here two possibilities to find this status and combine them | |||
# if ipost.find('h5', {"class": "deleted_post_author"}): | |||
# status.append(-1) | |||
# interest.append(-1) | |||
# reputation.append(-1) | |||
# addDate.append(-1) | |||
# post.append("THIS POST HAS BEEN REMOVED!") | |||
# sign.append(-1) | |||
# feedback.append(-1) | |||
# continue | |||
        # Unlike CryptBB, DWForums only exposes a member title (userTitle); there is no postgroup
        membergroup = ipost.find('h5', {"class": "userTitle"})
postgroup = None | |||
if membergroup != None: | |||
membergroup = membergroup.text.strip() | |||
if postgroup != None: | |||
postgroup = postgroup.text.strip() | |||
membergroup = membergroup + " - " + postgroup | |||
else: | |||
if postgroup != None: | |||
membergroup = postgroup.text.strip() | |||
else: | |||
membergroup = "-1" | |||
status.append(cleanString(membergroup)) | |||
# print("status " + cleanString(membergroup)) | |||
# Finding the interest of the author | |||
# DWForums does not have blurb | |||
blurb = ipost.find('li', {"class": "blurb"}) | |||
if blurb != None: | |||
blurb = blurb.text.strip() | |||
else: | |||
blurb = "-1" | |||
interest.append(cleanString(blurb)) | |||
# Finding the reputation of the user | |||
        # DWForums shows a reaction score in the message-userExtras block
author_stats = ipost.find('div', {"class": "message-userExtras"}) | |||
if author_stats != None: | |||
karma = author_stats.find_all('dl', {"class": "pairs"})[2] | |||
else: | |||
karma = None | |||
if karma != None: | |||
karma = karma.text | |||
karma = karma.replace("Reaction score","") | |||
karma = karma.replace(":", "") | |||
karma = karma.strip() | |||
else: | |||
karma = "-1" | |||
reputation.append(cleanString(karma)) | |||
# print("karma " + cleanString(karma)) | |||
# Getting here another good tag to find the post date, post content and users' signature | |||
postarea = ipost.find('div', {"class": "message-attribution-main"}) | |||
dt = postarea.find('time', {"class": "u-dt"})['datetime'] | |||
# dt = dt.strip().split() | |||
dt = dt.strip()[:16] | |||
dt = dt.replace("T",", ") | |||
day=date.today() | |||
if "Yesterday" in dt: | |||
yesterday = day - timedelta(days=1) | |||
yesterday = yesterday.strftime('%m-%d-%Y') | |||
stime = dt.replace('Yesterday,','').strip() | |||
date_time_obj = yesterday+ ', '+stime | |||
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %H:%M') | |||
elif "hours ago" in dt: | |||
day = day.strftime('%m-%d-%Y') | |||
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title'] | |||
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %H:%M') | |||
else: | |||
date_time_obj = datetime.strptime(dt, '%Y-%m-%d, %H:%M') | |||
stime = date_time_obj.strftime('%b %d, %Y') | |||
sdate = date_time_obj.strftime('%I:%M %p') | |||
addDate.append(date_time_obj) | |||
# print("date " + str(date_time_obj)) | |||
# Finding the date of the post | |||
# date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p') | |||
# smalltext = postarea.find('div', {"class": "flow_hidden"}).find('div', {"class": "keyinfo"})\ | |||
# .find('div', {"class": "smalltext"}) | |||
# sdatetime = smalltext.text | |||
# sdatetime = sdatetime.replace(u"\xab","") # Removing unnecessary characters | |||
# sdatetime = sdatetime.replace(u"\xbb","") # Removing unnecessary characters | |||
# sdatetime = sdatetime.split("on: ") # Removing unnecessary characters | |||
# sdatetime = sdatetime[1].strip() | |||
# stime = sdatetime[:-12:-1] # Finding the time of the post | |||
# stime = stime[::-1] | |||
# sdate = sdatetime.replace(stime,"") # Finding the date of the post | |||
# sdate = sdate.replace(",","") | |||
# sdate = sdate.strip() | |||
        # Convert the date of the post, which can be given as "12 February 2016", "today", or "yesterday".
        # We need a date format here as "mm/dd/yyyy"
# addDate.append(convertDate(sdate,"english", crawlerDate) + " " + stime) | |||
# Finding the post | |||
inner = ipost.find('article', {"class": "message-body"}) | |||
inner = inner.text.strip() | |||
# print(inner) | |||
post.append(cleanString(inner)) | |||
        # Finding the user's signature
# signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"}) | |||
signature = ipost.find('aside', {"class": "message-signature"}) | |||
if signature != None: | |||
signature = signature.text.strip() | |||
# print(signature) | |||
else: | |||
signature = "-1" | |||
sign.append(cleanString(signature)) | |||
        # As no information about the user's feedback was found, just assign "-1" to the variable
feedback.append("-1") | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) | |||
# Sending the results | |||
return row | |||
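# Sketch of the ISO-8601 handling above (illustration only). XenForo's <time> element carries a
# machine-readable datetime attribute, assumed to look like "2023-06-21T14:05:00-0400"; keeping
# the first 16 characters and swapping the "T" yields a string strptime can read.
def _example_parse_xenforo_time(raw="2023-06-21T14:05:00-0400"):
    dt = raw.strip()[:16].replace("T", ", ")
    return datetime.strptime(dt, '%Y-%m-%d, %H:%M')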
# This is the method to parse the Listing Pages (one page with many posts) | |||
def dwForums_listing_parser(soup): | |||
nm = 0 # *this variable should receive the number of topics | |||
forum = "DWForums" # 0 *forum name | |||
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. | |||
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) | |||
author = [] # 2 *all authors of each topic | |||
topic = [] # 3 *all topics | |||
views = [] # 4 number of views of each topic | |||
posts = [] # 5 number of posts of each topic | |||
    href = []  # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages) | |||
addDate = [] # 7 when the topic was created (difficult to find) | |||
# Finding the board (should be just one) | |||
board = soup.find('h1', {"class": "p-title-value"}).text | |||
board = cleanString(board.strip()) | |||
# Finding the repeated tag that corresponds to the listing of topics | |||
regex = re.compile('.*structItem--thread.*') | |||
itopics = soup.find_all("div", {"class": regex}) | |||
for itopic in itopics: | |||
        # For each topic found, the structure holding the rest of the information can be of two types.
        # We test all of them so that we do not miss any topic
# tds = itopic.findAll('td', {"class": "subject stickybg2"}) | |||
# | |||
# if len(tds) > 0: | |||
# tag.append("strong") | |||
# tag.append("subject stickybg2") | |||
# tag.append("stats stickybg") | |||
# else: | |||
# tds = itopic.findAll('td', {"class": "subject windowbg2"}) | |||
# if len(tds) > 0: | |||
# tag.append("span") | |||
# tag.append("subject windowbg2") | |||
# tag.append("stats windowbg") | |||
# Adding the topic to the topic list | |||
topics = itopic.find("div", {"class": "structItem-title"}).text | |||
topics = topics.replace(",", "") | |||
topics = topics.replace("\n", "") | |||
topic.append(cleanString(topics.strip())) | |||
# Counting how many topics we have found so far | |||
nm = len(topic) | |||
# Adding the url to the list of urls | |||
link = itopic.select_one('a[href^="/threads/"]') | |||
link = link['href'] | |||
link = cleanLink(link) | |||
href.append(link) | |||
# Finding the author of the topic | |||
minor = itopic.find('div', {"class": "structItem-minor"}) | |||
ps = minor.find('li').text | |||
user = ps.strip() | |||
author.append(cleanString(user)) | |||
# Finding the number of replies | |||
meta = itopic.find("div", {"class": "structItem-cell--meta"}) | |||
meta = meta.find_all("dl") | |||
post = meta[0].find("dd").text | |||
post = post.replace("K", "000") | |||
posts.append(cleanString(post)) | |||
# Finding the number of Views | |||
tview = meta[1].find("dd").text | |||
tview = tview.replace("K", "000") | |||
views.append(cleanString(tview)) | |||
# If no information about when the topic was added, just assign "-1" to the variable | |||
minor = itopic.find("div", {"class": "structItem-minor"}) | |||
dt = minor.find('time')['datetime'] | |||
dt = dt.strip()[:16] | |||
dt = dt.replace("T", ", ") | |||
day = date.today() | |||
if "Yesterday" in dt: | |||
yesterday = day - timedelta(days=1) | |||
yesterday = yesterday.strftime('%m-%d-%Y') | |||
stime = dt.replace('Yesterday,', '').strip() | |||
date_time_obj = yesterday + ', ' + stime | |||
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %H:%M') | |||
else: | |||
date_time_obj = datetime.strptime(dt, '%Y-%m-%d, %H:%M') | |||
stime = date_time_obj.strftime('%b %d, %Y') | |||
sdate = date_time_obj.strftime('%I:%M %p') | |||
addDate.append(date_time_obj) | |||
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate) | |||
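# Sketch of the abbreviated-count expansion used for the reply and view counts above
# (illustration only): XenForo shows large counts such as "1.2K", which are expanded into a
# plain number string before being handed to cleanString.
def _example_expand_count(raw="1.2K"):
    raw = raw.strip()
    if "K" in raw:
        raw = str(round(float(raw.replace("K", "")) * 1000))
    return raw  # "1200"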
def dwForums_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
#print(soup.find('table', {"class": "tborder clear"}).find( | |||
# 'tbody').find_all('tr', {"class": "inline_row"})) | |||
regex = re.compile('.*structItem--thread.*') | |||
listing = soup.find_all("div", {"class": regex}) | |||
for a in listing: | |||
link = a.select_one('a[href^="/threads/"]') | |||
link = link['href'] | |||
href.append(link) | |||
return href |
@ -0,0 +1,334 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
import datetime | |||
from Forums.Utilities.utilities import * | |||
from datetime import date | |||
from datetime import timedelta | |||
import re | |||
import traceback | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) | |||
def dread_description_parser(soup): | |||
# Fields to be parsed | |||
topic = "-1" # 0 *topic name | |||
user = [] # 1 *all users of each post | |||
status = [] # 2 all user's authority in each post such as (adm, member, dangerous) | |||
reputation = [] # 3 all user's karma in each post (usually found as a number) | |||
interest = [] # 4 all user's interest in each post | |||
sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) | |||
post = [] # 6 all messages of each post | |||
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) | |||
addDate = [] # 8 all dates of each post | |||
# Finding the topic (should be just one coming from the Listing Page) | |||
container = soup.find('div', {"class": "content"}) | |||
li = container.find("a", {"class": "title"}) | |||
if li == None: | |||
return None | |||
topic = li.text | |||
topic = topic.replace(u'\xa0', ' ') | |||
topic = topic.replace(",","") | |||
topic = topic.replace("\n","") | |||
topic = cleanString(topic.strip()) | |||
# print(topic) | |||
# Finding the repeated tag that corresponds to the listing of posts | |||
# posts = soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg"}) + \ | |||
# soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg2"}) | |||
# putting the initial post data since it is separated from comments | |||
# author name | |||
init_post = container.find('div', {"class": "item"}) | |||
author = init_post.find('div', {"class": "author"}).select_one('a[href^="/u/"]').text | |||
flair = init_post.find('div', {"class": "author"}).find("span", {"class": "flair"}) | |||
try: | |||
flair = flair.text.strip() | |||
author = author.replace(flair, '') | |||
except: | |||
pass | |||
author = author.strip() | |||
user.append(cleanString(author)) | |||
# status | |||
flair = init_post.find("span", {"class": "flair"}) | |||
if flair != None: | |||
flair = flair.text.strip() | |||
else: | |||
flair = "-1" | |||
status.append(cleanString(flair)) | |||
# no blurb | |||
interest.append(-1) | |||
# points for post | |||
karma = init_post.find("div", {"class": "voteCount"}) | |||
if karma != None: | |||
karma = karma.text | |||
karma = karma.replace("points", "") | |||
karma = karma.replace(":", "") | |||
karma = karma.strip() | |||
else: | |||
karma = "-1" | |||
reputation.append(cleanString(karma)) | |||
# date | |||
spans = init_post.find('div', {"class": "author"}).find('span', recursive=False) | |||
dt = spans['title'] | |||
month = find_month(dt) | |||
split_text = dt.split() | |||
day = int(re.search(r'\d+', split_text[0]).group()) | |||
year = int(split_text[2]) | |||
hm = re.findall(r'\d+', split_text[-1]) | |||
hm[0] = int(hm[0]) | |||
hm[1] = int(hm[1]) | |||
date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1]) | |||
addDate.append(date_time_obj) | |||
# content | |||
inner = init_post.find("div", {"class": "postContent"}) | |||
inner = inner.text.strip() | |||
post.append(cleanString(inner)) | |||
# no signature | |||
sign.append(-1) | |||
# no feedback | |||
feedback.append(-1) | |||
comments = soup.find('div', {"class": "postComments"}) | |||
if comments == None: | |||
        row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
return row | |||
else: | |||
comments = soup.find('div', {"class": "postComments"}).find_all('div', "comment") | |||
# print(len(posts)) | |||
# For each message (post), get all the fields we are interested to: | |||
for ipost in comments: | |||
# Finding a first level of the HTML page | |||
# post_wrapper = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "poster"}) | |||
cc = ipost.find('div', {"class": "commentContent"}) | |||
post_wrapper = cc.find('a', {"class": "username"}).text | |||
flair = cc.find("span", {"class": "flair"}) | |||
try: | |||
flair = flair.text.strip() | |||
post_wrapper = post_wrapper.replace(flair, '') | |||
except: | |||
pass | |||
author = post_wrapper.strip() | |||
user.append(cleanString(author)) | |||
# Finding the status of the author | |||
# Dread does not have membergroup and postgroup, but it has flair, similar enough | |||
postgroup = None | |||
if flair != None: | |||
if postgroup != None: | |||
postgroup = postgroup.text.strip() | |||
flair = flair + " - " + postgroup | |||
else: | |||
if postgroup != None: | |||
flair = postgroup.text.strip() | |||
else: | |||
flair = "-1" | |||
status.append(cleanString(flair)) | |||
# print("status " + cleanString(membergroup)) | |||
# Finding the interest of the author | |||
# Dread does not have blurb | |||
interest.append(-1) | |||
# Finding the reputation of the user | |||
# Dread doesn't have reputation per user, but instead each post has its own point system | |||
karma = cc.find('div', {"class": "votes"}) | |||
if karma != None: | |||
karma = karma.text | |||
karma = karma.replace("points","") | |||
karma = karma.replace(":", "") | |||
karma = karma.strip() | |||
else: | |||
karma = "-1" | |||
reputation.append(cleanString(karma)) | |||
# print("karma " + cleanString(karma)) | |||
# Getting here another good tag to find the post date, post content and users' signature | |||
postarea = ipost.find('div', {"class": "timestamp"}).find('span', recursive=False) | |||
dt = postarea['title'] | |||
month = find_month(dt) | |||
split_text = dt.split() | |||
day = int(re.search(r'\d+', split_text[0]).group()) | |||
year = int(split_text[2]) | |||
hm = re.findall(r'\d+', split_text[-1]) | |||
hm[0] = int(hm[0]) | |||
hm[1] = int(hm[1]) | |||
date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1]) | |||
addDate.append(date_time_obj) | |||
# Finding the post | |||
inner = ipost.find('div', {"class": "commentBody"}) | |||
inner = inner.text.strip() | |||
# print(inner) | |||
post.append(cleanString(inner)) | |||
# No signature for Dread | |||
sign.append(-1) | |||
            # As no information about the user's feedback was found, just assign "-1" to the variable
feedback.append("-1") | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) | |||
# Sending the results | |||
return row | |||
# This is the method to parse the Listing Pages (one page with many posts) | |||
def dread_listing_parser(soup): | |||
nm = 0 # *this variable should receive the number of topics | |||
forum = "Dread" # 0 *forum name | |||
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. | |||
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) | |||
author = [] # 2 *all authors of each topic | |||
topic = [] # 3 *all topics | |||
views = [] # 4 number of views of each topic | |||
posts = [] # 5 number of posts of each topic | |||
    href = []  # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages) | |||
addDate = [] # 7 when the topic was created (difficult to find) | |||
# Finding the board (should be just one) | |||
board = soup.find('a', {"class": "banner-top"}).text | |||
board = cleanString(board.strip()) | |||
# Finding the repeated tag that corresponds to the listing of topics | |||
itopics = soup.find("div", {"class": "postBoard"}).find_all("div", {"class": "item"}, recursive=False) | |||
for itopic in itopics: | |||
        # For each topic found, the structure holding the rest of the information can be of two types.
        # We test all of them so that we do not miss any topic
# Adding the topic to the topic list | |||
topic_title = itopic.find("a", {"class": "title"}) | |||
title_flair = topic_title.find('span', {"class": "flair"}) | |||
topics = topic_title.text | |||
try: | |||
title_flair = title_flair.text.strip() | |||
topics = topics.replace(title_flair, '') | |||
except: | |||
pass | |||
topics = topics.replace(u'\xa0', ' ') | |||
topics = topics.replace(",", "") | |||
topics = topics.replace("\n", "") | |||
topic.append(cleanString(topics.strip())) | |||
# Counting how many topics we have found so far | |||
nm = len(topic) | |||
# Adding the url to the list of urls | |||
link = topic_title['href'] | |||
link = cleanLink(link) | |||
href.append(link) | |||
# Finding the author of the topic | |||
ps = itopic.find('div', {"class": "author"}) | |||
post_wrapper = ps.select_one('a[href^="/u/"]').text | |||
flair = ps.find("span", {"class": "flair"}) | |||
try: | |||
flair = flair.text.strip() | |||
post_wrapper = post_wrapper.replace(flair, '') | |||
except: | |||
pass | |||
user = post_wrapper.strip() | |||
author.append(cleanString(user)) | |||
# Finding the number of replies | |||
meta = itopic.find("div", {"class": "postMain"}) | |||
post = meta.find("a").text | |||
post = post.replace("comments", '').strip() | |||
posts.append(cleanString(post)) | |||
# Finding the number of Views - not shown in Dread | |||
views.append("-1") | |||
        # Finding when the topic was added from the author timestamp tooltip
spans = itopic.find('div', {"class": "author"}).find('span', recursive=False) | |||
dt = spans['title'] | |||
month = find_month(dt) | |||
split_text = dt.split() | |||
day = int(re.search(r'\d+', split_text[0]).group()) | |||
year = int(split_text[2]) | |||
hm = re.findall(r'\d+', split_text[-1]) | |||
hm[0] = int(hm[0]) | |||
hm[1] = int(hm[1]) | |||
date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1]) | |||
addDate.append(date_time_obj) | |||
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate) | |||
def dread_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
#print(soup.find('table', {"class": "tborder clear"}).find( | |||
# 'tbody').find_all('tr', {"class": "inline_row"})) | |||
listing = soup.find("div", {"class": "postBoard"}).find_all("div",{"class": "item"}, recursive=False) | |||
for a in listing: | |||
link = a.find("a", {"class": "title"}) | |||
link = link['href'] | |||
href.append(link) | |||
return href | |||
def find_month(s):
    # Month-name lookup; returns the month number, or None if no month name is found
    months = {
        'January': 1, 'February': 2, 'March': 3, 'April': 4,
        'May': 5, 'June': 6, 'July': 7, 'August': 8,
        'September': 9, 'October': 10, 'November': 11, 'December': 12
    }
    for name, number in months.items():
        if name in s:
            return number
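# Illustrative sketch of the timestamp decomposition used by the Dread parsers above. The exact
# tooltip format is an assumption; a title such as "21st June 2023 at 14:05" would yield
# day=21, month=6, year=2023, hour=14, minute=5.
def _example_parse_dread_timestamp(title="21st June 2023 at 14:05"):
    month = find_month(title)
    parts = title.split()
    day = int(re.search(r'\d+', parts[0]).group())
    year = int(parts[2])
    hour, minute = (int(x) for x in re.findall(r'\d+', parts[-1]))
    return datetime(year, month, day, hour=hour, minute=minute)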
@ -0,0 +1,328 @@ | |||
__author__ = 'DarkWeb' | |||
''' | |||
Helium Forum Crawler (Selenium) | |||
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.common.by import By | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, time | |||
from datetime import date | |||
import subprocess | |||
from bs4 import BeautifulSoup | |||
from Forums.Initialization.prepare_parser import new_parse | |||
from Forums.Helium.parser import helium_links_parser | |||
from Forums.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/' | |||
# Opens Tor Browser, crawls the website | |||
def startCrawling(): | |||
# opentor() | |||
# forumName = getForumName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
login(driver) | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closetor(driver) | |||
# new_parse(forumName, False) | |||
# Opens Tor Browser | |||
def opentor(): | |||
global pid | |||
print("Connecting Tor...") | |||
path = open('../../path.txt').readline().strip() | |||
pro = subprocess.Popen(path) | |||
pid = pro.pid | |||
time.sleep(7.5) | |||
input('Tor Connected. Press ENTER to continue\n') | |||
return | |||
# Login using premade account credentials and do login captcha manually | |||
def login(driver): | |||
#wait for login page | |||
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( | |||
(By.XPATH, "/html/body/div[2]/div/div[1]/div/div/div[2]/form/div[5]/div/button"))) | |||
#entering username and password into input boxes | |||
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') | |||
#Username here | |||
usernameBox.send_keys('holyre') | |||
passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') | |||
#Password here | |||
passwordBox.send_keys('PlatinumBorn2') | |||
''' | |||
# wait for captcha page show up | |||
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( | |||
(By.XPATH, '//*[@id="captcha_img"]'))) | |||
# save captcha to local | |||
driver.find_element(by=By.XPATH, value='//*[@id="captcha_img"]').screenshot(r'..\Helium\captcha.png') | |||
# This method will show image in any image viewer | |||
im = Image.open(r'..\Helium\captcha.png') | |||
im.show() | |||
# wait until input space show up | |||
inputBox = driver.find_element(by=By.XPATH, value='//*[@id="captcha"]') | |||
# ask user input captcha solution in terminal | |||
userIn = input("Enter solution: ") | |||
# send user solution into the input space | |||
inputBox.send_keys(userIn) | |||
# click the verify(submit) button | |||
driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div[1]/div/div/div[2]/form/div[5]/div/button").click() | |||
''' | |||
input("Press ENTER when CAPTCHA is completed\n") | |||
    # wait for the listing page to show up (this XPath may need to change based on the seed url)
WebDriverWait(driver, 50).until(EC.visibility_of_element_located( | |||
(By.XPATH, '/html/body/div[2]/div/p'))) | |||
# Returns the name of the website | |||
def getForumName(): | |||
name = 'Helium' | |||
return name | |||
# Returns the link of the website
def getFixedURL(): | |||
url = 'http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/login' | |||
return url | |||
# Closes Tor Browser | |||
def closetor(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() | |||
time.sleep(3) | |||
return | |||
# Creates FireFox 'driver' and configures its 'Profile'
# to use Tor proxy and socket | |||
def createFFDriver(): | |||
file = open('../../path.txt', 'r') | |||
lines = file.readlines() | |||
ff_binary = FirefoxBinary(lines[0].strip()) | |||
ff_prof = FirefoxProfile(lines[1].strip()) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
# ff_prof.set_preference("permissions.default.image", 2) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", True) | |||
ff_prof.update_preferences() | |||
service = Service(lines[2].strip()) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
return driver | |||
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url) | |||
return driver | |||
except: | |||
driver.close() | |||
return 'down' | |||
# Saves the crawled html page | |||
def savePage(page, url): | |||
cleanPage = cleanHTML(page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
def getFullPathName(url): | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = r'..\Helium\HTML_Pages\\' + str( | |||
"%02d" % date.today().month) + str("%02d" % date.today().day) + str( | |||
"%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' | |||
else: | |||
fullPath = r'..\Helium\HTML_Pages\\' + str( | |||
"%02d" % date.today().month) + str("%02d" % date.today().day) + str( | |||
"%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' | |||
return fullPath | |||
# Creates the file name from passed URL | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if name == '': | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
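# Illustrative sketch (not called by the crawler): how a crawled URL is turned into a saved file
# location. getNameFromURL keeps only alphanumeric characters, and getFullPathName places the page
# under a dated Listing\ or Description\ folder chosen by isListingLink / isDescriptionLink.
# The example URL is one of the Helium boards used below.
def _example_page_naming():
    url = 'http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/17'
    name = getNameFromURL(url)   # alphanumeric-only file name derived from the URL
    path = getFullPathName(url)  # ..\Helium\HTML_Pages\<mmddyyyy>\Listing\<name>.html
    return name, path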
def getInterestedLinks(): | |||
links = [] | |||
# # General Discussion | |||
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/6') | |||
# # Anonymity and Security | |||
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/8') | |||
# # Programming | |||
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/9') | |||
# # Carding Discussions | |||
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/10') | |||
# # Hacked Database (free) | |||
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/11') | |||
# Hacking tools, exploits and POC | |||
links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/17') | |||
# # Hacked Database | |||
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/12') | |||
# # Hacking and other Services | |||
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/13') | |||
# # Selling/Buying Malware, Exploits etc | |||
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/22') | |||
# # General Tutorials | |||
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/18') | |||
# # Hacking Tutorials | |||
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/19') | |||
return links | |||
def crawlForum(driver): | |||
print("Crawling the Helium forum") | |||
linksToCrawl = getInterestedLinks() | |||
# visited = set(linksToCrawl) | |||
# initialTime = time.time() | |||
i = 0 | |||
count = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(html, link) | |||
has_next_page = True | |||
while has_next_page: | |||
                topic_list = topicPages(html)
                for item in topic_list:
itemURL = urlparse.urljoin(baseURL, str(item)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
savePage(driver.page_source, item) | |||
driver.back() | |||
# comment out | |||
break | |||
# comment out | |||
if count == 1: | |||
count = 0 | |||
break | |||
try: | |||
bar = driver.find_element(by=By.XPATH, value= | |||
'/html/body/div[2]/div/div[3]/ul') | |||
li = bar.find_elements(By.TAG_NAME, 'li')[-1] | |||
link = li.find_element(By.TAG_NAME, 'a').get_attribute('href') | |||
if link == "": | |||
raise NoSuchElementException | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(html, link) | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
# finalTime = time.time() | |||
# print finalTime - initialTime | |||
input("Crawling Helium forum done successfully. Press ENTER to continue\n") | |||
# Returns 'True' if the link is a Topic link
def isDescriptionLink(url): | |||
if 'topic' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link | |||
def isListingLink(url): | |||
if 'board' in url: | |||
return True | |||
return False | |||
# calling the parser to define the links | |||
def topicPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
return helium_links_parser(soup) | |||
def crawler(): | |||
startCrawling() | |||
# print("Crawling and Parsing BestCardingWorld .... DONE!") |
@ -0,0 +1,248 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from Forums.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) | |||
def helium_description_parser(soup): | |||
# Fields to be parsed | |||
topic = "-1" # topic name | |||
user = [] # all users of each post | |||
    addDate = []  # all dates of each post
feedback = [] # all feedbacks of each vendor (this was found in just one Forum and with a number format) | |||
status = [] # all user's authority in each post such as (adm, member, dangerous) | |||
    reputation = []  # all user's karma in each post (usually found as a number)
sign = [] # all user's signature in each post (usually a standard message after the content of the post) | |||
post = [] # all messages of each post | |||
interest = [] # all user's interest in each post | |||
# Finding the topic (should be just one coming from the Listing Page) | |||
li = soup.find("h4", {"class": "text-truncated"}) | |||
topic = li.text | |||
topic = topic.replace("Topic:", "") | |||
topic = topic.replace("Post Reply", "") | |||
topic = topic.replace(",", "") | |||
topic = topic.replace("\n", "") | |||
topic = cleanString(topic.strip()) | |||
# Finding the repeated tag that corresponds to the listing of posts | |||
posts = soup.findAll('div', {"id": "a9"}) | |||
# For each message (post), get all the fields we are interested to: | |||
for ipost in posts: | |||
# Finding a first level of the HTML page | |||
# Finding the author (user) of the post | |||
heading = ipost.find('div', {"class": "panel-heading"}) | |||
title = heading.find('div', {"class": "panel-title"}).text | |||
author = title.replace("User:", "") | |||
author = author.strip() | |||
user.append(cleanString(author)) # Remember to clean the problematic characters | |||
# Finding the status of the author | |||
# Testing here two possibilities to find this status and combine them | |||
# Helium does not have membergroup and postgroup | |||
membergroup = heading.find('li', {"class": "membergroup"}) | |||
postgroup = heading.find('li', {"class": "postgroup"}) | |||
if membergroup != None: | |||
membergroup = membergroup.text.strip() | |||
if postgroup != None: | |||
postgroup = postgroup.text.strip() | |||
membergroup = membergroup + " - " + postgroup | |||
else: | |||
if postgroup != None: | |||
membergroup = postgroup.text.strip() | |||
else: | |||
membergroup = "-1" | |||
status.append(cleanString(membergroup)) | |||
# Finding the interest of the author | |||
# Helium does not have blurb | |||
blurb = heading.find('li', {"class": "blurb"}) | |||
if blurb != None: | |||
blurb = blurb.text.strip() | |||
else: | |||
blurb = "-1" | |||
interest.append(cleanString(blurb)) | |||
# Finding the reputation of the user | |||
# Helium does not have karma | |||
karma = heading.find('li', {"class": "karma"}) | |||
if karma != None: | |||
karma = karma.text | |||
karma = karma.replace("Community Rating: ","") | |||
karma = karma.replace("Karma: ","") | |||
karma = karma.strip() | |||
else: | |||
karma = "-1" | |||
reputation.append(cleanString(karma)) | |||
# Getting here another good tag to find the post date, post content and users' signature | |||
postarea = ipost.find('div', {"class": "content_body"}) | |||
# Finding the date of the post | |||
# Helium does not have date | |||
addDate.append("-1") | |||
# dt = ipost.find('p', {"class": "author"}).text.split('»')[1] | |||
# # dt = dt.strip().split() | |||
# dt = dt.strip() | |||
# date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p') | |||
# stime = date_time_obj.strftime('%a %b %d, %Y') | |||
# sdate = date_time_obj.strftime('%I:%M %p') | |||
# addDate.append(date_time_obj) | |||
# date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p') | |||
# smalltext = postarea.find('div', {"class": "flow_hidden"}).find('div', {"class": "keyinfo"})\ | |||
# .find('div', {"class": "smalltext"}) | |||
# sdatetime = smalltext.text | |||
# sdatetime = sdatetime.replace(u"\xab","") # Removing unnecessary characters | |||
# sdatetime = sdatetime.replace(u"\xbb","") # Removing unnecessary characters | |||
# sdatetime = sdatetime.split("on: ") # Removing unnecessary characters | |||
# sdatetime = sdatetime[1].strip() | |||
# stime = sdatetime[:-12:-1] # Finding the time of the post | |||
# stime = stime[::-1] | |||
# sdate = sdatetime.replace(stime,"") # Finding the date of the post | |||
# sdate = sdate.replace(",","") | |||
# sdate = sdate.strip() | |||
        # Convert the date of the post, which can be given as "12 February 2016", "today", or "yesterday".
        # We need a date format here as "mm/dd/yyyy"
#addDate.append(convertDate(sdate,"english", crawlerDate) + " " + stime) | |||
# Finding the post | |||
paragraphs = postarea.find_all('p') | |||
p = "" | |||
for paragraph in paragraphs: | |||
p += paragraph.text.strip() + " " | |||
quote = postarea.find('div', {"class": "standard_quote"}) | |||
if quote != None: | |||
q = quote.text.strip() | |||
p.replace(q, "") | |||
post.append(cleanString(p.strip())) | |||
        # Finding the user's signature
# Helium does not have signature | |||
#signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"}) | |||
signature = ipost.find('div', {"class": "post_wrapper"}) | |||
if signature != None: | |||
signature = signature.text.strip() | |||
else: | |||
signature = "-1" | |||
sign.append(cleanString(signature)) | |||
        # As no information about the user's feedback was found, just assign "-1" to the variable
feedback.append("-1") | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
row = (topic, post, user, addDate, feedback, status, reputation, sign, interest) | |||
# Sending the results | |||
return row | |||
# This is the method to parse the Listing Pages (one page with many posts) | |||
def helium_listing_parser(soup): | |||
board = "-1" # board name (the previous level of the topic in the Forum categorization tree. | |||
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) | |||
nm = 0 # this variable should receive the number of topics | |||
topic = [] # all topics | |||
user = [] # all users of each topic | |||
post = [] # number of posts of each topic | |||
view = [] # number of views of each topic | |||
addDate = [] # when the topic was created (difficult to find) | |||
    href = []  # this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages) | |||
# Finding the board (should be just one) | |||
parents = soup.find('div', {"class": "col-md-12"}).findAll('li') | |||
board = parents[1].text + u"->" + parents[2].get('title') | |||
board = board.replace("\n", "") | |||
board = cleanString(board.strip()) | |||
# Finding the repeated tag that corresponds to the listing of topics | |||
itopics = soup.find('table', {"class": "table"}).find('tbody').findAll('td', {"class": "col-md-8"}) | |||
repliesViews = soup.find('table', {"class": "table"}).find('tbody').findAll('td', {"class": "col-md-2"}) | |||
# Counting how many topics we have found so far | |||
nm = len(itopics) | |||
index = 0 | |||
for itopic in itopics: | |||
# Adding the topic to the topic list | |||
topics = itopic.find('a').get('title') | |||
topics = topics.replace(",", "") | |||
topic.append(cleanString(topics.strip())) | |||
# Adding the url to the list of urls | |||
link = itopic.find('a').get('href') | |||
link = cleanLink(link) | |||
href.append(link) | |||
# Finding the author of the topic | |||
author = itopic.find('strong').text | |||
user.append(cleanString(author.strip())) | |||
rv = repliesViews[index].find('p').text.split() | |||
# Finding the number of replies | |||
posts = rv[0].replace("Replies", "") | |||
post.append(cleanString(posts.strip())) | |||
# Finding the number of Views | |||
tview = rv[1].replace("Views", "") | |||
view.append(cleanString(tview.strip())) | |||
# If no information about when the topic was added, just assign "-1" to the variable | |||
# dt = itopic.find('div', {"class": "responsive-hide"}).text.split('»')[1] | |||
# dt = dt.strip() | |||
# date_time_obj = datetime.strptime(dt,'%a %b %d, %Y %I:%M %p') | |||
# addDate.append(date_time_obj) | |||
addDate.append("-1") | |||
index += 1 | |||
return organizeTopics("Helium", nm, topic, board, view, post, user, addDate, href) | |||
def helium_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.find('table', {"class": "table"}).find('tbody').findAll('td', {"class": "col-md-8"}) | |||
for a in listing: | |||
bae = a.find('a', href=True) | |||
link = bae['href'] | |||
href.append(link) | |||
return href |
@ -1,212 +0,0 @@ | |||
__author__ = 'Helium' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from typing import List | |||
from Forums.Utilities.utilities import * | |||
from datetime import date | |||
from datetime import timedelta | |||
import re | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup, ResultSet, Tag | |||
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) | |||
def HiddenAnswers_description_parser(soup: BeautifulSoup): | |||
# Fields to be parsed | |||
topic: str = "-1" # 0 topic name | |||
user: List[str] = [] # 1 all users of each post | |||
    addDate: List[datetime] = []  # 2 all dates of each post
feedback: List[str] = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format) | |||
status: List[str] = [] # 4 all user's authority in each post such as (adm, member, dangerous) | |||
reputation: List[str] = [] # 5 all user's karma in each post (usually found as a number) | |||
sign: List[str] = [] # 6 all user's signature in each post (usually a standard message after the content of the post) | |||
post: List[str] = [] # 7 all messages of each post | |||
interest: List[str] = [] # 8 all user's interest in each post | |||
image_user = [] # 9 all user avatars of each post | |||
image_post = [] # 10 all first images of each post | |||
# Finding the topic (should be just one coming from the Listing Page) | |||
li = soup.find("h1").find("span", {"itemprop": "name"}) | |||
topic = li.text | |||
question: Tag = soup.find("div", {"class": "qa-part-q-view"}) | |||
question_user = question.find("span", {"class": "qa-q-view-who-data"}).text | |||
user.append(cleanString(question_user.strip())) | |||
question_time = question.find("span", {"class": "qa-q-view-when-data"}).find("time").get("datetime") | |||
datetime_string = question_time.split("+")[0] | |||
datetime_obj = datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%S") | |||
addDate.append(datetime_obj) | |||
question_user_status = question.find("span", {"class": "qa-q-view-who-title"}) | |||
if question_user_status is not None: | |||
question_user_status = question_user_status.text | |||
status.append(cleanString(question_user_status.strip())) | |||
else: | |||
status.append('-1') | |||
question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"}) | |||
if question_user_karma is not None: | |||
question_user_karma = question_user_karma.text | |||
# Convert karma to pure numerical string | |||
if question_user_karma.find("k") > -1: | |||
question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000) | |||
reputation.append(cleanString(question_user_karma.strip())) | |||
else: | |||
reputation.append('-1') | |||
question_content = question.find("div", {"class": "qa-q-view-content qa-post-content"}).text | |||
post.append(cleanString(question_content.strip())) | |||
feedback.append("-1") | |||
sign.append("-1") | |||
interest.append("-1") | |||
img = question.find('div', {"class": "qa-q-view-content qa-post-content"}).find('img') | |||
if img is not None: | |||
img = img.get('src').split('base64,')[-1] | |||
else: | |||
img = "-1" | |||
image_post.append(img) | |||
img = question.find('span', {"class": "qa-q-view-avatar-meta"}).find('img') | |||
if img is not None: | |||
img = img.get('src').split('base64,')[-1] | |||
else: | |||
img = "-1" | |||
image_user.append(img) | |||
answer_list: ResultSet[Tag] = soup.find("div", {"class": "qa-a-list"}).find_all("div", {"class": "qa-a-list-item"}) | |||
for replies in answer_list: | |||
user_name = replies.find("span", {"class", "qa-a-item-who-data"}).text | |||
user.append(cleanString(user_name.strip())) | |||
date_added = replies.find("span", {"class": "qa-a-item-when"}).find("time", {"itemprop": "dateCreated"}).get('datetime') | |||
date_string = date_added.split("+")[0] | |||
datetime_obj = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S") | |||
addDate.append(datetime_obj) | |||
post_data = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).text | |||
post.append(cleanString(post_data.strip())) | |||
user_reputations = replies.find("span", {"class", "qa-a-item-who-title"}) | |||
if user_reputations is not None: | |||
user_reputations = user_reputations.text | |||
status.append(cleanString(user_reputations.strip())) | |||
else: | |||
status.append('-1') | |||
karma = replies.find("span", {"class": "qa-a-item-who-points-data"}) | |||
if karma is not None: | |||
karma = karma.text | |||
# Convert karma to pure numerical string | |||
if karma.find("k") > -1: | |||
karma = str(float(karma.replace("k", "")) * 1000) | |||
reputation.append(cleanString(karma.strip())) | |||
else: | |||
reputation.append('-1') | |||
feedback.append("-1") | |||
sign.append("-1") | |||
interest.append("-1") | |||
img = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).find('img') | |||
if img is not None: | |||
img = img.get('src').split('base64,')[-1] | |||
else: | |||
img = "-1" | |||
image_post.append(img) | |||
img = replies.find('span', {"class": "qa-a-item-avatar-meta"}).find('img') | |||
if img is not None: | |||
img = img.get('src').split('base64,')[-1] | |||
else: | |||
img = "-1" | |||
image_user.append(img) | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) | |||
# Sending the results | |||
return row | |||
def HiddenAnswers_listing_parser(soup: BeautifulSoup): | |||
nm: int = 0 # this variable should receive the number of topics | |||
forum: str = "HiddenAnswers" # 0 *forum name | |||
board = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree. | |||
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) | |||
user: List[str] = [] # 2 all users of each topic | |||
topic: List[str] = [] # 3 all topics | |||
view: List[int] = [] # 4 number of views of each topic | |||
post: List[int] = [] # 5 number of posts of each topic | |||
href: List[str] = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between | |||
# Listing and Description pages) | |||
addDate: List[str] = [] # 7 when the topic was created (difficult to find) | |||
image_user = [] # 8 all user avatars used in each topic | |||
# Finding the board | |||
board = soup.find("div", {"class": "qa-main-heading"}).find("h1").text | |||
board = board.replace('Recent questions in', '') | |||
board = cleanString(board.strip()) | |||
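# The "qa-*" class names used below appear to come from the Question2Answer engine that
# HiddenAnswers runs on, so the same selectors should hold across its listing pages.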
queries_by_user: ResultSet[Tag] = soup.find("div", {"class": "qa-q-list"}).find_all("div", {"class": "qa-q-list-item"}) | |||
for queries in queries_by_user: | |||
topic_of_query = queries.find("div", {"class": "qa-q-item-title"}).find("a").text | |||
topic.append(cleanString(topic_of_query.strip())) | |||
image_user.append("-1") # qa-q-item-where | |||
author = queries.find("span", {"class": "qa-q-item-who-data"}).text | |||
user.append(cleanString(author.strip())) | |||
num_answers = queries.find("span", {"class": "qa-a-count-data"}).text | |||
post.append(cleanString(num_answers.strip())) | |||
view.append("-1") | |||
date_posted = queries.find("span", {"class": "qa-q-item-when-data"}).text | |||
if date_posted.find("day") > 0: | |||
datetime_obj = datetime.now() - timedelta(days=1) | |||
else: | |||
try: | |||
datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y") | |||
except ValueError: | |||
datetime_obj = datetime.strptime(f"{date_posted}", "%b %d, %Y") | |||
addDate.append(datetime_obj) | |||
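# Note: the listing only shows relative or partial dates, so anything phrased in days
# ("2 days ago") is approximated as yesterday, and "%b %d" strings are assumed to belong
# to the current year unless a year is included (e.g. "Jun 27, 2022").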
#this link will be cleaned | |||
listing_href = queries.find("div", {"class": "qa-q-item-title"}).find("a").get("href") | |||
href.append(listing_href) | |||
nm = len(topic) | |||
return organizeTopics(forum, nm, board, user, topic, view, post, href, addDate, image_user) | |||
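# Illustrative usage of the listing parser (the file path below is hypothetical):
#   from bs4 import BeautifulSoup
#   with open('HiddenAnswers_listing.html', 'r') as f:
#       soup = BeautifulSoup(f.read(), 'html.parser')
#   row = HiddenAnswers_listing_parser(soup)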
#need to change this method | |||
def hiddenanswers_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.find_all('div', {"class": "qa-q-item-title"}) | |||
for a in listing: | |||
link = a.find('a').get('href') | |||
href.append(link) | |||
return href |
@ -1,302 +0,0 @@ | |||
__author__ = 'DarkWeb' | |||
''' | |||
Libre Forum Crawler (Selenium) | |||
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.common.by import By | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, re, time | |||
import subprocess | |||
from bs4 import BeautifulSoup | |||
from Forums.Initialization.prepare_parser import new_parse | |||
from Forums.Libre.parser import libre_links_parser | |||
from Forums.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/' | |||
# Opens Tor Browser, crawls the website | |||
def startCrawling(): | |||
forumName = getForumName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
login(driver) | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
new_parse(forumName, baseURL, True) | |||
# Login using premade account credentials and do login captcha manually | |||
def login(driver): | |||
input('Press enter when CAPTCHA is completed, and you\'re at the login page') | |||
#entering username and password into input boxes | |||
usernameBox = driver.find_element(by=By.NAME, value='username') | |||
#Username here | |||
usernameBox.send_keys('ct1234')#sends string to the username box | |||
passwordBox = driver.find_element(by=By.NAME, value='password') | |||
#Password here | |||
passwordBox.send_keys('r5o0wqmw')# sends string to passwordBox | |||
input("Press the login button and solve the CAPTCHA then press enter\n") | |||
# input('input') | |||
# wait for the page to show up (this XPath may need to change based on the seed url)
# wait up to 50 seconds until the <nav> element is visible, then continue
WebDriverWait(driver, 50).until(EC.visibility_of_element_located( | |||
(By.TAG_NAME, 'nav'))) | |||
# click link to correct forum board | |||
login_link = driver.find_element(by=By.XPATH, value='/html/body/nav/div[1]/a[3]').get_attribute('href') | |||
driver.get(login_link) # open tab with url | |||
# wait for the listing page to show up (this XPath may need to change based on the seed url)
# wait up to 50 seconds until the target element is visible, then continue
WebDriverWait(driver, 50).until(EC.visibility_of_element_located( | |||
(By.XPATH, '/html/body/div/div/div[3]/div[5]'))) | |||
# Returns the name of the website | |||
def getForumName() -> str: | |||
name = 'Libre' | |||
return name | |||
# Return the link of the website | |||
def getFixedURL(): | |||
url = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/' | |||
return url | |||
# Closes Tor Browser | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() #close tab | |||
time.sleep(3) | |||
return | |||
# Creates the Firefox 'driver' and configures its 'Profile'
# to use the Tor proxy and socket
def createFFDriver(): | |||
from Forums.Initialization.forums_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", True) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url) | |||
return driver | |||
except: | |||
driver.close() | |||
return 'down' | |||
# Saves the crawled html page | |||
def savePage(driver, page, url): | |||
cleanPage = cleanHTML(driver, page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
def getFullPathName(url): | |||
from Forums.Initialization.forums_mining import config, CURRENT_DATE | |||
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
return fullPath | |||
# Creates the file name from passed URL | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
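# Example (hypothetical URL): 'http://example.onion/c/Services' -> 'httpexampleonioncServices';
# an empty result (a URL of only punctuation) falls back to the global counter below.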
if name == '': | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
def getInterestedLinks(): | |||
links = [] | |||
# cybersecurity | |||
links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/CyberSecurity') | |||
# services | |||
links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/Services') | |||
# programming | |||
links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/Programming') | |||
# jobs for crypto | |||
links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/JobsforCypto') | |||
# darknet markets | |||
links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/DarkNetMarkets') | |||
return links | |||
def crawlForum(driver): | |||
print("Crawling the Libre forum") | |||
linksToCrawl = getInterestedLinks() | |||
i = 0 | |||
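# Two-level crawl: the outer loop walks each interested board link (and its listing pages
# via the '>' pagination link), while the inner loop opens every topic found on a listing
# page, saves each of its pages, and then returns to the listing.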
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
topics = topicPages(html) | |||
for topic in topics: | |||
has_next_topic_page = True | |||
counter = 1 | |||
page = topic | |||
while has_next_topic_page: | |||
itemURL = urlparse.urljoin(baseURL, str(page)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
if isListingLink(driver.current_url): | |||
break | |||
savePage(driver, driver.page_source, topic + f"page{counter}") # very important | |||
# # comment out | |||
# if counter == 2: | |||
# break | |||
try: | |||
page = "" # no next page so far may have some later on | |||
if page == "": | |||
raise NoSuchElementException | |||
counter += 1 | |||
except NoSuchElementException: | |||
has_next_topic_page = False | |||
# making sure we go back to the listing page (browser back button simulation) | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
# # comment out | |||
# break | |||
# | |||
# # comment out | |||
# if count == 1: | |||
# break | |||
try: | |||
link = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href') | |||
if link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
print("Crawling the Libre forum done.") | |||
# Returns True if the link is a topic link; may need to change for every website
def isDescriptionLink(url): | |||
if '/p/' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listing page link; may need to change for every website
def isListingLink(url): | |||
if '.onion/c' in url: | |||
return True | |||
return False | |||
# calling the parser to define the links | |||
def topicPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
return libre_links_parser(soup) | |||
def crawler(): | |||
startCrawling() | |||
# print("Crawling and Parsing BestCardingWorld .... DONE!") |
@ -1,249 +0,0 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from Forums.Utilities.utilities import * | |||
from datetime import date | |||
from datetime import timedelta | |||
import re | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup, ResultSet, Tag | |||
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) | |||
def libre_description_parser(soup: Tag): | |||
# Fields to be parsed | |||
topic = "-1" # 0 *topic name | |||
user = [] # 1 *all users of each post | |||
status = [] # 2 all user's authority in each post such as (adm, member, dangerous) | |||
reputation = [] # 3 all user's karma in each post (usually found as a number) | |||
interest = [] # 4 all user's interest in each post | |||
sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) | |||
post = [] # 6 all messages of each post | |||
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) | |||
addDate = [] # 8 all dates of each post | |||
image_user = [] # 9 all user avatars of each post | |||
image_post = [] # 10 all first images of each post | |||
# Finding the topic (should be just one coming from the Listing Page) | |||
topic_found = soup.find("a", {"class": "link text-xl text-zinc-300"}).text | |||
topic = cleanString(topic_found.strip()) | |||
original_post: Tag = soup.find("div", {"class": "flex items-start"}) | |||
original_user = original_post.find("div", {"class": "info-p"}).find("a", {"class": "link"}).text | |||
user.append(cleanString(original_user.replace("/u/", "").strip())) | |||
original_user_statistics: ResultSet[Tag] = original_post.find("div", {"class": "info-p"}).find_all("span") | |||
original_time = original_user_statistics[0].text[2:] | |||
datetime_append = datetime.strptime(original_time, "%Y-%m-%d %H:%M:%S GMT") | |||
addDate.append(datetime_append) | |||
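# The statistics spans appear to carry a two-character label prefix, which the "[2:]" slices
# strip before the values are used (the timestamp is then parsed as "%Y-%m-%d %H:%M:%S GMT").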
original_karma = original_user_statistics[1].text[2:]
reputation.append(cleanString(original_karma.strip())) | |||
original_content = soup.find("div", {"class": "content-p"}).text | |||
post.append(cleanString(original_content.strip())) | |||
status.append("-1") | |||
interest.append("-1") | |||
sign.append("-1") | |||
feedback.append("-1") | |||
image_post.append("-1") | |||
img = original_post.find('img') | |||
if img is not None: | |||
img = img.get('src').split('base64,')[-1] | |||
else: | |||
img = "-1" | |||
image_user.append(img) | |||
# Finding the repeated tag that corresponds to the listing of posts | |||
# try: | |||
posts: ResultSet[Tag] = soup.find_all("div", {"class": "flex items-stretch"}) | |||
# For each message (post), get all the fields we are interested to: | |||
for ipost in posts: | |||
# Finding a first level of the HTML page | |||
# Finding the author (user) of the post | |||
user_name = ipost.find("p", {"class": "text-zinc-400 text-justify"}).find("a", {"class": "link"}).text | |||
user.append(cleanString(user_name.replace("/u/", "").strip())) # Remember to clean the problematic characters | |||
status.append("-1") | |||
# Finding the interest of the author
# Libre does not expose an interest/blurb field, so "-1" is used
interest.append("-1")
# Finding the reputation (karma) of the user
karma = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text | |||
karma_cleaned = karma.split(" ")[6] | |||
reputation.append(cleanString(karma_cleaned.strip())) | |||
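# Note: the karma value is taken as the 7th whitespace-separated token of the poster info
# line, so this is position-dependent and will break if the line layout changes.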
# Getting here another good tag to find the post date, post content and users' signature | |||
date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text | |||
date_time_cleaned = date_posted.replace(user_name, "")[3:-12] | |||
datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT") | |||
addDate.append(datetime_append) | |||
# Finding the post | |||
user_post = ipost.find("div", {"class": "content-c"}).text | |||
post.append(cleanString(user_post)) | |||
# Finding the user's signature | |||
sign.append("-1") | |||
# As no information about user's feedback was found, just assign "-1" to the variable | |||
feedback.append("-1") | |||
# As no information about post's image was found, just assign "-1" to the variable | |||
image_post.append("-1") | |||
# As no information about user's image was found, just assign "-1" to the variable | |||
image_user.append("-1") | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
# print(topic) | |||
# print(user) | |||
# print(status) | |||
# print(reputation) | |||
# print(interest) | |||
# print(sign) | |||
# print(post) | |||
# print(feedback) | |||
# print(addDate) | |||
# print(len(user)) | |||
# print(len(status)) | |||
# print(len(reputation)) | |||
# print(len(interest)) | |||
# print(len(sign)) | |||
# print(len(feedback)) | |||
# print(len(addDate)) | |||
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) | |||
# Sending the results | |||
return row | |||
# This is the method to parse the Listing Pages (one page with many posts) | |||
def libre_listing_parser(soup): | |||
nm = 0 # *this variable should receive the number of topics | |||
forum = "Libre" # 0 *forum name | |||
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. | |||
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) | |||
author = [] # 2 *all authors of each topic | |||
topic = [] # 3 *all topics | |||
views = [] # 4 number of views of each topic | |||
posts = [] # 5 number of posts of each topic | |||
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages) | |||
addDate = [] # 7 when the topic was created (difficult to find) | |||
image_author = [] # 8 all author avatars used in each topic | |||
# Finding the board (should be just one) | |||
board = soup.find('div', {"class": "title"}).find("h1").text | |||
board = cleanString(board.strip()) | |||
# Finding the repeated tag that corresponds to the listing of topics | |||
itopics = soup.find("div", {"class", "space-y-2 mt-4"}).find_all('div', {"class": "flex box"}) | |||
nm = 0 | |||
for itopic in itopics: | |||
nm += 1 | |||
# For each topic found, the structure holding the rest of the information can take two forms,
# so both are tested to avoid missing any topic
# Adding the topic to the topic list | |||
topic_string = itopic.find("a", {"class": "link text-xl text-zinc-300"}).text | |||
cleaned_topic_string = cleanString(topic_string.strip()) | |||
topic.append(cleaned_topic_string) | |||
image_author.append("-1") | |||
# Adding the url to the list of urls | |||
link_to_clean = itopic.find('div', {'class': 'flex space-x-2 items-center'}).find('a').get('href') | |||
href.append(link_to_clean) | |||
# Finding the author of the topic | |||
username_not_cleaned = itopic.find('div', {"class": "flex-grow p-2 text-justify"}).find('a').text | |||
username_cleaned = username_not_cleaned.split("/")[-1] | |||
author.append(cleanString(username_cleaned)) | |||
# Finding the number of views | |||
num_views = itopic.find_all("div", {"class": "flex items-center"})[0].find("p").text | |||
views.append(cleanString(num_views)) | |||
# Finding the number of replies | |||
num_replies = itopic.find_all("div", {"class": "flex items-center"})[1].find("p").text | |||
posts.append(cleanString(num_replies)) | |||
# Finding when the topic was added (the date is concatenated with the username in the same element)
date_time_concatenated = itopic.find("p", {"class": "text-sm text-zinc-400 italic"}).text | |||
date_time_cleaned = date_time_concatenated.replace(username_not_cleaned, "") | |||
# creating the datetime object | |||
date_time_array = date_time_cleaned[3:] | |||
datetime_append = datetime.strptime(date_time_array, "%Y-%m-%d %H:%M:%S GMT") | |||
addDate.append(datetime_append) | |||
# print(forum) | |||
# print(nm) | |||
# print(board) | |||
# print(author) | |||
# print(topic) | |||
# print(views) | |||
# print(href) | |||
# print(addDate) | |||
# print(len(author)) | |||
# print(len(topic)) | |||
# print(len(views)) | |||
# print(len(href)) | |||
# print(len(addDate)) | |||
return organizeTopics( | |||
forum=forum, | |||
nm=nm, | |||
board=board, | |||
author=author, | |||
topic=topic, | |||
views=views, | |||
posts=posts, | |||
href=href, | |||
addDate=addDate, | |||
image_author=image_author | |||
) | |||
def libre_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.find("div", {"class", "space-y-2 mt-4"}).find_all('div', {"class": "flex box"}) | |||
for a in listing: | |||
link = a.find('div', {'class': 'flex space-x-2 items-center'}).find('a').get('href') | |||
href.append(link) | |||
return href |
@ -1,310 +0,0 @@ | |||
__author__ = 'Helium' | |||
''' | |||
OnniForums Crawler (Selenium) | |||
Now goes through multiple topic pages. | |||
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.common.by import By | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, re, time | |||
import configparser | |||
from datetime import date | |||
import subprocess | |||
from bs4 import BeautifulSoup | |||
from Forums.Initialization.prepare_parser import new_parse | |||
from Forums.OnniForums.parser import onniForums_links_parser | |||
from Forums.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/' | |||
# Opens Tor Browser, crawls the website | |||
def startCrawling(): | |||
forumName = getForumName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
login(driver) | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
new_parse(forum=forumName, url=baseURL, createLog=True) | |||
# Login using premade account credentials and do login captcha manually | |||
def login(driver): | |||
#click login button | |||
login_link = driver.find_element( | |||
by=By.XPATH, value='/html/body/div/div[1]/div[2]/div[1]/div/span/a[1]').get_attribute('href') | |||
driver.get(login_link) | |||
#entering username and password into input boxes | |||
usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input') | |||
#Username here | |||
usernameBox.send_keys('cabbage_purely') | |||
passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input') | |||
#Password here | |||
passwordBox.send_keys('$ourP@tchK1ds') | |||
clicker = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/div/input') | |||
clicker.click() | |||
# wait for the listing page to show up (this XPath may need to change based on the seed url)
WebDriverWait(driver, 50).until(EC.visibility_of_element_located( | |||
(By.XPATH, '//*[@id="content"]'))) | |||
# Returns the name of the website | |||
def getForumName(): | |||
name = 'OnniForums' | |||
return name | |||
# Return the link of the website | |||
def getFixedURL(): | |||
url = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/' | |||
return url | |||
# Closes Tor Browser | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() | |||
time.sleep(3) | |||
return | |||
# Creates the Firefox 'driver' and configures its 'Profile'
# to use the Tor proxy and socket
def createFFDriver(): | |||
from Forums.Initialization.forums_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", True) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url) | |||
return driver | |||
except: | |||
driver.close() | |||
return 'down' | |||
# Saves the crawled html page | |||
def savePage(driver, page, url): | |||
cleanPage = cleanHTML(driver, page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
def getFullPathName(url): | |||
from Forums.Initialization.forums_mining import config, CURRENT_DATE | |||
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
return fullPath | |||
# Creates the file name from passed URL | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if (name == ''): | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
def getInterestedLinks(): | |||
links = [] | |||
# Hacking & Cracking tutorials | |||
links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-tutorials') | |||
# # Hacking & Cracking questions | |||
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-questions') | |||
# # Exploit PoCs | |||
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Exploit-PoCs') | |||
# # sellers | |||
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Sellers') | |||
# # buyers questions | |||
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Buyers-Questions') | |||
# # combo lists | |||
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Combo-lists') | |||
# # Malware-development | |||
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Malware-development') | |||
# # coding | |||
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Coding') | |||
# # Carding & Fraud | |||
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Carding-Fraud') | |||
# # OPSEC | |||
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-OPSEC--13') | |||
return links | |||
def crawlForum(driver): | |||
print("Crawling the OnniForums forum") | |||
linksToCrawl = getInterestedLinks() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
topics = topicPages(html) | |||
for topic in topics: | |||
has_next_topic_page = True | |||
counter = 1 | |||
page = topic | |||
while has_next_topic_page: | |||
itemURL = urlparse.urljoin(baseURL, str(page)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
if isListingLink(driver.current_url): | |||
break | |||
savePage(driver, driver.page_source, topic + f"page{counter}") # very important | |||
# # comment out | |||
# if counter == 2: | |||
# break | |||
try: | |||
temp = driver.find_element(by=By.CLASS_NAME, value='float_left') | |||
page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') | |||
if page == "": | |||
raise NoSuchElementException | |||
counter += 1 | |||
except NoSuchElementException: | |||
has_next_topic_page = False | |||
# making sure we go back to the listing page (browser back button simulation) | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
# # comment out | |||
# break | |||
# | |||
# # comment out | |||
# if count == 1: | |||
# break | |||
try: | |||
temp = driver.find_element(by=By.CLASS_NAME, value='float_left') | |||
link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') | |||
if link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
print("Crawling the OnniForums forum done.") | |||
# Returns True if the link is a topic link
def isDescriptionLink(url): | |||
if 'Thread' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listing page link
def isListingLink(url): | |||
if '.onion/Forum' in url: | |||
return True | |||
return False | |||
# calling the parser to define the links | |||
def topicPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
#print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) | |||
return onniForums_links_parser(soup) | |||
def crawler(): | |||
startCrawling() | |||
# print("Crawling and Parsing BestCardingWorld .... DONE!") |
@ -1,222 +0,0 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from typing import List | |||
from Forums.Utilities.utilities import * | |||
from datetime import date | |||
from datetime import timedelta | |||
import re | |||
import string | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) | |||
def onniForums_description_parser(soup: BeautifulSoup) -> tuple: | |||
topicName: str = "-1" # 0 *topic name | |||
users : List[str] = [] # 1 *all users of each post | |||
statuses : List[str] = [] # 2 all user's authority in each post such as (adm, member, dangerous) | |||
reputations : List[str] = [] # 3 all user's karma in each post (usually found as a number) | |||
interests : List[str] = [] # 4 all user's interest in each post | |||
signs : List[str] = [] # 5 all user's signature in each post (usually a standard message after the content of the post) | |||
posts : List[str] = [] # 6 all messages of each post | |||
feedbacks : List[str] = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) | |||
addDates : List[datetime] = [] # 8 all dates of each post | |||
image_user : List[str] = [] # 9 all user avatars of each post | |||
image_post : List[str] = [] # 10 all first images of each post | |||
# Getting the topicName | |||
topicName = soup.find("table", {"class": "tborder tfixed clear"}) \ | |||
.find("td", {"class": "thead"}) \ | |||
.find_all("div")[-1].text | |||
topicName = cleanString(topicName.strip()) | |||
topics_array = soup.find_all("div", {"class": "post"}) | |||
for topic in topics_array: | |||
# Extracting and cleaning author information | |||
author_information: BeautifulSoup = topic.find("div", {"class": "author_information"}) | |||
username: str = author_information.find("span", {"class": "largetext"}).text | |||
username_cleaned = cleanString(username.strip()) | |||
users.append(username_cleaned) | |||
user_status: str = author_information.find("span", {"class": "smalltext"}).text | |||
# Banned users often have weird text issues in HTML | |||
# So we detect banned users and give them a unique string | |||
if user_status.find("Banned") > 0: user_status_cleaned = "Banned" | |||
elif user_status.find("Unregistered") > 0: user_status_cleaned = "Unregistered" | |||
else: user_status_cleaned = cleanString(user_status.strip()) # Remove excessive spaces in string | |||
# Add cleaned data into array | |||
statuses.append(user_status_cleaned) | |||
if user_status_cleaned in ['Unregistered', 'Banned']: reputations.append("-1")
else: | |||
author_statistics: BeautifulSoup = topic.find("div", {"class": "author_statistics"}) | |||
reputation: str = author_statistics.find_all("div", {"class": "float_right"})[-1].text | |||
reputation_cleaned = cleanString(reputation.strip()) | |||
reputations.append(reputation_cleaned) | |||
# Append a "-1" to `interests` and `signs` array since they don't exist on this forum | |||
interests.append("-1") | |||
signs.append("-1") | |||
post_content: str = topic.find("div", {"class": "post_body scaleimages"}).text | |||
# Clean post content of excessive spaces and characters | |||
post_content_cleaned = post_content.replace("[You must reply to view this hidden content]", "") | |||
post_content_cleaned = cleanString(post_content_cleaned.strip()) | |||
posts.append(post_content_cleaned) | |||
# Append a "-1" to `feedbacks` array since they don't exists on this forum | |||
feedbacks.append("-1") | |||
date_posted = topic.find("span", {"class": "post_date"}).text.strip() | |||
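# Relative dates are normalized before parsing with "%m-%d-%Y, %I:%M %p" (illustrative
# value: "06-27-2023, 10:30 PM"): for "Today"/"Yesterday" the date comes from the nested
# span's title attribute and the time from the visible text, while "x hours/minutes ago"
# posts take the full timestamp from the title attribute.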
if 'modified' in date_posted: | |||
date_posted = date_posted.split('(')[0].strip() | |||
if 'Today' in date_posted or 'Yesterday' in date_posted: | |||
day = topic.find("span", {"class": "post_date"}).find('span').get('title').strip() | |||
time = date_posted.split(',')[1].strip() | |||
date_posted = day + ', ' + time | |||
date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p") | |||
elif 'hour' in date_posted or 'minute' in date_posted: | |||
date_posted = topic.find("span", {"class": "post_date"}).find('span').get('title').strip() | |||
date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p") | |||
else: | |||
date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p") | |||
addDates.append(date_object) | |||
image_post.append("-1") | |||
avatar = topic.find('div', {"class": "author_avatar"}) | |||
if avatar is not None: | |||
img = avatar.find('img') | |||
if img is not None: | |||
img = img.get('src').split('base64,')[-1] | |||
else: | |||
img = '-1' | |||
else: | |||
img = "-1" | |||
image_user.append(img) | |||
# TESTING PURPOSES - DO NOT REMOVE | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
row = (topicName, users, statuses, reputations, interests, signs, posts, feedbacks, addDates, image_user, image_post) | |||
# Sending the results | |||
return row | |||
def onniForums_listing_parser(soup: BeautifulSoup): | |||
nm = 0 # this variable should receive the number of topics | |||
forum = "OnniForums" # 0 *forum name | |||
boardName = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree. | |||
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) | |||
user: List[str] = [] # 2 all users of each topic | |||
topic : List[str] = [] # 3 all topics | |||
view: List[int] = [] # 4 number of views of each topic | |||
post : List[int] = [] # 5 number of posts of each topic | |||
href: List[str] = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between Listing and Description pages) | |||
addDate : List[str] = [] # 7 when the topic was created (difficult to find) | |||
image_author : List[str] = [] # 8 all author avatars used in each topic | |||
# Finding the board (should be just one) | |||
board_metadata: BeautifulSoup = soup.find("table",{"class" : "tborder clear"}) | |||
boardName = board_metadata.find_all("div")[1].text | |||
boardName = cleanString(boardName.strip()) | |||
thread_arrays = board_metadata.find_all("tr", {"class":"inline_row"}) # gets the information of posts | |||
nm = len(thread_arrays) | |||
for thread in thread_arrays: #getting the information from the posts and sorting them into the arrays defined above | |||
body = thread.find("span",{"class": "subject_new"}) | |||
try: | |||
post_subject: str = body.text #getting the topic | |||
except: | |||
body = thread.find("span",{"class": "subject_old"}) | |||
post_subject: str = body.text | |||
post_subject_cleaned = cleanString(post_subject.strip()) | |||
topic.append(post_subject_cleaned) | |||
author_icon = thread.find('div', {"class": "lavatar-old lavatar-old-f"}) | |||
if author_icon is not None:
author_icon = author_icon.find('img') | |||
author_icon = author_icon.get('src') | |||
author_icon = author_icon.split('base64,')[-1] | |||
else: | |||
author_icon = "-1" | |||
image_author.append(author_icon) | |||
reply_count = thread.find_all("td", {"align": "center"})[2].text | |||
post.append(cleanNumbers(reply_count)) | |||
views = thread.find_all("td", {"align": "center"})[3].text | |||
view.append(cleanNumbers(views)) | |||
# dates_added: str = thread.find("span",{"class" : "thread_start_datetime smalltext"}).text | |||
# dates_added_cleaned = dates_added.split(',')[0] | |||
# addDate.append(dates_added_cleaned) | |||
author = thread.find("span",{"class" : "author smalltext"}).text | |||
author_cleaned = cleanString(author.strip()) | |||
user.append(author_cleaned) | |||
thread_link = body.find('a').get('href') | |||
href.append(thread_link) | |||
return organizeTopics( | |||
forum=forum, | |||
nm=nm, | |||
board=boardName, | |||
author=user, | |||
topic=topic, | |||
views=view, | |||
posts=post, | |||
href=href, | |||
addDate=addDate, | |||
image_author=image_author | |||
) | |||
# This is the method to parse the Listing Pages (one page with many posts) | |||
def onniForums_links_parser(soup: BeautifulSoup): | |||
href = [] | |||
listing = soup.find_all('tr', {'class': 'inline_row'}) | |||
for thread in listing: | |||
try: | |||
link = thread.find('span', {"class": "subject_old"}).find('a').get('href') | |||
except: | |||
link = thread.find('span', {"class": "subject_new"}).find('a').get('href') | |||
href.append(link) | |||
return href |
@ -1,57 +0,0 @@ | |||
import os | |||
from Forums.OnniForums.parser import onniForums_description_parser | |||
from Forums.OnniForums.parser import onniForums_listing_parser | |||
from bs4 import BeautifulSoup | |||
baseUrl = './HTML_Pages/06272023/Listing/httponnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qdonionForumCardingFraud.html' | |||
with open(baseUrl, 'r') as file: | |||
testHTML = file.read() | |||
soup = BeautifulSoup(testHTML, 'html.parser') | |||
output = onniForums_listing_parser(soup) | |||
print(output) | |||
all_descriptions = os.listdir("./HTML_Pages/06272023/Description/")[1:] | |||
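# Note: the "[1:]" slice appears intended to skip a hidden/system entry; os.listdir order
# is arbitrary, so this may drop a real description page on some systems.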
total = len(all_descriptions) | |||
descriptions_with_unicode_error = 0 | |||
print("\nTESTING DESCRIPTION PARSER:\n") | |||
for desc in all_descriptions: | |||
print(f"\nTesting: ./HTML_Pages/06272023/Description/{desc} \n") | |||
try: | |||
with open(f"./HTML_Pages/06272023/Description/{desc}", "r") as file: | |||
test_html = file.read() | |||
soup = BeautifulSoup(test_html, features="html.parser") | |||
description_output = onniForums_description_parser(soup) | |||
print(f"\nTopic name : {description_output[0]}") | |||
print(f"Contents : {description_output[1]}") | |||
print(f"Users : {description_output[2]}") | |||
print(f"Dates posted: {description_output[3]}") | |||
print(f"Feedbacks : {description_output[4]}") | |||
print(f"Statuses : {description_output[5]}") | |||
print(f"Reputations : {description_output[6]}") | |||
print(f"Signatures : {description_output[7]}") | |||
print(f"Interests : {description_output[8]}\n") | |||
except UnicodeDecodeError: | |||
descriptions_with_unicode_error += 1 | |||
print(f"UnicodeDecodeError: the file `{desc}` cannot be decoded by Python!") | |||
print("\nTESTING COMPLETE\n") | |||
print(f"Number of descriptions : {total}") | |||
print(f"Descriptions w/ errors : {descriptions_with_unicode_error}") | |||
print(f"Failure percentage : {round(descriptions_with_unicode_error/total, 4) * 100}%\n") | |||
@ -1,321 +0,0 @@ | |||
__author__ = 'Helium' | |||
''' | |||
Procrax Forum Crawler (Selenium) | |||
rechecked and confirmed | |||
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.common.by import By | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, re, time | |||
from datetime import date | |||
import configparser | |||
import subprocess | |||
from bs4 import BeautifulSoup | |||
from Forums.Initialization.prepare_parser import new_parse | |||
from Forums.Procrax.parser import procrax_links_parser | |||
from Forums.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
BASE_URL = 'https://procrax.cx/' | |||
FORUM_NAME = 'Procrax' | |||
# Opens Tor Browser, crawls the website | |||
def startCrawling(): | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
login(driver) | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
new_parse(forum=FORUM_NAME, url=BASE_URL, createLog=True) | |||
# Login using premade account credentials and do login captcha manually | |||
def login(driver): | |||
WebDriverWait(driver, 50).until(EC.visibility_of_element_located( | |||
(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div[3]/div[2]/div[1]/form/div/div/div/dl[4]/dd/div/div[2]/button/span'))) | |||
#entering username and password into input boxes | |||
usernameBox = driver.find_element(by=By.NAME, value='login') | |||
#Username here | |||
usernameBox.send_keys('cheese_pizza_man')#sends string to the username box | |||
passwordBox = driver.find_element(by=By.NAME, value='password') | |||
#Password here | |||
passwordBox.send_keys('Gr33nSp@m&3ggs')# sends string to passwordBox | |||
clicker = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div[2]/div[1]/form/div/div/div/dl[4]/dd/div/div[2]/button/span') | |||
clicker.click() | |||
# wait for the listing page to show up (this XPath may need to change based on the seed url)
# wait up to 50 seconds until the target element is visible, then continue
WebDriverWait(driver, 50).until(EC.visibility_of_element_located( | |||
(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div[3]/div[1]/div/div[1]/div'))) | |||
# Returns the name of the website | |||
def getForumName(): | |||
name = 'Procrax' | |||
return name | |||
# Return the link of the website | |||
def getFixedURL(): | |||
url = 'https://procrax.cx/' | |||
return url | |||
# Closes Tor Browser | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() #close tab | |||
time.sleep(3) | |||
return | |||
# Creates the Firefox 'driver' and configures its 'Profile'
# to use the Tor proxy and socket
def createFFDriver(): | |||
from Forums.Initialization.forums_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", True) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
def getAccess(): | |||
driver = createFFDriver() | |||
try: | |||
driver.get(BASE_URL)# open url in browser | |||
return driver | |||
except: | |||
driver.close()# close tab | |||
return 'down' | |||
# Saves the crawled html page | |||
def savePage(driver, page, url): | |||
cleanPage = cleanHTML(driver, page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
def getFullPathName(url): | |||
from Forums.Initialization.forums_mining import config, CURRENT_DATE | |||
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + FORUM_NAME + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
return fullPath | |||
# Creates the file name from passed URL | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if (name == ''): | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
def getInterestedLinks(): | |||
links = [] | |||
# verified sales | |||
links.append('https://procrax.cx/forums/verified-sales-market.10/') | |||
# unverified sales | |||
links.append('https://procrax.cx/forums/unverified-sales-market.12/') | |||
# combos | |||
links.append('https://procrax.cx/forums/bases.79/') | |||
# tools | |||
links.append('https://procrax.cx/forums/tools.81/') | |||
# configs | |||
links.append('https://procrax.cx/forums/configs.82/') | |||
# craxtube | |||
links.append('https://procrax.cx/forums/craxtube.83/') | |||
# general hacking | |||
links.append('https://procrax.cx/forums/general-hacking.24/') | |||
# hacking security tools | |||
links.append('https://procrax.cx/forums/hacking-security-tools.20/') | |||
# hacktube | |||
links.append('https://procrax.cx/forums/hacktube.22/') | |||
# cardingtube | |||
links.append('https://procrax.cx/forums/cardingtube.26/') | |||
# cardable | |||
links.append('https://procrax.cx/forums/cardable-websites.28/') | |||
# spam software | |||
links.append('https://procrax.cx/forums/mailing.72/') | |||
# spam tools | |||
links.append('https://procrax.cx/forums/tools-bots-validators.73/') | |||
# darknet news | |||
links.append('https://procrax.cx/forums/darknet-news-articles.42/') | |||
# links | |||
links.append('https://procrax.cx/forums/darknet-markets-deep-onion-links.43/') | |||
# courses | |||
links.append('https://procrax.cx/forums/courses.59/') | |||
# software | |||
links.append('https://procrax.cx/forums/software.76/') | |||
# general forum | |||
links.append('https://procrax.cx/forums/forum-discussions-updates.7/') | |||
return links | |||
def crawlForum(driver): | |||
print("Crawling the Procrax forum") | |||
linksToCrawl = getInterestedLinks() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
topics = topicPages(html) | |||
for topic in topics: | |||
has_next_topic_page = True | |||
counter = 1 | |||
page = topic | |||
while has_next_topic_page: | |||
itemURL = urlparse.urljoin(BASE_URL, str(page)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
if isListingLink(driver.current_url): | |||
break | |||
savePage(driver, driver.page_source, topic + f"page{counter}") # very important | |||
# # comment out | |||
# if counter == 2: | |||
# break | |||
try: | |||
page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') | |||
if page == "": | |||
raise NoSuchElementException | |||
counter += 1 | |||
except NoSuchElementException: | |||
has_next_topic_page = False | |||
# making sure we go back to the listing page (browser back button simulation) | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
# # comment out | |||
# break | |||
# | |||
# # comment out | |||
# if count == 1: | |||
# break | |||
try: | |||
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') | |||
if link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
print("Crawling the Procrax forum done.") | |||
# Returns True if the link is a topic link; may need to change for every website
def isDescriptionLink(url): | |||
if 'threads' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listing page link; may need to change for every website
def isListingLink(url): | |||
if '.cx/forums' in url: | |||
return True | |||
return False | |||
# calling the parser to define the links | |||
def topicPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
#print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) | |||
return procrax_links_parser(soup) | |||
def crawler(): | |||
startCrawling() | |||
# print("Crawling and Parsing BestCardingWorld .... DONE!") |
@ -1,189 +0,0 @@ | |||
__author__ = 'Helium' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from Forums.Utilities.utilities import * | |||
from datetime import date | |||
from datetime import timedelta | |||
import re | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup, ResultSet, Tag | |||
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) | |||
def procrax_description_parser(soup: Tag): | |||
# Fields to be parsed | |||
topic = "-1" # 0 topic name | |||
user = [] # 1 all users of each post | |||
addDate = [] # 2 all dated of each post | |||
feedback = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format) | |||
status = [] # 4 all user's authority in each post such as (adm, member, dangerous) | |||
reputation = [] # 5 all user's karma in each post (usually found as a number) | |||
sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post) | |||
post = [] # 7 all messages of each post | |||
interest = [] # 8 all user's interest in each post | |||
image_user = [] # 9 all user avatars of each post | |||
image_post = [] # 10 all first images of each post | |||
# Finding the topic (should be just one coming from the Listing Page) | |||
li = soup.find("h1", {"class": "p-title-value"}) | |||
topic = li.text | |||
thread: ResultSet[Tag] = soup.find("div", {"class": "block-body js-replyNewMessageContainer"}).find_all("article", {"data-author": True}) | |||
for ipost in thread: | |||
username = ipost.find("h4", {"class": "message-name"}).text | |||
user.append(cleanString(username.strip())) | |||
date_posted = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime") | |||
datetime_obj = datetime.strptime(date_posted, "%Y-%m-%dT%H:%M:%S%z") | |||
addDate.append(datetime_obj) | |||
feedback.append("-1") | |||
user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text | |||
status.append(cleanString(user_status.strip())) | |||
user_lvl = ipost.find("div", {"class": "afAwardLevel"}) | |||
if user_lvl is not None: | |||
user_lvl = user_lvl.text | |||
reputation.append(cleanString(user_lvl.strip())) | |||
else: | |||
reputation.append('-1') | |||
sign.append("-1") | |||
user_post = ipost.find("article", {"class": "message-body js-selectToQuote"}).text | |||
post.append(cleanString(user_post.strip())) | |||
interest.append("-1") | |||
bbWrapper = ipost.find('div', {"class": "bbWrapper"}) | |||
if bbWrapper is not None: | |||
img = bbWrapper.find('img') | |||
if img is not None: | |||
img = img.get('src').split('base64,')[-1] | |||
else: | |||
img = "-1" | |||
else: | |||
img = "-1" | |||
image_post.append(img) | |||
avatar = ipost.find("a", {"class": "avatar avatar--m"}) | |||
if avatar is not None: | |||
img = avatar.find('img') | |||
if img is not None: | |||
img = img.get('src').split('base64,')[-1] | |||
else: | |||
img = "-1" | |||
else: | |||
img = "-1" | |||
image_user.append(img) | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) | |||
# Sending the results | |||
return row | |||
# This is the method to parse the Listing Pages (one page with many posts) | |||
def procrax_listing_parser(soup: Tag): | |||
nm = 0 # this variable should receive the number of topics | |||
forum: str = "Procrax" # 0 *forum name | |||
board = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree. | |||
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) | |||
author = [] # 2 all authors of each topic | |||
topic = [] # 3 all topics | |||
views = [] # 4 number of views of each topic | |||
posts = [] # 5 number of posts of each topic | |||
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find) | |||
image_author = [] # 8 all author avatars used in each topic | |||
# Finding the board (should be just one) | |||
li = soup.find("h1", {"class": "p-title-value"}) | |||
board = cleanString(li.text.strip()) | |||
threads_list = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True}) | |||
sticky = soup.find("div", {"class": "structItemContainer-group structItemContainer-group--sticky"}) | |||
if sticky is not None: | |||
threads_list = sticky.find_all("div", {"data-author": True}) + threads_list | |||
nm = len(threads_list) | |||
for thread in threads_list: | |||
thread_title = thread.find("div", {"class": "structItem-title"}).text | |||
topic.append(cleanString(thread_title.strip())) | |||
author_icon = thread.find('a', {"class": "avatar avatar--s"}) | |||
if author_icon is not None:
author_icon = author_icon.find('img') | |||
if author_icon is not None:
author_icon = author_icon.get('src') | |||
author_icon = author_icon.split('base64,')[-1] | |||
else: | |||
author_icon = "-1" | |||
else: | |||
author_icon = "-1" | |||
image_author.append(author_icon) | |||
thread_author = thread.get("data-author") | |||
author.append(cleanString(thread_author)) | |||
thread_views = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find('dd').text | |||
thread_views = thread_views.lower().replace("k", "000") | |||
thread_views = thread_views.lower().replace("m", "000000") | |||
views.append(thread_views.strip()) | |||
thread_replies = thread.find("dl", {"class": "pairs pairs--justified"}).find('dd').text | |||
# All threads contain one topic post and reply posts | |||
thread_total_posts = thread_replies.lower().replace("k", "000") | |||
posts.append(thread_total_posts.strip()) | |||
thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime") | |||
datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z") | |||
addDate.append(datetime_obj) | |||
thread_link: str = thread.find("div", {"class": "structItem-title"}).find('a', {'class': ''}).get('href') | |||
href.append(thread_link) | |||
return organizeTopics( | |||
forum=forum, | |||
nm=nm, | |||
board=board, | |||
author=author, | |||
topic=topic, | |||
views=views, | |||
posts=posts, | |||
addDate=addDate, | |||
href=href, | |||
image_author=image_author | |||
) | |||
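# The "k"/"m" substitutions in procrax_listing_parser turn a count like "1.2K" into "1.2000".
# A more robust conversion could look like the sketch below (an illustrative helper, not wired
# into the parser above; it assumes counts formatted like "87", "1.2K" or "3M").
def convert_abbreviated_count(value: str) -> str:
    value = value.strip().lower().replace(",", "")
    multiplier = 1
    if value.endswith("k"):
        multiplier, value = 1_000, value[:-1]
    elif value.endswith("m"):
        multiplier, value = 1_000_000, value[:-1]
    try:
        return str(int(float(value) * multiplier))
    except ValueError:
        return "-1"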
def procrax_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.find_all('div', {"class": "structItem-title"}) | |||
for a in listing: | |||
link = a.find('a', {'class': ''}).get('href') | |||
href.append(link) | |||
return href |
@ -1,293 +0,0 @@ | |||
__author__ = 'Helium' | |||
''' | |||
Anon Market Crawler (Selenium) | |||
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.common.by import By | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, re, time | |||
from datetime import date | |||
import subprocess | |||
import configparser | |||
from bs4 import BeautifulSoup | |||
from MarketPlaces.Initialization.prepare_parser import new_parse | |||
from MarketPlaces.AnonMarket.parser import AnonMarket_links_parser | |||
from MarketPlaces.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion' | |||
# Opens Tor Browser, crawls the website, then parses, then closes tor | |||
# Acts as the main method for the crawler; the crawler() function at the end of this file calls it
def startCrawling(): | |||
mktName = getMKTName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
new_parse(mktName, baseURL, True) | |||
# Returns the name of the website | |||
#return: name of site in string type | |||
def getMKTName(): | |||
name = 'AnonMarket' | |||
return name | |||
# Return the base link of the website | |||
#return: url of base site in string type | |||
def getFixedURL(): | |||
url = 'http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion' | |||
return url | |||
# Closes Tor Browser | |||
#@param: current selenium driver | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() | |||
time.sleep(3) | |||
return | |||
# Creates FireFox 'driver' and configure its 'Profile' | |||
# to use Tor proxy and socket | |||
def createFFDriver(): | |||
from MarketPlaces.Initialization.markets_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", False) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
# The driver 'gets' the url, attempting to access the site; if it can't, it returns the string 'down'
#return: the selenium driver, or the string 'down' if the site is unreachable
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url) | |||
return driver | |||
except: | |||
driver.close() | |||
return 'down' | |||
def savePage(driver, page, url): | |||
cleanPage = cleanHTML(driver, page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
#@param: raw url as crawler crawls through every site | |||
def getFullPathName(url): | |||
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE | |||
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
return fullPath | |||
# Creates the file name from the passed URL; falls back to a global counter when the cleaned URL yields an empty name
#@param: raw url as crawler crawls through every site | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if (name == ''): | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
# Returns the list of category urls of interest; the crawler runs through this list.
# For AnonMarket these are product categories such as Malware, Bootkits, Backdoors,
# Keyloggers, Carding, Botnets, Trojans, Exploit kits and Ransomware.
def getInterestedLinks(): | |||
links = [] | |||
# Malware | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/malware') | |||
# Bootkits | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/bootkits') | |||
# Backdoors | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/backdoors') | |||
# Keyloggers | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/keyloggers') | |||
# Wireless Trackers | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wireless_trackers') | |||
# Screen Scrapers | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/screen_scrapers') | |||
# Mobile Forensic Tools | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/mobile_forensics_tools') | |||
# Wifi Jammers | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wifi_jammers') | |||
# Carding | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/carding') | |||
# Worms | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/worms') | |||
# Viruses | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/viruses') | |||
# Trojans | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/trojans') | |||
# Botnets | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/botnets') | |||
# Security Technology | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security_technology') | |||
# Hacks | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/hacks') | |||
# Exploit kits | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/exploit_kit') | |||
# Security | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security') | |||
# Ransomware | |||
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/ransomware') | |||
return links | |||
# Iterates through the links of interest; each listing page is fetched and crawled through.
# Listing and description pages are both saved here.
#@param: selenium driver
def crawlForum(driver): | |||
print("Crawling the Anon Market") | |||
linksToCrawl = getInterestedLinks() | |||
for link in linksToCrawl: | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
# Get all product links on the current page | |||
products_list = productPages(html) | |||
for item in products_list: | |||
itemURL = urlparse.urljoin(baseURL, str(item)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
savePage(driver, driver.page_source, item) | |||
driver.back() # Go back to listing after visiting each product | |||
# # comment out | |||
# break | |||
# | |||
# # comment out | |||
# if count == 1: | |||
# break | |||
# Locate the next page link | |||
try: | |||
# Find the active page number | |||
active_page_element = driver.find_element(By.XPATH, '//div[@class="page activepage"]') | |||
# current_page = int(active_page_element.text) | |||
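# The next listing page is assumed to be the first <a> sibling that follows the active page
# marker; when no such sibling exists, NoSuchElementException ends the pagination loop.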
next_page_element = active_page_element.find_element(By.XPATH, 'following-sibling::a[1]') | |||
link = next_page_element.get_attribute('href') | |||
if link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
print("Crawling the Anon Market done.") | |||
# Returns 'True' if the link is a description link | |||
#@param: url of any url crawled | |||
#return: true if is a description page, false if not | |||
def isDescriptionLink(url): | |||
if 'product' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link | |||
#@param: url of any url crawled | |||
#return: true if is a Listing page, false if not | |||
def isListingLink(url): | |||
if 'category' in url: | |||
return True | |||
return False | |||
# Calls the links parser on the page source of a listing page to extract the description links to crawl
#@param: html - page source of a listing page from the interested link list ie. getInterestedLinks()
#return: list of description links that should be crawled through
def productPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
return AnonMarket_links_parser(soup) | |||
def crawler(): | |||
startCrawling() | |||
# print("Crawling and Parsing Nexus .... DONE!") | |||
@ -1,195 +0,0 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
import re | |||
# Parses a description page: takes the soup object of a description page and extracts the fields it needs.
# The fields are collected into variables and returned organized in a 'row'.
#@param: soup object looking at html page of description page | |||
#return: 'row' that contains a variety of lists that each hold info on the description page | |||
def AnonMarket_description_parser(soup): | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
image = "-1" # 19 Product_Image | |||
vendor_image = "-1" # 20 Vendor_Image | |||
name_of_product = soup.find("div", {"class": "heading"}).text | |||
name = cleanString(name_of_product.strip()) | |||
description_div = soup.find("div", {"class": "tab1"}) | |||
if description_div is None: | |||
describe = "-1" | |||
else: | |||
describe = cleanString(description_div.text.strip()) | |||
info_div = soup.find('div', {'class': 'information'})
table = info_div.find('table') if info_div else None
# Find all table rows (an empty list if the information table is missing)
rows = table.find_all('tr') if table else []
# Parse each row to get relevant data | |||
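# Each information row appears to hold three cells (label, separator, value);
# only the label and value cells are kept in the dictionary below.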
data = {} | |||
for row in rows: | |||
columns = row.find_all('td') | |||
if len(columns) == 3: | |||
key = columns[0].text.strip() | |||
value = columns[2].text.strip() | |||
data[key] = value | |||
# Extract specific data from the dictionary and assign them to individual variables | |||
vendor = data.get('Vendor', '-1') | |||
shipFrom = data.get('Location', '-1') | |||
shipTo = data.get('Ships to', '-1') | |||
category = data.get('Category', '-1') | |||
USD = data.get('Price', '-1').split()[0] | |||
left = data.get('Stock', '-1') | |||
# image | |||
image = soup.find('img', {"class": "bigthumbnail"}) | |||
image = image.get('src').split('base64,')[-1] | |||
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) | |||
# Sending the results | |||
return row | |||
# Parses a listing page: takes the soup object of a listing page and extracts the fields it needs.
# The fields are collected into lists and returned organized via organizeProducts.
#@param: soup object looking at html page of listing page | |||
#return: 'row' that contains a variety of lists that each hold info on the listing page | |||
def AnonMarket_listing_parser(soup): | |||
# Fields to be parsed | |||
nm = 0 # *Total_Products (Should be Integer) | |||
mktName = "AnonMarket" # 0 *Marketplace_Name | |||
vendor = [] # 1 *Vendor y | |||
rating_vendor = [] # 2 Vendor_Rating | |||
success = [] # 3 Vendor_Successful_Transactions | |||
name = [] # 4 *Product_Name y | |||
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this | |||
MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this | |||
category = [] # 7 Product_Category y | |||
describe = [] # 8 Product_Description | |||
views = [] # 9 Product_Number_Of_Views | |||
reviews = [] # 10 Product_Number_Of_Reviews | |||
rating_item = [] # 11 Product_Rating | |||
addDate = [] # 12 Product_AddDate | |||
BTC = [] # 13 Product_BTC_SellingPrice | |||
USD = [] # 14 Product_USD_SellingPrice y | |||
EURO = [] # 15 Product_EURO_SellingPrice | |||
sold = [] # 16 Product_QuantitySold | |||
qLeft = [] # 17 Product_QuantityLeft | |||
shipFrom = [] # 18 Product_ShippedFrom | |||
shipTo = [] # 19 Product_ShippedTo | |||
image = [] # 20 Product_Image | |||
image_vendor = [] # 21 Vendor_Image | |||
href = [] # 22 Product_Links | |||
base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion" | |||
cat = soup.find("div", {'class': 'heading'}).text | |||
products_list = soup.find_all('div', {'class': 'item'}) | |||
nm = 0 | |||
for product in products_list: | |||
name_of_product = product.find("div", {"class": "title"}).text.strip() | |||
name.append(name_of_product) | |||
name_of_vendor = product.find("a", {'class': 'seller'}).text.strip() | |||
vendor.append(name_of_vendor) | |||
category.append(cat) | |||
tbody = product.find('div', {"class": "info"}).find('tbody') | |||
# rating_item | |||
width = tbody.find('div', {"class": "stars2"}).get('style') | |||
rating_item.append(cleanNumbers(width.strip())) | |||
tr = tbody.findAll('tr', recursive=False) | |||
td = tr[2].findAll('td') | |||
# sold | |||
sold.append(td[0].text.strip()) | |||
# reviews | |||
reviews.append(td[1].text.strip()) | |||
product_link_element = product.find("div", {"class": "title"}).find_parent('a') | |||
link = product_link_element['href'] | |||
full_link = base_url + link | |||
href.append(full_link) | |||
# Append '-1' for unavailable data | |||
rating_vendor.append("-1") | |||
success.append("-1") | |||
CVE.append("-1") | |||
MS.append("-1") | |||
describe.append("-1") | |||
views.append("-1") | |||
addDate.append("-1") | |||
BTC.append("-1") | |||
USD.append("-1") | |||
EURO.append("-1") | |||
qLeft.append("-1") | |||
shipFrom.append("-1") | |||
shipTo.append("-1") | |||
nm += 1 | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, | |||
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) | |||
#called by the crawler to get description links on a listing page | |||
#@param: beautifulsoup object that is using the correct html page (listing page) | |||
#return: list of description links from a listing page | |||
def AnonMarket_links_parser(soup): | |||
# Base URL to prepend to each product link | |||
base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion" | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
# Using a shorter, but still unique, class name | |||
listing = soup.find('div', {'class': 'items'}).find_all('a', href=True, attrs={'href': lambda x: "/product/" in x}) | |||
for a in listing: | |||
link = a.get('href') | |||
if link: # Checks if 'href' attribute is not None | |||
# Prepending the base URL to the scraped link | |||
full_link = base_url + link | |||
href.append(full_link) | |||
# Filtering out any links that might not have '/product/' in them | |||
product_links = [link for link in href if '/product/' in link] | |||
return product_links |
@ -1,226 +0,0 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup, ResultSet, Tag | |||
def apocalypse_description_parser(soup: Tag): | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
image = "-1" # 19 Product_Image | |||
vendor_image = "-1" # 20 Vendor_Image | |||
content: Tag = soup.find("div", {'id': "article_page"}) | |||
product_name = content.find("p", {"class": "list-group-item text-center mb-0 box"}).text | |||
name = cleanString(product_name.strip()) | |||
product_description = content.find("pre").text | |||
describe = cleanString(product_description.strip()) | |||
# Finding Product Image | |||
image = soup.find('div', {'class': 'col-md-7 text-center'}).find('img') | |||
image = image.get('src').split('base64,')[-1] | |||
product_reviews_list: ResultSet[Tag] = content.find("table", {"class": "table product_reviews"}) \
    .find_all("li")
reviews = str(len(product_reviews_list))
product_category = content.find("a", {"class": "badge badge-danger"}).text | |||
category = cleanString(product_category.strip()) | |||
product_ships_from = content.find("span", {"class": "badge badge-info"}).text | |||
shipFrom = cleanString(product_ships_from.strip()) | |||
product_success_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-success"}) | |||
product_ships_to = product_success_badge[1].text | |||
shipTo = cleanString(product_ships_to.strip()) | |||
product_supply = content.find("span", {"class": "badge badge-warning"}).text | |||
left = cleanString(product_supply.strip()) | |||
product_primary_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-primary"}) | |||
# Product vendor comes in the form of "@ vendor_name" | |||
product_vendor = product_primary_badge[0].text.replace("@", "") | |||
vendor = cleanString(product_vendor.strip()) | |||
sold = cleanString(product_primary_badge[1].text.strip()) | |||
product_prices: Tag = content.find("p", {"style": "border-bottom:1px solid GREY;"}) | |||
USD = product_prices.find("span", {"class": "pr"}).text | |||
prices_array: ResultSet[Tag] = product_prices.find_all("span", {"class": "pr1"}) | |||
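# prices_array holds the alternative-currency figures; index 1 is assumed to be the BTC price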
BTC = prices_array[1].text | |||
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) | |||
# Sending the results | |||
return row | |||
def apocalypse_listing_parser(soup: Tag): | |||
# Fields to be parsed | |||
nm = 0 # Total_Products (Should be Integer) | |||
mktName = "Apocalypse" # 0 Marketplace_Name | |||
name = [] # 1 Product_Name | |||
CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = [] # 3 Product_MS_Classification (Microsoft Security) | |||
category = [] # 4 Product_Category | |||
describe = [] # 5 Product_Description | |||
escrow = [] # 6 Vendor_Warranty | |||
views = [] # 7 Product_Number_Of_Views | |||
reviews = [] # 8 Product_Number_Of_Reviews | |||
addDate = [] # 9 Product_AddDate | |||
lastSeen = [] # 10 Product_LastViewDate | |||
BTC = [] # 11 Product_BTC_SellingPrice | |||
USD = [] # 12 Product_USD_SellingPrice | |||
EURO = [] # 13 Product_EURO_SellingPrice | |||
sold = [] # 14 Product_QuantitySold | |||
qLeft =[] # 15 Product_QuantityLeft | |||
shipFrom = [] # 16 Product_ShippedFrom | |||
shipTo = [] # 17 Product_ShippedTo | |||
vendor = [] # 18 Vendor | |||
rating = [] # 19 Vendor_Rating | |||
success = [] # 20 Vendor_Successful_Transactions | |||
image = [] # 20 Product_Image | |||
image_vendor = [] # 21 Vendor_Image | |||
href = [] # 22 Product_Links | |||
table = soup.find("div", {"class": "col-lg-9 my-4"}) | |||
if table is None: | |||
table = soup.find("div", {"class": "col-lg-9"}) | |||
listings: ResultSet[Tag] = table.find_all("div", {"class": "col-lg-4 col-md-6 mb-1"}) | |||
for prod in listings: | |||
product_name = prod.find('h5', {"class": "art_title"}).text | |||
name.append(cleanString(product_name.strip())) | |||
# Finding Product Image | |||
product_image = prod.find('img', {'class': 'customHeight'}) | |||
product_image = product_image.get('src').split('base64,')[-1] | |||
image.append(product_image) | |||
CVE.append("-1") | |||
MS.append("-1") | |||
describe.append("-1") | |||
escrow.append("-1") | |||
reviews.append("-1") | |||
addDate.append("-1") | |||
lastSeen.append("-1") | |||
BTC.append("-1") | |||
EURO.append("-1") | |||
shipTo.append("-1") | |||
success.append("-1") | |||
image_vendor.append("-1") | |||
product_price = prod.find("span", {"class": "priceP"}).text | |||
USD.append(cleanString(product_price.strip())) | |||
product_sold = prod.find("span", {"class": "badge badge-success"}).text | |||
sold.append(cleanString(product_sold.strip())) | |||
product_statistics: ResultSet[Tag] = prod.find_all("p", {"class": "mb-0 card-text"}) | |||
product_category = product_statistics[0].find("a").text | |||
category.append(cleanString(product_category.strip())) | |||
product_sold = product_statistics[1].find("span").text | |||
sold.append(cleanString(product_sold.strip())) | |||
product_quantity_left = product_statistics[2].find("span", {"class": "badge bluebadge"}).text | |||
qLeft.append(cleanString(product_quantity_left.strip())) | |||
product_views = product_statistics[3].find("span").text | |||
views.append(cleanString(product_views.strip())) | |||
product_ships_from = product_statistics[4].find("span").text | |||
shipFrom.append(cleanString(product_ships_from.strip())) | |||
product_vendor_tag: Tag = product_statistics[5].find("a").find("span", {"class": "badge badge-primary"}) | |||
# Product vendors & ratings are displayed as "vender_name ★ 5.0" | |||
# When split by the star (★), it should return a 2-value array | |||
product_vendor, product_vendor_rating = product_vendor_tag.text.split("★") | |||
try: | |||
vendor.append(cleanString(product_vendor.strip())) | |||
rating.append(cleanString(product_vendor_rating.strip())) | |||
except Exception as e: | |||
raise e | |||
product_href = prod.find('a').get('href') | |||
href.append(product_href) | |||
nm += 1 | |||
return organizeProducts( | |||
marketplace=mktName, | |||
nm=nm, | |||
vendor=vendor, | |||
rating_vendor=rating, | |||
success_vendor=success, | |||
nombre=name, | |||
CVE=CVE, | |||
MS=MS, | |||
category=category, | |||
describe=describe, | |||
views=views, | |||
reviews=reviews, | |||
rating_item=["-1" for _ in range(nm)], | |||
addDate=addDate, | |||
BTC=BTC, | |||
USD=USD, | |||
EURO=EURO, | |||
sold=sold, | |||
qLeft=qLeft, | |||
shipFrom=shipFrom, | |||
shipTo=shipTo, | |||
href=href, | |||
image=image, | |||
image_vendor=image_vendor | |||
) | |||
#called by the crawler to get description links on a listing page | |||
#@param: beautifulsoup object that is using the correct html page (listing page) | |||
#return: list of description links from a listing page | |||
def apocalypse_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.findAll('div', {"class": "col-lg-4 col-md-6 mb-1"}) | |||
for a in listing: | |||
bae = a.find('a', href=True) | |||
link = bae['href'] | |||
href.append(link) | |||
return href |
@ -0,0 +1,227 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup
import re
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) | |||
def ares_description_parser(soup): | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
# Finding Product Name | |||
name = soup.find('div', {'class': "col-md-12 my-2"}).text | |||
name = name.replace('\n', ' ') | |||
name = name.replace(",", "") | |||
name = name.strip() | |||
bae = soup.find('div', {'class': "col-md-7"}).find('span').find_all('span') | |||
# Finding Vendor | |||
vendor = bae[0].text | |||
vendor = vendor.replace(",", "") | |||
vendor = vendor.replace("...", "") | |||
vendor = vendor.strip() | |||
# Finding Vendor Rating | |||
full_stars = bae[2].find_all('i', {'class': "fas fa-star"}) | |||
half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"}) | |||
rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0) | |||
# Finding Successful Transactions | |||
success = bae[4].text | |||
success = success.replace("Sales ", "") | |||
success = success.strip() | |||
bae = soup.find('span', {'class': "text-left"}).find_all('span') | |||
# Finding Prices | |||
USD = bae[0].text | |||
USD = USD.replace("\n$", "") | |||
USD = USD.strip() | |||
shipping_info = bae[4].text | |||
if "Digital" not in shipping_info: | |||
shipping_info = shipping_info.split(" ") | |||
# Finding Shipment Information (Origin) | |||
shipFrom = shipping_info[0].strip() | |||
# Finding Shipment Information (Destination) | |||
shipTo = shipping_info[1].strip() | |||
bae = soup.find_all('textarea') | |||
# Finding the Product description | |||
describe = bae[0].text | |||
describe = describe.replace("\n", " ") | |||
describe = describe.replace("\r", " ") | |||
describe = describe.strip() | |||
# Finding the Terms and Conditions | |||
terms = bae[1].text | |||
terms = terms.replace("\n", " ") | |||
terms = terms.strip() | |||
''' | |||
# Finding the Number of Product Reviews | |||
tag = soup.findAll(text=re.compile('Reviews')) | |||
for index in tag: | |||
reviews = index | |||
par = reviews.find('(') | |||
if par >=0: | |||
reviews = reviews.replace("Reviews (","") | |||
reviews = reviews.replace(")","") | |||
reviews = reviews.split(",") | |||
review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) | |||
else : | |||
review = "-1" | |||
''' | |||
# Searching for CVE and MS categories | |||
cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if cve: | |||
CVE = " " | |||
for idx in cve: | |||
CVE += (idx) | |||
CVE += " " | |||
CVE = CVE.replace(',', ' ') | |||
CVE = CVE.replace('\n', '') | |||
ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if ms: | |||
MS = " " | |||
for im in ms: | |||
MS += (im) | |||
MS += " " | |||
MS = MS.replace(',', ' ') | |||
MS = MS.replace('\n', '') | |||
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo) | |||
# Sending the results | |||
return row | |||
# This is the method to parse the Listing Pages | |||
def ares_listing_parser(soup): | |||
# Fields to be parsed | |||
nm = 0 # *Total_Products (Should be Integer) | |||
mktName = "Ares" # 0 *Marketplace_Name | |||
vendor = [] # 1 *Vendor | |||
rating_vendor = [] # 2 Vendor_Rating | |||
success = [] # 3 Vendor_Successful_Transactions | |||
name = [] # 4 *Product_Name | |||
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = [] # 6 Product_MS_Classification (Microsoft Security) | |||
category = [] # 7 Product_Category | |||
describe = [] # 8 Product_Description | |||
views = [] # 9 Product_Number_Of_Views | |||
reviews = [] # 10 Product_Number_Of_Reviews | |||
rating_item = [] # 11 Product_Rating | |||
addDate = [] # 12 Product_AddDate | |||
BTC = [] # 13 Product_BTC_SellingPrice | |||
USD = [] # 14 Product_USD_SellingPrice | |||
EURO = [] # 15 Product_EURO_SellingPrice | |||
sold = [] # 16 Product_QuantitySold | |||
qLeft = [] # 17 Product_QuantityLeft | |||
shipFrom = [] # 18 Product_ShippedFrom | |||
shipTo = [] # 19 Product_ShippedTo | |||
href = [] # 20 Product_Links | |||
listing = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"}) | |||
# Populating the Number of Products | |||
nm = len(listing) | |||
for a in listing: | |||
bae = a.findAll('a', href=True) | |||
# Adding the url to the list of urls | |||
link = bae[0].get('href') | |||
link = cleanLink(link) | |||
href.append(link) | |||
# Finding the Vendor | |||
vendor_name = bae[1].text | |||
vendor_name = vendor_name.replace(",", "") | |||
vendor_name = vendor_name.strip() | |||
vendor.append(vendor_name) | |||
# Finding the Product | |||
product = bae[2].find('img').get('alt') | |||
product = product.replace('\n', ' ') | |||
product = product.replace(",", "") | |||
product = product.strip() | |||
name.append(product) | |||
# Searching for CVE and MS categories | |||
cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if not cve: | |||
cveValue="-1" | |||
else: | |||
cee = " " | |||
for idx in cve: | |||
cee += (idx) | |||
cee += " " | |||
cee = cee.replace(',', ' ') | |||
cee = cee.replace('\n', '') | |||
cveValue=cee | |||
CVE.append(cveValue) | |||
ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if not ms: | |||
MSValue="-1" | |||
else: | |||
me = " " | |||
for im in ms: | |||
me += (im) | |||
me += " " | |||
me = me.replace(',', ' ') | |||
me = me.replace('\n', '') | |||
MSValue=me | |||
MS.append(MSValue) | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, | |||
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) | |||
def ares_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.findAll('a', {"class": "btn btn-success w-100 my-1"}) | |||
for a in listing: | |||
link = a['href'] | |||
href.append(link) | |||
return href |
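# Minimal usage sketch (assumed file name; run within the project package, not part of the crawler pipeline):
# parse one saved Ares listing page and collect the description links the crawler would visit next.
if __name__ == "__main__":
    with open("ares_listing.html", "r", encoding="utf-8") as f:
        listing_soup = BeautifulSoup(f.read(), "html.parser")
    print(ares_listing_parser(listing_soup))
    print(ares_links_parser(listing_soup))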
@ -1,262 +0,0 @@ | |||
__author__ = 'DarkWeb' | |||
''' | |||
DarkBazar Marketplace Crawler (Selenium) | |||
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from selenium.webdriver.support.ui import Select | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.common.by import By | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, re, time | |||
from datetime import date | |||
import subprocess | |||
import configparser | |||
from bs4 import BeautifulSoup | |||
from MarketPlaces.Initialization.prepare_parser import new_parse | |||
from MarketPlaces.DarkBazar.parser import darkbazar_links_parser | |||
from MarketPlaces.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/' | |||
def startCrawling(): | |||
mktName = getMKTName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
login(driver) | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
new_parse(mktName, baseURL, True) | |||
# Returns the name of the website | |||
def getMKTName(): | |||
name = 'DarkBazar' | |||
return name | |||
# Return the base link of the website | |||
def getFixedURL(): | |||
url = 'http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/' | |||
return url | |||
# Closes Tor Browser | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() | |||
time.sleep(3) | |||
return | |||
# Creates FireFox 'driver' and configure its 'Profile' | |||
# to use Tor proxy and socket | |||
def createFFDriver(): | |||
from MarketPlaces.Initialization.markets_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
# ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
# ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", False) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
# The driver 'gets' the url, attempting to access the site; if it can't, it returns the string 'down'
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url) | |||
return driver | |||
except: | |||
driver.close() | |||
return 'down' | |||
def login(driver): | |||
input("Press ENTER when CAPTCHA is complete and login page has loaded\n") | |||
# entering username and password into input boxes | |||
usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="username"]') | |||
# Username here | |||
usernameBox.send_keys('aliciamykeys') | |||
passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="password"]') | |||
# Password here | |||
passwordBox.send_keys('aliciawherearemykey$') | |||
# session time | |||
session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[4]/div/div[2]/select')) | |||
session_select.select_by_visible_text('Session 60min') | |||
input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n") | |||
# wait for listing page show up (This Xpath may need to change based on different seed url) | |||
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( | |||
(By.XPATH, '//*[@id="submit"]'))) | |||
def savePage(driver, page, url): | |||
cleanPage = cleanHTML(driver, page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
def getFullPathName(url): | |||
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE | |||
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
return fullPath | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if name == '': | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
def getInterestedLinks(): | |||
links = [] | |||
# Digital Goods | |||
links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=3') | |||
# Services | |||
links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=5') | |||
return links | |||
def crawlForum(driver): | |||
print("Crawling the DarkBazar market") | |||
linksToCrawl = getInterestedLinks() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
list = productPages(html) | |||
for item in list: | |||
itemURL = urlparse.urljoin(baseURL, str(item)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
savePage(driver, driver.page_source, item) | |||
driver.back() | |||
# # comment out | |||
# break | |||
# | |||
# # comment out | |||
# if count == 1: | |||
# break | |||
try: | |||
link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href') | |||
if link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
print("Crawling the DarkBazar market done.") | |||
# Returns 'True' if the link is a description (item) link, may need to change for every website
def isDescriptionLink(url): | |||
if 'item' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link, may need to change for every website | |||
def isListingLink(url): | |||
if 'category=' in url: | |||
return True | |||
return False | |||
def productPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
return darkbazar_links_parser(soup) | |||
def crawler(): | |||
startCrawling() |
@ -1,284 +0,0 @@ | |||
__author__ = 'Helium' | |||
''' | |||
DarkMatter Marketplace Crawler (Selenium) | |||
Crawler works, but it is slow since there is a click-speed check on the site
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.common.by import By | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, re, time | |||
from datetime import date | |||
import subprocess | |||
import configparser | |||
from bs4 import BeautifulSoup | |||
from MarketPlaces.Initialization.prepare_parser import new_parse | |||
from MarketPlaces.DarkMatter.parser import darkmatter_links_parser | |||
from MarketPlaces.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/' | |||
# Opens Tor Browser, crawls the website, then parses, then closes tor | |||
# Acts as the main method for the crawler; the crawler() function at the end of this file calls it
def startCrawling(): | |||
mktName = getMKTName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
login(driver) | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
new_parse(mktName, baseURL, True) | |||
# Returns the name of the website | |||
#return: name of site in string type | |||
def getMKTName(): | |||
name = 'DarkMatter' | |||
return name | |||
# Return the base link of the website | |||
#return: url of base site in string type | |||
def getFixedURL(): | |||
url = 'http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/' | |||
return url | |||
# Closes Tor Browser | |||
#@param: current selenium driver | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() | |||
time.sleep(3) | |||
return | |||
# Creates FireFox 'driver' and configure its 'Profile' | |||
# to use Tor proxy and socket | |||
def createFFDriver(): | |||
from MarketPlaces.Initialization.markets_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
#ff_prof.set_preference("network.dns.disablePrefetch", True)#connection issue | |||
#ff_prof.set_preference("network.http.sendRefererHeader", 0)#connection issue | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", False) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
# The driver 'gets' the url, attempting to access the site; if it can't, it returns the string 'down'
#return: the selenium driver, or the string 'down' if the site is unreachable
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url) | |||
return driver | |||
except: | |||
driver.close() | |||
return 'down' | |||
# Manual captcha solver: waits for the user to solve the CAPTCHA in the browser window
# and press ENTER in the terminal so the crawler can continue once the page has loaded
#@param: current selenium web driver | |||
def login(driver): | |||
input("Press ENTER when CAPTCHA is completed and page is loaded\n") | |||
# wait for page to show up (This Xpath may need to change based on different seed url) | |||
# Saves the crawled html page, makes the directory path for html pages if not made | |||
def savePage(driver, page, url): | |||
cleanPage = cleanHTML(driver, page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
#@param: raw url as crawler crawls through every site | |||
def getFullPathName(url): | |||
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE | |||
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
return fullPath | |||
# Creates the file name from the passed URL; falls back to a global counter when the cleaned URL yields an empty name
#@param: raw url as crawler crawls through every site | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if (name == ''): | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
# Returns the list of category urls of interest; the crawler runs through this list.
# For DarkMatter these are product categories such as digital fraud software, hack
# guides, services, and software/malware.
def getInterestedLinks(): | |||
links = [] | |||
# digital fraud software | |||
links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=76') | |||
# legit | |||
links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=78') | |||
# hack guides | |||
links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=94') | |||
# services | |||
links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=117') | |||
# software/malware | |||
links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=121') | |||
return links | |||
# Iterates through the links of interest; each listing page is fetched and crawled through.
# Listing and description pages are both saved here.
#@param: selenium driver
def crawlForum(driver): | |||
print("Crawling the DarkMatter market") | |||
linksToCrawl = getInterestedLinks() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
list = productPages(html) | |||
for item in list: | |||
itemURL = urlparse.urljoin(baseURL, str(item)) | |||
try: | |||
time.sleep(3) # to keep from detecting click speed | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
savePage(driver, driver.page_source, item) | |||
time.sleep(3) # to keep from detecting click speed | |||
driver.back() | |||
# # comment out | |||
# break | |||
# | |||
# # comment out | |||
# if count == 1: | |||
# break | |||
try: | |||
link = driver.find_element(by=By.LINK_TEXT, value=">").get_attribute('href') | |||
if link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
print("Crawling the DarkMatter market done.") | |||
# Returns 'True' if the link is a description link | |||
#@param: url of any url crawled | |||
#return: true if is a description page, false if not | |||
def isDescriptionLink(url): | |||
if 'products/' in url and '/products/?category' not in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link | |||
#@param: url of any url crawled | |||
#return: true if is a Listing page, false if not | |||
def isListingLink(url): | |||
if '?category' in url: | |||
return True | |||
return False | |||
# Calls the links parser on the page source of a listing page to extract the description links to crawl
#@param: html - page source of a listing page from the interested link list ie. getInterestedLinks()
#return: list of description links that should be crawled through
def productPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
return darkmatter_links_parser(soup) | |||
# Drop links that "signout" | |||
# def isSignOut(url): | |||
# #absURL = urlparse.urljoin(url.base_url, url.url) | |||
# if 'signout' in url.lower() or 'logout' in url.lower(): | |||
# return True | |||
# | |||
# return False | |||
def crawler(): | |||
startCrawling() | |||
# print("Crawling and Parsing BestCardingWorld .... DONE!") |
@ -1,261 +0,0 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs | |||
#stores info it needs in different lists, these lists are returned after being organized | |||
#@param: soup object looking at html page of description page | |||
#return: 'row' that contains a variety of lists that each hold info on the description page | |||
def darkmatter_description_parser(soup): | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
image = "-1" # 19 Product_Image | |||
vendor_image = "-1" # 20 Vendor_Image | |||
# 0 *Vendor_Name | |||
try: | |||
temp = soup.find('table', {'class', 'vtable'}) | |||
temp = temp.findAll('tr') | |||
temp2 = temp[3].find('a').text | |||
vendor = cleanString(temp2.strip()) | |||
except: | |||
temp = soup.find('table', {'class', 'vtable'}) | |||
temp = temp.findAll('tr') | |||
temp2 = temp[4].find('a').text | |||
vendor = cleanString(temp2.strip()) | |||
# product name | |||
name = soup.find('div', {'class', 'title-h2'}).text | |||
name = cleanString(name.strip()) | |||
#product description | |||
temp = soup.find('pre', {'class', 'description'}).text | |||
temp = temp.replace('\n', ' ') | |||
describe = cleanString(temp.strip()) | |||
#product category | |||
try: | |||
temp = soup.find('table', {'class', 'vtable'}) | |||
temp = temp.findAll('tr') | |||
temp2 = temp[4].find('th').text | |||
temp2 = cleanString(temp2) | |||
if (temp2 == "Category"): | |||
temp2 = temp[4].find('a').text | |||
category = cleanString(temp2.strip()) | |||
except: | |||
temp = soup.find('table', {'class', 'vtable'}) | |||
temp = temp.findAll('tr') | |||
temp2 = temp[5].find('th').text | |||
        temp2 = cleanString(temp2.strip()) | |||
if (temp2 == "Category"): | |||
temp2 = temp[5].find('a').text | |||
category = cleanString(temp2.strip()) | |||
# usd | |||
temp = soup.find('table', {'class', 'vtable'}) | |||
temp = temp.findAll('tr') | |||
temp2 = temp[1].find('td').text | |||
temp2 = temp2.replace(' USD', '') | |||
USD = cleanString(temp2) | |||
# 15 Product_QuantitySold | |||
temp = soup.find('table', {'class', 'vtable'}) | |||
temp = temp.findAll('tr') | |||
temp2 = temp[5].find('th').text | |||
temp2 = cleanString(temp2) | |||
temp3 = temp[6].find('th').text | |||
temp3 = cleanString(temp3) | |||
if (temp2 == "Sold"): | |||
temp2 = temp[5].find('td').text | |||
sold = cleanString(temp2.strip()) | |||
elif (temp3 == "Sold"): | |||
temp2 = temp[6].find('td').text | |||
sold = cleanString(temp2.strip()) | |||
# Finding Product Image | |||
image = soup.find('td', {"class": "vtop"}).find('img') | |||
if image is not None: | |||
image = image.get('src').split('base64,')[-1] | |||
else: | |||
image = '-1' | |||
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) | |||
# Sending the results | |||
return row | |||
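# A minimal usage sketch, not part of the original pipeline: the parser expects a soup object | |||
# built from a saved DarkMatter description page (the file name below is hypothetical). | |||
def _example_parse_darkmatter_description(html_path='darkmatter_description.html'): | |||
    with open(html_path, encoding='utf-8') as f: | |||
        soup = BeautifulSoup(f.read(), 'html.parser') | |||
    return darkmatter_description_parser(soup) | |||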
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs | |||
#stores info it needs in different lists, these lists are returned after being organized | |||
#@param: soup object looking at html page of listing page | |||
#return: 'row' that contains a variety of lists that each hold info on the listing page | |||
def darkmatter_listing_parser(soup): | |||
# Fields to be parsed | |||
nm = 0 # *Total_Products (Should be Integer) | |||
mktName = "DarkMatter" # 0 *Marketplace_Name | |||
vendor = [] # 1 *Vendor y | |||
rating = [] # 2 Vendor_Rating | |||
success = [] # 3 Vendor_Successful_Transactions | |||
name = [] # 4 *Product_Name y | |||
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = [] # 6 Product_MS_Classification (Microsoft Security) | |||
category = [] # 7 Product_Category y | |||
describe = [] # 8 Product_Description | |||
views = [] # 9 Product_Number_Of_Views | |||
reviews = [] # 10 Product_Number_Of_Reviews | |||
rating_item = [] # 11 Product_Rating | |||
addDate = [] # 12 Product_AddDate | |||
BTC = [] # 13 Product_BTC_SellingPrice | |||
USD = [] # 14 Product_USD_SellingPrice y | |||
EURO = [] # 15 Product_EURO_SellingPrice | |||
sold = [] # 16 Product_QuantitySold | |||
qLeft =[] # 17 Product_QuantityLeft | |||
shipFrom = [] # 18 Product_ShippedFrom | |||
shipTo = [] # 19 Product_ShippedTo | |||
image = [] # 20 Product_Image | |||
image_vendor = [] # 21 Vendor_Image | |||
href = [] # 22 Product_Links | |||
names = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", "colspan": "3"}) | |||
left = soup.find('div', {"class": "content"}).findAll('table', {"class": "vtable"}) | |||
right = soup.find('div', {"class": "content"}).findAll('td', {"class": "vtop centered"}) | |||
images = soup.find('div', {"class": "content"}).findAll('td', {"class": "vcentered"}) | |||
# vtop centered | |||
count = 0 | |||
# Populating the Number of Products | |||
nm = len(names) | |||
for a in names: | |||
# product name | |||
temp = a.find('a').text | |||
if ("pcs x " in temp): | |||
index = temp.index("pcs x ") | |||
result = temp[index + len("pcs x "):] | |||
name.append(cleanString(result)) | |||
elif("pks x " in temp): | |||
index = temp.index("pks x ") | |||
result = temp[index + len("pks x "):] | |||
name.append(cleanString(result)) | |||
elif ("job x " in temp): | |||
index = temp.index("job x ") | |||
result = temp[index + len("job x "):] | |||
name.append(cleanString(result)) | |||
CVE.append("-1") | |||
MS.append("-1") | |||
temp2 = left[count].findAll('tr') | |||
length_2 = len(temp2) - 1 | |||
# category | |||
temp = temp2[1].find('td').text | |||
category.append(cleanString(temp.strip())) | |||
describe.append("-1") | |||
#escrow.append("-1") | |||
views.append("-1") | |||
reviews.append("-1") | |||
addDate.append("-1") | |||
#lastSeen.append("-1") | |||
BTC.append("-1") | |||
image_vendor.append("-1") | |||
# usd | |||
temp3 = right[count*2].find('span').text | |||
temp = temp3.replace(' USD', '') | |||
USD.append(cleanString(temp)) | |||
EURO.append("-1") | |||
# 14 Product_QuantitySold | |||
temp3 = temp2[length_2].find('th').text | |||
temp3 = cleanString(temp3) | |||
if (temp3 == "Sold:"): | |||
temp = temp2[length_2].find('td').text | |||
sold.append(cleanString(temp.strip())) | |||
else: | |||
sold.append("-1") | |||
qLeft.append("-1") | |||
shipFrom.append("-1") | |||
# ship to | |||
temp3 = temp2[length_2].find('th').text | |||
temp3 = cleanString(temp3) | |||
if (temp3 == "Ship To:"): | |||
temp = temp2[length_2].find('td').text | |||
shipTo.append(cleanString(temp.strip())) | |||
else: | |||
shipTo.append("-1") | |||
# vendor | |||
temp = temp2[0].find('a').text | |||
vendor.append(cleanString(temp.strip())) | |||
# add product rating (stars) | |||
rating.append("-1") | |||
success.append("-1") | |||
temp = a.find('a').get('href') | |||
href.append(temp) | |||
# Finding Product Image | |||
        product_image = images[count*2].find('img').get('src') | |||
        image.append(product_image.split('base64,')[-1]) | |||
count += 1 | |||
rating_item.append("-1") | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
return organizeProducts(mktName, nm, vendor, rating, success, name, CVE, MS, category, describe, views, | |||
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) | |||
#called by the crawler to get description links on a listing page | |||
#@param: beautifulsoup object that is using the correct html page (listing page) | |||
#return: list of description links from a listing page | |||
def darkmatter_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", 'colspan': '3'}) | |||
for a in listing: | |||
bae = a.find('a', href=True) | |||
link = bae['href'] | |||
href.append(link) | |||
return href |
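# Hedged sketch of how the two functions above fit together (file name hypothetical): a saved | |||
# listing page yields both the organized product rows and the description links to crawl next. | |||
def _example_parse_darkmatter_listing(html_path='darkmatter_listing.html'): | |||
    with open(html_path, encoding='utf-8') as f: | |||
        soup = BeautifulSoup(f.read(), 'html.parser') | |||
    return darkmatter_listing_parser(soup), darkmatter_links_parser(soup) | |||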
@ -1,286 +0,0 @@ | |||
__author__ = 'Helium' | |||
''' | |||
DigitalThriftShop Marketplace Crawler (Selenium) | |||
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.common.by import By | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, re, time | |||
from datetime import date | |||
import subprocess | |||
import configparser | |||
from bs4 import BeautifulSoup | |||
from MarketPlaces.Initialization.prepare_parser import new_parse | |||
from MarketPlaces.DigitalThriftShop.parser import digitalthriftshop_links_parser | |||
from MarketPlaces.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/' | |||
# Opens Tor Browser, crawls the website, then parses, then closes tor | |||
#acts like the main method for the crawler, another function at the end of this code calls this function later | |||
def startCrawling(): | |||
mktName = getMKTName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
login(driver) | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
new_parse(mktName, baseURL, True) | |||
# Returns the name of the website | |||
#return: name of site in string type | |||
def getMKTName(): | |||
name = 'DigitalThriftShop' | |||
return name | |||
# Return the base link of the website | |||
#return: url of base site in string type | |||
def getFixedURL(): | |||
url = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/' | |||
return url | |||
# Closes Tor Browser | |||
#@param: current selenium driver | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() | |||
time.sleep(3) | |||
return | |||
# Creates FireFox 'driver' and configure its 'Profile' | |||
# to use Tor proxy and socket | |||
def createFFDriver(): | |||
from MarketPlaces.Initialization.markets_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", False) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
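# Hedged note: the 'network.proxy.*' preferences above are what actually route the browser | |||
# through the local Tor SOCKS proxy (127.0.0.1:9150); the remaining preferences just limit | |||
# history, cookies, prefetching, image loading and JavaScript to keep crawling lightweight. | |||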
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' | |||
#return: return the selenium driver or string 'down' | |||
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url) | |||
return driver | |||
except: | |||
driver.close() | |||
return 'down' | |||
# Waits for a specific element so that the whole page loads; this market needs no login or | |||
# captcha, so the function only blocks until the category widget is visible | |||
#@param: current selenium web driver | |||
def login(driver): | |||
# wait for page to show up (This Xpath may need to change based on different seed url) | |||
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( | |||
(By.ID, "woocommerce_product_categories-2"))) | |||
# Saves the crawled html page, makes the directory path for html pages if not made | |||
def savePage(driver, page, url): | |||
cleanPage = cleanHTML(driver, page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
#@param: raw url as crawler crawls through every site | |||
def getFullPathName(url): | |||
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE | |||
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
return fullPath | |||
# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned | |||
#@param: raw url as crawler crawls through every site | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if (name == ''): | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
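# Hedged example (URL illustrative): the file name keeps only alphanumerics, e.g. | |||
#   getNameFromURL('http://example.onion/product-category/apps/')  ->  'httpexampleonionproductcategoryapps' | |||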
# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list | |||
#in this example, the product categories crawled are | |||
# Apps, Books, Bot nets, Ransomware, RATs, and Scripts | |||
def getInterestedLinks(): | |||
links = [] | |||
# Apps | |||
links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/apps/') | |||
# Books | |||
links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/books/') | |||
# Bot nets | |||
links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/botnets/') | |||
# ransomware | |||
links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/ransomware/') | |||
# rats | |||
links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/rats/') | |||
# scripts | |||
links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/scripts/') | |||
return links | |||
# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through | |||
#topic and description pages are crawled through here, where both types of pages are saved | |||
#@param: selenium driver | |||
def crawlForum(driver): | |||
print("Crawling the DigitalThriftShop market") | |||
linksToCrawl = getInterestedLinks() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
list = productPages(html) | |||
for item in list: | |||
itemURL = urlparse.urljoin(baseURL, str(item)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
savePage(driver, driver.page_source, item) | |||
driver.back() | |||
# # comment out | |||
# break | |||
# | |||
# # comment out | |||
# if count == 1: | |||
# break | |||
try: | |||
nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/div[2]/main/div[1]/nav') | |||
link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='→').get_attribute('href') | |||
if link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
print("Crawling the DigitalThriftShop market done.") | |||
# Returns 'True' if the link is a description link | |||
#@param: url of any url crawled | |||
#return: true if is a description page, false if not | |||
def isDescriptionLink(url): | |||
if 'product/' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link | |||
#@param: url of any url crawled | |||
#return: true if is a Listing page, false if not | |||
def isListingLink(url): | |||
if 'product-' in url: | |||
return True | |||
return False | |||
# calling the parser to define the links, the html is the url of a link from the list of interested link list | |||
#@param: link from interested link list ie. getInterestingLinks() | |||
#return: list of description links that should be crawled through | |||
def productPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
return digitalthriftshop_links_parser(soup) | |||
# Drop links that "signout" | |||
# def isSignOut(url): | |||
# #absURL = urlparse.urljoin(url.base_url, url.url) | |||
# if 'signout' in url.lower() or 'logout' in url.lower(): | |||
# return True | |||
# | |||
# return False | |||
def crawler(): | |||
startCrawling() | |||
# print("Crawling and Parsing BestCardingWorld .... DONE!") |
@ -1,173 +0,0 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup, ResultSet, Tag | |||
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs | |||
#stores info it needs in different lists, these lists are returned after being organized | |||
#@param: soup object looking at html page of description page | |||
#return: 'row' that contains a variety of lists that each hold info on the description page | |||
def digitalThriftShop_description_parser(soup: Tag): | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
image = "-1" # 19 Product_Image | |||
vendor_image = "-1" # 20 Vendor_Image | |||
product_name = soup.find("h1", {"class": "product_title entry-title"}).text | |||
name = cleanString(product_name.strip()) | |||
product_description = soup.find("div", {"id": "tab-description"}).find("p").text | |||
describe = cleanString(product_description.strip()) | |||
# Finding Product Image | |||
image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img') | |||
image = image.get('src').split('base64,')[-1] | |||
product_category = soup.find("span", {"class": "posted_in"}).find("a").text | |||
category = cleanString(product_category.strip()) | |||
product_rating: Tag = soup.find("div", {"class": "woocommerce-product-rating"}) | |||
if product_rating is not None: | |||
rating_item = product_rating.find("strong", {"class": "rating"}).text | |||
reviews = product_rating.find("span", {"class": "rating"}).text | |||
product_BTC = soup.find("div", {"id": "price-BTC"}).find("span", {"class": "priceinfo cw-noselect"}).text | |||
BTC = cleanString(product_BTC.strip()) | |||
product_USD = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text | |||
USD = cleanString(product_USD.replace("$", "").strip()) | |||
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) | |||
# Sending the results | |||
return row | |||
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs | |||
#stores info it needs in different lists, these lists are returned after being organized | |||
#@param: soup object looking at html page of listing page | |||
#return: 'row' that contains a variety of lists that each hold info on the listing page | |||
def digitalThriftShop_listing_parser(soup: Tag): | |||
# Fields to be parsed | |||
nm = 0 # *Total_Products (Should be Integer) | |||
mktName = "DigitalThriftShop" # 0 *Marketplace_Name | |||
vendor = [] # 1 *Vendor y | |||
rating_vendor = [] # 2 Vendor_Rating | |||
success = [] # 3 Vendor_Successful_Transactions | |||
name = [] # 4 *Product_Name y | |||
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = [] # 6 Product_MS_Classification (Microsoft Security) | |||
category = [] # 7 Product_Category y | |||
describe = [] # 8 Product_Description | |||
views = [] # 9 Product_Number_Of_Views | |||
reviews = [] # 10 Product_Number_Of_Reviews | |||
rating_item = [] # 11 Product_Rating | |||
addDate = [] # 12 Product_AddDate | |||
BTC = [] # 13 Product_BTC_SellingPrice | |||
USD = [] # 14 Product_USD_SellingPrice y | |||
EURO = [] # 15 Product_EURO_SellingPrice | |||
sold = [] # 16 Product_QuantitySold | |||
qLeft =[] # 17 Product_QuantityLeft | |||
shipFrom = [] # 18 Product_ShippedFrom | |||
shipTo = [] # 19 Product_ShippedTo | |||
image = [] # 20 Product_Image | |||
image_vendor = [] # 21 Vendor_Image | |||
href = [] # 22 Product_Links | |||
product_category = soup.find("h1", {"class": "woocommerce-products-header__title page-title"}).text | |||
products_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-5"}).find_all("li") | |||
for product in products_list: | |||
nm += 1 | |||
vendor.append(mktName) | |||
rating_vendor.append("-1") | |||
success.append("-1") | |||
product_name = product.find("h2", {"class": "woocommerce-loop-product__title"}).text | |||
name.append(cleanString(product_name.strip())) | |||
# Finding Product Image | |||
product_image = product.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'}) | |||
product_image = product_image.get('src').split('base64,')[-1] | |||
image.append(product_image) | |||
CVE.append("-1") | |||
MS.append("-1") | |||
category.append(cleanString(product_category.strip())) | |||
describe.append("-1") | |||
views.append("-1") | |||
reviews.append("-1") | |||
image_vendor.append("-1") | |||
try: | |||
product_rating = product.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text | |||
rating_item.append(cleanString(product_rating.strip())) | |||
except: | |||
rating_item.append("-1") | |||
addDate.append("-1") | |||
BTC.append("-1") | |||
product_USD = product.find("span", {"class": "price"}).text | |||
USD.append(product_USD.replace("$", "").strip()) | |||
EURO.append("-1") | |||
sold.append("-1") | |||
qLeft.append("-1") | |||
shipFrom.append("-1") | |||
shipTo.append("-1") | |||
product_href = product.find("a", {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}).get("href") | |||
href.append(cleanString(product_href.strip())) | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, | |||
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) | |||
#called by the crawler to get description links on a listing page | |||
#@param: beautifulsoup object that is using the correct html page (listing page) | |||
#return: list of description links from a listing page | |||
def digitalthriftshop_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.find('ul', {"class": "products columns-5"}).findAll('li') | |||
for a in listing: | |||
bae = a.find('a', href=True) | |||
link = bae['href'] | |||
href.append(link) | |||
return href |
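# Hedged usage sketch (file name hypothetical): the crawler saves each listing page, and this | |||
# links parser then extracts every product URL that should be visited next. | |||
def _example_extract_digitalthriftshop_links(html_path='digitalthriftshop_listing.html'): | |||
    with open(html_path, encoding='utf-8') as f: | |||
        soup = BeautifulSoup(f.read(), 'html.parser') | |||
    return digitalthriftshop_links_parser(soup) | |||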
@ -1,288 +0,0 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
import re | |||
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) | |||
def hiddenmarket_description_parser(soup): | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
image = "-1" # 19 Product_Image | |||
vendor_image = "-1" # 20 Vendor_Image | |||
bae = soup.find('div', {'class': "main"}) | |||
# Finding Product Name | |||
name = bae.find('div', {'class': "heading"}).text | |||
name = name.replace('\n', ' ') | |||
name = name.replace(",", "") | |||
name = name.strip() | |||
mb = bae.find('div', {'class': "information"}).findAll('tr') | |||
# Finding Vendor | |||
vendor = mb[1].find('a').text | |||
vendor = vendor.replace(",", "") | |||
vendor = vendor.strip() | |||
# # Finding Vendor Rating | |||
# full_stars = bae[2].find_all('i', {'class': "fas fa-star"}) | |||
# half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"}) | |||
# rating = len(full_stars) + (0.5 if half_star is not None else 0) | |||
# Finding Quantity Left | |||
temp = mb[-3].text | |||
left = temp.replace("Quantity in stock:", "") | |||
left = left.strip() | |||
# Finding USD | |||
USD = mb[0].text | |||
USD = USD.replace("Price:", "") | |||
USD = USD.replace("USD", "") | |||
USD = USD.strip() | |||
# Finding BTC | |||
# temp = bae.find('div', {"class": "small"}).text.split("BTC") | |||
# BTC = temp[0].strip() | |||
# Finding Shipment Information (Origin) | |||
shipFrom = mb[2].text | |||
shipFrom = shipFrom.replace("Seller location:", "") | |||
shipFrom = shipFrom.strip() | |||
# Finding Shipment Information (Destination) | |||
shipTo = mb[3].text | |||
shipTo = shipTo.replace("Ships to (seller):", "") | |||
shipTo = shipTo.strip() | |||
# Finding the Product description | |||
describe = bae.find('div', {"class": "twotabs"}).find('div', {'class': "tab1"}).text | |||
describe = cleanString(describe.strip()) | |||
# Finding Product Image | |||
image = soup.find('div', {"class": "thumbnails"}).find('img', {"class": "bigthumbnail"}) | |||
image = image.get('src').split('base64,')[-1] | |||
# Finding the Product Category | |||
category = mb[-4].text | |||
category = category.replace("Category:", "") | |||
category = category.strip() | |||
#Finding the number of reviews | |||
reviews = bae.find_all('div', {'class': "heading"}) | |||
reviews = reviews[-2].text | |||
reviews = reviews.replace("Comments (", "") | |||
reviews = reviews.replace(")", "") | |||
# Searching for CVE and MS categories | |||
    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}')) | |||
if cve: | |||
CVE = " " | |||
for idx in cve: | |||
CVE += (idx) | |||
CVE += " " | |||
CVE = CVE.replace(',', ' ') | |||
CVE = CVE.replace('\n', '') | |||
    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}')) | |||
if ms: | |||
MS = " " | |||
for im in ms: | |||
MS += (im) | |||
MS += " " | |||
MS = MS.replace(',', ' ') | |||
MS = MS.replace('\n', '') | |||
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) | |||
# Sending the results | |||
return row | |||
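# Hedged illustration of the CVE/MS scan above: the regexes collect every identifier found | |||
# anywhere in the page text, e.g. | |||
#   re.findall(r'CVE-\d{4}-\d{4}', 'patched CVE-2021-4444 and CVE-2020-0601') | |||
#   -> ['CVE-2021-4444', 'CVE-2020-0601'] | |||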
# This is the method to parse the Listing Pages | |||
def hiddenmarket_listing_parser(soup): | |||
# Fields to be parsed | |||
nm = 0 # *Total_Products (Should be Integer) | |||
mktName = "HiddenMarket" # 0 *Marketplace_Name | |||
vendor = [] # 1 *Vendor y | |||
rating_vendor = [] # 2 Vendor_Rating | |||
success = [] # 3 Vendor_Successful_Transactions | |||
name = [] # 4 *Product_Name y | |||
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = [] # 6 Product_MS_Classification (Microsoft Security) | |||
category = [] # 7 Product_Category y | |||
describe = [] # 8 Product_Description | |||
views = [] # 9 Product_Number_Of_Views | |||
reviews = [] # 10 Product_Number_Of_Reviews | |||
rating_item = [] # 11 Product_Rating | |||
addDate = [] # 12 Product_AddDate | |||
BTC = [] # 13 Product_BTC_SellingPrice | |||
USD = [] # 14 Product_USD_SellingPrice y | |||
EURO = [] # 15 Product_EURO_SellingPrice | |||
sold = [] # 16 Product_QuantitySold | |||
qLeft = [] # 17 Product_QuantityLeft | |||
shipFrom = [] # 18 Product_ShippedFrom | |||
shipTo = [] # 19 Product_ShippedTo | |||
image = [] # 20 Product_Image | |||
image_vendor = [] # 21 Vendor_Image | |||
href = [] # 22 Product_Links | |||
listing = soup.findAll('div', {"class": "item"}) | |||
# Populating the Number of Products | |||
nm = len(listing) | |||
# Finding Category | |||
cat = soup.find("div", {'class': "heading"}).text | |||
cat = cat.replace(",", "") | |||
cat = cat.strip() | |||
for card in listing: | |||
category.append(cat) | |||
# Adding the url to the list of urls | |||
link = card.find_all('a') | |||
link = link[1].get('href') | |||
href.append(link) | |||
# Finding Product Name | |||
product = card.find('div', {'class': "title"}) | |||
product = product.text | |||
product = product.replace('\n', ' ') | |||
product = product.replace(",", "") | |||
product = product.strip() | |||
name.append(product) | |||
# Finding Product Image | |||
image.append("-1") | |||
# Finding Vendor | |||
vendor_name = card.find('div', {"class": "seller"}).text | |||
vendor_name = vendor_name.replace(",", "") | |||
vendor_name = vendor_name.strip() | |||
vendor.append(vendor_name) | |||
image_vendor.append("-1") | |||
# Finding USD | |||
usd = card.find('div', {"class": "buttons"}).find('div', {'class': "price"}).text | |||
usd = usd.replace("USD", "") | |||
usd = usd.strip() | |||
USD.append(usd) | |||
tb = card.find("div", {"class": "stats"}) | |||
tb = tb.find_all('td') | |||
# Finding Reviews | |||
num = tb[-1].text | |||
num = num.strip() | |||
reviews.append(num) | |||
# Finding Views | |||
view = tb[-3].text.strip() | |||
views.append(view) | |||
# Finding Num of Sales | |||
sale = tb[-2].text.strip() | |||
sold.append(sale) | |||
# Finding Item Rating | |||
if num == '0': | |||
item_rating = '-1' | |||
else: | |||
item_rating = card.find('div', {'class': 'stats'}).find('div', {'class': "stars2"}) | |||
item_rating = item_rating.get('style') | |||
item_rating = item_rating.replace("width:", "") | |||
item_rating = item_rating.replace("%", "") | |||
rating_item.append(item_rating) | |||
# Finding shipping info | |||
shipping = card.find('div', {'class': "shipping"}).text.split('>') | |||
        # Ship from | |||
origin = shipping[0].strip() | |||
shipFrom.append(origin) | |||
#Ship to | |||
destination = shipping[1].strip() | |||
shipTo.append(destination) | |||
# Finding description (site only shows partial description on listing pages) | |||
# description = card.next_sibling.find('div', {'class': "description"}).text | |||
# description = description.replace("\n", " ") | |||
# description = description.replace("\r", " ") | |||
# description = description.replace("-", " ") | |||
# description = description.strip() | |||
# describe.append(description) | |||
# Searching for CVE and MS categories | |||
        cve = card.findAll(text=re.compile(r'CVE-\d{4}-\d{4}')) | |||
if not cve: | |||
cveValue = "-1" | |||
else: | |||
cee = " " | |||
for idx in cve: | |||
cee += (idx) | |||
cee += " " | |||
cee = cee.replace(',', ' ') | |||
cee = cee.replace('\n', '') | |||
cveValue = cee | |||
CVE.append(cveValue) | |||
        ms = card.findAll(text=re.compile(r'MS\d{2}-\d{3}')) | |||
if not ms: | |||
MSValue = "-1" | |||
else: | |||
me = " " | |||
for im in ms: | |||
me += (im) | |||
me += " " | |||
me = me.replace(',', ' ') | |||
me = me.replace('\n', '') | |||
MSValue = me | |||
MS.append(MSValue) | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, | |||
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) | |||
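# Hedged note on rating_item above: the site encodes star ratings as a CSS width, so a 'stars2' | |||
# div with style="width:80%" is stored as the string '80' (80% of the maximum star rating). | |||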
def hiddenmarket_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.findAll('div', {"class": "item"}) | |||
for div in listing: | |||
link = div.findAll('a') | |||
link = link[1] | |||
link = link['href'] | |||
href.append(link) | |||
return href |
@ -0,0 +1,325 @@ | |||
__author__ = '91Shadows' | |||
''' | |||
DarkFox marketplace Crawler | |||
''' | |||
import codecs | |||
import socks, socket, time | |||
from datetime import date | |||
import urllib.parse as urlparse | |||
import http.client as httplib | |||
import mechanize | |||
import os | |||
import subprocess | |||
from bs4 import BeautifulSoup | |||
from MarketPlaces.Initialization.prepare_parser import new_parse | |||
from MarketPlaces.DarkFox.parser import darkfox_links_parser | |||
counter = 1 | |||
httplib.HTTPConnection._http_vsn = 10 | |||
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0' | |||
baseURL = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/' | |||
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150) | |||
# Opens Tor Browser, crawls the mkt | |||
def startCrawling(): | |||
opentor() | |||
getUrl() | |||
url = getFixedURL() | |||
mktName = getMKTName() | |||
credentials = getCredentials() | |||
br = getAccess(url, credentials) | |||
if br != 'down': | |||
crawlMkt(url, br) | |||
#new_parse(mktName, False) | |||
#new_parse(mktName, False) | |||
closetor() | |||
#Opens Tor Browser | |||
def opentor(): | |||
global pid | |||
print("Connecting Tor...") | |||
path = open('../../path.txt').readline() | |||
pro = subprocess.Popen(path) | |||
pid = pro.pid | |||
time.sleep(5) | |||
input("Tor Connected. Press ENTER to continue\n") | |||
return | |||
# Creates a connection through Tor Port | |||
def getUrl(timeout=None): | |||
socket.socket = socks.socksocket | |||
socket.create_connection = create_connection | |||
return | |||
# Makes the onion address request | |||
def create_connection(address, timeout=None, source_address=None): | |||
sock = socks.socksocket() | |||
sock.connect(address) | |||
return sock | |||
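# Hedged sketch (URL hypothetical): once getUrl() has installed the SOCKS-aware socket, plain | |||
# HTTP clients such as mechanize or urllib are tunnelled through Tor on 127.0.0.1:9150. | |||
def _example_fetch_over_tor(url='http://example.onion/'): | |||
    import urllib.request | |||
    getUrl()  # monkey-patches socket.socket / socket.create_connection (defined above) | |||
    return urllib.request.urlopen(url, timeout=60).read() | |||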
# Returns the name of the mkt (DarkFox) | |||
def getMKTName(): | |||
name = 'DarkFox' | |||
return name | |||
# Returns credentials needed for the mkt | |||
def getCredentials(): | |||
credentials = 'blank blank blank blank cap 0' | |||
return credentials | |||
# Return the link of the mkt (DarkFox Link) | |||
def getFixedURL(): | |||
url = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/' | |||
return url | |||
# Closes Tor Browser | |||
def closetor(): | |||
global pid | |||
os.system("taskkill /pid " + str(pid)) | |||
print('Closing Tor...') | |||
time.sleep(3) | |||
return | |||
# Creates a Mechanize browser and initializes its options | |||
def createBrowser(): | |||
br = mechanize.Browser() | |||
cj = mechanize.CookieJar() | |||
br.set_cookiejar(cj) | |||
# Browser options | |||
br.set_handle_equiv( True ) | |||
br.set_handle_redirect( True ) | |||
br.set_handle_referer( True ) | |||
br.set_handle_robots(False) | |||
br.set_handle_refresh( mechanize._http.HTTPRefreshProcessor(), max_time = 1 ) | |||
br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'), | |||
('Accept', '*/*')] | |||
return br | |||
def getAccess(loginPage, credentials): | |||
logInName = credentials.split()[0] | |||
userName = credentials.split()[1] | |||
logInPass = credentials.split()[2] | |||
password = credentials.split()[3] | |||
captchaName = credentials.split()[4] | |||
formId = credentials.split()[5] | |||
br = createBrowser() | |||
try: | |||
keepTrying = True | |||
while (keepTrying): | |||
br.open(loginPage) | |||
time.sleep(7) | |||
html = br.response() | |||
            soup = BeautifulSoup(html, "html.parser") | |||
image_tags = soup.findAll('div', {"class": "imgWrap"}) | |||
captchaLink = image_tags[0] | |||
imagelink = captchaLink['style'].split('url(')[1][:-1] | |||
data = br.open(imagelink).read() | |||
br.back() | |||
open('captcha.png', "wb").write(data) | |||
''' | |||
subprocess.Popen("python capt.py", shell=False) | |||
time.sleep(61) | |||
captchaAnswerFile = open("answer.txt", "r") | |||
captchaAnswer = captchaAnswerFile.read().__str__() | |||
''' | |||
captchaAnswer = input('Please provide me with captcha : ') | |||
formIndex = int(formId) | |||
br.select_form(nr=formIndex) | |||
#br[logInName] = userName | |||
#br[logInPass] = password | |||
br[captchaName] = captchaAnswer.__str__() | |||
br.submit() | |||
if br.geturl() != 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/': | |||
keepTrying = False | |||
return br | |||
except: | |||
return 'down' | |||
# Saves the crawled html page | |||
def savePage(page, url): | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
a = page.read() | |||
open(filePath, "wb").write(a) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
def getFullPathName(url): | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str( | |||
"%02d" % date.today().month) + str("%02d" % date.today().day) + str( | |||
"%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' | |||
else: | |||
fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str( | |||
"%02d" % date.today().month) + str("%02d" % date.today().day) + str( | |||
"%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' | |||
return fullPath | |||
# Creates the name of the file based on URL | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if (name == ''): | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
# Hacking and Markets related topics | |||
def getInterestedLinks(): | |||
links = [] | |||
# Guides and Tutorials | |||
links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/30739153-1fcd-45cd-b919-072b439c6e06') | |||
# Digital Products | |||
links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781') | |||
# Software and Malware | |||
links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc') | |||
# Services | |||
links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280') | |||
# Miscellaneous | |||
links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb') | |||
# Hosting and Security | |||
links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14') | |||
# links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing1.html') | |||
# links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing2.html') | |||
return links | |||
def crawlMkt(url, br): | |||
print("Crawling the DarkFox marketplace") | |||
linksToCrawl = getInterestedLinks() | |||
visited = set(linksToCrawl) | |||
initialTime = time.time() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try : | |||
page = br.open(link) | |||
savePage(page, link) | |||
for l in br.links(): | |||
absURL = urlparse.urljoin(l.base_url, l.url) | |||
if absURL not in visited and not isSignOut(absURL) and isListingLink(absURL): | |||
visited.add(absURL) | |||
#disabling the process of finding other links | |||
#linksToCrawl.append(absURL) | |||
# crawler asks parser to get links of ALL products on ALL listing pages | |||
list = productPages(link) | |||
j = 0 | |||
for item in list: | |||
if j == 2: | |||
break | |||
#itemURL = baseURL + str(item) | |||
try: | |||
#itemPage = br.open(itemURL) | |||
itemPage = br.open(item) | |||
savePage(itemPage, item) | |||
except: | |||
#print 'Error in page: ', itemURL | |||
print('Error in page: ', item) | |||
j+=1 | |||
except Exception as e: | |||
            print(link, e) | |||
i += 1 | |||
#finalTime = time.time() | |||
#print finalTime - initialTime | |||
input("Crawling DarkFox marketplace done sucessfully. Press ENTER to continue\n") | |||
return | |||
def isDescriptionLink(url): | |||
if 'product' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link | |||
def isListingLink(url): | |||
if 'category' in url: | |||
return True | |||
return False | |||
# calling the parser to define the links | |||
def productPages(url): | |||
soup = "" | |||
error = False | |||
try: | |||
html = codecs.open( | |||
r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str( | |||
"%02d" % date.today().month) + str("%02d" % date.today().day) + str( | |||
"%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html', encoding='utf8') | |||
soup = BeautifulSoup(html, "html.parser") | |||
except: | |||
try: | |||
html = open( | |||
r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str( | |||
"%02d" % date.today().month) + str("%02d" % date.today().day) + str( | |||
"%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html') | |||
soup = BeautifulSoup(html, "html.parser") | |||
except: | |||
error = True | |||
print("There was a problem to read the file " + getNameFromURL(url) + " in the listing section.") | |||
if error: | |||
return [] | |||
else: | |||
return darkfox_links_parser(soup) | |||
# Drop links that "singout" | |||
def isSignOut(url): | |||
#absURL = urlparse.urljoin(url.base_url, url.url) | |||
if 'signout' in url.lower() or 'logout' in url.lower(): | |||
return True | |||
return False | |||
def crawler(): | |||
startCrawling() | |||
#print "Crawling and Parsing Crypto .... DONE!" |
@ -0,0 +1,342 @@ | |||
__author__ = 'DarkWeb' | |||
''' | |||
Kingdom Market Crawler (Selenium) | |||
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.common.exceptions import TimeoutException | |||
from selenium.webdriver.firefox.options import Options | |||
from selenium.webdriver.common.by import By | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from selenium.webdriver.support.ui import Select | |||
from PIL import Image | |||
import base64 | |||
from io import BytesIO | |||
import urllib.parse as urlparse | |||
import os, re, time | |||
from datetime import date | |||
import subprocess | |||
from bs4 import BeautifulSoup | |||
from MarketPlaces.Initialization.prepare_parser import new_parse | |||
from MarketPlaces.Kingdom.parser import kingdom_links_parser | |||
from MarketPlaces.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion' | |||
# Opens Tor Browser, crawls the website | |||
def startCrawling(): | |||
# marketName = getMarketName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
captcha(driver) | |||
login(driver) | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
# new_parse(marketName, False) | |||
def captcha(driver): | |||
''' | |||
# wait for captcha page | |||
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( | |||
(By.XPATH, "/html/body/div/div[1]"))) | |||
# save captcha to local | |||
driver.find_element(by=By.XPATH, value='/html/body/div/div[2]').screenshot( | |||
r'..\Kingdom\captcha1.png') | |||
# This method will show image in any image viewer | |||
im = Image.open(r'..\Kingdom\captcha1.png') | |||
im.show() | |||
iframes = driver.find_elements(by=By.TAG_NAME, value='iframe') | |||
# ask user input captcha solution in terminal | |||
print("Enter squares from smallest to largest (squares are numbered 1-9 left to right)") | |||
for order in ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']: | |||
id = input(f"{order}: ") | |||
iframes[int(id)-1].click() | |||
''' | |||
input("Press ENTER when CAPTCHA is completed\n") | |||
# wait for login page | |||
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( | |||
(By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button"))) | |||
# Login using premade account credentials and do login captcha manually | |||
def login(driver): | |||
# wait for login page | |||
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( | |||
(By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button"))) | |||
# entering username and password into input boxes | |||
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-name"]') | |||
# Username here | |||
usernameBox.send_keys('blabri') | |||
passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-passwd"]') | |||
# Password here | |||
passwordBox.send_keys('fishowal') | |||
select = Select(driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-sessiontime"]')) | |||
select.select_by_visible_text('24 hours') | |||
''' | |||
# wait for captcha page show up | |||
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( | |||
(By.XPATH, '//*[@id="captcha"]'))) | |||
# save captcha to local | |||
driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Kingdom\captcha2.png') | |||
# This method will show image in any image viewer | |||
im = Image.open(r'..\Kingdom\captcha2.png') | |||
im.show() | |||
# wait until input space show up | |||
inputBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-captcha"]') | |||
# ask user input captcha solution in terminal | |||
userIn = input("Enter solution: ") | |||
# send user solution into the input space | |||
inputBox.send_keys(userIn) | |||
# click the verify(submit) button | |||
driver.find_element(by=By.XPATH, value="/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button").click() | |||
''' | |||
input("Press ENTER when CAPTCHA is completed\n") | |||
# wait for listing page show up (This Xpath may need to change based on different seed url) | |||
WebDriverWait(driver, 50).until(EC.visibility_of_element_located( | |||
(By.XPATH, '/html/body/div/div/div[3]/div[2]'))) | |||
# Returns the name of the website | |||
def getMarketName(): | |||
name = 'Kingdom' | |||
return name | |||
# Return the link of the website | |||
def getFixedURL(): | |||
url = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion' | |||
return url | |||
# Closes Tor Browser | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() | |||
time.sleep(3) | |||
return | |||
# Creates FireFox 'driver' and configure its 'Profile' | |||
# to use Tor proxy and socket | |||
def createFFDriver(): | |||
from MarketPlaces.Initialization.markets_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", False) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url) | |||
return driver | |||
except: | |||
driver.close() | |||
return 'down' | |||
# Saves the crawled html page | |||
def savePage(driver, page, url): | |||
cleanPage = cleanHTML(driver, page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
def getFullPathName(url): | |||
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE | |||
    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMarketName() + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
return fullPath | |||
# Creates the file name from passed URL | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if (name == ''): | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
def getInterestedLinks(): | |||
links = [] | |||
# Software and Malware | |||
links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=c298a77d9e93ad32') | |||
# # Services | |||
# links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=c298a77d9e93ad32') | |||
# # Exploits | |||
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=45') | |||
# # Tools | |||
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=46') | |||
# # Malware | |||
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=47') | |||
# # Cryptography | |||
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=48') | |||
# # Others | |||
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=49') | |||
# # Hacking Tutorials | |||
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=50') | |||
# # Hacked Accounts and Database Dumps | |||
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30') | |||
# # Android Moded pak | |||
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53') | |||
return links | |||
def crawlForum(driver): | |||
print("Crawling the Kingdom market") | |||
linksToCrawl = getInterestedLinks() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
list = productPages(html) | |||
for item in list: | |||
itemURL = urlparse.urljoin(baseURL, str(item)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
savePage(driver, driver.page_source, item) | |||
driver.back() | |||
# comment out | |||
break | |||
# comment out | |||
if count == 1: | |||
break | |||
try: | |||
temp = driver.find_element(by=By.XPATH, value= | |||
'/html/body/div/div/div[3]/div[2]/div[2]/div/div/ul') | |||
                    next_elem = temp.find_element(by=By.CLASS_NAME, value="next")
                    link = next_elem.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
if link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
input("Crawling Kingdom Market done sucessfully. Press ENTER to continue\n") | |||
# Returns 'True' if the link is Topic link | |||
def isDescriptionLink(url): | |||
if 'view' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link | |||
def isListingLink(url): | |||
if 'category' in url: | |||
return True | |||
return False | |||
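# For Kingdom, listing URLs carry 'filter_category' in their query string, while the
# product links collected by kingdom_links_parser start with '/offer/view?', which is
# why 'category' and 'view' are used as markers above.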
# calling the parser to define the links | |||
def productPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
#print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) | |||
return kingdom_links_parser(soup) | |||
def crawler(): | |||
startCrawling() | |||
# print("Crawling and Parsing BestCardingWorld .... DONE!") |
@ -0,0 +1,188 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) | |||
def kingdom_description_parser(soup): | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
# Finding Product Name | |||
tag = soup.find('div', {"class": "col-md-9"}) | |||
desc = tag.find('div',{"class": "col-md-8"}).find('div', {"class": "box-cont"}) | |||
name = tag.find('div',{"class": "col-md-8"}).find('div', {"class": "box-head"}).text | |||
name = name.replace('\n', ' ') | |||
name = name.replace(',', ' ') | |||
name = name.strip() | |||
# Finding Prices | |||
# Kingdom prices can be shown in a variety of currencies, not all in USD, so keeping currency | |||
    rows = desc.find_all('div', {"class": "row"}, recursive=False)
price = rows[-1].find('div', {"class": "row"}).find('h3').text | |||
price = price.replace(',', '') | |||
price = price.strip() | |||
# USD = price.replace("USD",'') | |||
BTC = rows[-1].find('div', {"class": "row"}).find_next_sibling('div').find('span').text | |||
# Finding Vendor | |||
vendor = rows[0].select_one('a[href^="/user"]').text | |||
vendor = vendor.replace(",", " ") | |||
vendor = vendor.strip() | |||
    # Finding Shipment Information (Origin)
descs = rows[0].find_all('div', {"class": "col-md-3 text-right"}) | |||
shipFrom = descs[2].text | |||
shipFrom = shipFrom.replace(",", "") | |||
shipFrom = shipFrom.strip() | |||
    # Finding Shipment Information (Destination)
shipTo = rows[-1].find('div', {"class": "col-md-6"}).text | |||
shipTo = shipTo.replace("Ship to:","") | |||
shipTo = shipTo.replace(",","").strip() | |||
if(shipTo == ''): | |||
        shipTo = "-1"
# Finding the Product Category | |||
category = descs[0].text | |||
category = category.replace(",", "") | |||
category = category.strip() | |||
# Finding the Product Quantity Available | |||
left = descs[1].text | |||
left = left.replace(",", "") | |||
left = left.strip() | |||
# Finding when the Product was Added | |||
dt = descs[-1].text.strip() | |||
addDate = datetime.strptime(dt, '%d.%m.%Y') | |||
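    # e.g. dt == '11.04.2023' parses to datetime(2023, 4, 11)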
# Finding the Product description | |||
describe = cleanString(soup.find('div', {"id": "descriptionContent"}).text) | |||
# Finding the Number of Product Reviews | |||
    reviews = str(len(soup.find('div', {"id": "feedbackContent"}).find_all(recursive=False)))
# Searching for CVE and MS categories | |||
# no cve or ms in Kingdom | |||
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo) | |||
# Sending the results | |||
return row | |||
def kingdom_listing_parser(soup): | |||
# Fields to be parsed | |||
nm = 0 # *Total_Products (Should be Integer) | |||
mktName = "Kingdom" # 0 *Marketplace_Name | |||
vendor = [] # 1 *Vendor y | |||
rating_vendor = [] # 2 Vendor_Rating | |||
success = [] # 3 Vendor_Successful_Transactions | |||
name = [] # 4 *Product_Name y | |||
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = [] # 6 Product_MS_Classification (Microsoft Security) | |||
category = [] # 7 Product_Category y | |||
describe = [] # 8 Product_Description | |||
views = [] # 9 Product_Number_Of_Views | |||
reviews = [] # 10 Product_Number_Of_Reviews | |||
rating_item = [] # 11 Product_Rating | |||
addDate = [] # 12 Product_AddDate | |||
BTC = [] # 13 Product_BTC_SellingPrice | |||
USD = [] # 14 Product_USD_SellingPrice y | |||
EURO = [] # 15 Product_EURO_SellingPrice | |||
sold = [] # 16 Product_QuantitySold | |||
qLeft =[] # 17 Product_QuantityLeft | |||
shipFrom = [] # 18 Product_ShippedFrom | |||
shipTo = [] # 19 Product_ShippedTo | |||
href = [] # 20 Product_Links | |||
listing = soup.find('div', {"id": "p0"}).find('div').find_all('div', {"class": "row"}, recursive=False) | |||
# Populating the Number of Products | |||
nm = len(listing) | |||
for a in listing: | |||
# Finding Prices | |||
        # the USD list may include prices not shown in USD, so the currency marker is kept with the value
prices = a.find('div', {"class": "col-md-3"}) | |||
u = prices.find('h3').text | |||
u = u.strip() | |||
u = u.replace(',', '') | |||
u = u.strip() | |||
USD.append(u) | |||
bc = prices.find('div').find('span').text | |||
BTC.append(bc) | |||
# Finding the Product | |||
product = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]').text | |||
product = product.replace('\n', ' ') | |||
product = product.replace(","," ") | |||
product = product.strip() | |||
name.append(product) | |||
# Finding the Vendor | |||
vendor_name = a.select_one('a[href^="/user"]').text | |||
vendor_name = vendor_name.replace(",", " ").replace('/', '') | |||
vendor_name = vendor_name.strip() | |||
vendor.append(vendor_name) | |||
# Adding the url to the list of urls | |||
link = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]')['href'] | |||
link = cleanLink(link) | |||
href.append(link) | |||
# Searching for CVE and MS categories | |||
# cve and ms not in kingdom | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, | |||
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) | |||
def kingdom_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.findAll('div', {"class": "col-md-7"}) | |||
for a in listing: | |||
link = a.select_one('a[href^="/offer/view?"]') | |||
link = link['href'] | |||
href.append(link) | |||
return href |
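# A minimal, hypothetical usage sketch (not part of the crawling pipeline): it assumes
# a Kingdom listing page was already saved to disk by the crawler, and the file path
# below is only a placeholder.
if __name__ == '__main__':
    with open('example_kingdom_listing.html', 'r', encoding='utf-8') as f:
        example_soup = BeautifulSoup(f.read(), "html.parser")
    # links the crawler would visit next
    print(kingdom_links_parser(example_soup))
    # organized per-product records for the database stage
    print(kingdom_listing_parser(example_soup))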
@ -1,235 +0,0 @@ | |||
__author__ = 'Helium' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs | |||
#stores info it needs in different lists, these lists are returned after being organized | |||
#@param: soup object looking at html page of description page | |||
#return: 'row' that contains a variety of lists that each hold info on the description page | |||
def lionmarketplace_description_parser(soup): | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
image = "-1" # 19 Product_Image | |||
vendor_image = "-1" # 20 Vendor_Image | |||
# vendor name | |||
temp = soup.find('div', {'class': 'btn-group'}).find('a').text | |||
vendor = (cleanString(temp.strip())) | |||
# table with info | |||
table = soup.find('table') | |||
rows = table.findAll('tr') | |||
# vendor rating | |||
pos = soup.find('span', {"class": "fas fa-plus-circle text-success"}).parent.text | |||
pos = int(pos.strip()) | |||
neu = soup.find('span', {"class": "fas fa-stop-circle text-secondary"}).parent.text | |||
neu = int(neu.strip()) | |||
neg = soup.find('span', {"class": "fas fa-minus-circle text-danger"}).parent.text | |||
neg = int(neg.strip()) | |||
total = pos + neu + neg | |||
if total > 0: | |||
rating_vendor = str((pos + 0.5*neu) / total) | |||
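        # e.g. 10 positive, 2 neutral and 0 negative marks give (10 + 0.5*2) / 12 ≈ 0.92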
# product name | |||
    temp = soup.find('div', {'class': 'row'}).find('h2').text
name = (cleanString(temp.strip())) | |||
# product description | |||
temp = soup.find('div', {'class': "mt-4"}).contents[-1] | |||
describe = cleanString(temp.strip()) | |||
# Finding Product Image | |||
image = soup.find('div', {'id': 'slide-1'}).find('img') | |||
image = image.get('src') | |||
image = image.split('base64,')[-1] | |||
full = rows[0].findAll('i', {"class": "fas fa-star"}) | |||
half = rows[0].find('i', {"class": "fas fa-star-half-alt"}) | |||
rating_item = len(full) | |||
if half is not None: | |||
rating_item += 0.5 | |||
rating_item = str(rating_item) | |||
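    # e.g. four full-star icons plus one half-star icon yield rating_item = "4.5"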
# USD selling price | |||
temp = rows[2].find('strong').text | |||
if " $" in temp: | |||
temp = temp.replace(" $", "") | |||
elif "$" in temp: | |||
temp = temp.replace("$", "") | |||
USD = cleanString((temp.strip())) | |||
# product sold | |||
temp = rows[4].find('td') | |||
if temp is not None and cleanString(temp.text.strip()) == 'Left/Sold': | |||
temp = rows[4].findAll('td') | |||
temp = temp[1].findAll('span') | |||
# left | |||
sold = temp[1].text | |||
left = temp[0].text | |||
sold = cleanNumbers(sold.strip()) | |||
left = cleanNumbers(left.strip()) | |||
else: | |||
sold = '-1' | |||
left = "-1" | |||
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) | |||
# Sending the results | |||
return row | |||
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs | |||
#stores info it needs in different lists, these lists are returned after being organized | |||
#@param: soup object looking at html page of listing page | |||
#return: 'row' that contains a variety of lists that each hold info on the listing page | |||
def lionmarketplace_listing_parser(soup): | |||
# Fields to be parsed | |||
nm = 0 # *Total_Products (Should be Integer) | |||
mktName = "LionMarketplace" # 0 *Marketplace_Name | |||
vendor = [] # 1 *Vendor y | |||
rating_vendor = [] # 2 Vendor_Rating | |||
success = [] # 3 Vendor_Successful_Transactions | |||
name = [] # 4 *Product_Name y | |||
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = [] # 6 Product_MS_Classification (Microsoft Security) | |||
category = [] # 7 Product_Category y | |||
describe = [] # 8 Product_Description | |||
views = [] # 9 Product_Number_Of_Views | |||
reviews = [] # 10 Product_Number_Of_Reviews | |||
rating_item = [] # 11 Product_Rating | |||
addDate = [] # 12 Product_AddDate | |||
BTC = [] # 13 Product_BTC_SellingPrice | |||
USD = [] # 14 Product_USD_SellingPrice y | |||
EURO = [] # 15 Product_EURO_SellingPrice | |||
sold = [] # 16 Product_QuantitySold | |||
qLeft =[] # 17 Product_QuantityLeft | |||
shipFrom = [] # 18 Product_ShippedFrom | |||
shipTo = [] # 19 Product_ShippedTo | |||
image = [] # 20 Product_Image | |||
image_vendor = [] # 21 Vendor_Image | |||
href = [] # 22 Product_Links | |||
listings = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"}) | |||
# Populating the Number of Products | |||
nm = len(listings) | |||
for listing in listings: | |||
a = listing.find('div', {"class": "card-body"}) | |||
row = a.findAll('p') | |||
# vendor | |||
temp = row[3].text | |||
temp = temp.replace("Vendor:", "") | |||
vendor.append(cleanString(temp.strip())) | |||
image_vendor.append("-1") | |||
# vendor rating | |||
rating_vendor.append("-1") | |||
# successful transactions CHECK AGAIN HERE | |||
success.append("-1") | |||
# product name | |||
temp = a.find('a').text | |||
name.append(cleanString(temp.strip())) | |||
# Finding Product Image | |||
product_image = listing.find('img', {'class': 'card-img-top rounded'}) | |||
product_image = product_image.get('src') | |||
product_image = product_image.split('base64,')[-1] | |||
image.append(product_image) | |||
CVE.append('-1') | |||
MS.append('-1') | |||
# product category | |||
temp = row[1].text | |||
temp = temp.replace("Category: ", "") | |||
category.append(cleanString(temp.strip())) | |||
describe.append('-1') | |||
# product views | |||
vnum = listing.find('p', {"class": "position-absolute bg-primary opacity-60 text-white mt-4 mr-5 pr-1"}).text | |||
views.append(cleanNumbers(vnum.strip())) | |||
reviews.append('-1') # 10 Product_Number_Of_Reviews | |||
rating_item.append('-1') # 11 Product_Rating | |||
addDate.append('-1') # 12 Product_AddDate | |||
# BTC | |||
BTC.append('-1') | |||
# USD | |||
temp = row[0].find('strong').text | |||
USD.append(cleanNumbers(temp.strip())) # 14 Product_USD_SellingPrice | |||
EURO.append("-1") # 15 Product_EURO_SellingPrice | |||
# product sold | |||
sold.append("-1") | |||
qLeft.append('-1') # 17 Product_QuantityLeft | |||
shipFrom.append('-1') # 18 Product_ShippedFrom | |||
shipTo.append('-1') # 19 Product_ShippedTo | |||
# href | |||
temp = a.find('a').get('href') | |||
href.append(temp) | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, | |||
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) | |||
#called by the crawler to get description links on a listing page | |||
#@param: beautifulsoup object that is using the correct html page (listing page) | |||
#return: list of description links from a listing page | |||
def lionmarketplace_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listings = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"}) | |||
for listing in listings: | |||
a = listing.find('div', {"class": "card-body"}) | |||
bae = a.find('a', href=True) | |||
link = bae['href'] | |||
href.append(link) | |||
return href |
@ -1,291 +0,0 @@ | |||
__author__ = 'Helium' | |||
''' | |||
MetaVerseMarket Marketplace Crawler (Selenium) | |||
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.common.by import By | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, re, time | |||
from datetime import date | |||
import subprocess | |||
import configparser | |||
from bs4 import BeautifulSoup | |||
from MarketPlaces.Initialization.prepare_parser import new_parse | |||
from MarketPlaces.MetaVerseMarket.parser import metaversemarket_links_parser | |||
from MarketPlaces.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/login' | |||
# Opens Tor Browser, crawls the website, then parses, then closes tor | |||
#acts like the main method for the crawler, another function at the end of this code calls this function later | |||
def startCrawling(): | |||
mktName = getMKTName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
login(driver) | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
new_parse(mktName, baseURL, True) | |||
# Returns the name of the website | |||
#return: name of site in string type | |||
def getMKTName(): | |||
name = 'MetaVerseMarket' | |||
return name | |||
# Return the base link of the website | |||
#return: url of base site in string type | |||
def getFixedURL(): | |||
url = 'http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/login' | |||
return url | |||
# Closes Tor Browser | |||
#@param: current selenium driver | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() | |||
time.sleep(3) | |||
return | |||
# Creates FireFox 'driver' and configure its 'Profile' | |||
# to use Tor proxy and socket | |||
def createFFDriver(): | |||
from MarketPlaces.Initialization.markets_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
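    # 9150 is the SOCKS port exposed by the Tor Browser bundle; a standalone tor daemon usually listens on 9050 instead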
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", False) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' | |||
#return: return the selenium driver or string 'down' | |||
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url) | |||
return driver | |||
except: | |||
driver.close() | |||
return 'down' | |||
# Manual captcha solver, waits for a specific element so that the whole page loads, finds the input box, gets screenshot of captcha
# then allows for manual solving of captcha in the terminal | |||
#@param: current selenium web driver | |||
def login(driver): | |||
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( | |||
(By.XPATH, '//*[@id="username"]'))) | |||
# entering username and password into input boxes | |||
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') | |||
# Username here | |||
usernameBox.send_keys('metotomoto') | |||
passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') | |||
# Password here | |||
passwordBox.send_keys('lionking_kumba1ya') | |||
input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n") | |||
# wait for listing page show up (This Xpath may need to change based on different seed url) | |||
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( | |||
(By.XPATH, '//*[@id="searchq"]'))) | |||
# Saves the crawled html page, makes the directory path for html pages if not made | |||
def savePage(driver, page, url): | |||
cleanPage = cleanHTML(driver, page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
#@param: raw url as crawler crawls through every site | |||
def getFullPathName(url): | |||
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE | |||
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
return fullPath | |||
# Creates the file name from the passed URL; falls back to the global counter when the cleaned name is empty
#@param: raw url as crawler crawls through every site | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if (name == ''): | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list | |||
#in this example, there are a few product categories the listings fall under, such as
# Software and Malware, Guides and Tutorials, and Services
def getInterestedLinks(): | |||
links = [] | |||
# software and malware | |||
links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/softwares-and-malwares') | |||
# guides and tutorials | |||
links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/guides-and-tutorials') | |||
# services | |||
links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/services') | |||
return links | |||
# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through | |||
#topic and description pages are crawled through here, where both types of pages are saved | |||
#@param: selenium driver | |||
def crawlForum(driver): | |||
print("Crawling the MetaVerse market") | |||
linksToCrawl = getInterestedLinks() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
list = productPages(html) | |||
for item in list: | |||
itemURL = urlparse.urljoin(baseURL, str(item)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
savePage(driver, driver.page_source, item) | |||
driver.back() | |||
# # comment out | |||
# break | |||
# | |||
# # comment out | |||
# if count == 1: | |||
# break | |||
try: | |||
link = driver.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href') | |||
if link.endswith('#') or link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
print("Crawling the MetaVerse market done.") | |||
# Returns 'True' if the link is a description link | |||
#@param: url of any url crawled | |||
#return: true if is a description page, false if not | |||
def isDescriptionLink(url): | |||
if 'PR' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link | |||
#@param: url of any url crawled | |||
#return: true if is a Listing page, false if not | |||
def isListingLink(url): | |||
if 'products' in url: | |||
return True | |||
return False | |||
# calling the parser to define the links, the html is the url of a link from the list of interested link list | |||
#@param: link from interested link list ie. getInterestingLinks() | |||
#return: list of description links that should be crawled through | |||
def productPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
return metaversemarket_links_parser(soup) | |||
# Drop links that "signout" | |||
# def isSignOut(url): | |||
# #absURL = urlparse.urljoin(url.base_url, url.url) | |||
# if 'signout' in url.lower() or 'logout' in url.lower(): | |||
# return True | |||
# | |||
# return False | |||
def crawler(): | |||
startCrawling() | |||
# print("Crawling and Parsing MetaVerseMarket .... DONE!") |
@ -1,269 +0,0 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup
import re
# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs | |||
# stores info it needs in different lists, these lists are returned after being organized | |||
# @param: soup object looking at html page of description page | |||
# return: 'row' that contains a variety of lists that each hold info on the description page | |||
def metaversemarket_description_parser(soup): | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
image = "-1" # 19 Product_Image | |||
vendor_image = "-1" # 20 Vendor_Image | |||
# Finding Product Name | |||
name = soup.find('div', {'class': "panel-heading"}).text | |||
name = cleanString(name.strip()) | |||
temp = soup.findAll('div', {'class': "col-xs-12 col-sm-6 mt-5"}) | |||
# Finding Product Image | |||
image = temp[0].find('img') | |||
image = image.get('src') | |||
image = image.split('base64,')[-1] | |||
# Finding Vendor | |||
temp = temp[1].findAll('span') | |||
vendor = temp[1].find('b').text | |||
vendor = cleanString(vendor.strip()) | |||
# Finding Vendor Rating | |||
pos = soup.find('span', {'class': "badge bg-success fs-12px"}).text | |||
pos = int(cleanNumbers(pos).strip()) | |||
neg = soup.find('span', {'class': "badge bg-danger fs-12px"}).text | |||
neg = int(cleanNumbers(neg).strip()) | |||
total = pos + neg | |||
if total > 0: | |||
rating_vendor = str(pos / total) | |||
# Finding Prices | |||
USD = soup.find('h3', {'class': "mb-2"}).text | |||
USD = cleanNumbers(USD).strip() | |||
# Finding the Product Category | |||
temp = soup.select('div[class="mt-2"]')[1].text | |||
temp = temp.replace("Category:", "") | |||
category = temp.strip() | |||
# Finding Number of Views | |||
views = soup.find('button', {"class": "btn btn-secondary text-center w-33 fw-bold"}).text | |||
views = views.strip() | |||
# Finding the Product Quantity Available | |||
temp = soup.find('button', {"class": "btn btn-success text-center w-33 fw-bold"}).text | |||
temp = temp.split("/") | |||
left = temp[1].strip() | |||
# Finding Number Sold | |||
sold = temp[0].strip() | |||
# Finding Shipment Information (Origin) | |||
temp = soup.find('div', {'class': "alert alert-info"}).text | |||
temp = temp.split("to") | |||
shipFrom = temp[0].replace("Shipping from ", "").strip() | |||
# Finding Shipment Information (Destination) | |||
shipTo = temp[1].split("for") | |||
shipTo = shipTo[0].strip() | |||
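    # e.g. an alert reading "Shipping from United States to Worldwide for ..." gives
    # shipFrom = "United States" and shipTo = "Worldwide" (illustrative text only)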
# Finding the Product description | |||
describe = soup.find('p', {'class': "card-text"}).text | |||
describe = cleanString(describe.strip()) | |||
# Searching for CVE and MS categories | |||
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) | |||
if cve: | |||
CVE = " " | |||
for idx in cve: | |||
CVE += (idx) | |||
CVE += " " | |||
CVE = CVE.replace(',', ' ') | |||
CVE = CVE.replace('\n', '') | |||
ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}')) | |||
if ms: | |||
MS = " " | |||
for im in ms: | |||
MS += (im) | |||
MS += " " | |||
MS = MS.replace(',', ' ') | |||
MS = MS.replace('\n', '') | |||
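    # Note: findAll(text=re.compile(...)) returns the whole text nodes containing a
    # CVE-/MS-style identifier; those nodes are concatenated, space-separated, above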
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) | |||
# Sending the results | |||
return row | |||
# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs | |||
# stores info it needs in different lists, these lists are returned after being organized | |||
# @param: soup object looking at html page of listing page | |||
# return: 'row' that contains a variety of lists that each hold info on the listing page | |||
def metaversemarket_listing_parser(soup): | |||
# Fields to be parsed | |||
nm = 0 # *Total_Products (Should be Integer) | |||
mktName = "MetaVerseMarket" # 0 *Marketplace_Name | |||
vendor = [] # 1 *Vendor y | |||
rating_vendor = [] # 2 Vendor_Rating | |||
success = [] # 3 Vendor_Successful_Transactions | |||
name = [] # 4 *Product_Name y | |||
    CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) don't worry about this
    MS = [] # 6 Product_MS_Classification (Microsoft Security) don't worry about this
category = [] # 7 Product_Category y | |||
describe = [] # 8 Product_Description | |||
views = [] # 9 Product_Number_Of_Views | |||
reviews = [] # 10 Product_Number_Of_Reviews | |||
rating_item = [] # 11 Product_Rating | |||
addDate = [] # 12 Product_AddDate | |||
BTC = [] # 13 Product_BTC_SellingPrice | |||
USD = [] # 14 Product_USD_SellingPrice y | |||
EURO = [] # 15 Product_EURO_SellingPrice | |||
sold = [] # 16 Product_QuantitySold | |||
qLeft = [] # 17 Product_QuantityLeft | |||
shipFrom = [] # 18 Product_ShippedFrom | |||
shipTo = [] # 19 Product_ShippedTo | |||
image = [] # 20 Product_Image | |||
image_vendor = [] # 21 Vendor_Image | |||
href = [] # 22 Product_Links | |||
listing = soup.findAll('div', {"class": "col-12 col-sm-4 col-xl-3 product_item_col p-1"}) | |||
# Populating the Number of Products | |||
nm = len(listing) | |||
for a in listing: | |||
bae = a.findAll('a', href=True) | |||
# Adding the url to the list of urls | |||
link = bae[0].get('href') | |||
link = cleanLink(link) | |||
href.append(link) | |||
# Finding the Product | |||
product = bae[1].find('span', {"class": "text-primary"}).text | |||
name.append(cleanString(product.strip())) | |||
# Finding Prices | |||
price = a.find('strong').text | |||
USD.append(cleanNumbers(price).strip()) | |||
# Finding the Vendor | |||
temp = a.find('div', {'class': "mt-1 fs-12px"}) | |||
temp = temp.findAll('span') | |||
vendor_name = temp[1].find('b').text | |||
vendor.append(cleanString(vendor_name.strip())) | |||
# Finding the Category | |||
cat = a.select_one('div[class="fs-12px"]') | |||
cat = cat.findAll('span')[1].text | |||
cat = cat.strip() | |||
category.append(cat) | |||
ul = a.find('ul', {"class": "product-actions"}) | |||
# Finding Number Sold and Quantity Left | |||
temp = ul.find('span', {'class': "badge bg-success"}).text | |||
temp = temp.split("/") | |||
num = temp[0] | |||
num = num.replace('k', '000') | |||
sold.append(cleanNumbers(num).strip()) | |||
quant = temp[1] | |||
quant = quant.replace('k', '000') | |||
qLeft.append(cleanNumbers(quant).strip()) | |||
        # Finding Description
# description = a.find('p', {'class': "alert alert-light text-ssbold p-1"}).text | |||
# description = description.replace("\n", " ") | |||
# description = description.strip() | |||
# describe.append(cleanString(description)) | |||
# Finding Number of Views | |||
view = ul.find('span', {'class': "badge bg-primary"}).text | |||
view = view.replace('.', '') | |||
view = view.replace('K', '000') | |||
views.append(view.strip()) | |||
# Find where ships from | |||
ships = a.find('div', {'class': "alert alert-info item_alert fs-12px p-1"}) | |||
ships = ships.findAll('b') | |||
sFrom = ships[0].text.strip() | |||
shipFrom.append(sFrom) | |||
# Find where it ships to | |||
sTo = ships[1].text.strip() | |||
shipTo.append(sTo) | |||
# Searching for CVE and MS categories | |||
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) | |||
if not cve: | |||
cveValue = "-1" | |||
else: | |||
cee = " " | |||
for idx in cve: | |||
cee += (idx) | |||
cee += " " | |||
cee = cee.replace(',', ' ') | |||
cee = cee.replace('\n', '') | |||
cveValue = cee | |||
CVE.append(cveValue) | |||
ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) | |||
if not ms: | |||
MSValue = "-1" | |||
else: | |||
me = " " | |||
for im in ms: | |||
me += (im) | |||
me += " " | |||
me = me.replace(',', ' ') | |||
me = me.replace('\n', '') | |||
MSValue = me | |||
MS.append(MSValue) | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, | |||
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) | |||
# called by the crawler to get description links on a listing page | |||
# @param: beautifulsoup object that is using the correct html page (listing page) | |||
# return: list of description links from a listing page | |||
def metaversemarket_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.findAll('div', {"class": "col-12 col-sm-4 col-xl-3 product_item_col p-1"}) | |||
for a in listing: | |||
bae = a.find('a', href=True) | |||
link = bae['href'] | |||
href.append(link) | |||
return href |
@ -1,289 +0,0 @@ | |||
__author__ = 'Helium' | |||
''' | |||
Nexus Market Crawler (Selenium) | |||
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.common.by import By | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, re, time | |||
from datetime import date | |||
import subprocess | |||
import configparser | |||
from bs4 import BeautifulSoup | |||
from MarketPlaces.Initialization.prepare_parser import new_parse | |||
from MarketPlaces.Nexus.parser import nexus_links_parser | |||
from MarketPlaces.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion' | |||
# Opens Tor Browser, crawls the website, then parses, then closes tor | |||
#acts like the main method for the crawler, another function at the end of this code calls this function later | |||
def startCrawling(): | |||
mktName = getMKTName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
input("Press ENTER when page loads after DDOS protection") | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
new_parse(mktName, baseURL, True) | |||
# Returns the name of the website | |||
#return: name of site in string type | |||
def getMKTName(): | |||
name = 'Nexus' | |||
return name | |||
# Return the base link of the website | |||
#return: url of base site in string type | |||
def getFixedURL(): | |||
url = 'http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion' | |||
return url | |||
# Closes Tor Browser | |||
#@param: current selenium driver | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() | |||
time.sleep(3) | |||
return | |||
# Creates FireFox 'driver' and configure its 'Profile' | |||
# to use Tor proxy and socket | |||
def createFFDriver(): | |||
from MarketPlaces.Initialization.markets_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
# ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
# ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", True) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' | |||
#return: return the selenium driver or string 'down' | |||
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url) | |||
return driver | |||
except: | |||
driver.close() | |||
return 'down' | |||
def savePage(driver, page, url): | |||
cleanPage = cleanHTML(driver, page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
#@param: raw url as crawler crawls through every site | |||
def getFullPathName(url): | |||
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE | |||
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isListingLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
return fullPath | |||
# Creates the file name from the passed URL; falls back to the global counter when the cleaned name is empty
#@param: raw url as crawler crawls through every site | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if (name == ''): | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list | |||
#in this example, the links cover product categories such as
# Malware, Hacking/Spam, Services (hacking, programming, remote administration),
# Guides (hacking, malware, fraud) and Fraud Software
def getInterestedLinks(): | |||
links = [] | |||
# malware | |||
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/') | |||
# hacking-spam | |||
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/hacking-spam/') | |||
# hacking services | |||
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/hacking/') | |||
# programming services | |||
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/programacao/') | |||
# remote admin services | |||
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/administracao-remota/') | |||
# hacking guides | |||
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-hacking/') | |||
# malware guides | |||
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-malware/') | |||
# fraud guides | |||
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-fraudes/') | |||
# fraud software | |||
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/fraudes/software-de-fraude/') | |||
return links | |||
# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through | |||
#topic and description pages are crawled through here, where both types of pages are saved | |||
#@param: selenium driver | |||
def crawlForum(driver): | |||
print("Crawling the Nexus market") | |||
linksToCrawl = getInterestedLinks() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
# waiting for btc price to load | |||
try: | |||
WebDriverWait(driver, 1).until(EC.visibility_of_element_located( | |||
(By.XPATH, "/html/body/div[1]/div[2]/div/div/main/ul/li[1]/div/span/span[3]"))) | |||
time.sleep(5) | |||
except: | |||
pass | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
list = productPages(html) | |||
for item in list: | |||
itemURL = urlparse.urljoin(baseURL, str(item)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
# waiting for btc price to load | |||
try: | |||
WebDriverWait(driver, 1).until(EC.visibility_of_element_located( | |||
(By.XPATH, "/html/body/div[1]/div[2]/div/div/main/div[3]/div[2]/p/span[3]"))) | |||
except: | |||
pass | |||
savePage(driver, driver.page_source, item) | |||
driver.back() | |||
# # comment out | |||
# break | |||
# | |||
# # comment out | |||
# if count == 1: | |||
# break | |||
try: | |||
link = driver.find_element(by=By.LINK_TEXT, value='→').get_attribute('href') | |||
if link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
print("Crawling the Nexus market done.") | |||
# Returns 'True' if the link is a description link | |||
#@param: url of any url crawled | |||
#return: true if is a description page, false if not | |||
def isDescriptionLink(url): | |||
if 'produto' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link | |||
#@param: url of any url crawled | |||
#return: true if is a Listing page, false if not | |||
def isListingLink(url): | |||
if 'categoria-produto' in url: | |||
return True | |||
return False | |||
# calling the parser to define the links, the html is the url of a link from the list of interested link list | |||
#@param: link from interested link list ie. getInterestingLinks() | |||
#return: list of description links that should be crawled through | |||
def productPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
return nexus_links_parser(soup) | |||
def crawler(): | |||
startCrawling() | |||
# print("Crawling and Parsing Nexus .... DONE!") | |||
@ -1,236 +0,0 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
import re | |||
usd_to_brl_r = None | |||
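# Nexus lists prices in Brazilian Real, so a USD/BRL rate is requested from the
# operator once (in nexus_listing_parser) and then reused by nexus_description_parser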
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs | |||
#stores info it needs in different lists, these lists are returned after being organized | |||
#@param: soup object looking at html page of description page | |||
#return: 'row' that contains a variety of lists that each hold info on the description page | |||
def nexus_description_parser(soup): | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
image = "-1" # 19 Product_Image | |||
vendor_image = "-1" # 20 Vendor_Image | |||
#finding the name of the product | |||
name_of_product = soup.find("h1", {"class": "product_title entry-title"}).text | |||
name = cleanString(name_of_product.strip()) | |||
# Finding USD Price | |||
real = soup.find('span', {"class": "price"}).find('bdi').text | |||
real = real.split(',') | |||
whole = cleanNumbers(real[0]).replace('.', '') | |||
real = whole + '.' + real[1] | |||
usd = float(real) / usd_to_brl_r | |||
USD = str(round(usd, 2)) | |||
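    # e.g. a price shown as "R$ 1.234,56" with usd_to_brl_r = 5.0 (hypothetical rate)
    # yields USD = "246.91", assuming cleanNumbers() drops the currency symbol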
# Find the BTC Price | |||
prices = soup.find('p', {"class": "price"}).findAll('span', {"class": "cs"}) | |||
if len(prices) > 0: | |||
BTC = prices[0].text | |||
BTC = cleanNumbers(BTC.strip()) | |||
# finding the description of the product | |||
description_div = soup.find("div", {"class": "woocommerce-product-details__short-description"}) | |||
if description_div is None: | |||
describe = "-1" | |||
else: | |||
describe = cleanString(description_div.text.strip()) | |||
# Finding Product Image | |||
image = soup.find('div', {'class': 'woocommerce-product-gallery__wrapper'}).find('img') | |||
image = image.get('src') | |||
image = image.split('base64,')[-1] | |||
#find the category of the product | |||
name_of_category = soup.find("span", {"class": "posted_in"}).find("a").text | |||
category = cleanString(name_of_category.strip()) | |||
#finding the name of the vendor | |||
name_of_vendor = soup.find("div", {"class": "dokan-vendor-name"}).find("h5").text | |||
vendor = cleanString(name_of_vendor) | |||
#finding the vendor's rating | |||
vendorRating = soup.find("div", {"class": "dokan-vendor-rating"}).find("p").text | |||
rating_vendor = cleanString(vendorRating) | |||
#everything else gets a -1 because they are not found | |||
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) | |||
# Sending the results | |||
return row | |||
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs | |||
#stores info it needs in different lists, these lists are returned after being organized | |||
#@param: soup object looking at html page of listing page | |||
#return: 'row' that contains a variety of lists that each hold info on the listing page | |||
def nexus_listing_parser(soup): | |||
global usd_to_brl_r | |||
while usd_to_brl_r is None: | |||
try: | |||
usd_to_brl_r = float(input("1 US Dollar = (Brazilian Real) ")) | |||
except ValueError: | |||
pass | |||
# Fields to be parsed | |||
nm = 0 # *Total_Products (Should be Integer) | |||
mktName = "Nexus" # 0 *Marketplace_Name | |||
vendor = [] # 1 *Vendor y | |||
rating_vendor = [] # 2 Vendor_Rating | |||
success = [] # 3 Vendor_Successful_Transactions | |||
name = [] # 4 *Product_Name y | |||
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = [] # 6 Product_MS_Classification (Microsoft Security) | |||
category = [] # 7 Product_Category y | |||
describe = [] # 8 Product_Description | |||
views = [] # 9 Product_Number_Of_Views | |||
reviews = [] # 10 Product_Number_Of_Reviews | |||
rating_item = [] # 11 Product_Rating | |||
addDate = [] # 12 Product_AddDate | |||
BTC = [] # 13 Product_BTC_SellingPrice | |||
USD = [] # 14 Product_USD_SellingPrice y | |||
EURO = [] # 15 Product_EURO_SellingPrice | |||
sold = [] # 16 Product_QuantitySold | |||
qLeft =[] # 17 Product_QuantityLeft | |||
shipFrom = [] # 18 Product_ShippedFrom | |||
shipTo = [] # 19 Product_ShippedTo | |||
image = [] # 20 Product_Image | |||
image_vendor = [] # 21 Vendor_Image | |||
href = [] # 22 Product_Links | |||
main = soup.find('main', {'id': 'main'}) | |||
products_list = main.find('ul', recursive=False).find_all('li', recursive=False) | |||
nm = len(products_list) | |||
for product in products_list: | |||
# Finding the name of the product | |||
name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text | |||
name_of_product_cleaned = cleanString(name_of_product.strip()) | |||
# print(name_of_product_cleaned) | |||
name.append(name_of_product_cleaned) | |||
#finding the URL | |||
try: | |||
url = product.find("a", class_="woocommerce-loop-product__link").get('href') | |||
href.append(url) | |||
except AttributeError as e: | |||
print("I can't find the link") | |||
raise e | |||
# Finding Product Image | |||
product_image = product.find('a', {'class': 'woocommerce-loop-image-link woocommerce-LoopProduct-link woocommerce-loop-product__link'}).find('img') | |||
product_image = product_image.get('src') | |||
product_image = product_image.split('base64,')[-1] | |||
image.append(product_image) | |||
# Finding USD Price | |||
real = product.find('span', {"class": "price"}).find('bdi').text | |||
real = real.split(',') | |||
whole = cleanNumbers(real[0]).replace('.', '') | |||
real = whole + '.' + real[1] | |||
usd = float(real) / usd_to_brl_r | |||
USD.append(str(round(usd, 2))) | |||
# Finding BTC Price | |||
prices = product.find('span', {"class": "price"}).findAll('span', {"class": "cs"}) | |||
if len(prices) > 0: | |||
price = prices[0].text | |||
BTC.append(cleanNumbers(price.strip())) | |||
#everything else appends a -1 | |||
rating_vendor.append("-1") | |||
vendor.append('-1') | |||
success.append("-1") | |||
CVE.append("-1") | |||
MS.append("-1") | |||
category.append("-1") | |||
describe.append("-1") | |||
views.append("-1") | |||
reviews.append("-1") | |||
addDate.append("-1") | |||
EURO.append("-1") | |||
sold.append("-1") | |||
qLeft.append("-1") | |||
shipFrom.append("-1") | |||
shipTo.append("-1") | |||
image_vendor.append("-1") | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
return organizeProducts( | |||
marketplace = mktName, | |||
nm = nm, | |||
vendor = vendor, | |||
rating_vendor = rating_vendor, | |||
success_vendor = success, | |||
nombre = name, | |||
CVE = CVE, | |||
MS = MS, | |||
category = category, | |||
describe = describe, | |||
views = views, | |||
reviews = reviews, | |||
rating_item = rating_item, | |||
addDate = addDate, | |||
BTC = BTC, | |||
USD = USD, | |||
EURO = EURO, | |||
sold = sold, | |||
qLeft = qLeft, | |||
shipFrom = shipFrom, | |||
shipTo = shipTo, | |||
href = href, | |||
image = image, | |||
image_vendor = image_vendor | |||
) | |||
#called by the crawler to get description links on a listing page | |||
#@param: beautifulsoup object that is using the correct html page (listing page) | |||
#return: list of description links from a listing page | |||
def nexus_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
# Using a shorter, but still unique, class name | |||
listing = soup.find_all("a", class_="woocommerce-loop-product__link") | |||
for a in listing: | |||
link = a.get('href') | |||
if link: # Checks if 'href' attribute is not None | |||
href.append(link) | |||
return href |
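# A minimal, hypothetical test sketch (not part of the pipeline): assigning the
# module-level rate here skips the interactive prompt in nexus_listing_parser;
# the file path below is only a placeholder.
if __name__ == '__main__':
    usd_to_brl_r = 5.0  # hypothetical USD/BRL exchange rate
    with open('example_nexus_listing.html', 'r', encoding='utf-8') as f:
        page_soup = BeautifulSoup(f.read(), 'html.parser')
    print(nexus_links_parser(page_soup))
    print(nexus_listing_parser(page_soup))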
@ -0,0 +1,232 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup
import re
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) | |||
def quest_description_parser(soup): | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
row = soup.find_all('div', {'class': "row"}) | |||
# Finding Product Name | |||
name = row[1].text | |||
name = name.replace('\n', ' ') | |||
name = name.replace(",", "") | |||
name = name.strip() | |||
small = row[3].find_all('small') | |||
# Finding Vendor | |||
vendor = small[0].text | |||
vendor = vendor.replace("Vendor:", "") | |||
vendor = vendor.replace(",", "") | |||
vendor = vendor.strip() | |||
# Finding Vendor Rating | |||
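    # The rating widget renders one "fas fa-star" icon per full star plus an optional half-star
    # icon, so the numeric score is reconstructed by counting icons (e.g. 4 full + 1 half -> 4.5)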
full_stars = small[2].find_all('i', {'class': "fas fa-star"}) | |||
half_star = small[2].find('i', {'class': "fas fa-star-half-alt"}) | |||
rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0) | |||
# Finding Successful Transactions | |||
success = small[4].text | |||
success = success.replace("Total Sales:", "") | |||
success = success.strip() | |||
small = row[2].find('p', {'class': "text-left"}).find_all('small') | |||
# Finding Prices | |||
USD = small[1].text | |||
USD = USD.replace("$", "") | |||
USD = USD.strip() | |||
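    # For physical listings the shipping line appears to hold origin and destination as two
    # whitespace-separated tokens; digital listings keep the "-1" defaults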
shipping_info = row[2].find('p', {'class': "text-left"}).find('span').text.strip() | |||
if "Digital" not in shipping_info: | |||
shipping_info = shipping_info.split(" ") | |||
# Finding Shipment Information (Origin) | |||
shipFrom = shipping_info[0].strip() | |||
# Finding Shipment Information (Destination) | |||
shipTo = shipping_info[1].strip() | |||
textarea = row[2].find_all('textarea') | |||
# Finding the Product description | |||
describe = textarea[0].text | |||
describe = describe.replace("\n", " ") | |||
describe = describe.replace("\r", " ") | |||
describe = describe.strip() | |||
''' | |||
# Finding the Number of Product Reviews | |||
tag = soup.findAll(text=re.compile('Reviews')) | |||
for index in tag: | |||
reviews = index | |||
par = reviews.find('(') | |||
if par >=0: | |||
reviews = reviews.replace("Reviews (","") | |||
reviews = reviews.replace(")","") | |||
reviews = reviews.split(",") | |||
review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) | |||
else : | |||
review = "-1" | |||
''' | |||
# Searching for CVE and MS categories | |||
    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if cve: | |||
CVE = " " | |||
for idx in cve: | |||
CVE += (idx) | |||
CVE += " " | |||
CVE = CVE.replace(',', ' ') | |||
CVE = CVE.replace('\n', '') | |||
    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if ms: | |||
MS = " " | |||
for im in ms: | |||
MS += (im) | |||
MS += " " | |||
MS = MS.replace(',', ' ') | |||
MS = MS.replace('\n', '') | |||
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo) | |||
# Sending the results | |||
return row | |||
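# Illustrative only: the CVE/MS scans above and below repeat the same pattern across this
# project's parsers. A shared helper along these lines could replace them; the name is
# hypothetical and the function is not referenced by the existing code.
def scan_classifications(tag, pattern):
    # Collect every text node matching the pattern and join them the same way the inline
    # scans do, or return the project's "-1" sentinel when nothing matches
    hits = tag.findAll(text=re.compile(pattern))
    if not hits:
        return "-1"
    value = " "
    for hit in hits:
        value += hit
        value += " "
    return value.replace(',', ' ').replace('\n', '')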
# This is the method to parse the Listing Pages | |||
def quest_listing_parser(soup): | |||
# Fields to be parsed | |||
nm = 0 # *Total_Products (Should be Integer) | |||
mktName = "Quest" # 0 *Marketplace_Name | |||
vendor = [] # 1 *Vendor y | |||
rating_vendor = [] # 2 Vendor_Rating | |||
success = [] # 3 Vendor_Successful_Transactions | |||
name = [] # 4 *Product_Name y | |||
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = [] # 6 Product_MS_Classification (Microsoft Security) | |||
category = [] # 7 Product_Category y | |||
describe = [] # 8 Product_Description | |||
views = [] # 9 Product_Number_Of_Views | |||
reviews = [] # 10 Product_Number_Of_Reviews | |||
rating_item = [] # 11 Product_Rating | |||
addDate = [] # 12 Product_AddDate | |||
BTC = [] # 13 Product_BTC_SellingPrice | |||
USD = [] # 14 Product_USD_SellingPrice y | |||
EURO = [] # 15 Product_EURO_SellingPrice | |||
sold = [] # 16 Product_QuantitySold | |||
qLeft =[] # 17 Product_QuantityLeft | |||
shipFrom = [] # 18 Product_ShippedFrom | |||
shipTo = [] # 19 Product_ShippedTo | |||
href = [] # 20 Product_Links | |||
# Finding category of listing page | |||
cat = soup.find('span', {'class': "btn btn-sm btn-outline-mgray active border-info"}).text | |||
cat = cat.replace("Digital -", "") | |||
cat = cat.strip() | |||
listing = soup.find_all('div', {"class": "col-md-2 my-md-0 col-12"}) | |||
# Populating the Number of Products | |||
nm = len(listing) | |||
for a in listing: | |||
bae = a.find_all('a', href=True) | |||
# Adding the category | |||
category.append(cat) | |||
# Adding the url to the list of urls | |||
link = bae[0].get('href') | |||
link = cleanLink(link) | |||
href.append(link) | |||
# Finding the Vendor | |||
vendor_name = bae[2].text | |||
vendor_name = vendor_name.replace(",", "") | |||
vendor_name = vendor_name.strip() | |||
vendor.append(vendor_name) | |||
# Finding the Product | |||
product = bae[1].find('img').get('alt') | |||
product = product.replace('\n', ' ') | |||
product = product.replace(",", "") | |||
product = product.strip() | |||
name.append(product) | |||
# Searching for CVE and MS categories | |||
        cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if not cve: | |||
cveValue="-1" | |||
else: | |||
cee = " " | |||
for idx in cve: | |||
cee += (idx) | |||
cee += " " | |||
cee = cee.replace(',', ' ') | |||
cee = cee.replace('\n', '') | |||
cveValue=cee | |||
CVE.append(cveValue) | |||
        ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if not ms: | |||
MSValue="-1" | |||
else: | |||
me = " " | |||
for im in ms: | |||
me += (im) | |||
me += " " | |||
me = me.replace(',', ' ') | |||
me = me.replace('\n', '') | |||
MSValue=me | |||
MS.append(MSValue) | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, | |||
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) | |||
def quest_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
    listing = soup.findAll('div', {"class": "col-md-2 my-md-0 col-12"})
    for div in listing:
        a_tag = div.find('a', href=True)
        if a_tag is not None:  # skip cards that have no product link
            href.append(a_tag['href'])
return href |
@ -1,256 +0,0 @@ | |||
__author__ = 'chris' | |||
''' | |||
RobinhoodMarket Market Crawler (Selenium) | |||
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.common.by import By | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, re, time | |||
import subprocess | |||
import configparser | |||
from bs4 import BeautifulSoup | |||
from MarketPlaces.Initialization.prepare_parser import new_parse | |||
from MarketPlaces.RobinhoodMarket.parser import Robinhood_links_parser | |||
from MarketPlaces.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/' | |||
# Opens Tor Browser, crawls the website | |||
def startCrawling(): | |||
marketName = getMKTName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
# Captcha | |||
input("Press ENTER when website has loaded") | |||
# Robinhood doesn't need login | |||
# login(driver) | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
new_parse(marketName, baseURL, True) | |||
# Login is not needed in Robinhood | |||
def login(driver): | |||
pass | |||
# Returns the name of the website | |||
def getMKTName(): | |||
name = 'RobinhoodMarket' | |||
return name | |||
# Return the link of the website | |||
def getFixedURL(): | |||
url = 'http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/' | |||
return url | |||
# Closes Tor Browser | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.quit() | |||
time.sleep(3) | |||
return | |||
# Creates FireFox 'driver' and configure its 'Profile' | |||
# to use Tor proxy and socket | |||
def createFFDriver(): | |||
from MarketPlaces.Initialization.markets_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
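    # Route all traffic through the local Tor SOCKS5 proxy (Tor Browser's default port 9150)
    # and resolve DNS through the proxy so .onion lookups never leave Tor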
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", False) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url) | |||
return driver | |||
except: | |||
driver.close() | |||
return 'down' | |||
# Saves the crawled html page | |||
def savePage(driver, page, url): | |||
cleanPage = cleanHTML(driver, page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
def getFullPathName(url): | |||
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE | |||
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
return fullPath | |||
# Creates the file name from passed URL | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if name == '': | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
def getInterestedLinks(): | |||
links = [] | |||
# Hacking | |||
links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/hacking/') | |||
# Other Software | |||
links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/') | |||
return links | |||
def crawlForum(driver): | |||
print("Crawling the Robinhood market") | |||
linksToCrawl = getInterestedLinks() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
                productLinks = productPages(html)
                for c, item in enumerate(productLinks):
itemURL = urlparse.urljoin(baseURL, str(item)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
savePage(driver, driver.page_source, item) | |||
driver.back() | |||
# comment out | |||
# if c == 4: | |||
# break | |||
# comment out | |||
# if count == 1: | |||
# break | |||
# go to next page of market | |||
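                # WooCommerce-style pagination: the "next page-numbers" anchor carries the URL of
                # the following listing page; when it is missing the category has been fully crawled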
try: | |||
nav = driver.find_element(by=By.XPATH, value="//a[@class='next page-numbers']") | |||
link = nav.get_attribute('href') | |||
if link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
print("Crawling the Robinhood market done.") | |||
# Returns 'True' if the link is Topic link | |||
def isDescriptionLink(url): | |||
if 'product' in url and 'category' not in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link | |||
def isListingLink(url): | |||
    if 'product-category' in url:
return True | |||
return False | |||
# calling the parser to define the links | |||
def productPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
return Robinhood_links_parser(soup) | |||
def crawler(): | |||
startCrawling() | |||
    # print("Crawling and Parsing RobinhoodMarket .... DONE!")
if __name__ == '__main__': | |||
startCrawling() |
@ -1,334 +0,0 @@ | |||
__author__ = 'chris' | |||
import re | |||
import traceback | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
# Import for test run | |||
import glob | |||
import os | |||
import codecs | |||
import shutil | |||
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) | |||
def Robinhood_description_parser(soup): | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
image = "-1" # 19 Product_Image | |||
vendor_image = "-1" # 20 Vendor_Image | |||
# Finding Product Name | |||
name = soup.find('h1').text | |||
name = name.replace('\n', ' ') | |||
name = name.replace(",", "") | |||
name = name.strip() | |||
# Finding description | |||
desc = '' | |||
tab = soup.find('div', {"id": "tab-description"}) | |||
if tab is not None: | |||
for p in tab.findAll('p'): | |||
desc += p.text | |||
if desc == '': | |||
short = soup.find('div', {"class": "woocommerce-product-details__short-description"}) | |||
if short is not None: | |||
desc = short.text | |||
describe = cleanString(desc.strip()) | |||
# Finding Product Image | |||
image = soup.find('div', {'class': 'woocommerce-product-gallery__wrapper'}).find('img') | |||
image = image.get('src') | |||
image = image.split('base64,')[-1] | |||
# Finding Vendor | |||
vendor = soup.find('a', {'class': 'wcfm_dashboard_item_title'}).text | |||
vendor = vendor.replace(",", "") | |||
vendor = vendor.replace("Sold by:", "") | |||
vendor = vendor.strip() | |||
# Finding Vendor Image | |||
vendor_image = soup.find('div', {'class': 'wcfmmp_sold_by_container_left'}).find('img') | |||
vendor_image = vendor_image.get('src') | |||
vendor_image = vendor_image.split('base64,')[-1] | |||
# Finding Category | |||
catSpan = soup.find('span', {'class': 'posted_in'}) | |||
category = catSpan.find('a').text | |||
# Finding USD | |||
priceText = soup.find('p', {'class': 'price'}).text | |||
USD = str(priceText).strip() | |||
# Searching for CVE and MS categories | |||
    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if cve: | |||
CVE = " " | |||
for idx in cve: | |||
CVE += (idx) | |||
CVE += " " | |||
CVE = CVE.replace(',', ' ') | |||
CVE = CVE.replace('\n', '') | |||
    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if ms: | |||
MS = " " | |||
for im in ms: | |||
MS += (im) | |||
MS += " " | |||
MS = MS.replace(',', ' ') | |||
MS = MS.replace('\n', '') | |||
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) | |||
# Sending the results | |||
return row | |||
# This is the method to parse the Listing Pages | |||
def Robinhood_listing_parser(soup): | |||
# Fields to be parsed | |||
nm = 0 # *Total_Products (Should be Integer) | |||
mktName = "RobinhoodMarket" # 0 *Marketplace_Name | |||
vendor = [] # 1 *Vendor y | |||
rating_vendor = [] # 2 Vendor_Rating | |||
success = [] # 3 Vendor_Successful_Transactions | |||
name = [] # 4 *Product_Name y | |||
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = [] # 6 Product_MS_Classification (Microsoft Security) | |||
category = [] # 7 Product_Category y | |||
describe = [] # 8 Product_Description | |||
views = [] # 9 Product_Number_Of_Views | |||
reviews = [] # 10 Product_Number_Of_Reviews | |||
rating_item = [] # 11 Product_Rating | |||
addDate = [] # 12 Product_AddDate | |||
BTC = [] # 13 Product_BTC_SellingPrice | |||
USD = [] # 14 Product_USD_SellingPrice y | |||
EURO = [] # 15 Product_EURO_SellingPrice | |||
sold = [] # 16 Product_QuantitySold | |||
qLeft =[] # 17 Product_QuantityLeft | |||
shipFrom = [] # 18 Product_ShippedFrom | |||
shipTo = [] # 19 Product_ShippedTo | |||
image = [] # 20 Product_Image | |||
image_vendor = [] # 21 Vendor_Image | |||
href = [] # 22 Product_Links | |||
listing = soup.find('ul', {"class": "products columns-4"}) | |||
items = listing.findAll('li') | |||
# Populating the Number of Products | |||
nm = len(items) | |||
for card in items: | |||
# Finding Category | |||
cat = soup.find("h1").text | |||
cat = cat.replace('\n', ' ') | |||
cat = cat.replace(",", "") | |||
cat = cat.strip() | |||
category.append(cat) | |||
bae = card.findAll('a') | |||
# Adding the url to the list of urls | |||
link = card.find('a').get('href') | |||
href.append(link) | |||
# Finding Product Name | |||
product = card.find("h2").text | |||
product = product.replace('\n', ' ') | |||
product = product.replace(",", "") | |||
product = product.strip() | |||
name.append(product) | |||
# Finding Product Image | |||
product_image = card.find('a').find('img') | |||
product_image = product_image.get('src') | |||
product_image = product_image.split('base64,')[-1] | |||
image.append(product_image) | |||
info = card.find('div', {'class': 'wcfmmp_sold_by_container'}) | |||
# Finding Vendor | |||
        vendor_name = info.find('a', {'class': 'wcfm_dashboard_item_title'}).text
vendor_name = vendor_name.replace(",", "") | |||
vendor_name = vendor_name.strip() | |||
vendor.append(vendor_name) | |||
# Finding Vendor Image | |||
        vendor_icon = info.find('img', {'class': 'wcfmmp_sold_by_logo'})
vendor_icon = vendor_icon.get('src') | |||
vendor_icon = vendor_icon.split('base64,')[-1] | |||
image_vendor.append(vendor_icon) | |||
# Finding USD | |||
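        # Inside the price <bdi>, the currency-symbol <span> is followed by the amount as a bare
        # text node, so next_sibling yields just the numeric part of the price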
span = card.find('span', {'class': 'price'}) | |||
if span is not None: | |||
bdi = span.find('bdi') | |||
usdText = bdi.find('span').next_sibling | |||
usdVal = usdText.text | |||
else: | |||
usdVal = "0" | |||
USD.append(usdVal) | |||
# Searching for CVE and MS categories | |||
        cve = card.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if not cve: | |||
cveValue="-1" | |||
else: | |||
cee = " " | |||
for idx in cve: | |||
cee += (idx) | |||
cee += " " | |||
cee = cee.replace(',', ' ') | |||
cee = cee.replace('\n', '') | |||
cveValue=cee | |||
CVE.append(cveValue) | |||
        ms = card.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if not ms: | |||
MSValue="-1" | |||
else: | |||
me = " " | |||
for im in ms: | |||
me += (im) | |||
me += " " | |||
me = me.replace(',', ' ') | |||
me = me.replace('\n', '') | |||
MSValue=me | |||
MS.append(MSValue) | |||
#print(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, | |||
# reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, | |||
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) | |||
def Robinhood_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
#list = soup.findAll('div', {"class": "woocommerce columns-4"}) | |||
listing = soup.find('ul', {"class": "products columns-4"}).findAll('li') | |||
for item in listing: | |||
link = item.find('a')['href'] | |||
href.append(link) | |||
return href | |||
if __name__ == '__main__': | |||
nError = 0 | |||
marketPlace = 'RobinhoodMarket' | |||
lines = [] # listing pages | |||
lns = [] # description pages | |||
detPage = {} | |||
''' | |||
# reading description pages | |||
count = 0 | |||
for fileDescription in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\08082023\\Description", '*.html')): | |||
count += 1 | |||
lns.append(fileDescription) | |||
# if count > 5: | |||
# break | |||
for index, line2 in enumerate(lns): | |||
print("Reading description folder of '" + marketPlace + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns))) | |||
try: | |||
html = codecs.open(line2.strip('\n'), encoding='utf8') | |||
soup = BeautifulSoup(html, "html.parser") | |||
html.close() | |||
except: | |||
try: | |||
html = open(line2.strip('\n')) | |||
soup = BeautifulSoup(html, "html.parser") | |||
html.close() | |||
except: | |||
nError += 1 | |||
print("There was a problem to read the file " + line2 + " in the Description section!") | |||
# if createLog: | |||
# logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section.\n") | |||
continue | |||
try: | |||
print(Robinhood_description_parser(soup)) | |||
except: | |||
traceback.print_exc() | |||
print("There was a problem to parse the file " + line2 + " in the Description section!") | |||
''' | |||
# reading listing pages | |||
count = 0 | |||
for fileListing in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\08082023\\Listing", '*.html')): | |||
count += 1 | |||
lines.append(fileListing) | |||
#if count > 1: | |||
# break | |||
for index, line1 in enumerate(lines): | |||
print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(line1) + "', index= " + str(index + 1) + " ... " + str(len(lines))) | |||
readError = False | |||
try: | |||
html = codecs.open(line1.strip('\n'), encoding='utf8') | |||
soup = BeautifulSoup(html, "html.parser") | |||
html.close() | |||
except: | |||
try: | |||
html = open(line1.strip('\n')) | |||
soup = BeautifulSoup(html, "html.parser") | |||
html.close() | |||
except: | |||
                print("There was a problem reading the file " + line1 + " in the Listing section!")
readError = True | |||
if not readError: | |||
parseError = False | |||
try: | |||
                test = Robinhood_listing_parser(soup)
                print(test)
except: | |||
traceback.print_exc() | |||
                print("There was a problem parsing the file " + line1 + " in the Listing section!")
parseError = True | |||
print("DONE") |
@ -1,190 +0,0 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from typing import List, Tuple | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup, ResultSet, Tag | |||
def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple: | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
image = "-1" # 19 Product_Image | |||
vendor_image = "-1" # 20 Vendor_Image | |||
name = soup.find("h1", {'class': 'title'}).text | |||
name = cleanString(name.strip()) | |||
describe = soup.find('div', {'id': 'descriptionContent'}).text | |||
describe = cleanString(describe.strip()) | |||
# Finding Product Image | |||
image = soup.find('div', {'class': 'product_img_big'}).find('img') | |||
image = image.get('src') | |||
image = image.split('base64,')[-1] | |||
commentListTag: Tag = soup.find('ul', {'class': 'comment_list scrollbar'}) | |||
commentList = commentListTag.find_all('li') | |||
    reviews = str(len(commentList))
citySelection: str = soup.find('ul', {'class': 'meta text-muted i_location'}).text | |||
shipFrom = cleanString(citySelection.strip()) | |||
vendor = soup.find('h1', {'class': 'title over'}).text | |||
vendor = cleanString(vendor.strip()) | |||
usdTag: Tag = soup.find('div', {'class': 'product_price__big'}).find('span') | |||
    usdText = usdTag.text.split('/')[0]
# usdText format: "<value> USD " (i.e., "70 000 USD ") | |||
USD = cleanString(usdText.replace("USD", "").strip()) | |||
ratingDiv = soup.find('div', {'class': 'rating_star'}) | |||
rating_vendor = ratingDiv.get('title').split(' ')[1] | |||
rating_item = soup.find('div', {'class': 'product_rate'}).text | |||
rating_item = rating_item.replace("rating", "") | |||
rating_item = cleanString(rating_item.strip()) | |||
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) | |||
# Sending the results | |||
return row | |||
def thiefWorld_listing_parser(soup: BeautifulSoup): | |||
# Fields to be parsed | |||
nm = 0 # Total_Products (Should be Integer) | |||
mktName = "ThiefWorld" # 0 Marketplace_Name | |||
vendor = [] # 1 *Vendor y | |||
rating_vendor = [] # 2 Vendor_Rating | |||
success = [] # 3 Vendor_Successful_Transactions | |||
name = [] # 4 *Product_Name y | |||
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = [] # 6 Product_MS_Classification (Microsoft Security) | |||
category = [] # 7 Product_Category y | |||
describe = [] # 8 Product_Description | |||
views = [] # 9 Product_Number_Of_Views | |||
reviews = [] # 10 Product_Number_Of_Reviews | |||
rating_item = [] # 11 Product_Rating | |||
addDate = [] # 12 Product_AddDate | |||
BTC = [] # 13 Product_BTC_SellingPrice | |||
USD = [] # 14 Product_USD_SellingPrice y | |||
EURO = [] # 15 Product_EURO_SellingPrice | |||
sold = [] # 16 Product_QuantitySold | |||
qLeft =[] # 17 Product_QuantityLeft | |||
shipFrom = [] # 18 Product_ShippedFrom | |||
shipTo = [] # 19 Product_ShippedTo | |||
image = [] # 20 Product_Image | |||
image_vendor = [] # 21 Vendor_Image | |||
href = [] # 22 Product_Links | |||
productList: ResultSet[Tag] = soup.find_all('div', {'class': 'catalog_item'}) | |||
nm = len(productList) | |||
for product in productList: | |||
productTitle: Tag = product.find('div', {'class': 'title'}).find('a') | |||
productName = cleanString(productTitle.text.strip()) | |||
name.append(productName) | |||
# Finding Product Image | |||
product_image = product.find('noscript').find('img') | |||
product_image = product_image.get('src') | |||
product_image = product_image.split('base64,')[-1] | |||
image.append(product_image) | |||
productHref = productTitle.get('href') | |||
href.append(productHref) | |||
CVE.append('-1') | |||
MS.append('-1') | |||
cat = soup.find('calsys-cat').text | |||
category.append(cat.strip()) | |||
productDescription = product.find('div', {'class': 'text'}).text | |||
productDescription = cleanString(productDescription.strip()) | |||
describe.append(productDescription) | |||
views.append('-1') | |||
reviews.append('-1') | |||
addDate.append('-1') | |||
BTC.append('-1') | |||
priceText = product.find('span', {'class': 'price'}).find('span').text | |||
priceText = priceText.split('USD')[0] | |||
priceText = cleanString(priceText.strip()) | |||
USD.append(priceText) | |||
EURO.append('-1') | |||
sold.append('-1') | |||
qLeft.append('-1') | |||
shipFrom.append('-1') | |||
shipTo.append('-1') | |||
productVendor = product.find('div', {'class': 'market over'}).find('a').text | |||
productVendor = cleanString(productVendor.strip()) | |||
vendor.append(productVendor) | |||
image_vendor.append('-1') | |||
rating_vendor.append('-1') | |||
#rating_item.append('-1') | |||
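        # The item rating is rendered as a yellow-star overlay whose CSS width presumably encodes
        # the score as a percentage of five stars (e.g. "width: 80%" ~ 4/5); the raw width string is stored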
rating = product.find('div', {'class': 'rating_star_yellow'}).attrs.get('style') | |||
rating = rating.replace("width: ", "") | |||
rating_item.append(cleanString(rating)) | |||
success.append('-1') | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, | |||
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) | |||
#called by the crawler to get description links on a listing page | |||
#@param: beautifulsoup object that is using the correct html page (listing page) | |||
#return: list of description links from a listing page | |||
def thiefworld_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.find('div', {"class": "row tile__list tileitems_filter pad15 tileproduct__list"}).findAll('div', {"class": "desc"}) | |||
for a in listing: | |||
bae = a.find('div', {"class": "title"}).find('a', href=True) | |||
link = bae['href'] | |||
href.append(link) | |||
return href |
@ -1,268 +0,0 @@ | |||
__author__ = 'Helium' | |||
''' | |||
TorBay Marketplace Crawler (Selenium)
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.common.by import By | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, time | |||
from datetime import date | |||
import subprocess
import configparser
from bs4 import BeautifulSoup | |||
from MarketPlaces.Initialization.prepare_parser import new_parse | |||
from MarketPlaces.TorBay.parser import torbay_links_parser | |||
from MarketPlaces.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion/' | |||
# Opens Tor Browser, crawls the website, then parses, then closes tor | |||
#acts like the main method for the crawler, another function at the end of this code calls this function later | |||
def startCrawling(): | |||
mktName = getMKTName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
login(driver) | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
new_parse(mktName, baseURL, True) | |||
# Returns the name of the website | |||
#return: name of site in string type | |||
def getMKTName(): | |||
name = 'TorBay' | |||
return name | |||
# Return the base link of the website | |||
#return: url of base site in string type | |||
def getFixedURL(): | |||
url = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion/' | |||
return url | |||
# Closes Tor Browser | |||
#@param: current selenium driver | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() | |||
time.sleep(3) | |||
return | |||
# Creates FireFox 'driver' and configure its 'Profile' | |||
# to use Tor proxy and socket | |||
def createFFDriver(): | |||
from MarketPlaces.Initialization.markets_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", True) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' | |||
#return: return the selenium driver or string 'down' | |||
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url) | |||
return driver | |||
except: | |||
driver.close() | |||
return 'down' | |||
# Waits for a specific element to load so we know the landing page (and any manual captcha) has finished rendering
# No captcha input is automated here; solve any challenge manually in the browser before the wait times out
#@param: current selenium web driver
def login(driver): | |||
# wait for page to show up (This Xpath may need to change based on different seed url) | |||
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( | |||
(By.XPATH, "/html/body/div[2]/div/div/div/ul/li[6]/a"))) | |||
# Saves the crawled html page, makes the directory path for html pages if not made | |||
def savePage(driver, page, url): | |||
cleanPage = cleanHTML(driver, page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
#@param: raw url as crawler crawls through every site | |||
def getFullPathName(url): | |||
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE | |||
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
return fullPath | |||
# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned | |||
#@param: raw url as crawler crawls through every site | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if (name == ''): | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
#in this example, only the Hacking category of products is crawled; other categories can be added to the list
def getInterestedLinks(): | |||
links = [] | |||
# Hacking | |||
links.append('http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion/category/hacking') | |||
return links | |||
# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through | |||
#topic and description pages are crawled through here, where both types of pages are saved | |||
#@param: selenium driver | |||
def crawlForum(driver): | |||
print("Crawling the TorBay Market") | |||
linksToCrawl = getInterestedLinks() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
                productLinks = productPages(html)
                for item in productLinks:
itemURL = urlparse.urljoin(baseURL, str(item)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
savePage(driver, driver.page_source, item) | |||
driver.back() | |||
# # comment out | |||
# break | |||
# | |||
# # comment out | |||
# if count == 1: | |||
# break | |||
try: | |||
nav = driver.find_element(by=By.XPATH, value='/html/body/section/div/div/div[2]/div/div[2]/ul') | |||
link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href') | |||
if link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
print("Crawling the TorBay market done.") | |||
# Returns 'True' if the link is a description link | |||
#@param: url of any url crawled | |||
#return: true if is a description page, false if not | |||
def isDescriptionLink(url): | |||
if 'product' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link | |||
#@param: url of any url crawled | |||
#return: true if is a Listing page, false if not | |||
def isListingLink(url): | |||
if 'category' in url: | |||
return True | |||
return False | |||
# calling the parser to define the links, the html is the url of a link from the list of interested link list | |||
#@param: link from interested link list ie. getInterestingLinks() | |||
#return: list of description links that should be crawled through | |||
def productPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
return torbay_links_parser(soup) | |||
def crawler(): | |||
startCrawling() | |||
    # print("Crawling and Parsing TorBay .... DONE!")
@ -1,183 +0,0 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs | |||
#stores info it needs in different lists, these lists are returned after being organized | |||
#@param: soup object looking at html page of description page | |||
#return: 'row' that contains a variety of lists that each hold info on the description page | |||
def torbay_description_parser(soup): | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
image = "-1" # 19 Product_Image | |||
vendor_image = "-1" # 20 Vendor_Image | |||
# Finding Product Name | |||
try: | |||
product_name = soup.find('div', {'class': 'product-information'}).find('h1').text | |||
name = cleanString(product_name.strip()) | |||
except: | |||
product_name = soup.find('div', {'class': 'profile-info'}).find('h2').text | |||
name = cleanString(product_name.strip()) | |||
    # Finding Vendor
vendor_name = soup.find('div', {"class": "profile-info"}).find('h2').text | |||
vendor = cleanString(vendor_name.strip()) | |||
# Finding Vendor Image | |||
vendor_image = soup.find('div', {'class': 'avatar'}).find('img') | |||
vendor_image = vendor_image.get('src') | |||
vendor_image = vendor_image.split('base64,')[-1] | |||
# Finding Prices | |||
USD = soup.find('div', {'class': "total-price"}).find('span').text.strip() | |||
# Finding the Product Category | |||
cat = soup.find('div', {'class': "profile-info"}).find('p').text | |||
category = cleanString(cat.strip()) | |||
# Finding the Product description | |||
try: | |||
describe = soup.find('div', {'class': "info"}).find('p').text | |||
if "\n" in describe: | |||
describe = describe.replace("\n", " ") | |||
describe = describe.replace("\r", " ") | |||
describe = cleanString(describe.strip()) | |||
except: | |||
# print("product desc") | |||
describe = soup.find('div', {'class': 'info'}).text | |||
describe = cleanString(describe.strip()) | |||
# Finding Product Image | |||
image = soup.find('div', {'class': 'image text-center'}).find('img') | |||
image = image.get('src') | |||
image = image.split('base64,')[-1] | |||
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) | |||
# Sending the results | |||
return row | |||
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs | |||
#stores info it needs in different lists, these lists are returned after being organized | |||
#@param: soup object looking at html page of listing page | |||
#return: 'row' that contains a variety of lists that each hold info on the listing page | |||
def torbay_listing_parser(soup): | |||
# Fields to be parsed | |||
nm = 0 # *Total_Products (Should be Integer) | |||
mktName = "TorBay" # 0 *Marketplace_Name | |||
vendor = [] # 1 *Vendor y | |||
rating_vendor = [] # 2 Vendor_Rating | |||
success = [] # 3 Vendor_Successful_Transactions | |||
name = [] # 4 *Product_Name y | |||
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = [] # 6 Product_MS_Classification (Microsoft Security) | |||
category = [] # 7 Product_Category y | |||
describe = [] # 8 Product_Description | |||
views = [] # 9 Product_Number_Of_Views | |||
reviews = [] # 10 Product_Number_Of_Reviews | |||
rating_item = [] # 11 Product_Rating | |||
addDate = [] # 12 Product_AddDate | |||
BTC = [] # 13 Product_BTC_SellingPrice | |||
USD = [] # 14 Product_USD_SellingPrice y | |||
EURO = [] # 15 Product_EURO_SellingPrice | |||
sold = [] # 16 Product_QuantitySold | |||
qLeft =[] # 17 Product_QuantityLeft | |||
shipFrom = [] # 18 Product_ShippedFrom | |||
shipTo = [] # 19 Product_ShippedTo | |||
image = [] # 20 Product_Image | |||
image_vendor = [] # 21 Vendor_Image | |||
href = [] # 22 Product_Links | |||
listing = soup.findAll('div', {"class": "product-card"}) | |||
# Populating the Number of Products | |||
nm = len(listing) | |||
for a in listing: | |||
product_name = a.find('p', {'class': 'name'}).text | |||
name.append(cleanString(product_name.strip())) | |||
# Finding Product Image | |||
image.append("-1") | |||
prod = a.find('p', {'class': 'price'}).text # price | |||
USD.append(cleanString(prod.strip())) | |||
ven = a.find('div', {'class': 'pc-footer'}).find('div').find('a').text # pc-footer | |||
vendor.append(cleanString(ven.strip())) | |||
# print(ven) | |||
# Finding Vendor Image | |||
image_vendor.append("-1") | |||
h = a.find('p', {'class': 'name'}).find('a').get('href') | |||
href.append(h) | |||
CVE.append("-1") | |||
MS.append("-1") | |||
rating_vendor.append("-1") | |||
success.append("-1") | |||
describe.append("-1") | |||
views.append("-1") | |||
reviews.append("-1") | |||
rating_item.append("-1") | |||
addDate.append("-1") | |||
BTC.append("-1") | |||
EURO.append("-1") | |||
sold.append("-1") | |||
qLeft.append("-1") | |||
shipFrom.append("-1") | |||
shipTo.append("-1") | |||
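        # Only the Hacking category is crawled (see getInterestedLinks in the TorBay crawler),
        # so the category is hard-coded rather than read from the page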
category.append("Hacking") | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, | |||
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) | |||
#called by the crawler to get description links on a listing page | |||
#@param: beautifulsoup object that is using the correct html page (listing page) | |||
#return: list of description links from a listing page | |||
def torbay_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.find('section', {"id": "content"}).findAll('div', {"class": "product-card"}) | |||
for a in listing: | |||
bae = a.find('div', {"class": "pc-footer"}).find('a', {"class": "btn btn-primary"}, href=True) | |||
link = bae['href'] | |||
href.append(link) | |||
return href |
@ -1,277 +0,0 @@ | |||
__author__ = 'Helium' | |||
''' | |||
TorMarket Marketplace Crawler (Selenium)
''' | |||
from selenium import webdriver | |||
from selenium.common.exceptions import NoSuchElementException | |||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | |||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary | |||
from selenium.webdriver.firefox.service import Service | |||
from selenium.webdriver.support.ui import WebDriverWait | |||
from selenium.webdriver.support import expected_conditions as EC | |||
from selenium.webdriver.common.by import By | |||
from PIL import Image | |||
import urllib.parse as urlparse | |||
import os, re, time | |||
from datetime import date | |||
import subprocess | |||
import configparser | |||
from bs4 import BeautifulSoup | |||
from MarketPlaces.Initialization.prepare_parser import new_parse | |||
from MarketPlaces.TorMarket.parser import tormarket_links_parser | |||
from MarketPlaces.Utilities.utilities import cleanHTML | |||
counter = 1 | |||
baseURL = 'http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/' | |||
# Opens Tor Browser, crawls the website, then parses, then closes tor | |||
#acts like the main method for the crawler, another function at the end of this code calls this function later | |||
def startCrawling(): | |||
mktName = getMKTName() | |||
driver = getAccess() | |||
if driver != 'down': | |||
try: | |||
crawlForum(driver) | |||
except Exception as e: | |||
print(driver.current_url, e) | |||
closeDriver(driver) | |||
new_parse(mktName, baseURL, True) | |||
# Returns the name of the website | |||
#return: name of site in string type | |||
def getMKTName(): | |||
name = 'TorMarket' | |||
return name | |||
# Return the base link of the website | |||
#return: url of base site in string type | |||
def getFixedURL(): | |||
url = 'http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/' | |||
return url | |||
# Closes Tor Browser | |||
#@param: current selenium driver | |||
def closeDriver(driver): | |||
# global pid | |||
# os.system("taskkill /pid " + str(pro.pid)) | |||
# os.system("taskkill /t /f /im tor.exe") | |||
print('Closing Tor...') | |||
driver.close() | |||
time.sleep(3) | |||
return | |||
# Creates FireFox 'driver' and configure its 'Profile' | |||
# to use Tor proxy and socket | |||
def createFFDriver(): | |||
from MarketPlaces.Initialization.markets_mining import config | |||
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) | |||
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) | |||
ff_prof.set_preference("places.history.enabled", False) | |||
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) | |||
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) | |||
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) | |||
ff_prof.set_preference("signon.rememberSignons", False) | |||
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) | |||
# ff_prof.set_preference("network.dns.disablePrefetch", True) | |||
# ff_prof.set_preference("network.http.sendRefererHeader", 0) | |||
ff_prof.set_preference("permissions.default.image", 3) | |||
ff_prof.set_preference("browser.download.folderList", 2) | |||
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) | |||
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") | |||
ff_prof.set_preference('network.proxy.type', 1) | |||
ff_prof.set_preference("network.proxy.socks_version", 5) | |||
ff_prof.set_preference('network.proxy.socks', '127.0.0.1') | |||
ff_prof.set_preference('network.proxy.socks_port', 9150) | |||
ff_prof.set_preference('network.proxy.socks_remote_dns', True) | |||
ff_prof.set_preference("javascript.enabled", False) | |||
ff_prof.update_preferences() | |||
service = Service(config.get('TOR', 'geckodriver_path')) | |||
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) | |||
driver.maximize_window() | |||
return driver | |||
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' | |||
#return: return the selenium driver or string 'down' | |||
def getAccess(): | |||
url = getFixedURL() | |||
driver = createFFDriver() | |||
try: | |||
driver.get(url) | |||
return driver | |||
except: | |||
driver.close() | |||
return 'down' | |||
# Waits for a specific element to load so we know the landing page has finished rendering
# TorMarket does not require a login, so this helper is not called by startCrawling()
#@param: current selenium web driver
def login(driver): | |||
# wait for page to show up (This Xpath may need to change based on different seed url) | |||
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( | |||
(By.XPATH, "/html/body/div[2]/div/div/div/main/article/div/section[4]/div/div[1]/div/div/div/div/ul/li[15]/ul/li[3]/a"))) | |||
# Saves the crawled html page, makes the directory path for html pages if not made | |||
def savePage(driver, page, url): | |||
cleanPage = cleanHTML(driver, page) | |||
filePath = getFullPathName(url) | |||
os.makedirs(os.path.dirname(filePath), exist_ok=True) | |||
open(filePath, 'wb').write(cleanPage.encode('utf-8')) | |||
return | |||
# Gets the full path of the page to be saved along with its appropriate file name | |||
#@param: raw url as crawler crawls through every site | |||
def getFullPathName(url): | |||
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE | |||
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") | |||
fileName = getNameFromURL(url) | |||
if isDescriptionLink(url): | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') | |||
else: | |||
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') | |||
return fullPath | |||
# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned | |||
#@param: raw url as crawler crawls through every site | |||
def getNameFromURL(url): | |||
global counter | |||
name = ''.join(e for e in url if e.isalnum()) | |||
if (name == ''): | |||
name = str(counter) | |||
counter = counter + 1 | |||
return name | |||
# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
#in this example, the crawled product categories are Guides & Tutorials, Malware, and Services
def getInterestedLinks(): | |||
links = [] | |||
# Tutorials | |||
links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/guides-tutorials/') | |||
# Malware | |||
links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/malware/') | |||
# Services | |||
links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/') | |||
return links | |||
# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through | |||
#topic and description pages are crawled through here, where both types of pages are saved | |||
#@param: selenium driver | |||
def crawlForum(driver): | |||
print("Crawling the TorMarket market") | |||
linksToCrawl = getInterestedLinks() | |||
i = 0 | |||
while i < len(linksToCrawl): | |||
link = linksToCrawl[i] | |||
print('Crawling :', link) | |||
try: | |||
has_next_page = True | |||
count = 0 | |||
while has_next_page: | |||
try: | |||
driver.get(link) | |||
except: | |||
driver.refresh() | |||
html = driver.page_source | |||
savePage(driver, html, link) | |||
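# Collect the description links found on this listing page and save each linked product page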
productList = productPages(html)
for item in productList:
itemURL = urlparse.urljoin(baseURL, str(item)) | |||
try: | |||
driver.get(itemURL) | |||
except: | |||
driver.refresh() | |||
savePage(driver, driver.page_source, item) | |||
driver.back() | |||
# # comment out | |||
# break | |||
# | |||
# # comment out | |||
# if count == 1: | |||
# break | |||
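# Follow the 'NEXT' pagination link; when it is missing or its href is empty, stop paginating this category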
try: | |||
link = driver.find_element(by=By.LINK_TEXT, value='NEXT').get_attribute('href') | |||
if link == "": | |||
raise NoSuchElementException | |||
count += 1 | |||
except NoSuchElementException: | |||
has_next_page = False | |||
except Exception as e: | |||
print(link, e) | |||
i += 1 | |||
print("Crawling the TorMarket market done.") | |||
# Returns 'True' if the link is a description link | |||
#@param: url of any url crawled | |||
#return: true if is a description page, false if not | |||
def isDescriptionLink(url): | |||
if 'shop' in url: | |||
return True | |||
return False | |||
# Returns True if the link is a listingPage link | |||
#@param: url of any url crawled | |||
#return: true if is a Listing page, false if not | |||
def isListingLink(url): | |||
if 'product-category' in url: | |||
return True | |||
return False | |||
# Calls the links parser to collect the description links; the html is the page source of a link from the interested link list
#@param: html page source of a listing page from getInterestedLinks()
#return: list of description links that should be crawled through
def productPages(html): | |||
soup = BeautifulSoup(html, "html.parser") | |||
return tormarket_links_parser(soup) | |||
# Drop links that "signout" | |||
# def isSignOut(url): | |||
# #absURL = urlparse.urljoin(url.base_url, url.url) | |||
# if 'signout' in url.lower() or 'logout' in url.lower(): | |||
# return True | |||
# | |||
# return False | |||
def crawler(): | |||
startCrawling() | |||
# print("Crawling and Parsing BestCardingWorld .... DONE!") |
@ -1,189 +0,0 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup | |||
import re | |||
# Parses description pages: takes the html of a description page as a soup object and extracts the info it needs
# The scraped info is stored in individual fields, which are returned organized as one 'row'
#@param: soup object of the html page of a description page
#return: 'row' that contains the fields scraped from the description page
def tormarket_description_parser(soup): | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
image = "-1" # 19 Product_Image | |||
vendor_image = "-1" # 20 Vendor_Image | |||
#finding the name of the product | |||
name_of_product = soup.find("h1", {"class": "product_title entry-title"}).find("a").text | |||
name = cleanString(name_of_product.strip()) | |||
#finding the description of the product | |||
description_of_product = soup.find("div", {"class": "woocommerce-product-details__short-description"}).text | |||
describe = cleanString(description_of_product.strip()) | |||
#finding the name of the vendor | |||
name_of_vendor = soup.find("div", {"class": "wcfmmp_sold_by_store"}) | |||
if name_of_vendor is not None: | |||
name_of_vendor = name_of_vendor.find("a").text | |||
vendor = cleanString(name_of_vendor.strip()) | |||
else: | |||
vendor = "TorMarket" | |||
#finding the price of the item | |||
price = soup.find("p", {"class": "price"}).find("bdi").text | |||
price_cleaned = price[1:] | |||
USD = price_cleaned.strip() | |||
category = soup.find('span', {"class": "posted_in"}).text | |||
category = category.split(':')[-1] | |||
category = category.replace(',', '/') | |||
category = cleanString(category.strip()) | |||
#everything else gets a -1 because they are not found | |||
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) | |||
# Sending the results | |||
return row | |||
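# A minimal usage sketch (not part of the crawler pipeline): it shows how the description parser above can be
# driven from a saved HTML page. The file name 'sample_description.html' is a hypothetical placeholder.
def example_parse_description(filePath='sample_description.html'):
    with open(filePath, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    row = tormarket_description_parser(soup)
    print(row[3], row[0], row[13])  # product name, vendor, USD price
    return row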
# Parses listing pages: takes the html of a listing page as a soup object and extracts the info it needs
# The scraped info is stored in per-field lists, which are returned after being organized
#@param: soup object of the html page of a listing page
#return: 'row' that contains a variety of lists, each holding info on the listing page
def tormarket_listing_parser(soup): | |||
# Fields to be parsed | |||
nm = 0 # *Total_Products (Should be Integer) | |||
mktName = "TorMarket" # 0 *Marketplace_Name | |||
vendor = [] # 1 *Vendor y | |||
rating_vendor = [] # 2 Vendor_Rating | |||
success = [] # 3 Vendor_Successful_Transactions | |||
name = [] # 4 *Product_Name y | |||
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this | |||
MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this | |||
category = [] # 7 Product_Category y | |||
describe = [] # 8 Product_Description | |||
views = [] # 9 Product_Number_Of_Views | |||
reviews = [] # 10 Product_Number_Of_Reviews | |||
rating_item = [] # 11 Product_Rating | |||
addDate = [] # 12 Product_AddDate | |||
BTC = [] # 13 Product_BTC_SellingPrice | |||
USD = [] # 14 Product_USD_SellingPrice y | |||
EURO = [] # 15 Product_EURO_SellingPrice | |||
sold = [] # 16 Product_QuantitySold | |||
qLeft = [] # 17 Product_QuantityLeft | |||
shipFrom = [] # 18 Product_ShippedFrom | |||
shipTo = [] # 19 Product_ShippedTo | |||
image = [] # 20 Product_Image | |||
image_vendor = [] # 21 Vendor_Image | |||
href = [] # 22 Product_Links | |||
products_list = soup.find('ul', {"class": "products columns-3 tablet-columns-2 mobile-columns-1"}).find_all('li') | |||
nm = len(products_list) | |||
for product in products_list: | |||
# Finding the name of the product | |||
name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text | |||
name_of_product_cleaned = cleanString(name_of_product.strip()) | |||
# print(name_of_product_cleaned) | |||
name.append(name_of_product_cleaned) | |||
#finding the URL | |||
try: | |||
url = product.find("div", {"class": "product-loop-content text-center"}).find("a").get("href") | |||
# print(url) | |||
href.append(url) | |||
except AttributeError as e: | |||
print("I can't find the link") | |||
raise e | |||
#finding the rating of the product | |||
rating_score_of_product = product.find("div", {"class": "product-loop-content text-center"}).find("div").find("span").text | |||
rating_item.append(cleanString(rating_score_of_product.strip())) | |||
# print("done") | |||
#finding the rating of the vendors | |||
rating_score_of_vendor = product.find("div", {"class": "wcfmmp-store-rating"}) | |||
if rating_score_of_vendor is not None: | |||
rating_score_of_vendor = rating_score_of_vendor.find("strong").text | |||
rating_vendor.append(cleanString(rating_score_of_vendor.strip())) | |||
else: | |||
rating_vendor.append('-1') | |||
# print("done") | |||
#finding the cost in USD | |||
cost = product.find("span", {"class": "woocommerce-Price-amount amount"}).text | |||
USD.append(cost) | |||
# print("done") | |||
#finding the name of the vendor | |||
vendor_name = product.find("div", {"class": "wcfmmp_sold_by_wrapper"}) | |||
if vendor_name is not None: | |||
vendor_name = vendor_name.find("a").text | |||
vendor.append(cleanString(vendor_name.strip())) | |||
else: | |||
vendor.append(mktName) | |||
# print("done") | |||
#everything else appends a -1 | |||
success.append("-1") | |||
CVE.append("-1") | |||
MS.append("-1") | |||
category.append("-1") | |||
describe.append("-1") | |||
views.append("-1") | |||
reviews.append("-1") | |||
addDate.append("-1") | |||
BTC.append("-1") | |||
EURO.append("-1") | |||
sold.append("-1") | |||
qLeft.append("-1") | |||
shipFrom.append("-1") | |||
shipTo.append("-1") | |||
# print("Done! moving onto the next product!") | |||
# print(len(shipTo)) | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, | |||
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) | |||
# Called by the crawler to get the description links on a listing page
#@param: beautifulsoup object of the listing page html
#return: list of description links from a listing page
def tormarket_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
listing = soup.findAll('div', {"class": "product-loop-content text-center"}) | |||
for a in listing: | |||
bae = a.find('h2', {"class": "woocommerce-loop-product__title"}).find('a', href=True) | |||
link = bae['href'] | |||
href.append(link) | |||
return href |
@ -0,0 +1,248 @@ | |||
__author__ = 'DarkWeb' | |||
# Here, we are importing the auxiliary functions to clean or convert data | |||
from MarketPlaces.Utilities.utilities import * | |||
# Here, we are importing BeautifulSoup to search through the HTML tree | |||
from bs4 import BeautifulSoup
import re
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) | |||
def wethenorth_description_parser(soup): | |||
# Fields to be parsed | |||
vendor = "-1" # 0 *Vendor_Name | |||
success = "-1" # 1 Vendor_Successful_Transactions | |||
rating_vendor = "-1" # 2 Vendor_Rating | |||
name = "-1" # 3 *Product_Name | |||
describe = "-1" # 4 Product_Description | |||
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) | |||
category = "-1" # 7 Product_Category | |||
views = "-1" # 8 Product_Number_Of_Views | |||
reviews = "-1" # 9 Product_Number_Of_Reviews | |||
rating_item = "-1" # 10 Product_Rating | |||
addDate = "-1" # 11 Product_AddedDate | |||
BTC = "-1" # 12 Product_BTC_SellingPrice | |||
USD = "-1" # 13 Product_USD_SellingPrice | |||
EURO = "-1" # 14 Product_EURO_SellingPrice | |||
sold = "-1" # 15 Product_QuantitySold | |||
left = "-1" # 16 Product_QuantityLeft | |||
shipFrom = "-1" # 17 Product_ShippedFrom | |||
shipTo = "-1" # 18 Product_ShippedTo | |||
# Finding Product Name | |||
listDes = soup.find('div', {'class': "listDes"}) | |||
name = listDes.find('h2').text | |||
name = name.replace('\n', ' ') | |||
name = name.replace(",", "") | |||
name = name.strip() | |||
# Finding Vendor | |||
vendor = listDes.find('b').text | |||
vendor = vendor.replace(",", "") | |||
vendor = vendor.replace("...", "") | |||
vendor = vendor.replace("-", "") | |||
vendor = vendor.strip() | |||
# Finding Vendor Rating | |||
# rating = listDes.find('span',{'class':'levelSet'}) | |||
# rating = rating.text | |||
# rating = rating.replace('\n', ' ') | |||
# rating = rating.replace(",", "") | |||
# rating = rating.strip() | |||
# Finding Successful Transactions | |||
success = listDes.find_all('p')[1] | |||
success = success.find('span').text | |||
success = success.split() | |||
success = success[0].strip() | |||
# Finding Prices - all prices on We The North are listed in CAD; the 'CAD' label is left in the resulting string so the currency stays explicit
padp = listDes.find('p',{'class':'padp'}) | |||
USD = padp.find('span').text | |||
USD = USD.strip() | |||
# Finding Escrow - there is no escrow field on the WTN market; the table body below is used for shipping information
shipping_info = listDes.find('tbody') | |||
if "Digital" not in shipping_info: | |||
shipping_info = shipping_info.find_all('tr') | |||
row1 = shipping_info[0].find_all('td') | |||
# Finding Shipment Information (Origin) | |||
shipFrom = row1[-1].text | |||
shipFrom=shipFrom.strip() | |||
if shipFrom=="": | |||
shipFrom="-1" | |||
row2 = shipping_info[1].find_all('td') | |||
# Finding Shipment Information (Destination) | |||
shipTo = row2[-1].text | |||
shipTo= shipTo.strip() | |||
if shipTo == "": | |||
shipTo = "-1" | |||
# Finding the Product description | |||
describe = soup.find("div",{'class':'tabcontent'}) | |||
describe = describe.find('p').text | |||
describe = describe.replace("\n", " ") | |||
describe = describe.replace("\r", " ") | |||
describe = describe.strip() | |||
''' | |||
# Finding the Number of Product Reviews | |||
tag = soup.findAll(text=re.compile('Reviews')) | |||
for index in tag: | |||
reviews = index | |||
par = reviews.find('(') | |||
if par >=0: | |||
reviews = reviews.replace("Reviews (","") | |||
reviews = reviews.replace(")","") | |||
reviews = reviews.split(",") | |||
review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) | |||
else : | |||
review = "-1" | |||
''' | |||
# Searching for CVE and MS categories | |||
# no CVE or MS for WTN market | |||
# Populating the final variable (this should be a list with all fields scraped) | |||
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, | |||
BTC, USD, EURO, sold, left, shipFrom, shipTo) | |||
# Sending the results | |||
return row | |||
# This is the method to parse the Listing Pages | |||
def wethenorth_listing_parser(soup): | |||
# Fields to be parsed | |||
nm = 0 # *Total_Products (Should be Integer) | |||
mktName = "WeTheNorth" # 0 *Marketplace_Name | |||
vendor = [] # 1 *Vendor y | |||
rating_vendor = [] # 2 Vendor_Rating | |||
success = [] # 3 Vendor_Successful_Transactions | |||
name = [] # 4 *Product_Name y | |||
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) | |||
MS = [] # 6 Product_MS_Classification (Microsoft Security) | |||
category = [] # 7 Product_Category y | |||
describe = [] # 8 Product_Description | |||
views = [] # 9 Product_Number_Of_Views | |||
reviews = [] # 10 Product_Number_Of_Reviews | |||
rating_item = [] # 11 Product_Rating | |||
addDate = [] # 12 Product_AddDate | |||
BTC = [] # 13 Product_BTC_SellingPrice | |||
USD = [] # 14 Product_USD_SellingPrice y | |||
EURO = [] # 15 Product_EURO_SellingPrice | |||
sold = [] # 16 Product_QuantitySold | |||
qLeft =[] # 17 Product_QuantityLeft | |||
shipFrom = [] # 18 Product_ShippedFrom | |||
shipTo = [] # 19 Product_ShippedTo | |||
href = [] # 20 Product_Links | |||
right_content = soup.find('div', {"class": "right-content"}) | |||
listing = right_content.findAll('div', {"class": "col-1search"}) | |||
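# Cut out the first three products of each page; as in the links parser below, these are usually unrelated (shown in blue)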
listing = listing[3:] | |||
# Populating the Number of Products | |||
nm = len(listing) | |||
for a in listing: | |||
bae = a.findAll('a', href=True) | |||
# Adding the url to the list of urls | |||
link = bae[0].get('href') | |||
link = cleanLink(link) | |||
href.append(link) | |||
# Finding the Vendor | |||
vendor_name = a.find('p', {'class': 'padp'}) | |||
vendor_name = vendor_name.find('a').text | |||
vendor_name = vendor_name.replace(",", "") | |||
vendor_name = vendor_name.strip() | |||
vendor.append(vendor_name) | |||
# Finding the Product | |||
product = bae[0].text | |||
product = product.replace('\n', ' ') | |||
product = product.replace(",", "") | |||
product = product.strip() | |||
name.append(product) | |||
# Finding the Category | |||
category_name = a.find('p', {'class': 'padp'}).text | |||
first_dash = category_name.find('-')
# search for the second dash in the original string so the slice indices line up
second_dash = category_name.find('-', first_dash + 1)
category_name = category_name[first_dash + 1:second_dash]
category_name = category_name.strip()
category.append(category_name) | |||
# Finding Views | |||
view_count = a.text | |||
view_count = view_count[view_count.find('Views:'): view_count.find('Sales:')] | |||
view_count = view_count.replace('Views:', ' ') | |||
view_count = view_count.replace('/', ' ') | |||
view_count = view_count.strip() | |||
views.append(view_count) | |||
# Finding success sales | |||
sold_count = a.text | |||
sold_count = sold_count[sold_count.find('Sales:'): sold_count.find('Short')] | |||
sold_count = sold_count.replace('Sales:', ' ') | |||
sold_count = sold_count.replace('/', ' ') | |||
sold_count = sold_count.strip() | |||
success.append(sold_count) | |||
# Searching for CVE and MS categories | |||
# no CVE or MS in WTN market | |||
cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if not cve: | |||
cveValue="-1" | |||
else: | |||
cee = " " | |||
for idx in cve: | |||
cee += (idx) | |||
cee += " " | |||
cee = cee.replace(',', ' ') | |||
cee = cee.replace('\n', '') | |||
cveValue=cee | |||
CVE.append(cveValue) | |||
ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if not ms: | |||
MSValue="-1" | |||
else: | |||
me = " " | |||
for im in ms: | |||
me += (im) | |||
me += " " | |||
me = me.replace(',', ' ') | |||
me = me.replace('\n', '') | |||
MSValue=me | |||
MS.append(MSValue) | |||
# Populate the final variable (this should be a list with all fields scraped) | |||
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, | |||
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) | |||
def wethenorth_links_parser(soup): | |||
# Returning all links that should be visited by the Crawler | |||
href = [] | |||
right_content = soup.find('div',{"class": "right-content"}) | |||
listing = right_content.findAll('div', {"class": "col-1search"}) | |||
# Cut out the irrelevant products shown in blue; the first three products of each page are usually unrelated
listing = listing[3:] | |||
for a in listing: | |||
link = a.find('a') | |||
link = link['href'] | |||
href.append(link) | |||
return href |
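# A minimal usage sketch (illustrative only): parse a saved We The North listing page with the two parsers above.
# The file name 'sample_listing.html' is a hypothetical placeholder.
def example_parse_wethenorth_listing(filePath='sample_listing.html'):
    with open(filePath, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    links = wethenorth_links_parser(soup)   # description links the crawler would visit next
    rows = wethenorth_listing_parser(soup)  # organized per-product fields for the same page
    print(len(links), 'description links found')
    return rows, links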