diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml index 5a5ac36..8489f64 100644 --- a/.idea/DW_Pipeline_Test.iml +++ b/.idea/DW_Pipeline_Test.iml @@ -5,33 +5,4 @@ - - - \ No newline at end of file diff --git a/Forums/AbyssForum/parser.py b/Forums/AbyssForum/parser.py deleted file mode 100644 index 635c494..0000000 --- a/Forums/AbyssForum/parser.py +++ /dev/null @@ -1,166 +0,0 @@ -__author__ = 'Helium' - -# Here, we are importing the auxiliary functions to clean or convert data -from Forums.Utilities.utilities import * -from datetime import date -from datetime import timedelta -import re - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup - -# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) - -def abyssForums_description_parser(soup): - - # Fields to be parsed - - topic = "-1" # 0 topic name - user = [] # 1 all users of each post - addDate = [] # 2 all dated of each post - feedback = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format) - status = [] # 4 all user's authority in each post such as (adm, member, dangerous) - reputation = [] # 5 all users's karma in each post (usually found as a number) - sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post) - post = [] # 7 all messages of each post - interest = [] # 8 all user's interest in each post - image_user = [] # 9 all user avatars of each post - image_post = [] # 10 all first images of each post - - # Finding the topic (should be just one coming from the Listing Page) - - li = soup.find("div", {"class": "page-body"}).find("h2", {"class": "topic-title"}) - topic = li.text.replace(",","") - topic = topic.replace("\n","") - topic = cleanString(topic.strip()) - - regex = re.compile('post has-profile.*') - posts = soup.find_all('div', {"class": regex}) - # print(len(posts)) - - # For each message (post), get all the fields we are interested to: - - for ipost in posts: - - # Finding the author (user) of the post - author = ipost.find('a', {"class": "username"}).text - user.append(cleanString(author)) # Remember to clean the problematic characters - - status.append("-1") - reputation.append("-1") - interest.append("-1") - sign.append("-1") - feedback.append("-1") - image_post.append("-1") - - img = ipost.find('dl', {"class": "postprofile"}).find('img') - if img is not None: - img = img.get('src').split('base64,')[-1] - else: - img = "-1" - image_user.append(img) - - image_user.append("-1") - - date_time_obj = ipost.find('time').attrs - date = date_time_obj['datetime'][0:10] - time = date_time_obj['datetime'][11:19] - date_time_obj = datetime.strptime(date + " " + time, '%Y-%m-%d %H:%M:%S') - addDate.append(date_time_obj) - - # Finding the post - - inner = ipost.find('div', {"class": "content"}) - inner = inner.text.strip() - post.append(cleanString(inner)) - - # Populate the final variable (this should be a list with all fields scraped) - - row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) - - # Sending the results - - return row - -# This is the method to parse the Listing Pages (one page with many posts) -def abyssForums_listing_parser(soup: BeautifulSoup): - - - nm = 0 # this variable should receive the number of topics - forum = "AbyssForum" # 0 *forum name - board = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree. 
- # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) - author = [] # 2 all authors of each topic - topic = [] # 3 all topics - views = [] # 4 number of views of each topic - posts = [] # 5 number of posts of each topic - href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between - addDate = [] # when the topic was created (difficult to find) - image_author = [] # 8 all author avatars used in each topic - - # Listing and Description pages) - #finding the board - - board = soup.find("h2", {"class": "forum-title"}).text - board = cleanString(board.strip()) - - type_of_posts = soup.find_all("li", {"class": re.compile("row bg\d")} ) - for literature in type_of_posts: - title_of_post = literature.find("a", {"class": "topictitle"}).text - title_of_post = cleanString(title_of_post) - topic.append(title_of_post) - user = literature.find("div", {"class": "topic-poster responsive-hide left-box"}).find("a", {"class": "username"}).text - author.append(user) - num_post = literature.find("dd", {"class": "posts"}).text.replace("Replies","").strip() - posts.append(num_post) - num_view = literature.find("dd", {"class": "views"}).text.replace("Views","").strip() - views.append(num_view) - #if int(num_post) != 0: join the last user who posted with the author? - # reply = literature.find("dd", {"class": "lastpost"}).find("a", {"class": "username"}).text - # user.append(reply) - - date_time_obj = literature.find('time').attrs - date = date_time_obj['datetime'][0:10] - time = date_time_obj['datetime'][11:19] - date_added = datetime.strptime(date + " " + time, '%Y-%m-%d %H:%M:%S') - - addDate.append(date_added) - - listing_href = literature.find("a", {"class": "topictitle"}).get("href") - href.append(listing_href) - - image_author.append("-1") - - nm = len(topic) - - return organizeTopics( - forum=forum, - nm=nm, - board=board, - author=author, - topic=topic, - views=views, - posts=posts, - href=href, - addDate=addDate, - image_author=image_author - ) - - - - -def abyssForum_links_parser(soup): - - # Returning all links that should be visited by the Crawler - - href = [] - #print(soup.find('table', {"class": "tborder clear"}).find( - # 'tbody').find_all('tr', {"class": "inline_row"})) - listing = soup.find_all('dl', {"class": "row-item topic_read"}) - - for a in listing: - link = a.find('div', {"class": "list-inner"}).find('a').get('href') - - href.append(link) - - return href \ No newline at end of file diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py deleted file mode 100644 index 4dfa963..0000000 --- a/Forums/Altenens/crawler_selenium.py +++ /dev/null @@ -1,298 +0,0 @@ -__author__ = 'Helium' - -''' -Altenens Forum Crawler (Selenium) -''' - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary -from selenium.webdriver.firefox.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.ui import WebDriverWait -from PIL import Image - -import urllib.parse as urlparse -import os, re, time -from datetime import date -import configparser -import subprocess -from bs4 import BeautifulSoup -from Forums.Initialization.prepare_parser import new_parse -from Forums.Altenens.parser import altenens_links_parser -from 
Forums.Utilities.utilities import cleanHTML - -counter = 1 -baseURL = 'https://altenens.is/' - - -# Opens Tor Browser, crawls the website -def startCrawling(): - forumName = getForumName() - driver = getAccess() - - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closeDriver(driver) - - new_parse(forumName, baseURL, True) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - #click login button - login_link = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]').get_attribute('href') - driver.get(login_link) # open tab with url - - #entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[1]/dd/input') - #Username here - usernameBox.send_keys('mylittlepony45')#sends string to the username box - passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[2]/dd/div/div/input') - #Password here - passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox - - input("Press ENTER when CAPTCHA is completed\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - # wait for 50 sec until id = tab_content is found, then cont - WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]'))) - - -# Returns the name of the website -def getForumName(): - name = 'Altenens' - return name - - -# Return the link of the website -def getFixedURL(): - url = 'https://altenens.is/' - return url - - -# Closes Tor Browser -def closeDriver(driver): - # global pid - # os.system("taskkill /pid " + str(pro.pid)) - # os.system("taskkill /t /f /im tor.exe") - print('Closing Tor...') - driver.close() #close tab - time.sleep(3) - return - - -# Creates FireFox 'driver' and configure its 'Profile' -# to use Tor proxy and socket -def createFFDriver(): - from Forums.Initialization.forums_mining import config - - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - - ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - # ff_prof.set_preference("network.dns.disablePrefetch", True) - # ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 3) - ff_prof.set_preference("browser.download.folderList", 2) - ff_prof.set_preference("browser.download.manager.showWhenStarting", False) - ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") - ff_prof.set_preference('network.proxy.type', 1) - ff_prof.set_preference("network.proxy.socks_version", 5) - ff_prof.set_preference('network.proxy.socks', '127.0.0.1') - ff_prof.set_preference('network.proxy.socks_port', 9150) - ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", True) - ff_prof.update_preferences() - - 
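# The preferences above route all browser traffic through Tor's local SOCKS5 proxy
# (127.0.0.1:9150, the Tor Browser default) and resolve DNS through the proxy so
# .onion hosts work, while keeping the profile stateless (no history, session-only
# cookies) and limiting image loading to save bandwidth.
#
# Minimal illustrative sketch only, not part of this repository's code: the same
# Tor-proxy setup expressed with Selenium 4's Options API. The two path arguments
# are assumed stand-ins for the 'geckodriver_path' and 'firefox_binary_path'
# config values read above.
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

def create_tor_firefox(geckodriver_path, firefox_binary_path):
    options = Options()
    options.binary_location = firefox_binary_path
    # Manual proxy pointed at the Tor SOCKS port, with remote DNS enabled
    options.set_preference('network.proxy.type', 1)
    options.set_preference('network.proxy.socks', '127.0.0.1')
    options.set_preference('network.proxy.socks_port', 9150)
    options.set_preference('network.proxy.socks_remote_dns', True)
    # Restrict image loading, as in the profile configured above
    options.set_preference('permissions.default.image', 3)
    return webdriver.Firefox(service=Service(geckodriver_path), options=options)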
service = Service(config.get('TOR', 'geckodriver_path')) - - driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - - driver.maximize_window() - - return driver - - -def getAccess(): - url = getFixedURL() - driver = createFFDriver() - try: - driver.get(url)# open url in browser - return driver - except: - driver.close()# close tab - return 'down' - - -# Saves the crawled html page -def savePage(driver, html, url): - cleanPage = cleanHTML(driver, html) - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - open(filePath, 'wb').write(cleanPage.encode('utf-8')) - return - - -# Gets the full path of the page to be saved along with its appropriate file name -def getFullPathName(url): - from Forums.Initialization.forums_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") - fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') - return fullPath - - -# Creates the file name from passed URL -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if (name == ''): - name = str(counter) - counter = counter + 1 - return name - - -def getInterestedLinks(): - links = [] - - # Hacking - links.append('https://altenens.is/forums/hacking.469162/') - # Hacking showoff - links.append('https://altenens.is/forums/hacking-showoff.469232/') - # Remote administration - links.append('https://altenens.is/forums/remote-administration.469161/') - # Cracking tools - links.append('https://altenens.is/forums/cracking-tools.469204/') - # Cracking tutorials - links.append('https://altenens.is/forums/cracking-tutorials-other-methods.469205/') - # Combo lists and configs - links.append('https://altenens.is/forums/combolists-and-configs.469206/') - # Programming - links.append('https://altenens.is/forums/programming.469239/') - - return links - - -# newest version of crawling -def crawlForum(driver): - print("Crawling the Altenens forum") - - linksToCrawl = getInterestedLinks() - - i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) - try: - has_next_page = True - count = 0 - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(driver, html, link) - - topics = topicPages(html) - for topic in topics: - has_next_topic_page = True - counter = 1 - page = topic - - while has_next_topic_page: - itemURL = urlparse.urljoin(baseURL, str(page)) - try: - driver.get(itemURL) - except: - driver.refresh() - - if isListingLink(driver.current_url): - break - - savePage(driver, driver.page_source, topic + f"page{counter}") # very important - - # # comment out - # if counter == 2: - # break - - try: - page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') - if page == "": - raise NoSuchElementException - counter += 1 - - except NoSuchElementException: - has_next_topic_page = False - - try: - driver.get(link) - except: - driver.refresh() - - # # comment out - # break - # - # # comment out - # if count == 1: - # break - - try: - link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') - if link == "": - raise NoSuchElementException - count += 1 - - except NoSuchElementException: - has_next_page = False - - 
except Exception as e: - print(link, e) - i += 1 - - print("Crawling the Altenens forum done.") - - -# Returns 'True' if the link is Topic link, may need to change for every website -def isDescriptionLink(url): - if 'threads' in url: - return True - return False - - -# Returns True if the link is a listingPage link, may need to change for every website -def isListingLink(url): - if '.is/forums' in url: - return True - return False - - -# calling the parser to define the links -def topicPages(html): - soup = BeautifulSoup(html, "html.parser") - #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) - return altenens_links_parser(soup) - - -def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... DONE!") diff --git a/Forums/Altenens/parser.py b/Forums/Altenens/parser.py deleted file mode 100644 index e056cb2..0000000 --- a/Forums/Altenens/parser.py +++ /dev/null @@ -1,165 +0,0 @@ -__author__ = 'DarkWeb' - -# Here, we are importing the auxiliary functions to clean or convert data -from Forums.Utilities.utilities import * -from datetime import date -from datetime import timedelta -import re - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup - - -# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) -def altenens_description_parser(soup): - - topic = "-1" # 0 *topic name - user = [] # 1 *all users of each post - status = [] # 2 all user's authority in each post such as (adm, member, dangerous) - reputation = [] # 3 all user's karma in each post (usually found as a number) - interest = [] # 4 all user's interest in each post - sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) - post = [] # 6 all messages of each post - feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) - addDate = [] # 8 all dates of each post - image_user = [] # 9 all user avatars of each post - image_post = [] # 10 all first images of each post - - etopic = soup.find("h1", {"class": "p-title-value"}) - if etopic is not None: - topic = etopic.text - topic = cleanString(topic.strip()) - - body = soup.find('div', {"class": "block-container lbContainer"}) - iposts = body.find_all('article', {"class": "message message--post js-post js-inlineModContainer"}) - - for ipost in iposts: - - author = ipost.find('h4', {"class": "message-name"}).text - user.append(cleanString(author.strip())) - - stat = ipost.find('h5', {"class": "userTitle message-userTitle"}).text - status.append(cleanString(stat.strip())) - - bar = ipost.find('div', {"class": "xtr-progress-bar"}) - if bar is not None: - rep = bar.find('p').get('data-value') - else: - rep = "-1" - reputation.append(cleanString(rep)) - - interest.append("-1") - - signature = ipost.find('aside', {"class": "message-signature"}) - if signature is not None: - signature = signature.text.strip() - else: - signature = "-1" - sign.append(cleanString(signature)) - - inner = ipost.find('div', {"class": "bbWrapper"}).find(text=True, recursive=False) - if inner is not None: - inner = inner.strip() - else: - inner = "" # cannot use -1 because the post is hidden unless you reply - post.append(cleanString(inner)) - - feedback.append("-1") - - dt = ipost.find('time', {"class": "u-dt"}).get('datetime') - date_time_obj = datetime.strptime(dt, 
'%Y-%m-%dT%H:%M:%S%z') - addDate.append(date_time_obj) - - img = ipost.find('div', {"class": "message-avatar-wrapper"}).find('img') - if img is not None: - img = img.get('src').split('base64,')[-1] - else: - img = "-1" - image_user.append(img) - - image_post.append("-1") - - # Populate the final variable (this should be a list with all fields scraped) - - row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) - - # Sending the results - - return row - - -# This is the method to parse the Listing Pages (one page with many posts) -def altenens_listing_parser(soup): - - nm = 0 # *this variable should receive the number of topics - forum = "Altenens" # 0 *forum name - board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. - # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) - author = [] # 2 *all authors of each topic - topic = [] # 3 *all topics - views = [] # 4 number of views of each topic - posts = [] # 5 number of posts of each topic - href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between - # Listing and Description pages) - addDate = [] # 7 when the topic was created (difficult to find) - image_author = [] # 8 all author avatars used in each topic - - board = soup.find('h1', {"class": "p-title-value"}).text - board = cleanString(board.strip()) - - regex = re.compile('structItem structItem--thread.*') - itopics = soup.find_all('div', {"class": regex}) - - nm = len(itopics) - - for itopic in itopics: - - topics = itopic.find('div', {"class": "structItem-title"}).text - topic.append(cleanString(topics.strip())) - - author_icon = itopic.find('a', {"class": "avatar avatar--s"}) - if author_icon != None: - author_icon = author_icon.find('img') - author_icon = author_icon.get('src') - author_icon = author_icon.split('base64,')[-1] - else: - author_icon = "-1" - image_author.append(author_icon) - - link = itopic.find('div', {"class": "structItem-title"}).find('a').get('href') - href.append(link) - - user = itopic.find('ul', {"class": "structItem-parts"}).find('a').text - author.append(cleanString(user.strip())) - - dt = itopic.find('time', {"class": "u-dt"}).get('datetime') - date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z') - addDate.append(date_time_obj) - - nposts = itopic.find('dl', {"class": "pairs pairs--justified"}).text - nposts = nposts.replace('Replies', '') - nposts = nposts.replace('K', '000') - posts.append(cleanString(nposts)) - - nviews = itopic.find('dl', {"class": "pairs pairs--justified structItem-minor"}).text - nviews = nviews.replace('Views', '') - nviews = nviews.replace('K', '000') - views.append(cleanString(nviews)) - - return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author) - - -def altenens_links_parser(soup): - - # Returning all links that should be visited by the Crawler - - href = [] - - listing = soup.find_all('div', {"class": "structItem-cell structItem-cell--main"}) - - for a in listing: - link = a.find('a', {"class": ""}).get('href') - - href.append(link) - - return href \ No newline at end of file diff --git a/Forums/Cardingleaks/crawler_selenium.py b/Forums/Cardingleaks/crawler_selenium.py deleted file mode 100644 index 0712956..0000000 --- a/Forums/Cardingleaks/crawler_selenium.py +++ /dev/null @@ -1,303 +0,0 @@ -__author__ = 'DarkWeb' - -''' -Cardingleaks Forum Crawler (Selenium) -Crawler updated and fixed - -The site has this 
thing sometime where you'll have to look at a new post everyday. makes sure -you login first before crawling. -''' - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary -from selenium.webdriver.firefox.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.ui import WebDriverWait - -from PIL import Image -import urllib.parse as urlparse -import os, re, time -import subprocess -from bs4 import BeautifulSoup -from Forums.Initialization.prepare_parser import new_parse -from Forums.Cardingleaks.parser import cardingleaks_links_parser -from Forums.Utilities.utilities import cleanHTML - -counter = 1 -baseURL = 'https://leaks.ws/' - - -# Opens Tor Browser, crawls the website -def startCrawling(): - forumName = getForumName() - driver = getAccess() - - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closeDriver(driver) - - new_parse(forumName, baseURL, True) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - #click login button - login_link = driver.find_element( - by=By.XPATH, value='/html/body/div[2]/div[1]/nav/div/div[3]/div[1]/a[1]').\ - get_attribute('href') - driver.get(login_link)# open tab with url - - #entering username and password into input boxes - usernameBox = driver.find_element(by=By.NAME, value='login') - #Username here - usernameBox.send_keys('somanyfrogs')#sends string to the username box - passwordBox = driver.find_element(by=By.NAME, value='password') - #Password here - passwordBox.send_keys('therearewaytoomanyherehowwhy')# sends string to passwordBox - - login = driver.find_element(by=By.CLASS_NAME, value='block-container') - login_link = login.find_element(by=By.TAG_NAME, value='button') - login_link.click() - - # input('input') - - # wait for listing page show up (This Xpath may need to change based on different seed url) - # wait for 50 sec until id = tab_content is found, then cont - WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.CLASS_NAME, 'p-body-pageContent'))) - - -# Returns the name of the website -def getForumName() -> str: - name = 'Cardingleaks' - return name - - -# Return the link of the website -def getFixedURL(): - url = 'https://leaks.ws/' - return url - - -# Closes Tor Browser -def closeDriver(driver): - # global pid - # os.system("taskkill /pid " + str(pro.pid)) - # os.system("taskkill /t /f /im tor.exe") - print('Closing Tor...') - driver.close() #close tab - time.sleep(3) - return - - -# Creates FireFox 'driver' and configure its 'Profile' -# to use Tor proxy and socket -def createFFDriver(): - from Forums.Initialization.forums_mining import config - - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - - ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - 
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 3) - ff_prof.set_preference("browser.download.folderList", 2) - ff_prof.set_preference("browser.download.manager.showWhenStarting", False) - ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") - ff_prof.set_preference('network.proxy.type', 1) - ff_prof.set_preference("network.proxy.socks_version", 5) - ff_prof.set_preference('network.proxy.socks', '127.0.0.1') - ff_prof.set_preference('network.proxy.socks_port', 9150) - ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", True) - ff_prof.update_preferences() - - service = Service(config.get('TOR', 'geckodriver_path')) - - driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - - driver.maximize_window() - - return driver - - -def getAccess(): - url = getFixedURL() - driver = createFFDriver() - try: - driver.get(url) - return driver - except: - driver.close() - return 'down' - - -# Saves the crawled html page -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - open(filePath, 'wb').write(cleanPage.encode('utf-8')) - return - - -# Gets the full path of the page to be saved along with its appropriate file name -def getFullPathName(url): - from Forums.Initialization.forums_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") - fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') - return fullPath - - -# Creates the file name from passed URL -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if name == '': - name = str(counter) - counter = counter + 1 - return name - - -def getInterestedLinks(): - links = [] - - # carding methods - links.append('https://leaks.ws/forums/carding-methods.82/') - # # carding schools - # links.append('https://leaks.ws/forums/help-desk-carding-school.35/') - # # carding discussion - # links.append('https://leaks.ws/forums/carding-discussion-desk.58/') - # # carding tutorials - # links.append('https://leaks.ws/forums/carding-tutorials.13/') - # # carding tools and software - # links.append('https://leaks.ws/forums/carding-tools-softwares.10/') - # # exploits and cracking tools - # links.append('https://leaks.ws/forums/exploits-cracking-tools.22/') - - return links - - -def crawlForum(driver): - print("Crawling the Cardingleaks forum") - - linksToCrawl = getInterestedLinks() - - i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) - try: - has_next_page = True - count = 0 - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(driver, html, link) - - topics = topicPages(html) - for topic in topics: - has_next_topic_page = True - counter = 1 - page = topic - - while has_next_topic_page: - itemURL = urlparse.urljoin(baseURL, str(page)) - try: - driver.get(itemURL) - except: - driver.refresh() - - if isListingLink(driver.current_url): - 
break - - savePage(driver, driver.page_source, topic + f"page{counter}") # very important - - # # comment out - # if counter == 2: - # break - - try: - page = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') - if page == "": - raise NoSuchElementException - counter += 1 - - except NoSuchElementException: - has_next_topic_page = False - - # making sure we go back to the listing page (browser back button simulation) - try: - driver.get(link) - except: - driver.refresh() - - # # comment out - # break - # - # # comment out - # if count == 1: - # break - - try: - link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') - if link == "": - raise NoSuchElementException - count += 1 - - except NoSuchElementException: - has_next_page = False - - except Exception as e: - print(link, e) - i += 1 - - print("Crawling the Cardingleaks forum done.") - - -# Returns 'True' if the link is Topic link, may need to change for every website -def isDescriptionLink(url): - if 'threads' in url: - return True - return False - - -# Returns True if the link is a listingPage link, may need to change for every website -def isListingLink(url): - if '.ws/forums' in url: - return True - return False - - -# calling the parser to define the links -def topicPages(html): - soup = BeautifulSoup(html, "html.parser") - return cardingleaks_links_parser(soup) - - -def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... DONE!") diff --git a/Forums/Cardingleaks/parser.py b/Forums/Cardingleaks/parser.py deleted file mode 100644 index a2da87b..0000000 --- a/Forums/Cardingleaks/parser.py +++ /dev/null @@ -1,167 +0,0 @@ -__author__ = 'DarkWeb' - -# Here, we are importing the auxiliary functions to clean or convert data -from Forums.Utilities.utilities import * -from datetime import date -from datetime import timedelta -import re - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup, ResultSet, Tag - -# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) - - -def cardingleaks_description_parser(soup: Tag): - - # Fields to be parsed - - topic = "-1" # 0 *topic name - user = [] # 1 *all users of each post - status = [] # 2 all user's authority in each post such as (adm, member, dangerous) - reputation = [] # 3 all user's karma in each post (usually found as a number) - interest = [] # 4 all user's interest in each post - sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) - post = [] # 6 all messages of each post - feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) - addDate = [] # 8 all dates of each post - image_user = [] # 9 all user avatars of each post - image_post = [] # 10 all first images of each post - - li = soup.find("h1", {"class": "p-title-value"}) - topic = cleanString(li.text.strip()) - - post_list: ResultSet[Tag] = soup.find("div", {"class": "block-body js-replyNewMessageContainer"}).find_all("article", {"data-author": True}) - - for ipost in post_list: - username = ipost.get('data-author') - user.append(username) - - user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text - status.append(cleanString(user_status.strip())) - - user_statistics: ResultSet[Tag] = ipost.find("div", {"class": "message-userExtras"}).find_all("dl", {"class": "pairs pairs--justified"}) - - user_reputation = "-1" - - for stat in user_statistics: - 
data_type = stat.find("span").get("data-original-title") - if data_type == "Points": - user_reputation = stat.find("dd").text - break - - reputation.append(cleanString(user_reputation.strip())) - - interest.append("-1") - - sign.append("-1") - - user_post = ipost.find("div", {"class": "message-content js-messageContent"}).text - post.append(cleanString(user_post.strip())) - - feedback.append("-1") - - datetime_text = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime") - datetime_obj = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S%z") - addDate.append(datetime_obj) - - img = ipost.find('div', {"class": "message-content js-messageContent"}).find('img') - if img is not None: - img = img.get('src').split('base64,')[-1] - else: - img = "-1" - image_post.append(img) - - img = ipost.find('div', {"class": "message-avatar"}).find('img') - if img is not None: - img = img.get('src').split('base64,')[-1] - else: - img = "-1" - image_user.append(img) - - # Populate the final variable (this should be a list with all fields scraped) - - row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) - - # Sending the results - - return row - -# This is the method to parse the Listing Pages (one page with many posts) - -def cardingleaks_listing_parser(soup: Tag): - - nm = 0 # *this variable should receive the number of topics - forum = "Cardingleaks" # 0 *forum name - board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. - # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) - author = [] # 2 *all authors of each topic - topic = [] # 3 *all topics - views = [] # 4 number of views of each topic - posts = [] # 5 number of posts of each topic - href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between - # Listing and Description pages) - addDate = [] # 7 when the topic was created (difficult to find) - image_user = [] # 8 all user avatars used in each topic - - # Finding the board (should be just one) - - li = soup.find("h1", {"class": "p-title-value"}) - board = cleanString(li.text.strip()) - - thread_list = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True}) - - sticky = soup.find('div', {"class": "structItemContainer-group structItemContainer-group--sticky"}) - if sticky is not None: - thread_list = sticky.find_all("div", {"data-author": True}) + thread_list - - nm = len(thread_list) - - for thread in thread_list: - thread_author = thread.get("data-author") - author.append(thread_author) - - thread_topic = thread.find("div", {"class": "structItem-title"}).text - topic.append(cleanString(thread_topic.strip())) - - author_icon = thread.find("a", {"class": "avatar avatar--s"}) - if author_icon is not None: - author_icon = author_icon.find('img') - if author_icon is not None: - author_icon = author_icon.get('src').split('base64,')[-1] - image_user.append(author_icon) - else: - image_user.append('-1') - else: - image_user.append('-1') - - thread_view = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find("dd").text - # Context text view count (i.e., 8.8K) to numerical (i.e., 8800) - if thread_view.find("K") > 0: - thread_view = str(int(float(thread_view.replace("K", "")) * 1000)) - views.append(thread_view) - - thread_posts = thread.find("dl", {"class": "pairs pairs--justified"}).find("dd").text - 
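# Unlike the view count above, the reply count is stored as-is (no 'K'-suffix expansion); it is only stripped and cleaned.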
posts.append(cleanString(thread_posts.strip())) - - thread_href = thread.find("div", {"class": "structItem-title"}).find("a").get("href") - href.append(thread_href) - - thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime") - datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z") - addDate.append(datetime_obj) - - return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_user) - - -def cardingleaks_links_parser(soup): - # Returning all links that should be visited by the Crawler - href = [] - listing = soup.find_all('div', {"class": "structItem-title"}) - - for a in listing: - link = a.find('a').get('href') - - href.append(link) - - return [href[-1]] diff --git a/Forums/CryptBB/crawler_mechanize.py b/Forums/CryptBB/crawler_mechanize.py deleted file mode 100644 index 7a763c6..0000000 --- a/Forums/CryptBB/crawler_mechanize.py +++ /dev/null @@ -1,257 +0,0 @@ -__author__ = '91Shadows' - -''' -CryptBB Crawler (Mechanize) -''' - -import codecs, os, re -import socks, socket, time -from datetime import date - -import urllib.parse as urlparse -import http.client as httplib -import mechanize -import subprocess -from bs4 import BeautifulSoup -from Forums.Initialization.prepare_parser import new_parse -from Forums.BestCardingWorld.parser import bestcardingworld_links_parser - -counter = 1 -httplib.HTTPConnection._http_vsn = 10 -httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0' -baseURL = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5' -socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150) - - -# Opens Tor Browser, crawls the website -def startCrawling(): - opentor() - getUrl() - forumName = getForumName() - br = getAccess() - - if br != 'down': - crawlForum(br) - new_parse(forumName, False) - - # new_parse(forumName, False) - - closetor() - - -# Opens Tor Browser -def opentor(): - global pid - print("Connecting Tor...") - path = open('../../path.txt').readline() - pro = subprocess.Popen(path) - pid = pro.pid - time.sleep(7.5) - input("Tor Connected. 
Press ENTER to continue\n") - return - - -# Creates a connection through Tor Port -def getUrl(timeout=None): - socket.socket = socks.socksocket - socket.create_connection = create_connection - return - - -# Makes the onion address request -def create_connection(address, timeout=None, source_address=None): - sock = socks.socksocket() - sock.connect(address) - return sock - - -# Returns the name of website -def getForumName(): - name = 'CryptBB' - return name - - -# Return the link of website -def getFixedURL(): - url = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5' - - return url - - -# Closes Tor Browser -def closetor(): - global pid - os.system("taskkill /pid " + str(pid)) - print('Closing Tor...') - time.sleep(3) - return - - -# Creates a Mechanize browser and initializes its options -def createBrowser(): - br = mechanize.Browser() - cj = mechanize.CookieJar() - br.set_cookiejar(cj) - - # Browser options - br.set_handle_equiv(True) - br.set_handle_redirect(True) - br.set_handle_referer(True) - br.set_handle_robots(False) - br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1) - br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'), - ('Accept', '*/*')] - - return br - - -def getAccess(): - url = getFixedURL() - br = createBrowser() - - try: - - br.open(url) - return br - - except: - - return 'down' - - -# Saves the crawled html page -def savePage(page, url): - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - a = page.read() - open(filePath, "wb").write(a) - return - - -# Gets the full path of the page to be saved along with its appropriate file name -def getFullPathName(url): - fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/ThiefWorld/HTML_Pages/' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + '/' + 'Description/' + fileName + '.html' - else: - fullPath = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/ThiefWorld/HTML_Pages/' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + '/' + 'Listing/' + fileName + '.html' - return fullPath - - -# Creates the name of the file based on URL -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if (name == ''): - name = str(counter) - counter = counter + 1 - return name - - -# Hacking and Markets related topics -def getInterestedLinks(): - links = [] - - links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=43&sid=e12864ffccc5df877b03b573534955be') - - return links - - -# Start crawling Forum pages -def crawlForum(br): - print("Crawling CryptBB forum") - - linksToCrawl = getInterestedLinks() - visited = set(linksToCrawl) - initialTime = time.time() - - - i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) - try: - page = br.open(link) - savePage(page, link) - - res = br.response().read() - soup = BeautifulSoup(res, 'html.parser') - - next_link = soup.find("a", {"rel": "next"}) - if next_link != None: - full_url = urlparse.urljoin(linksToCrawl[i], next_link['href']) - linksToCrawl.insert(i + 1, full_url) - - listOfTopics = findDescriptionPages(link) - for topic in listOfTopics: - 
itemPage = br.open(str(topic)) - savePage(itemPage, topic) - - except Exception as e: - print('Error getting link: ', link, e) - i += 1 - - # finalTime = time.time() - # print finalTime - initialTime - - input("CryptBB forum done sucessfully. Press ENTER to continue\n") - - return - - -# Returns True if the link is 'Topic' Links, may need to change for diff websites -def isDescriptionLink(url): - if 'topic' in url: - return True - return False - - -# Returns True if the link is a listingPage link, may need to change for diff websites -def isListingLink(url): - ''' - reg = 'board=[0-9]+.[0-9]+\Z' - if len(re.findall(reg, url)) == 0: - return False - return True - ''' - if 'forum' in url: - return True - return False - - -# calling the parser to define the links -def findDescriptionPages(url): - soup = "" - - error = False - try: - html = codecs.open( - "C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\BestCardingWorld\\HTML_Pages\\" + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + "\\Listing\\" + getNameFromURL(url) + ".html", encoding='utf8') - soup = BeautifulSoup(html, "html.parser") - except: - try: - html = open( - "C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\BestCardingWorld\\HTML_Pages\\" + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + "\\Listing\\" + getNameFromURL(url) + ".html") - soup = BeautifulSoup(html, "html.parser") - except: - error = True - print("There was a problem to read the file " + getNameFromURL(url) + " in the listing section.") - - if not error: - return bestcardingworld_links_parser(soup) - - else: - return [] - - -def crawler(): - startCrawling() - print("Crawling and Parsing CryptBB .... 
DONE!") diff --git a/Forums/CryptBB/crawler_selenium.py b/Forums/CryptBB/crawler_selenium.py deleted file mode 100644 index e48b193..0000000 --- a/Forums/CryptBB/crawler_selenium.py +++ /dev/null @@ -1,331 +0,0 @@ -__author__ = 'DarkWeb' - -''' -CryptBB Forum Crawler (Selenium) -''' - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary -from selenium.webdriver.firefox.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.ui import WebDriverWait - -from PIL import Image -import urllib.parse as urlparse -import os, re, time -import subprocess -from bs4 import BeautifulSoup -from Forums.Initialization.prepare_parser import new_parse -from Forums.CryptBB.parser import cryptBB_links_parser -from Forums.Utilities.utilities import cleanHTML - -counter = 1 -baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/' - - -# Opens Tor Browser, crawls the website -def startCrawling(): - forumName = getForumName() - driver = getAccess() - - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closeDriver(driver) - - new_parse(forumName, baseURL, True) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - #click login button - login_link = driver.find_element( - by=By.XPATH, value='/html/body/div/div[2]/div/table/tbody/tr[2]/td/center/pre/strong/a[1]').\ - get_attribute('href') - driver.get(login_link)# open tab with url - - #entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input') - #Username here - usernameBox.send_keys('holyre')#sends string to the username box - passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input') - #Password here - passwordBox.send_keys('PlatinumBorn2')# sends string to passwordBox - - ''' - # wait for captcha page show up - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div[2]/div/form/div/input"))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='//*[@id="captcha_img"]').screenshot(r'..\CryptBB\captcha.png') - - # This method will show image in any image viewer - im = Image.open(r'..\CryptBB\captcha.png') - - im.show() - - # wait until input space show up - inputBox = driver.find_element(by=By.XPATH, value='//*[@id="imagestring"]') - - - # ask user input captcha solution in terminal - userIn = input("Enter solution: ") - - # send user solution into the input space - inputBox.send_keys(userIn) - - # click the verify(submit) button - driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div/form/div/input").click() - ''' - input("Press ENTER when CAPTCHA is completed\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - # wait for 50 sec until id = tab_content is found, then cont - WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '//*[@id="tab_content"]'))) - - -# Returns the name of the website -def getForumName() -> str: - name = 'CryptBB' - return name - - -# Return the link of the website -def getFixedURL(): - url = 
'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/' - return url - - -# Closes Tor Browser -def closeDriver(driver): - # global pid - # os.system("taskkill /pid " + str(pro.pid)) - # os.system("taskkill /t /f /im tor.exe") - print('Closing Tor...') - driver.close() #close tab - time.sleep(3) - return - - -# Creates FireFox 'driver' and configure its 'Profile' -# to use Tor proxy and socket -def createFFDriver(): - from Forums.Initialization.forums_mining import config - - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - - ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 3) - ff_prof.set_preference("browser.download.folderList", 2) - ff_prof.set_preference("browser.download.manager.showWhenStarting", False) - ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") - ff_prof.set_preference('network.proxy.type', 1) - ff_prof.set_preference("network.proxy.socks_version", 5) - ff_prof.set_preference('network.proxy.socks', '127.0.0.1') - ff_prof.set_preference('network.proxy.socks_port', 9150) - ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", True) - ff_prof.update_preferences() - - service = Service(config.get('TOR', 'geckodriver_path')) - - driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - - driver.maximize_window() - - return driver - - -def getAccess(): - url = getFixedURL() - driver = createFFDriver() - try: - driver.get(url) - return driver - except: - driver.close() - return 'down' - - -# Saves the crawled html page -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - open(filePath, 'wb').write(cleanPage.encode('utf-8')) - return - - -# Gets the full path of the page to be saved along with its appropriate file name -def getFullPathName(url): - from Forums.Initialization.forums_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") - fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') - return fullPath - - -# Creates the file name from passed URL -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if name == '': - name = str(counter) - counter = counter + 1 - return name - - -def getInterestedLinks(): - links = [] - - # Beginner Programming - links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86') - # Beginner Carding and Fraud - 
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=91') - # Beginner Hacking - links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=87') - # Newbie - links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=84') - # Beginner Hardware - links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=89') - # Training Challenges - links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=96') - # Darknet Discussions - links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=88') - # Public Leaks and Warez - links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=97') - # Sell - links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=44') - - return links - - -def crawlForum(driver): - print("Crawling the CryptBB forum") - - linksToCrawl = getInterestedLinks() - - i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) - try: - has_next_page = True - count = 0 - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(driver, html, link) - - topics = topicPages(html) - for topic in topics: - has_next_topic_page = True - counter = 1 - page = topic - - while has_next_topic_page: - itemURL = urlparse.urljoin(baseURL, str(page)) - try: - driver.get(itemURL) - except: - driver.refresh() - - if isListingLink(driver.current_url): - break - - savePage(driver, driver.page_source, topic + f"page{counter}") # very important - - # # comment out - # if counter == 2: - # break - - try: - temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') - page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') - - if page == "": - raise NoSuchElementException - counter += 1 - - except NoSuchElementException: - has_next_topic_page = False - - # making sure we go back to the listing page (browser back button simulation) - try: - driver.get(link) - except: - driver.refresh() - - # # comment out - # break - # - # # comment out - # if count == 1: - # break - - try: - temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div') - link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') - - if link == "": - raise NoSuchElementException - count += 1 - - except NoSuchElementException: - has_next_page = False - - except Exception as e: - print(link, e) - i += 1 - - print("Crawling the CrypttBB forum done.") - - -# Returns 'True' if the link is Topic link, may need to change for every website -def isDescriptionLink(url): - if 'thread' in url: - return True - return False - - -# Returns True if the link is a listingPage link, may need to change for every website -def isListingLink(url): - if '.onion/forumdisplay' in url: - return True - return False - - -# calling the parser to define the links -def topicPages(html): - soup = BeautifulSoup(html, "html.parser") - return cryptBB_links_parser(soup) - - -def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... 
DONE!") diff --git a/Forums/CryptBB/parser.py b/Forums/CryptBB/parser.py deleted file mode 100644 index 1ac7bc6..0000000 --- a/Forums/CryptBB/parser.py +++ /dev/null @@ -1,282 +0,0 @@ -__author__ = 'DarkWeb' - -# Here, we are importing the auxiliary functions to clean or convert data -from Forums.Utilities.utilities import * -from datetime import date -from datetime import timedelta -import re - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup - -# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) - - -def cryptBB_description_parser(soup): - - # Fields to be parsed - - topic = "-1" # 0 *topic name - user = [] # 1 *all users of each post - status = [] # 2 all user's authority in each post such as (adm, member, dangerous) - reputation = [] # 3 all user's karma in each post (usually found as a number) - interest = [] # 4 all user's interest in each post - sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) - post = [] # 6 all messages of each post - feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) - addDate = [] # 8 all dates of each post - image_user = [] # 9 all user avatars of each post - image_post = [] # 10 all first images of each post - - # Finding the topic (should be just one coming from the Listing Page) - - li = soup.find("td", {"class": "thead"}).find('strong') - topic = li.text - topic = re.sub("\[\w*\]", '', topic) - - topic = topic.replace(",","") - topic = topic.replace("\n","") - topic = cleanString(topic.strip()) - - # Finding the repeated tag that corresponds to the listing of posts - - posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all( - 'div', {"class": "post"}) - - # For each message (post), get all the fields we are interested to: - - for ipost in posts: - - if ipost.find('div', {"class": "deleted_post_author"}): - continue - - # Finding a first level of the HTML page - - post_wrapper = ipost.find('span', {"class": "largetext"}) - - # Finding the author (user) of the post - - author = post_wrapper.text.strip() - user.append(cleanString(author)) # Remember to clean the problematic characters - - # Finding the status of the author - - smalltext = ipost.find('div', {"class": "post_author"}) - - if smalltext is not None: - - # CryptBB does have membergroup and postgroup - membergroup = smalltext.find('div', {"class": "profile-rank"}) - postgroup = smalltext.find('div', {"class": "postgroup"}) - if membergroup != None: - membergroup = membergroup.text.strip() - if postgroup != None: - postgroup = postgroup.text.strip() - membergroup = membergroup + " - " + postgroup - else: - if postgroup != None: - membergroup = postgroup.text.strip() - else: - membergroup = "-1" - status.append(cleanString(membergroup)) - - # Finding the interest of the author - # CryptBB does not have blurb - blurb = smalltext.find('li', {"class": "blurb"}) - if blurb != None: - blurb = blurb.text.strip() - else: - blurb = "-1" - interest.append(cleanString(blurb)) - - # Finding the reputation of the user - # CryptBB does have reputation - author_stats = smalltext.find('div', {"class": "author_statistics"}) - karma = author_stats.find('strong') - if karma != None: - karma = karma.text - karma = karma.replace("Community Rating: ", "") - karma = karma.replace("Karma: ", "") - karma = karma.strip() - else: - karma = "-1" - 
reputation.append(cleanString(karma)) - - else: - status.append('-1') - interest.append('-1') - reputation.append('-1') - - # Getting here another good tag to find the post date, post content and users' signature - - postarea = ipost.find('div', {"class": "post_content"}) - - dt = postarea.find('span', {"class": "post_date"}).text - # dt = dt.strip().split() - dt = dt.strip() - day=date.today() - if "Today" in dt: - today = day.strftime('%m-%d-%Y') - stime = dt.replace('Today,','').strip() - date_time_obj = today + ', '+stime - date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p') - elif "Yesterday" in dt: - yesterday = day - timedelta(days=1) - yesterday = yesterday.strftime('%m-%d-%Y') - stime = dt.replace('Yesterday,','').strip() - date_time_obj = yesterday + ', '+stime - date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p') - elif "ago" in dt: - date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title'] - date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p') - else: - date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p') - addDate.append(date_time_obj) - - # Finding the post - - inner = postarea.find('div', {"class": "post_body scaleimages"}) - quote = inner.find('blockquote') - if quote is not None: - quote.decompose() - inner = inner.text.strip() - post.append(cleanString(inner)) - - # Finding the user's signature - - # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"}) - signature = ipost.find('div', {"class": "signature scaleimages"}) - if signature != None: - signature = signature.text.strip() - # print(signature) - else: - signature = "-1" - sign.append(cleanString(signature)) - - # As no information about user's feedback was found, just assign "-1" to the variable - - feedback.append("-1") - - img = ipost.find('div', {"class": "post_body scaleimages"}).find('img') - if img is not None: - img = img.get('src').split('base64,')[-1] - else: - img = "-1" - image_post.append(img) - - avatar = ipost.find('div', {"class": "author_avatar"}) - if avatar is not None: - img = avatar.find('img') - if img is not None: - img = img.get('src').split('base64,')[-1] - else: - img = "-1" - else: - img = "-1" - image_user.append(img) - - # Populate the final variable (this should be a list with all fields scraped) - - row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) - - # Sending the results - - return row - -# This is the method to parse the Listing Pages (one page with many posts) - -def cryptBB_listing_parser(soup): - - nm = 0 # *this variable should receive the number of topics - forum = "CryptBB" # 0 *forum name - board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. - # For instance: Security/Malware/Tools to hack Facebook. 
The board here should be Malware) - author = [] # 2 *all authors of each topic - topic = [] # 3 *all topics - views = [] # 4 number of views of each topic - posts = [] # 5 number of posts of each topic - href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between - # Listing and Description pages) - addDate = [] # 7 when the topic was created (difficult to find) - image_author = [] # 8 all author avatars used in each topic - - - # Finding the board (should be just one) - - board = soup.find('span', {"class": "active"}).text - board = cleanString(board.strip()) - - # Finding the repeated tag that corresponds to the listing of topics - - itopics = soup.find_all('tr',{"class": "inline_row"}) - - # Counting how many topics - - nm = len(itopics) - - for itopic in itopics: - - # For each topic found, the structure to get the rest of the information can be of two types. Testing all of them - # to don't miss any topic - - # Adding the topic to the topic list - try: - topics = itopic.find('span', {"class": "subject_old"}).find('a').text - except: - topics = itopic.find('span', {"class": "subject_new"}).find('a').text - topics = re.sub("\[\w*\]", '', topics) - topic.append(cleanString(topics)) - - image_author.append(-1) - - # Adding the url to the list of urls - try: - link = itopic.find('span', {"class": "subject_old"}).find('a').get('href') - except: - link = itopic.find('span',{"class": "subject_new"}).find('a').get('href') - href.append(link) - - # Finding the author of the topic - ps = itopic.find('div', {"class":"author smalltext"}).text - user = ps.strip() - author.append(cleanString(user)) - - # Finding the number of replies - columns = itopic.findChildren('td',recursive=False) - replies = columns[3].text - if replies == '-': - posts.append('-1') - else: - posts.append(cleanString(replies)) - - # Finding the number of Views - tview = columns[4].text - if tview == '-': - views.append('-1') - else: - views.append(cleanString(tview)) - - # If no information about when the topic was added, just assign "-1" to the variable - - addDate.append("-1") - - return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author) - - -def cryptBB_links_parser(soup): - - # Returning all links that should be visited by the Crawler - - href = [] - - listing = soup.find('table', {"class": "tborder clear"}).find('tbody').find_all('tr', {"class": "inline_row"}) - - for a in listing: - try: - link = a.find('span', {"class": "subject_old"}).find('a').get('href') - except: - link = a.find('span', {"class": "subject_new"}).find('a').get('href') - - href.append(link) - - return href diff --git a/Forums/HiddenAnswers/crawler_selenium.py b/Forums/DWForums/crawler_selenium.py similarity index 67% rename from Forums/HiddenAnswers/crawler_selenium.py rename to Forums/DWForums/crawler_selenium.py index f972861..d1e1a21 100644 --- a/Forums/HiddenAnswers/crawler_selenium.py +++ b/Forums/DWForums/crawler_selenium.py @@ -1,7 +1,7 @@ -__author__ = 'Helium' +__author__ = 'DarkWeb' ''' -HiddenAnswers Crawler (Selenium) +DWForums Forum Crawler (Selenium) ''' from selenium import webdriver @@ -12,26 +12,24 @@ from selenium.webdriver.firefox.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait -from PIL import Image import urllib.parse as urlparse -import os, re, time +import os, time from datetime import date -import configparser 
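The CryptBB listing and links parsers above have to cope with the board marking threads two different ways (presumably read versus unread): each row carries its link either in a "subject_old" or a "subject_new" span, hence the try/except fallbacks. A compact, self-contained sketch of that fallback, using a made-up two-row table rather than markup captured from the site:

    from bs4 import BeautifulSoup

    snippet = """
    <table class="tborder clear"><tbody>
      <tr class="inline_row"><td><span class="subject_old"><a href="thread-1.html">Read topic</a></span></td></tr>
      <tr class="inline_row"><td><span class="subject_new"><a href="thread-2.html">Unread topic</a></span></td></tr>
    </tbody></table>
    """

    def demo_links_parser(soup):
        # Collect every topic URL from the listing rows, preferring "subject_old"
        # and falling back to "subject_new" when the first span is absent.
        href = []
        for row in soup.find("table", {"class": "tborder clear"}).find_all("tr", {"class": "inline_row"}):
            span = row.find("span", {"class": "subject_old"}) or row.find("span", {"class": "subject_new"})
            href.append(span.find("a").get("href"))
        return href

    print(demo_links_parser(BeautifulSoup(snippet, "html.parser")))  # ['thread-1.html', 'thread-2.html']
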
import subprocess from bs4 import BeautifulSoup from Forums.Initialization.prepare_parser import new_parse -from Forums.HiddenAnswers.parser import hiddenanswers_links_parser +from Forums.DWForums.parser import dwForums_links_parser from Forums.Utilities.utilities import cleanHTML counter = 1 -baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/' +baseURL = 'http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/' # Opens Tor Browser, crawls the website def startCrawling(): forumName = getForumName() - driver: webdriver.Firefox = getAccess() + driver = getAccess() if driver != 'down': try: @@ -41,25 +39,48 @@ def startCrawling(): print(driver.current_url, e) closeDriver(driver) - new_parse(forumName, baseURL, True) + new_parse(forumName, baseURL, False) # Login using premade account credentials and do login captcha manually def login(driver): + #click login button + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.CSS_SELECTOR, ".button--icon--user"))) + login_link = driver.find_element(by=By.CSS_SELECTOR, value=".button--icon--user") + login_link.click() + + #entering username and password into input boxes + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[4]/div/div[2]/div/form/div[1]"))) + container = driver.find_element(by=By.XPATH, value="/html/body/div[4]/div/div[2]/div/form/div[1]") + # print(container.get_attribute("outerHTML")) + boxes = container.find_elements(by=By.CLASS_NAME, value="input") + # print(len(boxes)) + + #Username here + boxes[0].send_keys('nice_reamer08') + #Password here + boxes[1].send_keys('tjpv$]Nc}XG@`%LM') + # no captcha on this site + + # click the verify(submit) button + driver.find_element(by=By.CSS_SELECTOR, value=".button--icon--login").click() + # wait for listing page show up (This Xpath may need to change based on different seed url) WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div[2]/div[2]/div/div[2]/div[4]/div/ul/li[14]/a'))) + (By.CSS_SELECTOR, '.p-staffBar-inner > div:nth-child(4) > div:nth-child(1) > a:nth-child(1)'))) # Returns the name of the website def getForumName(): - name = 'HiddenAnswers' + name = 'DWForums' return name # Return the link of the website def getFixedURL(): - url = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/' + url = 'http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/' return url @@ -106,11 +127,12 @@ def createFFDriver(): service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - + driver.maximize_window() return driver + def getAccess(): url = getFixedURL() driver = createFFDriver() @@ -157,26 +179,24 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # hacking - links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/hacking') - # darknet and tor - links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/darknet-and-tor') - # internet - links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/internet') - # links - links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/links') - # programming - links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/programming') - # knowledge and 
information - links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/knowledge-and-information') - # other - links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/other') + # Hacking + links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/hacking-forum.33/') + # # Beginner Carding and Fraud + # links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/remote-administration.34/') + # # Cracking Tools + # links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/cracking-tools.35/') + # # Cracking Tutorials and Other Methods - error here about file not exisitng + # links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/cracking-tutorials-other-methods.36/') + # # Combolists and Configs + # links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/combolists-and-configs.58/') + # # Paid Software and Antivirus + # links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/paid-softwares-and-antivirus.59/') return links -def crawlForum(driver: webdriver.Firefox): - print("Crawling the HiddenAnswers forum") +def crawlForum(driver): + print("Crawling the DWForums forum") linksToCrawl = getInterestedLinks() @@ -208,18 +228,14 @@ def crawlForum(driver: webdriver.Firefox): driver.get(itemURL) except: driver.refresh() - - if isListingLink(driver.current_url): - break + savePage(driver, driver.page_source, topic + f"page{counter}") - savePage(driver, driver.page_source, topic + f"page{counter}") # very important - - # # comment out - # if counter == 2: - # break + # comment out + if counter == 2: + break try: - page = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href') + page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') if page == "": raise NoSuchElementException counter += 1 @@ -227,21 +243,19 @@ def crawlForum(driver: webdriver.Firefox): except NoSuchElementException: has_next_topic_page = False - # making sure we go back to the listing page (browser back button simulation) - try: - driver.get(link) - except: - driver.refresh() + for i in range(counter): + driver.back() + + # comment out + break - # # comment out - # break - # - # # comment out - # if count == 1: - # break + # comment out + if count == 1: + break try: - link = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href') + temp = driver.find_element(by=By.LINK_TEXT, value="Next") + link = temp.get_attribute('href') if link == "": raise NoSuchElementException @@ -254,19 +268,19 @@ def crawlForum(driver: webdriver.Firefox): print(link, e) i += 1 - print("Crawling the HiddenAnswers forum done.") + input("Crawling DWForums forum done sucessfully. 
Press ENTER to continue\n") # Returns 'True' if the link is Topic link def isDescriptionLink(url): - if 'http' not in url: + if '/threads/' in url: return True return False # Returns True if the link is a listingPage link def isListingLink(url): - if 'http' in url: + if '/forums/' in url: return True return False @@ -275,9 +289,9 @@ def isListingLink(url): def topicPages(html): soup = BeautifulSoup(html, "html.parser") #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) - return hiddenanswers_links_parser(soup) + return dwForums_links_parser(soup) def crawler(): startCrawling() - # print("Crawling and Parsing Abyss .... DONE!") + # print("Crawling and Parsing BestCardingWorld .... DONE!") diff --git a/Forums/DWForums/parser.py b/Forums/DWForums/parser.py new file mode 100644 index 0000000..e3616e3 --- /dev/null +++ b/Forums/DWForums/parser.py @@ -0,0 +1,312 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +from Forums.Utilities.utilities import * +from datetime import date +from datetime import timedelta +import re + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + +# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) + +def dwForums_description_parser(soup): + + # Fields to be parsed + + topic = "-1" # 0 *topic name + user = [] # 1 *all users of each post + status = [] # 2 all user's authority in each post such as (adm, member, dangerous) + reputation = [] # 3 all user's karma in each post (usually found as a number) + interest = [] # 4 all user's interest in each post + sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) + post = [] # 6 all messages of each post + feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) + addDate = [] # 8 all dates of each post + + # Finding the topic (should be just one coming from the Listing Page) + + li = soup.find("h1", {"class": "p-title-value"}) + + topic = li.text + topic = topic.replace(u'\xa0', ' ') + topic = topic.replace(",","") + topic = topic.replace("\n","") + topic = cleanString(topic.strip()) + # print(topic) + # Finding the repeated tag that corresponds to the listing of posts + + # posts = soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg"}) + \ + # soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg2"}) + + posts = soup.find('div', {"class": "js-replyNewMessageContainer"}).find_all( + 'article', {"class": "js-post"}, recursive=False) + # print(len(posts)) + + # For each message (post), get all the fields we are interested to: + + for ipost in posts: + + # Finding a first level of the HTML page + + # post_wrapper = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "poster"}) + post_wrapper = ipost.find('h4', {"class": "message-name"}) + # Finding the author (user) of the post + + # author = post_wrapper.find('h4') + author = post_wrapper.text.strip() + # print("author " + author) + user.append(cleanString(author)) # Remember to clean the problematic characters + + # Finding the status of the author + + + + # Testing here two possibilities to find this status and combine them + # if ipost.find('h5', {"class": "deleted_post_author"}): + # status.append(-1) + # interest.append(-1) + # 
reputation.append(-1) + # addDate.append(-1) + # post.append("THIS POST HAS BEEN REMOVED!") + # sign.append(-1) + # feedback.append(-1) + # continue + + # CryptBB does have membergroup and postgroup + + membergroup = ipost.find('h5', {"class": "userTitle"}) + # DWForums doesnt have postgroups + postgroup = None + if membergroup != None: + membergroup = membergroup.text.strip() + if postgroup != None: + postgroup = postgroup.text.strip() + membergroup = membergroup + " - " + postgroup + else: + if postgroup != None: + membergroup = postgroup.text.strip() + else: + membergroup = "-1" + + status.append(cleanString(membergroup)) + # print("status " + cleanString(membergroup)) + # Finding the interest of the author + # DWForums does not have blurb + blurb = ipost.find('li', {"class": "blurb"}) + if blurb != None: + blurb = blurb.text.strip() + else: + blurb = "-1" + interest.append(cleanString(blurb)) + + # Finding the reputation of the user + # CryptBB does have reputation + author_stats = ipost.find('div', {"class": "message-userExtras"}) + if author_stats != None: + karma = author_stats.find_all('dl', {"class": "pairs"})[2] + else: + karma = None + if karma != None: + karma = karma.text + karma = karma.replace("Reaction score","") + karma = karma.replace(":", "") + karma = karma.strip() + else: + karma = "-1" + reputation.append(cleanString(karma)) + # print("karma " + cleanString(karma)) + # Getting here another good tag to find the post date, post content and users' signature + + postarea = ipost.find('div', {"class": "message-attribution-main"}) + + dt = postarea.find('time', {"class": "u-dt"})['datetime'] + # dt = dt.strip().split() + dt = dt.strip()[:16] + dt = dt.replace("T",", ") + day=date.today() + if "Yesterday" in dt: + yesterday = day - timedelta(days=1) + yesterday = yesterday.strftime('%m-%d-%Y') + stime = dt.replace('Yesterday,','').strip() + date_time_obj = yesterday+ ', '+stime + date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %H:%M') + elif "hours ago" in dt: + day = day.strftime('%m-%d-%Y') + date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title'] + date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %H:%M') + else: + date_time_obj = datetime.strptime(dt, '%Y-%m-%d, %H:%M') + stime = date_time_obj.strftime('%b %d, %Y') + sdate = date_time_obj.strftime('%I:%M %p') + + + addDate.append(date_time_obj) + # print("date " + str(date_time_obj)) + # Finding the date of the post + # date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p') + # smalltext = postarea.find('div', {"class": "flow_hidden"}).find('div', {"class": "keyinfo"})\ + # .find('div', {"class": "smalltext"}) + # sdatetime = smalltext.text + # sdatetime = sdatetime.replace(u"\xab","") # Removing unnecessary characters + # sdatetime = sdatetime.replace(u"\xbb","") # Removing unnecessary characters + # sdatetime = sdatetime.split("on: ") # Removing unnecessary characters + # sdatetime = sdatetime[1].strip() + # stime = sdatetime[:-12:-1] # Finding the time of the post + # stime = stime[::-1] + # sdate = sdatetime.replace(stime,"") # Finding the date of the post + # sdate = sdate.replace(",","") + # sdate = sdate.strip() + + # Covert the date of the post that can be informed as: "12 February 2016", "today", "yesterday". 
We need + # a date format here as "mm/dd/yyyy" + + # addDate.append(convertDate(sdate,"english", crawlerDate) + " " + stime) + + # Finding the post + + inner = ipost.find('article', {"class": "message-body"}) + inner = inner.text.strip() + # print(inner) + post.append(cleanString(inner)) + + # Finding the users's signature + + # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"}) + signature = ipost.find('aside', {"class": "message-signature"}) + if signature != None: + signature = signature.text.strip() + # print(signature) + else: + signature = "-1" + sign.append(cleanString(signature)) + + # As no information about users's feedback was found, just assign "-1" to the variable + + feedback.append("-1") + + # Populate the final variable (this should be a list with all fields scraped) + + row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) + + # Sending the results + + return row + +# This is the method to parse the Listing Pages (one page with many posts) + +def dwForums_listing_parser(soup): + + nm = 0 # *this variable should receive the number of topics + forum = "DWForums" # 0 *forum name + board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. + # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) + author = [] # 2 *all authors of each topic + topic = [] # 3 *all topics + views = [] # 4 number of views of each topic + posts = [] # 5 number of posts of each topic + href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between + # Listing and Description pages) + addDate = [] # 7 when the topic was created (difficult to find) + + # Finding the board (should be just one) + + board = soup.find('h1', {"class": "p-title-value"}).text + board = cleanString(board.strip()) + + # Finding the repeated tag that corresponds to the listing of topics + + regex = re.compile('.*structItem--thread.*') + itopics = soup.find_all("div", {"class": regex}) + + for itopic in itopics: + + # For each topic found, the structure to get the rest of the information can be of two types. 
Testing all of them + # to don't miss any topic + + # tds = itopic.findAll('td', {"class": "subject stickybg2"}) + # + # if len(tds) > 0: + # tag.append("strong") + # tag.append("subject stickybg2") + # tag.append("stats stickybg") + # else: + # tds = itopic.findAll('td', {"class": "subject windowbg2"}) + # if len(tds) > 0: + # tag.append("span") + # tag.append("subject windowbg2") + # tag.append("stats windowbg") + + # Adding the topic to the topic list + topics = itopic.find("div", {"class": "structItem-title"}).text + topics = topics.replace(",", "") + topics = topics.replace("\n", "") + topic.append(cleanString(topics.strip())) + + # Counting how many topics we have found so far + + nm = len(topic) + + # Adding the url to the list of urls + link = itopic.select_one('a[href^="/threads/"]') + link = link['href'] + link = cleanLink(link) + href.append(link) + + # Finding the author of the topic + minor = itopic.find('div', {"class": "structItem-minor"}) + ps = minor.find('li').text + user = ps.strip() + author.append(cleanString(user)) + + # Finding the number of replies + meta = itopic.find("div", {"class": "structItem-cell--meta"}) + meta = meta.find_all("dl") + post = meta[0].find("dd").text + post = post.replace("K", "000") + posts.append(cleanString(post)) + + # Finding the number of Views + tview = meta[1].find("dd").text + tview = tview.replace("K", "000") + views.append(cleanString(tview)) + + # If no information about when the topic was added, just assign "-1" to the variable + minor = itopic.find("div", {"class": "structItem-minor"}) + dt = minor.find('time')['datetime'] + dt = dt.strip()[:16] + dt = dt.replace("T", ", ") + day = date.today() + if "Yesterday" in dt: + yesterday = day - timedelta(days=1) + yesterday = yesterday.strftime('%m-%d-%Y') + stime = dt.replace('Yesterday,', '').strip() + date_time_obj = yesterday + ', ' + stime + date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %H:%M') + else: + date_time_obj = datetime.strptime(dt, '%Y-%m-%d, %H:%M') + stime = date_time_obj.strftime('%b %d, %Y') + sdate = date_time_obj.strftime('%I:%M %p') + addDate.append(date_time_obj) + + return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate) + + +def dwForums_links_parser(soup): + + # Returning all links that should be visited by the Crawler + + href = [] + #print(soup.find('table', {"class": "tborder clear"}).find( + # 'tbody').find_all('tr', {"class": "inline_row"})) + regex = re.compile('.*structItem--thread.*') + listing = soup.find_all("div", {"class": regex}) + + for a in listing: + link = a.select_one('a[href^="/threads/"]') + link = link['href'] + + href.append(link) + + return href \ No newline at end of file diff --git a/Forums/AbyssForum/crawler_selenium.py b/Forums/Dread/crawler_selenium.py similarity index 68% rename from Forums/AbyssForum/crawler_selenium.py rename to Forums/Dread/crawler_selenium.py index 27135f2..ce14732 100644 --- a/Forums/AbyssForum/crawler_selenium.py +++ b/Forums/Dread/crawler_selenium.py @@ -1,7 +1,7 @@ -__author__ = 'Helium' +__author__ = 'DarkWeb' ''' -AbyssForum Crawler (Selenium) +Dread Forum Crawler (Selenium) ''' from selenium import webdriver @@ -12,20 +12,18 @@ from selenium.webdriver.firefox.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait -from PIL import Image import urllib.parse as urlparse import os, re, time from datetime import date -import configparser 
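The dwForums parsers above pull post dates out of XenForo-style <time> elements by keeping only the minutes-resolution prefix of the datetime attribute and swapping the "T" separator before calling strptime. A self-contained sketch of that normalization, with an invented timestamp value for illustration:

    from datetime import datetime
    from bs4 import BeautifulSoup

    snippet = '<time class="u-dt" datetime="2023-04-05T13:37:21+0000">Apr 5, 2023</time>'
    tag = BeautifulSoup(snippet, "html.parser").find("time", {"class": "u-dt"})

    # Keep "YYYY-MM-DDTHH:MM" (the first 16 characters), replace the "T" separator,
    # and parse it, mirroring the handling in dwForums_description_parser.
    dt = tag["datetime"].strip()[:16].replace("T", ", ")
    date_time_obj = datetime.strptime(dt, "%Y-%m-%d, %H:%M")
    print(date_time_obj)  # 2023-04-05 13:37:00
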
import subprocess from bs4 import BeautifulSoup from Forums.Initialization.prepare_parser import new_parse -from Forums.AbyssForum.parser import abyssForum_links_parser +from Forums.Dread.parser import dread_links_parser from Forums.Utilities.utilities import cleanHTML counter = 1 -baseURL = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/' +baseURL = 'http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/' # Opens Tor Browser, crawls the website @@ -41,25 +39,45 @@ def startCrawling(): print(driver.current_url, e) closeDriver(driver) - new_parse(forumName, baseURL, True) + new_parse(forumName, baseURL, False) # Login using premade account credentials and do login captcha manually def login(driver): - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '//*[@id="sn-category-3"]'))) + ''' + # code for captcha, for now, it runs too slow so captcha expires + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.CSS_SELECTOR, ".image"))) + + inputBoxes = driver.find_elements(by=By.TAG_NAME, value='input') + for index, inputBox in enumerate(inputBoxes): + driver.find_element(by=By.CSS_SELECTOR, value='.image').screenshot(r'..\Dread\captcha.png') + im = Image.open(r'..\Dread\captcha.png') + + im.show() + userIn = input("Enter character: ") + inputBox.send_keys(userIn) + im.close() + if index != 5: + inputBoxes[index+1].click() + driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div/form/div/input").click() + ''' + input("Press ENTER when CAPTCHA is completed\n") + + #entering username and password into input boxes + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div/div[2]"))) # Returns the name of the website def getForumName(): - name = 'AbyssForum' + name = 'Dread' return name # Return the link of the website def getFixedURL(): - url = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/' + url = 'http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/' return url @@ -89,8 +107,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -101,16 +119,18 @@ def createFFDriver(): ff_prof.set_preference('network.proxy.socks_port', 9150) ff_prof.set_preference('network.proxy.socks_remote_dns', True) ff_prof.set_preference("javascript.enabled", True) + ff_prof.set_preference("xpinstall.signatures.required", False); ff_prof.update_preferences() service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - + driver.maximize_window() return driver + def getAccess(): url = getFixedURL() driver = createFFDriver() @@ -157,26 +177,24 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # Hacked Database - 
links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=26') - # Hire a Hacker - links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=27') - # Hacking Tools - links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=28') - # Carding Forums - links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=30') - # Social Media Hacking - links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=32') - # Hacking Tutorials - links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=12') - # Cracking Tutorials - links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=13') + # # OpSec + # links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/OpSec') + # Hacking 180 + links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/hacking') + # # Jobs4Crypto + # links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/Jobs4Crypto') + # # Hacktown + # links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/HackTown') + # # Malware + # links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/malware') + # # Programming + # links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/programming') return links def crawlForum(driver): - print("Crawling the AbyssForum forum") + print("Crawling the Dread forum") linksToCrawl = getInterestedLinks() @@ -208,20 +226,14 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() - - if isListingLink(driver.current_url): - break - - savePage(driver, driver.page_source, topic + f"page{counter}") # very important + savePage(driver, driver.page_source, topic + f"page{counter}") - # # comment out - # if counter == 2: - # break + # comment out + if counter == 2: + break try: - temp = driver.find_element(By.CLASS_NAME, 'pagination') - temp = temp.find_element(by=By.CLASS_NAME, value='next') - page = temp.find_element(by=By.CLASS_NAME, value='button').get_attribute('href') + page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') if page == "": raise NoSuchElementException counter += 1 @@ -229,23 +241,20 @@ def crawlForum(driver): except NoSuchElementException: has_next_topic_page = False - # making sure we go back to the listing page (browser back button simulation) - try: - driver.get(link) - except: - driver.refresh() + for i in range(counter): + driver.back() + + # comment out + break - # # comment out - # break - # - # # comment out - # if count == 1: - # break + # comment out + if count == 1: + break try: - temp = driver.find_element(By.CLASS_NAME, 'pagination') - temp = temp.find_element(by=By.CLASS_NAME, value='next') - link = temp.find_element(by=By.CLASS_NAME, value='button').get_attribute('href') + temp = driver.find_element(by=By.CLASS_NAME, value="pagination") + link = temp.find_element(by=By.CLASS_NAME, value="next").get_attribute('href') + if link == "": raise NoSuchElementException count += 1 @@ -257,19 +266,19 @@ def crawlForum(driver): print(link, e) i += 1 - print("Crawling the AbyssForum forum done.") + input("Crawling Dread forum done sucessfully. 
Press ENTER to continue\n") # Returns 'True' if the link is Topic link def isDescriptionLink(url): - if 'viewtopic' in url: + if '/post/' in url: return True return False # Returns True if the link is a listingPage link def isListingLink(url): - if '.onion/viewforum' in url: + if '/d/' in url: return True return False @@ -278,9 +287,9 @@ def isListingLink(url): def topicPages(html): soup = BeautifulSoup(html, "html.parser") #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) - return abyssForum_links_parser(soup) + return dread_links_parser(soup) def crawler(): startCrawling() - # print("Crawling and Parsing Abyss .... DONE!") + # print("Crawling and Parsing BestCardingWorld .... DONE!") diff --git a/Forums/Dread/parser.py b/Forums/Dread/parser.py new file mode 100644 index 0000000..8de9d0d --- /dev/null +++ b/Forums/Dread/parser.py @@ -0,0 +1,334 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +import datetime + +from Forums.Utilities.utilities import * +from datetime import date +from datetime import timedelta +import re +import traceback +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + + +# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) +def dread_description_parser(soup): + + # Fields to be parsed + + topic = "-1" # 0 *topic name + user = [] # 1 *all users of each post + status = [] # 2 all user's authority in each post such as (adm, member, dangerous) + reputation = [] # 3 all user's karma in each post (usually found as a number) + interest = [] # 4 all user's interest in each post + sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) + post = [] # 6 all messages of each post + feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) + addDate = [] # 8 all dates of each post + + # Finding the topic (should be just one coming from the Listing Page) + container = soup.find('div', {"class": "content"}) + li = container.find("a", {"class": "title"}) + if li == None: + return None + topic = li.text + topic = topic.replace(u'\xa0', ' ') + topic = topic.replace(",","") + topic = topic.replace("\n","") + topic = cleanString(topic.strip()) + # print(topic) + # Finding the repeated tag that corresponds to the listing of posts + + # posts = soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg"}) + \ + # soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg2"}) + + # putting the initial post data since it is separated from comments + # author name + init_post = container.find('div', {"class": "item"}) + author = init_post.find('div', {"class": "author"}).select_one('a[href^="/u/"]').text + flair = init_post.find('div', {"class": "author"}).find("span", {"class": "flair"}) + try: + flair = flair.text.strip() + author = author.replace(flair, '') + except: + pass + author = author.strip() + user.append(cleanString(author)) + # status + flair = init_post.find("span", {"class": "flair"}) + if flair != None: + flair = flair.text.strip() + else: + flair = "-1" + status.append(cleanString(flair)) + # no blurb + interest.append(-1) + # points for post + karma = init_post.find("div", {"class": "voteCount"}) + if karma != None: + karma = karma.text + karma = 
karma.replace("points", "") + karma = karma.replace(":", "") + karma = karma.strip() + else: + karma = "-1" + reputation.append(cleanString(karma)) + # date + spans = init_post.find('div', {"class": "author"}).find('span', recursive=False) + dt = spans['title'] + month = find_month(dt) + split_text = dt.split() + day = int(re.search(r'\d+', split_text[0]).group()) + year = int(split_text[2]) + hm = re.findall(r'\d+', split_text[-1]) + hm[0] = int(hm[0]) + hm[1] = int(hm[1]) + date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1]) + addDate.append(date_time_obj) + + # content + inner = init_post.find("div", {"class": "postContent"}) + inner = inner.text.strip() + post.append(cleanString(inner)) + + # no signature + sign.append(-1) + # no feedback + feedback.append(-1) + + + comments = soup.find('div', {"class": "postComments"}) + if comments == None: + row = (topic, post, user, addDate, feedback, status, reputation, sign, interest) + return row + else: + comments = soup.find('div', {"class": "postComments"}).find_all('div', "comment") + # print(len(posts)) + + # For each message (post), get all the fields we are interested to: + + for ipost in comments: + + # Finding a first level of the HTML page + + # post_wrapper = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "poster"}) + cc = ipost.find('div', {"class": "commentContent"}) + + post_wrapper = cc.find('a', {"class": "username"}).text + flair = cc.find("span", {"class": "flair"}) + try: + flair = flair.text.strip() + post_wrapper = post_wrapper.replace(flair, '') + except: + pass + author = post_wrapper.strip() + user.append(cleanString(author)) + + + # Finding the status of the author + + # Dread does not have membergroup and postgroup, but it has flair, similar enough + + + postgroup = None + if flair != None: + if postgroup != None: + postgroup = postgroup.text.strip() + flair = flair + " - " + postgroup + else: + if postgroup != None: + flair = postgroup.text.strip() + else: + flair = "-1" + + status.append(cleanString(flair)) + # print("status " + cleanString(membergroup)) + # Finding the interest of the author + # Dread does not have blurb + + interest.append(-1) + + # Finding the reputation of the user + # Dread doesn't have reputation per user, but instead each post has its own point system + karma = cc.find('div', {"class": "votes"}) + + if karma != None: + karma = karma.text + karma = karma.replace("points","") + karma = karma.replace(":", "") + karma = karma.strip() + else: + karma = "-1" + reputation.append(cleanString(karma)) + # print("karma " + cleanString(karma)) + # Getting here another good tag to find the post date, post content and users' signature + + postarea = ipost.find('div', {"class": "timestamp"}).find('span', recursive=False) + dt = postarea['title'] + month = find_month(dt) + split_text = dt.split() + day = int(re.search(r'\d+', split_text[0]).group()) + year = int(split_text[2]) + hm = re.findall(r'\d+', split_text[-1]) + hm[0] = int(hm[0]) + hm[1] = int(hm[1]) + date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1]) + addDate.append(date_time_obj) + + # Finding the post + + inner = ipost.find('div', {"class": "commentBody"}) + inner = inner.text.strip() + # print(inner) + post.append(cleanString(inner)) + + # No signature for Dread + + sign.append(-1) + + # As no information about users's feedback was found, just assign "-1" to the variable + + feedback.append("-1") + + # Populate the final variable (this should be a list with all fields scraped) + + row = 
(topic, user, status, reputation, interest, sign, post, feedback, addDate) + + # Sending the results + + return row + +# This is the method to parse the Listing Pages (one page with many posts) + +def dread_listing_parser(soup): + + nm = 0 # *this variable should receive the number of topics + forum = "Dread" # 0 *forum name + board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. + # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) + author = [] # 2 *all authors of each topic + topic = [] # 3 *all topics + views = [] # 4 number of views of each topic + posts = [] # 5 number of posts of each topic + href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between + # Listing and Description pages) + addDate = [] # 7 when the topic was created (difficult to find) + + # Finding the board (should be just one) + + board = soup.find('a', {"class": "banner-top"}).text + board = cleanString(board.strip()) + + # Finding the repeated tag that corresponds to the listing of topics + + itopics = soup.find("div", {"class": "postBoard"}).find_all("div", {"class": "item"}, recursive=False) + + for itopic in itopics: + + # For each topic found, the structure to get the rest of the information can be of two types. Testing all of them + # to don't miss any topic + + # Adding the topic to the topic list + topic_title = itopic.find("a", {"class": "title"}) + title_flair = topic_title.find('span', {"class": "flair"}) + topics = topic_title.text + try: + title_flair = title_flair.text.strip() + topics = topics.replace(title_flair, '') + except: + pass + topics = topics.replace(u'\xa0', ' ') + topics = topics.replace(",", "") + topics = topics.replace("\n", "") + topic.append(cleanString(topics.strip())) + + # Counting how many topics we have found so far + + nm = len(topic) + + # Adding the url to the list of urls + link = topic_title['href'] + link = cleanLink(link) + href.append(link) + + # Finding the author of the topic + ps = itopic.find('div', {"class": "author"}) + post_wrapper = ps.select_one('a[href^="/u/"]').text + flair = ps.find("span", {"class": "flair"}) + try: + flair = flair.text.strip() + post_wrapper = post_wrapper.replace(flair, '') + except: + pass + user = post_wrapper.strip() + author.append(cleanString(user)) + + # Finding the number of replies + meta = itopic.find("div", {"class": "postMain"}) + post = meta.find("a").text + post = post.replace("comments", '').strip() + posts.append(cleanString(post)) + + # Finding the number of Views - not shown in Dread + views.append("-1") + + # If no information about when the topic was added, just assign "-1" to the variable + spans = itopic.find('div', {"class": "author"}).find('span', recursive=False) + dt = spans['title'] + month = find_month(dt) + split_text = dt.split() + day = int(re.search(r'\d+', split_text[0]).group()) + year = int(split_text[2]) + hm = re.findall(r'\d+', split_text[-1]) + hm[0] = int(hm[0]) + hm[1] = int(hm[1]) + date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1]) + addDate.append(date_time_obj) + + return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate) + + +def dread_links_parser(soup): + + # Returning all links that should be visited by the Crawler + + href = [] + #print(soup.find('table', {"class": "tborder clear"}).find( + # 'tbody').find_all('tr', {"class": "inline_row"})) + + listing = soup.find("div", {"class": "postBoard"}).find_all("div",{"class": 
"item"}, recursive=False) + + for a in listing: + link = a.find("a", {"class": "title"}) + link = link['href'] + + href.append(link) + + return href + +def find_month(s): + if 'January' in s: + return 1 + elif 'February' in s: + return 2 + elif 'March' in s: + return 3 + elif 'April' in s: + return 4 + elif 'May' in s: + return 5 + elif 'June' in s: + return 6 + elif 'July' in s: + return 7 + elif 'August' in s: + return 8 + elif 'September' in s: + return 9 + elif 'October' in s: + return 10 + elif 'November' in s: + return 11 + elif 'December' in s: + return 12 diff --git a/Forums/Helium/crawler_selenium.py b/Forums/Helium/crawler_selenium.py new file mode 100644 index 0000000..9de4236 --- /dev/null +++ b/Forums/Helium/crawler_selenium.py @@ -0,0 +1,328 @@ +__author__ = 'DarkWeb' + +''' +Helium Forum Crawler (Selenium) +''' + +from selenium import webdriver +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver.firefox.firefox_profile import FirefoxProfile +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from selenium.webdriver.firefox.service import Service +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By + +from PIL import Image +import urllib.parse as urlparse +import os, time +from datetime import date +import subprocess +from bs4 import BeautifulSoup +from Forums.Initialization.prepare_parser import new_parse +from Forums.Helium.parser import helium_links_parser +from Forums.Utilities.utilities import cleanHTML + +counter = 1 +baseURL = 'http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/' + + +# Opens Tor Browser, crawls the website +def startCrawling(): + # opentor() + # forumName = getForumName() + driver = getAccess() + + if driver != 'down': + try: + login(driver) + crawlForum(driver) + except Exception as e: + print(driver.current_url, e) + closetor(driver) + + # new_parse(forumName, False) + + +# Opens Tor Browser +def opentor(): + global pid + print("Connecting Tor...") + path = open('../../path.txt').readline().strip() + pro = subprocess.Popen(path) + pid = pro.pid + time.sleep(7.5) + input('Tor Connected. 
Press ENTER to continue\n') + return + + +# Login using premade account credentials and do login captcha manually +def login(driver): + #wait for login page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[2]/div/div[1]/div/div/div[2]/form/div[5]/div/button"))) + + #entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') + #Username here + usernameBox.send_keys('holyre') + passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') + #Password here + passwordBox.send_keys('PlatinumBorn2') + + ''' + # wait for captcha page show up + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="captcha_img"]'))) + + # save captcha to local + driver.find_element(by=By.XPATH, value='//*[@id="captcha_img"]').screenshot(r'..\Helium\captcha.png') + + # This method will show image in any image viewer + im = Image.open(r'..\Helium\captcha.png') + + im.show() + + # wait until input space show up + inputBox = driver.find_element(by=By.XPATH, value='//*[@id="captcha"]') + + # ask user input captcha solution in terminal + userIn = input("Enter solution: ") + + # send user solution into the input space + inputBox.send_keys(userIn) + + # click the verify(submit) button + driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div[1]/div/div/div[2]/form/div[5]/div/button").click() + ''' + input("Press ENTER when CAPTCHA is completed\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 50).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/div[2]/div/p'))) + + +# Returns the name of the website +def getForumName(): + name = 'Helium' + return name + + +# Return the link of the website +def getFixedURL(): + url = 'http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/login' + return url + + +# Closes Tor Browser +def closetor(driver): + # global pid + # os.system("taskkill /pid " + str(pro.pid)) + # os.system("taskkill /t /f /im tor.exe") + print('Closing Tor...') + driver.close() + time.sleep(3) + return + + +# Creates FireFox 'driver' and configure its 'Profile' +# to use Tor proxy and socket +def createFFDriver(): + file = open('../../path.txt', 'r') + lines = file.readlines() + + ff_binary = FirefoxBinary(lines[0].strip()) + + ff_prof = FirefoxProfile(lines[1].strip()) + ff_prof.set_preference("places.history.enabled", False) + ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) + ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) + ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) + ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) + ff_prof.set_preference("signon.rememberSignons", False) + ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + ff_prof.set_preference("network.dns.disablePrefetch", True) + ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("permissions.default.image", 2) + ff_prof.set_preference("browser.download.folderList", 2) + ff_prof.set_preference("browser.download.manager.showWhenStarting", False) + ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") + ff_prof.set_preference('network.proxy.type', 1) + ff_prof.set_preference("network.proxy.socks_version", 5) + ff_prof.set_preference('network.proxy.socks', '127.0.0.1') + ff_prof.set_preference('network.proxy.socks_port', 9150) + 
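These set_preference calls are what route the crawler's Firefox instance through Tor's SOCKS proxy on 127.0.0.1:9150, with DNS resolved remotely so lookups do not leak. For reference only, a minimal sketch of the same proxy wiring written against Selenium 4's Options API, where the firefox_binary and firefox_profile arguments are deprecated; both path arguments are placeholders rather than values from this repository:

    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.firefox.service import Service

    def create_tor_driver(firefox_binary_path, geckodriver_path):
        # Point Selenium at the Tor Browser binary and send all traffic
        # through the local Tor SOCKS proxy, resolving DNS on the remote side.
        opts = Options()
        opts.binary_location = firefox_binary_path
        opts.set_preference("network.proxy.type", 1)            # manual proxy configuration
        opts.set_preference("network.proxy.socks", "127.0.0.1")
        opts.set_preference("network.proxy.socks_port", 9150)
        opts.set_preference("network.proxy.socks_remote_dns", True)
        return webdriver.Firefox(service=Service(geckodriver_path), options=opts)
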
ff_prof.set_preference('network.proxy.socks_remote_dns', True) + ff_prof.set_preference("javascript.enabled", True) + ff_prof.update_preferences() + + service = Service(lines[2].strip()) + + driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + + return driver + + +def getAccess(): + url = getFixedURL() + driver = createFFDriver() + try: + driver.get(url) + return driver + except: + driver.close() + return 'down' + + +# Saves the crawled html page +def savePage(page, url): + cleanPage = cleanHTML(page) + filePath = getFullPathName(url) + os.makedirs(os.path.dirname(filePath), exist_ok=True) + open(filePath, 'wb').write(cleanPage.encode('utf-8')) + return + + +# Gets the full path of the page to be saved along with its appropriate file name +def getFullPathName(url): + fileName = getNameFromURL(url) + if isDescriptionLink(url): + fullPath = r'..\Helium\HTML_Pages\\' + str( + "%02d" % date.today().month) + str("%02d" % date.today().day) + str( + "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' + else: + fullPath = r'..\Helium\HTML_Pages\\' + str( + "%02d" % date.today().month) + str("%02d" % date.today().day) + str( + "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' + return fullPath + + +# Creates the file name from passed URL +def getNameFromURL(url): + global counter + name = ''.join(e for e in url if e.isalnum()) + if name == '': + name = str(counter) + counter = counter + 1 + return name + + +def getInterestedLinks(): + links = [] + + # # General Discussion + # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/6') + # # Anonymity and Security + # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/8') + # # Programming + # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/9') + # # Carding Discussions + # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/10') + # # Hacked Database (free) + # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/11') + # Hacking tools, exploits and POC + links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/17') + # # Hacked Database + # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/12') + # # Hacking and other Services + # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/13') + # # Selling/Buying Malware, Exploits etc + # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/22') + # # General Tutorials + # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/18') + # # Hacking Tutorials + # links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/19') + + return links + + +def crawlForum(driver): + print("Crawling the Helium forum") + + linksToCrawl = getInterestedLinks() + # visited = set(linksToCrawl) + # initialTime = time.time() + + i = 0 + count = 0 + while i < len(linksToCrawl): + link = linksToCrawl[i] + print('Crawling :', link) + try: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + has_next_page = True + while has_next_page: + list = topicPages(html) + for item in list: + itemURL = urlparse.urljoin(baseURL, str(item)) + try: + driver.get(itemURL) + except: + driver.refresh() + 
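The savePage and getFullPathName helpers above write every fetched page under a date-stamped folder, split into Listing and Description subfolders, with a filename built from the alphanumeric characters of the URL. A small, self-contained sketch of that layout; example_full_path and its arguments are illustrative names, not part of the Helium crawler:

    import os
    from datetime import date

    def example_full_path(base_dir, url, is_description):
        # Reproduce the <base_dir>\HTML_Pages\MMDDYYYY\(Listing|Description)\<name>.html layout.
        day_dir = date.today().strftime("%m%d%Y")
        kind = "Description" if is_description else "Listing"
        name = "".join(c for c in url if c.isalnum()) or "page"
        return os.path.join(base_dir, "HTML_Pages", day_dir, kind, name + ".html")

    print(example_full_path(r"..\Helium", "http://example.onion/board/17", False))
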
savePage(driver.page_source, item) + driver.back() + # comment out + break + + # comment out + if count == 1: + count = 0 + break + + try: + bar = driver.find_element(by=By.XPATH, value= + '/html/body/div[2]/div/div[3]/ul') + li = bar.find_elements(By.TAG_NAME, 'li')[-1] + link = li.find_element(By.TAG_NAME, 'a').get_attribute('href') + + if link == "": + raise NoSuchElementException + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + count += 1 + + except NoSuchElementException: + has_next_page = False + + except Exception as e: + print(link, e) + i += 1 + + # finalTime = time.time() + # print finalTime - initialTime + + input("Crawling Helium forum done successfully. Press ENTER to continue\n") + + +# Returns 'True' if the link is Topic link +def isDescriptionLink(url): + if 'topic' in url: + return True + return False + + +# Returns True if the link is a listingPage link +def isListingLink(url): + if 'board' in url: + return True + return False + + +# calling the parser to define the links +def topicPages(html): + soup = BeautifulSoup(html, "html.parser") + return helium_links_parser(soup) + + +def crawler(): + startCrawling() + # print("Crawling and Parsing BestCardingWorld .... DONE!") diff --git a/Forums/Helium/parser.py b/Forums/Helium/parser.py new file mode 100644 index 0000000..5a852a8 --- /dev/null +++ b/Forums/Helium/parser.py @@ -0,0 +1,248 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +from Forums.Utilities.utilities import * + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + +# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) +def helium_description_parser(soup): + + # Fields to be parsed + + topic = "-1" # topic name + user = [] # all users of each post + addDate = [] # all dated of each post + feedback = [] # all feedbacks of each vendor (this was found in just one Forum and with a number format) + status = [] # all user's authority in each post such as (adm, member, dangerous) + reputation = [] # all users's karma in each post (usually found as a number) + sign = [] # all user's signature in each post (usually a standard message after the content of the post) + post = [] # all messages of each post + interest = [] # all user's interest in each post + + # Finding the topic (should be just one coming from the Listing Page) + + li = soup.find("h4", {"class": "text-truncated"}) + topic = li.text + topic = topic.replace("Topic:", "") + topic = topic.replace("Post Reply", "") + topic = topic.replace(",", "") + topic = topic.replace("\n", "") + topic = cleanString(topic.strip()) + + # Finding the repeated tag that corresponds to the listing of posts + + posts = soup.findAll('div', {"id": "a9"}) + + # For each message (post), get all the fields we are interested to: + + for ipost in posts: + + # Finding a first level of the HTML page + + # Finding the author (user) of the post + + heading = ipost.find('div', {"class": "panel-heading"}) + title = heading.find('div', {"class": "panel-title"}).text + author = title.replace("User:", "") + author = author.strip() + user.append(cleanString(author)) # Remember to clean the problematic characters + + # Finding the status of the author + # Testing here two possibilities to find this status and combine them + # Helium does not have membergroup and postgroup + + membergroup = heading.find('li', {"class": "membergroup"}) + 
postgroup = heading.find('li', {"class": "postgroup"}) + if membergroup != None: + membergroup = membergroup.text.strip() + if postgroup != None: + postgroup = postgroup.text.strip() + membergroup = membergroup + " - " + postgroup + else: + if postgroup != None: + membergroup = postgroup.text.strip() + else: + membergroup = "-1" + status.append(cleanString(membergroup)) + + # Finding the interest of the author + # Helium does not have blurb + + blurb = heading.find('li', {"class": "blurb"}) + if blurb != None: + blurb = blurb.text.strip() + else: + blurb = "-1" + interest.append(cleanString(blurb)) + + # Finding the reputation of the user + # Helium does not have karma + + karma = heading.find('li', {"class": "karma"}) + if karma != None: + karma = karma.text + karma = karma.replace("Community Rating: ","") + karma = karma.replace("Karma: ","") + karma = karma.strip() + else: + karma = "-1" + reputation.append(cleanString(karma)) + + # Getting here another good tag to find the post date, post content and users' signature + + postarea = ipost.find('div', {"class": "content_body"}) + + # Finding the date of the post + # Helium does not have date + + addDate.append("-1") + + # dt = ipost.find('p', {"class": "author"}).text.split('»')[1] + # # dt = dt.strip().split() + # dt = dt.strip() + # date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p') + # stime = date_time_obj.strftime('%a %b %d, %Y') + # sdate = date_time_obj.strftime('%I:%M %p') + # addDate.append(date_time_obj) + + # date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p') + # smalltext = postarea.find('div', {"class": "flow_hidden"}).find('div', {"class": "keyinfo"})\ + # .find('div', {"class": "smalltext"}) + # sdatetime = smalltext.text + # sdatetime = sdatetime.replace(u"\xab","") # Removing unnecessary characters + # sdatetime = sdatetime.replace(u"\xbb","") # Removing unnecessary characters + # sdatetime = sdatetime.split("on: ") # Removing unnecessary characters + # sdatetime = sdatetime[1].strip() + # stime = sdatetime[:-12:-1] # Finding the time of the post + # stime = stime[::-1] + # sdate = sdatetime.replace(stime,"") # Finding the date of the post + # sdate = sdate.replace(",","") + # sdate = sdate.strip() + + # Covert the date of the post that can be informed as: "12 February 2016", "today", "yesterday". 
We need + # a date format here as "mm/dd/yyyy" + + #addDate.append(convertDate(sdate,"english", crawlerDate) + " " + stime) + + # Finding the post + + paragraphs = postarea.find_all('p') + p = "" + for paragraph in paragraphs: + p += paragraph.text.strip() + " " + quote = postarea.find('div', {"class": "standard_quote"}) + if quote != None: + q = quote.text.strip() + p.replace(q, "") + post.append(cleanString(p.strip())) + + # Finding the users's signature + # Helium does not have signature + + #signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"}) + signature = ipost.find('div', {"class": "post_wrapper"}) + if signature != None: + signature = signature.text.strip() + else: + signature = "-1" + sign.append(cleanString(signature)) + + # As no information about users's feedback was found, just assign "-1" to the variable + + feedback.append("-1") + + # Populate the final variable (this should be a list with all fields scraped) + + row = (topic, post, user, addDate, feedback, status, reputation, sign, interest) + + # Sending the results + + return row + + +# This is the method to parse the Listing Pages (one page with many posts) +def helium_listing_parser(soup): + + board = "-1" # board name (the previous level of the topic in the Forum categorization tree. + # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) + + nm = 0 # this variable should receive the number of topics + topic = [] # all topics + user = [] # all users of each topic + post = [] # number of posts of each topic + view = [] # number of views of each topic + addDate = [] # when the topic was created (difficult to find) + href = [] # this variable should receive all cleaned urls (we will use this to do the marge between + # Listing and Description pages) + + # Finding the board (should be just one) + + parents = soup.find('div', {"class": "col-md-12"}).findAll('li') + board = parents[1].text + u"->" + parents[2].get('title') + board = board.replace("\n", "") + board = cleanString(board.strip()) + + # Finding the repeated tag that corresponds to the listing of topics + + itopics = soup.find('table', {"class": "table"}).find('tbody').findAll('td', {"class": "col-md-8"}) + repliesViews = soup.find('table', {"class": "table"}).find('tbody').findAll('td', {"class": "col-md-2"}) + + # Counting how many topics we have found so far + + nm = len(itopics) + + index = 0 + for itopic in itopics: + + # Adding the topic to the topic list + + topics = itopic.find('a').get('title') + topics = topics.replace(",", "") + topic.append(cleanString(topics.strip())) + + # Adding the url to the list of urls + link = itopic.find('a').get('href') + link = cleanLink(link) + href.append(link) + + # Finding the author of the topic + author = itopic.find('strong').text + user.append(cleanString(author.strip())) + + rv = repliesViews[index].find('p').text.split() + + # Finding the number of replies + posts = rv[0].replace("Replies", "") + post.append(cleanString(posts.strip())) + + # Finding the number of Views + tview = rv[1].replace("Views", "") + view.append(cleanString(tview.strip())) + + # If no information about when the topic was added, just assign "-1" to the variable + # dt = itopic.find('div', {"class": "responsive-hide"}).text.split('»')[1] + # dt = dt.strip() + # date_time_obj = datetime.strptime(dt,'%a %b %d, %Y %I:%M %p') + # addDate.append(date_time_obj) + addDate.append("-1") + index += 1 + + return organizeTopics("Helium", nm, 
topic, board, view, post, user, addDate, href) + + +def helium_links_parser(soup): + + # Returning all links that should be visited by the Crawler + + href = [] + + listing = soup.find('table', {"class": "table"}).find('tbody').findAll('td', {"class": "col-md-8"}) + + for a in listing: + bae = a.find('a', href=True) + link = bae['href'] + href.append(link) + + return href \ No newline at end of file diff --git a/Forums/HiddenAnswers/parser.py b/Forums/HiddenAnswers/parser.py deleted file mode 100644 index 0f2647f..0000000 --- a/Forums/HiddenAnswers/parser.py +++ /dev/null @@ -1,212 +0,0 @@ -__author__ = 'Helium' - -# Here, we are importing the auxiliary functions to clean or convert data -from typing import List -from Forums.Utilities.utilities import * -from datetime import date -from datetime import timedelta -import re - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup, ResultSet, Tag - -# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) - -def HiddenAnswers_description_parser(soup: BeautifulSoup): - - # Fields to be parsed - - topic: str = "-1" # 0 topic name - user: List[str] = [] # 1 all users of each post - addDate: List[datetime] = [] # 2 all dated of each post - feedback: List[str] = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format) - status: List[str] = [] # 4 all user's authority in each post such as (adm, member, dangerous) - reputation: List[str] = [] # 5 all user's karma in each post (usually found as a number) - sign: List[str] = [] # 6 all user's signature in each post (usually a standard message after the content of the post) - post: List[str] = [] # 7 all messages of each post - interest: List[str] = [] # 8 all user's interest in each post - image_user = [] # 9 all user avatars of each post - image_post = [] # 10 all first images of each post - - # Finding the topic (should be just one coming from the Listing Page) - li = soup.find("h1").find("span", {"itemprop": "name"}) - topic = li.text - - question: Tag = soup.find("div", {"class": "qa-part-q-view"}) - - question_user = question.find("span", {"class": "qa-q-view-who-data"}).text - user.append(cleanString(question_user.strip())) - - question_time = question.find("span", {"class": "qa-q-view-when-data"}).find("time").get("datetime") - datetime_string = question_time.split("+")[0] - datetime_obj = datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%S") - addDate.append(datetime_obj) - - question_user_status = question.find("span", {"class": "qa-q-view-who-title"}) - if question_user_status is not None: - question_user_status = question_user_status.text - status.append(cleanString(question_user_status.strip())) - else: - status.append('-1') - - question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"}) - if question_user_karma is not None: - question_user_karma = question_user_karma.text - # Convert karma to pure numerical string - if question_user_karma.find("k") > -1: - question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000) - reputation.append(cleanString(question_user_karma.strip())) - else: - reputation.append('-1') - - question_content = question.find("div", {"class": "qa-q-view-content qa-post-content"}).text - post.append(cleanString(question_content.strip())) - - feedback.append("-1") - sign.append("-1") - interest.append("-1") - - img = question.find('div', {"class": "qa-q-view-content qa-post-content"}).find('img') - if 
img is not None: - img = img.get('src').split('base64,')[-1] - else: - img = "-1" - image_post.append(img) - - img = question.find('span', {"class": "qa-q-view-avatar-meta"}).find('img') - if img is not None: - img = img.get('src').split('base64,')[-1] - else: - img = "-1" - image_user.append(img) - - answer_list: ResultSet[Tag] = soup.find("div", {"class": "qa-a-list"}).find_all("div", {"class": "qa-a-list-item"}) - - - for replies in answer_list: - user_name = replies.find("span", {"class", "qa-a-item-who-data"}).text - user.append(cleanString(user_name.strip())) - - date_added = replies.find("span", {"class": "qa-a-item-when"}).find("time", {"itemprop": "dateCreated"}).get('datetime') - date_string = date_added.split("+")[0] - datetime_obj = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S") - addDate.append(datetime_obj) - - - post_data = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).text - post.append(cleanString(post_data.strip())) - - user_reputations = replies.find("span", {"class", "qa-a-item-who-title"}) - if user_reputations is not None: - user_reputations = user_reputations.text - status.append(cleanString(user_reputations.strip())) - else: - status.append('-1') - - karma = replies.find("span", {"class": "qa-a-item-who-points-data"}) - if karma is not None: - karma = karma.text - # Convert karma to pure numerical string - if karma.find("k") > -1: - karma = str(float(karma.replace("k", "")) * 1000) - reputation.append(cleanString(karma.strip())) - else: - reputation.append('-1') - - feedback.append("-1") - sign.append("-1") - interest.append("-1") - - img = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).find('img') - if img is not None: - img = img.get('src').split('base64,')[-1] - else: - img = "-1" - image_post.append(img) - - img = replies.find('span', {"class": "qa-a-item-avatar-meta"}).find('img') - if img is not None: - img = img.get('src').split('base64,')[-1] - else: - img = "-1" - image_user.append(img) - - # Populate the final variable (this should be a list with all fields scraped) - - row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) - - # Sending the results - return row - - -def HiddenAnswers_listing_parser(soup: BeautifulSoup): - - nm: int = 0 # this variable should receive the number of topics - forum: str = "HiddenAnswers" # 0 *forum name - board = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree. - # For instance: Security/Malware/Tools to hack Facebook. 
The board here should be Malware) - user: List[str] = [] # 2 all users of each topic - topic: List[str] = [] # 3 all topics - view: List[int] = [] # 4 number of views of each topic - post: List[int] = [] # 5 number of posts of each topic - href: List[str] = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between - # Listing and Description pages) - addDate: List[str] = [] # 7 when the topic was created (difficult to find) - image_user = [] # 8 all user avatars used in each topic - - # Finding the board - board = soup.find("div", {"class": "qa-main-heading"}).find("h1").text - board = board.replace('Recent questions in', '') - board = cleanString(board.strip()) - - queries_by_user: ResultSet[Tag] = soup.find("div", {"class": "qa-q-list"}).find_all("div", {"class": "qa-q-list-item"}) - - for queries in queries_by_user: - topic_of_query = queries.find("div", {"class": "qa-q-item-title"}).find("a").text - topic.append(cleanString(topic_of_query.strip())) - - image_user.append("-1") # qa-q-item-where - - author = queries.find("span", {"class": "qa-q-item-who-data"}).text - user.append(cleanString(author.strip())) - - num_answers = queries.find("span", {"class": "qa-a-count-data"}).text - post.append(cleanString(num_answers.strip())) - - view.append("-1") - - date_posted = queries.find("span", {"class": "qa-q-item-when-data"}).text - - if date_posted.find("day") > 0: - datetime_obj = datetime.now() - timedelta(days=1) - else: - try: - datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y") - except ValueError: - datetime_obj = datetime.strptime(f"{date_posted}", "%b %d, %Y") - addDate.append(datetime_obj) - #this link will be cleaned - - listing_href = queries.find("div", {"class": "qa-q-item-title"}).find("a").get("href") - href.append(listing_href) - - nm = len(topic) - - return organizeTopics(forum, nm, board, user, topic, view, post, href, addDate, image_user) - -#need to change this method -def hiddenanswers_links_parser(soup): - - # Returning all links that should be visited by the Crawler - - href = [] - #print(soup.find('table', {"class": "tborder clear"}).find( - # 'tbody').find_all('tr', {"class": "inline_row"})) - listing = soup.find_all('div', {"class": "qa-q-item-title"}) - - for a in listing: - link = a.find('a').get('href') - - href.append(link) - - return href \ No newline at end of file diff --git a/Forums/Initialization/forums_mining.py b/Forums/Initialization/forums_mining.py index 4d68840..af7ce47 100644 --- a/Forums/Initialization/forums_mining.py +++ b/Forums/Initialization/forums_mining.py @@ -6,14 +6,6 @@ Starting point of the Darkweb Forums Mining from datetime import * from Forums.BestCardingWorld.crawler_selenium import crawler as crawlerBestCardingWorld -from Forums.CryptBB.crawler_selenium import crawler as crawlerCryptBB -from Forums.OnniForums.crawler_selenium import crawler as crawlerOnniForums -from Forums.AbyssForum.crawler_selenium import crawler as crawlerAbyssForum -from Forums.Procrax.crawler_selenium import crawler as crawlerProcraxForum -from Forums.HiddenAnswers.crawler_selenium import crawler as crawlerHiddenAnswers -from Forums.Cardingleaks.crawler_selenium import crawler as crawlerCardingleaks -from Forums.Altenens.crawler_selenium import crawler as crawlerAltenens -from Forums.Libre.crawler_selenium import crawler as crawlerLibre import configparser import os @@ -118,22 +110,6 @@ if __name__ == '__main__': if forum == "BestCardingWorld": crawlerBestCardingWorld() - elif forum == 
"CryptBB": - crawlerCryptBB() - elif forum == "OnniForums": - crawlerOnniForums() - elif forum == "AbyssForum": - crawlerAbyssForum() - elif forum == "HiddenAnswers": - crawlerHiddenAnswers() - elif forum == 'Procrax': - crawlerProcraxForum() - elif forum == 'Cardingleaks': - crawlerCardingleaks() - elif forum == 'Altenens': - crawlerAltenens() - elif forum == 'Libre': - crawlerLibre() print("\nScraping process completed!") diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index 79b79a7..91b662f 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -8,14 +8,6 @@ from psycopg2.extras import RealDictCursor from Forums.DB_Connection.db_connection import * from Forums.BestCardingWorld.parser import * -from Forums.Cardingleaks.parser import * -from Forums.CryptBB.parser import * -from Forums.OnniForums.parser import * -from Forums.Altenens.parser import * -from Forums.Procrax.parser import * -from Forums.Libre.parser import * -from Forums.HiddenAnswers.parser import * -from Forums.AbyssForum.parser import * from Forums.Classifier.classify_product import predict # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi @@ -124,22 +116,6 @@ def parse_listing(forum, listingFile, soup, createLog, logFile): if forum == "BestCardingWorld": rw = bestcardingworld_listing_parser(soup) - elif forum == "Cardingleaks": - rw = cardingleaks_listing_parser(soup) - elif forum == "CryptBB": - rw = cryptBB_listing_parser(soup) - elif forum == "OnniForums": - rw = onniForums_listing_parser(soup) - elif forum == "Altenens": - rw = altenens_listing_parser(soup) - elif forum == "Procrax": - rw = procrax_listing_parser(soup) - elif forum == "Libre": - rw = libre_listing_parser(soup) - elif forum == "HiddenAnswers": - rw = HiddenAnswers_listing_parser(soup) - elif forum == "AbyssForum": - rw = abyssForums_listing_parser(soup) else: print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!") raise Exception @@ -163,22 +139,6 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile): if forum == "BestCardingWorld": rmm = bestcardingworld_description_parser(soup) - elif forum == "Cardingleaks": - rmm = cardingleaks_description_parser(soup) - elif forum == "CryptBB": - rmm = cryptBB_description_parser(soup) - elif forum == "OnniForums": - rmm = onniForums_description_parser(soup) - elif forum == "Altenens": - rmm = altenens_description_parser(soup) - elif forum == "Procrax": - rmm = procrax_description_parser(soup) - elif forum == "Libre": - rmm = libre_description_parser(soup) - elif forum == "HiddenAnswers": - rmm = HiddenAnswers_description_parser(soup) - elif forum == "AbyssForum": - rmm = abyssForums_description_parser(soup) else: print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!") raise Exception diff --git a/Forums/Libre/crawler_selenium.py b/Forums/Libre/crawler_selenium.py deleted file mode 100644 index 19a05aa..0000000 --- a/Forums/Libre/crawler_selenium.py +++ /dev/null @@ -1,302 +0,0 @@ -__author__ = 'DarkWeb' - -''' -Libre Forum Crawler (Selenium) -''' - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary -from selenium.webdriver.firefox.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC -from 
selenium.webdriver.support.ui import WebDriverWait - -from PIL import Image -import urllib.parse as urlparse -import os, re, time -import subprocess -from bs4 import BeautifulSoup -from Forums.Initialization.prepare_parser import new_parse -from Forums.Libre.parser import libre_links_parser -from Forums.Utilities.utilities import cleanHTML - -counter = 1 -baseURL = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/' - - -# Opens Tor Browser, crawls the website -def startCrawling(): - forumName = getForumName() - driver = getAccess() - - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closeDriver(driver) - - new_parse(forumName, baseURL, True) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - - input('Press enter when CAPTCHA is completed, and you\'re at the login page') - - #entering username and password into input boxes - usernameBox = driver.find_element(by=By.NAME, value='username') - #Username here - usernameBox.send_keys('ct1234')#sends string to the username box - passwordBox = driver.find_element(by=By.NAME, value='password') - #Password here - passwordBox.send_keys('r5o0wqmw')# sends string to passwordBox - - input("Press the login button and solve the CAPTCHA then press enter\n") - - # input('input') - - # wait for listing page show up (This Xpath may need to change based on different seed url) - # wait for 50 sec until id = tab_content is found, then cont - WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.TAG_NAME, 'nav'))) - - # click link to correct forum board - login_link = driver.find_element(by=By.XPATH, value='/html/body/nav/div[1]/a[3]').get_attribute('href') - driver.get(login_link) # open tab with url - - # wait for listing page show up (This Xpath may need to change based on different seed url) - # wait for 50 sec until id = tab_content is found, then cont - WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div/div/div[3]/div[5]'))) - - -# Returns the name of the website -def getForumName() -> str: - name = 'Libre' - return name - - -# Return the link of the website -def getFixedURL(): - url = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/' - return url - - -# Closes Tor Browser -def closeDriver(driver): - # global pid - # os.system("taskkill /pid " + str(pro.pid)) - # os.system("taskkill /t /f /im tor.exe") - print('Closing Tor...') - driver.close() #close tab - time.sleep(3) - return - - -# Creates FireFox 'driver' and configure its 'Profile' -# to use Tor proxy and socket -def createFFDriver(): - from Forums.Initialization.forums_mining import config - - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - - ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 3) - 
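The login routines in these crawlers block on Selenium explicit waits rather than fixed sleeps. A minimal sketch of that idiom, kept separate from the diff; the function name, locator, and timeout are illustrative, not taken from any one crawler:

# Hedged sketch of the explicit-wait idiom used by the login()/crawlForum()
# routines in this diff; the helper name, locator, and timeout are illustrative.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_listing(driver, timeout=50):
    # Blocks until the element is visible, or raises TimeoutException.
    return WebDriverWait(driver, timeout).until(
        EC.visibility_of_element_located((By.TAG_NAME, 'nav'))
    )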
ff_prof.set_preference("browser.download.folderList", 2) - ff_prof.set_preference("browser.download.manager.showWhenStarting", False) - ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") - ff_prof.set_preference('network.proxy.type', 1) - ff_prof.set_preference("network.proxy.socks_version", 5) - ff_prof.set_preference('network.proxy.socks', '127.0.0.1') - ff_prof.set_preference('network.proxy.socks_port', 9150) - ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", True) - ff_prof.update_preferences() - - service = Service(config.get('TOR', 'geckodriver_path')) - - driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - - driver.maximize_window() - - return driver - - -def getAccess(): - url = getFixedURL() - driver = createFFDriver() - try: - driver.get(url) - return driver - except: - driver.close() - return 'down' - - -# Saves the crawled html page -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - open(filePath, 'wb').write(cleanPage.encode('utf-8')) - return - - -# Gets the full path of the page to be saved along with its appropriate file name -def getFullPathName(url): - from Forums.Initialization.forums_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") - fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') - return fullPath - - -# Creates the file name from passed URL -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if name == '': - name = str(counter) - counter = counter + 1 - return name - - -def getInterestedLinks(): - links = [] - - # cybersecurity - links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/CyberSecurity') - # services - links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/Services') - # programming - links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/Programming') - # jobs for crypto - links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/JobsforCypto') - # darknet markets - links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/DarkNetMarkets') - - return links - - -def crawlForum(driver): - print("Crawling the Libre forum") - - linksToCrawl = getInterestedLinks() - - i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) - try: - has_next_page = True - count = 0 - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(driver, html, link) - - topics = topicPages(html) - for topic in topics: - has_next_topic_page = True - counter = 1 - page = topic - - while has_next_topic_page: - itemURL = urlparse.urljoin(baseURL, str(page)) - try: - driver.get(itemURL) - except: - driver.refresh() - - if isListingLink(driver.current_url): - break - - savePage(driver, driver.page_source, topic + f"page{counter}") # very important - - # # comment out - # if counter == 2: - # break - - try: - page = "" # no next page so far may have some later on - if page == "": - 
raise NoSuchElementException - counter += 1 - - except NoSuchElementException: - has_next_topic_page = False - - # making sure we go back to the listing page (browser back button simulation) - try: - driver.get(link) - except: - driver.refresh() - - # # comment out - # break - # - # # comment out - # if count == 1: - # break - - try: - link = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href') - - if link == "": - raise NoSuchElementException - count += 1 - - except NoSuchElementException: - has_next_page = False - - except Exception as e: - print(link, e) - i += 1 - - print("Crawling the Libre forum done.") - - -# Returns 'True' if the link is Topic link, may need to change for every website -def isDescriptionLink(url): - if '/p/' in url: - return True - return False - - -# Returns True if the link is a listingPage link, may need to change for every website -def isListingLink(url): - if '.onion/c' in url: - return True - return False - - -# calling the parser to define the links -def topicPages(html): - soup = BeautifulSoup(html, "html.parser") - return libre_links_parser(soup) - - -def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... DONE!") diff --git a/Forums/Libre/parser.py b/Forums/Libre/parser.py deleted file mode 100644 index 16113f7..0000000 --- a/Forums/Libre/parser.py +++ /dev/null @@ -1,249 +0,0 @@ -__author__ = 'DarkWeb' - -# Here, we are importing the auxiliary functions to clean or convert data -from Forums.Utilities.utilities import * -from datetime import date -from datetime import timedelta -import re - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup, ResultSet, Tag - - -# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) -def libre_description_parser(soup: Tag): - # Fields to be parsed - - topic = "-1" # 0 *topic name - user = [] # 1 *all users of each post - status = [] # 2 all user's authority in each post such as (adm, member, dangerous) - reputation = [] # 3 all user's karma in each post (usually found as a number) - interest = [] # 4 all user's interest in each post - sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) - post = [] # 6 all messages of each post - feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) - addDate = [] # 8 all dates of each post - image_user = [] # 9 all user avatars of each post - image_post = [] # 10 all first images of each post - - # Finding the topic (should be just one coming from the Listing Page) - - topic_found = soup.find("a", {"class": "link text-xl text-zinc-300"}).text - topic = cleanString(topic_found.strip()) - - original_post: Tag = soup.find("div", {"class": "flex items-start"}) - - original_user = original_post.find("div", {"class": "info-p"}).find("a", {"class": "link"}).text - user.append(cleanString(original_user.replace("/u/", "").strip())) - - original_user_statistics: ResultSet[Tag] = original_post.find("div", {"class": "info-p"}).find_all("span") - - original_time = original_user_statistics[0].text[2:] - datetime_append = datetime.strptime(original_time, "%Y-%m-%d %H:%M:%S GMT") - addDate.append(datetime_append) - - original_karma = original_user_statistics[1].text[2] - reputation.append(cleanString(original_karma.strip())) - - original_content = soup.find("div", {"class": "content-p"}).text - post.append(cleanString(original_content.strip())) - - - 
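The avatar handling in these parsers shares one idiom: keep only the base64 payload of an inline data-URI image, or fall back to the "-1" sentinel. A self-contained sketch of that idiom; the HTML fragment is invented for illustration:

# Sketch of the inline-avatar extraction idiom (split('base64,')[-1]) used by
# these parsers; the sample markup is invented for illustration.
from bs4 import BeautifulSoup

sample = '<div class="info-p"><img src="data:image/png;base64,iVBORw0KGgo="></div>'
img = BeautifulSoup(sample, 'html.parser').find('img')
if img is not None:
    img = img.get('src').split('base64,')[-1]   # keep only the encoded payload
else:
    img = '-1'                                  # the parsers' "missing" sentinel
print(img)  # iVBORw0KGgo=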
status.append("-1") - interest.append("-1") - sign.append("-1") - feedback.append("-1") - - image_post.append("-1") - - img = original_post.find('img') - if img is not None: - img = img.get('src').split('base64,')[-1] - else: - img = "-1" - image_user.append(img) - - # Finding the repeated tag that corresponds to the listing of posts - - # try: - posts: ResultSet[Tag] = soup.find_all("div", {"class": "flex items-stretch"}) - - # For each message (post), get all the fields we are interested to: - - for ipost in posts: - # Finding a first level of the HTML page - - # Finding the author (user) of the post - - user_name = ipost.find("p", {"class": "text-zinc-400 text-justify"}).find("a", {"class": "link"}).text - user.append(cleanString(user_name.replace("/u/", "").strip())) # Remember to clean the problematic characters - - status.append("-1") - - # Finding the interest of the author - # CryptBB does not have blurb - - interest.append("-1") - - # Finding the reputation of the user - # CryptBB does have reputation - - karma = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text - karma_cleaned = karma.split(" ")[6] - reputation.append(cleanString(karma_cleaned.strip())) - - # Getting here another good tag to find the post date, post content and users' signature - - date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text - date_time_cleaned = date_posted.replace(user_name, "")[3:-12] - datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT") - addDate.append(datetime_append) - - # Finding the post - user_post = ipost.find("div", {"class": "content-c"}).text - post.append(cleanString(user_post)) - - # Finding the user's signature - - sign.append("-1") - - # As no information about user's feedback was found, just assign "-1" to the variable - - feedback.append("-1") - - # As no information about post's image was found, just assign "-1" to the variable - - image_post.append("-1") - - # As no information about user's image was found, just assign "-1" to the variable - - image_user.append("-1") - - # Populate the final variable (this should be a list with all fields scraped) - # print(topic) - # print(user) - # print(status) - # print(reputation) - # print(interest) - # print(sign) - # print(post) - # print(feedback) - # print(addDate) - # print(len(user)) - # print(len(status)) - # print(len(reputation)) - # print(len(interest)) - # print(len(sign)) - # print(len(feedback)) - # print(len(addDate)) - - row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) - - # Sending the results - - return row - - -# This is the method to parse the Listing Pages (one page with many posts) -def libre_listing_parser(soup): - nm = 0 # *this variable should receive the number of topics - forum = "Libre" # 0 *forum name - board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. - # For instance: Security/Malware/Tools to hack Facebook. 
The board here should be Malware) - author = [] # 2 *all authors of each topic - topic = [] # 3 *all topics - views = [] # 4 number of views of each topic - posts = [] # 5 number of posts of each topic - href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between - # Listing and Description pages) - addDate = [] # 7 when the topic was created (difficult to find) - image_author = [] # 8 all author avatars used in each topic - - # Finding the board (should be just one) - - board = soup.find('div', {"class": "title"}).find("h1").text - board = cleanString(board.strip()) - - # Finding the repeated tag that corresponds to the listing of topics - - itopics = soup.find("div", {"class", "space-y-2 mt-4"}).find_all('div', {"class": "flex box"}) - - nm = 0 - for itopic in itopics: - nm += 1 - # For each topic found, the structure to get the rest of the information can be of two types. Testing all of them - # to don't miss any topic - - # Adding the topic to the topic list - topic_string = itopic.find("a", {"class": "link text-xl text-zinc-300"}).text - cleaned_topic_string = cleanString(topic_string.strip()) - topic.append(cleaned_topic_string) - - image_author.append("-1") - - # Adding the url to the list of urls - link_to_clean = itopic.find('div', {'class': 'flex space-x-2 items-center'}).find('a').get('href') - - href.append(link_to_clean) - - # Finding the author of the topic - username_not_cleaned = itopic.find('div', {"class": "flex-grow p-2 text-justify"}).find('a').text - username_cleaned = username_not_cleaned.split("/")[-1] - author.append(cleanString(username_cleaned)) - - # Finding the number of views - num_views = itopic.find_all("div", {"class": "flex items-center"})[0].find("p").text - views.append(cleanString(num_views)) - - # Finding the number of replies - num_replies = itopic.find_all("div", {"class": "flex items-center"})[1].find("p").text - posts.append(cleanString(num_replies)) - - # If no information about when the topic was added, just assign "-1" to the variable - - date_time_concatenated = itopic.find("p", {"class": "text-sm text-zinc-400 italic"}).text - date_time_cleaned = date_time_concatenated.replace(username_not_cleaned, "") - # creating the datetime object - date_time_array = date_time_cleaned[3:] - datetime_append = datetime.strptime(date_time_array, "%Y-%m-%d %H:%M:%S GMT") - addDate.append(datetime_append) - - # print(forum) - # print(nm) - # print(board) - # print(author) - # print(topic) - # print(views) - # print(href) - # print(addDate) - # print(len(author)) - # print(len(topic)) - # print(len(views)) - # print(len(href)) - # print(len(addDate)) - - return organizeTopics( - forum=forum, - nm=nm, - board=board, - author=author, - topic=topic, - views=views, - posts=posts, - href=href, - addDate=addDate, - image_author=image_author - ) - - -def libre_links_parser(soup): - # Returning all links that should be visited by the Crawler - href = [] - listing = soup.find("div", {"class", "space-y-2 mt-4"}).find_all('div', {"class": "flex box"}) - - for a in listing: - link = a.find('div', {'class': 'flex space-x-2 items-center'}).find('a').get('href') - - href.append(link) - - return href diff --git a/Forums/OnniForums/crawler_selenium.py b/Forums/OnniForums/crawler_selenium.py deleted file mode 100644 index d801d29..0000000 --- a/Forums/OnniForums/crawler_selenium.py +++ /dev/null @@ -1,310 +0,0 @@ -__author__ = 'Helium' - -''' -OnniForums Crawler (Selenium) -Now goes through multiple topic pages. 
-''' - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary -from selenium.webdriver.firefox.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.ui import WebDriverWait -from PIL import Image - -import urllib.parse as urlparse -import os, re, time -import configparser -from datetime import date -import subprocess -from bs4 import BeautifulSoup -from Forums.Initialization.prepare_parser import new_parse -from Forums.OnniForums.parser import onniForums_links_parser -from Forums.Utilities.utilities import cleanHTML - -counter = 1 -baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/' - - -# Opens Tor Browser, crawls the website -def startCrawling(): - forumName = getForumName() - driver = getAccess() - - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closeDriver(driver) - - new_parse(forum=forumName, url=baseURL, createLog=True) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - #click login button - login_link = driver.find_element( - by=By.XPATH, value='/html/body/div/div[1]/div[2]/div[1]/div/span/a[1]').get_attribute('href') - driver.get(login_link) - - #entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input') - #Username here - usernameBox.send_keys('cabbage_purely') - passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input') - #Password here - passwordBox.send_keys('$ourP@tchK1ds') - - clicker = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/div/input') - clicker.click() - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '//*[@id="content"]'))) - - -# Returns the name of the website -def getForumName(): - name = 'OnniForums' - return name - - -# Return the link of the website -def getFixedURL(): - url = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/' - return url - - -# Closes Tor Browser -def closeDriver(driver): - # global pid - # os.system("taskkill /pid " + str(pro.pid)) - # os.system("taskkill /t /f /im tor.exe") - print('Closing Tor...') - driver.close() - time.sleep(3) - return - - -# Creates FireFox 'driver' and configure its 'Profile' -# to use Tor proxy and socket -def createFFDriver(): - from Forums.Initialization.forums_mining import config - - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - - ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - 
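Each crawler names its saved HTML pages with a getNameFromURL() helper that strips every non-alphanumeric character from the URL and falls back to a running counter when nothing is left. A stand-alone sketch of that behaviour; the example URL is illustrative only:

# Stand-alone sketch of the getNameFromURL() naming scheme used by the crawlers;
# the example URL is illustrative only.
counter = 1

def name_from_url(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter += 1
    return name

print(name_from_url('http://example.onion/Forum-Hacking'))
# -> 'httpexampleonionForumHacking'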
ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 3) - ff_prof.set_preference("browser.download.folderList", 2) - ff_prof.set_preference("browser.download.manager.showWhenStarting", False) - ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") - ff_prof.set_preference('network.proxy.type', 1) - ff_prof.set_preference("network.proxy.socks_version", 5) - ff_prof.set_preference('network.proxy.socks', '127.0.0.1') - ff_prof.set_preference('network.proxy.socks_port', 9150) - ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", True) - ff_prof.update_preferences() - - service = Service(config.get('TOR', 'geckodriver_path')) - - driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - - driver.maximize_window() - - return driver - - -def getAccess(): - url = getFixedURL() - driver = createFFDriver() - try: - driver.get(url) - return driver - except: - driver.close() - return 'down' - - -# Saves the crawled html page -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - open(filePath, 'wb').write(cleanPage.encode('utf-8')) - return - - -# Gets the full path of the page to be saved along with its appropriate file name -def getFullPathName(url): - from Forums.Initialization.forums_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") - fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') - return fullPath - - -# Creates the file name from passed URL -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if (name == ''): - name = str(counter) - counter = counter + 1 - return name - - -def getInterestedLinks(): - links = [] - - # Hacking & Cracking tutorials - links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-tutorials') - # # Hacking & Cracking questions - # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-questions') - # # Exploit PoCs - # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Exploit-PoCs') - # # sellers - # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Sellers') - # # buyers questions - # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Buyers-Questions') - # # combo lists - # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Combo-lists') - # # Malware-development - # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Malware-development') - # # coding - # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Coding') - # # Carding & Fraud - # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Carding-Fraud') - # # OPSEC - # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-OPSEC--13') - - return links - - -def crawlForum(driver): - print("Crawling 
the OnniForums forum") - - linksToCrawl = getInterestedLinks() - - i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) - try: - has_next_page = True - count = 0 - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(driver, html, link) - - topics = topicPages(html) - for topic in topics: - has_next_topic_page = True - counter = 1 - page = topic - - while has_next_topic_page: - itemURL = urlparse.urljoin(baseURL, str(page)) - try: - driver.get(itemURL) - except: - driver.refresh() - - if isListingLink(driver.current_url): - break - - savePage(driver, driver.page_source, topic + f"page{counter}") # very important - - # # comment out - # if counter == 2: - # break - - try: - temp = driver.find_element(by=By.CLASS_NAME, value='float_left') - page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') - - if page == "": - raise NoSuchElementException - counter += 1 - - except NoSuchElementException: - has_next_topic_page = False - - # making sure we go back to the listing page (browser back button simulation) - try: - driver.get(link) - except: - driver.refresh() - - # # comment out - # break - # - # # comment out - # if count == 1: - # break - - try: - temp = driver.find_element(by=By.CLASS_NAME, value='float_left') - link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') - - if link == "": - raise NoSuchElementException - count += 1 - - except NoSuchElementException: - has_next_page = False - - except Exception as e: - print(link, e) - i += 1 - - print("Crawling the OnniForums forum done.") - - -# Returns 'True' if the link is Topic link -def isDescriptionLink(url): - if 'Thread' in url: - return True - return False - - -# Returns True if the link is a listingPage link -def isListingLink(url): - if '.onion/Forum' in url: - return True - return False - - -# calling the parser to define the links -def topicPages(html): - soup = BeautifulSoup(html, "html.parser") - #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) - return onniForums_links_parser(soup) - - -def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... 
DONE!") diff --git a/Forums/OnniForums/parser.py b/Forums/OnniForums/parser.py deleted file mode 100644 index 72674b1..0000000 --- a/Forums/OnniForums/parser.py +++ /dev/null @@ -1,222 +0,0 @@ -__author__ = 'DarkWeb' - -# Here, we are importing the auxiliary functions to clean or convert data -from typing import List -from Forums.Utilities.utilities import * -from datetime import date -from datetime import timedelta -import re -import string - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup - -# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) - -def onniForums_description_parser(soup: BeautifulSoup) -> tuple: - - topicName: str = "-1" # 0 *topic name - users : List[str] = [] # 1 *all users of each post - statuses : List[str] = [] # 2 all user's authority in each post such as (adm, member, dangerous) - reputations : List[str] = [] # 3 all user's karma in each post (usually found as a number) - interests : List[str] = [] # 4 all user's interest in each post - signs : List[str] = [] # 5 all user's signature in each post (usually a standard message after the content of the post) - posts : List[str] = [] # 6 all messages of each post - feedbacks : List[str] = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) - addDates : List[datetime] = [] # 8 all dates of each post - image_user : List[str] = [] # 9 all user avatars of each post - image_post : List[str] = [] # 10 all first images of each post - - # Getting the topicName - topicName = soup.find("table", {"class": "tborder tfixed clear"}) \ - .find("td", {"class": "thead"}) \ - .find_all("div")[-1].text - - topicName = cleanString(topicName.strip()) - - topics_array = soup.find_all("div", {"class": "post"}) - - - for topic in topics_array: - # Extracting and cleaning author information - author_information: BeautifulSoup = topic.find("div", {"class": "author_information"}) - - username: str = author_information.find("span", {"class": "largetext"}).text - username_cleaned = cleanString(username.strip()) - users.append(username_cleaned) - - user_status: str = author_information.find("span", {"class": "smalltext"}).text - - - # Banned users often have weird text issues in HTML - # So we detect banned users and give them a unique string - if user_status.find("Banned") > 0: user_status_cleaned = "Banned" - - elif user_status.find("Unregistered") > 0: user_status_cleaned = "Unregistered" - - else: user_status_cleaned = cleanString(user_status.strip()) # Remove excessive spaces in string - - # Add cleaned data into array - statuses.append(user_status_cleaned) - - if user_status_cleaned in ['Unregistered', 'Banned']: reputations.append(-1) - else: - author_statistics: BeautifulSoup = topic.find("div", {"class": "author_statistics"}) - - reputation: str = author_statistics.find_all("div", {"class": "float_right"})[-1].text - reputation_cleaned = cleanString(reputation.strip()) - reputations.append(reputation_cleaned) - - # Append a "-1" to `interests` and `signs` array since they don't exist on this forum - interests.append("-1") - signs.append("-1") - - post_content: str = topic.find("div", {"class": "post_body scaleimages"}).text - # Clean post content of excessive spaces and characters - post_content_cleaned = post_content.replace("[You must reply to view this hidden content]", "") - post_content_cleaned = cleanString(post_content_cleaned.strip()) - posts.append(post_content_cleaned) - - # Append a "-1" 
to `feedbacks` array since they don't exists on this forum - feedbacks.append("-1") - - date_posted = topic.find("span", {"class": "post_date"}).text.strip() - if 'modified' in date_posted: - date_posted = date_posted.split('(')[0].strip() - - if 'Today' in date_posted or 'Yesterday' in date_posted: - day = topic.find("span", {"class": "post_date"}).find('span').get('title').strip() - time = date_posted.split(',')[1].strip() - date_posted = day + ', ' + time - date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p") - - elif 'hour' in date_posted or 'minute' in date_posted: - date_posted = topic.find("span", {"class": "post_date"}).find('span').get('title').strip() - date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p") - - else: - date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p") - - addDates.append(date_object) - - image_post.append("-1") - - avatar = topic.find('div', {"class": "author_avatar"}) - if avatar is not None: - img = avatar.find('img') - if img is not None: - img = img.get('src').split('base64,')[-1] - else: - img = '-1' - else: - img = "-1" - image_user.append(img) - - # TESTING PURPOSES - DO NOT REMOVE - - # Populate the final variable (this should be a list with all fields scraped) - - row = (topicName, users, statuses, reputations, interests, signs, posts, feedbacks, addDates, image_user, image_post) - - # Sending the results - - return row - - - -def onniForums_listing_parser(soup: BeautifulSoup): - - nm = 0 # this variable should receive the number of topics - forum = "OnniForums" # 0 *forum name - boardName = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree. - # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) - user: List[str] = [] # 2 all users of each topic - topic : List[str] = [] # 3 all topics - view: List[int] = [] # 4 number of views of each topic - post : List[int] = [] # 5 number of posts of each topic - href: List[str] = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between Listing and Description pages) - addDate : List[str] = [] # 7 when the topic was created (difficult to find) - image_author : List[str] = [] # 8 all author avatars used in each topic - - # Finding the board (should be just one) - board_metadata: BeautifulSoup = soup.find("table",{"class" : "tborder clear"}) - - boardName = board_metadata.find_all("div")[1].text - boardName = cleanString(boardName.strip()) - - thread_arrays = board_metadata.find_all("tr", {"class":"inline_row"}) # gets the information of posts - - nm = len(thread_arrays) - - for thread in thread_arrays: #getting the information from the posts and sorting them into the arrays defined above - - body = thread.find("span",{"class": "subject_new"}) - try: - post_subject: str = body.text #getting the topic - except: - body = thread.find("span",{"class": "subject_old"}) - post_subject: str = body.text - - post_subject_cleaned = cleanString(post_subject.strip()) - topic.append(post_subject_cleaned) - - author_icon = thread.find('div', {"class": "lavatar-old lavatar-old-f"}) - if author_icon != None: - author_icon = author_icon.find('img') - author_icon = author_icon.get('src') - author_icon = author_icon.split('base64,')[-1] - else: - author_icon = "-1" - image_author.append(author_icon) - - reply_count = thread.find_all("td", {"align": "center"})[2].text - post.append(cleanNumbers(reply_count)) - - views = thread.find_all("td", {"align": "center"})[3].text - 
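The reply and view counts in this listing parser come out of the centred table cells as free text. A hedged sketch of that extraction with an invented row; the regex stands in for the project's cleanNumbers() helper, whose exact behaviour is not shown in this diff:

# Hedged sketch of pulling reply/view counts from the centred cells of an
# "inline_row"; the HTML fragment is invented and the regex stands in for the
# project's cleanNumbers() helper.
import re
from bs4 import BeautifulSoup

sample = '''
<tr class="inline_row">
  <td align="center">icon</td><td align="center">prefix</td>
  <td align="center">12</td><td align="center">1,204</td>
</tr>
'''
row = BeautifulSoup(sample, 'html.parser')
cells = row.find_all('td', {'align': 'center'})
replies = re.sub(r'[^0-9]', '', cells[2].text)  # '12'
views = re.sub(r'[^0-9]', '', cells[3].text)    # '1204'
print(replies, views)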
view.append(cleanNumbers(views)) - - # dates_added: str = thread.find("span",{"class" : "thread_start_datetime smalltext"}).text - # dates_added_cleaned = dates_added.split(',')[0] - # addDate.append(dates_added_cleaned) - - author = thread.find("span",{"class" : "author smalltext"}).text - author_cleaned = cleanString(author.strip()) - user.append(author_cleaned) - - thread_link = body.find('a').get('href') - href.append(thread_link) - - return organizeTopics( - forum=forum, - nm=nm, - board=boardName, - author=user, - topic=topic, - views=view, - posts=post, - href=href, - addDate=addDate, - image_author=image_author - ) - - - - - - -# This is the method to parse the Listing Pages (one page with many posts) - -def onniForums_links_parser(soup: BeautifulSoup): - - href = [] - listing = soup.find_all('tr', {'class': 'inline_row'}) - - for thread in listing: - try: - link = thread.find('span', {"class": "subject_old"}).find('a').get('href') - except: - link = thread.find('span', {"class": "subject_new"}).find('a').get('href') - - href.append(link) - - return href \ No newline at end of file diff --git a/Forums/OnniForums/testing.py b/Forums/OnniForums/testing.py deleted file mode 100644 index c18cfd4..0000000 --- a/Forums/OnniForums/testing.py +++ /dev/null @@ -1,57 +0,0 @@ -import os -from Forums.OnniForums.parser import onniForums_description_parser -from Forums.OnniForums.parser import onniForums_listing_parser -from bs4 import BeautifulSoup - -baseUrl = './HTML_Pages/06272023/Listing/httponnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qdonionForumCardingFraud.html' - -with open(baseUrl, 'r') as file: - testHTML = file.read() - -soup = BeautifulSoup(testHTML, 'html.parser') - -output = onniForums_listing_parser(soup) - -print(output) - -all_descriptions = os.listdir("./HTML_Pages/06272023/Description/")[1:] - -total = len(all_descriptions) -descriptions_with_unicode_error = 0 - -print("\nTESTING DESCRIPTION PARSER:\n") - -for desc in all_descriptions: - - print(f"\nTesting: ./HTML_Pages/06272023/Description/{desc} \n") - - - - try: - with open(f"./HTML_Pages/06272023/Description/{desc}", "r") as file: - test_html = file.read() - - soup = BeautifulSoup(test_html, features="html.parser") - - description_output = onniForums_description_parser(soup) - - print(f"\nTopic name : {description_output[0]}") - print(f"Contents : {description_output[1]}") - print(f"Users : {description_output[2]}") - print(f"Dates posted: {description_output[3]}") - print(f"Feedbacks : {description_output[4]}") - print(f"Statuses : {description_output[5]}") - print(f"Reputations : {description_output[6]}") - print(f"Signatures : {description_output[7]}") - print(f"Interests : {description_output[8]}\n") - - except UnicodeDecodeError: - descriptions_with_unicode_error += 1 - print(f"UnicodeDecodeError: the file `{desc}` cannot be decoded by Python!") - -print("\nTESTING COMPLETE\n") -print(f"Number of descriptions : {total}") -print(f"Descriptions w/ errors : {descriptions_with_unicode_error}") -print(f"Failure percentage : {round(descriptions_with_unicode_error/total, 4) * 100}%\n") - - diff --git a/Forums/Procrax/crawler_selenium.py b/Forums/Procrax/crawler_selenium.py deleted file mode 100644 index c12088a..0000000 --- a/Forums/Procrax/crawler_selenium.py +++ /dev/null @@ -1,321 +0,0 @@ -__author__ = 'Helium' - -''' -Procrax Forum Crawler (Selenium) -rechecked and confirmed -''' - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from 
selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary -from selenium.webdriver.firefox.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.ui import WebDriverWait -from PIL import Image - -import urllib.parse as urlparse -import os, re, time -from datetime import date -import configparser -import subprocess -from bs4 import BeautifulSoup -from Forums.Initialization.prepare_parser import new_parse -from Forums.Procrax.parser import procrax_links_parser -from Forums.Utilities.utilities import cleanHTML - -counter = 1 -BASE_URL = 'https://procrax.cx/' -FORUM_NAME = 'Procrax' - - -# Opens Tor Browser, crawls the website -def startCrawling(): - driver = getAccess() - - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closeDriver(driver) - - new_parse(forum=FORUM_NAME, url=BASE_URL, createLog=True) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div[1]/div[3]/div[2]/div[3]/div[2]/div[1]/form/div/div/div/dl[4]/dd/div/div[2]/button/span'))) - #entering username and password into input boxes - usernameBox = driver.find_element(by=By.NAME, value='login') - #Username here - usernameBox.send_keys('cheese_pizza_man')#sends string to the username box - passwordBox = driver.find_element(by=By.NAME, value='password') - #Password here - passwordBox.send_keys('Gr33nSp@m&3ggs')# sends string to passwordBox - - clicker = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div[2]/div[1]/form/div/div/div/dl[4]/dd/div/div[2]/button/span') - clicker.click() - - # # wait for listing page show up (This Xpath may need to change based on different seed url) - # # wait for 50 sec until id = tab_content is found, then cont - WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div[1]/div[3]/div[2]/div[3]/div[1]/div/div[1]/div'))) - - -# Returns the name of the website -def getForumName(): - name = 'Procrax' - return name - - -# Return the link of the website -def getFixedURL(): - url = 'https://procrax.cx/' - return url - - -# Closes Tor Browser -def closeDriver(driver): - # global pid - # os.system("taskkill /pid " + str(pro.pid)) - # os.system("taskkill /t /f /im tor.exe") - print('Closing Tor...') - driver.close() #close tab - time.sleep(3) - return - - -# Creates FireFox 'driver' and configure its 'Profile' -# to use Tor proxy and socket -def createFFDriver(): - from Forums.Initialization.forums_mining import config - - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - - ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) - 
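The Firefox profile built by createFFDriver() routes all traffic through a local Tor SOCKS listener. A condensed, commented sketch of just the proxy-related preferences; the profile path is an assumption and 9150 is the Tor Browser default SOCKS port:

# Condensed sketch of the Tor-proxy portion of the profile built by
# createFFDriver(); the profile path is an assumption, 9150 is the Tor
# Browser default SOCKS listener.
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

def tor_profile(path='/path/to/profile'):
    prof = FirefoxProfile(path)
    prof.set_preference('network.proxy.type', 1)                  # manual proxy settings
    prof.set_preference('network.proxy.socks', '127.0.0.1')       # local Tor client
    prof.set_preference('network.proxy.socks_port', 9150)         # Tor Browser SOCKS port
    prof.set_preference('network.proxy.socks_version', 5)
    prof.set_preference('network.proxy.socks_remote_dns', True)   # resolve .onion names via Tor
    prof.update_preferences()
    return prof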
ff_prof.set_preference("permissions.default.image", 3) - ff_prof.set_preference("browser.download.folderList", 2) - ff_prof.set_preference("browser.download.manager.showWhenStarting", False) - ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") - ff_prof.set_preference('network.proxy.type', 1) - ff_prof.set_preference("network.proxy.socks_version", 5) - ff_prof.set_preference('network.proxy.socks', '127.0.0.1') - ff_prof.set_preference('network.proxy.socks_port', 9150) - ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", True) - ff_prof.update_preferences() - - service = Service(config.get('TOR', 'geckodriver_path')) - - driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - - driver.maximize_window() - - return driver - -def getAccess(): - driver = createFFDriver() - try: - driver.get(BASE_URL)# open url in browser - return driver - except: - driver.close()# close tab - return 'down' - - -# Saves the crawled html page -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - open(filePath, 'wb').write(cleanPage.encode('utf-8')) - return - - -# Gets the full path of the page to be saved along with its appropriate file name -def getFullPathName(url): - from Forums.Initialization.forums_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + FORUM_NAME + "/HTML_Pages") - fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') - return fullPath - - -# Creates the file name from passed URL -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if (name == ''): - name = str(counter) - counter = counter + 1 - return name - - -def getInterestedLinks(): - links = [] - - # verified sales - links.append('https://procrax.cx/forums/verified-sales-market.10/') - # unverified sales - links.append('https://procrax.cx/forums/unverified-sales-market.12/') - # combos - links.append('https://procrax.cx/forums/bases.79/') - # tools - links.append('https://procrax.cx/forums/tools.81/') - # configs - links.append('https://procrax.cx/forums/configs.82/') - # craxtube - links.append('https://procrax.cx/forums/craxtube.83/') - # general hacking - links.append('https://procrax.cx/forums/general-hacking.24/') - # hacking security tools - links.append('https://procrax.cx/forums/hacking-security-tools.20/') - # hacktube - links.append('https://procrax.cx/forums/hacktube.22/') - # cardingtube - links.append('https://procrax.cx/forums/cardingtube.26/') - # cardable - links.append('https://procrax.cx/forums/cardable-websites.28/') - # spam software - links.append('https://procrax.cx/forums/mailing.72/') - # spam tools - links.append('https://procrax.cx/forums/tools-bots-validators.73/') - # darknet news - links.append('https://procrax.cx/forums/darknet-news-articles.42/') - # links - links.append('https://procrax.cx/forums/darknet-markets-deep-onion-links.43/') - # courses - links.append('https://procrax.cx/forums/courses.59/') - # software - links.append('https://procrax.cx/forums/software.76/') - # general forum - links.append('https://procrax.cx/forums/forum-discussions-updates.7/') - - return links - - -def 
crawlForum(driver): - print("Crawling the Procrax forum") - - linksToCrawl = getInterestedLinks() - - i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) - try: - has_next_page = True - count = 0 - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(driver, html, link) - - topics = topicPages(html) - for topic in topics: - has_next_topic_page = True - counter = 1 - page = topic - - while has_next_topic_page: - itemURL = urlparse.urljoin(BASE_URL, str(page)) - try: - driver.get(itemURL) - except: - driver.refresh() - - if isListingLink(driver.current_url): - break - - savePage(driver, driver.page_source, topic + f"page{counter}") # very important - - # # comment out - # if counter == 2: - # break - - try: - page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') - - if page == "": - raise NoSuchElementException - counter += 1 - - except NoSuchElementException: - has_next_topic_page = False - - # making sure we go back to the listing page (browser back button simulation) - try: - driver.get(link) - except: - driver.refresh() - - # # comment out - # break - # - # # comment out - # if count == 1: - # break - - try: - - link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') - - if link == "": - raise NoSuchElementException - count += 1 - - except NoSuchElementException: - has_next_page = False - - except Exception as e: - print(link, e) - i += 1 - - print("Crawling the Procrax forum done.") - - -# Returns 'True' if the link is Topic link, may need to change for every website -def isDescriptionLink(url): - if 'threads' in url: - return True - return False - - -# Returns True if the link is a listingPage link, may need to change for every website -def isListingLink(url): - if '.cx/forums' in url: - return True - return False - - -# calling the parser to define the links -def topicPages(html): - soup = BeautifulSoup(html, "html.parser") - #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) - return procrax_links_parser(soup) - - -def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... 
DONE!") diff --git a/Forums/Procrax/parser.py b/Forums/Procrax/parser.py deleted file mode 100644 index dda0090..0000000 --- a/Forums/Procrax/parser.py +++ /dev/null @@ -1,189 +0,0 @@ -__author__ = 'Helium' - -# Here, we are importing the auxiliary functions to clean or convert data -from Forums.Utilities.utilities import * -from datetime import date -from datetime import timedelta -import re - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup, ResultSet, Tag - -# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) - - -def procrax_description_parser(soup: Tag): - - # Fields to be parsed - - topic = "-1" # 0 topic name - user = [] # 1 all users of each post - addDate = [] # 2 all dated of each post - feedback = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format) - status = [] # 4 all user's authority in each post such as (adm, member, dangerous) - reputation = [] # 5 all user's karma in each post (usually found as a number) - sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post) - post = [] # 7 all messages of each post - interest = [] # 8 all user's interest in each post - image_user = [] # 9 all user avatars of each post - image_post = [] # 10 all first images of each post - - # Finding the topic (should be just one coming from the Listing Page) - - li = soup.find("h1", {"class": "p-title-value"}) - topic = li.text - - thread: ResultSet[Tag] = soup.find("div", {"class": "block-body js-replyNewMessageContainer"}).find_all("article", {"data-author": True}) - - for ipost in thread: - username = ipost.find("h4", {"class": "message-name"}).text - user.append(cleanString(username.strip())) - - date_posted = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime") - datetime_obj = datetime.strptime(date_posted, "%Y-%m-%dT%H:%M:%S%z") - addDate.append(datetime_obj) - - feedback.append("-1") - - user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text - status.append(cleanString(user_status.strip())) - - user_lvl = ipost.find("div", {"class": "afAwardLevel"}) - if user_lvl is not None: - user_lvl = user_lvl.text - reputation.append(cleanString(user_lvl.strip())) - else: - reputation.append('-1') - - sign.append("-1") - - user_post = ipost.find("article", {"class": "message-body js-selectToQuote"}).text - post.append(cleanString(user_post.strip())) - - interest.append("-1") - - bbWrapper = ipost.find('div', {"class": "bbWrapper"}) - if bbWrapper is not None: - img = bbWrapper.find('img') - if img is not None: - img = img.get('src').split('base64,')[-1] - else: - img = "-1" - else: - img = "-1" - image_post.append(img) - - avatar = ipost.find("a", {"class": "avatar avatar--m"}) - if avatar is not None: - img = avatar.find('img') - if img is not None: - img = img.get('src').split('base64,')[-1] - else: - img = "-1" - else: - img = "-1" - image_user.append(img) - - # Populate the final variable (this should be a list with all fields scraped) - - row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) - - # Sending the results - - return row - -# This is the method to parse the Listing Pages (one page with many posts) - -def procrax_listing_parser(soup: Tag): - - nm = 0 # this variable should receive the number of topics - forum: str = "Procrax" # 0 *forum name - board = "-1" # 1 board name (the previous 
level of the topic in the Forum categorization tree. - # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) - - author = [] # 2 all authors of each topic - topic = [] # 3 all topics - views = [] # 4 number of views of each topic - posts = [] # 5 number of posts of each topic - href = [] # 6this variable should receive all cleaned urls (we will use this to do the marge between - # Listing and Description pages) - addDate = [] # 7 when the topic was created (difficult to find) - image_author = [] # 8 all author avatars used in each topic - - # Finding the board (should be just one) - li = soup.find("h1", {"class": "p-title-value"}) - board = cleanString(li.text.strip()) - - threads_list = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True}) - - sticky = soup.find("div", {"class": "structItemContainer-group structItemContainer-group--sticky"}) - if sticky is not None: - threads_list = sticky.find_all("div", {"data-author": True}) + threads_list - - nm = len(threads_list) - - for thread in threads_list: - thread_title = thread.find("div", {"class": "structItem-title"}).text - topic.append(cleanString(thread_title.strip())) - - author_icon = thread.find('a', {"class": "avatar avatar--s"}) - if author_icon != None: - author_icon = author_icon.find('img') - if author_icon != None: - author_icon = author_icon.get('src') - author_icon = author_icon.split('base64,')[-1] - else: - author_icon = "-1" - else: - author_icon = "-1" - image_author.append(author_icon) - - thread_author = thread.get("data-author") - author.append(cleanString(thread_author)) - - thread_views = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find('dd').text - thread_views = thread_views.lower().replace("k", "000") - thread_views = thread_views.lower().replace("m", "000000") - views.append(thread_views.strip()) - - thread_replies = thread.find("dl", {"class": "pairs pairs--justified"}).find('dd').text - # All threads contain one topic post and reply posts - thread_total_posts = thread_replies.lower().replace("k", "000") - posts.append(thread_total_posts.strip()) - - thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime") - datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z") - addDate.append(datetime_obj) - - thread_link: str = thread.find("div", {"class": "structItem-title"}).find('a', {'class': ''}).get('href') - href.append(thread_link) - - - return organizeTopics( - forum=forum, - nm=nm, - board=board, - author=author, - topic=topic, - views=views, - posts=posts, - addDate=addDate, - href=href, - image_author=image_author - ) - - -def procrax_links_parser(soup): - - # Returning all links that should be visited by the Crawler - - href = [] - - listing = soup.find_all('div', {"class": "structItem-title"}) - - for a in listing: - link = a.find('a', {'class': ''}).get('href') - - href.append(link) - - return href \ No newline at end of file diff --git a/MarketPlaces/AnonMarket/crawler_selenium.py b/MarketPlaces/AnonMarket/crawler_selenium.py deleted file mode 100644 index eab9ea0..0000000 --- a/MarketPlaces/AnonMarket/crawler_selenium.py +++ /dev/null @@ -1,293 +0,0 @@ -__author__ = 'Helium' - -''' -Anon Market Crawler (Selenium) -''' - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.firefox_binary import 
FirefoxBinary -from selenium.webdriver.firefox.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.common.by import By - -from PIL import Image -import urllib.parse as urlparse -import os, re, time -from datetime import date -import subprocess -import configparser -from bs4 import BeautifulSoup -from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.AnonMarket.parser import AnonMarket_links_parser -from MarketPlaces.Utilities.utilities import cleanHTML - -counter = 1 -baseURL = 'http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion' - - -# Opens Tor Browser, crawls the website, then parses, then closes tor -#acts like the main method for the crawler, another function at the end of this code calls this function later -def startCrawling(): - mktName = getMKTName() - driver = getAccess() - - if driver != 'down': - try: - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closeDriver(driver) - - new_parse(mktName, baseURL, True) - - -# Returns the name of the website -#return: name of site in string type -def getMKTName(): - name = 'AnonMarket' - return name - - -# Return the base link of the website -#return: url of base site in string type -def getFixedURL(): - url = 'http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion' - return url - - -# Closes Tor Browser -#@param: current selenium driver -def closeDriver(driver): - # global pid - # os.system("taskkill /pid " + str(pro.pid)) - # os.system("taskkill /t /f /im tor.exe") - print('Closing Tor...') - driver.close() - time.sleep(3) - return - - -# Creates FireFox 'driver' and configure its 'Profile' -# to use Tor proxy and socket -def createFFDriver(): - from MarketPlaces.Initialization.markets_mining import config - - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - - ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 3) - ff_prof.set_preference("browser.download.folderList", 2) - ff_prof.set_preference("browser.download.manager.showWhenStarting", False) - ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") - ff_prof.set_preference('network.proxy.type', 1) - ff_prof.set_preference("network.proxy.socks_version", 5) - ff_prof.set_preference('network.proxy.socks', '127.0.0.1') - ff_prof.set_preference('network.proxy.socks_port', 9150) - ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", False) - ff_prof.update_preferences() - - service = Service(config.get('TOR', 'geckodriver_path')) - - driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - - driver.maximize_window() - - return driver - - -#the driver 'gets' the url, attempting to get on the site, if it 
can't access return 'down' -#return: return the selenium driver or string 'down' -def getAccess(): - url = getFixedURL() - driver = createFFDriver() - try: - driver.get(url) - return driver - except: - driver.close() - return 'down' - -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - open(filePath, 'wb').write(cleanPage.encode('utf-8')) - return - - -# Gets the full path of the page to be saved along with its appropriate file name -#@param: raw url as crawler crawls through every site -def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") - fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') - return fullPath - - -# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned -#@param: raw url as crawler crawls through every site -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if (name == ''): - name = str(counter) - counter = counter + 1 - return name - - -# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list -#in this example, there are a couple of categories some threads fall under such as -# Guides and Tutorials, Digital Products, and Software and Malware -#as you can see they are categories of products -def getInterestedLinks(): - links = [] - - # Malware - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/malware') - # Bootkits - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/bootkits') - # Backdoors - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/backdoors') - # Keyloggers - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/keyloggers') - # Wireless Trackers - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wireless_trackers') - # Screen Scrapers - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/screen_scrapers') - # Mobile Forensic Tools - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/mobile_forensics_tools') - # Wifi Jammers - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wifi_jammers') - # Carding - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/carding') - # Worms - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/worms') - # Viruses - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/viruses') - # Trojans - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/trojans') - # Botnets - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/botnets') - # Security Technology - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security_technology') - # Hacks - 
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/hacks') - # Exploit kits - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/exploit_kit') - # Security - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security') - # Ransomware - links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/ransomware') - - return links - - -# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through -#topic and description pages are crawled through here, where both types of pages are saved -#@param: selenium driver -def crawlForum(driver): - print("Crawling the Anon Market") - - linksToCrawl = getInterestedLinks() - - for link in linksToCrawl: - print('Crawling :', link) - - try: - has_next_page = True - count = 0 - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - - html = driver.page_source - savePage(driver, html, link) - - # Get all product links on the current page - products_list = productPages(html) - for item in products_list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver, driver.page_source, item) - driver.back() # Go back to listing after visiting each product - - # # comment out - # break - # - # # comment out - # if count == 1: - # break - - # Locate the next page link - try: - # Find the active page number - active_page_element = driver.find_element(By.XPATH, '//div[@class="page activepage"]') - # current_page = int(active_page_element.text) - - next_page_element = active_page_element.find_element(By.XPATH, 'following-sibling::a[1]') - link = next_page_element.get_attribute('href') - if link == "": - raise NoSuchElementException - count += 1 - - except NoSuchElementException: - has_next_page = False - - except Exception as e: - print(link, e) - - print("Crawling the Anon Market done.") - -# Returns 'True' if the link is a description link -#@param: url of any url crawled -#return: true if is a description page, false if not -def isDescriptionLink(url): - if 'product' in url: - return True - return False - - -# Returns True if the link is a listingPage link -#@param: url of any url crawled -#return: true if is a Listing page, false if not -def isListingLink(url): - if 'category' in url: - return True - return False - - -# calling the parser to define the links, the html is the url of a link from the list of interested link list -#@param: link from interested link list ie. getInterestingLinks() -#return: list of description links that should be crawled through -def productPages(html): - soup = BeautifulSoup(html, "html.parser") - return AnonMarket_links_parser(soup) - -def crawler(): - startCrawling() - # print("Crawling and Parsing Nexus .... 
DONE!") - diff --git a/MarketPlaces/AnonMarket/parser.py b/MarketPlaces/AnonMarket/parser.py deleted file mode 100644 index 997d43e..0000000 --- a/MarketPlaces/AnonMarket/parser.py +++ /dev/null @@ -1,195 +0,0 @@ -__author__ = 'DarkWeb' - -# Here, we are importing the auxiliary functions to clean or convert data -from MarketPlaces.Utilities.utilities import * - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup - -import re - -#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs -#stores info it needs in different lists, these lists are returned after being organized -#@param: soup object looking at html page of description page -#return: 'row' that contains a variety of lists that each hold info on the description page -def AnonMarket_description_parser(soup): - - # Fields to be parsed - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo - image = "-1" # 19 Product_Image - vendor_image = "-1" # 20 Vendor_Image - - name_of_product = soup.find("div", {"class": "heading"}).text - name = cleanString(name_of_product.strip()) - - description_div = soup.find("div", {"class": "tab1"}) - if description_div is None: - describe = "-1" - else: - describe = cleanString(description_div.text.strip()) - - info_div = soup.find('div', {'class': 'information'}) - table = info_div.find('table') if info_div else None - - # Find all table rows - rows = table.find_all('tr') - - # Parse each row to get relevant data - data = {} - for row in rows: - columns = row.find_all('td') - if len(columns) == 3: - key = columns[0].text.strip() - value = columns[2].text.strip() - data[key] = value - - # Extract specific data from the dictionary and assign them to individual variables - vendor = data.get('Vendor', '-1') - shipFrom = data.get('Location', '-1') - shipTo = data.get('Ships to', '-1') - category = data.get('Category', '-1') - USD = data.get('Price', '-1').split()[0] - left = data.get('Stock', '-1') - - # image - image = soup.find('img', {"class": "bigthumbnail"}) - image = image.get('src').split('base64,')[-1] - - # Populating the final variable (this should be a list with all fields scraped) - row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) - - # Sending the results - return row - - -#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs -#stores info it needs in different lists, these lists are returned after being organized -#@param: soup object looking at html page of listing page -#return: 'row' that contains a variety of lists 
that each hold info on the listing page -def AnonMarket_listing_parser(soup): - - # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "AnonMarket" # 0 *Marketplace_Name - vendor = [] # 1 *Vendor y - rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this - MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this - category = [] # 7 Product_Category y - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice y - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft = [] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - image = [] # 20 Product_Image - image_vendor = [] # 21 Vendor_Image - href = [] # 22 Product_Links - base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion" - - cat = soup.find("div", {'class': 'heading'}).text - - products_list = soup.find_all('div', {'class': 'item'}) - nm = 0 - for product in products_list: - name_of_product = product.find("div", {"class": "title"}).text.strip() - name.append(name_of_product) - - name_of_vendor = product.find("a", {'class': 'seller'}).text.strip() - vendor.append(name_of_vendor) - - category.append(cat) - - tbody = product.find('div', {"class": "info"}).find('tbody') - - # rating_item - width = tbody.find('div', {"class": "stars2"}).get('style') - rating_item.append(cleanNumbers(width.strip())) - - tr = tbody.findAll('tr', recursive=False) - td = tr[2].findAll('td') - - # sold - sold.append(td[0].text.strip()) - - # reviews - reviews.append(td[1].text.strip()) - - product_link_element = product.find("div", {"class": "title"}).find_parent('a') - link = product_link_element['href'] - full_link = base_url + link - href.append(full_link) - - # Append '-1' for unavailable data - rating_vendor.append("-1") - success.append("-1") - CVE.append("-1") - MS.append("-1") - describe.append("-1") - views.append("-1") - addDate.append("-1") - BTC.append("-1") - USD.append("-1") - EURO.append("-1") - qLeft.append("-1") - shipFrom.append("-1") - shipTo.append("-1") - - nm += 1 - - # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) - - -#called by the crawler to get description links on a listing page -#@param: beautifulsoup object that is using the correct html page (listing page) -#return: list of description links from a listing page -def AnonMarket_links_parser(soup): - # Base URL to prepend to each product link - base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion" - - # Returning all links that should be visited by the Crawler - href = [] - - # Using a shorter, but still unique, class name - listing = soup.find('div', {'class': 'items'}).find_all('a', href=True, attrs={'href': lambda x: "/product/" in x}) - - for a in listing: - link = a.get('href') - if link: # Checks if 'href' attribute is not None - # Prepending the base URL 
to the scraped link - full_link = base_url + link - href.append(full_link) - - # Filtering out any links that might not have '/product/' in them - product_links = [link for link in href if '/product/' in link] - - return product_links diff --git a/MarketPlaces/Apocalypse/parser.py b/MarketPlaces/Apocalypse/parser.py deleted file mode 100644 index 6610cc6..0000000 --- a/MarketPlaces/Apocalypse/parser.py +++ /dev/null @@ -1,226 +0,0 @@ -__author__ = 'DarkWeb' - -# Here, we are importing the auxiliary functions to clean or convert data -from MarketPlaces.Utilities.utilities import * - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup, ResultSet, Tag - - -def apocalypse_description_parser(soup: Tag): - - # Fields to be parsed - - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo - image = "-1" # 19 Product_Image - vendor_image = "-1" # 20 Vendor_Image - - content: Tag = soup.find("div", {'id': "article_page"}) - - product_name = content.find("p", {"class": "list-group-item text-center mb-0 box"}).text - name = cleanString(product_name.strip()) - - product_description = content.find("pre").text - describe = cleanString(product_description.strip()) - - # Finding Product Image - image = soup.find('div', {'class': 'col-md-7 text-center'}).find('img') - image = image.get('src').split('base64,')[-1] - - product_reviews_list: Tag = content.find("table", {"class": "table product_reviews"}) \ - .find_all("li") - - review = str(len(product_reviews_list)) - - product_category = content.find("a", {"class": "badge badge-danger"}).text - category = cleanString(product_category.strip()) - - product_ships_from = content.find("span", {"class": "badge badge-info"}).text - shipFrom = cleanString(product_ships_from.strip()) - - product_success_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-success"}) - product_ships_to = product_success_badge[1].text - shipTo = cleanString(product_ships_to.strip()) - - product_supply = content.find("span", {"class": "badge badge-warning"}).text - left = cleanString(product_supply.strip()) - - product_primary_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-primary"}) - - # Product vendor comes in the form of "@ vendor_name" - product_vendor = product_primary_badge[0].text.replace("@", "") - - vendor = cleanString(product_vendor.strip()) - sold = cleanString(product_primary_badge[1].text.strip()) - - product_prices: Tag = content.find("p", {"style": "border-bottom:1px solid GREY;"}) - USD = product_prices.find("span", {"class": "pr"}).text - prices_array: ResultSet[Tag] = product_prices.find_all("span", {"class": "pr1"}) - BTC = prices_array[1].text - - # Populating the final variable (this should be 
a list with all fields scraped) - row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) - - # Sending the results - return row - - -def apocalypse_listing_parser(soup: Tag): - - # Fields to be parsed - nm = 0 # Total_Products (Should be Integer) - mktName = "Apocalypse" # 0 Marketplace_Name - name = [] # 1 Product_Name - CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 3 Product_MS_Classification (Microsoft Security) - category = [] # 4 Product_Category - describe = [] # 5 Product_Description - escrow = [] # 6 Vendor_Warranty - views = [] # 7 Product_Number_Of_Views - reviews = [] # 8 Product_Number_Of_Reviews - addDate = [] # 9 Product_AddDate - lastSeen = [] # 10 Product_LastViewDate - BTC = [] # 11 Product_BTC_SellingPrice - USD = [] # 12 Product_USD_SellingPrice - EURO = [] # 13 Product_EURO_SellingPrice - sold = [] # 14 Product_QuantitySold - qLeft =[] # 15 Product_QuantityLeft - shipFrom = [] # 16 Product_ShippedFrom - shipTo = [] # 17 Product_ShippedTo - vendor = [] # 18 Vendor - rating = [] # 19 Vendor_Rating - success = [] # 20 Vendor_Successful_Transactions - image = [] # 20 Product_Image - image_vendor = [] # 21 Vendor_Image - href = [] # 22 Product_Links - - table = soup.find("div", {"class": "col-lg-9 my-4"}) - if table is None: - table = soup.find("div", {"class": "col-lg-9"}) - listings: ResultSet[Tag] = table.find_all("div", {"class": "col-lg-4 col-md-6 mb-1"}) - - for prod in listings: - - product_name = prod.find('h5', {"class": "art_title"}).text - name.append(cleanString(product_name.strip())) - - # Finding Product Image - product_image = prod.find('img', {'class': 'customHeight'}) - product_image = product_image.get('src').split('base64,')[-1] - image.append(product_image) - - CVE.append("-1") - MS.append("-1") - describe.append("-1") - escrow.append("-1") - reviews.append("-1") - addDate.append("-1") - lastSeen.append("-1") - BTC.append("-1") - EURO.append("-1") - shipTo.append("-1") - success.append("-1") - image_vendor.append("-1") - - product_price = prod.find("span", {"class": "priceP"}).text - USD.append(cleanString(product_price.strip())) - - product_sold = prod.find("span", {"class": "badge badge-success"}).text - sold.append(cleanString(product_sold.strip())) - - product_statistics: ResultSet[Tag] = prod.find_all("p", {"class": "mb-0 card-text"}) - - product_category = product_statistics[0].find("a").text - category.append(cleanString(product_category.strip())) - - product_sold = product_statistics[1].find("span").text - sold.append(cleanString(product_sold.strip())) - - product_quantity_left = product_statistics[2].find("span", {"class": "badge bluebadge"}).text - qLeft.append(cleanString(product_quantity_left.strip())) - - product_views = product_statistics[3].find("span").text - views.append(cleanString(product_views.strip())) - - product_ships_from = product_statistics[4].find("span").text - shipFrom.append(cleanString(product_ships_from.strip())) - - product_vendor_tag: Tag = product_statistics[5].find("a").find("span", {"class": "badge badge-primary"}) - # Product vendors & ratings are displayed as "vender_name ★ 5.0" - # When split by the star (★), it should return a 2-value array - product_vendor, product_vendor_rating = product_vendor_tag.text.split("★") - - try: - vendor.append(cleanString(product_vendor.strip())) - rating.append(cleanString(product_vendor_rating.strip())) - except 
Exception as e: - raise e - - product_href = prod.find('a').get('href') - href.append(product_href) - - nm += 1 - - return organizeProducts( - marketplace=mktName, - nm=nm, - vendor=vendor, - rating_vendor=rating, - success_vendor=success, - nombre=name, - CVE=CVE, - MS=MS, - category=category, - describe=describe, - views=views, - reviews=reviews, - rating_item=["-1" for _ in range(nm)], - addDate=addDate, - BTC=BTC, - USD=USD, - EURO=EURO, - sold=sold, - qLeft=qLeft, - shipFrom=shipFrom, - shipTo=shipTo, - href=href, - image=image, - image_vendor=image_vendor - ) - - -#called by the crawler to get description links on a listing page -#@param: beautifulsoup object that is using the correct html page (listing page) -#return: list of description links from a listing page -def apocalypse_links_parser(soup): - - # Returning all links that should be visited by the Crawler - - href = [] - listing = soup.findAll('div', {"class": "col-lg-4 col-md-6 mb-1"}) - - for a in listing: - bae = a.find('a', href=True) - link = bae['href'] - href.append(link) - - return href \ No newline at end of file diff --git a/MarketPlaces/LionMarketplace/crawler_selenium.py b/MarketPlaces/Ares/crawler_selenium.py similarity index 59% rename from MarketPlaces/LionMarketplace/crawler_selenium.py rename to MarketPlaces/Ares/crawler_selenium.py index e20f630..fbed2b1 100644 --- a/MarketPlaces/LionMarketplace/crawler_selenium.py +++ b/MarketPlaces/Ares/crawler_selenium.py @@ -1,7 +1,7 @@ -__author__ = 'Helium' +__author__ = 'DarkWeb' ''' -LionMarketplace Marketplace Crawler (Selenium) +Ares Market Crawler (Selenium) ''' from selenium import webdriver @@ -9,64 +9,107 @@ from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By - +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait from PIL import Image + import urllib.parse as urlparse -import os, re, time +import os, time from datetime import date import subprocess -import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.LionMarketplace.parser import lionmarketplace_links_parser +from MarketPlaces.Ares.parser import ares_links_parser from MarketPlaces.Utilities.utilities import cleanHTML counter = 1 -baseURL = 'http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/' +baseURL = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion' -# Opens Tor Browser, crawls the website, then parses, then closes tor -#acts like the main method for the crawler, another function at the end of this code calls this function later +# Opens Tor Browser, crawls the website def startCrawling(): - mktName = getMKTName() + marketName = getMarketName() driver = getAccess() if driver != 'down': try: - # login(driver) + login(driver) crawlForum(driver) except Exception as e: print(driver.current_url, e) closeDriver(driver) - new_parse(mktName, baseURL, True) + new_parse(marketName, False) + + +# Login using premade account credentials and do login captcha manually +def login(driver): + #wait for login page + WebDriverWait(driver, 
100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[3]/div[3]/div[2]/div/div[2]/div/center"))) + + #entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') + #Username here + usernameBox.send_keys('blabri') + passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') + #Password here + passwordBox.send_keys('fishowal') + + ''' + # wait for captcha page show up + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/div/div/img"))) + + # save captcha to local + driver.find_element(by=By.XPATH, value='/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/div/div/img').screenshot( + r'..\Ares\captcha.png') + + # This method will show image in any image viewer + im = Image.open(r'..\Ares\captcha.png') + + im.show() + + # wait until input space show up + inputBox = driver.find_element(by=By.XPATH, value='/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/input') + + # ask user input captcha solution in terminal + userIn = input("Enter solution: ") + + # send user solution into the input space + inputBox.send_keys(userIn) + + # click the verify(submit) button + driver.find_element(by=By.XPATH, value="/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[4]/div/div/button").click() + ''' + input("Press ENTER when CAPTCHA is completed\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 50).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/div[7]/div[3]/div[2]/div[1]/div[1]'))) # Returns the name of the website -#return: name of site in string type -def getMKTName(): - name = 'LionMarketplace' +def getMarketName(): + name = 'Ares' return name -# Return the base link of the website -#return: url of base site in string type +# Return the link of the website def getFixedURL(): - url = 'http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/' + url = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion' + return url # Closes Tor Browser -#@param: current selenium driver def closeDriver(driver): # global pid # os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /t /f /im tor.exe") print('Closing Tor...') - driver.close() + driver.quit() time.sleep(3) return @@ -103,14 +146,12 @@ def createFFDriver(): service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - + driver.maximize_window() return driver -#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' -#return: return the selenium driver or string 'down' def getAccess(): url = getFixedURL() driver = createFFDriver() @@ -122,30 +163,7 @@ def getAccess(): return 'down' -# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha -# then allows for manual solving of captcha in the terminal -#@param: current selenium web driver -def login(driver): - # wait for page to show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '//*[@id="username"]'))) - - # entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') - # Username here - 
usernameBox.send_keys('blabri') - passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') - # Password here - passwordBox.send_keys('fishowal') - - input("Press ENTER when CAPTCHA is completed\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/nav/div/div/ul[2]/form/button"))) - - -# Saves the crawled html page, makes the directory path for html pages if not made +# Saves the crawled html page def savePage(driver, page, url): cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) @@ -155,7 +173,6 @@ def savePage(driver, page, url): # Gets the full path of the page to be saved along with its appropriate file name -#@param: raw url as crawler crawls through every site def getFullPathName(url): from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE @@ -168,37 +185,47 @@ def getFullPathName(url): return fullPath -# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned -#@param: raw url as crawler crawls through every site +# Creates the file name from passed URL def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) - if (name == ''): + if name == '': name = str(counter) counter = counter + 1 return name -# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list -#in this example, there are a couple of categories some threads fall under such as -# Guides and Tutorials, Digital Products, and Software and Malware -#as you can see they are categories of products def getInterestedLinks(): links = [] - # Hacking - links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/ba142ac0-c7e7-11ec-9bd1-fdd89c3d3f91') - # Digital - links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/12') + # # Digital - Other + # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/91ecd5d0-002c-11ec-9b46-ede2378c5d3c') + # # Digital - VPN + # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9431b830-002b-11ec-86d6-cdaf65cd97f1') + # # Digital - Coding + # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/948b7400-a939-11ec-adc5-2f775203130c') + # Digital - Malware + links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/95c37970-002c-11ec-a5dc-1f4432087ed2') + # # Digital - Guides + # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9a8bea70-002b-11ec-a3db-c90dd329f662') + # # Digital - Hacking + # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/a81693f0-002b-11ec-9c39-110550ce4921') + # # Digital - Malware + # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/b3258c50-002b-11ec-b658-876d3d651145') + # # Digital - Services + # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/bae64840-002b-11ec-bbcc-a93431540099') + # # Digital - Software + # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/cff75df0-002b-11ec-8d0a-81fddeb36bf1') + # # Digital - Exploits + # 
links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/ef029550-002f-11ec-8711-675a8b116ba6') + # # Digital - Tutorials + # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/f6e9c3b0-002b-11ec-85aa-c79a6ac8cfe8') return links - -# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through -#topic and description pages are crawled through here, where both types of pages are saved -#@param: selenium driver def crawlForum(driver): - print("Crawling the LionMarketplace market") + print("Crawling the Ares market") linksToCrawl = getInterestedLinks() @@ -228,16 +255,19 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # # comment out - # break - # - # # comment out - # if count == 1: - # break + # comment out + break + + # comment out + if count == 1: + break try: - nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/div[2]/nav') - link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href') + nav = driver.find_element(by=By.XPATH, value= + '/html/body/div[7]/div[3]/div/div[2]/nav') + a = nav.find_element(by=By.LINK_TEXT, value="Next") + link = a.get_attribute('href') + if link == "": raise NoSuchElementException count += 1 @@ -249,12 +279,10 @@ print(link, e) i += 1 - print("Crawling the LionMarketplace market done.") + input("Crawling the Ares market done successfully. Press ENTER to continue\n") -# Returns 'True' if the link is a description link -#@param: url of any url crawled -#return: true if is a description page, false if not +# Returns 'True' if the link is Topic link def isDescriptionLink(url): if 'product' in url: return True @@ -262,29 +290,16 @@ def isDescriptionLink(url): # Returns True if the link is a listingPage link -#@param: url of any url crawled -#return: true if is a Listing page, false if not def isListingLink(url): if 'category' in url: return True return False -# calling the parser to define the links, the html is the url of a link from the list of interested link list -#@param: link from interested link list ie. 
getInterestingLinks() -#return: list of description links that should be crawled through +# calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") - return lionmarketplace_links_parser(soup) - - -# Drop links that "signout" -# def isSignOut(url): -# #absURL = urlparse.urljoin(url.base_url, url.url) -# if 'signout' in url.lower() or 'logout' in url.lower(): -# return True -# -# return False + return ares_links_parser(soup) def crawler(): diff --git a/MarketPlaces/Ares/parser.py b/MarketPlaces/Ares/parser.py new file mode 100644 index 0000000..3232b0c --- /dev/null +++ b/MarketPlaces/Ares/parser.py @@ -0,0 +1,227 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +from MarketPlaces.Utilities.utilities import * + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + +# Here, we are importing re for the CVE/MS regex searches used below +import re + + +# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) +def ares_description_parser(soup): + + # Fields to be parsed + + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + + # Finding Product Name + name = soup.find('div', {'class': "col-md-12 my-2"}).text + name = name.replace('\n', ' ') + name = name.replace(",", "") + name = name.strip() + + bae = soup.find('div', {'class': "col-md-7"}).find('span').find_all('span') + + # Finding Vendor + vendor = bae[0].text + vendor = vendor.replace(",", "") + vendor = vendor.replace("...", "") + vendor = vendor.strip() + + # Finding Vendor Rating + full_stars = bae[2].find_all('i', {'class': "fas fa-star"}) + half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"}) + rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0) + + # Finding Successful Transactions + success = bae[4].text + success = success.replace("Sales ", "") + success = success.strip() + + bae = soup.find('span', {'class': "text-left"}).find_all('span') + + # Finding Prices + USD = bae[0].text + USD = USD.replace("\n$", "") + USD = USD.strip() + + shipping_info = bae[4].text + if "Digital" not in shipping_info: + shipping_info = shipping_info.split(" ") + + # Finding Shipment Information (Origin) + shipFrom = shipping_info[0].strip() + + # Finding Shipment Information (Destination) + shipTo = shipping_info[1].strip() + + bae = soup.find_all('textarea') + + # Finding the Product description + describe = bae[0].text + describe = describe.replace("\n", " ") + describe = describe.replace("\r", " ") + describe = describe.strip() + + # Finding the Terms and Conditions + terms = bae[1].text + terms = terms.replace("\n", " ") + terms = terms.strip() + + ''' + # Finding the Number of Product Reviews + tag = 
soup.findAll(text=re.compile('Reviews')) + for index in tag: + reviews = index + par = reviews.find('(') + if par >=0: + reviews = reviews.replace("Reviews (","") + reviews = reviews.replace(")","") + reviews = reviews.split(",") + review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) + else : + review = "-1" + ''' + + # Searching for CVE and MS categories + cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) + if cve: + CVE = " " + for idx in cve: + CVE += (idx) + CVE += " " + CVE = CVE.replace(',', ' ') + CVE = CVE.replace('\n', '') + ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}')) + if ms: + MS = " " + for im in ms: + MS += (im) + MS += " " + MS = MS.replace(',', ' ') + MS = MS.replace('\n', '') + + # Populating the final variable (this should be a list with all fields scraped) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo) + + # Sending the results + return row + + +# This is the method to parse the Listing Pages +def ares_listing_parser(soup): + + # Fields to be parsed + nm = 0 # *Total_Products (Should be Integer) + mktName = "Ares" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = [] # 6 Product_MS_Classification (Microsoft Security) + category = [] # 7 Product_Category + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + href = [] # 20 Product_Links + + listing = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"}) + + # Populating the Number of Products + nm = len(listing) + + for a in listing: + bae = a.findAll('a', href=True) + + # Adding the url to the list of urls + link = bae[0].get('href') + link = cleanLink(link) + href.append(link) + + # Finding the Vendor + vendor_name = bae[1].text + vendor_name = vendor_name.replace(",", "") + vendor_name = vendor_name.strip() + vendor.append(vendor_name) + + # Finding the Product + product = bae[2].find('img').get('alt') + product = product.replace('\n', ' ') + product = product.replace(",", "") + product = product.strip() + name.append(product) + + # Searching for CVE and MS categories + cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) + if not cve: + cveValue="-1" + else: + cee = " " + for idx in cve: + cee += (idx) + cee += " " + cee = cee.replace(',', ' ') + cee = cee.replace('\n', '') + cveValue=cee + CVE.append(cveValue) + + ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) + if not ms: + MSValue="-1" + else: + me = " " + for im in ms: + me += (im) + me += " " + me = me.replace(',', ' ') + me = me.replace('\n', '') + MSValue=me + MS.append(MSValue) + + # Populate the final variable (this should be a list with all fields scraped) + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + + +def ares_links_parser(soup): + + # 
Returning all links that should be visited by the Crawler + href = [] + + listing = soup.findAll('a', {"class": "btn btn-success w-100 my-1"}) + + for a in listing: + + link = a['href'] + href.append(link) + + return href \ No newline at end of file diff --git a/MarketPlaces/ThiefWorld/crawler_selenium.py b/MarketPlaces/Bohemia/crawler_selenium.py similarity index 51% rename from MarketPlaces/ThiefWorld/crawler_selenium.py rename to MarketPlaces/Bohemia/crawler_selenium.py index 95db8ff..c923f60 100644 --- a/MarketPlaces/ThiefWorld/crawler_selenium.py +++ b/MarketPlaces/Bohemia/crawler_selenium.py @@ -1,7 +1,7 @@ -__author__ = 'Helium' +__author__ = 'DarkWeb' ''' -ThiefWorld Market Crawler (Selenium) +Bohemia Market Crawler (Selenium) ''' from selenium import webdriver @@ -10,6 +10,7 @@ from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import Service from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support.ui import Select from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By @@ -18,16 +19,13 @@ import urllib.parse as urlparse import os, re, time from datetime import date import subprocess -import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse - -from MarketPlaces.ThiefWorld.parser import thiefworld_links_parser +from MarketPlaces.Bohemia.parser import bohemia_links_parser from MarketPlaces.Utilities.utilities import cleanHTML counter = 1 -baseURL = 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/' - +baseURL = 'http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/' # Opens Tor Browser, crawls the website, then parses, then closes tor #acts like the main method for the crawler, another function at the end of this code calls this function later @@ -37,26 +35,81 @@ def startCrawling(): if driver != 'down': try: + captcha(driver) login(driver) crawlForum(driver) except Exception as e: print(driver.current_url, e) closeDriver(driver) - new_parse(mktName, baseURL, True) + new_parse(mktName, False) + + +def login(driver): + #wait for login page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div/div[4]/div/div/form/input[1]"))) + + #click on login page confirmation + driver.find_element(by=By.XPATH, value="/html/body/div/div[4]/div/div/form/input[1]").click() + + #wait until next page shows up + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[1]/input"))) + + #entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[1]/input') + #username here + usernameBox.send_keys('ct-1234') + passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[2]/input') + #password here + passwordBox.send_keys('DementedBed123-') + #session time + session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[3]/select')) + session_select.select_by_visible_text('300 Minutes') + + ''' + #wait for captcha page to show up + inputBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[4]/div/input') + + #save captcha to local + 
driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Bohemia\captcha2.png') + im = Image.open(r'..\Bohemia\captcha2.png') + im.show() + + #ask user input captcha solution in terminal + userIn = input("Enter Solution: ") + + #send user solution into input field + inputBox.send_keys(userIn) + + #click the submit button + driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[5]/button').click() + ''' + input("Press ENTER when CAPTCHA is completed\n") + + #wait for listing page to show up + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[2]/div[2]/div[1]/div"))) # Returns the name of the website #return: name of site in string type def getMKTName(): - name = 'ThiefWorld' + name = 'Bohemia' return name +# Returns credentials needed for the mkt +def getCredentials(): + credentials = 'blank blank blank blank cap 0' + return credentials + + # Return the base link of the website #return: url of base site in string type def getFixedURL(): - url = 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/' + url = 'http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/' return url @@ -80,13 +133,13 @@ def createFFDriver(): ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + # ff_prof.set_preference("places.history.enabled", False) + # ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) + # ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) + # ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) + # ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) + # ff_prof.set_preference("signon.rememberSignons", False) + # ff_prof.set_preference("network.cookie.lifetimePolicy", 2) # ff_prof.set_preference("network.dns.disablePrefetch", True) # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) @@ -98,18 +151,17 @@ def createFFDriver(): ff_prof.set_preference('network.proxy.socks', '127.0.0.1') ff_prof.set_preference('network.proxy.socks_port', 9150) ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", False) + ff_prof.set_preference("javascript.enabled", True) ff_prof.update_preferences() service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - + driver.maximize_window() return driver - #the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' #return: return the selenium driver or string 'down' def getAccess(): @@ -126,18 +178,77 @@ def getAccess(): # Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha # then allows for manual solving of captcha in the terminal #@param: current selenium web driver -def login(driver): - # wait for page to show up (This 
Xpath may need to change based on different seed url) +def captcha(driver): + ''' + # wait for captcha page show up (for bohemia it takes A WHILE) + print("Connecting Bohemia...") + time.sleep(7.5) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located((By.XPATH, "/html/body/div/div/form/div"))) + input('Bohemia Connected. Press ENTER to continue\n') + + # save captcha to local + driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div/div").screenshot(r'..\Bohemia\captcha.png') + + # open method used to open different extension image file + im = Image.open(r'..\Bohemia\captcha.png') + + # This method will show image in any image viewer + im.show() + + # Prints link to console since captcha requires the link + print(getFixedURL()) + + # wait until input space show up + inputBox = driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div/div/input") + + # ask user input captha solution in terminal + userIn = input("Enter solution: ") + + # send user solution into the input space + inputBox.send_keys(userIn) + + # click the verify(submit) button + driver.find_element(by=By.XPATH, value='/html/body/div/div/form/button[1]').click() + + # im.close() + ''' + input("Press ENTER when CAPTCHA is completed\n") + + # wait for next captcha to show up WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/header/div[2]/div/nav/div[2]/a[1]"))) + (By.XPATH, "/html/body/div/div/form"))) + + ''' + for square in range(1,7): + + inputBox = driver.find_element(by=By.XPATH, value=f"/html/body/div/div/form/div[1]/input[{square}]") + inputBox.click() + time.sleep(.5) + # userIn = input("Enter Solution: ") + # inputBox.send_keys(userIn) + + # Takes screenshot every iteration because after input the captcha changes + driver.find_element(by=By.XPATH, value="/html/body/div/div/form").screenshot(r'..\Bohemia\captcha1.png') + + # Opens and crops image + im = Image.open(r'..\Bohemia\captcha1.png') + im = im.crop(((im.width // 2 - 80), (im.height // 2 - 100), (im.width // 2 + 80), (im.height // 2 + 60))) + im.show() + # im.close() - temp = driver.find_element(By.XPATH, '/html/body/div/header/div[2]/div/nav/div[2]/a[1]').get_attribute( - 'href') # /html/body/div/div[2]/div/div[2]/div - link = urlparse.urljoin(baseURL, str(temp)) - driver.get(link) # open - # wait for listing page show up (This Xpath may need to change based on different seed url) + userIn = input("Enter Solution: ") + inputBox.send_keys(userIn) + + #locate and press submit button + driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click() + # driver.find_element(by=By.XPATH, value='/html/body/div/div/form/button[2]') + ''' + + input("Press ENTER when CAPTCHA is completed\n") + + #wait for next page to show up WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.ID, "side-bar"))) + (By.XPATH, "/html/body/div/div[4]/div/div/form/input[1]"))) # Saves the crawled html page, makes the directory path for html pages if not made def savePage(driver, page, url): @@ -172,7 +283,6 @@ def getNameFromURL(url): counter = counter + 1 return name - # returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list #in this example, there are a couple of categories some threads fall under such as # Guides and Tutorials, Digital Products, and Software and Malware @@ -180,30 +290,31 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # Hacking and DOSS - links.append(['Hacking and DOSS', 
'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/35']) - # Carding Manuals - links.append(['Carding Manuals', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/20']) - # Software - links.append(['Software', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/37']) - # Database - links.append(['Database', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/38']) + # Malware and Botnets + links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?page=1&type=all&catid=95') + # #Exploits + # links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?page=1&type=all&catid=99') + # #Methods + # links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?catid=100') + # #Exploit kits + # links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?catid=101') + # #Hacking Software + # links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?catid=103') - return links + return links # gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through #topic and description pages are crawled through here, where both types of pages are saved #@param: selenium driver def crawlForum(driver): - print("Crawling the ThiefWorld market") + print("Crawling the Bohemia Market") linksToCrawl = getInterestedLinks() i = 0 while i < len(linksToCrawl): - cat = linksToCrawl[i][0] - link = linksToCrawl[i][1] + link = linksToCrawl[i] print('Crawling :', link) try: has_next_page = True @@ -215,7 +326,6 @@ def crawlForum(driver): except: driver.refresh() html = driver.page_source - html += f"{cat}" savePage(driver, html, link) list = productPages(html) @@ -228,17 +338,18 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # # comment out - # break - # - # # comment out - # if count == 1: - # break + # comment out + break + + # comment out + if count == 1: + break try: - nav = driver.find_element(by=By.XPATH, value='/html/body/div/div[1]/div/div/div[2]/div[3]') - right = nav.find_element(by=By.CLASS_NAME, value='pag_right') - link = right.find_element(by=By.TAG_NAME, value='a').get_attribute('href') + nav = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[2]/ul') + a = nav.find_element(by=By.PARTIAL_LINK_TEXT, value="Next") + link = a.get_attribute('href') + if link == "": raise NoSuchElementException count += 1 @@ -250,14 +361,14 @@ def crawlForum(driver): print(link, e) i += 1 - print("Crawling the ThiefWorld market done.") + input("Crawling Bohemia Market done sucessfully. 
Press ENTER to continue\n") # Returns 'True' if the link is a description link #@param: url of any url crawled #return: true if is a description page, false if not def isDescriptionLink(url): - if 'product' in url: + if bool(re.search(r'\blisting\b',url)): # accurate with bohemia return True return False @@ -266,7 +377,7 @@ def isDescriptionLink(url): #@param: url of any url crawled #return: true if is a Listing page, false if not def isListingLink(url): - if 'catalog' in url: + if bool(re.search(r'\blistings\b',url)): # accurate with bohemia return True return False @@ -276,16 +387,16 @@ def isListingLink(url): #return: list of description links that should be crawled through def productPages(html): soup = BeautifulSoup(html, "html.parser") - return thiefworld_links_parser(soup) + return bohemia_links_parser(soup) # Drop links that "signout" -# def isSignOut(url): -# #absURL = urlparse.urljoin(url.base_url, url.url) -# if 'signout' in url.lower() or 'logout' in url.lower(): -# return True -# -# return False +def isSignOut(url): + #absURL = urlparse.urljoin(url.base_url, url.url) + if 'signout' in url.lower() or 'logout' in url.lower(): + return True + + return False def crawler(): diff --git a/MarketPlaces/DarkBazar/parser.py b/MarketPlaces/Bohemia/parser.py similarity index 57% rename from MarketPlaces/DarkBazar/parser.py rename to MarketPlaces/Bohemia/parser.py index 9386d18..7157722 100644 --- a/MarketPlaces/DarkBazar/parser.py +++ b/MarketPlaces/Bohemia/parser.py @@ -1,5 +1,7 @@ __author__ = 'DarkWeb' +import re + # Here, we are importing the auxiliary functions to clean or convert data from MarketPlaces.Utilities.utilities import * @@ -11,7 +13,8 @@ from bs4 import BeautifulSoup # stores info it needs in different lists, these lists are returned after being organized # @param: soup object looking at html page of description page # return: 'row' that contains a variety of lists that each hold info on the description page -def darkbazar_description_parser(soup): +def bohemia_description_parser(soup): + # Fields to be parsed vendor = "-1" # 0 *Vendor_Name @@ -33,75 +36,83 @@ def darkbazar_description_parser(soup): left = "-1" # 16 Product_QuantityLeft shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo - image = "-1" # 19 Product_Image - vendor_image = "-1" # 20 Vendor_Image # Finding Product Name - divmb = soup.findAll('div', {'class': "mb-1"}) - - name = divmb[0].text + name = soup.find('h1', {"style": "margin: 0; margin-bottom: 0.5em;"}).text name = name.replace('\n', ' ') name = name.replace(",", "") name = name.strip() # Finding Vendor - vendor = divmb[1].find('a').text.strip() + vendor = soup.find('div', {"class": "user-photo"}).find_next_sibling('a').text + vendor = vendor.strip() # Finding Vendor Rating - temp = soup.find('div', {'class': ""}).text - temp = temp.split('(') - rating = temp[0].replace("Vendor's Review : ", "") - rating = rating.replace("%", "") - rating_vendor = rating.strip() - - # Finding the Product Rating and Number of Product Reviews - reviews = temp[2].replace(" review)", "") - reviews = reviews.strip() + rating_vendor = soup.find('span', {'class': "user-percent"}).text.strip() - temp = temp[1].split(")") - rating = temp[1].replace("Product Review : ", "") - rating = rating.replace("%", "") - rating_item = rating.strip() + # Finding Users' Successful Transactions + temp = '' + success = soup.find('span', {'class': "smalltext shadow-text"}).text + temp = success.split("|") + success = str(temp[1]) + success = success.strip() # 
Finding Prices - USD = soup.find('div', {'class': "h3 text-primary"}).text.strip() + prices = soup.find('div', {'class': "col-md-3 sidebar-navigation user-details"} + ).find('div', {'class': "container text-left"}) + USD = prices.find('h1').text.strip() + BTC = prices.find('h1').find_next_sibling('h3').text + BTC = BTC.replace("BTC", "") + BTC = BTC.strip() - # Finding the Product Category - pmb = soup.findAll('p', {'class': "mb-1"}) + detail_row = soup.find('div', {'class': "detail-container text-left"}).find_all('strong') - category = pmb[-1].text - category = category.replace("Category: ", "").strip() + # Finding the Product Category (there isnt a thing for it on the page + # category = li[1].find('span', {'class': "tag is-dark"}).text.strip() # Finding the Product Quantity Available - left = divmb[-1].text - left = left.split(",", 1)[1] - left = left.replace("in stock", "") - left = left.strip() + left = soup.find('div', {'class': "container detail-container text-left"}) + left = left.find('div', {'class': "detail-row"}).text.replace('\n', '') + left = left.split("Available Stock:") + left = left[1].strip() # Finding Number Sold - sold = divmb[-1].text - sold = sold.split(",", 1)[0] - sold = sold.replace("sold", "") + sold = detail_row[0].find_parent() + sold = sold.text + sold = sold.replace("Total Sold:", "") sold = sold.strip() - # Finding Shipment Information (Origin) - pmb[0].text - shipFrom = shipFrom.replace("Ships from: ", "").strip() - - # Finding Shipment Information (Destination) - pmb[1].text - shipTo = shipTo.replace("Ships to: ", "").strip() + # Finding Shipment Information (Origin) (There is no shipping information) + '''if "Ships from:" in li[-2].text: + shipFrom = li[-2].text + shipFrom = shipFrom.replace("Ships from: ", "") + # shipFrom = shipFrom.replace(",", "") + shipFrom = shipFrom.strip()''' + + # Finding Shipment Information (Destination) (No shipping info + '''shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text + shipTo = shipTo.replace("Ships to: ", "") + shipTo = shipTo.strip() + if "certain countries" in shipTo: + countries = "" + tags = li[-1].find_all('span', {'class': "tag"}) + for tag in tags: + country = tag.text.strip() + countries += country + ", " + shipTo = countries.strip(", ")''' # Finding the Product description - cardbody = soup.findAll('div', {'class': "card-body"}) - describe = cardbody[1].text.strip() + describe = soup.find('div', {'class': "container feedback-container"}) + describe = describe.find_next_sibling('div', {'class': "container"}).find('p').text + describe = describe.replace("\n", " ") + describe = describe.strip() - # Finding Product Image - image = soup.find('div', {'class': 'product-primary'}).find('img') - image = image.get('src') - image = image.split('base64,')[-1] + # Finding the Number of Product Reviews + review = detail_row[2].find_parent().text + review = review.split("Based on") + review = review[1].replace("ratings)", "").strip() - # Searching for CVE and MS categories + # Searching for CVE and MS categories (cant find it) cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if cve: CVE = " " @@ -121,7 +132,7 @@ def darkbazar_description_parser(soup): # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) + BTC, USD, EURO, sold, left, shipFrom, shipTo) # Sending the results return row @@ 
-131,17 +142,17 @@ def darkbazar_description_parser(soup): # stores info it needs in different lists, these lists are returned after being organized # @param: soup object looking at html page of listing page # return: 'row' that contains a variety of lists that each hold info on the listing page -def darkbazar_listing_parser(soup): +def bohemia_listing_parser(soup): # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "DarkBazar" # 0 *Marketplace_Name + nm = 0 # *Total_Products (Should be Integer) + mktName = "Bohemia" # 0 *Marketplace_Name vendor = [] # 1 *Vendor y rating_vendor = [] # 2 Vendor_Rating success = [] # 3 Vendor_Successful_Transactions name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this - MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = [] # 6 Product_MS_Classification (Microsoft Security) category = [] # 7 Product_Category y describe = [] # 8 Product_Description views = [] # 9 Product_Number_Of_Views @@ -152,21 +163,18 @@ def darkbazar_listing_parser(soup): USD = [] # 14 Product_USD_SellingPrice y EURO = [] # 15 Product_EURO_SellingPrice sold = [] # 16 Product_QuantitySold - qLeft = [] # 17 Product_QuantityLeft + qLeft =[] # 17 Product_QuantityLeft shipFrom = [] # 18 Product_ShippedFrom shipTo = [] # 19 Product_ShippedTo - image = [] # 20 Product_Image - image_vendor = [] # 21 Vendor_Image - href = [] # 22 Product_Links + href = [] # 20 Product_Links - listing = soup.findAll('div', {"id": "itembox"}) + listing = soup.findAll('div', {"class": "product-link"}) # Populating the Number of Products nm = len(listing) for a in listing: bae = a.findAll('a', href=True) - lb = a.findAll('div', {"id": "littlebox"}) # Adding the url to the list of urls link = bae[0].get('href') @@ -174,64 +182,60 @@ def darkbazar_listing_parser(soup): href.append(link) # Finding the Product - product = lb[1].find('a').text + product = bae[0].text product = product.replace('\n', ' ') product = product.replace(",", "") product = product.replace("...", "") product = product.strip() name.append(product) - # Finding Product Image - product_image = a.find('img') - product_image = product_image.get('src') - product_image = product_image.split('base64,')[-1] - image.append(product_image) + bae = a.find('div', {'class': "container"}) # Finding Prices - price = lb[-1].find('div', {"class": "mb-1"}).text - price = price.replace("$","") - price = price.strip() - USD.append(price) + price = bae.find('div', {'class': "product-price"}).find('h2').text + ud = price.replace("USD", " ") + # u = ud.replace("$","") + ud = ud.replace(",", "") + ud = ud.strip() + USD.append(ud) + bc = bae.find('div', {'class': "product-price"}).find('span', {'class': "shadow-text smalltext boldtext"}).text + bc = bc.replace("\n", "") + bc = bc.split() + bc = bc[0].replace("BTC", "").strip() + BTC.append(bc) # Finding the Vendor - vendor_name = lb[-1].find("a").text - vendor_name = vendor_name.replace(",", "") + vendor_name = bae.find('b').find('a').text vendor_name = vendor_name.strip() vendor.append(vendor_name) - image_vendor.append("-1") - # Finding the Category - cat = lb[-1].find("span").text - cat = cat.replace("class:", "") + cat = bae.find('span', {'class': "shadow-text smalltext"}).find('strong').text cat = cat.strip() category.append(cat) - span = lb[0].findAll("span") - - # Finding Number of Views - num = 
span[0].text - num = num.replace("views:", "") + # Finding Number Sold and Quantity Left + num = bae.find('div', {'class': "product-details-bottom"}).find('span').text + num = num.replace("Sold", "") + num = num.replace("times in total", "") num = num.strip() sold.append(num) - # Finding Number Sold - num = span[2].text - num = num.replace("Sold:", "") - num = num.strip() - sold.append(num) - - # Finding Quantity Left - quant = span[1].text - quant = quant.replace("stock:", "") - quant = quant.strip() + quant = bae.find('div', {'class': "product-price"}).text + quant = quant.replace("\n", "") + quant = quant.split("Available") + quant = quant[0].replace("Autoship", "").strip() qLeft.append(quant) - # add shipping information - ship = lb[2].findAll('small')[1].findAll('span')[1].text.split("->") - shipFrom.append(ship[0].replace("Ship from ", "").strip()) - shipTo.append(ship[1].replace("to ", "").strip()) + # Finding Successful Transactions + freq = bae.find('div', {'title': "Total Sales"}).find_parent().text.replace("\n", "") + freq = freq.strip().split() + freq = freq[-1].strip() + success.append(freq) + # find vendor rating + rate = bae.find('b').find('strong').text.strip() + rating_vendor.append(rate) # Searching for CVE and MS categories cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) @@ -262,28 +266,24 @@ def darkbazar_listing_parser(soup): # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) # called by the crawler to get description links on a listing page # @param: beautifulsoup object that is using the correct html page (listing page) # return: list of description links from a listing page -def darkbazar_links_parser(soup): +def bohemia_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] - listing = soup.findAll('div', {"id": "itembox"}) - # for a in listing: - # bae = a.find('a', {"class": "text-info"}, href=True) - # link = bae['href'] - # href.append(link) + temp = soup.find('div', {"class": "col-md-9 sidebar-content-right listing-content"}) + temp = temp.find('div', {"class": "product-listing"}) + listing = temp.findAll('div', {"class": "product-heading"}) for a in listing: - bae = a.findAll('a', href=True) - - # Adding the url to the list of urls - link = bae[0].get('href') + bae = a.find('a', href=True) + link = bae['href'] href.append(link) return href \ No newline at end of file diff --git a/MarketPlaces/DarkBazar/crawler_selenium.py b/MarketPlaces/DarkBazar/crawler_selenium.py deleted file mode 100644 index fdfb640..0000000 --- a/MarketPlaces/DarkBazar/crawler_selenium.py +++ /dev/null @@ -1,262 +0,0 @@ -__author__ = 'DarkWeb' - -''' -DarkBazar Marketplace Crawler (Selenium) -''' - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary -from selenium.webdriver.firefox.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support.ui import Select -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.common.by import By - -from PIL import Image 
-import urllib.parse as urlparse -import os, re, time -from datetime import date -import subprocess -import configparser -from bs4 import BeautifulSoup -from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.DarkBazar.parser import darkbazar_links_parser -from MarketPlaces.Utilities.utilities import cleanHTML - -counter = 1 -baseURL = 'http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/' - - -def startCrawling(): - mktName = getMKTName() - driver = getAccess() - - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closeDriver(driver) - - new_parse(mktName, baseURL, True) - - -# Returns the name of the website -def getMKTName(): - name = 'DarkBazar' - return name - - -# Return the base link of the website -def getFixedURL(): - url = 'http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/' - return url - - -# Closes Tor Browser -def closeDriver(driver): - # global pid - # os.system("taskkill /pid " + str(pro.pid)) - # os.system("taskkill /t /f /im tor.exe") - print('Closing Tor...') - driver.close() - time.sleep(3) - return - - -# Creates FireFox 'driver' and configure its 'Profile' -# to use Tor proxy and socket -def createFFDriver(): - from MarketPlaces.Initialization.markets_mining import config - - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - - ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - # ff_prof.set_preference("network.dns.disablePrefetch", True) - # ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 3) - ff_prof.set_preference("browser.download.folderList", 2) - ff_prof.set_preference("browser.download.manager.showWhenStarting", False) - ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") - ff_prof.set_preference('network.proxy.type', 1) - ff_prof.set_preference("network.proxy.socks_version", 5) - ff_prof.set_preference('network.proxy.socks', '127.0.0.1') - ff_prof.set_preference('network.proxy.socks_port', 9150) - ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", False) - ff_prof.update_preferences() - - service = Service(config.get('TOR', 'geckodriver_path')) - - driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - - driver.maximize_window() - - return driver - - -#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' -def getAccess(): - url = getFixedURL() - driver = createFFDriver() - try: - driver.get(url) - return driver - except: - driver.close() - return 'down' - - -def login(driver): - input("Press ENTER when CAPTCHA is complete and login page has loaded\n") - - # entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="username"]') - # Username here - usernameBox.send_keys('aliciamykeys') - passwordBox = driver.find_element(by=By.XPATH, 
value='//input[@name="password"]') - # Password here - passwordBox.send_keys('aliciawherearemykey$') - # session time - session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[4]/div/div[2]/select')) - session_select.select_by_visible_text('Session 60min') - - input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '//*[@id="submit"]'))) - - -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - open(filePath, 'wb').write(cleanPage.encode('utf-8')) - return - - -def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") - fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') - return fullPath - - -def getMKTName() -> str: - name = 'DarkBazar' - return name - - -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if name == '': - name = str(counter) - counter = counter + 1 - return name - - -def getInterestedLinks(): - links = [] - - # Digital Goods - links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=3') - # Services - links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=5') - - return links - - -def crawlForum(driver): - - print("Crawling the DarkBazar market") - - linksToCrawl = getInterestedLinks() - - i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) - try: - has_next_page = True - count = 0 - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(driver, html, link) - - list = productPages(html) - - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver, driver.page_source, item) - driver.back() - - # # comment out - # break - # - # # comment out - # if count == 1: - # break - - try: - link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href') - if link == "": - raise NoSuchElementException - count += 1 - - except NoSuchElementException: - has_next_page = False - - except Exception as e: - print(link, e) - i += 1 - - print("Crawling the DarkBazar market done.") - - -# Returns 'True' if the link is Topic link, may need to change for every website -def isDescriptionLink(url): - if 'item' in url: - return True - return False - - -# Returns True if the link is a listingPage link, may need to change for every website -def isListingLink(url): - if 'category=' in url: - return True - return False - - -def productPages(html): - soup = BeautifulSoup(html, "html.parser") - return darkbazar_links_parser(soup) - - -def crawler(): - startCrawling() diff --git a/MarketPlaces/DarkMatter/crawler_selenium.py b/MarketPlaces/DarkMatter/crawler_selenium.py deleted file mode 100644 index b75eea5..0000000 --- 
a/MarketPlaces/DarkMatter/crawler_selenium.py +++ /dev/null @@ -1,284 +0,0 @@ -__author__ = 'Helium' - -''' -DarkMatter Marketplace Crawler (Selenium) -Crawler works, but it slow since there is a speed check for clicking -''' - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary -from selenium.webdriver.firefox.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.common.by import By - -from PIL import Image -import urllib.parse as urlparse -import os, re, time -from datetime import date -import subprocess -import configparser -from bs4 import BeautifulSoup -from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.DarkMatter.parser import darkmatter_links_parser -from MarketPlaces.Utilities.utilities import cleanHTML - -counter = 1 -baseURL = 'http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/' - - -# Opens Tor Browser, crawls the website, then parses, then closes tor -#acts like the main method for the crawler, another function at the end of this code calls this function later -def startCrawling(): - mktName = getMKTName() - driver = getAccess() - - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closeDriver(driver) - - new_parse(mktName, baseURL, True) - - -# Returns the name of the website -#return: name of site in string type -def getMKTName(): - name = 'DarkMatter' - return name - - -# Return the base link of the website -#return: url of base site in string type -def getFixedURL(): - url = 'http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/' - return url - - -# Closes Tor Browser -#@param: current selenium driver -def closeDriver(driver): - # global pid - # os.system("taskkill /pid " + str(pro.pid)) - # os.system("taskkill /t /f /im tor.exe") - print('Closing Tor...') - driver.close() - time.sleep(3) - return - - -# Creates FireFox 'driver' and configure its 'Profile' -# to use Tor proxy and socket -def createFFDriver(): - from MarketPlaces.Initialization.markets_mining import config - - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - - ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - #ff_prof.set_preference("network.dns.disablePrefetch", True)#connection issue - #ff_prof.set_preference("network.http.sendRefererHeader", 0)#connection issue - ff_prof.set_preference("permissions.default.image", 3) - ff_prof.set_preference("browser.download.folderList", 2) - ff_prof.set_preference("browser.download.manager.showWhenStarting", False) - ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") - ff_prof.set_preference('network.proxy.type', 1) - ff_prof.set_preference("network.proxy.socks_version", 5) - ff_prof.set_preference('network.proxy.socks', 
'127.0.0.1') - ff_prof.set_preference('network.proxy.socks_port', 9150) - ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", False) - ff_prof.update_preferences() - - service = Service(config.get('TOR', 'geckodriver_path')) - - driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - - driver.maximize_window() - - return driver - - -#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' -#return: return the selenium driver or string 'down' -def getAccess(): - url = getFixedURL() - driver = createFFDriver() - try: - driver.get(url) - return driver - except: - driver.close() - return 'down' - - -# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha -# then allows for manual solving of captcha in the terminal -#@param: current selenium web driver -def login(driver): - input("Press ENTER when CAPTCHA is completed and page is loaded\n") - # wait for page to show up (This Xpath may need to change based on different seed url) - -# Saves the crawled html page, makes the directory path for html pages if not made -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - open(filePath, 'wb').write(cleanPage.encode('utf-8')) - return - - -# Gets the full path of the page to be saved along with its appropriate file name -#@param: raw url as crawler crawls through every site -def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") - fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') - return fullPath - - -# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned -#@param: raw url as crawler crawls through every site -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if (name == ''): - name = str(counter) - counter = counter + 1 - return name - - -# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list -#in this example, there are a couple of categories some threads fall under such as -# Guides and Tutorials, Digital Products, and Software and Malware -#as you can see they are categories of products -def getInterestedLinks(): - links = [] - - # digital fraud software - links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=76') - # legit - links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=78') - # hack guides - links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=94') - # services - links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=117') - # software/malware - links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=121') - - return links - - -# gets links of interest to crawl through, iterates through list, where each 
link is clicked and crawled through -#topic and description pages are crawled through here, where both types of pages are saved -#@param: selenium driver -def crawlForum(driver): - print("Crawling the DarkMatter market") - - linksToCrawl = getInterestedLinks() - - i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) - try: - has_next_page = True - count = 0 - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(driver, html, link) - - list = productPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - time.sleep(3) # to keep from detecting click speed - driver.get(itemURL) - except: - driver.refresh() - savePage(driver, driver.page_source, item) - time.sleep(3) # to keep from detecting click speed - driver.back() - - # # comment out - # break - # - # # comment out - # if count == 1: - # break - - try: - link = driver.find_element(by=By.LINK_TEXT, value=">").get_attribute('href') - if link == "": - raise NoSuchElementException - count += 1 - - except NoSuchElementException: - has_next_page = False - - except Exception as e: - print(link, e) - i += 1 - - print("Crawling the DarkMatter market done.") - - -# Returns 'True' if the link is a description link -#@param: url of any url crawled -#return: true if is a description page, false if not -def isDescriptionLink(url): - if 'products/' in url and '/products/?category' not in url: - return True - return False - - -# Returns True if the link is a listingPage link -#@param: url of any url crawled -#return: true if is a Listing page, false if not -def isListingLink(url): - if '?category' in url: - return True - return False - - -# calling the parser to define the links, the html is the url of a link from the list of interested link list -#@param: link from interested link list ie. getInterestingLinks() -#return: list of description links that should be crawled through -def productPages(html): - soup = BeautifulSoup(html, "html.parser") - return darkmatter_links_parser(soup) - - -# Drop links that "signout" -# def isSignOut(url): -# #absURL = urlparse.urljoin(url.base_url, url.url) -# if 'signout' in url.lower() or 'logout' in url.lower(): -# return True -# -# return False - - -def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... 
DONE!") diff --git a/MarketPlaces/DarkMatter/parser.py b/MarketPlaces/DarkMatter/parser.py deleted file mode 100644 index 2a681bc..0000000 --- a/MarketPlaces/DarkMatter/parser.py +++ /dev/null @@ -1,261 +0,0 @@ -__author__ = 'DarkWeb' - -# Here, we are importing the auxiliary functions to clean or convert data -from MarketPlaces.Utilities.utilities import * - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup - - -#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs -#stores info it needs in different lists, these lists are returned after being organized -#@param: soup object looking at html page of description page -#return: 'row' that contains a variety of lists that each hold info on the description page -def darkmatter_description_parser(soup): - - # Fields to be parsed - - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo - image = "-1" # 19 Product_Image - vendor_image = "-1" # 20 Vendor_Image - - # 0 *Vendor_Name - try: - temp = soup.find('table', {'class', 'vtable'}) - temp = temp.findAll('tr') - temp2 = temp[3].find('a').text - vendor = cleanString(temp2.strip()) - except: - temp = soup.find('table', {'class', 'vtable'}) - temp = temp.findAll('tr') - temp2 = temp[4].find('a').text - vendor = cleanString(temp2.strip()) - - # product name - name = soup.find('div', {'class', 'title-h2'}).text - name = cleanString(name.strip()) - - #product description - temp = soup.find('pre', {'class', 'description'}).text - temp = temp.replace('\n', ' ') - describe = cleanString(temp.strip()) - - #product category - try: - temp = soup.find('table', {'class', 'vtable'}) - temp = temp.findAll('tr') - temp2 = temp[4].find('th').text - temp2 = cleanString(temp2) - if (temp2 == "Category"): - temp2 = temp[4].find('a').text - category = cleanString(temp2.strip()) - except: - temp = soup.find('table', {'class', 'vtable'}) - temp = temp.findAll('tr') - temp2 = temp[5].find('th').text - temp2 = cleanString(temp2.strip) - if (temp2 == "Category"): - temp2 = temp[5].find('a').text - category = cleanString(temp2.strip()) - - # usd - temp = soup.find('table', {'class', 'vtable'}) - temp = temp.findAll('tr') - temp2 = temp[1].find('td').text - temp2 = temp2.replace(' USD', '') - USD = cleanString(temp2) - - # 15 Product_QuantitySold - temp = soup.find('table', {'class', 'vtable'}) - temp = temp.findAll('tr') - temp2 = temp[5].find('th').text - temp2 = cleanString(temp2) - temp3 = temp[6].find('th').text - temp3 = cleanString(temp3) - if (temp2 == "Sold"): - temp2 = temp[5].find('td').text - sold = cleanString(temp2.strip()) - elif (temp3 == "Sold"): - temp2 = temp[6].find('td').text - sold = 
cleanString(temp2.strip()) - - # Finding Product Image - image = soup.find('td', {"class": "vtop"}).find('img') - if image is not None: - image = image.get('src').split('base64,')[-1] - else: - image = '-1' - - # Populating the final variable (this should be a list with all fields scraped) - row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) - - # Sending the results - return row - - -#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs -#stores info it needs in different lists, these lists are returned after being organized -#@param: soup object looking at html page of listing page -#return: 'row' that contains a variety of lists that each hold info on the listing page -def darkmatter_listing_parser(soup): - - # Fields to be parsed - - nm = 0 # *Total_Products (Should be Integer) - mktName = "DarkMatter" # 0 *Marketplace_Name - vendor = [] # 1 *Vendor y - rating = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) - category = [] # 7 Product_Category y - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice y - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft =[] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - image = [] # 20 Product_Image - image_vendor = [] # 21 Vendor_Image - href = [] # 22 Product_Links - - names = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", "colspan": "3"}) - left = soup.find('div', {"class": "content"}).findAll('table', {"class": "vtable"}) - right = soup.find('div', {"class": "content"}).findAll('td', {"class": "vtop centered"}) - images = soup.find('div', {"class": "content"}).findAll('td', {"class": "vcentered"}) - - # vtop centered - count = 0 - # Populating the Number of Products - nm = len(names) - - for a in names: - # product name - temp = a.find('a').text - if ("pcs x " in temp): - index = temp.index("pcs x ") - result = temp[index + len("pcs x "):] - name.append(cleanString(result)) - elif("pks x " in temp): - index = temp.index("pks x ") - result = temp[index + len("pks x "):] - name.append(cleanString(result)) - elif ("job x " in temp): - index = temp.index("job x ") - result = temp[index + len("job x "):] - name.append(cleanString(result)) - - CVE.append("-1") - MS.append("-1") - - temp2 = left[count].findAll('tr') - - length_2 = len(temp2) - 1 - - # category - temp = temp2[1].find('td').text - category.append(cleanString(temp.strip())) - - describe.append("-1") - #escrow.append("-1") - views.append("-1") - reviews.append("-1") - addDate.append("-1") - #lastSeen.append("-1") - BTC.append("-1") - image_vendor.append("-1") - - # usd - temp3 = right[count*2].find('span').text - temp = temp3.replace(' USD', '') - USD.append(cleanString(temp)) - - EURO.append("-1") - - # 14 Product_QuantitySold - temp3 = temp2[length_2].find('th').text - temp3 = cleanString(temp3) - if (temp3 == "Sold:"): - temp = temp2[length_2].find('td').text - 
sold.append(cleanString(temp.strip())) - else: - sold.append("-1") - - qLeft.append("-1") - shipFrom.append("-1") - - # ship to - temp3 = temp2[length_2].find('th').text - temp3 = cleanString(temp3) - if (temp3 == "Ship To:"): - temp = temp2[length_2].find('td').text - shipTo.append(cleanString(temp.strip())) - else: - shipTo.append("-1") - - # vendor - temp = temp2[0].find('a').text - vendor.append(cleanString(temp.strip())) - - # add product rating (stars) - rating.append("-1") - success.append("-1") - - temp = a.find('a').get('href') - href.append(temp) - - # Finding Product Image - image = images[count*2].find('img').get('src') - image = image.split('base64,')[-1] - - count += 1 - - rating_item.append("-1") - - # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, vendor, rating, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) - - -#called by the crawler to get description links on a listing page -#@param: beautifulsoup object that is using the correct html page (listing page) -#return: list of description links from a listing page -def darkmatter_links_parser(soup): - - # Returning all links that should be visited by the Crawler - - href = [] - listing = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", 'colspan': '3'}) - - for a in listing: - bae = a.find('a', href=True) - link = bae['href'] - href.append(link) - - return href \ No newline at end of file diff --git a/MarketPlaces/DigitalThriftShop/crawler_selenium.py b/MarketPlaces/DigitalThriftShop/crawler_selenium.py deleted file mode 100644 index 163e135..0000000 --- a/MarketPlaces/DigitalThriftShop/crawler_selenium.py +++ /dev/null @@ -1,286 +0,0 @@ -__author__ = 'Helium' - -''' -DigitalThriftShop Marketplace Crawler (Selenium) -''' - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary -from selenium.webdriver.firefox.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.common.by import By - -from PIL import Image -import urllib.parse as urlparse -import os, re, time -from datetime import date -import subprocess -import configparser -from bs4 import BeautifulSoup -from MarketPlaces.Initialization.prepare_parser import new_parse - -from MarketPlaces.DigitalThriftShop.parser import digitalthriftshop_links_parser -from MarketPlaces.Utilities.utilities import cleanHTML - -counter = 1 -baseURL = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/' - - -# Opens Tor Browser, crawls the website, then parses, then closes tor -#acts like the main method for the crawler, another function at the end of this code calls this function later -def startCrawling(): - mktName = getMKTName() - driver = getAccess() - - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closeDriver(driver) - - new_parse(mktName, baseURL, True) - - -# Returns the name of the website -#return: name of site in string type -def getMKTName(): - name = 'DigitalThriftShop' - return name - - -# Return the base link of the website -#return: url of base site in string type -def getFixedURL(): - url = 
'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/' - return url - - -# Closes Tor Browser -#@param: current selenium driver -def closeDriver(driver): - # global pid - # os.system("taskkill /pid " + str(pro.pid)) - # os.system("taskkill /t /f /im tor.exe") - print('Closing Tor...') - driver.close() - time.sleep(3) - return - - -# Creates FireFox 'driver' and configure its 'Profile' -# to use Tor proxy and socket -def createFFDriver(): - from MarketPlaces.Initialization.markets_mining import config - - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - - ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 3) - ff_prof.set_preference("browser.download.folderList", 2) - ff_prof.set_preference("browser.download.manager.showWhenStarting", False) - ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") - ff_prof.set_preference('network.proxy.type', 1) - ff_prof.set_preference("network.proxy.socks_version", 5) - ff_prof.set_preference('network.proxy.socks', '127.0.0.1') - ff_prof.set_preference('network.proxy.socks_port', 9150) - ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", False) - ff_prof.update_preferences() - - service = Service(config.get('TOR', 'geckodriver_path')) - - driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - - driver.maximize_window() - - return driver - - -#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' -#return: return the selenium driver or string 'down' -def getAccess(): - url = getFixedURL() - driver = createFFDriver() - try: - driver.get(url) - return driver - except: - driver.close() - return 'down' - - -# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha -# then allows for manual solving of captcha in the terminal -#@param: current selenium web driver -def login(driver): - # wait for page to show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.ID, "woocommerce_product_categories-2"))) - -# Saves the crawled html page, makes the directory path for html pages if not made -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - open(filePath, 'wb').write(cleanPage.encode('utf-8')) - return - - -# Gets the full path of the page to be saved along with its appropriate file name -#@param: raw url as crawler crawls through every site -def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") - 
fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') - return fullPath - - -# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned -#@param: raw url as crawler crawls through every site -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if (name == ''): - name = str(counter) - counter = counter + 1 - return name - - -# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list -#in this example, there are a couple of categories some threads fall under such as -# Guides and Tutorials, Digital Products, and Software and Malware -#as you can see they are categories of products -def getInterestedLinks(): - links = [] - - # Apps - links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/apps/') - # Books - links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/books/') - # Bot nets - links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/botnets/') - # ransomware - links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/ransomware/') - # rats - links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/rats/') - # scripts - links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/scripts/') - - return links - - -# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through -#topic and description pages are crawled through here, where both types of pages are saved -#@param: selenium driver -def crawlForum(driver): - print("Crawling the DigitalThriftShop market") - - linksToCrawl = getInterestedLinks() - - i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) - try: - has_next_page = True - count = 0 - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(driver, html, link) - - list = productPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver, driver.page_source, item) - driver.back() - - # # comment out - # break - # - # # comment out - # if count == 1: - # break - - try: - nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/div[2]/main/div[1]/nav') - link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='→').get_attribute('href') - if link == "": - raise NoSuchElementException - count += 1 - - except NoSuchElementException: - has_next_page = False - - except Exception as e: - print(link, e) - i += 1 - - print("Crawling the DigitalThriftShop market done.") - - -# Returns 'True' if the link is a description link -#@param: url of any url crawled -#return: true if is a description page, false if not -def isDescriptionLink(url): - if 'product/' in url: - return True - return False - - -# Returns True if the link is a listingPage link -#@param: url of any url crawled -#return: true if is a Listing page, false if not -def isListingLink(url): - if 'product-' in url: - return True - return False - - -# calling the parser to define the 
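# (Added illustration, not part of the original diff.) isDescriptionLink and isListingLink above classify
# URLs with a plain substring test, which for this WooCommerce-style shop behaves roughly like the
# examples below (the URLs are invented for illustration):
#   isDescriptionLink('.../product/some-offer/')            -> True   (saved under Description/)
#   isListingLink('.../product-category/botnets/page/2/')   -> True   (saved under Listing/)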
links, the html is the url of a link from the list of interested link list -#@param: link from interested link list ie. getInterestingLinks() -#return: list of description links that should be crawled through -def productPages(html): - soup = BeautifulSoup(html, "html.parser") - return digitalthriftshop_links_parser(soup) - - -# Drop links that "signout" -# def isSignOut(url): -# #absURL = urlparse.urljoin(url.base_url, url.url) -# if 'signout' in url.lower() or 'logout' in url.lower(): -# return True -# -# return False - - -def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... DONE!") diff --git a/MarketPlaces/DigitalThriftShop/parser.py b/MarketPlaces/DigitalThriftShop/parser.py deleted file mode 100644 index 8a4126c..0000000 --- a/MarketPlaces/DigitalThriftShop/parser.py +++ /dev/null @@ -1,173 +0,0 @@ -__author__ = 'DarkWeb' - -# Here, we are importing the auxiliary functions to clean or convert data -from MarketPlaces.Utilities.utilities import * - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup, ResultSet, Tag - - -#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs -#stores info it needs in different lists, these lists are returned after being organized -#@param: soup object looking at html page of description page -#return: 'row' that contains a variety of lists that each hold info on the description page -def digitalThriftShop_description_parser(soup: Tag): - - # Fields to be parsed - - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo - image = "-1" # 19 Product_Image - vendor_image = "-1" # 20 Vendor_Image - - - product_name = soup.find("h1", {"class": "product_title entry-title"}).text - name = cleanString(product_name.strip()) - - product_description = soup.find("div", {"id": "tab-description"}).find("p").text - describe = cleanString(product_description.strip()) - - # Finding Product Image - image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img') - image = image.get('src').split('base64,')[-1] - - product_category = soup.find("span", {"class": "posted_in"}).find("a").text - category = cleanString(product_category.strip()) - - product_rating: Tag = soup.find("div", {"class": "woocommerce-product-rating"}) - if product_rating is not None: - rating_item = product_rating.find("strong", {"class": "rating"}).text - reviews = product_rating.find("span", {"class": "rating"}).text - - product_BTC = soup.find("div", {"id": "price-BTC"}).find("span", {"class": "priceinfo cw-noselect"}).text - BTC = cleanString(product_BTC.strip()) - - product_USD = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text - USD = 
cleanString(product_USD.replace("$", "").strip()) - - # Populating the final variable (this should be a list with all fields scraped) - row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) - - # Sending the results - return row - - -#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs -#stores info it needs in different lists, these lists are returned after being organized -#@param: soup object looking at html page of listing page -#return: 'row' that contains a variety of lists that each hold info on the listing page -def digitalThriftShop_listing_parser(soup: Tag): - - # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "DigitalThriftShop" # 0 *Marketplace_Name - vendor = [] # 1 *Vendor y - rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) - category = [] # 7 Product_Category y - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice y - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft =[] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - image = [] # 20 Product_Image - image_vendor = [] # 21 Vendor_Image - href = [] # 22 Product_Links - - product_category = soup.find("h1", {"class": "woocommerce-products-header__title page-title"}).text - - products_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-5"}).find_all("li") - - for product in products_list: - nm += 1 - vendor.append(mktName) - rating_vendor.append("-1") - success.append("-1") - - product_name = product.find("h2", {"class": "woocommerce-loop-product__title"}).text - name.append(cleanString(product_name.strip())) - - # Finding Product Image - product_image = product.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'}) - product_image = product_image.get('src').split('base64,')[-1] - image.append(product_image) - - CVE.append("-1") - MS.append("-1") - category.append(cleanString(product_category.strip())) - describe.append("-1") - views.append("-1") - reviews.append("-1") - image_vendor.append("-1") - - try: - product_rating = product.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text - rating_item.append(cleanString(product_rating.strip())) - except: - rating_item.append("-1") - - addDate.append("-1") - BTC.append("-1") - - product_USD = product.find("span", {"class": "price"}).text - USD.append(product_USD.replace("$", "").strip()) - - EURO.append("-1") - sold.append("-1") - qLeft.append("-1") - shipFrom.append("-1") - shipTo.append("-1") - - product_href = product.find("a", {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}).get("href") - href.append(cleanString(product_href.strip())) - - - # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, 
rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) - - -#called by the crawler to get description links on a listing page -#@param: beautifulsoup object that is using the correct html page (listing page) -#return: list of description links from a listing page -def digitalthriftshop_links_parser(soup): - - # Returning all links that should be visited by the Crawler - - href = [] - listing = soup.find('ul', {"class": "products columns-5"}).findAll('li') - - for a in listing: - bae = a.find('a', href=True) - link = bae['href'] - href.append(link) - - return href \ No newline at end of file diff --git a/MarketPlaces/HiddenMarket/parser.py b/MarketPlaces/HiddenMarket/parser.py deleted file mode 100644 index eb36a5b..0000000 --- a/MarketPlaces/HiddenMarket/parser.py +++ /dev/null @@ -1,288 +0,0 @@ -__author__ = 'DarkWeb' - -# Here, we are importing the auxiliary functions to clean or convert data -from MarketPlaces.Utilities.utilities import * - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup - - -# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) -def hiddenmarket_description_parser(soup): - # Fields to be parsed - - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo - image = "-1" # 19 Product_Image - vendor_image = "-1" # 20 Vendor_Image - - bae = soup.find('div', {'class': "main"}) - - # Finding Product Name - name = bae.find('div', {'class': "heading"}).text - name = name.replace('\n', ' ') - name = name.replace(",", "") - name = name.strip() - - mb = bae.find('div', {'class': "information"}).findAll('tr') - - # Finding Vendor - vendor = mb[1].find('a').text - vendor = vendor.replace(",", "") - vendor = vendor.strip() - - # # Finding Vendor Rating - # full_stars = bae[2].find_all('i', {'class': "fas fa-star"}) - # half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"}) - # rating = len(full_stars) + (0.5 if half_star is not None else 0) - - # Finding Quantity Left - temp = mb[-3].text - left = temp.replace("Quantity in stock:", "") - left = left.strip() - - # Finding USD - USD = mb[0].text - USD = USD.replace("Price:", "") - USD = USD.replace("USD", "") - USD = USD.strip() - - # Finding BTC - # temp = bae.find('div', {"class": "small"}).text.split("BTC") - - # BTC = temp[0].strip() - - # Finding Shipment Information (Origin) - shipFrom = mb[2].text - shipFrom = shipFrom.replace("Seller location:", "") - shipFrom = shipFrom.strip() - - # Finding Shipment Information (Destination) - shipTo = mb[3].text - shipTo = shipTo.replace("Ships to (seller):", "") - shipTo = shipTo.strip() - - # Finding the Product description - describe = bae.find('div', {"class": 
"twotabs"}).find('div', {'class': "tab1"}).text - describe = cleanString(describe.strip()) - - # Finding Product Image - image = soup.find('div', {"class": "thumbnails"}).find('img', {"class": "bigthumbnail"}) - image = image.get('src').split('base64,')[-1] - - # Finding the Product Category - category = mb[-4].text - category = category.replace("Category:", "") - category = category.strip() - - #Finding the number of reviews - reviews = bae.find_all('div', {'class': "heading"}) - reviews = reviews[-2].text - reviews = reviews.replace("Comments (", "") - reviews = reviews.replace(")", "") - - # Searching for CVE and MS categories - cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if cve: - CVE = " " - for idx in cve: - CVE += (idx) - CVE += " " - CVE = CVE.replace(',', ' ') - CVE = CVE.replace('\n', '') - ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}')) - if ms: - MS = " " - for im in ms: - MS += (im) - MS += " " - MS = MS.replace(',', ' ') - MS = MS.replace('\n', '') - - # Populating the final variable (this should be a list with all fields scraped) - row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) - - # Sending the results - return row - - -# This is the method to parse the Listing Pages -def hiddenmarket_listing_parser(soup): - # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "HiddenMarket" # 0 *Marketplace_Name - vendor = [] # 1 *Vendor y - rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) - category = [] # 7 Product_Category y - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice y - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft = [] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - image = [] # 20 Product_Image - image_vendor = [] # 21 Vendor_Image - href = [] # 22 Product_Links - - listing = soup.findAll('div', {"class": "item"}) - - # Populating the Number of Products - nm = len(listing) - - # Finding Category - cat = soup.find("div", {'class': "heading"}).text - cat = cat.replace(",", "") - cat = cat.strip() - - for card in listing: - - category.append(cat) - - # Adding the url to the list of urls - link = card.find_all('a') - link = link[1].get('href') - - href.append(link) - - # Finding Product Name - product = card.find('div', {'class': "title"}) - product = product.text - product = product.replace('\n', ' ') - product = product.replace(",", "") - product = product.strip() - name.append(product) - - # Finding Product Image - image.append("-1") - - # Finding Vendor - vendor_name = card.find('div', {"class": "seller"}).text - vendor_name = vendor_name.replace(",", "") - vendor_name = vendor_name.strip() - vendor.append(vendor_name) - - image_vendor.append("-1") - - # Finding USD - usd = card.find('div', {"class": "buttons"}).find('div', {'class': "price"}).text - usd = usd.replace("USD", "") - usd = usd.strip() - USD.append(usd) - - tb = card.find("div", {"class": "stats"}) - tb 
= tb.find_all('td') - - # Finding Reviews - num = tb[-1].text - num = num.strip() - reviews.append(num) - - # Finding Views - view = tb[-3].text.strip() - views.append(view) - - # Finding Num of Sales - sale = tb[-2].text.strip() - sold.append(sale) - - # Finding Item Rating - if num == '0': - item_rating = '-1' - else: - item_rating = card.find('div', {'class': 'stats'}).find('div', {'class': "stars2"}) - item_rating = item_rating.get('style') - item_rating = item_rating.replace("width:", "") - item_rating = item_rating.replace("%", "") - rating_item.append(item_rating) - - - # Finding shipping info - shipping = card.find('div', {'class': "shipping"}).text.split('>') - # SHip from - origin = shipping[0].strip() - shipFrom.append(origin) - #Ship to - destination = shipping[1].strip() - shipTo.append(destination) - - # Finding description (site only shows partial description on listing pages) - # description = card.next_sibling.find('div', {'class': "description"}).text - # description = description.replace("\n", " ") - # description = description.replace("\r", " ") - # description = description.replace("-", " ") - # description = description.strip() - # describe.append(description) - - # Searching for CVE and MS categories - cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if not cve: - cveValue = "-1" - else: - cee = " " - for idx in cve: - cee += (idx) - cee += " " - cee = cee.replace(',', ' ') - cee = cee.replace('\n', '') - cveValue = cee - CVE.append(cveValue) - - ms = card.findAll(text=re.compile('MS\d{2}-\d{3}')) - if not ms: - MSValue = "-1" - else: - me = " " - for im in ms: - me += (im) - me += " " - me = me.replace(',', ' ') - me = me.replace('\n', '') - MSValue = me - MS.append(MSValue) - - # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) - - -def hiddenmarket_links_parser(soup): - # Returning all links that should be visited by the Crawler - href = [] - - listing = soup.findAll('div', {"class": "item"}) - - for div in listing: - link = div.findAll('a') - link = link[1] - link = link['href'] - href.append(link) - - return href diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py index 5ec07b6..f85b46c 100644 --- a/MarketPlaces/Initialization/markets_mining.py +++ b/MarketPlaces/Initialization/markets_mining.py @@ -6,28 +6,14 @@ Starting point of the Darkweb Markets Mining from datetime import * from MarketPlaces.DarkFox.crawler_selenium import crawler as crawlerDarkFox -from MarketPlaces.Tor2door.crawler_selenium import crawler as crawlerTor2door -from MarketPlaces.ThiefWorld.crawler_selenium import crawler as crawlerThiefWorld -from MarketPlaces.TorBay.crawler_selenium import crawler as crawlerTorBay -from MarketPlaces.LionMarketplace.crawler_selenium import crawler as crawlerLionMarketplace -from MarketPlaces.TorMarket.crawler_selenium import crawler as crawlerTorMarket from MarketPlaces.MikesGrandStore.crawler_selenium import crawler as crawlerMikesGrandStore from MarketPlaces.DarkTor.crawler_selenium import crawler as crawlerDarkTor -from MarketPlaces.DigitalThriftShop.crawler_selenium import crawler as crawlerDigitalThriftShop from MarketPlaces.AnonymousMarketplace.crawler_selenium import crawler as crawlerAnonymousMarketplace -from 
MarketPlaces.Apocalypse.crawler_selenium import crawler as crawlerApocalypseMarketplace from MarketPlaces.CityMarket.crawler_selenium import crawler as crawlerCityMarket -from MarketPlaces.DarkMatter.crawler_selenium import crawler as crawlerDarkMatter from MarketPlaces.M00nkeyMarket.crawler_selenium import crawler as crawlerM00nkeyMarket from MarketPlaces.ViceCity.crawler_selenium import crawler as crawlerViceCity -from MarketPlaces.HiddenMarket.crawler_selenium import crawler as crawlerHiddenMarket -from MarketPlaces.RobinhoodMarket.crawler_selenium import crawler as crawlerRobinhoodMarket -from MarketPlaces.Nexus.crawler_selenium import crawler as crawlerNexus from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher -from MarketPlaces.DarkBazar.crawler_selenium import crawler as crawlerDarkBazar from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar -from MarketPlaces.AnonMarket.crawler_selenium import crawler as crawlerAnonMarket -from MarketPlaces.MetaVerseMarket.crawler_selenium import crawler as crawlerMetaVerse import configparser import os @@ -105,49 +91,21 @@ if __name__ == '__main__': # if crawlerDarkFox(base["url"], base["categories"]): # break crawlerDarkFox() - elif mkt == 'Tor2door': - crawlerTor2door() - elif mkt == "ThiefWorld": - crawlerThiefWorld() - elif mkt == "TorBay": - crawlerTorBay() - elif mkt == "LionMarketplace": - crawlerLionMarketplace() - elif mkt == "TorMarket": - crawlerTorMarket() elif mkt == "MikesGrandStore": crawlerMikesGrandStore() elif mkt == "DarkTor": crawlerDarkTor() - elif mkt == "DigitalThriftShop": - crawlerDigitalThriftShop() elif mkt == "AnonymousMarketplace": crawlerAnonymousMarketplace() - elif mkt == "Apocalypse": - crawlerApocalypseMarketplace() elif mkt == "CityMarket": crawlerCityMarket() - elif mkt == "DarkMatter": - crawlerDarkMatter() elif mkt == "M00nkeyMarket": crawlerM00nkeyMarket() elif mkt == "ViceCity": crawlerViceCity() - elif mkt == "HiddenMarket": - crawlerHiddenMarket() - elif mkt == "RobinhoodMarket": - crawlerRobinhoodMarket() - elif mkt == "Nexus": - crawlerNexus() elif mkt == "CypherMarketplace": crawlerCypher() - elif mkt == "DarkBazar": - crawlerDarkBazar() elif mkt == "PabloEscobarMarket": crawlerPabloEscobar() - elif mkt == "AnonMarket": - crawlerAnonMarket() - elif mkt == "MetaVerseMarket": - crawlerMetaVerse() print("\nScraping process completed!") diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index 9cfe2a9..e075541 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -9,26 +9,12 @@ from psycopg2.extras import RealDictCursor from MarketPlaces.DB_Connection.db_connection import * from MarketPlaces.DarkFox.parser import * -from MarketPlaces.Tor2door.parser import * -from MarketPlaces.Apocalypse.parser import * -from MarketPlaces.ThiefWorld.parser import * from MarketPlaces.AnonymousMarketplace.parser import * from MarketPlaces.ViceCity.parser import * -from MarketPlaces.TorBay.parser import * from MarketPlaces.M00nkeyMarket.parser import * -from MarketPlaces.DarkMatter.parser import * -from MarketPlaces.DigitalThriftShop.parser import * -from MarketPlaces.LionMarketplace.parser import * -from MarketPlaces.TorMarket.parser import * -from MarketPlaces.HiddenMarket.parser import * -from MarketPlaces.RobinhoodMarket.parser import * -from MarketPlaces.Nexus.parser import * from MarketPlaces.MikesGrandStore.parser import * -from 
MarketPlaces.DarkBazar.parser import * from MarketPlaces.PabloEscobarMarket.parser import * -from MarketPlaces.AnonMarket.parser import * from MarketPlaces.CityMarket.parser import * -from MarketPlaces.MetaVerseMarket.parser import * from MarketPlaces.Classifier.classify_product import predict @@ -132,46 +118,18 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile): if marketPlace == "DarkFox": rw = darkfox_listing_parser(soup) - elif marketPlace == "Tor2door": - rw = tor2door_listing_parser(soup) - elif marketPlace == "Apocalypse": - rw = apocalypse_listing_parser(soup) - elif marketPlace == "ThiefWorld": - rw = thiefWorld_listing_parser(soup) elif marketPlace == "AnonymousMarketplace": rw = anonymousMarketplace_listing_parser(soup) elif marketPlace == "ViceCity": rw = vicecity_listing_parser(soup) - elif marketPlace == "TorBay": - rw = torbay_listing_parser(soup) elif marketPlace == "M00nkeyMarket": rw = m00nkey_listing_parser(soup) - elif marketPlace == "HiddenMarket": - rw = hiddenmarket_listing_parser(soup) - elif marketPlace == "DarkMatter": - rw = darkmatter_listing_parser(soup) - elif marketPlace == "DigitalThriftShop": - rw = digitalThriftShop_listing_parser(soup) - elif marketPlace == "LionMarketplace": - rw = lionmarketplace_listing_parser(soup) - elif marketPlace == "TorMarket": - rw = tormarket_listing_parser(soup) - elif marketPlace == "RobinhoodMarket": - rw = Robinhood_listing_parser(soup) - elif marketPlace == "Nexus": - rw = nexus_listing_parser(soup) elif marketPlace == "MikesGrandStore": rw = mikesGrandStore_listing_parser(soup) - elif marketPlace == "DarkBazar": - rw = darkbazar_listing_parser(soup) elif marketPlace == "PabloEscobarMarket": rw = pabloescobarmarket_listing_parser(soup) - elif marketPlace == "AnonMarket": - rw = AnonMarket_listing_parser(soup) elif marketPlace == "CityMarket": rw = city_listing_parser(soup) - elif marketPlace == "MetaVerseMarket": - rw = metaversemarket_listing_parser(soup) else: print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!") raise Exception @@ -194,46 +152,18 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile): if marketPlace == "DarkFox": rmm = darkfox_description_parser(soup) - elif marketPlace == "Tor2door": - rmm = tor2door_description_parser(soup) - elif marketPlace == "Apocalypse": - rmm = apocalypse_description_parser(soup) - elif marketPlace == "ThiefWorld": - rmm = thiefWorld_description_parser(soup) elif marketPlace == "AnonymousMarketplace": rmm = anonymousMarketplace_description_parser(soup) elif marketPlace == "ViceCity": rmm = vicecity_description_parser(soup) - elif marketPlace == "TorBay": - rmm = torbay_description_parser(soup) elif marketPlace == "M00nkeyMarket": rmm = m00nkey_description_parser(soup) - elif marketPlace == "HiddenMarket": - rmm = hiddenmarket_description_parser(soup) - elif marketPlace == "DarkMatter": - rmm = darkmatter_description_parser(soup) - elif marketPlace == "DigitalThriftShop": - rmm = digitalThriftShop_description_parser(soup) - elif marketPlace == "LionMarketplace": - rmm = lionmarketplace_description_parser(soup) - elif marketPlace == "TorMarket": - rmm = tormarket_description_parser(soup) - elif marketPlace == "RobinhoodMarket": - rmm = Robinhood_description_parser(soup) - elif marketPlace == "Nexus": - rmm = nexus_description_parser(soup) elif marketPlace == "MikesGrandStore": rmm = mikesGrandStore_description_parser(soup) - elif marketPlace == "DarkBazar": - rmm = darkbazar_description_parser(soup) elif marketPlace == 
"PabloEscobarMarket": rmm = pabloescobarmarket_description_parser(soup) - elif marketPlace == "AnonMarket": - rmm = AnonMarket_description_parser(soup) elif marketPlace == "CityMarket": rmm = city_description_parser(soup) - elif marketPlace == "MetaVerseMarket": - rmm = metaversemarket_description_parser(soup) else: print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!") raise Exception diff --git a/MarketPlaces/Kingdom/crawler_mechanize.py b/MarketPlaces/Kingdom/crawler_mechanize.py new file mode 100644 index 0000000..9a680a8 --- /dev/null +++ b/MarketPlaces/Kingdom/crawler_mechanize.py @@ -0,0 +1,325 @@ +__author__ = '91Shadows' + +''' +DarkFox marketplace Crawler +''' + +import codecs +import socks, socket, time +from datetime import date +import urllib.parse as urlparse +import http.client as httplib +import mechanize +import os +import subprocess +from bs4 import BeautifulSoup +from MarketPlaces.Initialization.prepare_parser import new_parse +from MarketPlaces.DarkFox.parser import darkfox_links_parser + +counter = 1 +httplib.HTTPConnection._http_vsn = 10 +httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0' +baseURL = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/' +socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150) + + +# Opens Tor Browser, crawls the mkt +def startCrawling(): + + opentor() + getUrl() + url = getFixedURL() + mktName = getMKTName() + credentials = getCredentials() + br = getAccess(url, credentials) + + if br != 'down': + crawlMkt(url, br) + #new_parse(mktName, False) + + #new_parse(mktName, False) + + closetor() + + +#Opens Tor Browser +def opentor(): + global pid + print("Connecting Tor...") + path = open('../../path.txt').readline() + pro = subprocess.Popen(path) + pid = pro.pid + time.sleep(5) + input("Tor Connected. 
Press ENTER to continue\n") + return + + +# Creates a connection through Tor Port +def getUrl(timeout=None): + socket.socket = socks.socksocket + socket.create_connection = create_connection + return + + +# Makes the onion address request +def create_connection(address, timeout=None, source_address=None): + sock = socks.socksocket() + sock.connect(address) + return sock + + +# Returns the name of the mkt (Crypto) +def getMKTName(): + name = 'DarkFox' + return name + + +# Returns credentials needed for the mkt +def getCredentials(): + credentials = 'blank blank blank blank cap 0' + return credentials + + +# Return the link of the mkt (DarkFox Link) +def getFixedURL(): + url = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/' + return url + + +# Closes Tor Browser +def closetor(): + global pid + os.system("taskkill /pid " + str(pid)) + print('Closing Tor...') + time.sleep(3) + return + + +# Creates a Mechanize browser and initializes its options +def createBrowser(): + br = mechanize.Browser() + cj = mechanize.CookieJar() + br.set_cookiejar(cj) + + # Browser options + br.set_handle_equiv( True ) + br.set_handle_redirect( True ) + br.set_handle_referer( True ) + br.set_handle_robots(False) + br.set_handle_refresh( mechanize._http.HTTPRefreshProcessor(), max_time = 1 ) + + br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'), + ('Accept', '*/*')] + + return br + + +def getAccess(loginPage, credentials): + + logInName = credentials.split()[0] + userName = credentials.split()[1] + logInPass = credentials.split()[2] + password = credentials.split()[3] + captchaName = credentials.split()[4] + formId = credentials.split()[5] + + br = createBrowser() + + try: + keepTrying = True + while (keepTrying): + + br.open(loginPage) + time.sleep(7) + html = br.response() + soup = BeautifulSoup(html) + image_tags = soup.findAll('div', {"class": "imgWrap"}) + captchaLink = image_tags[0] + imagelink = captchaLink['style'].split('url(')[1][:-1] + data = br.open(imagelink).read() + br.back() + open('captcha.png', "wb").write(data) + ''' + subprocess.Popen("python capt.py", shell=False) + time.sleep(61) + captchaAnswerFile = open("answer.txt", "r") + captchaAnswer = captchaAnswerFile.read().__str__() + ''' + captchaAnswer = input('Please provide me with captcha : ') + formIndex = int(formId) + br.select_form(nr=formIndex) + #br[logInName] = userName + #br[logInPass] = password + br[captchaName] = captchaAnswer.__str__() + br.submit() + if br.geturl() != 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/': + keepTrying = False + + return br + + except: + + return 'down' + + +# Saves the crawled html page +def savePage(page, url): + filePath = getFullPathName(url) + os.makedirs(os.path.dirname(filePath), exist_ok=True) + a = page.read() + open(filePath, "wb").write(a) + return + + +# Gets the full path of the page to be saved along with its appropriate file name +def getFullPathName(url): + fileName = getNameFromURL(url) + if isDescriptionLink(url): + fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str( + "%02d" % date.today().month) + str("%02d" % date.today().day) + str( + "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' + else: + fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str( + "%02d" % date.today().month) + str("%02d" 
% date.today().day) + str( + "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' + return fullPath + + +# Creates the name of the file based on URL +def getNameFromURL(url): + global counter + name = ''.join(e for e in url if e.isalnum()) + if (name == ''): + name = str(counter) + counter = counter + 1 + return name + + +# Hacking and Markets related topics +def getInterestedLinks(): + links = [] + + # Guides and Tutorials + links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/30739153-1fcd-45cd-b919-072b439c6e06') + # Digital Products + links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781') + # Software and Malware + links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc') + # Services + links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280') + # Miscellaneous + links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb') + # Hosting and Security + links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14') + + # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing1.html') + # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing2.html') + + return links + + +def crawlMkt(url, br): + + print("Crawling the DarkFox marketplace") + + linksToCrawl = getInterestedLinks() + visited = set(linksToCrawl) + initialTime = time.time() + + i = 0 + while i < len(linksToCrawl): + link = linksToCrawl[i] + print('Crawling :', link) + try : + page = br.open(link) + savePage(page, link) + for l in br.links(): + absURL = urlparse.urljoin(l.base_url, l.url) + if absURL not in visited and not isSignOut(absURL) and isListingLink(absURL): + visited.add(absURL) + + #disabling the process of finding other links + #linksToCrawl.append(absURL) + + # crawler asks parser to get links of ALL products on ALL listing pages + list = productPages(link) + j = 0 + for item in list: + if j == 2: + break + #itemURL = baseURL + str(item) + try: + #itemPage = br.open(itemURL) + itemPage = br.open(item) + savePage(itemPage, item) + except: + #print 'Error in page: ', itemURL + print('Error in page: ', item) + j+=1 + + except Exception as e: + print(link, e.message) + i += 1 + + #finalTime = time.time() + #print finalTime - initialTime + + input("Crawling DarkFox marketplace done sucessfully. 
Press ENTER to continue\n") + + return + + +def isDescriptionLink(url): + if 'product' in url: + return True + return False + + +# Returns True if the link is a listingPage link +def isListingLink(url): + if 'category' in url: + return True + return False + + +# calling the parser to define the links +def productPages(url): + + soup = "" + + error = False + try: + html = codecs.open( + r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str( + "%02d" % date.today().month) + str("%02d" % date.today().day) + str( + "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html', encoding='utf8') + soup = BeautifulSoup(html, "html.parser") + except: + try: + html = open( + r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str( + "%02d" % date.today().month) + str("%02d" % date.today().day) + str( + "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html') + soup = BeautifulSoup(html, "html.parser") + except: + error = True + print("There was a problem to read the file " + getNameFromURL(url) + " in the listing section.") + + if error: + return [] + else: + return darkfox_links_parser(soup) + + +# Drop links that "singout" +def isSignOut(url): + #absURL = urlparse.urljoin(url.base_url, url.url) + if 'signout' in url.lower() or 'logout' in url.lower(): + return True + + return False + + +def crawler(): + startCrawling() + #print "Crawling and Parsing Crypto .... DONE!" diff --git a/MarketPlaces/Kingdom/crawler_selenium.py b/MarketPlaces/Kingdom/crawler_selenium.py new file mode 100644 index 0000000..e6b489f --- /dev/null +++ b/MarketPlaces/Kingdom/crawler_selenium.py @@ -0,0 +1,342 @@ +__author__ = 'DarkWeb' + +''' +Kingdom Market Crawler (Selenium) +''' + +from selenium import webdriver +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver.firefox.firefox_profile import FirefoxProfile +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from selenium.webdriver.firefox.service import Service +from selenium.common.exceptions import TimeoutException +from selenium.webdriver.firefox.options import Options +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support.ui import Select +from PIL import Image +import base64 +from io import BytesIO + + +import urllib.parse as urlparse +import os, re, time +from datetime import date +import subprocess +from bs4 import BeautifulSoup +from MarketPlaces.Initialization.prepare_parser import new_parse +from MarketPlaces.Kingdom.parser import kingdom_links_parser +from MarketPlaces.Utilities.utilities import cleanHTML + +counter = 1 +baseURL = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion' + + +# Opens Tor Browser, crawls the website +def startCrawling(): + # marketName = getMarketName() + driver = getAccess() + + if driver != 'down': + try: + captcha(driver) + login(driver) + crawlForum(driver) + except Exception as e: + print(driver.current_url, e) + closeDriver(driver) + + # new_parse(marketName, False) + + +def captcha(driver): + ''' + # wait for captcha page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div/div[1]"))) + + # save captcha to local + driver.find_element(by=By.XPATH, value='/html/body/div/div[2]').screenshot( + 
r'..\Kingdom\captcha1.png') + + # This method will show image in any image viewer + im = Image.open(r'..\Kingdom\captcha1.png') + im.show() + + iframes = driver.find_elements(by=By.TAG_NAME, value='iframe') + + # ask user input captcha solution in terminal + print("Enter squares from smallest to largest (squares are numbered 1-9 left to right)") + for order in ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']: + id = input(f"{order}: ") + iframes[int(id)-1].click() + ''' + input("Press ENTER when CAPTCHA is completed\n") + + # wait for login page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button"))) + + +# Login using premade account credentials and do login captcha manually +def login(driver): + # wait for login page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button"))) + + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-name"]') + # Username here + usernameBox.send_keys('blabri') + passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-passwd"]') + # Password here + passwordBox.send_keys('fishowal') + + select = Select(driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-sessiontime"]')) + select.select_by_visible_text('24 hours') + + ''' + # wait for captcha page show up + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="captcha"]'))) + + # save captcha to local + driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Kingdom\captcha2.png') + + # This method will show image in any image viewer + im = Image.open(r'..\Kingdom\captcha2.png') + im.show() + + # wait until input space show up + inputBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-captcha"]') + + # ask user input captcha solution in terminal + userIn = input("Enter solution: ") + + # send user solution into the input space + inputBox.send_keys(userIn) + + # click the verify(submit) button + driver.find_element(by=By.XPATH, value="/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button").click() + ''' + input("Press ENTER when CAPTCHA is completed\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 50).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/div/div/div[3]/div[2]'))) + + +# Returns the name of the website +def getMarketName(): + name = 'Kingdom' + return name + + +# Return the link of the website +def getFixedURL(): + url = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion' + + return url + + +# Closes Tor Browser +def closeDriver(driver): + # global pid + # os.system("taskkill /pid " + str(pro.pid)) + # os.system("taskkill /t /f /im tor.exe") + print('Closing Tor...') + driver.close() + time.sleep(3) + return + + +# Creates FireFox 'driver' and configure its 'Profile' +# to use Tor proxy and socket +def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) + + ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) + ff_prof.set_preference("places.history.enabled", False) + ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", 
True) + ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) + ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) + ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) + ff_prof.set_preference("signon.rememberSignons", False) + ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + ff_prof.set_preference("network.dns.disablePrefetch", True) + ff_prof.set_preference("network.http.sendRefererHeader", 0) + ff_prof.set_preference("permissions.default.image", 3) + ff_prof.set_preference("browser.download.folderList", 2) + ff_prof.set_preference("browser.download.manager.showWhenStarting", False) + ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") + ff_prof.set_preference('network.proxy.type', 1) + ff_prof.set_preference("network.proxy.socks_version", 5) + ff_prof.set_preference('network.proxy.socks', '127.0.0.1') + ff_prof.set_preference('network.proxy.socks_port', 9150) + ff_prof.set_preference('network.proxy.socks_remote_dns', True) + ff_prof.set_preference("javascript.enabled", False) + ff_prof.update_preferences() + + service = Service(config.get('TOR', 'geckodriver_path')) + + driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + + driver.maximize_window() + + return driver + + +def getAccess(): + url = getFixedURL() + driver = createFFDriver() + try: + driver.get(url) + return driver + except: + driver.close() + return 'down' + + +# Saves the crawled html page +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) + filePath = getFullPathName(url) + os.makedirs(os.path.dirname(filePath), exist_ok=True) + open(filePath, 'wb').write(cleanPage.encode('utf-8')) + return + + +# Gets the full path of the page to be saved along with its appropriate file name +def getFullPathName(url): + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") + fileName = getNameFromURL(url) + if isDescriptionLink(url): + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') + else: + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') + return fullPath + + +# Creates the file name from passed URL +def getNameFromURL(url): + global counter + name = ''.join(e for e in url if e.isalnum()) + if (name == ''): + name = str(counter) + counter = counter + 1 + return name + + +def getInterestedLinks(): + links = [] + + # Software and Malware + links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=c298a77d9e93ad32') + # # Services + # links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=c298a77d9e93ad32') + # # Exploits + # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=45') + # # Tools + # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=46') + # # Malware + # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=47') + # # Cryptography + # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=48') + # # Others + # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=49') + # # Hacking Tutorials + # 
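# (Added sketch, not part of the original diff.) The profile built in createFFDriver above routes all
# traffic through the SOCKS proxy that Tor Browser exposes on 127.0.0.1:9150; a minimal standalone
# version of just that proxy wiring, using Selenium's Firefox Options (everything else is assumed):
from selenium.webdriver.firefox.options import Options

def tor_options_sketch():
    opts = Options()
    opts.set_preference('network.proxy.type', 1)                 # 1 = manual proxy configuration
    opts.set_preference('network.proxy.socks', '127.0.0.1')
    opts.set_preference('network.proxy.socks_port', 9150)        # Tor Browser's default SOCKS port
    opts.set_preference('network.proxy.socks_remote_dns', True)  # let Tor resolve .onion names
    return opts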
links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=50') + # # Hacked Accounts and Database Dumps + # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30') + # # Android Moded pak + # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53') + + return links + + +def crawlForum(driver): + print("Crawling the Kingdom market") + + linksToCrawl = getInterestedLinks() + + i = 0 + while i < len(linksToCrawl): + link = linksToCrawl[i] + print('Crawling :', link) + try: + has_next_page = True + count = 0 + + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(driver, html, link) + + list = productPages(html) + for item in list: + itemURL = urlparse.urljoin(baseURL, str(item)) + try: + driver.get(itemURL) + except: + driver.refresh() + savePage(driver, driver.page_source, item) + driver.back() + + # comment out + break + + # comment out + if count == 1: + break + + try: + temp = driver.find_element(by=By.XPATH, value= + '/html/body/div/div/div[3]/div[2]/div[2]/div/div/ul') + next = temp.find_element_by_class_name("next") + link = link.find_element_by_tag_name('a').get_attribute('href') + if link == "": + raise NoSuchElementException + count += 1 + + except NoSuchElementException: + has_next_page = False + + except Exception as e: + print(link, e) + i += 1 + + input("Crawling Kingdom Market done sucessfully. Press ENTER to continue\n") + + +# Returns 'True' if the link is Topic link +def isDescriptionLink(url): + if 'view' in url: + return True + return False + + +# Returns True if the link is a listingPage link +def isListingLink(url): + if 'category' in url: + return True + return False + + +# calling the parser to define the links +def productPages(html): + soup = BeautifulSoup(html, "html.parser") + #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) + return kingdom_links_parser(soup) + + +def crawler(): + startCrawling() + # print("Crawling and Parsing BestCardingWorld .... 
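# (Added note and sketch, not part of the original diff.) In crawlForum above, the next-page step reads
#   next = temp.find_element_by_class_name("next")
#   link = link.find_element_by_tag_name('a').get_attribute('href')
# where `link` is still the current URL string, so the freshly located `next` element is never used; the
# intent is presumably to read the anchor inside that element. A hedged correction, reusing the XPath
# from the code above and the By-based API used elsewhere in this file:
#   nav = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[3]/div[2]/div[2]/div/div/ul')
#   next_li = nav.find_element(by=By.CLASS_NAME, value='next')
#   link = next_li.find_element(by=By.TAG_NAME, value='a').get_attribute('href')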
DONE!") diff --git a/MarketPlaces/Kingdom/parser.py b/MarketPlaces/Kingdom/parser.py new file mode 100644 index 0000000..b1e05d5 --- /dev/null +++ b/MarketPlaces/Kingdom/parser.py @@ -0,0 +1,188 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +from MarketPlaces.Utilities.utilities import * + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + + +# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) +def kingdom_description_parser(soup): + + # Fields to be parsed + + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + + # Finding Product Name + + tag = soup.find('div', {"class": "col-md-9"}) + + desc = tag.find('div',{"class": "col-md-8"}).find('div', {"class": "box-cont"}) + name = tag.find('div',{"class": "col-md-8"}).find('div', {"class": "box-head"}).text + name = name.replace('\n', ' ') + name = name.replace(',', ' ') + name = name.strip() + + # Finding Prices + # Kingdom prices can be shown in a variety of currencies, not all in USD, so keeping currency + rows = desc.find_all('div', {"class", "row"}, recursive=False) + price = rows[-1].find('div', {"class": "row"}).find('h3').text + price = price.replace(',', '') + price = price.strip() + # USD = price.replace("USD",'') + BTC = rows[-1].find('div', {"class": "row"}).find_next_sibling('div').find('span').text + + # Finding Vendor + vendor = rows[0].select_one('a[href^="/user"]').text + vendor = vendor.replace(",", " ") + vendor = vendor.strip() + + # Finding Shipment Information (Origem) + descs = rows[0].find_all('div', {"class": "col-md-3 text-right"}) + shipFrom = descs[2].text + shipFrom = shipFrom.replace(",", "") + shipFrom = shipFrom.strip() + + # Finding Shipment Information (Destiny) + shipTo = rows[-1].find('div', {"class": "col-md-6"}).text + shipTo = shipTo.replace("Ship to:","") + shipTo = shipTo.replace(",","").strip() + if(shipTo == ''): + shipTo = -1 + + # Finding the Product Category + category = descs[0].text + category = category.replace(",", "") + category = category.strip() + + # Finding the Product Quantity Available + left = descs[1].text + left = left.replace(",", "") + left = left.strip() + + # Finding when the Product was Added + dt = descs[-1].text.strip() + addDate = datetime.strptime(dt, '%d.%m.%Y') + + # Finding the Product description + describe = cleanString(soup.find('div', {"id": "descriptionContent"}).text) + + # Finding the Number of Product Reviews + review = len(soup.find('div', {"id": "feedbackContent"}).find_all(recursive=False)) + + # Searching for CVE and MS categories + # no cve or ms in Kingdom + + # Populating the final variable (this should be a list with 
all fields scraped) + + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo) + + # Sending the results + + return row + + +def kingdom_listing_parser(soup): + + # Fields to be parsed + nm = 0 # *Total_Products (Should be Integer) + mktName = "Kingdom" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = [] # 6 Product_MS_Classification (Microsoft Security) + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft =[] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + href = [] # 20 Product_Links + + listing = soup.find('div', {"id": "p0"}).find('div').find_all('div', {"class": "row"}, recursive=False) + + # Populating the Number of Products + nm = len(listing) + + for a in listing: + + # Finding Prices + #in array USD, there may be prices not in USD, so includes currency as well + prices = a.find('div', {"class": "col-md-3"}) + u = prices.find('h3').text + u = u.strip() + u = u.replace(',', '') + u = u.strip() + USD.append(u) + bc = prices.find('div').find('span').text + BTC.append(bc) + + # Finding the Product + product = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]').text + product = product.replace('\n', ' ') + product = product.replace(","," ") + product = product.strip() + name.append(product) + + # Finding the Vendor + vendor_name = a.select_one('a[href^="/user"]').text + vendor_name = vendor_name.replace(",", " ").replace('/', '') + vendor_name = vendor_name.strip() + vendor.append(vendor_name) + + # Adding the url to the list of urls + link = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]')['href'] + link = cleanLink(link) + href.append(link) + + # Searching for CVE and MS categories + # cve and ms not in kingdom + + # Populate the final variable (this should be a list with all fields scraped) + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + + +def kingdom_links_parser(soup): + + # Returning all links that should be visited by the Crawler + + href = [] + + listing = soup.findAll('div', {"class": "col-md-7"}) + + for a in listing: + link = a.select_one('a[href^="/offer/view?"]') + link = link['href'] + href.append(link) + + return href \ No newline at end of file diff --git a/MarketPlaces/LionMarketplace/parser.py b/MarketPlaces/LionMarketplace/parser.py deleted file mode 100644 index 3b5dc27..0000000 --- a/MarketPlaces/LionMarketplace/parser.py +++ /dev/null @@ -1,235 +0,0 @@ -__author__ = 'Helium' - -# Here, we are importing the auxiliary functions to clean or convert data -from MarketPlaces.Utilities.utilities import * - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup - - -#parses description pages, so takes html 
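# (Added illustration, not part of the original diff.) The Kingdom parsers above lean on BeautifulSoup's
# CSS attribute-prefix selectors; a tiny self-contained example of that mechanism (markup invented here):
from bs4 import BeautifulSoup

snippet = '<div class="col-md-7"><a href="/offer/view?id=42">offer</a><a href="/user/abc">seller</a></div>'
cell = BeautifulSoup(snippet, 'html.parser')
offer_link = cell.select_one('a[href^="/offer/view?"]')['href']   # -> '/offer/view?id=42'
vendor_name = cell.select_one('a[href^="/user"]').text            # -> 'seller'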
pages of description pages using soup object, and parses it for info it needs -#stores info it needs in different lists, these lists are returned after being organized -#@param: soup object looking at html page of description page -#return: 'row' that contains a variety of lists that each hold info on the description page -def lionmarketplace_description_parser(soup): - - # Fields to be parsed - - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo - image = "-1" # 19 Product_Image - vendor_image = "-1" # 20 Vendor_Image - - # vendor name - temp = soup.find('div', {'class': 'btn-group'}).find('a').text - vendor = (cleanString(temp.strip())) - - # table with info - table = soup.find('table') - rows = table.findAll('tr') - - # vendor rating - pos = soup.find('span', {"class": "fas fa-plus-circle text-success"}).parent.text - pos = int(pos.strip()) - neu = soup.find('span', {"class": "fas fa-stop-circle text-secondary"}).parent.text - neu = int(neu.strip()) - neg = soup.find('span', {"class": "fas fa-minus-circle text-danger"}).parent.text - neg = int(neg.strip()) - total = pos + neu + neg - if total > 0: - rating_vendor = str((pos + 0.5*neu) / total) - - # product name - temp = soup.find('div', {'class', 'row'}).find('h2').text - name = (cleanString(temp.strip())) - - # product description - temp = soup.find('div', {'class': "mt-4"}).contents[-1] - describe = cleanString(temp.strip()) - - # Finding Product Image - image = soup.find('div', {'id': 'slide-1'}).find('img') - image = image.get('src') - image = image.split('base64,')[-1] - - full = rows[0].findAll('i', {"class": "fas fa-star"}) - half = rows[0].find('i', {"class": "fas fa-star-half-alt"}) - rating_item = len(full) - if half is not None: - rating_item += 0.5 - rating_item = str(rating_item) - - # USD selling price - temp = rows[2].find('strong').text - if " $" in temp: - temp = temp.replace(" $", "") - elif "$" in temp: - temp = temp.replace("$", "") - USD = cleanString((temp.strip())) - - # product sold - temp = rows[4].find('td') - if temp is not None and cleanString(temp.text.strip()) == 'Left/Sold': - temp = rows[4].findAll('td') - temp = temp[1].findAll('span') - - # left - sold = temp[1].text - left = temp[0].text - - sold = cleanNumbers(sold.strip()) - left = cleanNumbers(left.strip()) - else: - sold = '-1' - left = "-1" - - # Populating the final variable (this should be a list with all fields scraped) - row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) - - # Sending the results - return row - - -#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs -#stores 
info it needs in different lists, these lists are returned after being organized -#@param: soup object looking at html page of listing page -#return: 'row' that contains a variety of lists that each hold info on the listing page -def lionmarketplace_listing_parser(soup): - - # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "LionMarketplace" # 0 *Marketplace_Name - vendor = [] # 1 *Vendor y - rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) - category = [] # 7 Product_Category y - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice y - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft =[] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - image = [] # 20 Product_Image - image_vendor = [] # 21 Vendor_Image - href = [] # 22 Product_Links - - listings = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"}) - - # Populating the Number of Products - nm = len(listings) - - for listing in listings: - - a = listing.find('div', {"class": "card-body"}) - row = a.findAll('p') - - # vendor - temp = row[3].text - temp = temp.replace("Vendor:", "") - vendor.append(cleanString(temp.strip())) - - image_vendor.append("-1") - - # vendor rating - rating_vendor.append("-1") - - # successful transactions CHECK AGAIN HERE - success.append("-1") - - # product name - temp = a.find('a').text - name.append(cleanString(temp.strip())) - - # Finding Product Image - product_image = listing.find('img', {'class': 'card-img-top rounded'}) - product_image = product_image.get('src') - product_image = product_image.split('base64,')[-1] - image.append(product_image) - - CVE.append('-1') - MS.append('-1') - - # product category - temp = row[1].text - temp = temp.replace("Category: ", "") - category.append(cleanString(temp.strip())) - - describe.append('-1') - - # product views - vnum = listing.find('p', {"class": "position-absolute bg-primary opacity-60 text-white mt-4 mr-5 pr-1"}).text - views.append(cleanNumbers(vnum.strip())) - - reviews.append('-1') # 10 Product_Number_Of_Reviews - rating_item.append('-1') # 11 Product_Rating - addDate.append('-1') # 12 Product_AddDate - - # BTC - BTC.append('-1') - - # USD - temp = row[0].find('strong').text - USD.append(cleanNumbers(temp.strip())) # 14 Product_USD_SellingPrice - - EURO.append("-1") # 15 Product_EURO_SellingPrice - - # product sold - sold.append("-1") - - qLeft.append('-1') # 17 Product_QuantityLeft - shipFrom.append('-1') # 18 Product_ShippedFrom - shipTo.append('-1') # 19 Product_ShippedTo - - # href - temp = a.find('a').get('href') - href.append(temp) - - # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) - - -#called by the crawler to get description links on a listing page -#@param: beautifulsoup object that is using the correct html page (listing page) 
-#return: list of description links from a listing page -def lionmarketplace_links_parser(soup): - - # Returning all links that should be visited by the Crawler - - href = [] - listings = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"}) - - for listing in listings: - a = listing.find('div', {"class": "card-body"}) - bae = a.find('a', href=True) - link = bae['href'] - href.append(link) - - return href \ No newline at end of file diff --git a/MarketPlaces/MetaVerseMarket/crawler_selenium.py b/MarketPlaces/MetaVerseMarket/crawler_selenium.py deleted file mode 100644 index 44eb335..0000000 --- a/MarketPlaces/MetaVerseMarket/crawler_selenium.py +++ /dev/null @@ -1,291 +0,0 @@ -__author__ = 'Helium' - -''' -MetaVerseMarket Marketplace Crawler (Selenium) -''' - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary -from selenium.webdriver.firefox.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.common.by import By - -from PIL import Image -import urllib.parse as urlparse -import os, re, time -from datetime import date -import subprocess -import configparser -from bs4 import BeautifulSoup -from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.MetaVerseMarket.parser import metaversemarket_links_parser -from MarketPlaces.Utilities.utilities import cleanHTML - -counter = 1 -baseURL = 'http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/login' - - -# Opens Tor Browser, crawls the website, then parses, then closes tor -#acts like the main method for the crawler, another function at the end of this code calls this function later -def startCrawling(): - mktName = getMKTName() - driver = getAccess() - - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closeDriver(driver) - - new_parse(mktName, baseURL, True) - - -# Returns the name of the website -#return: name of site in string type -def getMKTName(): - name = 'MetaVerseMarket' - return name - - -# Return the base link of the website -#return: url of base site in string type -def getFixedURL(): - url = 'http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/login' - return url - - -# Closes Tor Browser -#@param: current selenium driver -def closeDriver(driver): - # global pid - # os.system("taskkill /pid " + str(pro.pid)) - # os.system("taskkill /t /f /im tor.exe") - print('Closing Tor...') - driver.close() - time.sleep(3) - return - - -# Creates FireFox 'driver' and configure its 'Profile' -# to use Tor proxy and socket -def createFFDriver(): - from MarketPlaces.Initialization.markets_mining import config - - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - - ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - 
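# The preferences below disable DNS prefetching, Referer headers and JavaScript, restrict image loading, and route all traffic (including DNS lookups) through the local Tor SOCKS5 proxy at 127.0.0.1:9150. -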
ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 3) - ff_prof.set_preference("browser.download.folderList", 2) - ff_prof.set_preference("browser.download.manager.showWhenStarting", False) - ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") - ff_prof.set_preference('network.proxy.type', 1) - ff_prof.set_preference("network.proxy.socks_version", 5) - ff_prof.set_preference('network.proxy.socks', '127.0.0.1') - ff_prof.set_preference('network.proxy.socks_port', 9150) - ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", False) - ff_prof.update_preferences() - - service = Service(config.get('TOR', 'geckodriver_path')) - - driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - - driver.maximize_window() - - return driver - - -#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' -#return: return the selenium driver or string 'down' -def getAccess(): - url = getFixedURL() - driver = createFFDriver() - try: - driver.get(url) - return driver - except: - driver.close() - return 'down' - - -# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha -# then allows for manual solving of captcha in the terminal -#@param: current selenium web driver -def login(driver): - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '//*[@id="username"]'))) - - # entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') - # Username here - usernameBox.send_keys('metotomoto') - passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') - # Password here - passwordBox.send_keys('lionking_kumba1ya') - - input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '//*[@id="searchq"]'))) - -# Saves the crawled html page, makes the directory path for html pages if not made -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - open(filePath, 'wb').write(cleanPage.encode('utf-8')) - return - - -# Gets the full path of the page to be saved along with its appropriate file name -#@param: raw url as crawler crawls through every site -def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") - fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') - return fullPath - - -# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned -#@param: raw url as crawler crawls through every site -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if (name == ''): - name = str(counter) - counter = counter + 1 - return name - - -# returns list of 
urls, here is where you can list the different urls of interest, the crawler runs through this list -#in this example, there are a couple of categories some threads fall under such as -# Guides and Tutorials, Digital Products, and Software and Malware -#as you can see they are categories of products -def getInterestedLinks(): - links = [] - - # software and malware - links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/softwares-and-malwares') - # guides and tutorials - links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/guides-and-tutorials') - # services - links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/services') - - return links - - -# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through -#topic and description pages are crawled through here, where both types of pages are saved -#@param: selenium driver -def crawlForum(driver): - print("Crawling the MetaVerse market") - - linksToCrawl = getInterestedLinks() - - i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) - try: - has_next_page = True - count = 0 - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(driver, html, link) - - list = productPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver, driver.page_source, item) - driver.back() - - # # comment out - # break - # - # # comment out - # if count == 1: - # break - - try: - link = driver.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href') - if link.endswith('#') or link == "": - raise NoSuchElementException - count += 1 - - except NoSuchElementException: - has_next_page = False - - except Exception as e: - print(link, e) - i += 1 - - print("Crawling the MetaVerse market done.") - - -# Returns 'True' if the link is a description link -#@param: url of any url crawled -#return: true if is a description page, false if not -def isDescriptionLink(url): - if 'PR' in url: - return True - return False - - -# Returns True if the link is a listingPage link -#@param: url of any url crawled -#return: true if is a Listing page, false if not -def isListingLink(url): - if 'products' in url: - return True - return False - - -# calling the parser to define the links, the html is the url of a link from the list of interested link list -#@param: link from interested link list ie. getInterestingLinks() -#return: list of description links that should be crawled through -def productPages(html): - soup = BeautifulSoup(html, "html.parser") - return metaversemarket_links_parser(soup) - - -# Drop links that "signout" -# def isSignOut(url): -# #absURL = urlparse.urljoin(url.base_url, url.url) -# if 'signout' in url.lower() or 'logout' in url.lower(): -# return True -# -# return False - - -def crawler(): - startCrawling() - # print("Crawling and Parsing MetaVerseMarket .... 
DONE!") diff --git a/MarketPlaces/MetaVerseMarket/parser.py b/MarketPlaces/MetaVerseMarket/parser.py deleted file mode 100644 index 5c12390..0000000 --- a/MarketPlaces/MetaVerseMarket/parser.py +++ /dev/null @@ -1,269 +0,0 @@ -__author__ = 'DarkWeb' - -# Here, we are importing the auxiliary functions to clean or convert data -from MarketPlaces.Utilities.utilities import * - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup - - -# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs -# stores info it needs in different lists, these lists are returned after being organized -# @param: soup object looking at html page of description page -# return: 'row' that contains a variety of lists that each hold info on the description page -def metaversemarket_description_parser(soup): - # Fields to be parsed - - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo - image = "-1" # 19 Product_Image - vendor_image = "-1" # 20 Vendor_Image - - # Finding Product Name - name = soup.find('div', {'class': "panel-heading"}).text - name = cleanString(name.strip()) - - temp = soup.findAll('div', {'class': "col-xs-12 col-sm-6 mt-5"}) - - # Finding Product Image - image = temp[0].find('img') - image = image.get('src') - image = image.split('base64,')[-1] - - # Finding Vendor - temp = temp[1].findAll('span') - vendor = temp[1].find('b').text - vendor = cleanString(vendor.strip()) - - # Finding Vendor Rating - pos = soup.find('span', {'class': "badge bg-success fs-12px"}).text - pos = int(cleanNumbers(pos).strip()) - neg = soup.find('span', {'class': "badge bg-danger fs-12px"}).text - neg = int(cleanNumbers(neg).strip()) - total = pos + neg - if total > 0: - rating_vendor = str(pos / total) - - # Finding Prices - USD = soup.find('h3', {'class': "mb-2"}).text - USD = cleanNumbers(USD).strip() - - # Finding the Product Category - temp = soup.select('div[class="mt-2"]')[1].text - temp = temp.replace("Category:", "") - category = temp.strip() - - # Finding Number of Views - views = soup.find('button', {"class": "btn btn-secondary text-center w-33 fw-bold"}).text - views = views.strip() - - # Finding the Product Quantity Available - temp = soup.find('button', {"class": "btn btn-success text-center w-33 fw-bold"}).text - temp = temp.split("/") - left = temp[1].strip() - - # Finding Number Sold - sold = temp[0].strip() - - # Finding Shipment Information (Origin) - temp = soup.find('div', {'class': "alert alert-info"}).text - temp = temp.split("to") - shipFrom = temp[0].replace("Shipping from ", "").strip() - - # Finding Shipment Information (Destination) - shipTo = temp[1].split("for") - shipTo = shipTo[0].strip() - - # Finding 
the Product description - describe = soup.find('p', {'class': "card-text"}).text - describe = cleanString(describe.strip()) - - # Searching for CVE and MS categories - cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if cve: - CVE = " " - for idx in cve: - CVE += (idx) - CVE += " " - CVE = CVE.replace(',', ' ') - CVE = CVE.replace('\n', '') - ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}')) - if ms: - MS = " " - for im in ms: - MS += (im) - MS += " " - MS = MS.replace(',', ' ') - MS = MS.replace('\n', '') - - # Populating the final variable (this should be a list with all fields scraped) - row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) - - # Sending the results - return row - - -# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs -# stores info it needs in different lists, these lists are returned after being organized -# @param: soup object looking at html page of listing page -# return: 'row' that contains a variety of lists that each hold info on the listing page -def metaversemarket_listing_parser(soup): - # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "MetaVerseMarket" # 0 *Marketplace_Name - vendor = [] # 1 *Vendor y - rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this - MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this - category = [] # 7 Product_Category y - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice y - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft = [] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - image = [] # 20 Product_Image - image_vendor = [] # 21 Vendor_Image - href = [] # 22 Product_Links - - listing = soup.findAll('div', {"class": "col-12 col-sm-4 col-xl-3 product_item_col p-1"}) - - # Populating the Number of Products - nm = len(listing) - - for a in listing: - bae = a.findAll('a', href=True) - - # Adding the url to the list of urls - link = bae[0].get('href') - link = cleanLink(link) - href.append(link) - - # Finding the Product - product = bae[1].find('span', {"class": "text-primary"}).text - name.append(cleanString(product.strip())) - - # Finding Prices - price = a.find('strong').text - USD.append(cleanNumbers(price).strip()) - - # Finding the Vendor - temp = a.find('div', {'class': "mt-1 fs-12px"}) - temp = temp.findAll('span') - vendor_name = temp[1].find('b').text - vendor.append(cleanString(vendor_name.strip())) - - # Finding the Category - cat = a.select_one('div[class="fs-12px"]') - cat = cat.findAll('span')[1].text - cat = cat.strip() - category.append(cat) - - ul = a.find('ul', {"class": "product-actions"}) - - # Finding Number Sold and Quantity Left - temp = ul.find('span', {'class': "badge bg-success"}).text - temp = temp.split("/") - num = temp[0] - num = num.replace('k', '000') - sold.append(cleanNumbers(num).strip()) - - quant = temp[1] - quant = quant.replace('k', '000') - 
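# The green "badge bg-success" text reads "sold/left"; both counts may carry a 'k' suffix, which is expanded to thousands ('k' -> '000') before the values are cleaned and stored. -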
qLeft.append(cleanNumbers(quant).strip()) - - # Finding Descrption - # description = a.find('p', {'class': "alert alert-light text-ssbold p-1"}).text - # description = description.replace("\n", " ") - # description = description.strip() - # describe.append(cleanString(description)) - - # Finding Number of Views - view = ul.find('span', {'class': "badge bg-primary"}).text - view = view.replace('.', '') - view = view.replace('K', '000') - views.append(view.strip()) - - # Find where ships from - ships = a.find('div', {'class': "alert alert-info item_alert fs-12px p-1"}) - ships = ships.findAll('b') - sFrom = ships[0].text.strip() - shipFrom.append(sFrom) - - # Find where it ships to - sTo = ships[1].text.strip() - shipTo.append(sTo) - - # Searching for CVE and MS categories - cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if not cve: - cveValue = "-1" - else: - cee = " " - for idx in cve: - cee += (idx) - cee += " " - cee = cee.replace(',', ' ') - cee = cee.replace('\n', '') - cveValue = cee - CVE.append(cveValue) - - ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) - if not ms: - MSValue = "-1" - else: - me = " " - for im in ms: - me += (im) - me += " " - me = me.replace(',', ' ') - me = me.replace('\n', '') - MSValue = me - MS.append(MSValue) - - # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) - - -# called by the crawler to get description links on a listing page -# @param: beautifulsoup object that is using the correct html page (listing page) -# return: list of description links from a listing page -def metaversemarket_links_parser(soup): - # Returning all links that should be visited by the Crawler - - href = [] - listing = soup.findAll('div', {"class": "col-12 col-sm-4 col-xl-3 product_item_col p-1"}) - - for a in listing: - bae = a.find('a', href=True) - link = bae['href'] - href.append(link) - - return href \ No newline at end of file diff --git a/MarketPlaces/Nexus/crawler_selenium.py b/MarketPlaces/Nexus/crawler_selenium.py deleted file mode 100644 index bd76f59..0000000 --- a/MarketPlaces/Nexus/crawler_selenium.py +++ /dev/null @@ -1,289 +0,0 @@ -__author__ = 'Helium' - -''' -Nexus Market Crawler (Selenium) -''' - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary -from selenium.webdriver.firefox.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.common.by import By - -from PIL import Image -import urllib.parse as urlparse -import os, re, time -from datetime import date -import subprocess -import configparser -from bs4 import BeautifulSoup -from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.Nexus.parser import nexus_links_parser -from MarketPlaces.Utilities.utilities import cleanHTML - -counter = 1 -baseURL = 'http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion' - - -# Opens Tor Browser, crawls the website, then parses, then closes tor -#acts like the main method for the crawler, another function at the end of this code calls this function later -def startCrawling(): - mktName = 
getMKTName() - driver = getAccess() - - if driver != 'down': - try: - input("Press ENTER when page loads after DDOS protection") - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closeDriver(driver) - - new_parse(mktName, baseURL, True) - - -# Returns the name of the website -#return: name of site in string type -def getMKTName(): - name = 'Nexus' - return name - - -# Return the base link of the website -#return: url of base site in string type -def getFixedURL(): - url = 'http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion' - return url - - -# Closes Tor Browser -#@param: current selenium driver -def closeDriver(driver): - # global pid - # os.system("taskkill /pid " + str(pro.pid)) - # os.system("taskkill /t /f /im tor.exe") - print('Closing Tor...') - driver.close() - time.sleep(3) - return - - -# Creates FireFox 'driver' and configure its 'Profile' -# to use Tor proxy and socket -def createFFDriver(): - from MarketPlaces.Initialization.markets_mining import config - - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - - ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - # ff_prof.set_preference("network.dns.disablePrefetch", True) - # ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 3) - ff_prof.set_preference("browser.download.folderList", 2) - ff_prof.set_preference("browser.download.manager.showWhenStarting", False) - ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") - ff_prof.set_preference('network.proxy.type', 1) - ff_prof.set_preference("network.proxy.socks_version", 5) - ff_prof.set_preference('network.proxy.socks', '127.0.0.1') - ff_prof.set_preference('network.proxy.socks_port', 9150) - ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", True) - ff_prof.update_preferences() - - service = Service(config.get('TOR', 'geckodriver_path')) - - driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - - driver.maximize_window() - - return driver - - -#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' -#return: return the selenium driver or string 'down' -def getAccess(): - url = getFixedURL() - driver = createFFDriver() - try: - driver.get(url) - return driver - except: - driver.close() - return 'down' - - -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - open(filePath, 'wb').write(cleanPage.encode('utf-8')) - return - - -# Gets the full path of the page to be saved along with its appropriate file name -#@param: raw url as crawler crawls through every site -def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") - fileName = getNameFromURL(url) - if 
isListingLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') - else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - return fullPath - - -# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned -#@param: raw url as crawler crawls through every site -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if (name == ''): - name = str(counter) - counter = counter + 1 - return name - - -# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list -#in this example, there are a couple of categories some threads fall under such as -# Guides and Tutorials, Digital Products, and Software and Malware -#as you can see they are categories of products -def getInterestedLinks(): - links = [] - - # malware - links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/') - # hacking-spam - links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/hacking-spam/') - # hacking services - links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/hacking/') - # programming services - links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/programacao/') - # remote admin services - links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/administracao-remota/') - # hacking guides - links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-hacking/') - # malware guides - links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-malware/') - # fraud guides - links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-fraudes/') - # fraud software - links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/fraudes/software-de-fraude/') - - return links - - -# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through -#topic and description pages are crawled through here, where both types of pages are saved -#@param: selenium driver -def crawlForum(driver): - print("Crawling the Nexus market") - - linksToCrawl = getInterestedLinks() - - i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) - try: - has_next_page = True - count = 0 - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - - # waiting for btc price to load - try: - WebDriverWait(driver, 1).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[1]/div[2]/div/div/main/ul/li[1]/div/span/span[3]"))) - time.sleep(5) - except: - pass - - html = driver.page_source - savePage(driver, html, link) - - list = productPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - - # waiting for btc price to load - try: - WebDriverWait(driver, 1).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[1]/div[2]/div/div/main/div[3]/div[2]/p/span[3]"))) - except: - pass - - savePage(driver, driver.page_source, item) - driver.back() - 
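# The commented-out "break" blocks below appear to be debugging limits (crawl only the first item or the first couple of pages); otherwise pagination follows the '→' link until it is missing or empty.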
- # # comment out - # break - # - # # comment out - # if count == 1: - # break - - try: - link = driver.find_element(by=By.LINK_TEXT, value='→').get_attribute('href') - if link == "": - raise NoSuchElementException - count += 1 - - except NoSuchElementException: - has_next_page = False - - except Exception as e: - print(link, e) - i += 1 - - print("Crawling the Nexus market done.") - - -# Returns 'True' if the link is a description link -#@param: url of any url crawled -#return: true if is a description page, false if not -def isDescriptionLink(url): - if 'produto' in url: - return True - return False - - -# Returns True if the link is a listingPage link -#@param: url of any url crawled -#return: true if is a Listing page, false if not -def isListingLink(url): - if 'categoria-produto' in url: - return True - return False - - -# calling the parser to define the links, the html is the url of a link from the list of interested link list -#@param: link from interested link list ie. getInterestingLinks() -#return: list of description links that should be crawled through -def productPages(html): - soup = BeautifulSoup(html, "html.parser") - return nexus_links_parser(soup) - -def crawler(): - startCrawling() - # print("Crawling and Parsing Nexus .... DONE!") - diff --git a/MarketPlaces/Nexus/parser.py b/MarketPlaces/Nexus/parser.py deleted file mode 100644 index 107a80a..0000000 --- a/MarketPlaces/Nexus/parser.py +++ /dev/null @@ -1,236 +0,0 @@ -__author__ = 'DarkWeb' - -# Here, we are importing the auxiliary functions to clean or convert data -from MarketPlaces.Utilities.utilities import * - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup - -import re - -usd_to_brl_r = None - - -#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs -#stores info it needs in different lists, these lists are returned after being organized -#@param: soup object looking at html page of description page -#return: 'row' that contains a variety of lists that each hold info on the description page -def nexus_description_parser(soup): - - # Fields to be parsed - - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo - image = "-1" # 19 Product_Image - vendor_image = "-1" # 20 Vendor_Image - - - #finding the name of the product - name_of_product = soup.find("h1", {"class": "product_title entry-title"}).text - name = cleanString(name_of_product.strip()) - - # Finding USD Price - real = soup.find('span', {"class": "price"}).find('bdi').text - real = real.split(',') - whole = cleanNumbers(real[0]).replace('.', '') - real = whole + '.' 
+ real[1] - usd = float(real) / usd_to_brl_r - USD = str(round(usd, 2)) - - # Find the BTC Price - prices = soup.find('p', {"class": "price"}).findAll('span', {"class": "cs"}) - if len(prices) > 0: - BTC = prices[0].text - BTC = cleanNumbers(BTC.strip()) - - # finding the description of the product - description_div = soup.find("div", {"class": "woocommerce-product-details__short-description"}) - if description_div is None: - describe = "-1" - else: - describe = cleanString(description_div.text.strip()) - - # Finding Product Image - image = soup.find('div', {'class': 'woocommerce-product-gallery__wrapper'}).find('img') - image = image.get('src') - image = image.split('base64,')[-1] - - #find the category of the product - name_of_category = soup.find("span", {"class": "posted_in"}).find("a").text - category = cleanString(name_of_category.strip()) - - #finding the name of the vendor - name_of_vendor = soup.find("div", {"class": "dokan-vendor-name"}).find("h5").text - vendor = cleanString(name_of_vendor) - - #finding the vendor's rating - vendorRating = soup.find("div", {"class": "dokan-vendor-rating"}).find("p").text - rating_vendor = cleanString(vendorRating) - #everything else gets a -1 because they are not found - - # Populating the final variable (this should be a list with all fields scraped) - row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) - - - # Sending the results - return row - - -#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs -#stores info it needs in different lists, these lists are returned after being organized -#@param: soup object looking at html page of listing page -#return: 'row' that contains a variety of lists that each hold info on the listing page -def nexus_listing_parser(soup): - - global usd_to_brl_r - while usd_to_brl_r is None: - try: - usd_to_brl_r = float(input("1 US Dollar = (Brazilian Real) ")) - except ValueError: - pass - - # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "Nexus" # 0 *Marketplace_Name - vendor = [] # 1 *Vendor y - rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) - category = [] # 7 Product_Category y - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice y - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft =[] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - image = [] # 20 Product_Image - image_vendor = [] # 21 Vendor_Image - href = [] # 22 Product_Links - - main = soup.find('main', {'id': 'main'}) - products_list = main.find('ul', recursive=False).find_all('li', recursive=False) - nm = len(products_list) - - for product in products_list: - # Finding the name of the product - name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text - name_of_product_cleaned = cleanString(name_of_product.strip()) - # print(name_of_product_cleaned) - 
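# As in the description parser above, prices are listed in Brazilian Real ("1.234,56" style): the string is normalized and divided by the user-supplied USD/BRL rate (usd_to_brl_r) to approximate a USD figure. -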
name.append(name_of_product_cleaned) - #finding the URL - try: - url = product.find("a", class_="woocommerce-loop-product__link").get('href') - href.append(url) - except AttributeError as e: - print("I can't find the link") - raise e - - # Finding Product Image - product_image = product.find('a', {'class': 'woocommerce-loop-image-link woocommerce-LoopProduct-link woocommerce-loop-product__link'}).find('img') - product_image = product_image.get('src') - product_image = product_image.split('base64,')[-1] - image.append(product_image) - - # Finding USD Price - real = product.find('span', {"class": "price"}).find('bdi').text - real = real.split(',') - whole = cleanNumbers(real[0]).replace('.', '') - real = whole + '.' + real[1] - usd = float(real) / usd_to_brl_r - USD.append(str(round(usd, 2))) - - # Finding BTC Price - prices = product.find('span', {"class": "price"}).findAll('span', {"class": "cs"}) - if len(prices) > 0: - price = prices[0].text - BTC.append(cleanNumbers(price.strip())) - - #everything else appends a -1 - rating_vendor.append("-1") - vendor.append('-1') - success.append("-1") - CVE.append("-1") - MS.append("-1") - category.append("-1") - describe.append("-1") - views.append("-1") - reviews.append("-1") - addDate.append("-1") - EURO.append("-1") - sold.append("-1") - qLeft.append("-1") - shipFrom.append("-1") - shipTo.append("-1") - image_vendor.append("-1") - - # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts( - marketplace = mktName, - nm = nm, - vendor = vendor, - rating_vendor = rating_vendor, - success_vendor = success, - nombre = name, - CVE = CVE, - MS = MS, - category = category, - describe = describe, - views = views, - reviews = reviews, - rating_item = rating_item, - addDate = addDate, - BTC = BTC, - USD = USD, - EURO = EURO, - sold = sold, - qLeft = qLeft, - shipFrom = shipFrom, - shipTo = shipTo, - href = href, - image = image, - image_vendor = image_vendor - ) - - -#called by the crawler to get description links on a listing page -#@param: beautifulsoup object that is using the correct html page (listing page) -#return: list of description links from a listing page -def nexus_links_parser(soup): - # Returning all links that should be visited by the Crawler - - href = [] - # Using a shorter, but still unique, class name - listing = soup.find_all("a", class_="woocommerce-loop-product__link") - - for a in listing: - link = a.get('href') - if link: # Checks if 'href' attribute is not None - href.append(link) - - return href diff --git a/MarketPlaces/Tor2door/crawler_selenium.py b/MarketPlaces/Quest/crawler_selenium.py similarity index 72% rename from MarketPlaces/Tor2door/crawler_selenium.py rename to MarketPlaces/Quest/crawler_selenium.py index 17988be..69287a9 100644 --- a/MarketPlaces/Tor2door/crawler_selenium.py +++ b/MarketPlaces/Quest/crawler_selenium.py @@ -1,7 +1,7 @@ __author__ = 'DarkWeb' ''' -Tor2door Market Crawler (Selenium) +Quest Market Crawler (Selenium) ''' from selenium import webdriver @@ -16,22 +16,22 @@ from PIL import Image import urllib.parse as urlparse import os, re, time +from datetime import date import subprocess -import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.Tor2door.parser import tor2door_links_parser +from MarketPlaces.Quest.parser import quest_links_parser from MarketPlaces.Utilities.utilities import cleanHTML counter = 1 -baseURL = 
'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion' +baseURL = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion' # Opens Tor Browser, crawls the website def startCrawling(): - marketName = getMKTName() + marketName = getMarketName() driver = getAccess() - + if driver != 'down': try: login(driver) @@ -39,15 +39,15 @@ def startCrawling(): except Exception as e: print(driver.current_url, e) closeDriver(driver) - - new_parse(marketName, baseURL, True) + + new_parse(marketName, False) # Login using premade account credentials and do login captcha manually def login(driver): #wait for login page WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '//*[@id="username"]'))) + (By.XPATH, "/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button"))) #entering username and password into input boxes usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') @@ -60,19 +60,19 @@ def login(driver): ''' # wait for captcha page show up WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/main/div/div/div/div/div/form/div[3]/div/div[1]/label/img"))) + (By.XPATH, "/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/div/img"))) # save captcha to local - driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[3]/div/div[1]/label/img').screenshot( - r'..\Tor2door\captcha.png') + driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/div/img').screenshot( + r'..\Quest\captcha.png') # This method will show image in any image viewer - im = Image.open(r'..\Tor2door\captcha.png') + im = Image.open(r'..\Quest\captcha.png') im.show() # wait until input space show up - inputBox = driver.find_element(by=By.XPATH, value='//*[@id="captcha"]') + inputBox = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/input') # ask user input captcha solution in terminal userIn = input("Enter solution: ") @@ -81,24 +81,24 @@ def login(driver): inputBox.send_keys(userIn) # click the verify(submit) button - driver.find_element(by=By.XPATH, value="/html/body/main/div/div/div/div/div/form/div[4]/button").click() + driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button").click() ''' input("Press ENTER when CAPTCHA is completed\n") # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/main/div/div/div[1]/div/div[1]/div/h5'))) + WebDriverWait(driver, 50).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/div[5]/div/div/div/span'))) # Returns the name of the website -def getMKTName(): - name = 'Tor2door' +def getMarketName(): + name = 'Quest' return name # Return the link of the website def getFixedURL(): - url = 'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/login' + url = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion' return url @@ -109,7 +109,7 @@ def closeDriver(driver): # os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /t /f /im tor.exe") print('Closing Tor...') - driver.quit() + driver.close() time.sleep(3) return @@ -129,8 +129,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) 
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - # ff_prof.set_preference("network.dns.disablePrefetch", True) - # ff_prof.set_preference("network.http.sendRefererHeader", 0) + ff_prof.set_preference("network.dns.disablePrefetch", True) + ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -146,7 +146,7 @@ def createFFDriver(): service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - + driver.maximize_window() return driver @@ -198,22 +198,24 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # Digital - Guides - Hacking - links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=55') - # Digital - Guides - Others - links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=57') - # Digital - Software - links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=60') - # Software - Malware - links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=69') - # Software - Others - links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=78') + # # Digital - Services + # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/8ae67900-22ed-11ec-a710-31f963ce8d35') + # # Digital - Software + # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/92809300-22ed-11ec-b143-af312e1dab77') + # # Digital - Tutorials + # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/9d1592b0-22ed-11ec-b82d-c3d2878a8716') + # # Digital - Malware + # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/a35bae90-22ed-11ec-ad2e-410f5a5339b5') + # # Digital - Hacking + # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/b4252cf0-22ed-11ec-8032-751549438ed5') + # Digital - Exploits + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/c0c3ac60-22ed-11ec-9e97-41cd1912fdee') return links def crawlForum(driver): - print("Crawling the Tor2door market") + print("Crawling the Quest market") linksToCrawl = getInterestedLinks() @@ -243,17 +245,17 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # # comment out - # break - # - # # comment out - # if count == 1: - # break + # comment out + break + + # comment out + if count == 1: + break try: - nav = driver.find_element(by=By.XPATH, value= - '/html/body/main/div/div/div[2]/div[11]/div/nav') - a = nav.find_element(by=By.LINK_TEXT, value="›") + nav = driver.find_element(by=By.XPATH, value='/html/body/div[6]/nav') + li = nav.find_elements(By.TAG_NAME, 'li') + a = li[-1].find_element(By.TAG_NAME, 'a') link = a.get_attribute('href') if link == "": raise NoSuchElementException @@ -266,19 +268,19 @@ def crawlForum(driver): print(link, e) i += 1 - print("Crawling the Tor2door market done.") + input("Crawling Quest market done sucessfully. 
Press ENTER to continue\n") # Returns 'True' if the link is Topic link def isDescriptionLink(url): - if 'products/' in url: + if 'product' in url: return True return False # Returns True if the link is a listingPage link def isListingLink(url): - if 'category=' in url: + if 'category' in url: return True return False @@ -286,7 +288,7 @@ def isListingLink(url): # calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") - return tor2door_links_parser(soup) + return quest_links_parser(soup) def crawler(): diff --git a/MarketPlaces/Quest/parser.py b/MarketPlaces/Quest/parser.py new file mode 100644 index 0000000..6761ed9 --- /dev/null +++ b/MarketPlaces/Quest/parser.py @@ -0,0 +1,232 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +from MarketPlaces.Utilities.utilities import * + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + + +# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) +def quest_description_parser(soup): + + # Fields to be parsed + + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + + row = soup.find_all('div', {'class': "row"}) + + # Finding Product Name + name = row[1].text + name = name.replace('\n', ' ') + name = name.replace(",", "") + name = name.strip() + + small = row[3].find_all('small') + + # Finding Vendor + vendor = small[0].text + vendor = vendor.replace("Vendor:", "") + vendor = vendor.replace(",", "") + vendor = vendor.strip() + + # Finding Vendor Rating + full_stars = small[2].find_all('i', {'class': "fas fa-star"}) + half_star = small[2].find('i', {'class': "fas fa-star-half-alt"}) + rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0) + + # Finding Successful Transactions + success = small[4].text + success = success.replace("Total Sales:", "") + success = success.strip() + + small = row[2].find('p', {'class': "text-left"}).find_all('small') + + # Finding Prices + USD = small[1].text + USD = USD.replace("$", "") + USD = USD.strip() + + shipping_info = row[2].find('p', {'class': "text-left"}).find('span').text.strip() + if "Digital" not in shipping_info: + shipping_info = shipping_info.split(" ") + + # Finding Shipment Information (Origin) + shipFrom = shipping_info[0].strip() + + # Finding Shipment Information (Destination) + shipTo = shipping_info[1].strip() + + textarea = row[2].find_all('textarea') + + # Finding the Product description + describe = textarea[0].text + describe = describe.replace("\n", " ") + describe = describe.replace("\r", " ") + describe = describe.strip() + + ''' + # Finding the Number of Product Reviews + tag = 
soup.findAll(text=re.compile('Reviews')) + for index in tag: + reviews = index + par = reviews.find('(') + if par >=0: + reviews = reviews.replace("Reviews (","") + reviews = reviews.replace(")","") + reviews = reviews.split(",") + review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) + else : + review = "-1" + ''' + + # Searching for CVE and MS categories + cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) + if cve: + CVE = " " + for idx in cve: + CVE += (idx) + CVE += " " + CVE = CVE.replace(',', ' ') + CVE = CVE.replace('\n', '') + ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}')) + if ms: + MS = " " + for im in ms: + MS += (im) + MS += " " + MS = MS.replace(',', ' ') + MS = MS.replace('\n', '') + + # Populating the final variable (this should be a list with all fields scraped) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo) + + # Sending the results + return row + + +# This is the method to parse the Listing Pages +def quest_listing_parser(soup): + + # Fields to be parsed + nm = 0 # *Total_Products (Should be Integer) + mktName = "Quest" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = [] # 6 Product_MS_Classification (Microsoft Security) + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft =[] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + href = [] # 20 Product_Links + + # Finding category of listing page + cat = soup.find('span', {'class': "btn btn-sm btn-outline-mgray active border-info"}).text + cat = cat.replace("Digital -", "") + cat = cat.strip() + + listing = soup.find_all('div', {"class": "col-md-2 my-md-0 col-12"}) + + # Populating the Number of Products + nm = len(listing) + + for a in listing: + bae = a.find_all('a', href=True) + + # Adding the category + category.append(cat) + + # Adding the url to the list of urls + link = bae[0].get('href') + link = cleanLink(link) + href.append(link) + + # Finding the Vendor + vendor_name = bae[2].text + vendor_name = vendor_name.replace(",", "") + vendor_name = vendor_name.strip() + vendor.append(vendor_name) + + # Finding the Product + product = bae[1].find('img').get('alt') + product = product.replace('\n', ' ') + product = product.replace(",", "") + product = product.strip() + name.append(product) + + # Searching for CVE and MS categories + cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) + if not cve: + cveValue="-1" + else: + cee = " " + for idx in cve: + cee += (idx) + cee += " " + cee = cee.replace(',', ' ') + cee = cee.replace('\n', '') + cveValue=cee + CVE.append(cveValue) + + ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) + if not ms: + MSValue="-1" + else: + me = " " + for im in ms: + me += (im) + me += " " + me = me.replace(',', ' ') + me = me.replace('\n', '') + MSValue=me + MS.append(MSValue) + + # Populate the final variable (this should be a list with all fields scraped) + 
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + + +def quest_links_parser(soup): + + # Returning all links that should be visited by the Crawler + href = [] + + listing = soup.findAll('div', {"class": "col-md-2 my-md-0 col-12"}) + + for div in listing: + + link = div.find('a')["href"] + href.append(link) + + return href \ No newline at end of file diff --git a/MarketPlaces/RobinhoodMarket/crawler_selenium.py b/MarketPlaces/RobinhoodMarket/crawler_selenium.py deleted file mode 100644 index 232fac7..0000000 --- a/MarketPlaces/RobinhoodMarket/crawler_selenium.py +++ /dev/null @@ -1,256 +0,0 @@ -__author__ = 'chris' - -''' -RobinhoodMarket Market Crawler (Selenium) -''' - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary -from selenium.webdriver.firefox.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.ui import WebDriverWait -from PIL import Image - -import urllib.parse as urlparse -import os, re, time -import subprocess -import configparser -from bs4 import BeautifulSoup -from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.RobinhoodMarket.parser import Robinhood_links_parser -from MarketPlaces.Utilities.utilities import cleanHTML - -counter = 1 -baseURL = 'http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/' - - -# Opens Tor Browser, crawls the website -def startCrawling(): - marketName = getMKTName() - - driver = getAccess() - - if driver != 'down': - try: - # Captcha - input("Press ENTER when website has loaded") - # Robinhood doesn't need login - # login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closeDriver(driver) - - new_parse(marketName, baseURL, True) - - -# Login is not needed in Robinhood -def login(driver): - pass - - -# Returns the name of the website -def getMKTName(): - name = 'RobinhoodMarket' - return name - - -# Return the link of the website -def getFixedURL(): - url = 'http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/' - - return url - - -# Closes Tor Browser -def closeDriver(driver): - # global pid - # os.system("taskkill /pid " + str(pro.pid)) - # os.system("taskkill /t /f /im tor.exe") - print('Closing Tor...') - driver.quit() - time.sleep(3) - return - - -# Creates FireFox 'driver' and configure its 'Profile' -# to use Tor proxy and socket -def createFFDriver(): - from MarketPlaces.Initialization.markets_mining import config - - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - - ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - 
ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 3) - ff_prof.set_preference("browser.download.folderList", 2) - ff_prof.set_preference("browser.download.manager.showWhenStarting", False) - ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") - ff_prof.set_preference('network.proxy.type', 1) - ff_prof.set_preference("network.proxy.socks_version", 5) - ff_prof.set_preference('network.proxy.socks', '127.0.0.1') - ff_prof.set_preference('network.proxy.socks_port', 9150) - ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", False) - ff_prof.update_preferences() - - service = Service(config.get('TOR', 'geckodriver_path')) - - driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - - driver.maximize_window() - - return driver - - -def getAccess(): - url = getFixedURL() - driver = createFFDriver() - try: - driver.get(url) - return driver - except: - driver.close() - return 'down' - - -# Saves the crawled html page -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - open(filePath, 'wb').write(cleanPage.encode('utf-8')) - return - - -# Gets the full path of the page to be saved along with its appropriate file name -def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") - fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') - return fullPath - - -# Creates the file name from passed URL -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if name == '': - name = str(counter) - counter = counter + 1 - return name - - -def getInterestedLinks(): - links = [] - - # Hacking - links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/hacking/') - # Other Software - links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/') - - return links - - -def crawlForum(driver): - print("Crawling the Robinhood market") - - linksToCrawl = getInterestedLinks() - - i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) - - try: - has_next_page = True - count = 0 - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(driver, html, link) - - list = productPages(html) - for c, item in enumerate(list): - - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver, driver.page_source, item) - driver.back() - - # comment out - # if c == 4: - # break - - # comment out - # if count == 1: - # break - - # go to next page of market - try: - nav = driver.find_element(by=By.XPATH, value="//a[@class='next page-numbers']") - link = nav.get_attribute('href') - if link == "": - raise NoSuchElementException - count += 1 - - except NoSuchElementException: - has_next_page = False - - except Exception as e: - print(link, e) - i += 1 - - print("Crawling the Robinhood market done.") - - -# 
Returns 'True' if the link is Topic link -def isDescriptionLink(url): - if 'product' in url and 'category' not in url: - return True - return False - - -# Returns True if the link is a listingPage link -def isListingLink(url): - if 'category=' in url: - return True - return False - - -# calling the parser to define the links -def productPages(html): - soup = BeautifulSoup(html, "html.parser") - return Robinhood_links_parser(soup) - - -def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... DONE!") - - -if __name__ == '__main__': - startCrawling() diff --git a/MarketPlaces/RobinhoodMarket/parser.py b/MarketPlaces/RobinhoodMarket/parser.py deleted file mode 100644 index 059d327..0000000 --- a/MarketPlaces/RobinhoodMarket/parser.py +++ /dev/null @@ -1,334 +0,0 @@ -__author__ = 'chris' - -import re -import traceback - -# Here, we are importing the auxiliary functions to clean or convert data -from MarketPlaces.Utilities.utilities import * - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup - -# Import for test run -import glob -import os -import codecs -import shutil - -# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) -def Robinhood_description_parser(soup): - - # Fields to be parsed - - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo - image = "-1" # 19 Product_Image - vendor_image = "-1" # 20 Vendor_Image - - # Finding Product Name - name = soup.find('h1').text - name = name.replace('\n', ' ') - name = name.replace(",", "") - name = name.strip() - - # Finding description - desc = '' - tab = soup.find('div', {"id": "tab-description"}) - if tab is not None: - for p in tab.findAll('p'): - desc += p.text - if desc == '': - short = soup.find('div', {"class": "woocommerce-product-details__short-description"}) - if short is not None: - desc = short.text - describe = cleanString(desc.strip()) - - # Finding Product Image - image = soup.find('div', {'class': 'woocommerce-product-gallery__wrapper'}).find('img') - image = image.get('src') - image = image.split('base64,')[-1] - - # Finding Vendor - vendor = soup.find('a', {'class': 'wcfm_dashboard_item_title'}).text - vendor = vendor.replace(",", "") - vendor = vendor.replace("Sold by:", "") - vendor = vendor.strip() - - # Finding Vendor Image - vendor_image = soup.find('div', {'class': 'wcfmmp_sold_by_container_left'}).find('img') - vendor_image = vendor_image.get('src') - vendor_image = vendor_image.split('base64,')[-1] - - # Finding Category - catSpan = soup.find('span', {'class': 'posted_in'}) - category = catSpan.find('a').text - - # Finding USD - priceText = soup.find('p', {'class': 'price'}).text - USD = str(priceText).strip() - - 
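The CVE/MS scan that follows here, and again in every listing parser in this diff, concatenates the regex matches into a space-separated string with commas and newlines stripped. A small hedged helper distilling that repeated pattern (collect_matches is an illustrative name, not a function from this repo):

import re
from bs4 import BeautifulSoup

def collect_matches(soup: BeautifulSoup, pattern: str) -> str:
    # Return all matching text nodes joined by spaces, or "-1" if none,
    # mirroring the "-1 means missing" convention used throughout these parsers.
    hits = soup.find_all(string=re.compile(pattern))
    if not hits:
        return "-1"
    joined = " ".join(hits)
    return joined.replace(",", " ").replace("\n", "")

# Usage inside a parser would look like:
#   CVE = collect_matches(soup, r'CVE-\d{4}-\d{4}')
#   MS  = collect_matches(soup, r'MS\d{2}-\d{3}')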
# Searching for CVE and MS categories - cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if cve: - CVE = " " - for idx in cve: - CVE += (idx) - CVE += " " - CVE = CVE.replace(',', ' ') - CVE = CVE.replace('\n', '') - ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}')) - if ms: - MS = " " - for im in ms: - MS += (im) - MS += " " - MS = MS.replace(',', ' ') - MS = MS.replace('\n', '') - - # Populating the final variable (this should be a list with all fields scraped) - row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) - - # Sending the results - return row - - -# This is the method to parse the Listing Pages -def Robinhood_listing_parser(soup): - - # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "RobinhoodMarket" # 0 *Marketplace_Name - vendor = [] # 1 *Vendor y - rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) - category = [] # 7 Product_Category y - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice y - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft =[] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - image = [] # 20 Product_Image - image_vendor = [] # 21 Vendor_Image - href = [] # 22 Product_Links - - listing = soup.find('ul', {"class": "products columns-4"}) - items = listing.findAll('li') - - # Populating the Number of Products - nm = len(items) - - for card in items: - # Finding Category - cat = soup.find("h1").text - cat = cat.replace('\n', ' ') - cat = cat.replace(",", "") - cat = cat.strip() - category.append(cat) - - bae = card.findAll('a') - - # Adding the url to the list of urls - link = card.find('a').get('href') - href.append(link) - - # Finding Product Name - product = card.find("h2").text - product = product.replace('\n', ' ') - product = product.replace(",", "") - product = product.strip() - name.append(product) - - # Finding Product Image - product_image = card.find('a').find('img') - product_image = product_image.get('src') - product_image = product_image.split('base64,')[-1] - image.append(product_image) - - info = card.find('div', {'class': 'wcfmmp_sold_by_container'}) - - # Finding Vendor - vendor_name = info.find('a', {'class', 'wcfm_dashboard_item_title'}).text - vendor_name = vendor_name.replace(",", "") - vendor_name = vendor_name.strip() - vendor.append(vendor_name) - - # Finding Vendor Image - vendor_icon = info.find('img', {'class', 'wcfmmp_sold_by_logo'}) - vendor_icon = vendor_icon.get('src') - vendor_icon = vendor_icon.split('base64,')[-1] - image_vendor.append(vendor_icon) - - # Finding USD - span = card.find('span', {'class': 'price'}) - if span is not None: - bdi = span.find('bdi') - usdText = bdi.find('span').next_sibling - usdVal = usdText.text - else: - usdVal = "0" - USD.append(usdVal) - - # Searching for CVE and MS categories - cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if not cve: - cveValue="-1" - else: - cee = " " - for idx in cve: - 
cee += (idx) - cee += " " - cee = cee.replace(',', ' ') - cee = cee.replace('\n', '') - cveValue=cee - CVE.append(cveValue) - - ms = card.findAll(text=re.compile('MS\d{2}-\d{3}')) - if not ms: - MSValue="-1" - else: - me = " " - for im in ms: - me += (im) - me += " " - me = me.replace(',', ' ') - me = me.replace('\n', '') - MSValue=me - MS.append(MSValue) - - #print(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - # reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) - - # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) - - -def Robinhood_links_parser(soup): - - # Returning all links that should be visited by the Crawler - href = [] - - #list = soup.findAll('div', {"class": "woocommerce columns-4"}) - listing = soup.find('ul', {"class": "products columns-4"}).findAll('li') - - for item in listing: - - link = item.find('a')['href'] - href.append(link) - - return href - - -if __name__ == '__main__': - nError = 0 - marketPlace = 'RobinhoodMarket' - - lines = [] # listing pages - lns = [] # description pages - detPage = {} - - ''' - # reading description pages - count = 0 - for fileDescription in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\08082023\\Description", '*.html')): - count += 1 - lns.append(fileDescription) - # if count > 5: - # break - - for index, line2 in enumerate(lns): - - print("Reading description folder of '" + marketPlace + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns))) - - try: - html = codecs.open(line2.strip('\n'), encoding='utf8') - soup = BeautifulSoup(html, "html.parser") - html.close() - except: - - try: - html = open(line2.strip('\n')) - soup = BeautifulSoup(html, "html.parser") - html.close() - except: - - nError += 1 - print("There was a problem to read the file " + line2 + " in the Description section!") - # if createLog: - # logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section.\n") - continue - - try: - print(Robinhood_description_parser(soup)) - except: - traceback.print_exc() - print("There was a problem to parse the file " + line2 + " in the Description section!") - ''' - - # reading listing pages - count = 0 - for fileListing in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\08082023\\Listing", '*.html')): - count += 1 - lines.append(fileListing) - #if count > 1: - # break - - for index, line1 in enumerate(lines): - - print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(line1) + "', index= " + str(index + 1) + " ... 
" + str(len(lines))) - - readError = False - try: - html = codecs.open(line1.strip('\n'), encoding='utf8') - soup = BeautifulSoup(html, "html.parser") - html.close() - except: - try: - html = open(line1.strip('\n')) - soup = BeautifulSoup(html, "html.parser") - html.close() - except: - print("There was a problem to read the file " + line1 + " in the Listing section!") - readError = True - - if not readError: - - parseError = False - try: - test = Robinhood_listing_parser(soup) - print(Robinhood_listing_parser(soup)) - except: - traceback.print_exc() - print("There was a problem to parse the file " + line1 + " in the listing section!") - parseError = True - - - print("DONE") \ No newline at end of file diff --git a/MarketPlaces/Apocalypse/crawler_selenium.py b/MarketPlaces/Royal/crawler_selenium.py similarity index 54% rename from MarketPlaces/Apocalypse/crawler_selenium.py rename to MarketPlaces/Royal/crawler_selenium.py index b91bf0e..857cb27 100644 --- a/MarketPlaces/Apocalypse/crawler_selenium.py +++ b/MarketPlaces/Royal/crawler_selenium.py @@ -1,68 +1,171 @@ -__author__ = 'Helium' +__author__ = 'DarkWeb' ''' -Apocalypse Forum Crawler (Selenium) -two captchas. if you get a captcha wrong you have to reload program. +Royal Marketplace Crawler (Selenium) ''' from selenium import webdriver +from selenium.webdriver.support.select import Select from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By - +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait from PIL import Image + + import urllib.parse as urlparse import os, re, time from datetime import date import subprocess -import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.Apocalypse.parser import apocalypse_links_parser +from MarketPlaces.Royal.parser import royal_links_parser from MarketPlaces.Utilities.utilities import cleanHTML counter = 1 -baseURL = 'http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/' +baseURL = 'http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion' -# Opens Tor Browser, crawls the website, then parses, then closes tor -#acts like the main method for the crawler, another function at the end of this code calls this function later +# Opens Tor Browser, crawls the website def startCrawling(): - mktName = getMKTName() + marketName = getMarketName() driver = getAccess() if driver != 'down': try: + captcha(driver) login(driver) crawlForum(driver) except Exception as e: print(driver.current_url, e) - closetor(driver) + closeDriver(driver) + + new_parse(marketName, False) + + +def captcha(driver): + ''' + # wait for captcha page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[2]/div/div/div/div/form/div/div[2]/button"))) + + inputChars = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div/div/form/div/div[2]/div[1]/input') + inputNum = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div/div/form/div/div[2]/div[2]/input') + + driver.find_element(by=By.XPATH, 
value='/html/body/div[2]/div/div/div/div/form/div/div[1]/div/div').screenshot( + r'..\Royal\captcha1.png') + + im = Image.open(r'..\Royal\captcha1.png') + im.show() + + chars = input("Enter characters: ") + inputChars.send_keys(chars) + + num = input("Enter number of wrong puzzle pieces: ") + inputNum.send_keys(num) + + # click the verify(submit) button + driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div/div/div/form/div/div[2]/button").click() + ''' + input("Press ENTER when CAPTCHA is completed\n") - new_parse(mktName, baseURL, True) + # wait for login page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[2]/div/div/div[2]/h1"))) + + ''' + temp = driver.find_element(by=By.XPATH, value='/html/body/div/div/form/div[1]') + boxes = temp.find_elements(by=By.TAG_NAME, value='input') + + for box in boxes: + # click box to update captcha image + box.click() + + # save clock captcha to local + time.sleep(1) + driver.find_element(by=By.XPATH, value='/html/body/div/div/form/div[1]/div').screenshot( + r'..\Royal\captcha1.png') + + im = Image.open(r'..\Royal\captcha1.png') + im.show() + + letter = input("Enter letter: ") + box.send_keys(letter) + + # click the verify(submit) button + driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click() + + # wait for login page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[1]/div/div/div[2]/form/input[3]"))) + ''' + + +# Login using premade account credentials and do login captcha manually +def login(driver): + # wait for login page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[2]/div/div/div[2]/form/div[4]"))) + + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') + # Username here + usernameBox.send_keys('blabri') + passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') + # Password here + passwordBox.send_keys('fishowal') + + # click "Login" + driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div[2]/form/div[4]').click() + + ''' + # wait for captcha page show up + time.sleep(3) + + # save captcha to local + driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div[2]/form/div[4]/label/div/div').screenshot( + r'..\Royal\captcha2.png') + + # This method will show image in any image viewer + im = Image.open(r'..\Royal\captcha2.png') + im.show() + + # ask user input captcha solution in terminal + userIn = input("Enter location of wrong pieces (squares are numbered 1-24 left to right, # # #): ") + squares = userIn.split() + + # send user solution into the input space + for id in squares: + driver.find_element(by=By.XPATH, value='//*[@id="cl[' + str((int(id)-1)) + ']"]').click() + + # click the verify(submit) button + driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div/div[2]/form/div[4]/label/div/div/div/button").click() + ''' + input("Press ENTER when CAPTCHA is completed\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 50).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/div[3]/div/div[5]/div[1]'))) # Returns the name of the website -#return: name of site in string type -def getMKTName(): - name = 'Apocalypse' +def getMarketName(): + name = 'Royal' return name -# Return the base link of the website -#return: url of base site in string type +# 
Return the link of the website def getFixedURL(): - url = 'http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/' + url = 'http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion' + return url # Closes Tor Browser -#@param: current selenium driver -def closetor(driver): +def closeDriver(driver): # global pid # os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /t /f /im tor.exe") @@ -87,8 +190,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -104,14 +207,12 @@ def createFFDriver(): service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - + driver.maximize_window() return driver -#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' -#return: return the selenium driver or string 'down' def getAccess(): url = getFixedURL() driver = createFFDriver() @@ -123,32 +224,7 @@ def getAccess(): return 'down' -# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha -# then allows for manual solving of captcha in the terminal -#@param: current selenium web driver -def login(driver): - input("Press ENTER when CAPTCHA is completed\n") - - # wait for page to show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '//*[@id="name"]'))) - - # entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="name"]') - # Username here - usernameBox.send_keys('shooby') - passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') - # Password here - passwordBox.send_keys('perry_1the2_platypu$') - - input("Press ENTER when CAPTCHA is completed\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[1]/div[2]/div[1]/div[1]/a[13]"))) - - -# Saves the crawled html page, makes the directory path for html pages if not made +# Saves the crawled html page def savePage(driver, page, url): cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) @@ -158,7 +234,6 @@ def savePage(driver, page, url): # Gets the full path of the page to be saved along with its appropriate file name -#@param: raw url as crawler crawls through every site def getFullPathName(url): from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE @@ -171,41 +246,33 @@ def getFullPathName(url): return fullPath -# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned -#@param: raw url as crawler crawls through every site +# Creates the file name from passed URL def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) - 
if (name == ''): + if name == '': name = str(counter) counter = counter + 1 return name -# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list -#in this example, there are a couple of categories some threads fall under such as -# Guides and Tutorials, Digital Products, and Software and Malware -#as you can see they are categories of products def getInterestedLinks(): links = [] - # Digital Goods - links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/74') - # Fraud - links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/75') - # Services - links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/76') - # software and malware - links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/30') + # Digital - Fraud Software + links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Fraud%20Software') + # # Digital - Guides and Tutorials + # links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Guides%20&%20Tutorials') + # # Digital - Legitimate Software + # links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Legitimiate%20Software') + # # Services - Carding + # links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Carding') return links -# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through -#topic and description pages are crawled through here, where both types of pages are saved -#@param: selenium driver def crawlForum(driver): - print("Crawling the Apocalypse market") + print("Crawling the Royal market") linksToCrawl = getInterestedLinks() @@ -233,22 +300,20 @@ def crawlForum(driver): except: driver.refresh() savePage(driver, driver.page_source, item) - # driver.back() - try: - driver.get(link) - except: - driver.refresh() + driver.back() + + # comment out + break - # # comment out - # break - # - # # comment out - # if count == 1: - # break + # comment out + if count == 1: + break try: - nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/div[2]/nav') - link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='»').get_attribute('href') + nav = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div[1]/div[2]/nav') + li = nav.find_elements(by=By.TAG_NAME, value='li') + a = li[-1].find_element(by=By.TAG_NAME, value='a') + link = a.get_attribute('href') if link == "": raise NoSuchElementException count += 1 @@ -260,42 +325,27 @@ def crawlForum(driver): print(link, e) i += 1 - print("Crawling the Apocalypse market done.") + input("Crawling Royal forum done sucessfully. Press ENTER to continue\n") -# Returns 'True' if the link is a description link -#@param: url of any url crawled -#return: true if is a description page, false if not +# Returns 'True' if the link is Topic link def isDescriptionLink(url): - if 'article' in url: + if 'product' in url: return True return False # Returns True if the link is a listingPage link -#@param: url of any url crawled -#return: true if is a Listing page, false if not def isListingLink(url): if 'category' in url: return True return False -# calling the parser to define the links, the html is the url of a link from the list of interested link list -#@param: link from interested link list ie. 
getInterestingLinks() -#return: list of description links that should be crawled through +# calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") - return apocalypse_links_parser(soup) - - -# Drop links that "signout" -# def isSignOut(url): -# #absURL = urlparse.urljoin(url.base_url, url.url) -# if 'signout' in url.lower() or 'logout' in url.lower(): -# return True -# -# return False + return royal_links_parser(soup) def crawler(): diff --git a/MarketPlaces/Tor2door/parser.py b/MarketPlaces/Royal/parser.py similarity index 68% rename from MarketPlaces/Tor2door/parser.py rename to MarketPlaces/Royal/parser.py index 49e0a93..dfb2d32 100644 --- a/MarketPlaces/Tor2door/parser.py +++ b/MarketPlaces/Royal/parser.py @@ -8,7 +8,7 @@ from bs4 import BeautifulSoup # This is the method to parse the Description Pages (one page to each Product in the Listing Pages) -def tor2door_description_parser(soup): +def royal_description_parser(soup): # Fields to be parsed @@ -31,23 +31,18 @@ def tor2door_description_parser(soup): left = "-1" # 16 Product_QuantityLeft shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo - image = "-1" # 19 Product_Image - vendor_image = "-1" # 20 Vendor_Image - - bae = soup.find('div', {'class': "col-9"}) # Finding Product Name - name = bae.find('h2').text + name = soup.find('h5', {'class': "bold"}).text name = name.replace('\n', ' ') name = name.replace(",", "") name = name.strip() - mb = bae.findAll('div', {"class": "mb-1"}) + bae = soup.find('div', {'class': "card-header bg-light"}) # Finding Vendor - vendor = mb[0].text + vendor = bae.find('a').text vendor = vendor.replace(",", "") - vendor = vendor.replace("Sold by:", "") vendor = vendor.strip() # # Finding Vendor Rating @@ -55,24 +50,45 @@ def tor2door_description_parser(soup): # half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"}) # rating = len(full_stars) + (0.5 if half_star is not None else 0) - # Finding Quantity Sold and Left - temp = mb[4].text.split(',') - - sold = temp[0].replace("sold", "") - sold = sold.strip() - - left = temp[1].replace("in stock", "") - left = left.strip() - - # Finding USD - USD = bae.find('div', {"class": "h3 text-secondary"}).text - USD = USD.replace("$", "") - USD = USD.strip() - - # Finding BTC - temp = bae.find('div', {"class": "small"}).text.split("BTC") - - BTC = temp[0].strip() + # Finding Successful Transactions + success = bae.find('b').text + success = success.replace("(", "") + success = success.replace(")", "") + success = success.strip() + + form = soup.find_all('form', {'method': "POST"}) + bae = form[1].find_all('div', {'class': "row"}) + + # Finding Quantity Sold + div = bae[2].find_all('div', {'class': "col"}) + temp = div[1].text + temp = temp.split() + if len(temp) > 0: + sold = temp[0].strip() + sold = re.sub(r'[^0-9.]', "", sold) + if sold == "": + sold = -1 + else: + sold = -1 + + # Finding Quantity Left + div = bae[3].find_all('div', {'class': "col"}) + temp = div[1].text + temp = temp.split() + if len(temp) > 0: + left = temp[0].strip() + left = re.sub(r'[^0-9.]', "", left) + if left == "": + left = -1 + else: + left = -1 + + # Finding Prices + temp = bae[-2].find('strong').text + temp = temp.replace("Price:", "") + temp = temp.split() + USD = temp[0].strip() + USD = re.sub(r'[^0-9.]', "", USD) # shipping_info = bae[4].text # if "Digital" not in shipping_info: @@ -85,7 +101,7 @@ def tor2door_description_parser(soup): # shipTo = shipping_info[1].strip() # Finding the Product 
description - describe = bae.find('div', {"class": "card border-top-0"}).text + describe = soup.find('xmp').text describe = describe.replace("\n", " ") describe = describe.replace("\r", " ") describe = describe.strip() @@ -108,23 +124,20 @@ def tor2door_description_parser(soup): MS = MS.replace(',', ' ') MS = MS.replace('\n', '') - image = bae.find('div', {"class": "product-primary"}).find('img') - image = image.get('src').split('base64,')[-1] - # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) + BTC, USD, EURO, sold, left, shipFrom, shipTo) # Sending the results return row # This is the method to parse the Listing Pages -def tor2door_listing_parser(soup): +def royal_listing_parser(soup): # Fields to be parsed nm = 0 # *Total_Products (Should be Integer) - mktName = "Tor2door" # 0 *Marketplace_Name + mktName = "Royal" # 0 *Marketplace_Name vendor = [] # 1 *Vendor y rating_vendor = [] # 2 Vendor_Rating success = [] # 3 Vendor_Successful_Transactions @@ -144,69 +157,43 @@ def tor2door_listing_parser(soup): qLeft =[] # 17 Product_QuantityLeft shipFrom = [] # 18 Product_ShippedFrom shipTo = [] # 19 Product_ShippedTo - image = [] # 20 Product_Image - image_vendor = [] # 21 Vendor_Image - href = [] # 22 Product_Links + href = [] # 20 Product_Links + + # Finding category of listing page + cat = soup.find('li', {'class': "breadcrumb-item active"}).text + cat = cat.strip() - listing = soup.findAll('div', {"class": "card product-card mb-3"}) + listing = soup.findAll('div', {'class': "card search border shadow-sm mb-3"}) # Populating the Number of Products nm = len(listing) - # Finding Category - cat = soup.find("div", {"class": "col-9"}) - cat = cat.find("h2").text - cat = cat.replace("Category: ", "") - cat = cat.replace(",", "") - cat = cat.strip() + for a in listing: + bae = a.findAll('a', href=True) - for card in listing: + # Adding the category category.append(cat) - bae = card.findAll('a') - # Adding the url to the list of urls - link = bae[0].get('href') + link = bae[1].get('href') + link = cleanLink(link) href.append(link) - # Finding Product Name - product = bae[1].text - product = product.replace('\n', ' ') - product = product.replace(",", "") - product = product.strip() - name.append(product) - - # Finding Vendor - vendor_name = bae[2].text + # Finding the Vendor + vendor_name = bae[0].text vendor_name = vendor_name.replace(",", "") vendor_name = vendor_name.strip() vendor.append(vendor_name) - # Finding USD - usd = card.find('div', {"class": "mb-1"}).text - usd = usd.replace("$", "") - usd = usd.strip() - USD.append(usd) - - # Finding Rating - stars = card.find("ul", {"class": "star-list"}) - full = stars.findAll('i', {"class": "fas fa-star star star-active"}) - half = stars.find('i', {"class": "fas fa-star-half star star-active"}) - rating = len(full) - if half is not None: - rating += 0.5 - rating_item.append(str(rating)) - - # Finding Reviews - num = card.find("span", {"class": "rate-count"}).text - num = num.replace("(", "") - num = num.replace("review)", "") - num = num.replace("reviews)", "") - num = num.strip() - reviews.append(num) + # Finding the Product + product = bae[2].get('title') + product = product.replace('\n', ' ') + product = product.replace(",", "") + product = product.strip() + name.append(product) # Searching for CVE and MS categories - cve = 
card.findAll(text=re.compile('CVE-\d{4}-\d{4}')) + cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if not cve: cveValue="-1" else: @@ -219,7 +206,7 @@ def tor2door_listing_parser(soup): cveValue=cee CVE.append(cveValue) - ms = card.findAll(text=re.compile('MS\d{2}-\d{3}')) + ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) if not ms: MSValue="-1" else: @@ -232,24 +219,22 @@ def tor2door_listing_parser(soup): MSValue=me MS.append(MSValue) - image = bae[0].find('img') - image = image.get('src').split('base64,')[-1] - # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) -def tor2door_links_parser(soup): +def royal_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] - listing = soup.findAll('div', {"class": "card product-card mb-3"}) + listing = soup.findAll('div', {"class": "card search border shadow-sm mb-3"}) for div in listing: - link = div.find('a')['href'] + a = div.find_all('a') + link = a[1].get('href') href.append(link) return href \ No newline at end of file diff --git a/MarketPlaces/ThiefWorld/parser.py b/MarketPlaces/ThiefWorld/parser.py deleted file mode 100644 index ba0f51c..0000000 --- a/MarketPlaces/ThiefWorld/parser.py +++ /dev/null @@ -1,190 +0,0 @@ -__author__ = 'DarkWeb' - -# Here, we are importing the auxiliary functions to clean or convert data -from typing import List, Tuple -from MarketPlaces.Utilities.utilities import * - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup, ResultSet, Tag - - -def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple: - - # Fields to be parsed - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo - image = "-1" # 19 Product_Image - vendor_image = "-1" # 20 Vendor_Image - - name = soup.find("h1", {'class': 'title'}).text - name = cleanString(name.strip()) - - describe = soup.find('div', {'id': 'descriptionContent'}).text - describe = cleanString(describe.strip()) - - # Finding Product Image - image = soup.find('div', {'class': 'product_img_big'}).find('img') - image = image.get('src') - image = image.split('base64,')[-1] - - commentListTag: Tag = soup.find('ul', {'class': 'comment_list scrollbar'}) - commentList = commentListTag.find_all('li') - review = str(len(commentList)) - - citySelection: str = soup.find('ul', {'class': 'meta text-muted i_location'}).text - shipFrom = cleanString(citySelection.strip()) - - vendor = soup.find('h1', 
{'class': 'title over'}).text - vendor = cleanString(vendor.strip()) - - usdTag: Tag = soup.find('div', {'class': 'product_price__big'}).find('span') - usdText = usdTag.text.strip('/')[0] - # usdText format: " USD " (i.e., "70 000 USD ") - USD = cleanString(usdText.replace("USD", "").strip()) - - ratingDiv = soup.find('div', {'class': 'rating_star'}) - rating_vendor = ratingDiv.get('title').split(' ')[1] - - rating_item = soup.find('div', {'class': 'product_rate'}).text - rating_item = rating_item.replace("rating", "") - rating_item = cleanString(rating_item.strip()) - - # Populating the final variable (this should be a list with all fields scraped) - row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) - - # Sending the results - return row - - -def thiefWorld_listing_parser(soup: BeautifulSoup): - - # Fields to be parsed - nm = 0 # Total_Products (Should be Integer) - mktName = "ThiefWorld" # 0 Marketplace_Name - vendor = [] # 1 *Vendor y - rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) - category = [] # 7 Product_Category y - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice y - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft =[] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - image = [] # 20 Product_Image - image_vendor = [] # 21 Vendor_Image - href = [] # 22 Product_Links - - productList: ResultSet[Tag] = soup.find_all('div', {'class': 'catalog_item'}) - - nm = len(productList) - - for product in productList: - - productTitle: Tag = product.find('div', {'class': 'title'}).find('a') - - productName = cleanString(productTitle.text.strip()) - name.append(productName) - - # Finding Product Image - product_image = product.find('noscript').find('img') - product_image = product_image.get('src') - product_image = product_image.split('base64,')[-1] - image.append(product_image) - - productHref = productTitle.get('href') - href.append(productHref) - - CVE.append('-1') - MS.append('-1') - - cat = soup.find('calsys-cat').text - category.append(cat.strip()) - - productDescription = product.find('div', {'class': 'text'}).text - productDescription = cleanString(productDescription.strip()) - describe.append(productDescription) - - views.append('-1') - reviews.append('-1') - addDate.append('-1') - BTC.append('-1') - - priceText = product.find('span', {'class': 'price'}).find('span').text - priceText = priceText.split('USD')[0] - priceText = cleanString(priceText.strip()) - USD.append(priceText) - - EURO.append('-1') - sold.append('-1') - qLeft.append('-1') - shipFrom.append('-1') - shipTo.append('-1') - - productVendor = product.find('div', {'class': 'market over'}).find('a').text - productVendor = cleanString(productVendor.strip()) - vendor.append(productVendor) - - image_vendor.append('-1') - - rating_vendor.append('-1') - #rating_item.append('-1') - - rating = product.find('div', {'class': 'rating_star_yellow'}).attrs.get('style') - rating 
= rating.replace("width: ", "") - rating_item.append(cleanString(rating)) - - success.append('-1') - - - # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) - - - - - -#called by the crawler to get description links on a listing page -#@param: beautifulsoup object that is using the correct html page (listing page) -#return: list of description links from a listing page -def thiefworld_links_parser(soup): - - # Returning all links that should be visited by the Crawler - - href = [] - listing = soup.find('div', {"class": "row tile__list tileitems_filter pad15 tileproduct__list"}).findAll('div', {"class": "desc"}) - - for a in listing: - bae = a.find('div', {"class": "title"}).find('a', href=True) - link = bae['href'] - href.append(link) - - return href \ No newline at end of file diff --git a/MarketPlaces/TorBay/crawler_selenium.py b/MarketPlaces/TorBay/crawler_selenium.py deleted file mode 100644 index 18a04be..0000000 --- a/MarketPlaces/TorBay/crawler_selenium.py +++ /dev/null @@ -1,268 +0,0 @@ -__author__ = 'Helium' - -''' -TorBay Market Forum Crawler (Selenium) -''' - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary -from selenium.webdriver.firefox.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.common.by import By - -from PIL import Image -import urllib.parse as urlparse -import os, time -from datetime import date -import subprocess -import configparser -import subprocess -from bs4 import BeautifulSoup -from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.TorBay.parser import torbay_links_parser -from MarketPlaces.Utilities.utilities import cleanHTML - -counter = 1 -baseURL = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion/' - - -# Opens Tor Browser, crawls the website, then parses, then closes tor -#acts like the main method for the crawler, another function at the end of this code calls this function later -def startCrawling(): - mktName = getMKTName() - driver = getAccess() - - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closeDriver(driver) - - new_parse(mktName, baseURL, True) - - -# Returns the name of the website -#return: name of site in string type -def getMKTName(): - name = 'TorBay' - return name - - -# Return the base link of the website -#return: url of base site in string type -def getFixedURL(): - url = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion/' - return url - - -# Closes Tor Browser -#@param: current selenium driver -def closeDriver(driver): - # global pid - # os.system("taskkill /pid " + str(pro.pid)) - # os.system("taskkill /t /f /im tor.exe") - print('Closing Tor...') - driver.close() - time.sleep(3) - return - - -# Creates FireFox 'driver' and configure its 'Profile' -# to use Tor proxy and socket -def createFFDriver(): - from MarketPlaces.Initialization.markets_mining import config - - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - - 
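The description and listing parsers above keep only the base64 payload of each data-URI image (src.split('base64,')[-1]). Decoding that payload back to raw bytes, for example to write a product image to disk, is a short step; a hedged sketch, with the output path purely illustrative:

import base64

def save_image_payload(b64_payload: str, out_path: str) -> None:
    # The parsers store "-1" when no image was found; skip those rows.
    if b64_payload == "-1":
        return
    with open(out_path, "wb") as f:
        f.write(base64.b64decode(b64_payload))

# e.g. save_image_payload(image, "product.png")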
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 3) - ff_prof.set_preference("browser.download.folderList", 2) - ff_prof.set_preference("browser.download.manager.showWhenStarting", False) - ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") - ff_prof.set_preference('network.proxy.type', 1) - ff_prof.set_preference("network.proxy.socks_version", 5) - ff_prof.set_preference('network.proxy.socks', '127.0.0.1') - ff_prof.set_preference('network.proxy.socks_port', 9150) - ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", True) - ff_prof.update_preferences() - - service = Service(config.get('TOR', 'geckodriver_path')) - - driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - - driver.maximize_window() - - return driver - - -#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' -#return: return the selenium driver or string 'down' -def getAccess(): - url = getFixedURL() - driver = createFFDriver() - try: - driver.get(url) - return driver - except: - driver.close() - return 'down' - - -# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha -# then allows for manual solving of captcha in the terminal -#@param: current selenium web driver -def login(driver): - # wait for page to show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[2]/div/div/div/ul/li[6]/a"))) - - -# Saves the crawled html page, makes the directory path for html pages if not made -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - open(filePath, 'wb').write(cleanPage.encode('utf-8')) - return - - -# Gets the full path of the page to be saved along with its appropriate file name -#@param: raw url as crawler crawls through every site -def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") - fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') - return fullPath - - -# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned -#@param: raw url as crawler crawls through every site -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if (name == ''): - name = str(counter) - counter = counter + 1 - return name 
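getFullPathName above mixes os.path.join with hard-coded backslash separators (r'\\Description\\'), which ties the saved-page layout to Windows. A portable sketch of the same directory layout with pathlib; shared_folder, market name, and date are passed in here instead of being read from the repo's config, purely for illustration:

from pathlib import Path

def full_path_name(shared_folder: str, market: str, current_date: str,
                   file_name: str, is_description: bool) -> Path:
    # <shared>/MarketPlaces/<Market>/HTML_Pages/<date>/(Description|Listing)/<name>.html
    kind = "Description" if is_description else "Listing"
    path = (Path(shared_folder) / "MarketPlaces" / market / "HTML_Pages"
            / current_date / kind / (file_name + ".html"))
    path.parent.mkdir(parents=True, exist_ok=True)
    return path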
- - -# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list -#in this example, there are a couple of categories some threads fall under such as -# Guides and Tutorials, Digital Products, and Software and Malware -#as you can see they are categories of products -def getInterestedLinks(): - links = [] - - # Hacking - links.append('http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion/category/hacking') - - return links - - -# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through -#topic and description pages are crawled through here, where both types of pages are saved -#@param: selenium driver -def crawlForum(driver): - print("Crawling the TorBay Market") - - linksToCrawl = getInterestedLinks() - - i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) - try: - has_next_page = True - count = 0 - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(driver, html, link) - - list = productPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver, driver.page_source, item) - driver.back() - - # # comment out - # break - # - # # comment out - # if count == 1: - # break - - try: - nav = driver.find_element(by=By.XPATH, value='/html/body/section/div/div/div[2]/div/div[2]/ul') - link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href') - if link == "": - raise NoSuchElementException - count += 1 - - except NoSuchElementException: - has_next_page = False - - except Exception as e: - print(link, e) - i += 1 - - print("Crawling the TorBay market done.") - - -# Returns 'True' if the link is a description link -#@param: url of any url crawled -#return: true if is a description page, false if not -def isDescriptionLink(url): - if 'product' in url: - return True - return False - - -# Returns True if the link is a listingPage link -#@param: url of any url crawled -#return: true if is a Listing page, false if not -def isListingLink(url): - if 'category' in url: - return True - return False - - -# calling the parser to define the links, the html is the url of a link from the list of interested link list -#@param: link from interested link list ie. getInterestingLinks() -#return: list of description links that should be crawled through -def productPages(html): - soup = BeautifulSoup(html, "html.parser") - return torbay_links_parser(soup) - - -def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... 
DONE!") diff --git a/MarketPlaces/TorBay/parser.py b/MarketPlaces/TorBay/parser.py deleted file mode 100644 index 69d2cfb..0000000 --- a/MarketPlaces/TorBay/parser.py +++ /dev/null @@ -1,183 +0,0 @@ -__author__ = 'DarkWeb' - -# Here, we are importing the auxiliary functions to clean or convert data -from MarketPlaces.Utilities.utilities import * -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup - - -#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs -#stores info it needs in different lists, these lists are returned after being organized -#@param: soup object looking at html page of description page -#return: 'row' that contains a variety of lists that each hold info on the description page -def torbay_description_parser(soup): - - # Fields to be parsed - - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo - image = "-1" # 19 Product_Image - vendor_image = "-1" # 20 Vendor_Image - - # Finding Product Name - try: - product_name = soup.find('div', {'class': 'product-information'}).find('h1').text - name = cleanString(product_name.strip()) - except: - product_name = soup.find('div', {'class': 'profile-info'}).find('h2').text - name = cleanString(product_name.strip()) - - # Finding Vendor FIx - vendor_name = soup.find('div', {"class": "profile-info"}).find('h2').text - vendor = cleanString(vendor_name.strip()) - - # Finding Vendor Image - vendor_image = soup.find('div', {'class': 'avatar'}).find('img') - vendor_image = vendor_image.get('src') - vendor_image = vendor_image.split('base64,')[-1] - - # Finding Prices - USD = soup.find('div', {'class': "total-price"}).find('span').text.strip() - - # Finding the Product Category - cat = soup.find('div', {'class': "profile-info"}).find('p').text - category = cleanString(cat.strip()) - - # Finding the Product description - try: - describe = soup.find('div', {'class': "info"}).find('p').text - if "\n" in describe: - describe = describe.replace("\n", " ") - describe = describe.replace("\r", " ") - describe = cleanString(describe.strip()) - except: - # print("product desc") - describe = soup.find('div', {'class': 'info'}).text - describe = cleanString(describe.strip()) - - # Finding Product Image - image = soup.find('div', {'class': 'image text-center'}).find('img') - image = image.get('src') - image = image.split('base64,')[-1] - - # Populating the final variable (this should be a list with all fields scraped) - row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) - - # Sending the results - return row - - -#parses listing 
pages, so takes html pages of listing pages using soup object, and parses it for info it needs -#stores info it needs in different lists, these lists are returned after being organized -#@param: soup object looking at html page of listing page -#return: 'row' that contains a variety of lists that each hold info on the listing page -def torbay_listing_parser(soup): - - # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "TorBay" # 0 *Marketplace_Name - vendor = [] # 1 *Vendor y - rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) - category = [] # 7 Product_Category y - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice y - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft =[] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - image = [] # 20 Product_Image - image_vendor = [] # 21 Vendor_Image - href = [] # 22 Product_Links - - listing = soup.findAll('div', {"class": "product-card"}) - - # Populating the Number of Products - nm = len(listing) - - for a in listing: - - product_name = a.find('p', {'class': 'name'}).text - name.append(cleanString(product_name.strip())) - - # Finding Product Image - image.append("-1") - - prod = a.find('p', {'class': 'price'}).text # price - USD.append(cleanString(prod.strip())) - - ven = a.find('div', {'class': 'pc-footer'}).find('div').find('a').text # pc-footer - vendor.append(cleanString(ven.strip())) - # print(ven) - - # Finding Vendor Image - image_vendor.append("-1") - - h = a.find('p', {'class': 'name'}).find('a').get('href') - href.append(h) - - CVE.append("-1") - MS.append("-1") - rating_vendor.append("-1") - success.append("-1") - describe.append("-1") - views.append("-1") - reviews.append("-1") - rating_item.append("-1") - addDate.append("-1") - BTC.append("-1") - EURO.append("-1") - sold.append("-1") - qLeft.append("-1") - shipFrom.append("-1") - shipTo.append("-1") - category.append("Hacking") - - # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) - - -#called by the crawler to get description links on a listing page -#@param: beautifulsoup object that is using the correct html page (listing page) -#return: list of description links from a listing page -def torbay_links_parser(soup): - - # Returning all links that should be visited by the Crawler - - href = [] - listing = soup.find('section', {"id": "content"}).findAll('div', {"class": "product-card"}) - - for a in listing: - bae = a.find('div', {"class": "pc-footer"}).find('a', {"class": "btn btn-primary"}, href=True) - link = bae['href'] - href.append(link) - - return href \ No newline at end of file diff --git a/MarketPlaces/TorMarket/crawler_selenium.py b/MarketPlaces/TorMarket/crawler_selenium.py deleted file mode 100644 index 7021abc..0000000 --- a/MarketPlaces/TorMarket/crawler_selenium.py +++ 
/dev/null @@ -1,277 +0,0 @@ -__author__ = 'Helium' - -''' -TorMarket Forum Crawler (Selenium) -''' - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary -from selenium.webdriver.firefox.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.common.by import By - -from PIL import Image -import urllib.parse as urlparse -import os, re, time -from datetime import date -import subprocess -import configparser -from bs4 import BeautifulSoup -from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.TorMarket.parser import tormarket_links_parser -from MarketPlaces.Utilities.utilities import cleanHTML - -counter = 1 -baseURL = 'http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/' - - -# Opens Tor Browser, crawls the website, then parses, then closes tor -#acts like the main method for the crawler, another function at the end of this code calls this function later -def startCrawling(): - mktName = getMKTName() - driver = getAccess() - - if driver != 'down': - try: - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closeDriver(driver) - - new_parse(mktName, baseURL, True) - - -# Returns the name of the website -#return: name of site in string type -def getMKTName(): - name = 'TorMarket' - return name - - -# Return the base link of the website -#return: url of base site in string type -def getFixedURL(): - url = 'http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/' - return url - - -# Closes Tor Browser -#@param: current selenium driver -def closeDriver(driver): - # global pid - # os.system("taskkill /pid " + str(pro.pid)) - # os.system("taskkill /t /f /im tor.exe") - print('Closing Tor...') - driver.close() - time.sleep(3) - return - - -# Creates FireFox 'driver' and configure its 'Profile' -# to use Tor proxy and socket -def createFFDriver(): - from MarketPlaces.Initialization.markets_mining import config - - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - - ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - ff_prof.set_preference("places.history.enabled", False) - ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - ff_prof.set_preference("signon.rememberSignons", False) - ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - # ff_prof.set_preference("network.dns.disablePrefetch", True) - # ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 3) - ff_prof.set_preference("browser.download.folderList", 2) - ff_prof.set_preference("browser.download.manager.showWhenStarting", False) - ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") - ff_prof.set_preference('network.proxy.type', 1) - ff_prof.set_preference("network.proxy.socks_version", 5) - ff_prof.set_preference('network.proxy.socks', '127.0.0.1') - ff_prof.set_preference('network.proxy.socks_port', 9150) - ff_prof.set_preference('network.proxy.socks_remote_dns', True) - 
ff_prof.set_preference("javascript.enabled", False) - ff_prof.update_preferences() - - service = Service(config.get('TOR', 'geckodriver_path')) - - driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - - driver.maximize_window() - - return driver - - -#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' -#return: return the selenium driver or string 'down' -def getAccess(): - url = getFixedURL() - driver = createFFDriver() - try: - driver.get(url) - return driver - except: - driver.close() - return 'down' - - -# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha -# then allows for manual solving of captcha in the terminal -#@param: current selenium web driver -def login(driver): - # wait for page to show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[2]/div/div/div/main/article/div/section[4]/div/div[1]/div/div/div/div/ul/li[15]/ul/li[3]/a"))) - -# Saves the crawled html page, makes the directory path for html pages if not made -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) - filePath = getFullPathName(url) - os.makedirs(os.path.dirname(filePath), exist_ok=True) - open(filePath, 'wb').write(cleanPage.encode('utf-8')) - return - - -# Gets the full path of the page to be saved along with its appropriate file name -#@param: raw url as crawler crawls through every site -def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") - fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') - return fullPath - - -# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned -#@param: raw url as crawler crawls through every site -def getNameFromURL(url): - global counter - name = ''.join(e for e in url if e.isalnum()) - if (name == ''): - name = str(counter) - counter = counter + 1 - return name - - -# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list -#in this example, there are a couple of categories some threads fall under such as -# Guides and Tutorials, Digital Products, and Software and Malware -#as you can see they are categories of products -def getInterestedLinks(): - links = [] - - # Tutorials - links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/guides-tutorials/') - # Malware - links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/malware/') - # Services - links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/') - - return links - - -# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through -#topic and description pages are crawled through here, where both types of pages are saved -#@param: selenium driver -def crawlForum(driver): - print("Crawling the TorMarket market") - - linksToCrawl = getInterestedLinks() - - i = 0 - while i < 
len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) - try: - has_next_page = True - count = 0 - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(driver, html, link) - - list = productPages(html) - for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) - try: - driver.get(itemURL) - except: - driver.refresh() - savePage(driver, driver.page_source, item) - driver.back() - - # # comment out - # break - # - # # comment out - # if count == 1: - # break - - try: - link = driver.find_element(by=By.LINK_TEXT, value='NEXT').get_attribute('href') - if link == "": - raise NoSuchElementException - count += 1 - - except NoSuchElementException: - has_next_page = False - - except Exception as e: - print(link, e) - i += 1 - - print("Crawling the TorMarket market done.") - - -# Returns 'True' if the link is a description link -#@param: url of any url crawled -#return: true if is a description page, false if not -def isDescriptionLink(url): - if 'shop' in url: - return True - return False - - -# Returns True if the link is a listingPage link -#@param: url of any url crawled -#return: true if is a Listing page, false if not -def isListingLink(url): - if 'product-category' in url: - return True - return False - - -# calling the parser to define the links, the html is the url of a link from the list of interested link list -#@param: link from interested link list ie. getInterestingLinks() -#return: list of description links that should be crawled through -def productPages(html): - soup = BeautifulSoup(html, "html.parser") - return tormarket_links_parser(soup) - - -# Drop links that "signout" -# def isSignOut(url): -# #absURL = urlparse.urljoin(url.base_url, url.url) -# if 'signout' in url.lower() or 'logout' in url.lower(): -# return True -# -# return False - - -def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... 
DONE!") diff --git a/MarketPlaces/TorMarket/parser.py b/MarketPlaces/TorMarket/parser.py deleted file mode 100644 index 6a6fac0..0000000 --- a/MarketPlaces/TorMarket/parser.py +++ /dev/null @@ -1,189 +0,0 @@ -__author__ = 'DarkWeb' - -# Here, we are importing the auxiliary functions to clean or convert data -from MarketPlaces.Utilities.utilities import * - -# Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup - -import re - -#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs -#stores info it needs in different lists, these lists are returned after being organized -#@param: soup object looking at html page of description page -#return: 'row' that contains a variety of lists that each hold info on the description page -def tormarket_description_parser(soup): - - # Fields to be parsed - - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo - image = "-1" # 19 Product_Image - vendor_image = "-1" # 20 Vendor_Image - - #finding the name of the product - name_of_product = soup.find("h1", {"class": "product_title entry-title"}).find("a").text - name = cleanString(name_of_product.strip()) - - #finding the description of the product - description_of_product = soup.find("div", {"class": "woocommerce-product-details__short-description"}).text - describe = cleanString(description_of_product.strip()) - - #finding the name of the vendor - name_of_vendor = soup.find("div", {"class": "wcfmmp_sold_by_store"}) - if name_of_vendor is not None: - name_of_vendor = name_of_vendor.find("a").text - vendor = cleanString(name_of_vendor.strip()) - else: - vendor = "TorMarket" - - #finding the price of the item - price = soup.find("p", {"class": "price"}).find("bdi").text - price_cleaned = price[1:] - USD = price_cleaned.strip() - - category = soup.find('span', {"class": "posted_in"}).text - category = category.split(':')[-1] - category = category.replace(',', '/') - category = cleanString(category.strip()) - #everything else gets a -1 because they are not found - - # Populating the final variable (this should be a list with all fields scraped) - row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) - - # Sending the results - return row - - -#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs -#stores info it needs in different lists, these lists are returned after being organized -#@param: soup object looking at html page of listing page -#return: 'row' that contains a variety of lists that each hold info on the listing page -def 
tormarket_listing_parser(soup): - - # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "TorMarket" # 0 *Marketplace_Name - vendor = [] # 1 *Vendor y - rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this - MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this - category = [] # 7 Product_Category y - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice y - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft = [] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - image = [] # 20 Product_Image - image_vendor = [] # 21 Vendor_Image - href = [] # 22 Product_Links - - products_list = soup.find('ul', {"class": "products columns-3 tablet-columns-2 mobile-columns-1"}).find_all('li') - nm = len(products_list) - - for product in products_list: - # Finding the name of the product - name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text - name_of_product_cleaned = cleanString(name_of_product.strip()) - # print(name_of_product_cleaned) - name.append(name_of_product_cleaned) - #finding the URL - try: - url = product.find("div", {"class": "product-loop-content text-center"}).find("a").get("href") - # print(url) - href.append(url) - except AttributeError as e: - print("I can't find the link") - raise e - - #finding the rating of the product - rating_score_of_product = product.find("div", {"class": "product-loop-content text-center"}).find("div").find("span").text - rating_item.append(cleanString(rating_score_of_product.strip())) - # print("done") - #finding the rating of the vendors - rating_score_of_vendor = product.find("div", {"class": "wcfmmp-store-rating"}) - if rating_score_of_vendor is not None: - rating_score_of_vendor = rating_score_of_vendor.find("strong").text - rating_vendor.append(cleanString(rating_score_of_vendor.strip())) - else: - rating_vendor.append('-1') - # print("done") - #finding the cost in USD - cost = product.find("span", {"class": "woocommerce-Price-amount amount"}).text - USD.append(cost) - # print("done") - #finding the name of the vendor - vendor_name = product.find("div", {"class": "wcfmmp_sold_by_wrapper"}) - if vendor_name is not None: - vendor_name = vendor_name.find("a").text - vendor.append(cleanString(vendor_name.strip())) - else: - vendor.append(mktName) - # print("done") - #everything else appends a -1 - success.append("-1") - CVE.append("-1") - MS.append("-1") - category.append("-1") - describe.append("-1") - views.append("-1") - reviews.append("-1") - addDate.append("-1") - BTC.append("-1") - EURO.append("-1") - sold.append("-1") - qLeft.append("-1") - shipFrom.append("-1") - shipTo.append("-1") - # print("Done! 
moving onto the next product!") - # print(len(shipTo)) - - - # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) - - -#called by the crawler to get description links on a listing page -#@param: beautifulsoup object that is using the correct html page (listing page) -#return: list of description links from a listing page -def tormarket_links_parser(soup): - - # Returning all links that should be visited by the Crawler - - href = [] - listing = soup.findAll('div', {"class": "product-loop-content text-center"}) - - for a in listing: - bae = a.find('h2', {"class": "woocommerce-loop-product__title"}).find('a', href=True) - link = bae['href'] - href.append(link) - - return href \ No newline at end of file diff --git a/MarketPlaces/HiddenMarket/crawler_selenium.py b/MarketPlaces/WeTheNorth/crawler_selenium.py similarity index 63% rename from MarketPlaces/HiddenMarket/crawler_selenium.py rename to MarketPlaces/WeTheNorth/crawler_selenium.py index 533129a..c6d5b70 100644 --- a/MarketPlaces/HiddenMarket/crawler_selenium.py +++ b/MarketPlaces/WeTheNorth/crawler_selenium.py @@ -1,7 +1,7 @@ __author__ = 'DarkWeb' ''' -HiddenMarket Market Crawler (Selenium) +WeTheNorth Market Crawler (Selenium) ''' from selenium import webdriver @@ -16,20 +16,20 @@ from PIL import Image import urllib.parse as urlparse import os, re, time +from datetime import date import subprocess -import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.HiddenMarket.parser import hiddenmarket_links_parser +from MarketPlaces.WeTheNorth.parser import wethenorth_links_parser from MarketPlaces.Utilities.utilities import cleanHTML counter = 1 -baseURL = 'http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/' +baseURL = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion' # Opens Tor Browser, crawls the website def startCrawling(): - marketName = getMKTName() + marketName = getMarketName() driver = getAccess() if driver != 'down': @@ -40,39 +40,40 @@ def startCrawling(): print(driver.current_url, e) closeDriver(driver) - new_parse(marketName, baseURL, True) + new_parse(marketName, False) # Login using premade account credentials and do login captcha manually def login(driver): - # wait for login page + time.sleep(3) + #wait for login page WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[3]/div[3]"))) + (By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[1]/input"))) - # entering username and password into input boxes - # usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') - # Username here - # usernameBox.send_keys('ct1234') - # passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') - # Password here - # passwordBox.send_keys('DementedBed1230') + #entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[1]/input') + #Username here + usernameBox.send_keys('blabri') + passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[2]/input') + #Password here + passwordBox.send_keys('fishowal') ''' # wait for captcha page show up WebDriverWait(driver, 
100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/main/div/div/div/div/div/form/div[3]/div/div[1]/label/img"))) + (By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img"))) # save captcha to local - driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[3]/div/div[1]/label/img').screenshot( - r'..\captcha.png') + driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img').screenshot( + r'..\WeTheNorth\captcha.png') # This method will show image in any image viewer - im = Image.open(r'..\captcha.png') + im = Image.open(r'..\WeTheNorth\captcha.png') im.show() # wait until input space show up - inputBox = driver.find_element(by=By.XPATH, value='//*[@id="captcha"]') + inputBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[4]/input') # ask user input captcha solution in terminal userIn = input("Enter solution: ") @@ -81,24 +82,24 @@ def login(driver): inputBox.send_keys(userIn) # click the verify(submit) button - driver.find_element(by=By.XPATH, value="/html/body/main/div/div/div/div/div/form/div[4]/button").click() + driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div[2]/div/div[3]/form/div[5]/input").click() ''' - # input("Press ENTER when CAPTCHA is completed\n") + input("Press ENTER when CAPTCHA is completed\n") # wait for listing page show up (This Xpath may need to change based on different seed url) - # WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - # (By.XPATH, '/html/body/main/div/div/div[1]/div/div[1]/div/h5'))) + WebDriverWait(driver, 50).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="information"]'))) # Returns the name of the website -def getMKTName(): - name = 'HiddenMarket' +def getMarketName(): + name = 'WeTheNorth' return name # Return the link of the website def getFixedURL(): - url = 'http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/' + url = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion' return url @@ -109,7 +110,7 @@ def closeDriver(driver): # os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /t /f /im tor.exe") print('Closing Tor...') - driver.quit() + driver.close() time.sleep(3) return @@ -140,7 +141,7 @@ def createFFDriver(): ff_prof.set_preference('network.proxy.socks', '127.0.0.1') ff_prof.set_preference('network.proxy.socks_port', 9150) ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", False) + ff_prof.set_preference("javascript.enabled", True) ff_prof.update_preferences() service = Service(config.get('TOR', 'geckodriver_path')) @@ -198,34 +199,19 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # Civil Software - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/civil_softwares') - # Tutorials - Carding - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/carding') - # Digital - Hacks - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/hacks') - # Digital - Exploit Kit - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/exploit_kit') - # 0Day - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/0day') - # Digital Forensics - 
links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/digital_forensics') - # Tutorials - Mining - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/mining') - # Tutorials - Worms - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/worms') - # Tutorials - Viruses - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/viruses') - # Tutorials - Trojans - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/trojans') - # Tutorials - Botnets - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/botnets') + # # Fraud Software + # links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=5&podcategory=3') + # # Guides and Tutorials - Hacking + # links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=3&podcategory=3') + # Software and Malware + links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=10') + return links def crawlForum(driver): - print("Crawling the HiddenMarket market") + print("Crawling the WeTheNorth market") linksToCrawl = getInterestedLinks() @@ -233,20 +219,15 @@ def crawlForum(driver): while i < len(linksToCrawl): link = linksToCrawl[i] print('Crawling :', link) - categoryLink = link - maxNumPages = 0 # temp value. + try: + pg_counter = 1 has_next_page = True count = 0 - pageCount = 1 while has_next_page: try: driver.get(link) - if pageCount == 1: - maxNumPages = int(driver.find_element(by=By.CLASS_NAME, value='main') - .find_element(by=By.CLASS_NAME, value='pages') - .find_elements(By.CLASS_NAME, value='page')[-1].text) except: driver.refresh() html = driver.page_source @@ -262,19 +243,22 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # # comment out - # break - # - # # comment out - # if count == 1: - # break + # comment out + break + + # comment out + if count == 1: + break try: - pageCount += 1 - if pageCount > maxNumPages: + nav = driver.find_element(by=By.XPATH, value= + '/html/body/div[2]/div[3]/div[3]/div[2]/div[7]') + pg_counter += 1 + pg_counter_str = "p=" + str(pg_counter) + "&" + a = nav.find_element(by=By.XPATH, value = '//a[contains(@href,"'+pg_counter_str+'")]') + link = a.get_attribute('href') + if link == "": raise NoSuchElementException - pageLink = "/" + str(pageCount) + "/" - link = categoryLink + pageLink count += 1 except NoSuchElementException: @@ -284,7 +268,7 @@ def crawlForum(driver): print(link, e) i += 1 - print("Crawling the HiddenMarket market done.") + input("Crawling WeTheNorth market done sucessfully. 
Press ENTER to continue\n") # Returns 'True' if the link is Topic link @@ -304,7 +288,7 @@ def isListingLink(url): # calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") - return hiddenmarket_links_parser(soup) + return wethenorth_links_parser(soup) def crawler(): diff --git a/MarketPlaces/WeTheNorth/parser.py b/MarketPlaces/WeTheNorth/parser.py new file mode 100644 index 0000000..56a42ec --- /dev/null +++ b/MarketPlaces/WeTheNorth/parser.py @@ -0,0 +1,248 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +from MarketPlaces.Utilities.utilities import * + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + + +# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) +def wethenorth_description_parser(soup): + + # Fields to be parsed + + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + + # Finding Product Name + listDes = soup.find('div', {'class': "listDes"}) + name = listDes.find('h2').text + name = name.replace('\n', ' ') + name = name.replace(",", "") + name = name.strip() + + # Finding Vendor + vendor = listDes.find('b').text + vendor = vendor.replace(",", "") + vendor = vendor.replace("...", "") + vendor = vendor.replace("-", "") + vendor = vendor.strip() + + # Finding Vendor Rating + # rating = listDes.find('span',{'class':'levelSet'}) + # rating = rating.text + # rating = rating.replace('\n', ' ') + # rating = rating.replace(",", "") + # rating = rating.strip() + + # Finding Successful Transactions + success = listDes.find_all('p')[1] + success = success.find('span').text + success = success.split() + success = success[0].strip() + + # Finding Prices - all prices in We The North are in CAD, I left the CAD in the resulting String so that it would show CAD for all prices + padp = listDes.find('p',{'class':'padp'}) + USD = padp.find('span').text + USD = USD.strip() + + # Finding Escrow - no escrow on WTN market + + shipping_info = listDes.find('tbody') + if "Digital" not in shipping_info: + shipping_info = shipping_info.find_all('tr') + row1 = shipping_info[0].find_all('td') + + # Finding Shipment Information (Origin) + shipFrom = row1[-1].text + shipFrom=shipFrom.strip() + if shipFrom=="": + shipFrom="-1" + + row2 = shipping_info[1].find_all('td') + + # Finding Shipment Information (Destination) + shipTo = row2[-1].text + shipTo= shipTo.strip() + if shipTo == "": + shipTo = "-1" + + # Finding the Product description + describe = soup.find("div",{'class':'tabcontent'}) + describe = describe.find('p').text + describe = describe.replace("\n", " ") + describe = describe.replace("\r", " ") + describe = 
describe.strip() + + ''' + # Finding the Number of Product Reviews + tag = soup.findAll(text=re.compile('Reviews')) + for index in tag: + reviews = index + par = reviews.find('(') + if par >=0: + reviews = reviews.replace("Reviews (","") + reviews = reviews.replace(")","") + reviews = reviews.split(",") + review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) + else : + review = "-1" + ''' + + # Searching for CVE and MS categories + # no CVE or MS for WTN market + + # Populating the final variable (this should be a list with all fields scraped) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo) + + # Sending the results + return row + + +# This is the method to parse the Listing Pages +def wethenorth_listing_parser(soup): + + # Fields to be parsed + nm = 0 # *Total_Products (Should be Integer) + mktName = "WeTheNorth" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = [] # 6 Product_MS_Classification (Microsoft Security) + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft =[] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + href = [] # 20 Product_Links + + right_content = soup.find('div', {"class": "right-content"}) + listing = right_content.findAll('div', {"class": "col-1search"}) + listing = listing[3:] + + # Populating the Number of Products + nm = len(listing) + + for a in listing: + bae = a.findAll('a', href=True) + + # Adding the url to the list of urls + link = bae[0].get('href') + link = cleanLink(link) + href.append(link) + + # Finding the Vendor + vendor_name = a.find('p', {'class': 'padp'}) + vendor_name = vendor_name.find('a').text + vendor_name = vendor_name.replace(",", "") + vendor_name = vendor_name.strip() + vendor.append(vendor_name) + + # Finding the Product + product = bae[0].text + product = product.replace('\n', ' ') + product = product.replace(",", "") + product = product.strip() + name.append(product) + + # Finding the Category + category_name = a.find('p', {'class': 'padp'}).text + first_dash = category_name.find('-') + second_dash = category_name[first_dash+1:].find('-') + category_name = category_name[first_dash+1:second_dash] + category_name=category_name.strip() + category.append(category_name) + + # Finding Views + view_count = a.text + view_count = view_count[view_count.find('Views:'): view_count.find('Sales:')] + view_count = view_count.replace('Views:', ' ') + view_count = view_count.replace('/', ' ') + view_count = view_count.strip() + views.append(view_count) + + # Finding success sales + sold_count = a.text + sold_count = sold_count[sold_count.find('Sales:'): sold_count.find('Short')] + sold_count = sold_count.replace('Sales:', ' ') + sold_count = sold_count.replace('/', ' ') + sold_count = sold_count.strip() + success.append(sold_count) + + # Searching for CVE and MS categories + # no CVE or MS in WTN market + cve = 
a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) + if not cve: + cveValue="-1" + else: + cee = " " + for idx in cve: + cee += (idx) + cee += " " + cee = cee.replace(',', ' ') + cee = cee.replace('\n', '') + cveValue=cee + CVE.append(cveValue) + + ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) + if not ms: + MSValue="-1" + else: + me = " " + for im in ms: + me += (im) + me += " " + me = me.replace(',', ' ') + me = me.replace('\n', '') + MSValue=me + MS.append(MSValue) + + # Populate the final variable (this should be a list with all fields scraped) + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + + +def wethenorth_links_parser(soup): + + # Returning all links that should be visited by the Crawler + href = [] + right_content = soup.find('div',{"class": "right-content"}) + listing = right_content.findAll('div', {"class": "col-1search"}) + #cut out the irrelevant products that are in blue, the first three products of each page usually unrelated + listing = listing[3:] + for a in listing: + + link = a.find('a') + link = link['href'] + href.append(link) + + return href \ No newline at end of file
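Reviewer notes on the patch above.

torbay_description_parser, tormarket_description_parser, and the new wethenorth_description_parser all follow the same convention: every field starts at the sentinel "-1" and is only overwritten when the page actually exposes it, with bare try/except or None checks around each lookup. A small helper would keep that pattern in one place; the sketch below is illustrative only (safe_text is hypothetical and not part of MarketPlaces.Utilities.utilities).

# Hypothetical helper, not part of MarketPlaces.Utilities.utilities.
# Returns the cleaned text of the first tag matching (name, attrs),
# or the "-1" sentinel used throughout these parsers when the tag is missing.
from bs4 import BeautifulSoup


def safe_text(soup, name, attrs=None, default="-1"):
    tag = soup.find(name, attrs=attrs or {})
    if tag is None:
        return default
    return tag.text.replace("\n", " ").replace("\r", " ").strip()


if __name__ == "__main__":
    html = "<div class='product-information'><h1>  Example item \n</h1></div>"
    soup = BeautifulSoup(html, "html.parser")
    print(safe_text(soup, "div", {"class": "product-information"}))  # Example item
    print(safe_text(soup, "div", {"class": "missing"}))              # -1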
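The deleted TorMarket crawler and the renamed WeTheNorth crawler both build the Firefox profile by hand: SOCKS5 proxy on 127.0.0.1:9150 with remote DNS, history and cookies disabled, images blocked. The same proxy setup expressed with Selenium 4 Options is sketched below; the binary and geckodriver paths are placeholders and only the proxy-related preferences from this patch are reproduced.

# Sketch only: paths are placeholders, and only the Tor-proxy preferences
# used by the crawlers in this patch are reproduced here.
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service


def create_tor_driver(binary_path, geckodriver_path):
    opts = Options()
    opts.binary_location = binary_path
    # Route all traffic through the local Tor SOCKS proxy, resolving DNS remotely.
    opts.set_preference("network.proxy.type", 1)
    opts.set_preference("network.proxy.socks", "127.0.0.1")
    opts.set_preference("network.proxy.socks_port", 9150)
    opts.set_preference("network.proxy.socks_version", 5)
    opts.set_preference("network.proxy.socks_remote_dns", True)
    # Skip image downloads, as the original crawlers do.
    opts.set_preference("permissions.default.image", 3)
    return webdriver.Firefox(service=Service(geckodriver_path), options=opts)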
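The pagination change in crawlForum is the part most likely to break when the markup shifts: HiddenMarket's page-count arithmetic is replaced by a lookup for the anchor whose href contains "p=<next page>&" inside a hard-coded navigation XPath. A condensed version of that lookup is sketched below; the navigation XPath is taken from the diff, the function wrapper around it is illustrative. Separately, the WeTheNorth startCrawling calls new_parse(marketName, False) while the other crawlers in this patch call new_parse(name, baseURL, True), so the expected signature in prepare_parser is worth confirming before merging; the previously commented-out break statements are now active, which stops each category after roughly two listing pages; and the closing input("Crawling WeTheNorth market done sucessfully. ...") prompt blocks unattended runs and carries a typo.

# Illustrative sketch of the next-page lookup used by the WeTheNorth crawler.
# 'driver' is an already-authenticated WebDriver sitting on a listing page.
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By


def next_page_link(driver, page_counter):
    # Returns the href of the next listing page, or None when there is none.
    try:
        nav = driver.find_element(
            by=By.XPATH, value="/html/body/div[2]/div[3]/div[3]/div[2]/div[7]")
        marker = "p=" + str(page_counter + 1) + "&"
        # As in the original, the //a search is document-wide, not scoped to nav.
        anchor = nav.find_element(
            by=By.XPATH, value='//a[contains(@href,"' + marker + '")]')
        return anchor.get_attribute("href") or None
    except NoSuchElementException:
        return None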
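wethenorth_listing_parser keeps the CVE-/MS-identifier scan used by the other listing parsers. Two small things are worth flagging: the patterns are written as plain strings, where raw strings avoid the "\d" invalid-escape warnings on recent Pythons, and the \d{4} suffix only covers the first four digits of newer CVE IDs such as CVE-2021-44228 (harmless here because the whole matching string is appended, but \d{4,} is safer); recent Beautiful Soup releases also prefer string= over the older text= keyword of findAll. A standalone version of the scan with those adjustments:

# Standalone sketch of the CVE/MS identifier scan from the listing parsers.
# 'card' stands in for one product card (a bs4 Tag); "-1" keeps the
# "not found" convention used throughout this patch.
import re


def scan_identifiers(card):
    cves = card.find_all(string=re.compile(r"CVE-\d{4}-\d{4,}"))
    mss = card.find_all(string=re.compile(r"MS\d{2}-\d{3}"))
    cve_value = " ".join(s.strip() for s in cves) if cves else "-1"
    ms_value = " ".join(s.strip() for s in mss) if mss else "-1"
    return cve_value, ms_value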
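One last plumbing note that applies to several crawlers in this patch: getFullPathName mixes os.path.join with hard-coded backslash fragments (r'\\Listing\\', r'\\Description\\'), which only resolves correctly on Windows. A platform-neutral variant, with the shared-folder root, market name, and date left as placeholders for the values the project reads from its config and CURRENT_DATE, could look like this:

# Platform-neutral sketch of the getFullPathName helper used by the crawlers.
# All arguments are placeholders for values supplied by the project's config.
import os


def full_path_name(shared_folder, market, current_date, file_name, is_description):
    sub_dir = "Description" if is_description else "Listing"
    return os.path.join(shared_folder, "MarketPlaces", market, "HTML_Pages",
                        current_date, sub_dir, file_name + ".html")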