diff --git a/Forums/BestCardingWorld/parser.py b/Forums/BestCardingWorld/parser.py index 7ad385b..4865b65 100644 --- a/Forums/BestCardingWorld/parser.py +++ b/Forums/BestCardingWorld/parser.py @@ -18,15 +18,15 @@ def bestcardingworld_description_parser(soup): topic = "-1" # 0 topic name user = [] # 1 all users of each post - addDate = [] # 2 all dated of each post - feedback = [] # 3 all feedbacks of each user (this was found in just one Forum and with a number format) - status = [] # 4 all user's authority in each post such as (adm, member, dangerous) - reputation = [] # 5 all users's karma in each post (usually found as a number) - sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post) - post = [] # 7 all messages of each post - interest = [] # 8 all user's interest in each post - image = [] - image_user = [] + status = [] # 2 all user's authority in each post such as (adm, member, dangerous) + reputation = [] # 3 all users's karma in each post (usually found as a number) + interest = [] # 4 all user's interest in each post + sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) + post = [] # 6 all messages of each post + feedback = [] # 7 all feedbacks of each user (this was found in just one Forum and with a number format) + addDate = [] # 8 all dated of each post + image_user = [] # 9 all user avatars of each post + image_post = [] # 10 all first images of each post # Finding the topic (should be just one coming from the Listing Page) @@ -157,15 +157,18 @@ def bestcardingworld_description_parser(soup): img = img.get('src').split('base64,')[-1] else: img = "-1" - image.append(img) + image_post.append(img) img = ipost.find('div', {"class": "avatar-container"}).find('img', {"class": "avatar"}) - img = img.get('src').split('base64,')[-1] + if img is not None: + img = img.get('src').split('base64,')[-1] + else: + img = "-1" image_user.append(img) # Populate the final variable (this should be a list with all fields scraped) - row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) + row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) # Sending the results @@ -179,17 +182,18 @@ def bestcardingworld_description_parser(soup): #return: 'row' that contains a variety of lists that each hold info on the listing page def bestcardingworld_listing_parser(soup): - nm = 0 # *this variable should receive the number of topics + nm = 0 # *this variable should receive the number of topics forum = "BestCardingWorld" # 0 *forum name - board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. - # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) - author = [] # 2 *all authors of each topic - topic = [] # 3 *all topics - views = [] # 4 number of views of each topic - posts = [] # 5 number of posts of each topic - href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between - # Listing and Description pages) - addDate = [] # 7 when the topic was created (difficult to find) + board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. + # For instance: Security/Malware/Tools to hack Facebook. 
The board here should be Malware) + author = [] # 2 *all authors of each topic + topic = [] # 3 *all topics + views = [] # 4 number of views of each topic + posts = [] # 5 number of posts of each topic + href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between + # Listing and Description pages) + addDate = [] # 7 when the topic was created (difficult to find) + image_author = [] # 8 all author avatars used in each topic # Finding the board (should be just one) @@ -235,7 +239,6 @@ def bestcardingworld_listing_parser(soup): # Adding the url to the list of urls link = itopic.find('a', {"class": "topictitle"}).get('href') - link = cleanLink(link) href.append(link) # Finding the author of the topic @@ -243,6 +246,8 @@ def bestcardingworld_listing_parser(soup): user = ps.strip() author.append(cleanString(user)) + image_author.append(-1) + # Finding the number of replies post = replies[index].text.split()[0] post = post.strip() @@ -263,7 +268,7 @@ def bestcardingworld_listing_parser(soup): index += 1 - return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate) + return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author) #called by the crawler to get description links on a listing page diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py index e4f6c5d..6cc9c60 100644 --- a/Forums/DB_Connection/db_connection.py +++ b/Forums/DB_Connection/db_connection.py @@ -3,7 +3,7 @@ __author__ = 'DarkWeb' import psycopg2 import traceback from Forums.Utilities.utilities import * - +from dateutil.relativedelta import relativedelta, FR def connectDataBase(): @@ -484,6 +484,28 @@ def create_posts(cur, row, forumId, topicId): 'dateinserted_post': row[8], 'postId': postId}) +def create_status(cur, forumId, date, listings, descriptions, status): + + date = datetime.strptime(date, "%m%d%Y") + + # getting last Fridays a reference date + date_reference = date + relativedelta(weekday=FR(-1)) + + # checking if status already exists + sql = "select * from forums_status where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s" + cur.execute(sql, {'forum_id': forumId, 'date_inserted': date}) + + recset = cur.fetchall() + if recset: + sql = "Update forums_status set listings = %(listings)s, descriptions = %(descriptions)s, status = %(status)s, date_reference = %(date_reference)s " \ + "where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s" + recset = {'listings': listings, 'descriptions': descriptions, 'status': status, 'date_reference': date_reference, 'forum_id': forumId, 'date_inserted': date} + else: + sql = "Insert into forums_status (forum_id, date_inserted, listings, descriptions, status, date_reference) Values (%s, %s, %s, %s, %s, %s)" + recset = [forumId, date, listings, descriptions, status, date_reference] + + cur.execute(sql, recset) + def create_database(cur, con): try: @@ -496,12 +518,18 @@ def create_database(cur, con): sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)" cur.execute(sql) + sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, " \ + "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL, " \ + "constraint forums_status_pk PRIMARY KEY (forum_id, date_inserted), " \ + "constraint forums_status_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))" + cur.execute(sql) + sql = "create table users 
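# NOTE (editor): a minimal sketch, not part of the patch. It assumes the forums_status
# table created in create_database below (primary key on forum_id, date_inserted) and a
# psycopg2 cursor; the helper name create_status_upsert is hypothetical. It shows how the
# select-then-update-or-insert in create_status above could be collapsed into a single
# INSERT ... ON CONFLICT statement, with the reference date computed the same way
# (the last Friday on or before the scrape date).
from datetime import datetime
from dateutil.relativedelta import relativedelta, FR

def create_status_upsert(cur, forumId, date, listings, descriptions, status):
    date = datetime.strptime(date, "%m%d%Y")
    date_reference = date + relativedelta(weekday=FR(-1))  # last Friday, or the same day if it is a Friday

    sql = "insert into forums_status (forum_id, date_inserted, listings, descriptions, status, date_reference) " \
          "values (%(forum_id)s, %(date_inserted)s, %(listings)s, %(descriptions)s, %(status)s, %(date_reference)s) " \
          "on conflict (forum_id, date_inserted) do update set listings = excluded.listings, " \
          "descriptions = excluded.descriptions, status = excluded.status, date_reference = excluded.date_reference"
    cur.execute(sql, {'forum_id': forumId, 'date_inserted': date, 'listings': listings,
                      'descriptions': descriptions, 'status': status, 'date_reference': date_reference})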
(user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \ "255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \ "interest_user character varying(5000) null, signature_user character varying(1000) null, " \ "image_user character varying(10000000) null, dateinserted_user timestamp(6) with time zone NOT NULL, " \ "constraint users_pk primary key (user_id), " \ - "constraint users_forum_id_fkey foreign key (forum_id) references forums (forum_id))" + "constraint users_forum_id_fk foreign key (forum_id) references forums (forum_id))" cur.execute(sql) sql = "create unique index unique_user ON users USING btree (forum_id ASC NULLS LAST, name_user ASC NULLS LAST)" @@ -513,17 +541,17 @@ def create_database(cur, con): "signature_user character varying(1000) null, image_user character varying(10000000) null, " \ "dateinserted_user timestamp(6) with time zone NOT NULL, " \ "constraint users_history_pk primary key (user_id, version_user), " \ - "constraint users_history_user_id_fkey foreign key (user_id) references " \ - "users (user_id), constraint users_history_forum_id_fkey foreign key (forum_id) references forums (forum_id))" + "constraint users_history_user_id_fk foreign key (user_id) references users (user_id), " \ + "constraint users_history_forum_id_fk foreign key (forum_id) references forums (forum_id))" cur.execute(sql) sql = "create table topics(topic_id integer NOT NULL, forum_id integer NOT NULL, author_id integer NOT NULL, " \ "title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, views_topic integer null, " \ "posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) with time zone null, " \ "dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double precision NOT NULL, " \ - "constraint topics_pk primary key (topic_id), constraint topics_author_id_fkey " \ - "foreign key (author_id) references users (user_id), constraint topics_forum_id_fkey foreign key (" \ - "forum_id) references forums (forum_id))" + "constraint topics_pk primary key (topic_id), " \ + "constraint topics_author_id_fk foreign key (author_id) references users (user_id), " \ + "constraint topics_forum_id_fk foreign key (forum_id) references forums (forum_id))" cur.execute(sql) sql = "create unique index unique_topic ON topics USING btree (forum_id ASC NULLS LAST, author_id ASC NULLS LAST, " \ @@ -536,9 +564,9 @@ def create_database(cur, con): "dateadded_topic timestamp(6) with time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, " \ "classification_topic double precision NOT NULL, " \ "constraint topics_history_pk primary key (topic_id, version_topic), " \ - "constraint topics_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \ - "constraint topics_history_author_id_fkey foreign key (author_id) references users (user_id), " \ - "constraint topics_history_board_id_fkey foreign key (forum_id) references forums (forum_id))" + "constraint topics_history_topic_id_fk foreign key (topic_id) references topics (topic_id), " \ + "constraint topics_history_author_id_f foreign key (author_id) references users (user_id), " \ + "constraint topics_history_board_id_fk foreign key (forum_id) references forums (forum_id))" cur.execute(sql) sql = "create table posts(post_id integer NOT NULL, topic_id integer NOT NULL, " \ @@ -546,8 +574,8 @@ def create_database(cur, con): "image_post character varying(10000000) 
null, dateadded_post timestamp(6) with time zone NOT NULL, " \ "dateinserted_post timestamp(6) with time zone NOT NULL, " \ "constraint posts_pk primary key (post_id), " \ - "constraint posts_user_id_fkey foreign key (user_id) references users (user_id), constraint " \ - "posts_topic_id_fkey foreign key (topic_id) references topics (topic_id))" + "constraint posts_user_id_fk foreign key (user_id) references users (user_id), " \ + "constraint posts_topic_id_fk foreign key (topic_id) references topics (topic_id))" cur.execute(sql) sql = "create unique index unique_post ON posts USING btree (topic_id ASC NULLS LAST, user_id ASC NULLS LAST, " \ @@ -559,9 +587,9 @@ def create_database(cur, con): "image_post character varying(10000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \ "dateinserted_post timestamp(6) with time zone NOT NULL, " \ "constraint posts_history_pk primary key (post_id, version_post), " \ - "constraint posts_history_user_id_fkey foreign key (user_id) references users (user_id), " \ - "constraint posts_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \ - "constraint posts_history_post_id_fkey foreign key (post_id) references posts (post_id))" + "constraint posts_history_user_id_fk foreign key (user_id) references users (user_id), " \ + "constraint posts_history_topic_id_fk foreign key (topic_id) references topics (topic_id), " \ + "constraint posts_history_post_id_fk foreign key (post_id) references posts (post_id))" cur.execute(sql) con.commit() diff --git a/Forums/Incogsnoo/crawler_selenium.py b/Forums/Incogsnoo/crawler_selenium.py new file mode 100644 index 0000000..fd8b92f --- /dev/null +++ b/Forums/Incogsnoo/crawler_selenium.py @@ -0,0 +1,289 @@ +__author__ = 'DarkWeb' + +''' +Incogsnoo Forum Crawler (Selenium) +''' + +from selenium import webdriver +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver.firefox.firefox_profile import FirefoxProfile +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from selenium.webdriver.firefox.service import Service +from selenium.webdriver.common.by import By + +import urllib.parse as urlparse +import os, time +from datetime import date +import subprocess +from bs4 import BeautifulSoup +from Forums.Initialization.prepare_parser import new_parse +from Forums.Incogsnoo.parser import incogsnoo_links_parser +from Forums.Utilities.utilities import cleanHTML + +counter = 1 +baseURL = 'http://tedditfyn6idalzso5wam5qd3kdtxoljjhbrbbx34q2xkcisvshuytad.onion/' + + +# Opens Tor Browser, crawls the website, then parses, then closes tor +#acts like the main method for the crawler, another function at the end of this code calls this function later +def startCrawling(): + forumName = getForumName() + driver = getAccess() + + if driver != 'down': + try: + crawlForum(driver) + except Exception as e: + print(driver.current_url, e) + closeDriver(driver) + + new_parse(forumName, baseURL, True) + + +# Returns the name of the website +#return: name of site in string type +def getForumName(): + name = 'Incogsnoo' + return name + + +# Return the base link of the website +#return: url of base site in string type +def getFixedURL(): + url = 'http://tedditfyn6idalzso5wam5qd3kdtxoljjhbrbbx34q2xkcisvshuytad.onion/' + return url + + +# Closes Tor Browser +#@param: current selenium driver +def closeDriver(driver): + # global pid + # os.system("taskkill /pid " + str(pro.pid)) + # os.system("taskkill /t /f /im tor.exe") + print('Closing Tor...') + driver.close() + 
time.sleep(3) + return + + +# Creates FireFox 'driver' and configure its 'Profile' +# to use Tor proxy and socket +def createFFDriver(): + from Forums.Initialization.forums_mining import config + + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) + + ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) + ff_prof.set_preference("places.history.enabled", False) + ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) + ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) + ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) + ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) + ff_prof.set_preference("signon.rememberSignons", False) + ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + ff_prof.set_preference("network.dns.disablePrefetch", True)#might need to turn off + ff_prof.set_preference("network.http.sendRefererHeader", 0) + ff_prof.set_preference("permissions.default.image", 3) + ff_prof.set_preference("browser.download.folderList", 2) + ff_prof.set_preference("browser.download.manager.showWhenStarting", False) + ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") + ff_prof.set_preference('network.proxy.type', 1) + ff_prof.set_preference("network.proxy.socks_version", 5) + ff_prof.set_preference('network.proxy.socks', '127.0.0.1') + ff_prof.set_preference('network.proxy.socks_port', 9150) + ff_prof.set_preference('network.proxy.socks_remote_dns', True) + ff_prof.set_preference("javascript.enabled", True) + ff_prof.update_preferences() + + service = Service(config.get('TOR', 'geckodriver_path')) + + driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + + driver.maximize_window() + + return driver + + +#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' +#return: return the selenium driver or string 'down' +def getAccess(): + url = getFixedURL() + driver = createFFDriver() + try: + driver.get(url) + return driver + except: + driver.close() + return 'down' + + +# Saves the crawled html page, makes the directory path for html pages if not made +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) + filePath = getFullPathName(url) + os.makedirs(os.path.dirname(filePath), exist_ok=True) + open(filePath, 'wb').write(cleanPage.encode('utf-8')) + return + + +# Gets the full path of the page to be saved along with its appropriate file name +#@param: raw url as crawler crawls through every site +def getFullPathName(url): + from Forums.Initialization.forums_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") + fileName = getNameFromURL(url) + if isDescriptionLink(url): + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') + else: + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') + return fullPath + + +# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned +#@param: raw url as crawler crawls through every site +def getNameFromURL(url): + global counter + name = ''.join(e for e in url if e.isalnum()) + if (name == ''): + name = str(counter) + counter = counter + 1 + return name + + +# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list +#in this example, there are a couple of categories some threads 
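# NOTE (editor): a minimal sketch, not part of the patch. getFullPathName above combines
# os.path.join with hard-coded r'\\Description\\' / r'\\Listing\\' fragments, which only
# yields the intended directory layout on Windows. A platform-neutral variant, assuming
# the same config / CURRENT_DATE globals and the helpers defined in this file; the name
# getFullPathName_portable is hypothetical.
def getFullPathName_portable(url):
    from Forums.Initialization.forums_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums", getForumName(), "HTML_Pages")
    subDir = 'Description' if isDescriptionLink(url) else 'Listing'
    return os.path.join(mainDir, CURRENT_DATE, subDir, getNameFromURL(url) + '.html')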
fall under such as +#exploits, malware, and hacking tutorials +def getInterestedLinks(): + links = [] + + # Malware + links.append('http://tedditfyn6idalzso5wam5qd3kdtxoljjhbrbbx34q2xkcisvshuytad.onion/r/Malware') + # + return links + + +# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through +#topic and description pages are crawled through here, where both types of pages are saved +#@param: selenium driver +def crawlForum(driver): + print("Crawling the Incogsnoo forum") + + # edge cases: + # 1. if a comment thread goes too deep, need to click "continue this thread" to show more replies + # 2. the site will sometimes rate limit you and not show the contents. + # right now, there is no detection mechanism and it won't throw any errors + + linksToCrawl = getInterestedLinks() + + i = 0 + while i < len(linksToCrawl): + link = linksToCrawl[i] + print('Crawling :', link) + try: + has_next_page = True + count = 0 + + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(driver, html, link) + + topics = topicPages(html) + for topic in topics: + has_next_topic_page = True + counter = 1 + page = topic + + while has_next_topic_page: + itemURL = urlparse.urljoin(baseURL, str(page)) + try: + driver.get(itemURL) + except: + driver.refresh() + + if isListingLink(driver.current_url): + break + + savePage(driver, driver.page_source, topic + f"page{counter}") # very important + + # comment out + if counter == 2: + break + + try: + # incogsnoo doesn't have next button to load more pages of the description + link_tag = driver.find_element(by=By.XPATH, value="/html/body/div[2]/div[last()]/a[contains(text(),'next')]") + link = link_tag.get_attribute("href") + + if link == "": + raise NoSuchElementException + counter += 1 + + except NoSuchElementException: + has_next_topic_page = False + + # making sure we go back to the listing page (browser back button simulation) + try: + driver.get(link) + except: + driver.refresh() + + # comment out + # break + + # comment out + if count == 1: + break + + try: + link_tag = driver.find_element(by=By.XPATH, value="/html/body/div[2]/div[last()]/a[contains(text(),'next')]") + link = link_tag.get_attribute("href") + + if link == "": + raise NoSuchElementException + count += 1 + + except NoSuchElementException: + has_next_page = False + + except Exception as e: + print(link, e) + i += 1 + + print("Crawling the Incogsnoo forum done.") + + +# Returns 'True' if the link is a description link +#@param: url of any url crawled +#return: true if is a description page, false if not +def isDescriptionLink(url): + if 'comments' in url: + return True + return False + + +# Returns True if the link is a listingPage link +#@param: url of any url crawled +#return: true if is a Listing page, false if not +def isListingLink(url): + if isDescriptionLink(url): + return False + return True + + +# calling the parser to define the links, the html is the url of a link from the list of interested link list +#@param: link from interested link list +#return: list of description links that should be crawled through +def topicPages(html): + soup = BeautifulSoup(html, "html.parser") + #print(soup.find('div', {"class": "forumbg"}).find('ul', {"class": "topiclist topics"}).find('li', {"class": "row bg1"}).find('a', {"class": "topictitle"}, href=True)) + return incogsnoo_links_parser(soup) + + +def crawler(): + startCrawling() + # print("Crawling and Parsing BestCardingWorld .... 
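# NOTE (editor): a minimal sketch, not part of the patch. The comment in crawlForum above
# notes that Incogsnoo sometimes rate-limits requests and that there is currently no
# detection mechanism. One simple option is to look for a marker in the served page and
# back off before retrying; the marker text "rate limited" is an assumption (hypothetical)
# and should be replaced with whatever the site actually shows when throttled.
def getWithBackoff(driver, url, retries=3, delay=30):
    for attempt in range(retries):
        try:
            driver.get(url)
        except:
            driver.refresh()
        if "rate limited" not in driver.page_source.lower():
            return True  # page served normally
        print("Possible rate limit on", url, "- waiting", delay, "seconds (attempt", attempt + 1, ")")
        time.sleep(delay)
    return False  # still throttled after all retries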
DONE!") \ No newline at end of file diff --git a/Forums/Incogsnoo/parser.py b/Forums/Incogsnoo/parser.py new file mode 100644 index 0000000..b24caa8 --- /dev/null +++ b/Forums/Incogsnoo/parser.py @@ -0,0 +1,271 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +from Forums.Utilities.utilities import * + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + + +# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) +#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs +#stores info it needs in different lists, these lists are returned after being organized +#@param: soup object looking at html page of description page +#return: 'row' that contains a variety of lists that each hold info on the description page +def incogsnoo_description_parser(soup): + + # Fields to be parsed + + topic = "-1" # 0 topic name ***$ + user = [] # 1 all users of each post ***$ author + status = [] # 2 all user's authority in each post such as (adm, member, dangerous) + reputation = [] # 3 all users's karma in each post (usually found as a number) ??? ups + interest = [] # 4 all user's interest in each post + sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) + post = [] # 6 all messages of each post + feedback = [] # 7 all feedbacks of each user (this was found in just one Forum and with a number format) + addDate = [] # 8 all dated of each post ***$ created + image_user = [] # 9 all user avatars of each post + image_post = [] # 10 all first images of each post + + # Finding the topic (should be just one coming from the Listing Page) + topic = soup.find("div", {"class": "title"}).find("h2").text + topic = topic.replace('"', '') + topic = cleanString(topic.strip()) + + # the first post's html is separated from all subsequent comments/replies/posts to the first post + # so parse the first post by itself first + + # Finding body of first post + post_text = soup.find("div", {"class": "md"}) + if post_text: + post_text = post_text.text.strip() + post.append(cleanString(post_text)) + else: # some posts just links to other sites/articles/videos and have no text by itself + post_link = soup.find("div", {"class": "title"}).find("a").get("href") + post_link = cleanLink(post_link) + post.append(post_link) + + # User + p_tag = soup.find("p", {"class": "submitted"}) + author = p_tag.find("a") + if author: + author = author.text.strip() + elif "[deleted]" in p_tag.text: + author = "deleted" + else: + author = "-1" + user.append(cleanString(author)) + + # Finding the status of the author + status.append("-1") + + # Finding the reputation of the user + reputation.append("-1") + + # Finding the interest of the author + interest.append("-1") + + # Finding signature + sign.append("-1") + + # Finding feedback + upvote = soup.find("div", {"class": "score"}).find("span") + if upvote: + upvote = upvote.text.strip() + else: + upvote = "-1" + feedback.append(cleanString(upvote)) + + # Finding the date of the post - e.g. "Fri, 18 December 2023 05:49:20 GMT" + dt = soup.find("p", {"class": "submitted"}).find("span")["title"] + # Convert to datetime object - e.g. 
2023-12-18 05:49:20 + date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z') + sdate = date_time_obj.strftime('%m %d %Y') + stime = date_time_obj.strftime('%I:%M %p') + + date = convertDate(sdate, "english", datetime.now()) + " " + stime + # e.g. "12/18/2023 05:49 AM" + addDate.append(date) + + image_user.append("-1") + image_post.append("-1") + + + + posts = soup.find("div", {"class": "comments"}).findAll("details") + + + # For each message (post), get all the fields we are interested to: + + for ipost in posts: + + # Finding user + p_tag = ipost.find("p", {"class": "author"}) + author = p_tag.find("a") + if author: + author = author.text.strip() + elif "[deleted]" in p_tag.text: + author = "deleted" + else: + author = "-1" + user.append(cleanString(author)) + + # Finding the status of the author + status.append("-1") + + # Finding the reputation of the user + reputation.append("-1") + + # Finding the interest of the author + interest.append("-1") + + # Finding signature + sign.append("-1") + + # Finding the post + comment = ipost.find("div", {"class": "md"}) + if comment: + comment = comment.text.strip() + else: + comment = "-1" + post.append(cleanString(comment)) + + # Finding feedback + upvote = ipost.find("p", {"class": "ups"}) + if upvote: + upvote = upvote.text.strip().split()[0] + else: + upvote = "-1" + feedback.append(cleanString(upvote)) + + # Finding the date of the post - e.g. "Fri, 18 December 2023 05:49:20 GMT" + dt = ipost.find("p", {"class": "created"})["title"] + # Convert to datetime object - e.g. 2023-12-18 05:49:20 + date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z') + sdate = date_time_obj.strftime('%m %d %Y') + stime = date_time_obj.strftime('%I:%M %p') + + date = convertDate(sdate, "english", datetime.now()) + " " + stime + # e.g. "12/18/2023 05:49 AM" + addDate.append(date) + + image_user.append("-1") + image_post.append("-1") + + + # Populate the final variable (this should be a list with all fields scraped) + + row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) + + # Sending the results + + return row + + + +# This is the method to parse the Listing Pages (one page with many posts) +#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs +#stores info it needs in different lists, these lists are returned after being organized +#@param: soup object looking at html page of listing page +#return: 'row' that contains a variety of lists that each hold info on the listing page +def incogsnoo_listing_parser(soup): + + nm = 0 # *this variable should receive the number of topics + forum = "Incogsnoo" # 0 *forum name + board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. + # For instance: Security/Malware/Tools to hack Facebook. 
The board here should be Malware) + author = [] # 2 *all authors of each topic + topic = [] # 3 *all topics + views = [] # 4 number of views of each topic + posts = [] # 5 number of posts of each topic + href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between + # Listing and Description pages) + addDate = [] # 7 when the topic was created (difficult to find) + image_author = [] # 8 all author avatars used in each topic + + # Finding the board (should be just one) + board = soup.find("a", {"class": "subreddit"}).find("h2") + board = cleanString(board.text.strip()) + + # Finding the repeated tag that corresponds to the listing of topics + itopics = soup.find("div", {"id": "links", "class": "sr"}).findAll("div", {"class": "link"}) + itopics.pop() + # Counting how many topics we have found so far + + nm = len(itopics) + + index = 0 + for itopic in itopics: + + # Finding the author of the topic + p_tag = itopic.find("p", {"class": "submitted"}) + user = p_tag.find("a") + if user: + user = user.text.strip() + elif "[deleted]" in p_tag.text: + user = "deleted" + else: + user = "-1" + author.append(cleanString(user)) + + # Adding the topic to the topic list + topic_title = itopic.find("div", {"class": "title"}).find("h2").text + topic.append(cleanString(topic_title)) + + # Finding the number of Views + views.append("-1") + + # Finding the number of posts + comments = itopic.find("a", {"class": "comments"}).text + number_comments = comments.split()[0] + posts.append(cleanString(number_comments)) + + # Adding the url to the list of urls + link = itopic.find("a", {"class": "comments"}).get("href") + link = cleanLink(link) + href.append(link) + + # Finding dates + p_tag = itopic.find("p", {"class": "submitted"}) + dt = p_tag.find("span")["title"] + date_time_obj = datetime.strptime(dt,'%a, %d %b %Y %H:%M:%S %Z') + sdate = date_time_obj.strftime('%m %d %Y') + stime = date_time_obj.strftime('%I:%M %p') + date = convertDate(sdate, "english", datetime.now()) + " " + stime + # e.g. 
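# NOTE (editor): a minimal sketch, not part of the patch. The same GMT-timestamp
# conversion (e.g. "Fri, 18 December 2023 05:49:20 GMT" -> "12/18/2023 05:49 AM") is
# repeated three times in this parser module (first post, each reply, and the listing
# rows); it could be factored into one helper. The name parse_teddit_date is
# hypothetical; convertDate comes from Forums.Utilities.utilities, as in the
# surrounding code.
def parse_teddit_date(dt):
    date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z')
    sdate = date_time_obj.strftime('%m %d %Y')
    stime = date_time_obj.strftime('%I:%M %p')
    return convertDate(sdate, "english", datetime.now()) + " " + stime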
"12/18/2023 05:49 AM" + addDate.append(date) + + image_author.append("-1") + + index += 1 + + return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author) + + +#called by the crawler to get description links on a listing page +#@param: beautifulsoup object that is using the correct html page (listing page) +#return: list of description links from a listing page +def incogsnoo_links_parser(soup): + + # Returning all links that should be visited by the Crawler + + href = [] + + listing_parent = soup.find("div", {"id": "links", "class": "sr"}) + listing = listing_parent.findAll("div", {"class": "entry"}) + + count = 0 + for entry in listing: + + parent_div = entry.find("div", {"class": "meta"}).find("div", {"class", "links"}) + a_tag = parent_div.find("a", {"class", "comments"}) + if a_tag: + href.append(a_tag.get("href")) + + # if count == 10: + # break + + count += 1 + + return href \ No newline at end of file diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index 86438c5..f6c0499 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -423,6 +423,16 @@ def new_parse(forum, url, createLog): # move listing files of completed folder move_file(listingFile, createLog, logFile) + # registering the current forum status (up/down) and the number of scraped pages in the database + forumId = verifyForum(cur, forum) + if (forumId > 0): + + readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html')) + readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html')) + + create_status(cur, forumId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0') + con.commit() + if createLog: logFile.close() diff --git a/MarketPlaces/Ares/crawler_selenium.py b/MarketPlaces/Ares/crawler_selenium.py index fbed2b1..1f865ad 100644 --- a/MarketPlaces/Ares/crawler_selenium.py +++ b/MarketPlaces/Ares/crawler_selenium.py @@ -1,7 +1,7 @@ __author__ = 'DarkWeb' ''' -Ares Market Crawler (Selenium) +Ares Marketplace Crawler (Selenium) ''' from selenium import webdriver @@ -9,27 +9,28 @@ from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait -from PIL import Image +from selenium.webdriver.support.ui import Select +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By +from PIL import Image import urllib.parse as urlparse -import os, time +import os, re, time from datetime import date import subprocess +import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.Ares.parser import ares_links_parser from MarketPlaces.Utilities.utilities import cleanHTML counter = 1 -baseURL = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion' +baseURL = 'http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/' -# Opens Tor Browser, crawls the website def startCrawling(): - marketName = getMarketName() + mktName = getMKTName() driver = getAccess() if driver != 'down': @@ -40,66 +41,18 @@ def 
startCrawling(): print(driver.current_url, e) closeDriver(driver) - new_parse(marketName, False) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - #wait for login page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[3]/div[3]/div[2]/div/div[2]/div/center"))) - - #entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') - #Username here - usernameBox.send_keys('blabri') - passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') - #Password here - passwordBox.send_keys('fishowal') - - ''' - # wait for captcha page show up - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/div/div/img"))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/div/div/img').screenshot( - r'..\Ares\captcha.png') - - # This method will show image in any image viewer - im = Image.open(r'..\Ares\captcha.png') - - im.show() - - # wait until input space show up - inputBox = driver.find_element(by=By.XPATH, value='/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/input') - - # ask user input captcha solution in terminal - userIn = input("Enter solution: ") - - # send user solution into the input space - inputBox.send_keys(userIn) - - # click the verify(submit) button - driver.find_element(by=By.XPATH, value="/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[4]/div/div/button").click() - ''' - input("Press ENTER when CAPTCHA is completed\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div[7]/div[3]/div[2]/div[1]/div[1]'))) + new_parse(mktName, baseURL, True) # Returns the name of the website -def getMarketName(): +def getMKTName(): name = 'Ares' return name -# Return the link of the website +# Return the base link of the website def getFixedURL(): - url = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion' - + url = 'http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/' return url @@ -109,7 +62,7 @@ def closeDriver(driver): # os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /t /f /im tor.exe") print('Closing Tor...') - driver.quit() + driver.close() time.sleep(3) return @@ -146,12 +99,13 @@ def createFFDriver(): service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - + driver.maximize_window() return driver +#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' def getAccess(): url = getFixedURL() driver = createFFDriver() @@ -163,7 +117,26 @@ def getAccess(): return 'down' -# Saves the crawled html page +def login(driver): + # input("Press ENTER when CAPTCHA is complete and login page has loaded\n") + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="username"]'))) + + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') + # Username here + usernameBox.send_keys('blabri') + passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') + # Password here + 
passwordBox.send_keys('fishowal') + + input("Press ENTER when BROKEN CIRCLE is pressed\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/div[6]/div[3]/div[2]/div[1]/div[1]'))) + + def savePage(driver, page, url): cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) @@ -172,7 +145,6 @@ def savePage(driver, page, url): return -# Gets the full path of the page to be saved along with its appropriate file name def getFullPathName(url): from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE @@ -185,7 +157,11 @@ def getFullPathName(url): return fullPath -# Creates the file name from passed URL +def getMKTName() -> str: + name = 'Ares' + return name + + def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) @@ -198,33 +174,26 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # # Digital - Other - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/91ecd5d0-002c-11ec-9b46-ede2378c5d3c') - # # Digital - VPN - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9431b830-002b-11ec-86d6-cdaf65cd97f1') - # # Digital - Coding - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/948b7400-a939-11ec-adc5-2f775203130c') # Digital - Malware - links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/95c37970-002c-11ec-a5dc-1f4432087ed2') - # # Digital - Guides - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9a8bea70-002b-11ec-a3db-c90dd329f662') - # # Digital - Hacking - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/a81693f0-002b-11ec-9c39-110550ce4921') - # # Digital - Malware - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/b3258c50-002b-11ec-b658-876d3d651145') - # # Digital - Services - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/bae64840-002b-11ec-bbcc-a93431540099') - # # Digital - Software - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/cff75df0-002b-11ec-8d0a-81fddeb36bf1') - # # Digital - Exploits - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/ef029550-002f-11ec-8711-675a8b116ba6') - # # Digital - Tutorials - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/f6e9c3b0-002b-11ec-85aa-c79a6ac8cfe8') + links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/95c37970-002c-11ec-a5dc-1f4432087ed2') + # Digital - Guides (Mostly carding, some useful hacking guides. 
probably dont use) + links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/9a8bea70-002b-11ec-a3db-c90dd329f662') + # Digital - Hacking + links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/a81693f0-002b-11ec-9c39-110550ce4921') + # Digital - Malware2 + links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/b3258c50-002b-11ec-b658-876d3d651145') + # Digital - Sofware (50/50 hacking stuff and cracked software) + links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/cff75df0-002b-11ec-8d0a-81fddeb36bf1') + # Digital - Exploits + links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/ef029550-002f-11ec-8711-675a8b116ba6') + # Digital - Tutorials (Mostly random stuff, some useful tutorials, probably dont use) + links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/f6e9c3b0-002b-11ec-85aa-c79a6ac8cfe8') return links def crawlForum(driver): + print("Crawling the Ares market") linksToCrawl = getInterestedLinks() @@ -244,8 +213,8 @@ def crawlForum(driver): driver.refresh() html = driver.page_source savePage(driver, html, link) - list = productPages(html) + for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) try: @@ -255,19 +224,15 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - nav = driver.find_element(by=By.XPATH, value= - '/html/body/div[7]/div[3]/div/div[2]/nav') - a = nav.find_element(by=By.LINK_TEXT, value="Next") - link = a.get_attribute('href') - + link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href') if link == "": raise NoSuchElementException count += 1 @@ -279,24 +244,23 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling Ares market done sucessfully. Press ENTER to continue\n") + print("Crawling the Ares market done.") -# Returns 'True' if the link is Topic link +# Returns 'True' if the link is Topic link, may need to change for every website def isDescriptionLink(url): if 'product' in url: return True return False -# Returns True if the link is a listingPage link +# Returns True if the link is a listingPage link, may need to change for every website def isListingLink(url): if 'category' in url: return True return False -# calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") return ares_links_parser(soup) @@ -304,4 +268,3 @@ def productPages(html): def crawler(): startCrawling() - # print("Crawling and Parsing BestCardingWorld .... 
DONE!") diff --git a/MarketPlaces/Ares/parser.py b/MarketPlaces/Ares/parser.py index 3232b0c..597a1eb 100644 --- a/MarketPlaces/Ares/parser.py +++ b/MarketPlaces/Ares/parser.py @@ -7,99 +7,86 @@ from MarketPlaces.Utilities.utilities import * from bs4 import BeautifulSoup -# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) +# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of description page +# return: 'row' that contains a variety of lists that each hold info on the description page def ares_description_parser(soup): - # Fields to be parsed - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name - name = soup.find('div', {'class': "col-md-12 my-2"}).text + divmb = soup.find('div', {'class': "col-md-12 my-2"}) + + name = divmb.find('span', {'class': "btn btn-sm btn-outline-dark w-100 active rounded-0"}).text name = name.replace('\n', ' ') name = name.replace(",", "") name = name.strip() - bae = soup.find('div', {'class': "col-md-7"}).find('span').find_all('span') + box = soup.find('div', {'class': "col-md-7"}).find('span') + box = box.findAll('span', {'class': "btn btn-mgray btn-sm w-100 active border-danger"}) # Finding Vendor - vendor = bae[0].text - vendor = vendor.replace(",", "") - vendor = vendor.replace("...", "") - vendor = vendor.strip() + vendor = soup.find('a', {'class': "btn btn-sm btn-mgray my-1 w-100 text-white"}).get('href') + vendor = vendor.split('otherParty=')[-1] + vendor = cleanString(vendor).strip() # Finding Vendor Rating - full_stars = bae[2].find_all('i', {'class': "fas fa-star"}) - half_star = bae[2].find('i', 
{'class': "fas fa-star-half-alt"}) - rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0) + temp = box[1] + rating_vendor = len(temp.findAll('i', {"class": "fas fa-star"})) + half_stars = len(temp.findAll('i', {'class': "fas fa-star-half-alt"})) + if half_stars > 0: + rating_vendor += 0.5 # Finding Successful Transactions - success = bae[4].text - success = success.replace("Sales ", "") - success = success.strip() + success = box[2].text + success = cleanNumbers(success).strip() - bae = soup.find('span', {'class': "text-left"}).find_all('span') + box2 = soup.find('div', {"class": "col-md-4 text-center"}).find('span', {"class": "text-left"}).findAll('span') - # Finding Prices - USD = bae[0].text - USD = USD.replace("\n$", "") + # Finding USD + USD = box2[0].text + USD = USD.replace('\n', '') + USD = USD.replace('$', '') USD = USD.strip() - shipping_info = bae[4].text - if "Digital" not in shipping_info: - shipping_info = shipping_info.split(" ") - - # Finding Shipment Information (Origin) - shipFrom = shipping_info[0].strip() - - # Finding Shipment Information (Destination) - shipTo = shipping_info[1].strip() - - bae = soup.find_all('textarea') + # Finding Vendor Image + vendor_image = soup.find('img', {"class": 'img-fluid'}).get('src') + vendor_image = vendor_image.split('base64,')[-1] # Finding the Product description - describe = bae[0].text - describe = describe.replace("\n", " ") - describe = describe.replace("\r", " ") - describe = describe.strip() - - # Finding the Terms and Conditions - terms = bae[1].text - terms = terms.replace("\n", " ") - terms = terms.strip() - - ''' - # Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1" - ''' + temp = soup.find('div', {"class": 'row-md-12'}).find('div', {"class": 'col-md-4'}) + cardbody = temp.find('textarea', {"class": 'disabled form-control form-control-sm w-100 bg-mgray text-white rounded-0 border-danger'}) + describe = cleanString(cardbody.text).strip() + + # Finding Product Image + image = soup.find('div', {"class": 'row-md-12'}).find('div', {"class": 'col-md-4 text-center'}).find('img') + if image is not None: + image = image.get('src') + image = image.split('base64,')[-1] + else: + image = "-1" # Searching for CVE and MS categories cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) @@ -121,69 +108,101 @@ def ares_description_parser(soup): # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo) + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row -# This is the method to parse the Listing Pages +# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of listing page +# return: 'row' that contains a variety of lists that each hold info on the listing page def ares_listing_parser(soup): # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "Ares" # 0 *Marketplace_Name - 
vendor = [] # 1 *Vendor - rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) - category = [] # 7 Product_Category - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft = [] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - href = [] # 20 Product_Links - - listing = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"}) + nm = 0 # *Total_Products (Should be Integer) + mktName = "Ares" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links + + cat = soup.find('span', {"class": "btn btn-sm btn-outline-dark w-100 active"}).text + cat = cleanString(cat).strip() + + listing = soup.find('div', {"class": 'card-body text-black text-left bg-dark'}).findAll('div', {"class": 'card mb-4 border-danger rounded-0'}) # Populating the Number of Products nm = len(listing) for a in listing: - bae = a.findAll('a', href=True) + + category.append(cat) # Adding the url to the list of urls - link = bae[0].get('href') + link = a.find('a', {'class': "badge badge-danger w-100 text-white"}).get('href') link = cleanLink(link) href.append(link) + # Finding the Product name + product = a.find('div', {"class": 'marquee-parent'}).find('div', {"class": "marquee-child"}).text + product = product.replace('\n', ' ') + product = product.replace(",", "") + product = product.replace("...", "") + product = product.strip() + name.append(product) + + # Finding Product Image + product_image = a.find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) + + # Finding Prices + price = a.findAll('a', {"class": "text-white"})[-1].text + price = price.replace("$","") + price = price.strip() + USD.append(price) + + # Finding Item Rating + temp = a.find('small', {"class": "text-white"}) + rating = len(temp.findAll('i', {"class": "fas fa-star"})) + half_stars = len(temp.findAll('i', {'class': "fas fa-star-half-alt"})) + if half_stars > 0: + rating += 0.5 + rating_item.append(str(rating)) + # Finding the Vendor - vendor_name = bae[1].text + vendor_name = a.find('a', {"class": 
'badge badge-dark w-100 text-white my-1'}).text vendor_name = vendor_name.replace(",", "") vendor_name = vendor_name.strip() vendor.append(vendor_name) - # Finding the Product - product = bae[2].find('img').get('alt') - product = product.replace('\n', ' ') - product = product.replace(",", "") - product = product.strip() - name.append(product) + image_vendor.append("-1") # Searching for CVE and MS categories cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if not cve: - cveValue="-1" + cveValue = "-1" else: cee = " " for idx in cve: @@ -191,12 +210,12 @@ def ares_listing_parser(soup): cee += " " cee = cee.replace(',', ' ') cee = cee.replace('\n', '') - cveValue=cee + cveValue = cee CVE.append(cveValue) - + ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) if not ms: - MSValue="-1" + MSValue = "-1" else: me = " " for im in ms: @@ -204,24 +223,28 @@ def ares_listing_parser(soup): me += " " me = me.replace(',', ' ') me = me.replace('\n', '') - MSValue=me + MSValue = me MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) +# called by the crawler to get description links on a listing page +# @param: beautifulsoup object that is using the correct html page (listing page) +# return: list of description links from a listing page def ares_links_parser(soup): - # Returning all links that should be visited by the Crawler - href = [] - listing = soup.findAll('a', {"class": "btn btn-success w-100 my-1"}) + href = [] + listing = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"}) for a in listing: + bae = a.findAll('a', href=True) - link = a['href'] + # Adding the url to the list of urls + link = bae[0].get('href') href.append(link) return href \ No newline at end of file diff --git a/MarketPlaces/BlackPyramid/crawler_selenium.py b/MarketPlaces/BlackPyramid/crawler_selenium.py index b257c40..9efe7bc 100644 --- a/MarketPlaces/BlackPyramid/crawler_selenium.py +++ b/MarketPlaces/BlackPyramid/crawler_selenium.py @@ -1,9 +1,7 @@ -__author__ = 'Helium' +__author__ = 'cern' ''' -BlackPyramid Forum Crawler (Selenium) -cannot use bc no links are used -kept in case issues are solved +BlackPyramid Market Crawler (Selenium) ''' from selenium import webdriver @@ -11,29 +9,32 @@ from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By - +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver import ActionChains +import selenium.webdriver.support.ui as uiClasses +from selenium.webdriver.common.keys import Keys from PIL import Image + import urllib.parse as urlparse import os, re, time -from datetime import date import subprocess import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.BlackPyramid.parser import blackpyramid_links_parser 
+from MarketPlaces.BlackPyramid.parser import BlackPyramid_links_parser from MarketPlaces.Utilities.utilities import cleanHTML +import traceback + counter = 1 -baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/' +baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/' -# Opens Tor Browser, crawls the website, then parses, then closes tor -#acts like the main method for the crawler, another function at the end of this code calls this function later +# Opens Tor Browser, crawls the website def startCrawling(): - mktName = getMKTName() + marketName = getMKTName() driver = getAccess() if driver != 'down': @@ -42,28 +43,47 @@ def startCrawling(): crawlForum(driver) except Exception as e: print(driver.current_url, e) - closeDriver(driver) + closetor(driver) + + new_parse(marketName, baseURL, True) - new_parse(mktName, baseURL, True) + +# Login +def login(driver): + # wait for login page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "//input[@name='username_login']"))) + + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value="//input[@name='username_login']") + # Username here + usernameBox.send_keys('ChipotleSteakBurrito') + passwordBox = driver.find_element(by=By.XPATH, value="//input[@name='password_login']") + # Password here + passwordBox.send_keys('BlackBeans') + + input("Press ENTER when CAPTCHA is completed and you closed the newsletter\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="form93b"]'))) # Returns the name of the website -#return: name of site in string type def getMKTName(): name = 'BlackPyramid' return name -# Return the base link of the website -#return: url of base site in string type +# Return the link of the website def getFixedURL(): - url = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/' + url = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/?login=1' + return url # Closes Tor Browser -#@param: current selenium driver -def closeDriver(driver): +def closetor(driver): # global pid # os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /t /f /im tor.exe") @@ -88,8 +108,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -111,8 +131,6 @@ def createFFDriver(): return driver -#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' -#return: return the selenium driver or string 'down' def getAccess(): url = getFixedURL() driver = createFFDriver() @@ -124,31 +142,7 @@ def getAccess(): return 'down' -# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha -# then allows for manual solving of captcha in the terminal -#@param: 
current selenium web driver -def login(driver): - # wait for login page - login_link = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[3]/div/main/div/div/div/div[2]/div/div/div/section[1]/input[1]') - login_link.click() # open tab with url - - # entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') - # Username here - usernameBox.send_keys('ChipotleSteakBurrito') - passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') - # Password here - passwordBox.send_keys('BlackBeans') - - input("Press ENTER when CAPTCHA is completed\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div[2]/form/nav/nav/ul/li[2]/div/a/span[1]'))) - - - -# Saves the crawled html page, makes the directory path for html pages if not made +# Saves the crawled html page def savePage(driver, page, url): cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) @@ -158,7 +152,6 @@ def savePage(driver, page, url): # Gets the full path of the page to be saved along with its appropriate file name -#@param: raw url as crawler crawls through every site def getFullPathName(url): from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE @@ -171,87 +164,134 @@ def getFullPathName(url): return fullPath -# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned -#@param: raw url as crawler crawls through every site +# Creates the file name from passed URL def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) - if (name == ''): + if name == '': name = str(counter) counter = counter + 1 return name -# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list -#in this example, there are a couple of categories some threads fall under such as -# Guides and Tutorials, Digital Products, and Software and Malware -#as you can see they are categories of products +def page_is_fully_loaded(driver): + return driver.execute_script("return document.readyState") == "complete" + + +def goToPage(driver, page): + # hover over digital -> hacking tools + a = ActionChains(driver) + + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "//li[@class='dig940']/div/a"))) + + # hover + digitalB = driver.find_element(By.XPATH, "//li[@class='dig940']/div/a") + time.sleep(1) + a.move_to_element(digitalB).perform() + # print(digitalB) + + # delay for website to register hover + time.sleep(5) + + # click + xpath = "//input[@name='" + page + "']" + link = driver.find_element(By.XPATH, xpath) + time.sleep(1) + a.move_to_element(link).click().perform() + # print(link) + + # wait for website to load + time.sleep(10) + WebDriverWait(driver, 100).until(page_is_fully_loaded) + + def getInterestedLinks(): links = [] - # Hacking Guides - links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # Exploits - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # botnets/malware - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # fraud software - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # Other Tools - # 
links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # Services - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') + # h11 -> Hacking Tools + links.append('h11') + # g3 -> Guides, Hacking + links.append('g3') + # se3 -> Services + links.append('se11') + # f6 -> Fraud + links.append('f11') return links -# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through -#topic and description pages are crawled through here, where both types of pages are saved -#@param: selenium driver def crawlForum(driver): + print("Crawling the BlackPyramid market") - linksToCrawl = getInterestedLinks() + pages = getInterestedLinks() i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) + for listing in pages: + print('Crawling :', listing) try: + driver.get(baseURL) + goToPage(driver, listing) + has_next_page = True count = 0 + currentPage = 1 while has_next_page: - try: - clicker = driver.find_element(by=By.XPATH, value='/html/body/div[2]/form/nav/nav/ul/li[2]/div/a') - clicker.click() # open tab with url - driver.get(link) - except: - driver.refresh() + html = driver.page_source - savePage(driver, html, link) + savePage(driver, html, listing + "page" + str(currentPage)) + # get a list of urls for each listing list = productPages(html) + for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) try: driver.get(itemURL) except: - driver.refresh() + # driver.refresh() + continue savePage(driver, driver.page_source, item) - driver.back() - - # comment out - break + # can't use the back button in dark pyramid + # driver.back() - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break + # go to next page of market try: - clicker = driver.find_element(by=By.XPATH, value= - '/html/body/center/div[4]/div/div[3]/div[23]/div[2]/input[1]') - if clicker == "": + # Scroll to top of page to see navigation bar + driver.find_element(by=By.XPATH, value="//body").send_keys(Keys.CONTROL + Keys.HOME) + + goToPage(driver, listing) + nav = driver.find_element(by=By.XPATH, value="//input[@name='next_page']") + + if nav.is_enabled(): + # select next page + pgnum = uiClasses.Select(driver.find_element(by=By.XPATH, value="//select[@name='pageination']")) + # print("pg options:", pgnum.options) + numberOfPages = len(pgnum.options) + + if currentPage >= numberOfPages: + raise NoSuchElementException + + pgnum.select_by_index(currentPage) + currentPage += 1 + + # click button + pgbutton = driver.find_element(by=By.XPATH, value="//input[@value='go to page']") + pgbutton.click() + + # wait for website to load + time.sleep(10) + WebDriverWait(driver, 100).until(page_is_fully_loaded) + else: raise NoSuchElementException count += 1 @@ -259,39 +299,32 @@ def crawlForum(driver): has_next_page = False except Exception as e: - print(link, e) + print(listing, e) i += 1 print("Crawling the BlackPyramid market done.") -# Returns 'True' if the link is a description link -#@param: url of any url crawled -#return: true if is a description page, false if not +# Returns 'True' if the link is Topic link def isDescriptionLink(url): - if 'products' in url: + if 'product' in url: return True return False # Returns True if the link is a listingPage link -#@param: url of any url crawled -#return: true if is a Listing page, false if not def isListingLink(url): - if 'search' in url: + if 'category=' in 
url: return True return False -# calling the parser to define the links, the html is the url of a link from the list of interested link list -#@param: link from interested link list ie. getInterestingLinks() -#return: list of description links that should be crawled through +# calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") - return blackpyramid_links_parser(soup) - + return BlackPyramid_links_parser(soup) def crawler(): startCrawling() - # print("Crawling and Parsing BlackPyramid .... DONE!") + # print("Crawling and Parsing BestCardingWorld .... DONE!") diff --git a/MarketPlaces/BlackPyramid/parser.py b/MarketPlaces/BlackPyramid/parser.py index 743466a..3980fc4 100644 --- a/MarketPlaces/BlackPyramid/parser.py +++ b/MarketPlaces/BlackPyramid/parser.py @@ -1,4 +1,4 @@ -__author__ = 'Helium' +__author__ = 'cern' # Here, we are importing the auxiliary functions to clean or convert data from MarketPlaces.Utilities.utilities import * @@ -11,111 +11,113 @@ from bs4 import BeautifulSoup #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of description page #return: 'row' that contains a variety of lists that each hold info on the description page -def darkfox_description_parser(soup): +def blackpyramid_description_parser(soup): # Fields to be parsed - name = "-1" # 0 Product_Name - describe = "-1" # 1 Product_Description - lastSeen = "-1" # 2 Product_LastViewDate - rules = "-1" # 3 NOT USED ... - CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 5 Product_MS_Classification (Microsoft Security) - review = "-1" # 6 Product_Number_Of_Reviews + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) category = "-1" # 7 Product_Category - shipFrom = "-1" # 8 Product_ShippedFrom - shipTo = "-1" # 9 Product_ShippedTo - left = "-1" # 10 Product_QuantityLeft - escrow = "-1" # 11 Vendor_Warranty - terms = "-1" # 12 Vendor_TermsAndConditions - vendor = "-1" # 13 Vendor_Name - sold = "-1" # 14 Product_QuantitySold - addDate = "-1" # 15 Product_AddedDate - available = "-1" # 16 NOT USED ... - endDate = "-1" # 17 NOT USED ... 
- BTC = "-1" # 18 Product_BTC_SellingPrice - USD = "-1" # 19 Product_USD_SellingPrice - rating = "-1" # 20 Vendor_Rating - success = "-1" # 21 Vendor_Successful_Transactions - EURO = "-1" # 22 Product_EURO_SellingPrice + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name - name = soup.find('h1').text + name = soup.find('div', {'class': 'panel39002'}).find('span').next_sibling name = name.replace('\n', ' ') name = name.replace(",", "") name = name.strip() - # Finding Vendor - vendor = soup.find('h3').find('a').text.strip() + # Finding Product Rating + rating_span = soup.find('span', {'class': 'to3098503t'}).find_next_sibling('span') + rating_num = rating_span.find('b').text + if rating_num != 'N/A': + rating_item = rating_num[0:3] + + # product description + describe = soup.findAll('div', {'class': 'fer048953'})[1].text + describe = describe.replace('\n', ' ') + describe = describe.replace(",", "") + describe = describe.strip() - # Finding Vendor Rating - rating = soup.find('span', {'class': "tag is-dark"}).text.strip() + # Finding Vendor + vendor = soup.find('div', {'class': 'bold03905 vstat364'}).text + vendor = vendor.split(" ") + vendor = vendor[2][:-1] + vendor = vendor.replace('\n', ' ') + vendor = vendor.replace(",", "") + vendor = vendor.strip() + + # Finding Product Rating + rating_div = soup.find('div', {'class': 'bold03905 vstat364'}).find_next_sibling('div').find_next_sibling('div') + rating_vendor = cleanNumbers(rating_div.text) + if rating_vendor == "": + rating_vendor = "-1" # Finding Successful Transactions - success = soup.find('h3').text - success = success.replace("Vendor: ", "") - success = success.replace(vendor, "") - success = success.replace("(", "") - success = success.replace(")", "") + success_container = soup.find('ul', {'class': 'ul3o00953'}).findAll('li')[1] + success = success_container.find('div').text + success = success.replace('"', '') + success = success.replace("\n", " ") + success = success.replace(",", "") success = success.strip() - bae = soup.find('div', {'class': "box"}).find_all('ul') - # Finding Prices - USD = bae[1].find('strong').text.strip() - - li = bae[2].find_all('li') + USD_text = soup.find('li', {'class': 'vul2994 vghul995'}).find('div').text + USD = USD_text.split(',')[1] + USD = USD.replace('\n', ' ') + USD = USD.replace(",", "") + USD = USD.strip() - # Finding Escrow - escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip() - - # Finding the Product Category - category = li[1].find('span', {'class': "tag is-dark"}).text.strip() - - # Finding the Product Quantity Available - left = li[3].find('span', {'class': "tag is-dark"}).text.strip() + container = soup.find('ul', {'class': 'bic03095'}) # Finding Number Sold - sold = li[4].find('span', {'class': "tag is-dark"}).text.strip() - - li = bae[3].find_all('li') - - # Finding Shipment Information (Origin) - if "Ships from:" in li[-2].text: - shipFrom = li[-2].text - shipFrom = shipFrom.replace("Ships from: ", "") - # shipFrom = shipFrom.replace(",", "") - shipFrom = shipFrom.strip() 
- - # Finding Shipment Information (Destination) - shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text - shipTo = shipTo.replace("Ships to: ", "") - shipTo = shipTo.strip() - if "certain countries" in shipTo: - countries = "" - tags = li[-1].find_all('span', {'class': "tag"}) - for tag in tags: - country = tag.text.strip() - countries += country + ", " - shipTo = countries.strip(", ") - - # Finding the Product description - describe = soup.find('div', {'class': "pre-line"}).text - describe = describe.replace("\n", " ") - describe = describe.strip() + sold_container = container.find('li') + sold_div = sold_container.findAll('div')[2] + sold = sold_div.find('b').next_sibling + sold = sold.replace('"', '') + sold = sold.replace("\n", " ") + sold = sold.replace(",", "") + sold = sold.strip() - '''# Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1"''' + # Finding the Product Quantity Available + left_container = container.find('li') + left_div = left_container.findAll('div')[3] + left = left_div.find('b').next_sibling + left = left.replace('"', '') + left = left.replace("\n", " ") + left = left.replace(",", "") + left = left.strip() + + # Finding number of reviews + positive = soup.find('span', {'class': 'ar04999324'}).text + neutral = soup.find('span', {'class': 'ti9400005 can39953'}).text + negative = soup.find('span', {'class': 'ti9400005 ti90088 can39953'}).text + reviews = int(positive) + int(neutral) + int(negative) + + # Finding product image + image = soup.find('img', {'class': 'img0390503'}) + image = image.get('src') + image = image.split('base64,')[-1] + + vendor_image = soup.find('img', {'class': 'img0390503'}) + vendor_image = vendor_image.get('src') + vendor_image = vendor_image.split('base64,')[-1] # Searching for CVE and MS categories cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) @@ -136,8 +138,8 @@ def darkfox_description_parser(soup): MS = MS.replace('\n', '') # Populating the final variable (this should be a list with all fields scraped) - row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor, - sold, addDate, available, endDate, BTC, USD, rating, success, EURO) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row @@ -147,102 +149,109 @@ def darkfox_description_parser(soup): #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of listing page #return: 'row' that contains a variety of lists that each hold info on the listing page -def darkfox_listing_parser(soup): +def blackpyramid_listing_parser(soup): # Fields to be parsed - nm = 0 # Total_Products (Should be Integer) - mktName = "DarkFox" # 0 Marketplace_Name - name = [] # 1 Product_Name - CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 3 Product_MS_Classification (Microsoft Security) - category = [] # 4 Product_Category - describe = [] # 5 Product_Description - escrow = [] # 6 Vendor_Warranty - views = [] # 7 Product_Number_Of_Views - reviews = 
[] # 8 Product_Number_Of_Reviews - addDate = [] # 9 Product_AddDate - lastSeen = [] # 10 Product_LastViewDate - BTC = [] # 11 Product_BTC_SellingPrice - USD = [] # 12 Product_USD_SellingPrice - EURO = [] # 13 Product_EURO_SellingPrice - sold = [] # 14 Product_QuantitySold - qLeft =[] # 15 Product_QuantityLeft - shipFrom = [] # 16 Product_ShippedFrom - shipTo = [] # 17 Product_ShippedTo - vendor = [] # 18 Vendor - rating = [] # 19 Vendor_Rating - success = [] # 20 Vendor_Successful_Transactions - href = [] # 23 Product_Links (Urls) - - listing = soup.findAll('div', {"class": "card"}) + nm = 0 # *Total_Products (Should be Integer) + mktName = "BlackPyramid" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links + + listing = soup.findAll('article', {"class": "product"}) + + # Some listing pages have an additional article section which is blank + if not listing[-1].findAll('a', href=True): + listing = listing[:-1] + # Populating the Number of Products nm = len(listing) - for a in listing: - bae = a.findAll('a', href=True) + for card in listing: + bae = card.findAll('a', href=True) # Adding the url to the list of urls - link = bae[0].get('href') - link = cleanLink(link) + link = bae[2].get('href') href.append(link) # Finding the Product - product = bae[1].find('p').text + product = bae[3].text product = product.replace('\n', ' ') product = product.replace(",", "") product = product.replace("...", "") product = product.strip() name.append(product) - bae = a.find('div', {'class': "media-content"}).find('div').find_all('div') - - if len(bae) >= 5: - # Finding Prices - price = bae[0].text - ud = price.replace(" USD", " ") - # u = ud.replace("$","") - u = ud.replace(",", "") - u = u.strip() - USD.append(u) - # bc = (prc[1]).strip(' BTC') - # BTC.append(bc) - - # Finding the Vendor - vendor_name = bae[1].find('a').text - vendor_name = vendor_name.replace(",", "") - vendor_name = vendor_name.strip() - vendor.append(vendor_name) - - # Finding the Category - cat = bae[2].find('small').text - cat = cat.replace("Category: ", "") - cat = cat.replace(",", "") - cat = cat.strip() - category.append(cat) - - # Finding Number Sold and Quantity Left - num = bae[3].text - num = num.replace("Sold: ", "") - num = num.strip() - sold.append(num) - - quant = bae[4].find('small').text - quant = quant.replace("In stock: ", "") - quant = quant.strip() - qLeft.append(quant) - - # Finding Successful Transactions - freq = bae[1].text - freq = freq.replace(vendor_name, "") - freq = re.sub(r'Vendor Level \d+', "", freq) - freq = freq.replace("(", "") - freq = freq.replace(")", "") - freq = freq.strip() - 
success.append(freq) + # # Finding description + # # 'recurisve = False' only searches direct children + # desc = card.findChildren('div', recursive=False)[0] + # desc = desc.findAll('div', recursive=False)[3].text + # desc = desc.replace('\n', ' ') + # desc = desc.replace(",", "") + # desc = desc.strip() + # describe.append(desc) + + # Finding Vendor Name + vendor_name = bae[4].find('span').text + vendor_name = vendor_name.split(' ')[1] + vendor_name = vendor_name.replace('\n', ' ') + vendor_name = vendor_name.replace(",", "") + vendor_name = vendor_name.strip() + vendor.append(vendor_name) + + # Finding the Category + cat = card.findAll('div', recursive=False)[0].findAll('div', recursive=False)[1].find('span').text + cat = cat.replace("\n", "") + cat = cat.replace(",", "") + cat = cat.strip() + category.append(cat) + + bae = card.findAll('div', recursive=False)[1].findAll('div', recursive=False)[1] + + # Finding amount left + left = bae.findAll('div', recursive=False)[1].text + left = left.replace("x", "") + left = left.replace('\n', ' ') + left = left.replace(",", "") + left = left.strip() + qLeft.append(left) + + # Finding amount sold + qsold = bae.findAll('div', recursive=False)[2].text + qsold = qsold.replace('\n', ' ') + qsold = qsold.replace("x", "") + qsold = qsold.replace(",", "") + qsold = qsold.strip() + sold.append(qsold) + + # Finding product image + product_image = card.find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) # Searching for CVE and MS categories - cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) + cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if not cve: cveValue="-1" else: @@ -255,7 +264,7 @@ def darkfox_listing_parser(soup): cveValue=cee CVE.append(cveValue) - ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) + ms = card.findAll(text=re.compile('MS\d{2}-\d{3}')) if not ms: MSValue="-1" else: @@ -269,23 +278,24 @@ def darkfox_listing_parser(soup): MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen, - BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href) + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, + image_vendor) #called by the crawler to get description links on a listing page #@param: beautifulsoup object that is using the correct html page (listing page) #return: list of description links from a listing page -def blackpyramid_links_parser(soup): +def BlackPyramid_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] listing = soup.findAll('article', {"class": "product"}) - for div in listing: + for item in listing: - link = div.find('a', {"class": "ah39063"})['href'] + link = item.find('a', {"class": "ah39063"})['href'] href.append(link) - return href \ No newline at end of file + return href diff --git a/MarketPlaces/CityMarket/crawler_selenium.py b/MarketPlaces/CityMarket/crawler_selenium.py index 704b840..a00179e 100644 --- a/MarketPlaces/CityMarket/crawler_selenium.py +++ b/MarketPlaces/CityMarket/crawler_selenium.py @@ -131,10 +131,10 @@ def login(driver): input("Press ENTER when CAPTCHA is complete and login page has loaded\n") # entering username and password into input boxes - 
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') + usernameBox = driver.find_element(by=By.XPATH, value='//input[@id="username"]') # Username here usernameBox.send_keys('findingmykeys') - passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="inputPassword3"]') + passwordBox = driver.find_element(by=By.XPATH, value='//input[@id="inputPassword3"]') # Password here passwordBox.send_keys('ican’tFindMycarKey$') @@ -144,6 +144,7 @@ def login(driver): WebDriverWait(driver, 100).until(EC.visibility_of_element_located( (By.XPATH, '//*[@id="collapse3"]'))) + # Saves the crawled html page, makes the directory path for html pages if not made def savePage(driver, page, url): cleanPage = cleanHTML(driver, page) @@ -185,17 +186,19 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # # Hiring hacker + # # Hire hacker # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=3') - # virus and malware - # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=15') - # # ddos - # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=16') - # # software - # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=17') - # # botnets - # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=18') - # # hacking service + # # other + # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=14') + # malware + links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=15') + # ddos + links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=16') + # software + links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=17') + # botnet + links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=18') + # hacking service links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=31') return links @@ -223,10 +226,11 @@ def crawlForum(driver): except: driver.refresh() html = driver.page_source - savePage(driver, html, link) + savePage(driver, html, linksToCrawl[i] + f"page{count+1}") list = productPages(html) for item in list: + # what is this line doing? 
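                    # urlparse.urljoin resolves the scraped href against baseURL: a relative link
                    # (for instance "viewProduct?id=123", a made-up example) becomes a full onion URL
                    # that driver.get() can open, while an href that is already absolute is returned
                    # unchanged, so both cases are handled by the same call.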
itemURL = urlparse.urljoin(baseURL, str(item)) try: driver.get(itemURL) @@ -235,16 +239,15 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - link = driver.find_element(by=By.XPATH, value= - '/html/body/div[1]/div/div[2]/nav/ul/li[5]/a').get_attribute('href') + link = driver.find_element(by=By.XPATH, value='//a[@rel="next"]').get_attribute('href') if link == "": raise NoSuchElementException count += 1 diff --git a/MarketPlaces/CityMarket/parser.py b/MarketPlaces/CityMarket/parser.py index 75ca4fa..5679b95 100644 --- a/MarketPlaces/CityMarket/parser.py +++ b/MarketPlaces/CityMarket/parser.py @@ -38,60 +38,25 @@ def city_description_parser(soup): vendor_image = "-1" # 20 Vendor_Image divmd7 = soup.find('div', {'class': "col-md-7"}) - ptag = soup.findAll('p') - - # Finding Product Name - # NA # Finding Vendor vendor = divmd7.find('a').text.strip() - # Finding Vendor Rating - # NA - - # Finding Successful Transactions - success = soup.find('span', {'class': "badge-primary"}) - # Finding Prices USD = soup.find('span', {'class': "total"}).text.strip() - BTC = soup.find('div', {'class': "text-center"}).text.strip() - - # Finding Escrow - escrow = ptag[-1].text.strip() - - # Finding the Product Category - category = ptag[-2].text.strip() + tempBTC = soup.find('div', {'class': "text-center"}).text.strip() + BTC = tempBTC.replace("BTC", "").strip() - # Finding the Product Quantity Available - # NA - - # Finding Number Sold - # NA - - # Finding Shipment Information (Origin) - # NA - - # Finding Shipment Information (Destination) - # NA + # Finding Product Image + image = soup.find('img', {'class': 'img-fluid'}) + image = image.get('src') + image = image.split('base64,')[-1] # Finding the Product description describe = soup.find('div', {'class': "text-white"}).text describe = cleanString(describe.strip()) - '''# Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1"''' - # Searching for CVE and MS categories cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if cve: @@ -150,7 +115,7 @@ def city_listing_parser(soup): image_vendor = [] # 21 Vendor_Image href = [] # 22 Product_Links - listing = soup.findAll('div', {"class": "card"}) + listing = soup.findAll('div', {"class": "p-4"}) # Populating the Number of Products nm = len(listing) @@ -160,10 +125,14 @@ def city_listing_parser(soup): # Adding the url to the list of urls link = bae[0].get('href') - link = cleanLink(link) href.append(link) - # Finding the Product + # Category + tempCategory = soup.find('select', {"name": "category"}) + tempCategory = tempCategory.find('option', selected=True).text.strip() + category.append(tempCategory) + + # Product Name product = a.find('h4', {"class": "text-center"}).text product = product.replace('\n', ' ') product = product.replace(",", "") @@ -171,31 +140,27 @@ def city_listing_parser(soup): product = product.strip() name.append(product) - bae = a.find('div', {'class': "media-content"}).find('div').find_all('div') - - # Finding Prices + # USD and BTC Price price = a.find('div', {"class": "price"}).text - tempUSD = 
price.split("~")[0] - tempUSD = tempUSD.replace("$", "") - tempUSD = tempUSD.strip() - USD.append(tempUSD) - - tempBTC = price.split("~")[1] - tempBTC = tempBTC.replace("BTC", "") - tempBTC = tempBTC.strip() - BTC.append(tempBTC) - - # Finding the Vendor - # NA - - # Finding the Category - # NA - - # Finding Number Sold and Quantity Left - # NA - - # Finding Successful Transactions - # NA + if "~" in price: + tempUSD = price.split("~")[0] + tempUSD = tempUSD.replace("$", "") + tempUSD = tempUSD.strip() + USD.append(tempUSD) + + tempBTC = price.split("~")[1] + tempBTC = tempBTC.replace("BTC", "") + tempBTC = tempBTC.strip() + BTC.append(tempBTC) + else: + USD.append("-1") + BTC.append("-1") + + # Img + product_image = a.find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) # Searching for CVE and MS categories cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) diff --git a/MarketPlaces/CypherMarketplace/crawler_selenium.py b/MarketPlaces/CypherMarketplace/crawler_selenium.py index 6eff758..9eb7d43 100644 --- a/MarketPlaces/CypherMarketplace/crawler_selenium.py +++ b/MarketPlaces/CypherMarketplace/crawler_selenium.py @@ -11,6 +11,7 @@ from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import Service from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support.ui import Select from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By @@ -87,8 +88,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -127,15 +128,13 @@ def getAccess(): # then allows for manual solving of captcha in the terminal #@param: current selenium web driver def login(driver): - # wait for page to show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div[1]/div/div[1]/div[1]/ul"))) + input("Press ENTER when CAPTCHA is completed\n") # entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input') + usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="username"]') # Username here usernameBox.send_keys('beachyoga278') # sends string to the username box - passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input') + passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="password"]') # Password here passwordBox.send_keys('sunfish278') # sends string to passwordBox @@ -143,7 +142,8 @@ def login(driver): # wait for listing page show up (This Xpath may need to change based on different seed url) WebDriverWait(driver, 
100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[2]/div/div/div[1]/div/div/div[1]/div[2]/ul/li[8]/a"))) + (By.XPATH, '//input[@name="search"]'))) + # Saves the crawled html page, makes the directory path for html pages if not made def savePage(driver, page, url): @@ -186,10 +186,20 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # software + # Hacking + links.append('http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion/category/35a35d10-3cfb-11ea-9b14-65b8930c1372') + # Carding + links.append('http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion/category/3d335a10-3cfb-11ea-9224-fdf701883e72') + # Software links.append('http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion/category/040ca140-3cfc-11ea-9364-87edd8c0a63f') - # # guides - # links.append('http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion/category/35a35d10-3cfb-11ea-9b14-65b8930c1372') + # Exploit Kits + links.append('http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion/category/da5e6480-3cfb-11ea-b85f-9b6c3bb4c534') + # Botnets + links.append('http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion/category/e3d58e10-3cfb-11ea-a343-ebe7c6036eb1') + # Rats & Trojans + links.append('http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion/category/edf6e020-3cfb-11ea-9438-f7719eecbffe') + # Digital Goods + links.append('http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion/category/fbf0b5b0-3cfb-11ea-827f-0dc5e9988952') return links @@ -228,17 +238,17 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - temp = driver.find_element(by=By.XPATH, value= - '/html/body/div[2]/div/div/div[2]/div/nav/ul') - link = temp.find_element(by=By.TAG_NAME, value='page-link').get_attribute('href') + # temp = driver.find_element(by=By.XPATH, value= + # '/html/body/div[2]/div/div/div[2]/div/nav/ul') + link = driver.find_element(by=By.XPATH, value='//a[rel="next"]').get_attribute('href') if link == "": raise NoSuchElementException count += 1 @@ -290,4 +300,4 @@ def productPages(html): def crawler(): startCrawling() - # print("Crawling and Parsing BestCardingWorld .... DONE!") + # print("Crawling and Parsing CypherMarketplace .... 
DONE!") diff --git a/MarketPlaces/CypherMarketplace/parser.py b/MarketPlaces/CypherMarketplace/parser.py index 6ac14d6..4f5fd53 100644 --- a/MarketPlaces/CypherMarketplace/parser.py +++ b/MarketPlaces/CypherMarketplace/parser.py @@ -11,7 +11,7 @@ from bs4 import BeautifulSoup #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of description page #return: 'row' that contains a variety of lists that each hold info on the description page -def darkfox_description_parser(soup): +def cyphermarketplace_description_parser(soup): # Fields to be parsed @@ -147,11 +147,11 @@ def darkfox_description_parser(soup): #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of listing page #return: 'row' that contains a variety of lists that each hold info on the listing page -def darkfox_listing_parser(soup): +def cyphermarketplace_listing_parser(soup): # Fields to be parsed nm = 0 # Total_Products (Should be Integer) - mktName = "DarkFox" # 0 Marketplace_Name + mktName = "CypherMarketplace" # 0 Marketplace_Name name = [] # 1 Product_Name CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) MS = [] # 3 Product_MS_Classification (Microsoft Security) @@ -281,10 +281,10 @@ def cyphermarketplace_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] - listing = soup.findAll('div', {"class": "card-body"}) + listing = soup.findAll('div', {"class": "col-12 col-sm-6 col-md-4 my-1"}) for a in listing: - bae = a.find('a', {"class": "text-info"}, href=True) + bae = a.find('div', {"class": "card-body"}).find('a', {"class": "text-info"}, href=True) link = bae['href'] href.append(link) diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py index 8769869..2f3341a 100644 --- a/MarketPlaces/DB_Connection/db_connection.py +++ b/MarketPlaces/DB_Connection/db_connection.py @@ -4,7 +4,7 @@ import psycopg2 import traceback import configparser from MarketPlaces.Utilities.utilities import * - +from dateutil.relativedelta import relativedelta, FR def connectDataBase(): @@ -273,6 +273,8 @@ def create_items(cur, row, marketId, vendorId): if newItem: + # decode_decrypt_image_in_base64(row[20]) + sql = "Insert into items (item_id, market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \ "views_item, reviews_item, rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \ "quantityleft_item, shippedfrom_item, shippedto_item, lastseen_item, image_item, href_item, dateinserted_item, " \ @@ -312,7 +314,7 @@ def create_items(cur, row, marketId, vendorId): recset = cur.fetchall() - # decode_decrypt_image_in_base64(recset[0][20]) + # decode_decrypt_image_in_base64(recset[0]['image_item']) if (str(recset[0]['description_item']) != str(row[5] if row[5] != '-1' else None) or str(recset[0]['cve_item']) != str(row[6] if row[6] != '-1' else None) or @@ -401,6 +403,27 @@ def create_items(cur, row, marketId, vendorId): return itemId +def create_status(cur, marketId, date, listings, descriptions, status): + + date = datetime.strptime(date, "%m%d%Y") + + # getting last Fridays a reference date + date_reference = date + relativedelta(weekday=FR(-1)) + + # checking if status already exists + sql = "select * from marketplaces_status where market_id = %(market_id)s and date_inserted = %(date_inserted)s" + cur.execute(sql, {'market_id': 
marketId, 'date_inserted': date})
+
+    recset = cur.fetchall()
+    if recset:
+        sql = "Update marketplaces_status set listings = %(listings)s, descriptions = %(descriptions)s, status = %(status)s, date_reference = %(date_reference)s " \
+              "where market_id = %(market_id)s and date_inserted = %(date_inserted)s"
+        recset = {'listings': listings, 'descriptions': descriptions, 'status': status, 'date_reference': date_reference, 'market_id': marketId, 'date_inserted': date}
+    else:
+        sql = "Insert into marketplaces_status (market_id, date_inserted, listings, descriptions, status, date_reference) Values (%s, %s, %s, %s, %s, %s)"
+        recset = [marketId, date, listings, descriptions, status, date_reference]
+
+    cur.execute(sql, recset)
 
 def create_database(cur, con):
 
     try:
 
@@ -413,6 +436,12 @@ def create_database(cur, con):
         sql = "create unique index unique_market ON marketplaces USING btree (name_market ASC NULLS LAST)"
         cur.execute(sql)
 
+        sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, " \
+              "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL, " \
+              "CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \
+              "CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))"
+        cur.execute(sql)
+
         sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \
               "varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \
               "null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \
diff --git a/MarketPlaces/DarkBazar/crawler_selenium.py b/MarketPlaces/DarkBazar/crawler_selenium.py
index fdfb640..dac91b0 100644
--- a/MarketPlaces/DarkBazar/crawler_selenium.py
+++ b/MarketPlaces/DarkBazar/crawler_selenium.py
@@ -216,12 +216,12 @@ def crawlForum(driver):
                     savePage(driver, driver.page_source, item)
                     driver.back()
 
-                    # # comment out
-                    # break
-                    #
-                    # # comment out
-                    # if count == 1:
-                    #     break
+                    # comment out
+                    break
+
+                    # comment out
+                    if count == 1:
+                        break
 
                 try:
                     link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
@@ -236,7 +236,7 @@ def crawlForum(driver):
             print(link, e)
             i += 1
 
-    print("Crawling the DarkBazar market done.")
+    print("Crawling the DarkBazar market done.")
 
 
 # Returns 'True' if the link is Topic link, may need to change for every website
diff --git a/MarketPlaces/DarkBazar/parser.py b/MarketPlaces/DarkBazar/parser.py
index 9386d18..3d56e92 100644
--- a/MarketPlaces/DarkBazar/parser.py
+++ b/MarketPlaces/DarkBazar/parser.py
@@ -170,7 +170,6 @@ def darkbazar_listing_parser(soup):
 
         # Adding the url to the list of urls
         link = bae[0].get('href')
-        link = cleanLink(link)
         href.append(link)
 
         # Finding the Product
diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py
index 17840f3..56f4454 100644
--- a/MarketPlaces/Initialization/markets_mining.py
+++ b/MarketPlaces/Initialization/markets_mining.py
@@ -14,6 +14,12 @@ from MarketPlaces.M00nkeyMarket.crawler_selenium import crawler as crawlerM00nke
 from MarketPlaces.ViceCity.crawler_selenium import crawler as crawlerViceCity
 from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher
 from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar
+from MarketPlaces.DarkBazar.crawler_selenium import crawler as 
crawlerDarkBazar +from MarketPlaces.Sonanza.crawler_selenium import crawler as crawlerSonanza +from MarketPlaces.Kingdom.crawler_selenium import crawler as crawlerKingdom +from MarketPlaces.BlackPyramid.crawler_selenium import crawler as crawlerBlackPyramid +from MarketPlaces.Quest.crawler_selenium import crawler as crawlerQuest +from MarketPlaces.Ares.crawler_selenium import crawler as crawlerAres from MarketPlaces.Bohemia.crawler_selenium import crawler as crawlerBohemia from MarketPlaces.TheDarkMarket.crawler_selenium import crawler as crawlerTheDarkMarket @@ -109,6 +115,18 @@ if __name__ == '__main__': crawlerCypher() elif mkt == "PabloEscobarMarket": crawlerPabloEscobar() + elif mkt == "DarkBazar": + crawlerDarkBazar() + elif mkt == "Sonanza": + crawlerSonanza() + elif mkt == "Kingdom": + crawlerKingdom() + elif mkt == "BlackPyramid": + crawlerBlackPyramid() + elif mkt == "Quest": + crawlerQuest() + elif mkt == "Ares": + crawlerAres() elif mkt == "TheDarkMarket": crawlerTheDarkMarket() diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index fdbb740..67341b7 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -16,6 +16,13 @@ from MarketPlaces.M00nkeyMarket.parser import * from MarketPlaces.MikesGrandStore.parser import * from MarketPlaces.PabloEscobarMarket.parser import * from MarketPlaces.CityMarket.parser import * +from MarketPlaces.DarkBazar.parser import * +from MarketPlaces.Sonanza.parser import * +from MarketPlaces.Kingdom.parser import * +from MarketPlaces.BlackPyramid.parser import * +from MarketPlaces.Quest.parser import * +from MarketPlaces.Ares.parser import * +from MarketPlaces.CypherMarketplace.parser import * from MarketPlaces.Classifier.classify_product import predict @@ -131,6 +138,20 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile): rw = pabloescobarmarket_listing_parser(soup) elif marketPlace == "CityMarket": rw = city_listing_parser(soup) + elif marketPlace == "Ares": + rw = ares_listing_parser(soup) + elif marketPlace == "DarkBazar": + rw = darkbazar_listing_parser(soup) + elif marketPlace == "Sonanza": + rw = sonanza_listing_parser(soup) + elif marketPlace == "Kingdom": + rw = kingdom_listing_parser(soup) + elif marketPlace == "BlackPyramid": + rw = blackpyramid_listing_parser(soup) + elif marketPlace == "Quest": + rw = quest_listing_parser(soup) + elif marketPlace == "CypherMarketplace": + rw = cyphermarketplace_listing_parser(soup) elif marketPlace == "TheDarkMarket": rw = darkmarket_listing_parser(soup) @@ -168,6 +189,20 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile): rmm = pabloescobarmarket_description_parser(soup) elif marketPlace == "CityMarket": rmm = city_description_parser(soup) + elif marketPlace == "Ares": + rmm = ares_description_parser(soup) + elif marketPlace == "DarkBazar": + rmm = darkbazar_description_parser(soup) + elif marketPlace == "Sonanza": + rmm = sonanza_description_parser(soup) + elif marketPlace == "Kingdom": + rmm = kingdom_description_parser(soup) + elif marketPlace == "BlackPyramid": + rmm = blackpyramid_description_parser(soup) + elif marketPlace == "Quest": + rmm = quest_description_parser(soup) + elif marketPlace == "CypherMarketplace": + rmm = cyphermarketplace_description_parser(soup) elif marketPlace == "TheDarkMarket": rmm = darkmarket_description_parser(soup) else: @@ -369,6 +404,16 @@ def new_parse(marketPlace, url, createLog): # move listing files of 
completed folder move_file(listingFile, createLog, logFile) + # registering the current forum status (up/down) and the number of scraped pages in the database + marketId = verifyMarketPlace(cur, marketPlace) + if (marketId > 0): + + readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html')) + readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html')) + + create_status(cur, marketId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0') + con.commit() + if createLog: logFile.close() diff --git a/MarketPlaces/Kingdom/crawler_selenium.py b/MarketPlaces/Kingdom/crawler_selenium.py index e6b489f..b8e99f0 100644 --- a/MarketPlaces/Kingdom/crawler_selenium.py +++ b/MarketPlaces/Kingdom/crawler_selenium.py @@ -1,4 +1,4 @@ -__author__ = 'DarkWeb' +__author__ = 'Helium' ''' Kingdom Market Crawler (Selenium) @@ -35,55 +35,28 @@ baseURL = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion # Opens Tor Browser, crawls the website def startCrawling(): - # marketName = getMarketName() + mktName = getMKTName() driver = getAccess() if driver != 'down': try: - captcha(driver) login(driver) crawlForum(driver) except Exception as e: print(driver.current_url, e) closeDriver(driver) - # new_parse(marketName, False) + new_parse(mktName, baseURL, True) -def captcha(driver): - ''' - # wait for captcha page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div[1]"))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='/html/body/div/div[2]').screenshot( - r'..\Kingdom\captcha1.png') - - # This method will show image in any image viewer - im = Image.open(r'..\Kingdom\captcha1.png') - im.show() - - iframes = driver.find_elements(by=By.TAG_NAME, value='iframe') +# Login using premade account credentials and do login captcha manually +def login(driver): - # ask user input captcha solution in terminal - print("Enter squares from smallest to largest (squares are numbered 1-9 left to right)") - for order in ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']: - id = input(f"{order}: ") - iframes[int(id)-1].click() - ''' input("Press ENTER when CAPTCHA is completed\n") # wait for login page WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button"))) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - # wait for login page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button"))) + (By.XPATH, '//*[@id="login-form"]'))) # entering username and password into input boxes usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-name"]') @@ -96,39 +69,15 @@ def login(driver): select = Select(driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-sessiontime"]')) select.select_by_visible_text('24 hours') - ''' - # wait for captcha page show up - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '//*[@id="captcha"]'))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Kingdom\captcha2.png') - - # This method will show image in any image viewer - im = Image.open(r'..\Kingdom\captcha2.png') - im.show() - - # wait until input space show up - inputBox = 
driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-captcha"]') - - # ask user input captcha solution in terminal - userIn = input("Enter solution: ") - - # send user solution into the input space - inputBox.send_keys(userIn) - - # click the verify(submit) button - driver.find_element(by=By.XPATH, value="/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button").click() - ''' - input("Press ENTER when CAPTCHA is completed\n") + input("Press ENTER when CAPTCHA and DDOS is completed\n") # wait for listing page show up (This Xpath may need to change based on different seed url) WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div/div/div[3]/div[2]'))) + (By.XPATH, '/html/body/div/div/div[3]/div[1]/div/div[3]'))) # Returns the name of the website -def getMarketName(): +def getMKTName(): name = 'Kingdom' return name @@ -166,8 +115,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -236,30 +185,17 @@ def getInterestedLinks(): links = [] # Software and Malware - links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=c298a77d9e93ad32') + links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127') # # Services - # links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=c298a77d9e93ad32') - # # Exploits - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=45') - # # Tools - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=46') - # # Malware - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=47') - # # Cryptography - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=48') - # # Others - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=49') - # # Hacking Tutorials - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=50') - # # Hacked Accounts and Database Dumps - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30') - # # Android Moded pak - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53') + links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45') + # # guides and tutorials + links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=107') return links def crawlForum(driver): + print("Crawling the Kingdom market") linksToCrawl = getInterestedLinks() @@ -281,6 +217,7 @@ def crawlForum(driver): savePage(driver, html, link) list = productPages(html) + for item in 
list: itemURL = urlparse.urljoin(baseURL, str(item)) try: @@ -290,18 +227,15 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - temp = driver.find_element(by=By.XPATH, value= - '/html/body/div/div/div[3]/div[2]/div[2]/div/div/ul') - next = temp.find_element_by_class_name("next") - link = link.find_element_by_tag_name('a').get_attribute('href') + link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "»")]').get_attribute('href') if link == "": raise NoSuchElementException count += 1 @@ -313,7 +247,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling Kingdom Market done sucessfully. Press ENTER to continue\n") + print("Crawling the Kingdom market done.") # Returns 'True' if the link is Topic link @@ -325,7 +259,7 @@ def isDescriptionLink(url): # Returns True if the link is a listingPage link def isListingLink(url): - if 'category' in url: + if 'filter_category' in url: return True return False @@ -333,10 +267,8 @@ def isListingLink(url): # calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") - #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) return kingdom_links_parser(soup) def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... DONE!") + startCrawling() \ No newline at end of file diff --git a/MarketPlaces/Kingdom/parser.py b/MarketPlaces/Kingdom/parser.py index b1e05d5..ae75d67 100644 --- a/MarketPlaces/Kingdom/parser.py +++ b/MarketPlaces/Kingdom/parser.py @@ -1,4 +1,4 @@ -__author__ = 'DarkWeb' +__author__ = 'Helium' # Here, we are importing the auxiliary functions to clean or convert data from MarketPlaces.Utilities.utilities import * @@ -31,6 +31,8 @@ def kingdom_description_parser(soup): left = "-1" # 16 Product_QuantityLeft shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name @@ -38,56 +40,49 @@ def kingdom_description_parser(soup): desc = tag.find('div',{"class": "col-md-8"}).find('div', {"class": "box-cont"}) name = tag.find('div',{"class": "col-md-8"}).find('div', {"class": "box-head"}).text - name = name.replace('\n', ' ') - name = name.replace(',', ' ') - name = name.strip() + name = cleanString(name).strip() # Finding Prices # Kingdom prices can be shown in a variety of currencies, not all in USD, so keeping currency rows = desc.find_all('div', {"class", "row"}, recursive=False) - price = rows[-1].find('div', {"class": "row"}).find('h3').text - price = price.replace(',', '') - price = price.strip() - # USD = price.replace("USD",'') + USD = rows[-1].find('div', {"class": "row"}).find('h3').text + USD = cleanNumbers(USD).strip() BTC = rows[-1].find('div', {"class": "row"}).find_next_sibling('div').find('span').text + BTC = cleanNumbers(BTC).strip() # Finding Vendor vendor = rows[0].select_one('a[href^="/user"]').text - vendor = vendor.replace(",", " ") - vendor = vendor.strip() + vendor = cleanString(vendor).strip() # Finding Shipment Information (Origem) descs = rows[0].find_all('div', {"class": "col-md-3 text-right"}) shipFrom = descs[2].text - shipFrom = shipFrom.replace(",", "") - shipFrom = shipFrom.strip() + shipFrom = 
cleanString(shipFrom).strip() # Finding Shipment Information (Destiny) shipTo = rows[-1].find('div', {"class": "col-md-6"}).text shipTo = shipTo.replace("Ship to:","") - shipTo = shipTo.replace(",","").strip() - if(shipTo == ''): - shipTo = -1 + shipTo = cleanString(shipTo).strip() + if shipTo == '': + shipTo = "-1" # Finding the Product Category category = descs[0].text - category = category.replace(",", "") - category = category.strip() + category = cleanString(category).strip() # Finding the Product Quantity Available left = descs[1].text - left = left.replace(",", "") - left = left.strip() + left = cleanString(left).strip() # Finding when the Product was Added dt = descs[-1].text.strip() addDate = datetime.strptime(dt, '%d.%m.%Y') # Finding the Product description - describe = cleanString(soup.find('div', {"id": "descriptionContent"}).text) + describe = cleanString(soup.find('div', {"id": "descriptionContent"}).text).strip() # Finding the Number of Product Reviews - review = len(soup.find('div', {"id": "feedbackContent"}).find_all(recursive=False)) + reviews = str(len(soup.find('div', {"id": "feedbackContent"}).find_all(recursive=False))) # Searching for CVE and MS categories # no cve or ms in Kingdom @@ -95,7 +90,7 @@ def kingdom_description_parser(soup): # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo) + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results @@ -126,7 +121,9 @@ def kingdom_listing_parser(soup): qLeft =[] # 17 Product_QuantityLeft shipFrom = [] # 18 Product_ShippedFrom shipTo = [] # 19 Product_ShippedTo - href = [] # 20 Product_Links + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links listing = soup.find('div', {"id": "p0"}).find('div').find_all('div', {"class": "row"}, recursive=False) @@ -139,29 +136,38 @@ def kingdom_listing_parser(soup): #in array USD, there may be prices not in USD, so includes currency as well prices = a.find('div', {"class": "col-md-3"}) u = prices.find('h3').text - u = u.strip() - u = u.replace(',', '') - u = u.strip() - USD.append(u) + USD.append(cleanNumbers(u).strip()) bc = prices.find('div').find('span').text - BTC.append(bc) + BTC.append(cleanNumbers(bc).strip()) # Finding the Product product = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]').text - product = product.replace('\n', ' ') - product = product.replace(","," ") - product = product.strip() - name.append(product) + name.append(cleanString(product).strip()) + + # Finding Product Image + product_image = a.find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) # Finding the Vendor vendor_name = a.select_one('a[href^="/user"]').text - vendor_name = vendor_name.replace(",", " ").replace('/', '') - vendor_name = vendor_name.strip() - vendor.append(vendor_name) + vendor_name = vendor_name.replace('/', '') + vendor.append(cleanString(vendor_name).strip()) + + # Finding Views + product_views = a.find('div', {"class": "col-md-7"}).find_all('p')[0].text + views.append(cleanNumbers(product_views).strip()) + + # Finding Sold + product_sold = a.find('div', {"class": "base-label label label-rounded label-success"}) + if product_sold is not None: + sold.append(cleanNumbers(product_sold.text).strip()) + else: + 
sold.append("-1") # Adding the url to the list of urls link = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]')['href'] - link = cleanLink(link) href.append(link) # Searching for CVE and MS categories @@ -169,7 +175,8 @@ def kingdom_listing_parser(soup): # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, + image, image_vendor) def kingdom_links_parser(soup): diff --git a/MarketPlaces/MikesGrandStore/crawler_selenium.py b/MarketPlaces/MikesGrandStore/crawler_selenium.py index 2bb9e1d..492b306 100644 --- a/MarketPlaces/MikesGrandStore/crawler_selenium.py +++ b/MarketPlaces/MikesGrandStore/crawler_selenium.py @@ -1,7 +1,7 @@ -__author__ = 'Helium' +__author__ = 'cern' ''' -Mikes Grand Store Crawler (Selenium) +MikesGrandStore Marketplace Crawler (Selenium) ''' from selenium import webdriver @@ -10,6 +10,7 @@ from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import Service from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support.ui import Select from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By @@ -21,22 +22,21 @@ import subprocess import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.MikesGrandStore.parser import mikesgrandstore_links_parser +from MarketPlaces.MikesGrandStore.parser import MikesGrandStore_links_parser from MarketPlaces.Utilities.utilities import cleanHTML counter = 1 -baseURL = 'http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/' +baseURL = 'http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion' -# Opens Tor Browser, crawls the website, then parses, then closes tor -#acts like the main method for the crawler, another function at the end of this code calls this function later def startCrawling(): mktName = getMKTName() driver = getAccess() if driver != 'down': try: - login(driver) + # Login not needed in MikesGrandStore + # login(driver) crawlForum(driver) except Exception as e: print(driver.current_url, e) @@ -46,21 +46,18 @@ def startCrawling(): # Returns the name of the website -#return: name of site in string type def getMKTName(): name = 'MikesGrandStore' return name # Return the base link of the website -#return: url of base site in string type def getFixedURL(): - url = 'http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/' + url = 'http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion' return url # Closes Tor Browser -#@param: current selenium driver def closeDriver(driver): # global pid # os.system("taskkill /pid " + str(pro.pid)) @@ -86,8 +83,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # 
ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -110,7 +107,6 @@ def createFFDriver(): #the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' -#return: return the selenium driver or string 'down' def getAccess(): url = getFixedURL() driver = createFFDriver() @@ -122,16 +118,27 @@ def getAccess(): return 'down' -# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha -# then allows for manual solving of captcha in the terminal -#@param: current selenium web driver def login(driver): - # wait for page to show up (This Xpath may need to change based on different seed url) + input("Press ENTER when CAPTCHA is complete and login page has loaded\n") + + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="username"]') + # Username here + usernameBox.send_keys('aliciamykeys') + passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="password"]') + # Password here + passwordBox.send_keys('aliciawherearemykey$') + # session time + session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[4]/div/div[2]/select')) + session_select.select_by_visible_text('Session 60min') + + input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[1]/header/div/div[3]/div/div/ul/li[6]/a"))) + (By.XPATH, '//*[@id="submit"]'))) -# Saves the crawled html page, makes the directory path for html pages if not made def savePage(driver, page, url): cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) @@ -140,50 +147,43 @@ def savePage(driver, page, url): return -# Gets the full path of the page to be saved along with its appropriate file name -#@param: raw url as crawler crawls through every site def getFullPathName(url): from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) - if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') - else: + if isListingLink(url): fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') + else: + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') return fullPath -# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned -#@param: raw url as crawler crawls through every site +def getMKTName() -> str: + name = 'MikesGrandStore' + return name + + def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) - if (name == ''): + if name == '': name = str(counter) counter = counter + 1 return name -# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list -#in this example, there are a couple of categories some threads fall under such as -# Guides and Tutorials, Digital Products, and Software and Malware -#as you can see they are 
categories of products def getInterestedLinks(): links = [] - # Hacking and DDOS + # Hacking links.append('http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/product-category/hacking/') - # # databases - # links.append('http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/product-category/databases/') return links -# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through -#topic and description pages are crawled through here, where both types of pages are saved -#@param: selenium driver def crawlForum(driver): + print("Crawling the MikesGrandStore market") linksToCrawl = getInterestedLinks() @@ -205,6 +205,7 @@ def crawlForum(driver): savePage(driver, html, link) list = productPages(html) + for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) try: @@ -215,15 +216,15 @@ def crawlForum(driver): driver.back() # comment out - break + #break # comment out - if count == 1: - break + #if count == 1: + # break + # go to next page try: - link = driver.find_element(by=By.XPATH, value= - '/html/body/div[1]/main/div/div[1]/div/div[3]/nav/ul/li[6]/a').get_attribute('href') + link = driver.find_element(by=By.XPATH, value="//a[@class='next page-number']").get_attribute('href') if link == "": raise NoSuchElementException count += 1 @@ -238,41 +239,28 @@ def crawlForum(driver): print("Crawling the MikesGrandStore market done.") -# Returns 'True' if the link is a description link -#@param: url of any url crawled -#return: true if is a description page, false if not +# Returns 'True' if the link is Topic link, may need to change for every website def isDescriptionLink(url): - if 'product/' in url: + if 'item' in url: return True return False -# Returns True if the link is a listingPage link -#@param: url of any url crawled -#return: true if is a Listing page, false if not +# Returns True if the link is a listingPage link, may need to change for every website def isListingLink(url): - if 'product-category' in url: + if 'category' in url: return True return False -# calling the parser to define the links, the html is the url of a link from the list of interested link list -#@param: link from interested link list ie. getInterestingLinks() -#return: list of description links that should be crawled through def productPages(html): soup = BeautifulSoup(html, "html.parser") - return mikesgrandstore_links_parser(soup) - - -# Drop links that "signout" -# def isSignOut(url): -# #absURL = urlparse.urljoin(url.base_url, url.url) -# if 'signout' in url.lower() or 'logout' in url.lower(): -# return True -# -# return False + return MikesGrandStore_links_parser(soup) def crawler(): startCrawling() - # print("Crawling and Parsing BestCardingWorld .... 
DONE!") + + +if __name__ == '__main__': + startCrawling() \ No newline at end of file diff --git a/MarketPlaces/MikesGrandStore/parser.py b/MarketPlaces/MikesGrandStore/parser.py index fe9bd61..1207eb2 100644 --- a/MarketPlaces/MikesGrandStore/parser.py +++ b/MarketPlaces/MikesGrandStore/parser.py @@ -1,223 +1,211 @@ __author__ = 'DarkWeb' # Here, we are importing the auxiliary functions to clean or convert data -from typing import List, Tuple from MarketPlaces.Utilities.utilities import * # Here, we are importing BeautifulSoup to search through the HTML tree from bs4 import BeautifulSoup -def mikesGrandStore_description_parser(soup: BeautifulSoup) -> Tuple: - - name = "-1" # 0 Product_Name - describe = "-1" # 1 Product_Description - lastSeen = "-1" # 2 Product_LastViewDate - rules = "-1" # 3 NOT USED ... - CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 5 Product_MS_Classification (Microsoft Security) - review = "-1" # 6 Product_Number_Of_Reviews +# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of description page +# return: 'row' that contains a variety of lists that each hold info on the description page +def MikesGrandStore_description_parser(soup): + # Fields to be parsed + + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) category = "-1" # 7 Product_Category - shipFrom = "-1" # 8 Product_ShippedFrom - shipTo = "-1" # 9 Product_ShippedTo - left = "-1" # 10 Product_QuantityLeft - escrow = "-1" # 11 Vendor_Warranty - terms = "-1" # 12 Vendor_TermsAndConditions - vendor = "-1" # 13 Vendor_Name - sold = "-1" # 14 Product_QuantitySold - addDate = "-1" # 15 Product_AddedDate - available = "-1" # 16 NOT USED ... - endDate = "-1" # 17 NOT USED ... 
- BTC = "-1" # 18 Product_BTC_SellingPrice - USD = "-1" # 19 Product_USD_SellingPrice - rating = "-1" # 20 Vendor_Rating - success = "-1" # 21 Vendor_Successful_Transactions - EURO = "-1" # 22 Product_EURO_SellingPrice - - - name: str = soup.find("h1", {"class": "product-title product_title entry-title"}).text - - describe = soup.find("div", {"id": "tab-description"}).text - - commentsList: List[BeautifulSoup] = soup.find("ol", {"class": "commentlist"}).find_all("li") - - if len(commentsList) > 0: - lastReview: BeautifulSoup = commentsList[0] - lastSeen = lastReview.find("time").get("datetime").text - - reviewTab: str = soup.find('a', {'href': '#tab-reivews'}).text - review = reviewTab.split('(')[1].split(')')[0] - - navbarBreadcrumbs: List[BeautifulSoup] = soup.find('nav', {'class': 'woocommerce-breadcrumb breadcrumbs '}).find_all('a') - category = navbarBreadcrumbs[1].text - - USD = soup.find("div", {"class": "price-wrapper"}).text - - reviewStats: str = soup.find("div", {"class": "star-rating"}).text - rating = reviewStats.split(' ')[1] - - row = ( - name, - describe, - lastSeen, - rules, - CVE, - MS, - review, - category, - shipFrom, - shipTo, - left, - escrow, - terms, - vendor, - sold, - addDate, - available, - endDate, - BTC, - USD, - rating, - success, - EURO - ) - + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image + + # Finding Product Name + name = soup.find('h1', {'class': 'product-title product_title entry-title'}).text + name = name.replace('\n', ' ') + name = name.replace(",", "") + name = name.strip() + + divmb = soup.findAll('div', {'class': "mb-1"}) + + # Finding Vendor + # no vendor + vendor = "MikesGrandStore" + + # Finding the Product Rating + rating_item = soup.find('strong', {'class', 'rating'}).text + rating_item = rating_item.replace('\n', ' ') + rating_item = rating_item.replace(",", "") + rating_item = rating_item.strip() + + # Finding Number of Product Reviews + review_container = soup.find('li', {'id': 'tab-title-reviews'}) + reviews = review_container.find('a').text + reviews = reviews.replace('Reviews', '') + reviews = reviews.replace('(', '') + reviews = reviews.replace(')', '') + reviews = reviews.replace('\n', ' ') + reviews = reviews.replace(",", "") + reviews = reviews.strip() + + # Finding Prices + USD = soup.find('span', {'class': 'woocommerce-Price-currencySymbol'}).next_sibling + USD = USD.replace('\n', ' ') + USD = USD.replace(",", "") + USD = USD.strip() + + # Finding the Product Category + cat_container = soup.find('span', {'class': 'posted_in'}) + cat = cat_container.findAll('a') + category = "" + for name in cat: + category = category + " " + name.text + + # Finding the Product Quantity Available + stock = soup.find('p', {'class': 'stock in-stock'}) + if stock is not None: + left = stock.text + left = left.replace("in stock", "") + left = left.strip() + + # Finding the Product description + desc_cont = soup.find('div', {'class': 'product-short-description'}) + describe = desc_cont.find('p').text.strip() + + # Finding Product Image + image = soup.find('img', {'class': 
'wp-post-image skip-lazy'}) + image = image.get('src') + image = image.split('base64,')[-1] + + # Searching for CVE and MS categories + cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) + if cve: + CVE = " " + for idx in cve: + CVE += (idx) + CVE += " " + CVE = CVE.replace(',', ' ') + CVE = CVE.replace('\n', '') + ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}')) + if ms: + MS = " " + for im in ms: + MS += (im) + MS += " " + MS = MS.replace(',', ' ') + MS = MS.replace('\n', '') + + # Populating the final variable (this should be a list with all fields scraped) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) + + # Sending the results return row -def mikesGrandStore_listing_parser(soup: BeautifulSoup) -> List: - - # Fields to be parsed - nm = 0 # Total_Products (Should be Integer) - mktName = "MikesGrandStore" # 0 Marketplace_Name - name = [] # 1 Product_Name - CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 3 Product_MS_Classification (Microsoft Security) - category = [] # 4 Product_Category - describe = [] # 5 Product_Description - escrow = [] # 6 Vendor_Warranty - views = [] # 7 Product_Number_Of_Views - reviews = [] # 8 Product_Number_Of_Reviews - addDate = [] # 9 Product_AddDate - lastSeen = [] # 10 Product_LastViewDate - BTC = [] # 11 Product_BTC_SellingPrice - USD = [] # 12 Product_USD_SellingPrice - EURO = [] # 13 Product_EURO_SellingPrice - sold = [] # 14 Product_QuantitySold - qLeft =[] # 15 Product_QuantityLeft - shipFrom = [] # 16 Product_ShippedFrom - shipTo = [] # 17 Product_ShippedTo - vendor = [] # 18 Vendor - rating = [] # 19 Vendor_Rating - success = [] # 20 Vendor_Successful_Transactions - href = [] # 23 Product_Links (Urls) - - - - pass - -#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs -#stores info it needs in different lists, these lists are returned after being organized -#@param: soup object looking at html page of listing page -#return: 'row' that contains a variety of lists that each hold info on the listing page -def darkfox_listing_parser(soup): +# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of listing page +# return: 'row' that contains a variety of lists that each hold info on the listing page +def MikesGrandStore_listing_parser(soup): # Fields to be parsed - nm = 0 # Total_Products (Should be Integer) - mktName = "DarkFox" # 0 Marketplace_Name - name = [] # 1 Product_Name - CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 3 Product_MS_Classification (Microsoft Security) - category = [] # 4 Product_Category - describe = [] # 5 Product_Description - escrow = [] # 6 Vendor_Warranty - views = [] # 7 Product_Number_Of_Views - reviews = [] # 8 Product_Number_Of_Reviews - addDate = [] # 9 Product_AddDate - lastSeen = [] # 10 Product_LastViewDate - BTC = [] # 11 Product_BTC_SellingPrice - USD = [] # 12 Product_USD_SellingPrice - EURO = [] # 13 Product_EURO_SellingPrice - sold = [] # 14 Product_QuantitySold - qLeft =[] # 15 Product_QuantityLeft - shipFrom = [] # 16 Product_ShippedFrom - shipTo = [] # 17 Product_ShippedTo - vendor = [] # 18 Vendor - rating = [] # 19 Vendor_Rating - 
success = [] # 20 Vendor_Successful_Transactions - href = [] # 23 Product_Links (Urls) - - listing = soup.findAll('div', {"class": "card"}) + nm = 0 # *Total_Products (Should be Integer) + mktName = "MikesGrandStore" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links + + listing_container = soup.find('div', {'class': 'products row row-small large-columns-3 medium-columns-3 small-columns-2 equalize-box'}) + listing = listing_container.findAll('div', recursive=False) # Populating the Number of Products nm = len(listing) for a in listing: bae = a.findAll('a', href=True) + lb = a.findAll('div', {"id": "littlebox"}) # Adding the url to the list of urls - link = bae[0].get('href') - link = cleanLink(link) + link = a.find('a', {'class': 'woocommerce-LoopProduct-link woocommerce-loop-product__link'}).get('href') href.append(link) # Finding the Product - product = bae[1].find('p').text + product = a.find('a', {'class': 'woocommerce-LoopProduct-link woocommerce-loop-product__link'}).text product = product.replace('\n', ' ') product = product.replace(",", "") product = product.replace("...", "") product = product.strip() name.append(product) - bae = a.find('div', {'class': "media-content"}).find('div').find_all('div') - - if len(bae) >= 5: - # Finding Prices - price = bae[0].text - ud = price.replace(" USD", " ") - # u = ud.replace("$","") - u = ud.replace(",", "") - u = u.strip() - USD.append(u) - # bc = (prc[1]).strip(' BTC') - # BTC.append(bc) - - # Finding the Vendor - vendor_name = bae[1].find('a').text - vendor_name = vendor_name.replace(",", "") - vendor_name = vendor_name.strip() - vendor.append(vendor_name) - - # Finding the Category - cat = bae[2].find('small').text - cat = cat.replace("Category: ", "") - cat = cat.replace(",", "") - cat = cat.strip() - category.append(cat) - - # Finding Number Sold and Quantity Left - num = bae[3].text - num = num.replace("Sold: ", "") - num = num.strip() - sold.append(num) - - quant = bae[4].find('small').text - quant = quant.replace("In stock: ", "") - quant = quant.strip() - qLeft.append(quant) - - # Finding Successful Transactions - freq = bae[1].text - freq = freq.replace(vendor_name, "") - freq = re.sub(r'Vendor Level \d+', "", freq) - freq = freq.replace("(", "") - freq = freq.replace(")", "") - freq = freq.strip() - success.append(freq) + # Finding Product Image + product_image = a.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'}) + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) + + # Finding Prices + price = a.find('span', 
{'class': 'woocommerce-Price-currencySymbol'}).next_sibling + price = price.strip() + USD.append(price) + + # Finding the Vendor + vendor_name = "MikesGrandStore" + vendor.append(vendor_name) + + image_vendor.append("-1") + + # Finding the Category + cat = a.find('p', {'class': 'category uppercase is-smaller no-text-overflow product-cat op-7'}).text + cat = cat.replace("class:", "") + cat = cat.strip() + category.append(cat) + + # Finding product rating + rating = a.find('strong', {'class': 'rating'}).text + rating = rating.strip() + rating_item.append(rating) + # Searching for CVE and MS categories cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if not cve: - cveValue="-1" + cveValue = "-1" else: cee = " " for idx in cve: @@ -225,12 +213,12 @@ def darkfox_listing_parser(soup): cee += " " cee = cee.replace(',', ' ') cee = cee.replace('\n', '') - cveValue=cee + cveValue = cee CVE.append(cveValue) - + ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) if not ms: - MSValue="-1" + MSValue = "-1" else: me = " " for im in ms: @@ -238,27 +226,34 @@ def darkfox_listing_parser(soup): me += " " me = me.replace(',', ' ') me = me.replace('\n', '') - MSValue=me + MSValue = me MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen, - BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href) - + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) -#called by the crawler to get description links on a listing page -#@param: beautifulsoup object that is using the correct html page (listing page) -#return: list of description links from a listing page -def mikesgrandstore_links_parser(soup): +# called by the crawler to get description links on a listing page +# @param: beautifulsoup object that is using the correct html page (listing page) +# return: list of description links from a listing page +def MikesGrandStore_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] - listing = soup.findAll('div', {"class": "box-image"}) + container = soup.find('div', {"class": "products row row-small large-columns-3 medium-columns-3 small-columns-2 equalize-box"}) + listing = container.findAll('div', recursive=False) + + # for a in listing: + # bae = a.find('a', {"class": "text-info"}, href=True) + # link = bae['href'] + # href.append(link) for a in listing: - bae = a.find('div', {"class": "image-fade_in_back"}).find('a', href=True) - link = bae['href'] + bae = a.findAll('a', href=True) + + # Adding the url to the list of urls + link = bae[0].get('href') href.append(link) - return href \ No newline at end of file + return href diff --git a/MarketPlaces/Quest/crawler_selenium.py b/MarketPlaces/Quest/crawler_selenium.py index 69287a9..8a84d68 100644 --- a/MarketPlaces/Quest/crawler_selenium.py +++ b/MarketPlaces/Quest/crawler_selenium.py @@ -1,7 +1,7 @@ __author__ = 'DarkWeb' ''' -Quest Market Crawler (Selenium) +Quest Marketplace Crawler (Selenium) ''' from selenium import webdriver @@ -9,15 +9,17 @@ from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import 
Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait -from PIL import Image +from selenium.webdriver.support.ui import Select +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By +from PIL import Image import urllib.parse as urlparse import os, re, time from datetime import date import subprocess +import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.Quest.parser import quest_links_parser @@ -27,9 +29,8 @@ counter = 1 baseURL = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion' -# Opens Tor Browser, crawls the website def startCrawling(): - marketName = getMarketName() + mktName = getMKTName() driver = getAccess() if driver != 'down': @@ -40,66 +41,18 @@ def startCrawling(): print(driver.current_url, e) closeDriver(driver) - new_parse(marketName, False) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - #wait for login page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button"))) - - #entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') - #Username here - usernameBox.send_keys('blabri') - passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') - #Password here - passwordBox.send_keys('fishowal') - - ''' - # wait for captcha page show up - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/div/img"))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/div/img').screenshot( - r'..\Quest\captcha.png') - - # This method will show image in any image viewer - im = Image.open(r'..\Quest\captcha.png') - - im.show() - - # wait until input space show up - inputBox = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/input') - - # ask user input captcha solution in terminal - userIn = input("Enter solution: ") - - # send user solution into the input space - inputBox.send_keys(userIn) - - # click the verify(submit) button - driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button").click() - ''' - input("Press ENTER when CAPTCHA is completed\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div[5]/div/div/div/span'))) + new_parse(mktName, baseURL, True) # Returns the name of the website -def getMarketName(): +def getMKTName(): name = 'Quest' return name -# Return the link of the website +# Return the base link of the website def getFixedURL(): url = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion' - return url @@ -129,8 +82,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) 
+ # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -146,12 +99,13 @@ def createFFDriver(): service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - + driver.maximize_window() return driver +#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' def getAccess(): url = getFixedURL() driver = createFFDriver() @@ -163,7 +117,28 @@ def getAccess(): return 'down' -# Saves the crawled html page +def login(driver): + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="username"]'))) + + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') + # Username here + usernameBox.send_keys('CashCarti') + passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') + # Password here + passwordBox.send_keys('Mahogany') + # Clicking the login button + # login_button = driver.find_element(By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button') + # login_button.click() + + input("Press ENTER when CAPTCHA is completed\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/div[1]/nav/div/a/img'))) + + def savePage(driver, page, url): cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) @@ -172,7 +147,6 @@ def savePage(driver, page, url): return -# Gets the full path of the page to be saved along with its appropriate file name def getFullPathName(url): from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE @@ -185,7 +159,11 @@ def getFullPathName(url): return fullPath -# Creates the file name from passed URL +def getMKTName() -> str: + name = 'Quest' + return name + + def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) @@ -198,23 +176,26 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # # Digital - Services - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/8ae67900-22ed-11ec-a710-31f963ce8d35') - # # Digital - Software - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/92809300-22ed-11ec-b143-af312e1dab77') - # # Digital - Tutorials - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/9d1592b0-22ed-11ec-b82d-c3d2878a8716') - # # Digital - Malware - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/a35bae90-22ed-11ec-ad2e-410f5a5339b5') - # # Digital - Hacking - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/b4252cf0-22ed-11ec-8032-751549438ed5') - # Digital - Exploits + ## Services + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/8ae67900-22ed-11ec-a710-31f963ce8d35') + ## Software + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/92809300-22ed-11ec-b143-af312e1dab77') + ## Tutorial + 
links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/9d1592b0-22ed-11ec-b82d-c3d2878a8716') + ## Malware + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/a35bae90-22ed-11ec-ad2e-410f5a5339b5') + ## Hacking + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/b4252cf0-22ed-11ec-8032-751549438ed5') + ## Exploits links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/c0c3ac60-22ed-11ec-9e97-41cd1912fdee') + ## Carding + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/cbe06b00-22ec-11ec-ab3a-816857220dec') return links def crawlForum(driver): + print("Crawling the Quest market") linksToCrawl = getInterestedLinks() @@ -236,6 +217,7 @@ def crawlForum(driver): savePage(driver, html, link) list = productPages(html) + for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) try: @@ -245,18 +227,16 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - nav = driver.find_element(by=By.XPATH, value='/html/body/div[6]/nav') - li = nav.find_elements(By.TAG_NAME, 'li') - a = li[-1].find_element(By.TAG_NAME, 'a') - link = a.get_attribute('href') + link_elem = driver.find_element(by=By.CSS_SELECTOR, value='a.page-link[rel="next"]') + link = link_elem.get_attribute('href') if link == "": raise NoSuchElementException count += 1 @@ -268,24 +248,23 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling Quest market done sucessfully. Press ENTER to continue\n") + print("Crawling the Quest market done.") -# Returns 'True' if the link is Topic link +# Returns 'True' if the link is Topic link, may need to change for every website def isDescriptionLink(url): if 'product' in url: return True return False -# Returns True if the link is a listingPage link +# Returns True if the link is a listingPage link, may need to change for every website def isListingLink(url): if 'category' in url: return True return False -# calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") return quest_links_parser(soup) @@ -293,4 +272,3 @@ def productPages(html): def crawler(): startCrawling() - # print("Crawling and Parsing BestCardingWorld .... 
DONE!") diff --git a/MarketPlaces/Quest/parser.py b/MarketPlaces/Quest/parser.py index 6761ed9..6852b04 100644 --- a/MarketPlaces/Quest/parser.py +++ b/MarketPlaces/Quest/parser.py @@ -7,9 +7,11 @@ from MarketPlaces.Utilities.utilities import * from bs4 import BeautifulSoup -# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) +# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of description page +# return: 'row' that contains a variety of lists that each hold info on the description page def quest_description_parser(soup): - # Fields to be parsed vendor = "-1" # 0 *Vendor_Name @@ -31,111 +33,69 @@ def quest_description_parser(soup): left = "-1" # 16 Product_QuantityLeft shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo - - row = soup.find_all('div', {'class': "row"}) + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name - name = row[1].text - name = name.replace('\n', ' ') - name = name.replace(",", "") - name = name.strip() + name = soup.find('div', class_='card-header bg-dark text-white rounded-0 text-center').text + name = cleanString(name).strip() - small = row[3].find_all('small') + # USD Price + USD = soup.find('small', text='Product Price:').find_next('small').text.replace('$', '').strip() - # Finding Vendor - vendor = small[0].text - vendor = vendor.replace("Vendor:", "") - vendor = vendor.replace(",", "") - vendor = vendor.strip() + # Product Description + describe = soup.find('textarea').text + describe = cleanString(describe).strip() - # Finding Vendor Rating - full_stars = small[2].find_all('i', {'class': "fas fa-star"}) - half_star = small[2].find('i', {'class': "fas fa-star-half-alt"}) - rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0) + # Finding Product Image + image = soup.find('img', {'class': 'img-fluid'}) + image = image.get('src').split('base64,')[-1] + + # Finding Vendor Image + vendor_image = soup.select_one('.card-body.bg-mgray.css-selector.shadow img') + vendor_image = vendor_image.get('src').split('base64,')[-1] # Finding Successful Transactions - success = small[4].text - success = success.replace("Total Sales:", "") - success = success.strip() - - small = row[2].find('p', {'class': "text-left"}).find_all('small') - - # Finding Prices - USD = small[1].text - USD = USD.replace("$", "") - USD = USD.strip() - - shipping_info = row[2].find('p', {'class': "text-left"}).find('span').text.strip() - if "Digital" not in shipping_info: - shipping_info = shipping_info.split(" ") - - # Finding Shipment Information (Origin) - shipFrom = shipping_info[0].strip() - - # Finding Shipment Information (Destination) - shipTo = shipping_info[1].strip() - - textarea = row[2].find_all('textarea') - - # Finding the Product description - describe = textarea[0].text - describe = describe.replace("\n", " ") - describe = describe.replace("\r", " ") - describe = describe.strip() - - ''' - # Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1" - ''' 
- - # Searching for CVE and MS categories - cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if cve: - CVE = " " - for idx in cve: - CVE += (idx) - CVE += " " - CVE = CVE.replace(',', ' ') - CVE = CVE.replace('\n', '') - ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}')) - if ms: - MS = " " - for im in ms: - MS += (im) - MS += " " - MS = MS.replace(',', ' ') - MS = MS.replace('\n', '') + success = soup.find('strong', text='Total Sales:').parent.text + success = cleanNumbers(success).strip() + + # Finding Vendor Rating + temp = soup.find('strong', text='Rating:').parent + rating_vendor = len(temp.findAll('i', {"class": "fas fa-star"})) + half_stars = len(temp.findAll('i', {'class': "fas fa-star-half-alt"})) + if half_stars > 0: + rating_vendor += 0.5 + + # Finding Item Rating + temp = soup.find('small', text='Average Product Score:').find_next('small') + rating_item = len(temp.findAll('i', {"class": "fas fa-star"})) + half_stars = len(temp.findAll('i', {'class': "fas fa-star-half-alt"})) + if half_stars > 0: + rating_item += 0.5 # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo) + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row -# This is the method to parse the Listing Pages +# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of listing page +# return: 'row' that contains a variety of lists that each hold info on the listing page def quest_listing_parser(soup): # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "Quest" # 0 *Marketplace_Name + nm = 0 # *Total_Products (Should be Integer) + mktName = "quest" # 0 *Marketplace_Name vendor = [] # 1 *Vendor y rating_vendor = [] # 2 Vendor_Rating success = [] # 3 Vendor_Successful_Transactions name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this category = [] # 7 Product_Category y describe = [] # 8 Product_Description views = [] # 9 Product_Number_Of_Views @@ -146,87 +106,73 @@ def quest_listing_parser(soup): USD = [] # 14 Product_USD_SellingPrice y EURO = [] # 15 Product_EURO_SellingPrice sold = [] # 16 Product_QuantitySold - qLeft =[] # 17 Product_QuantityLeft + qLeft = [] # 17 Product_QuantityLeft shipFrom = [] # 18 Product_ShippedFrom shipTo = [] # 19 Product_ShippedTo - href = [] # 20 Product_Links - - # Finding category of listing page - cat = soup.find('span', {'class': "btn btn-sm btn-outline-mgray active border-info"}).text - cat = cat.replace("Digital -", "") - cat = cat.strip() - - listing = soup.find_all('div', {"class": "col-md-2 my-md-0 col-12"}) + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links + # Extract all product listings + listing = soup.findAll('div', class_='col-md-2 my-md-0 col-12') # Populating the Number of Products nm = len(listing) for a in listing: - bae = a.find_all('a', href=True) - - # Adding the 
category - category.append(cat) - - # Adding the url to the list of urls - link = bae[0].get('href') - link = cleanLink(link) - href.append(link) - - # Finding the Vendor - vendor_name = bae[2].text - vendor_name = vendor_name.replace(",", "") - vendor_name = vendor_name.strip() - vendor.append(vendor_name) - - # Finding the Product - product = bae[1].find('img').get('alt') - product = product.replace('\n', ' ') - product = product.replace(",", "") - product = product.strip() - name.append(product) - - # Searching for CVE and MS categories - cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if not cve: - cveValue="-1" - else: - cee = " " - for idx in cve: - cee += (idx) - cee += " " - cee = cee.replace(',', ' ') - cee = cee.replace('\n', '') - cveValue=cee - CVE.append(cveValue) - - ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) - if not ms: - MSValue="-1" - else: - me = " " - for im in ms: - me += (im) - me += " " - me = me.replace(',', ' ') - me = me.replace('\n', '') - MSValue=me - MS.append(MSValue) + + # Extracting Product URL & Name + product_link_tags = a.find_all('a', class_='badge-info') + if product_link_tags: + # Using the first tag as default + product_link_tag = product_link_tags[0] + href.append(product_link_tag['href']) + name.append(cleanString(product_link_tag.text).strip()) + + # Extracting Product Image + img_tag = a.find('img') + if img_tag: + image_data = img_tag['src'].split('base64,')[-1] + image.append(image_data) + + # Extracting Vendor Name + vendor_tag = a.find('a', class_='badge-dark') + if vendor_tag: + vendor.append(cleanString(vendor_tag.text.replace('👤', '')).strip()) + + # Extracting Product Price in USD + price_tag = a.find('a', class_='text') + if price_tag: + USD.append(price_tag.text.replace("$", "").strip()) + + category_tag = soup.find('span', class_= 'btn btn-sm btn-outline-mgray active border-info') + if category_tag: + category.append(cleanString(category_tag.text).strip()) # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) +# called by the crawler to get description links on a listing page +# @param: beautifulsoup object that is using the correct html page (listing page) +# return: list of description links from a listing page def quest_links_parser(soup): + # Returning all product links - # Returning all links that should be visited by the Crawler href = [] - listing = soup.findAll('div', {"class": "col-md-2 my-md-0 col-12"}) + # Locate all divs with class 'row' + row_divs = soup.findAll('div', class_='row') + + for row_div in row_divs: + # Locate all product divs within the current 'row' div + product_divs = row_div.findAll('div', class_='col-md-2 my-md-0 col-12') - for div in listing: + for product_div in product_divs: + # Locate the anchor tag containing the product link within each product div + product_link_tag = product_div.find('a', class_='badge-info') - link = div.find('a')["href"] - href.append(link) + if product_link_tag and product_link_tag.has_attr('href'): + href.append(product_link_tag['href']) return href \ No newline at end of file diff --git a/MarketPlaces/Sonanza/crawler_selenium.py b/MarketPlaces/Sonanza/crawler_selenium.py new file mode 100644 index 0000000..c0ea415 --- /dev/null +++ 
b/MarketPlaces/Sonanza/crawler_selenium.py @@ -0,0 +1,263 @@ +__author__ = 'DarkWeb' + +''' +Sonanza Marketplace Crawler (Selenium) +''' + +from selenium import webdriver +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver.firefox.firefox_profile import FirefoxProfile +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from selenium.webdriver.firefox.service import Service +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support.ui import Select +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By + +from PIL import Image +import urllib.parse as urlparse +import os, re, time +from datetime import date +import subprocess +import configparser +from bs4 import BeautifulSoup +from MarketPlaces.Initialization.prepare_parser import new_parse +from MarketPlaces.Sonanza.parser import sonanza_links_parser +from MarketPlaces.Utilities.utilities import cleanHTML + +counter = 1 +baseURL = 'http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/' + + +def startCrawling(): + mktName = getMKTName() + driver = getAccess() + + if driver != 'down': + try: + login(driver) + crawlForum(driver) + except Exception as e: + print(driver.current_url, e) + closeDriver(driver) + + new_parse(mktName, baseURL, True) + + +# Returns the name of the website +def getMKTName(): + name = 'Sonanza' + return name + + +# Return the base link of the website +def getFixedURL(): + url = 'http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/' + return url + + +# Closes Tor Browser +def closeDriver(driver): + # global pid + # os.system("taskkill /pid " + str(pro.pid)) + # os.system("taskkill /t /f /im tor.exe") + print('Closing Tor...') + driver.close() + time.sleep(3) + return + + +# Creates FireFox 'driver' and configure its 'Profile' +# to use Tor proxy and socket +def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) + + ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) + ff_prof.set_preference("places.history.enabled", False) + ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) + ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) + ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) + ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) + ff_prof.set_preference("signon.rememberSignons", False) + ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + ff_prof.set_preference("network.dns.disablePrefetch", True) + ff_prof.set_preference("network.http.sendRefererHeader", 0) + ff_prof.set_preference("permissions.default.image", 3) + ff_prof.set_preference("browser.download.folderList", 2) + ff_prof.set_preference("browser.download.manager.showWhenStarting", False) + ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") + ff_prof.set_preference('network.proxy.type', 1) + ff_prof.set_preference("network.proxy.socks_version", 5) + ff_prof.set_preference('network.proxy.socks', '127.0.0.1') + ff_prof.set_preference('network.proxy.socks_port', 9150) + ff_prof.set_preference('network.proxy.socks_remote_dns', True) + ff_prof.set_preference("javascript.enabled", False) + ff_prof.update_preferences() + + service = Service(config.get('TOR', 'geckodriver_path')) + + driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, 
service=service) + + driver.maximize_window() + + return driver + + +#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' +def getAccess(): + url = getFixedURL() + driver = createFFDriver() + try: + driver.get(url) + return driver + except: + driver.close() + return 'down' + + +def login(driver): + # input("Press ENTER when CAPTCHA is complete and login page has loaded\n") + # + # # entering username and password into input boxes + # usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="username"]') + # # Username here + # usernameBox.send_keys('aliciamykeys') + # passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="password"]') + # # Password here + # passwordBox.send_keys('aliciawherearemykey$') + # # session time + # session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[4]/div/div[2]/select')) + # session_select.select_by_visible_text('Session 60min') + + input("Press ENTER when CAPTCHA is completed and listing page loaded\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="searchbar"]'))) + + +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) + filePath = getFullPathName(url) + os.makedirs(os.path.dirname(filePath), exist_ok=True) + open(filePath, 'wb').write(cleanPage.encode('utf-8')) + return + + +def getFullPathName(url): + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") + fileName = getNameFromURL(url) + if isDescriptionLink(url): + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') + else: + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') + return fullPath + + +def getNameFromURL(url): + global counter + name = ''.join(e for e in url if e.isalnum()) + if name == '': + name = str(counter) + counter = counter + 1 + return name + + +def getInterestedLinks(): + links = [] + + # Guides and Tutorials + links.append('http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/category/3') + # Software and Malware + links.append('http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/category/4') + # Fraud + links.append('http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/category/5') + # Digital Products + links.append('http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/category/21') + # Services + links.append('http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/category/22') + + return links + + +def crawlForum(driver): + + print("Crawling the Sonanza market") + + linksToCrawl = getInterestedLinks() + + i = 0 + while i < len(linksToCrawl): + link = linksToCrawl[i] + print('Crawling :', link) + try: + has_next_page = True + count = 0 + + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(driver, html, link) + + list = productPages(html) + + for item in list: + itemURL = urlparse.urljoin(baseURL, str(item)) + try: + driver.get(itemURL) + except: + driver.refresh() + savePage(driver, driver.page_source, item) + driver.back() + + # # comment out + # break + # + # # comment out + # if count == 1: + # break + + try: + link = 
driver.find_element(by=By.XPATH, value='//a[contains(text(), "›")]').get_attribute('href')
+                    if link == "":
+                        raise NoSuchElementException
+                    count += 1
+
+                except NoSuchElementException:
+                    has_next_page = False
+
+        except Exception as e:
+            print(link, e)
+        i += 1
+
+    print("Crawling the Sonanza market done.")
+
+
+# Returns 'True' if the link is a description (product) link, may need to change for every website
+def isDescriptionLink(url):
+    if 'article' in url:
+        return True
+    return False
+
+
+# Returns 'True' if the link is a listing page link, may need to change for every website
+def isListingLink(url):
+    if 'category' in url:
+        return True
+    return False
+
+
+def productPages(html):
+    soup = BeautifulSoup(html, "html.parser")
+    return sonanza_links_parser(soup)
+
+
+def crawler():
+    startCrawling()
diff --git a/MarketPlaces/Sonanza/parser.py b/MarketPlaces/Sonanza/parser.py
new file mode 100644
index 0000000..10166f5
--- /dev/null
+++ b/MarketPlaces/Sonanza/parser.py
@@ -0,0 +1,238 @@
+__author__ = 'DarkWeb'
+
+# Here, we are importing the auxiliary functions to clean or convert data
+from MarketPlaces.Utilities.utilities import *
+
+# Here, we are importing BeautifulSoup to search through the HTML tree
+from bs4 import BeautifulSoup
+
+
+# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
+# stores info it needs in different lists, these lists are returned after being organized
+# @param: soup object looking at html page of description page
+# return: 'row' that contains a variety of lists that each hold info on the description page
+def sonanza_description_parser(soup):
+    # Fields to be parsed
+
+    vendor = "-1"                       # 0 *Vendor_Name
+    success = "-1"                      # 1 Vendor_Successful_Transactions
+    rating_vendor = "-1"                # 2 Vendor_Rating
+    name = "-1"                         # 3 *Product_Name
+    describe = "-1"                     # 4 Product_Description
+    CVE = "-1"                          # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+    MS = "-1"                           # 6 Product_MS_Classification (Microsoft Security)
+    category = "-1"                     # 7 Product_Category
+    views = "-1"                        # 8 Product_Number_Of_Views
+    reviews = "-1"                      # 9 Product_Number_Of_Reviews
+    rating_item = "-1"                  # 10 Product_Rating
+    addDate = "-1"                      # 11 Product_AddedDate
+    BTC = "-1"                          # 12 Product_BTC_SellingPrice
+    USD = "-1"                          # 13 Product_USD_SellingPrice
+    EURO = "-1"                         # 14 Product_EURO_SellingPrice
+    sold = "-1"                         # 15 Product_QuantitySold
+    left = "-1"                         # 16 Product_QuantityLeft
+    shipFrom = "-1"                     # 17 Product_ShippedFrom
+    shipTo = "-1"                       # 18 Product_ShippedTo
+    image = "-1"                        # 19 Product_Image
+    vendor_image = "-1"                 # 20 Vendor_Image
+
+    listing = soup.find('div', {"id": "article_page"})
+
+    # Finding the Product
+    name = listing.find('div', {"class": "row box"}).text
+    name = cleanString(name).strip()
+
+    # Finding Product Image
+    product_image = listing.find('img')
+    product_image = product_image.get('src')
+    product_image = product_image.split('base64,')[-1]
+    image = product_image
+
+    table = listing.find('div', {"class": "col-md-5"})
+
+    # Finding Prices
+    USD = table.find('span', {"class": "pr"}).text
+    USD = USD.replace("$", "").strip()
+
+    BTC = table.find_all('span', {"class": "pr1"})[1].text
+    BTC = BTC.replace("BTC", "").strip()
+
+    rows = table.find_all('p', {"class": "mb-0"})
+    for row in rows:
+        temp = row.text
+        if "CATEGORY" in temp:
+            category = temp.replace("CATEGORY :", "")
+            category = cleanString(category).strip()
+        elif "VENDOR LEVEL" in temp:
+            rating_vendor = temp.replace("VENDOR LEVEL :", "")
+            rating_vendor = cleanString(rating_vendor).strip()
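+
+    # The loop above reads CATEGORY and VENDOR LEVEL from the "mb-0" paragraphs; the
+    # seller details (VENDOR, SHIPS TO, SOLD) live in separate "mb-1" paragraphs and
+    # are parsed by the next loop.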
+ + rows = listing.find_all('p', {"class": "mb-1"}) + for row in rows: + temp = row.text + if "VENDOR" in temp: + vendor = temp.replace("VENDOR :", "") + vendor = cleanString(vendor).strip() + elif "SHIPS TO" in temp: + shipTo = temp.replace("SHIPS TO :", "") + shipTo = cleanString(shipTo).strip() + elif "SOLD" in temp: + sold = cleanNumbers(temp).strip() + + # Finding Product Description + describe = listing.find('pre').text + describe = cleanString(describe).strip() + + # Searching for CVE and MS categories + cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) + if cve: + CVE = " " + for idx in cve: + CVE += (idx) + CVE += " " + CVE = CVE.replace(',', ' ') + CVE = CVE.replace('\n', '') + ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}')) + if ms: + MS = " " + for im in ms: + MS += (im) + MS += " " + MS = MS.replace(',', ' ') + MS = MS.replace('\n', '') + + # Populating the final variable (this should be a list with all fields scraped) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) + + # Sending the results + return row + + +# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of listing page +# return: 'row' that contains a variety of lists that each hold info on the listing page +def sonanza_listing_parser(soup): + + # Fields to be parsed + nm = 0 # *Total_Products (Should be Integer) + mktName = "Sonanza" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links + + listings = soup.findAll('div', {"class": "col-sm-4 col-md-3"}) + + # Populating the Number of Products + nm = len(listings) + + for listing in listings: + + # Adding the url to the list of urls + bae = listing.find('a', href=True) + link = bae.get('href') + href.append(link) + + # Finding Product Image + product_image = listing.find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) + + # Finding the Product + product = listing.find('h5', {"class": "art_title"}).text + product = cleanString(product) + name.append(product.strip()) + + # Finding Prices + price = listing.find('span', {"class": "priceP"}).text + price = price.replace("$", "") + USD.append(price.strip()) + + rows = listing.find_all('p', {"class": "mb-0 card-text"}) + for row in rows: + temp = row.text + if "CATEGORY" in temp: + cat = 
temp.replace("CATEGORY :", "") + cat = cleanString(cat) + category.append(cat.strip()) + elif "VENDOR" in temp: + vendor_name = temp.replace("VENDOR :", "") + vendor_name = cleanString(vendor_name) + vendor.append(vendor_name.strip()) + + # Finding Vendor Rating + rating = listing.find('span', {"class": "badge badge-info"}).text + rating = rating.replace("VENDOR LEVEL :", "") + rating = cleanString(rating) + rating_vendor.append(rating.strip()) + + # Searching for CVE and MS categories + cve = listing.findAll(text=re.compile('CVE-\d{4}-\d{4}')) + if not cve: + cveValue = "-1" + else: + cee = " " + for idx in cve: + cee += (idx) + cee += " " + cee = cee.replace(',', ' ') + cee = cee.replace('\n', '') + cveValue = cee + CVE.append(cveValue) + + ms = listing.findAll(text=re.compile('MS\d{2}-\d{3}')) + if not ms: + MSValue = "-1" + else: + me = " " + for im in ms: + me += (im) + me += " " + me = me.replace(',', ' ') + me = me.replace('\n', '') + MSValue = me + MS.append(MSValue) + + # Populate the final variable (this should be a list with all fields scraped) + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) + + +# called by the crawler to get description links on a listing page +# @param: beautifulsoup object that is using the correct html page (listing page) +# return: list of description links from a listing page +def sonanza_links_parser(soup): + # Returning all links that should be visited by the Crawler + + href = [] + listings = soup.findAll('div', {"class": "col-sm-4 col-md-3"}) + + for listing in listings: + a = listing.find('a', href=True) + + # Adding the url to the list of urls + link = a.get('href') + href.append(link) + + return href \ No newline at end of file diff --git a/MarketPlaces/Utilities/utilities.py b/MarketPlaces/Utilities/utilities.py index 915f284..c6aa192 100644 --- a/MarketPlaces/Utilities/utilities.py +++ b/MarketPlaces/Utilities/utilities.py @@ -252,7 +252,7 @@ def organizeProducts(marketplace, nm, vendor, rating_vendor, success_vendor, nom lne = marketplace # 0 lne += "," - lne += vendor[n] # 1 + lne += "-1" if len(vendor) == 0 else vendor[n] # 1 lne += "," lne += "-1" if len(rating_vendor) == 0 else rating_vendor[n] # 2 lne += ","