diff --git a/Forums/BestCardingWorld/parser.py b/Forums/BestCardingWorld/parser.py
index 7ad385b..4865b65 100644
--- a/Forums/BestCardingWorld/parser.py
+++ b/Forums/BestCardingWorld/parser.py
@@ -18,15 +18,15 @@ def bestcardingworld_description_parser(soup):
     topic = "-1"      # 0 topic name
     user = []         # 1 all users of each post
-    addDate = []      # 2 all dated of each post
-    feedback = []     # 3 all feedbacks of each user (this was found in just one Forum and with a number format)
-    status = []       # 4 all user's authority in each post such as (adm, member, dangerous)
-    reputation = []   # 5 all users's karma in each post (usually found as a number)
-    sign = []         # 6 all user's signature in each post (usually a standard message after the content of the post)
-    post = []         # 7 all messages of each post
-    interest = []     # 8 all user's interest in each post
-    image = []
-    image_user = []
+    status = []       # 2 all user's authority in each post such as (adm, member, dangerous)
+    reputation = []   # 3 all users' karma in each post (usually found as a number)
+    interest = []     # 4 all user's interest in each post
+    sign = []         # 5 all user's signature in each post (usually a standard message after the content of the post)
+    post = []         # 6 all messages of each post
+    feedback = []     # 7 all feedbacks of each user (this was found in just one Forum and with a number format)
+    addDate = []      # 8 all dates of each post
+    image_user = []   # 9 all user avatars of each post
+    image_post = []   # 10 all first images of each post

     # Finding the topic (should be just one coming from the Listing Page)

@@ -157,15 +157,18 @@ def bestcardingworld_description_parser(soup):
             img = img.get('src').split('base64,')[-1]
         else:
             img = "-1"
-        image.append(img)
+        image_post.append(img)

         img = ipost.find('div', {"class": "avatar-container"}).find('img', {"class": "avatar"})
-        img = img.get('src').split('base64,')[-1]
+        if img is not None:
+            img = img.get('src').split('base64,')[-1]
+        else:
+            img = "-1"
         image_user.append(img)

     # Populate the final variable (this should be a list with all fields scraped)

-    row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
+    row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)

     # Sending the results

@@ -179,17 +182,18 @@ def bestcardingworld_description_parser(soup):
 #return: 'row' that contains a variety of lists that each hold info on the listing page
 def bestcardingworld_listing_parser(soup):

-    nm = 0          # *this variable should receive the number of topics
+    nm = 0              # *this variable should receive the number of topics
     forum = "BestCardingWorld"  # 0 *forum name
-    board = "-1"    # 1 *board name (the previous level of the topic in the Forum categorization tree.
-                    #   For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
-    author = []     # 2 *all authors of each topic
-    topic = []      # 3 *all topics
-    views = []      # 4 number of views of each topic
-    posts = []      # 5 number of posts of each topic
-    href = []       # 6 this variable should receive all cleaned urls (we will use this to do the marge between
-                    #   Listing and Description pages)
-    addDate = []    # 7 when the topic was created (difficult to find)
+    board = "-1"        # 1 *board name (the previous level of the topic in the Forum categorization tree.
+                        #   For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
+    author = []         # 2 *all authors of each topic
+    topic = []          # 3 *all topics
+    views = []          # 4 number of views of each topic
+    posts = []          # 5 number of posts of each topic
+    href = []           # 6 this variable should receive all cleaned urls (we will use this to do the merge between
+                        #   Listing and Description pages)
+    addDate = []        # 7 when the topic was created (difficult to find)
+    image_author = []   # 8 all author avatars used in each topic

     # Finding the board (should be just one)

@@ -235,7 +239,6 @@ def bestcardingworld_listing_parser(soup):

         # Adding the url to the list of urls
         link = itopic.find('a', {"class": "topictitle"}).get('href')
-        link = cleanLink(link)
         href.append(link)

         # Finding the author of the topic
@@ -243,6 +246,8 @@ def bestcardingworld_listing_parser(soup):
         user = ps.strip()
         author.append(cleanString(user))

+        image_author.append(-1)
+
         # Finding the number of replies
         post = replies[index].text.split()[0]
         post = post.strip()
@@ -263,7 +268,7 @@ def bestcardingworld_listing_parser(soup):

         index += 1

-    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
+    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)


 #called by the crawler to get description links on a listing page
diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py
index e4f6c5d..6cc9c60 100644
--- a/Forums/DB_Connection/db_connection.py
+++ b/Forums/DB_Connection/db_connection.py
@@ -3,7 +3,7 @@ __author__ = 'DarkWeb'
 import psycopg2
 import traceback
 from Forums.Utilities.utilities import *
-
+from dateutil.relativedelta import relativedelta, FR

 def connectDataBase():

@@ -484,6 +484,28 @@ def create_posts(cur, row, forumId, topicId):
                          'dateinserted_post': row[8],
                          'postId': postId})

+def create_status(cur, forumId, date, listings, descriptions, status):
+
+    date = datetime.strptime(date, "%m%d%Y")
+
+    # getting last Friday as the reference date
+    date_reference = date + relativedelta(weekday=FR(-1))
+
+    # checking if status already exists
+    sql = "select * from forums_status where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
+    cur.execute(sql, {'forum_id': forumId, 'date_inserted': date})
+
+    recset = cur.fetchall()
+    if recset:
+        sql = "Update forums_status set listings = %(listings)s, descriptions = %(descriptions)s, status = %(status)s, date_reference = %(date_reference)s " \
+              "where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
+        recset = {'listings': listings, 'descriptions': descriptions, 'status': status, 'date_reference': date_reference, 'forum_id': forumId, 'date_inserted': date}
+    else:
+        sql = "Insert into forums_status (forum_id, date_inserted, listings, descriptions, status, date_reference) Values (%s, %s, %s, %s, %s, %s)"
+        recset = [forumId, date, listings, descriptions, status, date_reference]
+
+    cur.execute(sql, recset)
+
 def create_database(cur, con):

     try:

@@ -496,12 +518,18 @@ def create_database(cur, con):
         sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)"
         cur.execute(sql)

+        sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, " \
+              "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL, " \
+              "constraint forums_status_pk PRIMARY KEY (forum_id, date_inserted), " \
+              "constraint forums_status_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))"
+        cur.execute(sql)
+
         sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \
               "255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \
               "interest_user character varying(5000) null, signature_user character varying(1000) null, " \
               "image_user character varying(10000000) null, dateinserted_user timestamp(6) with time zone NOT NULL, " \
               "constraint users_pk primary key (user_id), " \
-              "constraint users_forum_id_fkey foreign key (forum_id) references forums (forum_id))"
+              "constraint users_forum_id_fk foreign key (forum_id) references forums (forum_id))"
         cur.execute(sql)

         sql = "create unique index unique_user ON users USING btree (forum_id ASC NULLS LAST, name_user ASC NULLS LAST)"
@@ -513,17 +541,17 @@ def create_database(cur, con):
               "signature_user character varying(1000) null, image_user character varying(10000000) null, " \
               "dateinserted_user timestamp(6) with time zone NOT NULL, " \
               "constraint users_history_pk primary key (user_id, version_user), " \
-              "constraint users_history_user_id_fkey foreign key (user_id) references " \
-              "users (user_id), constraint users_history_forum_id_fkey foreign key (forum_id) references forums (forum_id))"
+              "constraint users_history_user_id_fk foreign key (user_id) references users (user_id), " \
+              "constraint users_history_forum_id_fk foreign key (forum_id) references forums (forum_id))"
         cur.execute(sql)

         sql = "create table topics(topic_id integer NOT NULL, forum_id integer NOT NULL, author_id integer NOT NULL, " \
               "title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, views_topic integer null, " \
               "posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) with time zone null, " \
               "dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double precision NOT NULL, " \
-              "constraint topics_pk primary key (topic_id), constraint topics_author_id_fkey " \
-              "foreign key (author_id) references users (user_id), constraint topics_forum_id_fkey foreign key (" \
-              "forum_id) references forums (forum_id))"
+              "constraint topics_pk primary key (topic_id), " \
+              "constraint topics_author_id_fk foreign key (author_id) references users (user_id), " \
+              "constraint topics_forum_id_fk foreign key (forum_id) references forums (forum_id))"
         cur.execute(sql)

         sql = "create unique index unique_topic ON topics USING btree (forum_id ASC NULLS LAST, author_id ASC NULLS LAST, " \
@@ -536,9 +564,9 @@ def create_database(cur, con):
               "dateadded_topic timestamp(6) with time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, " \
               "classification_topic double precision NOT NULL, " \
               "constraint topics_history_pk primary key (topic_id, version_topic), " \
-              "constraint topics_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \
-              "constraint topics_history_author_id_fkey foreign key (author_id) references users (user_id), " \
-              "constraint topics_history_board_id_fkey foreign key (forum_id) references forums (forum_id))"
+              "constraint topics_history_topic_id_fk foreign key (topic_id) references topics (topic_id), " \
+              "constraint topics_history_author_id_fk foreign key (author_id) references users (user_id), " \
+              "constraint topics_history_board_id_fk foreign key (forum_id) references forums (forum_id))"
         cur.execute(sql)

         sql = "create table posts(post_id integer NOT NULL, topic_id integer NOT NULL, " \
@@ -546,8 +574,8 @@ def create_database(cur, con):
              "image_post character varying(10000000) 
null, dateadded_post timestamp(6) with time zone NOT NULL, " \ "dateinserted_post timestamp(6) with time zone NOT NULL, " \ "constraint posts_pk primary key (post_id), " \ - "constraint posts_user_id_fkey foreign key (user_id) references users (user_id), constraint " \ - "posts_topic_id_fkey foreign key (topic_id) references topics (topic_id))" + "constraint posts_user_id_fk foreign key (user_id) references users (user_id), " \ + "constraint posts_topic_id_fk foreign key (topic_id) references topics (topic_id))" cur.execute(sql) sql = "create unique index unique_post ON posts USING btree (topic_id ASC NULLS LAST, user_id ASC NULLS LAST, " \ @@ -559,9 +587,9 @@ def create_database(cur, con): "image_post character varying(10000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \ "dateinserted_post timestamp(6) with time zone NOT NULL, " \ "constraint posts_history_pk primary key (post_id, version_post), " \ - "constraint posts_history_user_id_fkey foreign key (user_id) references users (user_id), " \ - "constraint posts_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \ - "constraint posts_history_post_id_fkey foreign key (post_id) references posts (post_id))" + "constraint posts_history_user_id_fk foreign key (user_id) references users (user_id), " \ + "constraint posts_history_topic_id_fk foreign key (topic_id) references topics (topic_id), " \ + "constraint posts_history_post_id_fk foreign key (post_id) references posts (post_id))" cur.execute(sql) con.commit() diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index 91b662f..b86b5c6 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -341,6 +341,16 @@ def new_parse(forum, url, createLog): # move listing files of completed folder move_file(listingFile, createLog, logFile) + # registering the current forum status (up/down) and the number of scraped pages in the database + forumId = verifyForum(cur, forum) + if (forumId > 0): + + readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html')) + readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html')) + + create_status(cur, forumId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0') + con.commit() + if createLog: logFile.close() diff --git a/MarketPlaces/Ares/crawler_selenium.py b/MarketPlaces/Ares/crawler_selenium.py index fbed2b1..1f865ad 100644 --- a/MarketPlaces/Ares/crawler_selenium.py +++ b/MarketPlaces/Ares/crawler_selenium.py @@ -1,7 +1,7 @@ __author__ = 'DarkWeb' ''' -Ares Market Crawler (Selenium) +Ares Marketplace Crawler (Selenium) ''' from selenium import webdriver @@ -9,27 +9,28 @@ from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait -from PIL import Image +from selenium.webdriver.support.ui import Select +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By +from PIL import Image import urllib.parse as urlparse -import os, time +import os, re, time from datetime import date import subprocess +import configparser 
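# A minimal illustrative sketch (not part of the patch): how the forums_status bookkeeping
# added above is expected to behave. The "%m%d%Y" date format and the FR(-1) weekday anchor
# come from create_status()/prepare_parser.py; the concrete date below is only an example.
from datetime import datetime
from dateutil.relativedelta import relativedelta, FR

scrape_date = datetime.strptime("09132023", "%m%d%Y")          # a Wednesday
date_reference = scrape_date + relativedelta(weekday=FR(-1))   # resolves to Friday 2023-09-08
# create_status() then upserts one row per (forum_id, date_inserted) with the counts of read
# listing/description pages and a '1'/'0' status bit, keyed to that reference Friday.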
from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.Ares.parser import ares_links_parser from MarketPlaces.Utilities.utilities import cleanHTML counter = 1 -baseURL = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion' +baseURL = 'http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/' -# Opens Tor Browser, crawls the website def startCrawling(): - marketName = getMarketName() + mktName = getMKTName() driver = getAccess() if driver != 'down': @@ -40,66 +41,18 @@ def startCrawling(): print(driver.current_url, e) closeDriver(driver) - new_parse(marketName, False) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - #wait for login page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[3]/div[3]/div[2]/div/div[2]/div/center"))) - - #entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') - #Username here - usernameBox.send_keys('blabri') - passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') - #Password here - passwordBox.send_keys('fishowal') - - ''' - # wait for captcha page show up - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/div/div/img"))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/div/div/img').screenshot( - r'..\Ares\captcha.png') - - # This method will show image in any image viewer - im = Image.open(r'..\Ares\captcha.png') - - im.show() - - # wait until input space show up - inputBox = driver.find_element(by=By.XPATH, value='/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/input') - - # ask user input captcha solution in terminal - userIn = input("Enter solution: ") - - # send user solution into the input space - inputBox.send_keys(userIn) - - # click the verify(submit) button - driver.find_element(by=By.XPATH, value="/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[4]/div/div/button").click() - ''' - input("Press ENTER when CAPTCHA is completed\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div[7]/div[3]/div[2]/div[1]/div[1]'))) + new_parse(mktName, baseURL, True) # Returns the name of the website -def getMarketName(): +def getMKTName(): name = 'Ares' return name -# Return the link of the website +# Return the base link of the website def getFixedURL(): - url = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion' - + url = 'http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/' return url @@ -109,7 +62,7 @@ def closeDriver(driver): # os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /t /f /im tor.exe") print('Closing Tor...') - driver.quit() + driver.close() time.sleep(3) return @@ -146,12 +99,13 @@ def createFFDriver(): service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - + driver.maximize_window() return driver +#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' def getAccess(): url = getFixedURL() driver = createFFDriver() @@ -163,7 +117,26 @@ def 
getAccess(): return 'down' -# Saves the crawled html page +def login(driver): + # input("Press ENTER when CAPTCHA is complete and login page has loaded\n") + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="username"]'))) + + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') + # Username here + usernameBox.send_keys('blabri') + passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') + # Password here + passwordBox.send_keys('fishowal') + + input("Press ENTER when BROKEN CIRCLE is pressed\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/div[6]/div[3]/div[2]/div[1]/div[1]'))) + + def savePage(driver, page, url): cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) @@ -172,7 +145,6 @@ def savePage(driver, page, url): return -# Gets the full path of the page to be saved along with its appropriate file name def getFullPathName(url): from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE @@ -185,7 +157,11 @@ def getFullPathName(url): return fullPath -# Creates the file name from passed URL +def getMKTName() -> str: + name = 'Ares' + return name + + def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) @@ -198,33 +174,26 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # # Digital - Other - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/91ecd5d0-002c-11ec-9b46-ede2378c5d3c') - # # Digital - VPN - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9431b830-002b-11ec-86d6-cdaf65cd97f1') - # # Digital - Coding - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/948b7400-a939-11ec-adc5-2f775203130c') # Digital - Malware - links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/95c37970-002c-11ec-a5dc-1f4432087ed2') - # # Digital - Guides - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9a8bea70-002b-11ec-a3db-c90dd329f662') - # # Digital - Hacking - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/a81693f0-002b-11ec-9c39-110550ce4921') - # # Digital - Malware - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/b3258c50-002b-11ec-b658-876d3d651145') - # # Digital - Services - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/bae64840-002b-11ec-bbcc-a93431540099') - # # Digital - Software - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/cff75df0-002b-11ec-8d0a-81fddeb36bf1') - # # Digital - Exploits - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/ef029550-002f-11ec-8711-675a8b116ba6') - # # Digital - Tutorials - # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/f6e9c3b0-002b-11ec-85aa-c79a6ac8cfe8') + links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/95c37970-002c-11ec-a5dc-1f4432087ed2') + # Digital - Guides (Mostly carding, some useful hacking guides. 
probably don't use)
+    links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/9a8bea70-002b-11ec-a3db-c90dd329f662')
+    # Digital - Hacking
+    links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/a81693f0-002b-11ec-9c39-110550ce4921')
+    # Digital - Malware2
+    links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/b3258c50-002b-11ec-b658-876d3d651145')
+    # Digital - Software (50/50 hacking stuff and cracked software)
+    links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/cff75df0-002b-11ec-8d0a-81fddeb36bf1')
+    # Digital - Exploits
+    links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/ef029550-002f-11ec-8711-675a8b116ba6')
+    # Digital - Tutorials (Mostly random stuff, some useful tutorials, probably don't use)
+    links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/f6e9c3b0-002b-11ec-85aa-c79a6ac8cfe8')

     return links


 def crawlForum(driver):
+    print("Crawling the Ares market")

     linksToCrawl = getInterestedLinks()

@@ -244,8 +213,8 @@ def crawlForum(driver):
                     driver.refresh()
                 html = driver.page_source
                 savePage(driver, html, link)
-
                 list = productPages(html)
+
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
                     try:
@@ -255,19 +224,15 @@ def crawlForum(driver):
                         savePage(driver, driver.page_source, item)
                         driver.back()

-                    # comment out
-                    break
-
-                # comment out
-                if count == 1:
-                    break
+                    # # comment out
+                    # break
+                    #
+                # # comment out
+                # if count == 1:
+                #     break

                 try:
-                    nav = driver.find_element(by=By.XPATH, value=
-                        '/html/body/div[7]/div[3]/div/div[2]/nav')
-                    a = nav.find_element(by=By.LINK_TEXT, value="Next")
-                    link = a.get_attribute('href')
-
+                    link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
                     count += 1
@@ -279,24 +244,23 @@ def crawlForum(driver):
            print(link, e)
        i += 1

-    input("Crawling Ares market done sucessfully. Press ENTER to continue\n")
+    print("Crawling the Ares market done.")


-# Returns 'True' if the link is Topic link
+# Returns 'True' if the link is Topic link, may need to change for every website
 def isDescriptionLink(url):
     if 'product' in url:
         return True
     return False


-# Returns True if the link is a listingPage link
+# Returns True if the link is a listingPage link, may need to change for every website
 def isListingLink(url):
     if 'category' in url:
         return True
     return False


-# calling the parser to define the links
 def productPages(html):
     soup = BeautifulSoup(html, "html.parser")
     return ares_links_parser(soup)
@@ -304,4 +268,3 @@ def productPages(html):

 def crawler():
     startCrawling()
-    # print("Crawling and Parsing BestCardingWorld .... 
DONE!") diff --git a/MarketPlaces/Ares/parser.py b/MarketPlaces/Ares/parser.py index 3232b0c..597a1eb 100644 --- a/MarketPlaces/Ares/parser.py +++ b/MarketPlaces/Ares/parser.py @@ -7,99 +7,86 @@ from MarketPlaces.Utilities.utilities import * from bs4 import BeautifulSoup -# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) +# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of description page +# return: 'row' that contains a variety of lists that each hold info on the description page def ares_description_parser(soup): - # Fields to be parsed - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name - name = soup.find('div', {'class': "col-md-12 my-2"}).text + divmb = soup.find('div', {'class': "col-md-12 my-2"}) + + name = divmb.find('span', {'class': "btn btn-sm btn-outline-dark w-100 active rounded-0"}).text name = name.replace('\n', ' ') name = name.replace(",", "") name = name.strip() - bae = soup.find('div', {'class': "col-md-7"}).find('span').find_all('span') + box = soup.find('div', {'class': "col-md-7"}).find('span') + box = box.findAll('span', {'class': "btn btn-mgray btn-sm w-100 active border-danger"}) # Finding Vendor - vendor = bae[0].text - vendor = vendor.replace(",", "") - vendor = vendor.replace("...", "") - vendor = vendor.strip() + vendor = soup.find('a', {'class': "btn btn-sm btn-mgray my-1 w-100 text-white"}).get('href') + vendor = vendor.split('otherParty=')[-1] + vendor = cleanString(vendor).strip() # Finding Vendor Rating - full_stars = bae[2].find_all('i', {'class': "fas fa-star"}) - half_star = bae[2].find('i', 
{'class': "fas fa-star-half-alt"}) - rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0) + temp = box[1] + rating_vendor = len(temp.findAll('i', {"class": "fas fa-star"})) + half_stars = len(temp.findAll('i', {'class': "fas fa-star-half-alt"})) + if half_stars > 0: + rating_vendor += 0.5 # Finding Successful Transactions - success = bae[4].text - success = success.replace("Sales ", "") - success = success.strip() + success = box[2].text + success = cleanNumbers(success).strip() - bae = soup.find('span', {'class': "text-left"}).find_all('span') + box2 = soup.find('div', {"class": "col-md-4 text-center"}).find('span', {"class": "text-left"}).findAll('span') - # Finding Prices - USD = bae[0].text - USD = USD.replace("\n$", "") + # Finding USD + USD = box2[0].text + USD = USD.replace('\n', '') + USD = USD.replace('$', '') USD = USD.strip() - shipping_info = bae[4].text - if "Digital" not in shipping_info: - shipping_info = shipping_info.split(" ") - - # Finding Shipment Information (Origin) - shipFrom = shipping_info[0].strip() - - # Finding Shipment Information (Destination) - shipTo = shipping_info[1].strip() - - bae = soup.find_all('textarea') + # Finding Vendor Image + vendor_image = soup.find('img', {"class": 'img-fluid'}).get('src') + vendor_image = vendor_image.split('base64,')[-1] # Finding the Product description - describe = bae[0].text - describe = describe.replace("\n", " ") - describe = describe.replace("\r", " ") - describe = describe.strip() - - # Finding the Terms and Conditions - terms = bae[1].text - terms = terms.replace("\n", " ") - terms = terms.strip() - - ''' - # Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1" - ''' + temp = soup.find('div', {"class": 'row-md-12'}).find('div', {"class": 'col-md-4'}) + cardbody = temp.find('textarea', {"class": 'disabled form-control form-control-sm w-100 bg-mgray text-white rounded-0 border-danger'}) + describe = cleanString(cardbody.text).strip() + + # Finding Product Image + image = soup.find('div', {"class": 'row-md-12'}).find('div', {"class": 'col-md-4 text-center'}).find('img') + if image is not None: + image = image.get('src') + image = image.split('base64,')[-1] + else: + image = "-1" # Searching for CVE and MS categories cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) @@ -121,69 +108,101 @@ def ares_description_parser(soup): # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo) + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row -# This is the method to parse the Listing Pages +# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of listing page +# return: 'row' that contains a variety of lists that each hold info on the listing page def ares_listing_parser(soup): # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "Ares" # 0 *Marketplace_Name - 
vendor = [] # 1 *Vendor - rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) - category = [] # 7 Product_Category - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft = [] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - href = [] # 20 Product_Links - - listing = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"}) + nm = 0 # *Total_Products (Should be Integer) + mktName = "Ares" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links + + cat = soup.find('span', {"class": "btn btn-sm btn-outline-dark w-100 active"}).text + cat = cleanString(cat).strip() + + listing = soup.find('div', {"class": 'card-body text-black text-left bg-dark'}).findAll('div', {"class": 'card mb-4 border-danger rounded-0'}) # Populating the Number of Products nm = len(listing) for a in listing: - bae = a.findAll('a', href=True) + + category.append(cat) # Adding the url to the list of urls - link = bae[0].get('href') + link = a.find('a', {'class': "badge badge-danger w-100 text-white"}).get('href') link = cleanLink(link) href.append(link) + # Finding the Product name + product = a.find('div', {"class": 'marquee-parent'}).find('div', {"class": "marquee-child"}).text + product = product.replace('\n', ' ') + product = product.replace(",", "") + product = product.replace("...", "") + product = product.strip() + name.append(product) + + # Finding Product Image + product_image = a.find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) + + # Finding Prices + price = a.findAll('a', {"class": "text-white"})[-1].text + price = price.replace("$","") + price = price.strip() + USD.append(price) + + # Finding Item Rating + temp = a.find('small', {"class": "text-white"}) + rating = len(temp.findAll('i', {"class": "fas fa-star"})) + half_stars = len(temp.findAll('i', {'class': "fas fa-star-half-alt"})) + if half_stars > 0: + rating += 0.5 + rating_item.append(str(rating)) + # Finding the Vendor - vendor_name = bae[1].text + vendor_name = a.find('a', {"class": 
'badge badge-dark w-100 text-white my-1'}).text vendor_name = vendor_name.replace(",", "") vendor_name = vendor_name.strip() vendor.append(vendor_name) - # Finding the Product - product = bae[2].find('img').get('alt') - product = product.replace('\n', ' ') - product = product.replace(",", "") - product = product.strip() - name.append(product) + image_vendor.append("-1") # Searching for CVE and MS categories cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if not cve: - cveValue="-1" + cveValue = "-1" else: cee = " " for idx in cve: @@ -191,12 +210,12 @@ def ares_listing_parser(soup): cee += " " cee = cee.replace(',', ' ') cee = cee.replace('\n', '') - cveValue=cee + cveValue = cee CVE.append(cveValue) - + ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) if not ms: - MSValue="-1" + MSValue = "-1" else: me = " " for im in ms: @@ -204,24 +223,28 @@ def ares_listing_parser(soup): me += " " me = me.replace(',', ' ') me = me.replace('\n', '') - MSValue=me + MSValue = me MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) +# called by the crawler to get description links on a listing page +# @param: beautifulsoup object that is using the correct html page (listing page) +# return: list of description links from a listing page def ares_links_parser(soup): - # Returning all links that should be visited by the Crawler - href = [] - listing = soup.findAll('a', {"class": "btn btn-success w-100 my-1"}) + href = [] + listing = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"}) for a in listing: + bae = a.findAll('a', href=True) - link = a['href'] + # Adding the url to the list of urls + link = bae[0].get('href') href.append(link) return href \ No newline at end of file diff --git a/MarketPlaces/BlackPyramid/crawler_selenium.py b/MarketPlaces/BlackPyramid/crawler_selenium.py index b257c40..a008bf5 100644 --- a/MarketPlaces/BlackPyramid/crawler_selenium.py +++ b/MarketPlaces/BlackPyramid/crawler_selenium.py @@ -1,9 +1,7 @@ -__author__ = 'Helium' +__author__ = 'cern' ''' -BlackPyramid Forum Crawler (Selenium) -cannot use bc no links are used -kept in case issues are solved +BlackPyramid Market Crawler (Selenium) ''' from selenium import webdriver @@ -11,29 +9,32 @@ from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By - +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver import ActionChains +import selenium.webdriver.support.ui as uiClasses +from selenium.webdriver.common.keys import Keys from PIL import Image + import urllib.parse as urlparse import os, re, time -from datetime import date import subprocess import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.BlackPyramid.parser import blackpyramid_links_parser 
+from MarketPlaces.BlackPyramid.parser import BlackPyramid_links_parser from MarketPlaces.Utilities.utilities import cleanHTML +import traceback + counter = 1 -baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/' +baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/' -# Opens Tor Browser, crawls the website, then parses, then closes tor -#acts like the main method for the crawler, another function at the end of this code calls this function later +# Opens Tor Browser, crawls the website def startCrawling(): - mktName = getMKTName() + marketName = getMKTName() driver = getAccess() if driver != 'down': @@ -42,28 +43,47 @@ def startCrawling(): crawlForum(driver) except Exception as e: print(driver.current_url, e) - closeDriver(driver) + closetor(driver) + + new_parse(marketName, baseURL, False) - new_parse(mktName, baseURL, True) + +# Login +def login(driver): + # wait for login page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "//input[@name='username_login']"))) + + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value="//input[@name='username_login']") + # Username here + usernameBox.send_keys('ChipotleSteakBurrito') + passwordBox = driver.find_element(by=By.XPATH, value="//input[@name='password_login']") + # Password here + passwordBox.send_keys('BlackBeans') + + input("Press ENTER when CAPTCHA is completed and you closed the newsletter\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="form93b"]'))) # Returns the name of the website -#return: name of site in string type def getMKTName(): name = 'BlackPyramid' return name -# Return the base link of the website -#return: url of base site in string type +# Return the link of the website def getFixedURL(): - url = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/' + url = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/?login=1' + return url # Closes Tor Browser -#@param: current selenium driver -def closeDriver(driver): +def closetor(driver): # global pid # os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /t /f /im tor.exe") @@ -88,8 +108,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -111,8 +131,6 @@ def createFFDriver(): return driver -#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' -#return: return the selenium driver or string 'down' def getAccess(): url = getFixedURL() driver = createFFDriver() @@ -124,31 +142,7 @@ def getAccess(): return 'down' -# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha -# then allows for manual solving of captcha in the terminal -#@param: 
current selenium web driver -def login(driver): - # wait for login page - login_link = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[3]/div/main/div/div/div/div[2]/div/div/div/section[1]/input[1]') - login_link.click() # open tab with url - - # entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') - # Username here - usernameBox.send_keys('ChipotleSteakBurrito') - passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') - # Password here - passwordBox.send_keys('BlackBeans') - - input("Press ENTER when CAPTCHA is completed\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div[2]/form/nav/nav/ul/li[2]/div/a/span[1]'))) - - - -# Saves the crawled html page, makes the directory path for html pages if not made +# Saves the crawled html page def savePage(driver, page, url): cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) @@ -158,7 +152,6 @@ def savePage(driver, page, url): # Gets the full path of the page to be saved along with its appropriate file name -#@param: raw url as crawler crawls through every site def getFullPathName(url): from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE @@ -171,75 +164,92 @@ def getFullPathName(url): return fullPath -# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned -#@param: raw url as crawler crawls through every site +# Creates the file name from passed URL def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) - if (name == ''): + if name == '': name = str(counter) counter = counter + 1 return name -# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list -#in this example, there are a couple of categories some threads fall under such as -# Guides and Tutorials, Digital Products, and Software and Malware -#as you can see they are categories of products +def page_is_fully_loaded(driver): + return driver.execute_script("return document.readyState") == "complete" + + +def goToPage(driver, page): + # hover over digital -> hacking tools + a = ActionChains(driver) + + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "//li[@class='dig940']/div/a"))) + + # hover + digitalB = driver.find_element(By.XPATH, "//li[@class='dig940']/div/a") + time.sleep(1) + a.move_to_element(digitalB).perform() + # print(digitalB) + + # delay for website to register hover + time.sleep(5) + + # click + xpath = "//input[@name='" + page + "']" + link = driver.find_element(By.XPATH, xpath) + time.sleep(1) + a.move_to_element(link).click().perform() + # print(link) + + # wait for website to load + time.sleep(10) + WebDriverWait(driver, 100).until(page_is_fully_loaded) + + def getInterestedLinks(): - links = [] - - # Hacking Guides - links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # Exploits - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # botnets/malware - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # fraud software - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # Other Tools - # 
links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # Services - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') + # h11 -> Hacking Tools + # g3 -> Guides, Hacking + # se3 -> Services, Hacking + # f6 -> Fraud software + links = ['h11','g3','se3','f6'] return links -# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through -#topic and description pages are crawled through here, where both types of pages are saved -#@param: selenium driver def crawlForum(driver): + print("Crawling the BlackPyramid market") - linksToCrawl = getInterestedLinks() + pages = getInterestedLinks() i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) + for listing in pages: + print('Crawling :', listing) try: + goToPage(driver, listing) + has_next_page = True count = 0 + currentPage = 1 while has_next_page: - try: - clicker = driver.find_element(by=By.XPATH, value='/html/body/div[2]/form/nav/nav/ul/li[2]/div/a') - clicker.click() # open tab with url - driver.get(link) - except: - driver.refresh() + html = driver.page_source - savePage(driver, html, link) + savePage(driver, html, listing + "page" + str(currentPage)) + # get a list of urls for each listing list = productPages(html) + for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) try: driver.get(itemURL) except: - driver.refresh() + # driver.refresh() + continue savePage(driver, driver.page_source, item) - driver.back() + # can't use the back button in dark pyramid + # driver.back() # comment out break @@ -248,10 +258,34 @@ def crawlForum(driver): if count == 1: break + # go to next page of market try: - clicker = driver.find_element(by=By.XPATH, value= - '/html/body/center/div[4]/div/div[3]/div[23]/div[2]/input[1]') - if clicker == "": + # Scroll to top of page to see navigation bar + driver.find_element(by=By.XPATH, value="//body").send_keys(Keys.CONTROL + Keys.HOME) + + goToPage(driver, listing) + nav = driver.find_element(by=By.XPATH, value="//input[@name='next_page']") + + if nav.is_enabled(): + # select next page + pgnum = uiClasses.Select(driver.find_element(by=By.XPATH, value="//select[@name='pageination']")) + # print("pg options:", pgnum.options) + numberOfPages = len(pgnum.options) + + if currentPage >= numberOfPages: + raise NoSuchElementException + + pgnum.select_by_index(currentPage) + currentPage += 1 + + # click button + pgbutton = driver.find_element(by=By.XPATH, value="//input[@value='go to page']") + pgbutton.click() + + # wait for website to load + time.sleep(10) + WebDriverWait(driver, 100).until(page_is_fully_loaded) + else: raise NoSuchElementException count += 1 @@ -259,39 +293,37 @@ def crawlForum(driver): has_next_page = False except Exception as e: - print(link, e) + print(listing, e) i += 1 print("Crawling the BlackPyramid market done.") -# Returns 'True' if the link is a description link -#@param: url of any url crawled -#return: true if is a description page, false if not +# Returns 'True' if the link is Topic link def isDescriptionLink(url): - if 'products' in url: + if 'product' in url: return True return False # Returns True if the link is a listingPage link -#@param: url of any url crawled -#return: true if is a Listing page, false if not def isListingLink(url): - if 'search' in url: + if 'category=' in url: return True return False -# calling the parser to define the links, the html is the url of a link from the list 
of interested link list -#@param: link from interested link list ie. getInterestingLinks() -#return: list of description links that should be crawled through +# calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") - return blackpyramid_links_parser(soup) - + return BlackPyramid_links_parser(soup) def crawler(): startCrawling() - # print("Crawling and Parsing BlackPyramid .... DONE!") + # print("Crawling and Parsing BestCardingWorld .... DONE!") + + +if __name__ == "__main__": + #crawler() + new_parse("BlackPyramid", baseURL, False) diff --git a/MarketPlaces/BlackPyramid/parser.py b/MarketPlaces/BlackPyramid/parser.py index 743466a..c1ea43d 100644 --- a/MarketPlaces/BlackPyramid/parser.py +++ b/MarketPlaces/BlackPyramid/parser.py @@ -1,4 +1,4 @@ -__author__ = 'Helium' +__author__ = 'cern' # Here, we are importing the auxiliary functions to clean or convert data from MarketPlaces.Utilities.utilities import * @@ -11,111 +11,107 @@ from bs4 import BeautifulSoup #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of description page #return: 'row' that contains a variety of lists that each hold info on the description page -def darkfox_description_parser(soup): +def blackpyramid_description_parser(soup): # Fields to be parsed - name = "-1" # 0 Product_Name - describe = "-1" # 1 Product_Description - lastSeen = "-1" # 2 Product_LastViewDate - rules = "-1" # 3 NOT USED ... - CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 5 Product_MS_Classification (Microsoft Security) - review = "-1" # 6 Product_Number_Of_Reviews + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) category = "-1" # 7 Product_Category - shipFrom = "-1" # 8 Product_ShippedFrom - shipTo = "-1" # 9 Product_ShippedTo - left = "-1" # 10 Product_QuantityLeft - escrow = "-1" # 11 Vendor_Warranty - terms = "-1" # 12 Vendor_TermsAndConditions - vendor = "-1" # 13 Vendor_Name - sold = "-1" # 14 Product_QuantitySold - addDate = "-1" # 15 Product_AddedDate - available = "-1" # 16 NOT USED ... - endDate = "-1" # 17 NOT USED ... 
- BTC = "-1" # 18 Product_BTC_SellingPrice - USD = "-1" # 19 Product_USD_SellingPrice - rating = "-1" # 20 Vendor_Rating - success = "-1" # 21 Vendor_Successful_Transactions - EURO = "-1" # 22 Product_EURO_SellingPrice + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name - name = soup.find('h1').text + name = soup.find('div', {'class': 'panel39002'}).find('span').next_sibling name = name.replace('\n', ' ') name = name.replace(",", "") name = name.strip() + # product description + describe = soup.findAll('div', {'class': 'fer048953'})[1].text + describe = describe.replace('\n', ' ') + describe = describe.replace(",", "") + describe = describe.strip() + # Finding Vendor - vendor = soup.find('h3').find('a').text.strip() + vendor = soup.find('div', {'class': 'bold03905 vstat364'}).text + vendor = vendor.split(" ") + vendor = vendor[2][:-1] + vendor = vendor.replace('\n', ' ') + vendor = vendor.replace(",", "") + vendor = vendor.strip() # Finding Vendor Rating - rating = soup.find('span', {'class': "tag is-dark"}).text.strip() + rating_span = soup.find('span', {'class': 'to3098503t'}).find_next_sibling('span') + rating_num = rating_span.find('b').text + if rating_num != 'N/A': + rating = rating_num[0:3] # Finding Successful Transactions - success = soup.find('h3').text - success = success.replace("Vendor: ", "") - success = success.replace(vendor, "") - success = success.replace("(", "") - success = success.replace(")", "") + success_container = soup.find('ul', {'class': 'ul3o00953'}).findAll('li')[1] + success = success_container.find('div').text + success = success.replace('"', '') + success = success.replace("\n", " ") + success = success.replace(",", "") success = success.strip() - bae = soup.find('div', {'class': "box"}).find_all('ul') - # Finding Prices - USD = bae[1].find('strong').text.strip() - - li = bae[2].find_all('li') + USD_text = soup.find('li', {'class': 'vul2994 vghul995'}).find('div').text + USD = USD_text.split(',')[1] + USD = USD.replace('\n', ' ') + USD = USD.replace(",", "") + USD = USD.strip() - # Finding Escrow - escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip() - - # Finding the Product Category - category = li[1].find('span', {'class': "tag is-dark"}).text.strip() - - # Finding the Product Quantity Available - left = li[3].find('span', {'class': "tag is-dark"}).text.strip() + container = soup.find('ul', {'class': 'bic03095'}) # Finding Number Sold - sold = li[4].find('span', {'class': "tag is-dark"}).text.strip() - - li = bae[3].find_all('li') - - # Finding Shipment Information (Origin) - if "Ships from:" in li[-2].text: - shipFrom = li[-2].text - shipFrom = shipFrom.replace("Ships from: ", "") - # shipFrom = shipFrom.replace(",", "") - shipFrom = shipFrom.strip() - - # Finding Shipment Information (Destination) - shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text - shipTo = shipTo.replace("Ships to: ", "") - shipTo = shipTo.strip() - if "certain countries" in shipTo: - countries = "" - tags = li[-1].find_all('span', {'class': 
"tag"}) - for tag in tags: - country = tag.text.strip() - countries += country + ", " - shipTo = countries.strip(", ") - - # Finding the Product description - describe = soup.find('div', {'class': "pre-line"}).text - describe = describe.replace("\n", " ") - describe = describe.strip() + sold_container = container.find('li') + sold_div = sold_container.findAll('div')[2] + sold = sold_div.find('b').next_sibling + sold = sold.replace('"', '') + sold = sold.replace("\n", " ") + sold = sold.replace(",", "") + sold = sold.strip() - '''# Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1"''' + # Finding the Product Quantity Available + left_container = container.find('li') + left_div = left_container.findAll('div')[3] + left = left_div.find('b').next_sibling + left = left.replace('"', '') + left = left.replace("\n", " ") + left = left.replace(",", "") + left = left.strip() + + # Finding number of reviews + positive = soup.find('span', {'class': 'ar04999324'}).text + neutral = soup.find('span', {'class': 'ti9400005 can39953'}).text + negative = soup.find('span', {'class': 'ti9400005 ti90088 can39953'}).text + review = int(positive) + int(neutral) + int(negative) + + # Finding product image + image = soup.find('img', {'class': 'img0390503'}) + image = image.get('src') + image = image.split('base64,')[-1] + + vendor_image = soup.find('img', {'class': 'img0390503'}) + vendor_image = vendor_image.get('src') + vendor_image = vendor_image.split('base64,')[-1] # Searching for CVE and MS categories cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) @@ -136,8 +132,8 @@ def darkfox_description_parser(soup): MS = MS.replace('\n', '') # Populating the final variable (this should be a list with all fields scraped) - row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor, - sold, addDate, available, endDate, BTC, USD, rating, success, EURO) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row @@ -147,102 +143,109 @@ def darkfox_description_parser(soup): #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of listing page #return: 'row' that contains a variety of lists that each hold info on the listing page -def darkfox_listing_parser(soup): +def blackpyramid_listing_parser(soup): # Fields to be parsed - nm = 0 # Total_Products (Should be Integer) - mktName = "DarkFox" # 0 Marketplace_Name - name = [] # 1 Product_Name - CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 3 Product_MS_Classification (Microsoft Security) - category = [] # 4 Product_Category - describe = [] # 5 Product_Description - escrow = [] # 6 Vendor_Warranty - views = [] # 7 Product_Number_Of_Views - reviews = [] # 8 Product_Number_Of_Reviews - addDate = [] # 9 Product_AddDate - lastSeen = [] # 10 Product_LastViewDate - BTC = [] # 11 Product_BTC_SellingPrice - USD = [] # 12 Product_USD_SellingPrice - EURO = [] # 13 Product_EURO_SellingPrice - sold = [] # 14 Product_QuantitySold - qLeft =[] # 15 
Product_QuantityLeft - shipFrom = [] # 16 Product_ShippedFrom - shipTo = [] # 17 Product_ShippedTo - vendor = [] # 18 Vendor - rating = [] # 19 Vendor_Rating - success = [] # 20 Vendor_Successful_Transactions - href = [] # 23 Product_Links (Urls) - - listing = soup.findAll('div', {"class": "card"}) + nm = 0 # *Total_Products (Should be Integer) + mktName = "Black Pyramid" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links + + listing = soup.findAll('article', {"class": "product"}) + + # Some listing pages have an additional article section which is blank + if not listing[-1].findAll('a', href=True): + listing = listing[:-1] + # Populating the Number of Products nm = len(listing) - for a in listing: - bae = a.findAll('a', href=True) + for card in listing: + bae = card.findAll('a', href=True) # Adding the url to the list of urls - link = bae[0].get('href') - link = cleanLink(link) + link = bae[2].get('href') href.append(link) # Finding the Product - product = bae[1].find('p').text + product = bae[3].text product = product.replace('\n', ' ') product = product.replace(",", "") product = product.replace("...", "") product = product.strip() name.append(product) - bae = a.find('div', {'class': "media-content"}).find('div').find_all('div') - - if len(bae) >= 5: - # Finding Prices - price = bae[0].text - ud = price.replace(" USD", " ") - # u = ud.replace("$","") - u = ud.replace(",", "") - u = u.strip() - USD.append(u) - # bc = (prc[1]).strip(' BTC') - # BTC.append(bc) - - # Finding the Vendor - vendor_name = bae[1].find('a').text - vendor_name = vendor_name.replace(",", "") - vendor_name = vendor_name.strip() - vendor.append(vendor_name) - - # Finding the Category - cat = bae[2].find('small').text - cat = cat.replace("Category: ", "") - cat = cat.replace(",", "") - cat = cat.strip() - category.append(cat) - - # Finding Number Sold and Quantity Left - num = bae[3].text - num = num.replace("Sold: ", "") - num = num.strip() - sold.append(num) - - quant = bae[4].find('small').text - quant = quant.replace("In stock: ", "") - quant = quant.strip() - qLeft.append(quant) - - # Finding Successful Transactions - freq = bae[1].text - freq = freq.replace(vendor_name, "") - freq = re.sub(r'Vendor Level \d+', "", freq) - freq = freq.replace("(", "") - freq = freq.replace(")", "") - freq = freq.strip() - success.append(freq) + # Finding description + # 'recurisve = False' only searches direct children + desc = card.findChildren('div', recursive=False)[0] + desc = desc.findAll('div', recursive=False)[3].text + desc = desc.replace('\n', ' ') + desc = desc.replace(",", "") + desc = desc.strip() + 
describe.append(desc) + + # Finding Vendor Name + vendor_name = bae[4].find('span').text + vendor_name = vendor_name.split(' ')[1] + vendor_name = vendor_name.replace('\n', ' ') + vendor_name = vendor_name.replace(",", "") + vendor_name = vendor_name.strip() + vendor.append(vendor_name) + + # Finding the Category + cat = card.findAll('div', recursive=False)[0].findAll('div', recursive=False)[1].find('span').text + cat = cat.replace("\n", "") + cat = cat.replace(",", "") + cat = cat.strip() + category.append(cat) + + bae = card.findAll('div', recursive=False)[1].findAll('div', recursive=False)[1] + + # Finding amount left + left = bae.findAll('div', recursive=False)[1].text + left = left.replace("x", "") + left = left.replace('\n', ' ') + left = left.replace(",", "") + left = left.strip() + qLeft.append(left) + + # Finding amount sold + qsold = bae.findAll('div', recursive=False)[2].text + qsold = qsold.replace('\n', ' ') + qsold = qsold.replace("x", "") + qsold = qsold.replace(",", "") + qsold = qsold.strip() + sold.append(qsold) + + # Finding product image + product_image = card.find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) # Searching for CVE and MS categories - cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) + cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if not cve: cveValue="-1" else: @@ -255,7 +258,7 @@ def darkfox_listing_parser(soup): cveValue=cee CVE.append(cveValue) - ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) + ms = card.findAll(text=re.compile('MS\d{2}-\d{3}')) if not ms: MSValue="-1" else: @@ -269,23 +272,24 @@ def darkfox_listing_parser(soup): MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen, - BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href) + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, + image_vendor) #called by the crawler to get description links on a listing page #@param: beautifulsoup object that is using the correct html page (listing page) #return: list of description links from a listing page -def blackpyramid_links_parser(soup): +def BlackPyramid_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] listing = soup.findAll('article', {"class": "product"}) - for div in listing: + for item in listing: - link = div.find('a', {"class": "ah39063"})['href'] + link = item.find('a', {"class": "ah39063"})['href'] href.append(link) - return href \ No newline at end of file + return href diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py index 8769869..2f3341a 100644 --- a/MarketPlaces/DB_Connection/db_connection.py +++ b/MarketPlaces/DB_Connection/db_connection.py @@ -4,7 +4,7 @@ import psycopg2 import traceback import configparser from MarketPlaces.Utilities.utilities import * - +from dateutil.relativedelta import relativedelta, FR def connectDataBase(): @@ -273,6 +273,8 @@ def create_items(cur, row, marketId, vendorId): if newItem: + # decode_decrypt_image_in_base64(row[20]) + sql = "Insert into items (item_id, market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \ "views_item, reviews_item, 
rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \ "quantityleft_item, shippedfrom_item, shippedto_item, lastseen_item, image_item, href_item, dateinserted_item, " \ @@ -312,7 +314,7 @@ def create_items(cur, row, marketId, vendorId): recset = cur.fetchall() - # decode_decrypt_image_in_base64(recset[0][20]) + # decode_decrypt_image_in_base64(recset[0]['image_item']) if (str(recset[0]['description_item']) != str(row[5] if row[5] != '-1' else None) or str(recset[0]['cve_item']) != str(row[6] if row[6] != '-1' else None) or @@ -401,6 +403,27 @@ def create_items(cur, row, marketId, vendorId): return itemId +def create_status(cur, marketId, date, listings, descriptions, status): + + date = datetime.strptime(date, "%m%d%Y") + + # getting last Fridays a reference date + date_reference = date + relativedelta(weekday=FR(-1)) + + # checking if status already exists + sql = "select * from marketplaces_status where market_id = %(market_id)s and date_inserted = %(date_inserted)s" + cur.execute(sql, {'market_id': marketId, 'date_inserted': date}) + + recset = cur.fetchall() + if recset: + sql = "Update marketplaces_status set listings = %(listings)s, descriptions = %(descriptions)s, status = %(status)s, date_reference = %(date_reference)s " \ + "where market_id = %(market_id)s and date_inserted = %(date_inserted)s" + recset = {'listings': listings, 'descriptions': descriptions, 'status': status, 'date_reference': date_reference, 'market_id': marketId, 'date_inserted': date} + else: + sql = "Insert into marketplaces_status (market_id, date_inserted, listings, descriptions, status, date_reference) Values (%s, %s, %s, %s, %s, %s)" + recset = [marketId, date, listings, descriptions, status, date_reference] + + cur.execute(sql, recset) def create_database(cur, con): try: @@ -413,6 +436,12 @@ def create_database(cur, con): sql = "create unique index unique_market ON marketplaces USING btree (name_market ASC NULLS LAST)" cur.execute(sql) + sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, " \ + "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL " \ + "CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \ + "CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))" + cur.execute(sql) + sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \ "varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \ "null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \ diff --git a/MarketPlaces/DarkBazar/crawler_selenium.py b/MarketPlaces/DarkBazar/crawler_selenium.py index fdfb640..dac91b0 100644 --- a/MarketPlaces/DarkBazar/crawler_selenium.py +++ b/MarketPlaces/DarkBazar/crawler_selenium.py @@ -216,12 +216,12 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # # comment out - # break - # - # # comment out - # if count == 1: - # break + # comment out + break + + # comment out + if count == 1: + break try: link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href') @@ -236,7 +236,7 @@ def crawlForum(driver): print(link, e) i += 1 - print("Crawling the DarkBazar market done.") + print("Crawling the DarkBazar market done.") # Returns 'True' if the link is Topic link, may need to change for every website 
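Note on the create_status helper for marketplaces_status shown above: each row is keyed by the raw scrape date, and a date_reference column additionally snaps that date back to the most recent Friday via dateutil. A minimal standalone sketch of that date logic follows; the helper name and the sample dates are illustrative only, not part of the codebase.

    from datetime import datetime
    from dateutil.relativedelta import relativedelta, FR

    def reference_friday(date_str):
        # create_status parses the date it receives (CURRENT_DATE) with "%m%d%Y", e.g. "10302023"
        date_inserted = datetime.strptime(date_str, "%m%d%Y")
        # FR(-1) steps back to the previous Friday, keeping the date itself when it already is a Friday
        date_reference = date_inserted + relativedelta(weekday=FR(-1))
        return date_inserted, date_reference

    print(reference_friday("10302023"))  # Monday scrape -> reference Friday 2023-10-27
    print(reference_friday("10272023"))  # Friday scrape  -> maps to itself

This is what lets scrapes taken on different weekdays of the same week be grouped under a single weekly reference date in marketplaces_status.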
diff --git a/MarketPlaces/DarkBazar/parser.py b/MarketPlaces/DarkBazar/parser.py index 9386d18..3d56e92 100644 --- a/MarketPlaces/DarkBazar/parser.py +++ b/MarketPlaces/DarkBazar/parser.py @@ -170,7 +170,6 @@ def darkbazar_listing_parser(soup): # Adding the url to the list of urls link = bae[0].get('href') - link = cleanLink(link) href.append(link) # Finding the Product diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt index 87f811c..9d7692b 100644 --- a/MarketPlaces/Initialization/marketsList.txt +++ b/MarketPlaces/Initialization/marketsList.txt @@ -1 +1 @@ -ThiefWorld \ No newline at end of file +Ares \ No newline at end of file diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py index f85b46c..e5fe69a 100644 --- a/MarketPlaces/Initialization/markets_mining.py +++ b/MarketPlaces/Initialization/markets_mining.py @@ -14,6 +14,7 @@ from MarketPlaces.M00nkeyMarket.crawler_selenium import crawler as crawlerM00nke from MarketPlaces.ViceCity.crawler_selenium import crawler as crawlerViceCity from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar +from MarketPlaces.Ares.crawler_selenium import crawler as crawlerAres import configparser import os @@ -107,5 +108,7 @@ if __name__ == '__main__': crawlerCypher() elif mkt == "PabloEscobarMarket": crawlerPabloEscobar() + elif mkt == "Ares": + crawlerAres() print("\nScraping process completed!") diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index e075541..79a2bdc 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -15,6 +15,10 @@ from MarketPlaces.M00nkeyMarket.parser import * from MarketPlaces.MikesGrandStore.parser import * from MarketPlaces.PabloEscobarMarket.parser import * from MarketPlaces.CityMarket.parser import * +from MarketPlaces.DarkBazar.parser import * +from MarketPlaces.Sonanza.parser import * +from MarketPlaces.Kingdom.parser import * +from MarketPlaces.Ares.parser import * from MarketPlaces.Classifier.classify_product import predict @@ -130,6 +134,14 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile): rw = pabloescobarmarket_listing_parser(soup) elif marketPlace == "CityMarket": rw = city_listing_parser(soup) + elif marketPlace == "Ares": + rw = ares_listing_parser(soup) + elif marketPlace == "DarkBazar": + rw = darkbazar_listing_parser(soup) + elif marketPlace == "Sonanza": + rw = sonanza_listing_parser(soup) + elif marketPlace == "Kingdom": + rw = kingdom_listing_parser(soup) else: print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!") raise Exception @@ -164,6 +176,14 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile): rmm = pabloescobarmarket_description_parser(soup) elif marketPlace == "CityMarket": rmm = city_description_parser(soup) + elif marketPlace == "Ares": + rmm = ares_description_parser(soup) + elif marketPlace == "DarkBazar": + rmm = darkbazar_description_parser(soup) + elif marketPlace == "Sonanza": + rmm = sonanza_description_parser(soup) + elif marketPlace == "Kingdom": + rmm = kingdom_description_parser(soup) else: print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!") raise Exception @@ -363,6 +383,16 @@ def new_parse(marketPlace, url, createLog): # move listing files of completed folder 
move_file(listingFile, createLog, logFile) + # registering the current forum status (up/down) and the number of scraped pages in the database + marketId = verifyMarketPlace(cur, marketPlace) + if (marketId > 0): + + readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html')) + readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html')) + + create_status(cur, marketId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0') + con.commit() + if createLog: logFile.close() diff --git a/MarketPlaces/Kingdom/crawler_selenium.py b/MarketPlaces/Kingdom/crawler_selenium.py index e6b489f..b8e99f0 100644 --- a/MarketPlaces/Kingdom/crawler_selenium.py +++ b/MarketPlaces/Kingdom/crawler_selenium.py @@ -1,4 +1,4 @@ -__author__ = 'DarkWeb' +__author__ = 'Helium' ''' Kingdom Market Crawler (Selenium) @@ -35,55 +35,28 @@ baseURL = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion # Opens Tor Browser, crawls the website def startCrawling(): - # marketName = getMarketName() + mktName = getMKTName() driver = getAccess() if driver != 'down': try: - captcha(driver) login(driver) crawlForum(driver) except Exception as e: print(driver.current_url, e) closeDriver(driver) - # new_parse(marketName, False) + new_parse(mktName, baseURL, True) -def captcha(driver): - ''' - # wait for captcha page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div[1]"))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='/html/body/div/div[2]').screenshot( - r'..\Kingdom\captcha1.png') - - # This method will show image in any image viewer - im = Image.open(r'..\Kingdom\captcha1.png') - im.show() - - iframes = driver.find_elements(by=By.TAG_NAME, value='iframe') +# Login using premade account credentials and do login captcha manually +def login(driver): - # ask user input captcha solution in terminal - print("Enter squares from smallest to largest (squares are numbered 1-9 left to right)") - for order in ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']: - id = input(f"{order}: ") - iframes[int(id)-1].click() - ''' input("Press ENTER when CAPTCHA is completed\n") # wait for login page WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button"))) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - # wait for login page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button"))) + (By.XPATH, '//*[@id="login-form"]'))) # entering username and password into input boxes usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-name"]') @@ -96,39 +69,15 @@ def login(driver): select = Select(driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-sessiontime"]')) select.select_by_visible_text('24 hours') - ''' - # wait for captcha page show up - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '//*[@id="captcha"]'))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Kingdom\captcha2.png') - - # This method will show image in any image viewer - im = Image.open(r'..\Kingdom\captcha2.png') - im.show() - - # wait until input space show up - inputBox = 
driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-captcha"]') - - # ask user input captcha solution in terminal - userIn = input("Enter solution: ") - - # send user solution into the input space - inputBox.send_keys(userIn) - - # click the verify(submit) button - driver.find_element(by=By.XPATH, value="/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button").click() - ''' - input("Press ENTER when CAPTCHA is completed\n") + input("Press ENTER when CAPTCHA and DDOS is completed\n") # wait for listing page show up (This Xpath may need to change based on different seed url) WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div/div/div[3]/div[2]'))) + (By.XPATH, '/html/body/div/div/div[3]/div[1]/div/div[3]'))) # Returns the name of the website -def getMarketName(): +def getMKTName(): name = 'Kingdom' return name @@ -166,8 +115,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -236,30 +185,17 @@ def getInterestedLinks(): links = [] # Software and Malware - links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=c298a77d9e93ad32') + links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127') # # Services - # links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=c298a77d9e93ad32') - # # Exploits - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=45') - # # Tools - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=46') - # # Malware - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=47') - # # Cryptography - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=48') - # # Others - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=49') - # # Hacking Tutorials - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=50') - # # Hacked Accounts and Database Dumps - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30') - # # Android Moded pak - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53') + links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45') + # # guides and tutorials + links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=107') return links def crawlForum(driver): + print("Crawling the Kingdom market") linksToCrawl = getInterestedLinks() @@ -281,6 +217,7 @@ def crawlForum(driver): savePage(driver, html, link) list = productPages(html) + for item in 
list: itemURL = urlparse.urljoin(baseURL, str(item)) try: @@ -290,18 +227,15 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - temp = driver.find_element(by=By.XPATH, value= - '/html/body/div/div/div[3]/div[2]/div[2]/div/div/ul') - next = temp.find_element_by_class_name("next") - link = link.find_element_by_tag_name('a').get_attribute('href') + link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "»")]').get_attribute('href') if link == "": raise NoSuchElementException count += 1 @@ -313,7 +247,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling Kingdom Market done sucessfully. Press ENTER to continue\n") + print("Crawling the Kingdom market done.") # Returns 'True' if the link is Topic link @@ -325,7 +259,7 @@ def isDescriptionLink(url): # Returns True if the link is a listingPage link def isListingLink(url): - if 'category' in url: + if 'filter_category' in url: return True return False @@ -333,10 +267,8 @@ def isListingLink(url): # calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") - #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) return kingdom_links_parser(soup) def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... DONE!") + startCrawling() \ No newline at end of file diff --git a/MarketPlaces/Kingdom/parser.py b/MarketPlaces/Kingdom/parser.py index b1e05d5..ae75d67 100644 --- a/MarketPlaces/Kingdom/parser.py +++ b/MarketPlaces/Kingdom/parser.py @@ -1,4 +1,4 @@ -__author__ = 'DarkWeb' +__author__ = 'Helium' # Here, we are importing the auxiliary functions to clean or convert data from MarketPlaces.Utilities.utilities import * @@ -31,6 +31,8 @@ def kingdom_description_parser(soup): left = "-1" # 16 Product_QuantityLeft shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name @@ -38,56 +40,49 @@ def kingdom_description_parser(soup): desc = tag.find('div',{"class": "col-md-8"}).find('div', {"class": "box-cont"}) name = tag.find('div',{"class": "col-md-8"}).find('div', {"class": "box-head"}).text - name = name.replace('\n', ' ') - name = name.replace(',', ' ') - name = name.strip() + name = cleanString(name).strip() # Finding Prices # Kingdom prices can be shown in a variety of currencies, not all in USD, so keeping currency rows = desc.find_all('div', {"class", "row"}, recursive=False) - price = rows[-1].find('div', {"class": "row"}).find('h3').text - price = price.replace(',', '') - price = price.strip() - # USD = price.replace("USD",'') + USD = rows[-1].find('div', {"class": "row"}).find('h3').text + USD = cleanNumbers(USD).strip() BTC = rows[-1].find('div', {"class": "row"}).find_next_sibling('div').find('span').text + BTC = cleanNumbers(BTC).strip() # Finding Vendor vendor = rows[0].select_one('a[href^="/user"]').text - vendor = vendor.replace(",", " ") - vendor = vendor.strip() + vendor = cleanString(vendor).strip() # Finding Shipment Information (Origem) descs = rows[0].find_all('div', {"class": "col-md-3 text-right"}) shipFrom = descs[2].text - shipFrom = shipFrom.replace(",", "") - shipFrom = shipFrom.strip() + shipFrom = 
cleanString(shipFrom).strip() # Finding Shipment Information (Destiny) shipTo = rows[-1].find('div', {"class": "col-md-6"}).text shipTo = shipTo.replace("Ship to:","") - shipTo = shipTo.replace(",","").strip() - if(shipTo == ''): - shipTo = -1 + shipTo = cleanString(shipTo).strip() + if shipTo == '': + shipTo = "-1" # Finding the Product Category category = descs[0].text - category = category.replace(",", "") - category = category.strip() + category = cleanString(category).strip() # Finding the Product Quantity Available left = descs[1].text - left = left.replace(",", "") - left = left.strip() + left = cleanString(left).strip() # Finding when the Product was Added dt = descs[-1].text.strip() addDate = datetime.strptime(dt, '%d.%m.%Y') # Finding the Product description - describe = cleanString(soup.find('div', {"id": "descriptionContent"}).text) + describe = cleanString(soup.find('div', {"id": "descriptionContent"}).text).strip() # Finding the Number of Product Reviews - review = len(soup.find('div', {"id": "feedbackContent"}).find_all(recursive=False)) + reviews = str(len(soup.find('div', {"id": "feedbackContent"}).find_all(recursive=False))) # Searching for CVE and MS categories # no cve or ms in Kingdom @@ -95,7 +90,7 @@ def kingdom_description_parser(soup): # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo) + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results @@ -126,7 +121,9 @@ def kingdom_listing_parser(soup): qLeft =[] # 17 Product_QuantityLeft shipFrom = [] # 18 Product_ShippedFrom shipTo = [] # 19 Product_ShippedTo - href = [] # 20 Product_Links + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links listing = soup.find('div', {"id": "p0"}).find('div').find_all('div', {"class": "row"}, recursive=False) @@ -139,29 +136,38 @@ def kingdom_listing_parser(soup): #in array USD, there may be prices not in USD, so includes currency as well prices = a.find('div', {"class": "col-md-3"}) u = prices.find('h3').text - u = u.strip() - u = u.replace(',', '') - u = u.strip() - USD.append(u) + USD.append(cleanNumbers(u).strip()) bc = prices.find('div').find('span').text - BTC.append(bc) + BTC.append(cleanNumbers(bc).strip()) # Finding the Product product = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]').text - product = product.replace('\n', ' ') - product = product.replace(","," ") - product = product.strip() - name.append(product) + name.append(cleanString(product).strip()) + + # Finding Product Image + product_image = a.find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) # Finding the Vendor vendor_name = a.select_one('a[href^="/user"]').text - vendor_name = vendor_name.replace(",", " ").replace('/', '') - vendor_name = vendor_name.strip() - vendor.append(vendor_name) + vendor_name = vendor_name.replace('/', '') + vendor.append(cleanString(vendor_name).strip()) + + # Finding Views + product_views = a.find('div', {"class": "col-md-7"}).find_all('p')[0].text + views.append(cleanNumbers(product_views).strip()) + + # Finding Sold + product_sold = a.find('div', {"class": "base-label label label-rounded label-success"}) + if product_sold is not None: + sold.append(cleanNumbers(product_sold.text).strip()) + else: + 
sold.append("-1") # Adding the url to the list of urls link = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]')['href'] - link = cleanLink(link) href.append(link) # Searching for CVE and MS categories @@ -169,7 +175,8 @@ def kingdom_listing_parser(soup): # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, + image, image_vendor) def kingdom_links_parser(soup): diff --git a/MarketPlaces/Quest/crawler_selenium.py b/MarketPlaces/Quest/crawler_selenium.py index 69287a9..8a84d68 100644 --- a/MarketPlaces/Quest/crawler_selenium.py +++ b/MarketPlaces/Quest/crawler_selenium.py @@ -1,7 +1,7 @@ __author__ = 'DarkWeb' ''' -Quest Market Crawler (Selenium) +Quest Marketplace Crawler (Selenium) ''' from selenium import webdriver @@ -9,15 +9,17 @@ from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait -from PIL import Image +from selenium.webdriver.support.ui import Select +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By +from PIL import Image import urllib.parse as urlparse import os, re, time from datetime import date import subprocess +import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.Quest.parser import quest_links_parser @@ -27,9 +29,8 @@ counter = 1 baseURL = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion' -# Opens Tor Browser, crawls the website def startCrawling(): - marketName = getMarketName() + mktName = getMKTName() driver = getAccess() if driver != 'down': @@ -40,66 +41,18 @@ def startCrawling(): print(driver.current_url, e) closeDriver(driver) - new_parse(marketName, False) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - #wait for login page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button"))) - - #entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') - #Username here - usernameBox.send_keys('blabri') - passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') - #Password here - passwordBox.send_keys('fishowal') - - ''' - # wait for captcha page show up - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/div/img"))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/div/img').screenshot( - r'..\Quest\captcha.png') - - # This method will show image in any image viewer - im = Image.open(r'..\Quest\captcha.png') - - im.show() - - # wait until input space show up - inputBox = driver.find_element(by=By.XPATH, 
value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/input') - - # ask user input captcha solution in terminal - userIn = input("Enter solution: ") - - # send user solution into the input space - inputBox.send_keys(userIn) - - # click the verify(submit) button - driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button").click() - ''' - input("Press ENTER when CAPTCHA is completed\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div[5]/div/div/div/span'))) + new_parse(mktName, baseURL, True) # Returns the name of the website -def getMarketName(): +def getMKTName(): name = 'Quest' return name -# Return the link of the website +# Return the base link of the website def getFixedURL(): url = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion' - return url @@ -129,8 +82,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -146,12 +99,13 @@ def createFFDriver(): service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - + driver.maximize_window() return driver +#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' def getAccess(): url = getFixedURL() driver = createFFDriver() @@ -163,7 +117,28 @@ def getAccess(): return 'down' -# Saves the crawled html page +def login(driver): + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="username"]'))) + + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') + # Username here + usernameBox.send_keys('CashCarti') + passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') + # Password here + passwordBox.send_keys('Mahogany') + # Clicking the login button + # login_button = driver.find_element(By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button') + # login_button.click() + + input("Press ENTER when CAPTCHA is completed\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/div[1]/nav/div/a/img'))) + + def savePage(driver, page, url): cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) @@ -172,7 +147,6 @@ def savePage(driver, page, url): return -# Gets the full path of the page to be saved along with its appropriate file name def getFullPathName(url): from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE @@ -185,7 +159,11 @@ def getFullPathName(url): return fullPath -# Creates the file name from passed URL +def getMKTName() -> str: + name = 'Quest' + return name + + 
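For orientation, the save-path convention used by savePage/getFullPathName above resolves roughly as in the sketch below; the shared_folder value and the date are stand-ins for what the project config and CURRENT_DATE supply at runtime, and the function name is hypothetical.

    import os

    def quest_path_sketch(url, is_description, shared_folder=r"D:\shared", current_date="10272023"):
        # same filtering as getNameFromURL: keep only the alphanumeric characters of the URL
        file_name = ''.join(e for e in url if e.isalnum())
        main_dir = os.path.join(shared_folder, "MarketPlaces/Quest/HTML_Pages")
        # mirrors the raw-string subfolders used by getFullPathName above
        sub_dir = r'\\Description\\' if is_description else r'\\Listing\\'
        return os.path.join(main_dir, current_date + sub_dir + file_name + '.html')

    # a hypothetical product URL lands under Description, a category URL under Listing
    print(quest_path_sketch("http://example.onion/product/abc", is_description=True))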
def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) @@ -198,23 +176,26 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # # Digital - Services - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/8ae67900-22ed-11ec-a710-31f963ce8d35') - # # Digital - Software - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/92809300-22ed-11ec-b143-af312e1dab77') - # # Digital - Tutorials - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/9d1592b0-22ed-11ec-b82d-c3d2878a8716') - # # Digital - Malware - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/a35bae90-22ed-11ec-ad2e-410f5a5339b5') - # # Digital - Hacking - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/b4252cf0-22ed-11ec-8032-751549438ed5') - # Digital - Exploits + ## Services + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/8ae67900-22ed-11ec-a710-31f963ce8d35') + ## Software + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/92809300-22ed-11ec-b143-af312e1dab77') + ## Tutorial + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/9d1592b0-22ed-11ec-b82d-c3d2878a8716') + ## Malware + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/a35bae90-22ed-11ec-ad2e-410f5a5339b5') + ## Hacking + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/b4252cf0-22ed-11ec-8032-751549438ed5') + ## Exploits links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/c0c3ac60-22ed-11ec-9e97-41cd1912fdee') + ## Carding + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/cbe06b00-22ec-11ec-ab3a-816857220dec') return links def crawlForum(driver): + print("Crawling the Quest market") linksToCrawl = getInterestedLinks() @@ -236,6 +217,7 @@ def crawlForum(driver): savePage(driver, html, link) list = productPages(html) + for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) try: @@ -245,18 +227,16 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - nav = driver.find_element(by=By.XPATH, value='/html/body/div[6]/nav') - li = nav.find_elements(By.TAG_NAME, 'li') - a = li[-1].find_element(By.TAG_NAME, 'a') - link = a.get_attribute('href') + link_elem = driver.find_element(by=By.CSS_SELECTOR, value='a.page-link[rel="next"]') + link = link_elem.get_attribute('href') if link == "": raise NoSuchElementException count += 1 @@ -268,24 +248,23 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling Quest market done sucessfully. 
Press ENTER to continue\n") + print("Crawling the Quest market done.") -# Returns 'True' if the link is Topic link +# Returns 'True' if the link is Topic link, may need to change for every website def isDescriptionLink(url): if 'product' in url: return True return False -# Returns True if the link is a listingPage link +# Returns True if the link is a listingPage link, may need to change for every website def isListingLink(url): if 'category' in url: return True return False -# calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") return quest_links_parser(soup) @@ -293,4 +272,3 @@ def productPages(html): def crawler(): startCrawling() - # print("Crawling and Parsing BestCardingWorld .... DONE!") diff --git a/MarketPlaces/Quest/parser.py b/MarketPlaces/Quest/parser.py index 6761ed9..6852b04 100644 --- a/MarketPlaces/Quest/parser.py +++ b/MarketPlaces/Quest/parser.py @@ -7,9 +7,11 @@ from MarketPlaces.Utilities.utilities import * from bs4 import BeautifulSoup -# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) +# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of description page +# return: 'row' that contains a variety of lists that each hold info on the description page def quest_description_parser(soup): - # Fields to be parsed vendor = "-1" # 0 *Vendor_Name @@ -31,111 +33,69 @@ def quest_description_parser(soup): left = "-1" # 16 Product_QuantityLeft shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo - - row = soup.find_all('div', {'class': "row"}) + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name - name = row[1].text - name = name.replace('\n', ' ') - name = name.replace(",", "") - name = name.strip() + name = soup.find('div', class_='card-header bg-dark text-white rounded-0 text-center').text + name = cleanString(name).strip() - small = row[3].find_all('small') + # USD Price + USD = soup.find('small', text='Product Price:').find_next('small').text.replace('$', '').strip() - # Finding Vendor - vendor = small[0].text - vendor = vendor.replace("Vendor:", "") - vendor = vendor.replace(",", "") - vendor = vendor.strip() + # Product Description + describe = soup.find('textarea').text + describe = cleanString(describe).strip() - # Finding Vendor Rating - full_stars = small[2].find_all('i', {'class': "fas fa-star"}) - half_star = small[2].find('i', {'class': "fas fa-star-half-alt"}) - rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0) + # Finding Product Image + image = soup.find('img', {'class': 'img-fluid'}) + image = image.get('src').split('base64,')[-1] + + # Finding Vendor Image + vendor_image = soup.select_one('.card-body.bg-mgray.css-selector.shadow img') + vendor_image = vendor_image.get('src').split('base64,')[-1] # Finding Successful Transactions - success = small[4].text - success = success.replace("Total Sales:", "") - success = success.strip() - - small = row[2].find('p', {'class': "text-left"}).find_all('small') - - # Finding Prices - USD = small[1].text - USD = USD.replace("$", "") - USD = USD.strip() - - shipping_info = row[2].find('p', {'class': "text-left"}).find('span').text.strip() - if "Digital" not in shipping_info: - shipping_info = shipping_info.split(" ") - - # Finding 
Shipment Information (Origin) - shipFrom = shipping_info[0].strip() - - # Finding Shipment Information (Destination) - shipTo = shipping_info[1].strip() - - textarea = row[2].find_all('textarea') - - # Finding the Product description - describe = textarea[0].text - describe = describe.replace("\n", " ") - describe = describe.replace("\r", " ") - describe = describe.strip() - - ''' - # Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1" - ''' - - # Searching for CVE and MS categories - cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if cve: - CVE = " " - for idx in cve: - CVE += (idx) - CVE += " " - CVE = CVE.replace(',', ' ') - CVE = CVE.replace('\n', '') - ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}')) - if ms: - MS = " " - for im in ms: - MS += (im) - MS += " " - MS = MS.replace(',', ' ') - MS = MS.replace('\n', '') + success = soup.find('strong', text='Total Sales:').parent.text + success = cleanNumbers(success).strip() + + # Finding Vendor Rating + temp = soup.find('strong', text='Rating:').parent + rating_vendor = len(temp.findAll('i', {"class": "fas fa-star"})) + half_stars = len(temp.findAll('i', {'class': "fas fa-star-half-alt"})) + if half_stars > 0: + rating_vendor += 0.5 + + # Finding Item Rating + temp = soup.find('small', text='Average Product Score:').find_next('small') + rating_item = len(temp.findAll('i', {"class": "fas fa-star"})) + half_stars = len(temp.findAll('i', {'class': "fas fa-star-half-alt"})) + if half_stars > 0: + rating_item += 0.5 # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo) + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row -# This is the method to parse the Listing Pages +# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of listing page +# return: 'row' that contains a variety of lists that each hold info on the listing page def quest_listing_parser(soup): # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "Quest" # 0 *Marketplace_Name + nm = 0 # *Total_Products (Should be Integer) + mktName = "quest" # 0 *Marketplace_Name vendor = [] # 1 *Vendor y rating_vendor = [] # 2 Vendor_Rating success = [] # 3 Vendor_Successful_Transactions name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this category = [] # 7 Product_Category y describe = [] # 8 Product_Description views = [] # 9 Product_Number_Of_Views @@ -146,87 +106,73 @@ def quest_listing_parser(soup): USD = [] # 14 Product_USD_SellingPrice y EURO = [] # 15 Product_EURO_SellingPrice sold = [] # 16 Product_QuantitySold - qLeft =[] # 
17 Product_QuantityLeft + qLeft = [] # 17 Product_QuantityLeft shipFrom = [] # 18 Product_ShippedFrom shipTo = [] # 19 Product_ShippedTo - href = [] # 20 Product_Links - - # Finding category of listing page - cat = soup.find('span', {'class': "btn btn-sm btn-outline-mgray active border-info"}).text - cat = cat.replace("Digital -", "") - cat = cat.strip() - - listing = soup.find_all('div', {"class": "col-md-2 my-md-0 col-12"}) + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links + # Extract all product listings + listing = soup.findAll('div', class_='col-md-2 my-md-0 col-12') # Populating the Number of Products nm = len(listing) for a in listing: - bae = a.find_all('a', href=True) - - # Adding the category - category.append(cat) - - # Adding the url to the list of urls - link = bae[0].get('href') - link = cleanLink(link) - href.append(link) - - # Finding the Vendor - vendor_name = bae[2].text - vendor_name = vendor_name.replace(",", "") - vendor_name = vendor_name.strip() - vendor.append(vendor_name) - - # Finding the Product - product = bae[1].find('img').get('alt') - product = product.replace('\n', ' ') - product = product.replace(",", "") - product = product.strip() - name.append(product) - - # Searching for CVE and MS categories - cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if not cve: - cveValue="-1" - else: - cee = " " - for idx in cve: - cee += (idx) - cee += " " - cee = cee.replace(',', ' ') - cee = cee.replace('\n', '') - cveValue=cee - CVE.append(cveValue) - - ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) - if not ms: - MSValue="-1" - else: - me = " " - for im in ms: - me += (im) - me += " " - me = me.replace(',', ' ') - me = me.replace('\n', '') - MSValue=me - MS.append(MSValue) + + # Extracting Product URL & Name + product_link_tags = a.find_all('a', class_='badge-info') + if product_link_tags: + # Using the first tag as default + product_link_tag = product_link_tags[0] + href.append(product_link_tag['href']) + name.append(cleanString(product_link_tag.text).strip()) + + # Extracting Product Image + img_tag = a.find('img') + if img_tag: + image_data = img_tag['src'].split('base64,')[-1] + image.append(image_data) + + # Extracting Vendor Name + vendor_tag = a.find('a', class_='badge-dark') + if vendor_tag: + vendor.append(cleanString(vendor_tag.text.replace('👤', '')).strip()) + + # Extracting Product Price in USD + price_tag = a.find('a', class_='text') + if price_tag: + USD.append(price_tag.text.replace("$", "").strip()) + + category_tag = soup.find('span', class_= 'btn btn-sm btn-outline-mgray active border-info') + if category_tag: + category.append(cleanString(category_tag.text).strip()) # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) +# called by the crawler to get description links on a listing page +# @param: beautifulsoup object that is using the correct html page (listing page) +# return: list of description links from a listing page def quest_links_parser(soup): + # Returning all product links - # Returning all links that should be visited by the Crawler href = [] - listing = soup.findAll('div', {"class": "col-md-2 my-md-0 col-12"}) + # Locate all divs with class 'row' + row_divs = 
soup.findAll('div', class_='row') + + for row_div in row_divs: + # Locate all product divs within the current 'row' div + product_divs = row_div.findAll('div', class_='col-md-2 my-md-0 col-12') - for div in listing: + for product_div in product_divs: + # Locate the anchor tag containing the product link within each product div + product_link_tag = product_div.find('a', class_='badge-info') - link = div.find('a')["href"] - href.append(link) + if product_link_tag and product_link_tag.has_attr('href'): + href.append(product_link_tag['href']) return href \ No newline at end of file diff --git a/MarketPlaces/Sonanza/crawler_selenium.py b/MarketPlaces/Sonanza/crawler_selenium.py new file mode 100644 index 0000000..c0ea415 --- /dev/null +++ b/MarketPlaces/Sonanza/crawler_selenium.py @@ -0,0 +1,263 @@ +__author__ = 'DarkWeb' + +''' +Sonanza Marketplace Crawler (Selenium) +''' + +from selenium import webdriver +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver.firefox.firefox_profile import FirefoxProfile +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from selenium.webdriver.firefox.service import Service +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support.ui import Select +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By + +from PIL import Image +import urllib.parse as urlparse +import os, re, time +from datetime import date +import subprocess +import configparser +from bs4 import BeautifulSoup +from MarketPlaces.Initialization.prepare_parser import new_parse +from MarketPlaces.Sonanza.parser import sonanza_links_parser +from MarketPlaces.Utilities.utilities import cleanHTML + +counter = 1 +baseURL = 'http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/' + + +def startCrawling(): + mktName = getMKTName() + driver = getAccess() + + if driver != 'down': + try: + login(driver) + crawlForum(driver) + except Exception as e: + print(driver.current_url, e) + closeDriver(driver) + + new_parse(mktName, baseURL, True) + + +# Returns the name of the website +def getMKTName(): + name = 'Sonanza' + return name + + +# Return the base link of the website +def getFixedURL(): + url = 'http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/' + return url + + +# Closes Tor Browser +def closeDriver(driver): + # global pid + # os.system("taskkill /pid " + str(pro.pid)) + # os.system("taskkill /t /f /im tor.exe") + print('Closing Tor...') + driver.close() + time.sleep(3) + return + + +# Creates FireFox 'driver' and configure its 'Profile' +# to use Tor proxy and socket +def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) + + ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) + ff_prof.set_preference("places.history.enabled", False) + ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) + ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) + ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) + ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) + ff_prof.set_preference("signon.rememberSignons", False) + ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + ff_prof.set_preference("network.dns.disablePrefetch", True) + ff_prof.set_preference("network.http.sendRefererHeader", 0) + ff_prof.set_preference("permissions.default.image", 3) + 
ff_prof.set_preference("browser.download.folderList", 2) + ff_prof.set_preference("browser.download.manager.showWhenStarting", False) + ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") + ff_prof.set_preference('network.proxy.type', 1) + ff_prof.set_preference("network.proxy.socks_version", 5) + ff_prof.set_preference('network.proxy.socks', '127.0.0.1') + ff_prof.set_preference('network.proxy.socks_port', 9150) + ff_prof.set_preference('network.proxy.socks_remote_dns', True) + ff_prof.set_preference("javascript.enabled", False) + ff_prof.update_preferences() + + service = Service(config.get('TOR', 'geckodriver_path')) + + driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + + driver.maximize_window() + + return driver + + +#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' +def getAccess(): + url = getFixedURL() + driver = createFFDriver() + try: + driver.get(url) + return driver + except: + driver.close() + return 'down' + + +def login(driver): + # input("Press ENTER when CAPTCHA is complete and login page has loaded\n") + # + # # entering username and password into input boxes + # usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="username"]') + # # Username here + # usernameBox.send_keys('aliciamykeys') + # passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="password"]') + # # Password here + # passwordBox.send_keys('aliciawherearemykey$') + # # session time + # session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[4]/div/div[2]/select')) + # session_select.select_by_visible_text('Session 60min') + + input("Press ENTER when CAPTCHA is completed and listing page loaded\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="searchbar"]'))) + + +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) + filePath = getFullPathName(url) + os.makedirs(os.path.dirname(filePath), exist_ok=True) + open(filePath, 'wb').write(cleanPage.encode('utf-8')) + return + + +def getFullPathName(url): + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") + fileName = getNameFromURL(url) + if isDescriptionLink(url): + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') + else: + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') + return fullPath + + +def getNameFromURL(url): + global counter + name = ''.join(e for e in url if e.isalnum()) + if name == '': + name = str(counter) + counter = counter + 1 + return name + + +def getInterestedLinks(): + links = [] + + # Guides and Tutorials + links.append('http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/category/3') + # Software and Malware + links.append('http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/category/4') + # Fraud + links.append('http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/category/5') + # Digital Products + links.append('http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/category/21') + # Services + links.append('http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/category/22') + + return 
links + + +def crawlForum(driver): + + print("Crawling the Sonanza market") + + linksToCrawl = getInterestedLinks() + + i = 0 + while i < len(linksToCrawl): + link = linksToCrawl[i] + print('Crawling :', link) + try: + has_next_page = True + count = 0 + + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(driver, html, link) + + list = productPages(html) + + for item in list: + itemURL = urlparse.urljoin(baseURL, str(item)) + try: + driver.get(itemURL) + except: + driver.refresh() + savePage(driver, driver.page_source, item) + driver.back() + + # # comment out + # break + # + # # comment out + # if count == 1: + # break + + try: + link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "›")]').get_attribute('href') + if link == "": + raise NoSuchElementException + count += 1 + + except NoSuchElementException: + has_next_page = False + + except Exception as e: + print(link, e) + i += 1 + + print("Crawling the Sonanza market done.") + + +# Returns 'True' if the link is Topic link, may need to change for every website +def isDescriptionLink(url): + if 'article' in url: + return True + return False + + +# Returns True if the link is a listingPage link, may need to change for every website +def isListingLink(url): + if 'category' in url: + return True + return False + + +def productPages(html): + soup = BeautifulSoup(html, "html.parser") + return sonanza_links_parser(soup) + + +def crawler(): + startCrawling() diff --git a/MarketPlaces/Sonanza/parser.py b/MarketPlaces/Sonanza/parser.py new file mode 100644 index 0000000..10166f5 --- /dev/null +++ b/MarketPlaces/Sonanza/parser.py @@ -0,0 +1,238 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +from MarketPlaces.Utilities.utilities import * + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + + +# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of description page +# return: 'row' that contains a variety of lists that each hold info on the description page +def sonanza_description_parser(soup): + # Fields to be parsed + + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image + + listing = soup.find('div', {"id": "article_page"}) + + # Finding the Product + name = listing.find('div', {"class": "row box"}).text + name = cleanString(name).strip() + + # Finding Product Image + product_image = listing.find('img') + 
diff --git a/MarketPlaces/Sonanza/parser.py b/MarketPlaces/Sonanza/parser.py
new file mode 100644
index 0000000..10166f5
--- /dev/null
+++ b/MarketPlaces/Sonanza/parser.py
@@ -0,0 +1,238 @@
+__author__ = 'DarkWeb'
+
+# Here, we are importing the auxiliary functions to clean or convert data
+from MarketPlaces.Utilities.utilities import *
+
+# Here, we are importing re for the CVE and MS regex searches below
+import re
+
+# Here, we are importing BeautifulSoup to search through the HTML tree
+from bs4 import BeautifulSoup
+
+
+# parses description pages: takes the html page of a description page as a soup object and extracts the info it needs
+# the info is stored in individual fields, which are organized into a row and returned
+# @param: soup object looking at html page of description page
+# return: 'row' that contains a variety of fields that each hold info on the description page
+def sonanza_description_parser(soup):
+    # Fields to be parsed
+
+    vendor = "-1"          # 0 *Vendor_Name
+    success = "-1"         # 1 Vendor_Successful_Transactions
+    rating_vendor = "-1"   # 2 Vendor_Rating
+    name = "-1"            # 3 *Product_Name
+    describe = "-1"        # 4 Product_Description
+    CVE = "-1"             # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+    MS = "-1"              # 6 Product_MS_Classification (Microsoft Security)
+    category = "-1"        # 7 Product_Category
+    views = "-1"           # 8 Product_Number_Of_Views
+    reviews = "-1"         # 9 Product_Number_Of_Reviews
+    rating_item = "-1"     # 10 Product_Rating
+    addDate = "-1"         # 11 Product_AddedDate
+    BTC = "-1"             # 12 Product_BTC_SellingPrice
+    USD = "-1"             # 13 Product_USD_SellingPrice
+    EURO = "-1"            # 14 Product_EURO_SellingPrice
+    sold = "-1"            # 15 Product_QuantitySold
+    left = "-1"            # 16 Product_QuantityLeft
+    shipFrom = "-1"        # 17 Product_ShippedFrom
+    shipTo = "-1"          # 18 Product_ShippedTo
+    image = "-1"           # 19 Product_Image
+    vendor_image = "-1"    # 20 Vendor_Image
+
+    listing = soup.find('div', {"id": "article_page"})
+
+    # Finding the Product
+    name = listing.find('div', {"class": "row box"}).text
+    name = cleanString(name).strip()
+
+    # Finding Product Image
+    product_image = listing.find('img')
+    if product_image is not None:
+        image = product_image.get('src').split('base64,')[-1]
+    else:
+        image = "-1"
+
+    table = listing.find('div', {"class": "col-md-5"})
+
+    # Finding Prices
+    USD = table.find('span', {"class": "pr"}).text
+    USD = USD.replace("$", "").strip()
+
+    BTC = table.find_all('span', {"class": "pr1"})[1].text
+    BTC = BTC.replace("BTC", "").strip()
+
+    rows = table.find_all('p', {"class": "mb-0"})
+    for row in rows:
+        temp = row.text
+        if "CATEGORY" in temp:
+            category = temp.replace("CATEGORY :", "")
+            category = cleanString(category).strip()
+        elif "VENDOR LEVEL" in temp:
+            rating_vendor = temp.replace("VENDOR LEVEL :", "")
+            rating_vendor = cleanString(rating_vendor).strip()
+
+    rows = listing.find_all('p', {"class": "mb-1"})
+    for row in rows:
+        temp = row.text
+        if "VENDOR" in temp:
+            vendor = temp.replace("VENDOR :", "")
+            vendor = cleanString(vendor).strip()
+        elif "SHIPS TO" in temp:
+            shipTo = temp.replace("SHIPS TO :", "")
+            shipTo = cleanString(shipTo).strip()
+        elif "SOLD" in temp:
+            sold = cleanNumbers(temp).strip()
+
+    # Finding Product Description
+    describe = listing.find('pre').text
+    describe = cleanString(describe).strip()
+
+    # Searching for CVE and MS categories
+    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
+    if cve:
+        CVE = " "
+        for idx in cve:
+            CVE += idx
+            CVE += " "
+        CVE = CVE.replace(',', ' ')
+        CVE = CVE.replace('\n', '')
+    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
+    if ms:
+        MS = " "
+        for im in ms:
+            MS += im
+            MS += " "
+        MS = MS.replace(',', ' ')
+        MS = MS.replace('\n', '')
+
+    # Populating the final variable (this should be a list with all fields scraped)
+    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
+           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
+
+    # Sending the results
+    return row
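
The CVE/MS string building above is easier to see on a plain string. Below is a simplified sketch of the same idea on a made-up sample; note the parser collects whole text nodes that contain a match (which is why it also strips commas and newlines), whereas re.findall here returns only the matched identifiers.

import re

# made-up description text, only for illustrating the regexes used above
sample = "Exploit bundle covering CVE-2021-4444, CVE-2022-1234 and MS17-010."

cve_hits = re.findall(r'CVE-\d{4}-\d{4}', sample)
ms_hits = re.findall(r'MS\d{2}-\d{3}', sample)

# equivalent of the append-in-a-loop logic: space-padded, space-separated, "-1" when nothing matches
CVE = " " + " ".join(cve_hits) + " " if cve_hits else "-1"
MS = " " + " ".join(ms_hits) + " " if ms_hits else "-1"

print(CVE)   # ' CVE-2021-4444 CVE-2022-1234 '
print(MS)    # ' MS17-010 '
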
+
+
+# parses listing pages: takes the html page of a listing page as a soup object and extracts the info it needs
+# the info is stored in different lists, which are organized and returned
+# @param: soup object looking at html page of listing page
+# return: 'row' that contains a variety of lists that each hold info on the listing page
+def sonanza_listing_parser(soup):
+
+    # Fields to be parsed
+    nm = 0                     # *Total_Products (Should be Integer)
+    mktName = "Sonanza"        # 0 *Marketplace_Name
+    vendor = []                # 1 *Vendor y
+    rating_vendor = []         # 2 Vendor_Rating
+    success = []               # 3 Vendor_Successful_Transactions
+    name = []                  # 4 *Product_Name y
+    CVE = []                   # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
+    MS = []                    # 6 Product_MS_Classification (Microsoft Security) dont worry about this
+    category = []              # 7 Product_Category y
+    describe = []              # 8 Product_Description
+    views = []                 # 9 Product_Number_Of_Views
+    reviews = []               # 10 Product_Number_Of_Reviews
+    rating_item = []           # 11 Product_Rating
+    addDate = []               # 12 Product_AddDate
+    BTC = []                   # 13 Product_BTC_SellingPrice
+    USD = []                   # 14 Product_USD_SellingPrice y
+    EURO = []                  # 15 Product_EURO_SellingPrice
+    sold = []                  # 16 Product_QuantitySold
+    qLeft = []                 # 17 Product_QuantityLeft
+    shipFrom = []              # 18 Product_ShippedFrom
+    shipTo = []                # 19 Product_ShippedTo
+    image = []                 # 20 Product_Image
+    image_vendor = []          # 21 Vendor_Image
+    href = []                  # 22 Product_Links
+
+    listings = soup.findAll('div', {"class": "col-sm-4 col-md-3"})
+
+    # Populating the Number of Products
+    nm = len(listings)
+
+    for listing in listings:
+
+        # Adding the url to the list of urls
+        bae = listing.find('a', href=True)
+        link = bae.get('href')
+        href.append(link)
+
+        # Finding Product Image
+        product_image = listing.find('img')
+        if product_image is not None:
+            image.append(product_image.get('src').split('base64,')[-1])
+        else:
+            image.append("-1")
+
+        # Finding the Product
+        product = listing.find('h5', {"class": "art_title"}).text
+        product = cleanString(product)
+        name.append(product.strip())
+
+        # Finding Prices
+        price = listing.find('span', {"class": "priceP"}).text
+        price = price.replace("$", "")
+        USD.append(price.strip())
+
+        rows = listing.find_all('p', {"class": "mb-0 card-text"})
+        for row in rows:
+            temp = row.text
+            if "CATEGORY" in temp:
+                cat = temp.replace("CATEGORY :", "")
+                cat = cleanString(cat)
+                category.append(cat.strip())
+            elif "VENDOR" in temp:
+                vendor_name = temp.replace("VENDOR :", "")
+                vendor_name = cleanString(vendor_name)
+                vendor.append(vendor_name.strip())
+
+        # Finding Vendor Rating
+        rating = listing.find('span', {"class": "badge badge-info"}).text
+        rating = rating.replace("VENDOR LEVEL :", "")
+        rating = cleanString(rating)
+        rating_vendor.append(rating.strip())
+
+        # Searching for CVE and MS categories
+        cve = listing.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
+        if not cve:
+            cveValue = "-1"
+        else:
+            cee = " "
+            for idx in cve:
+                cee += idx
+                cee += " "
+            cee = cee.replace(',', ' ')
+            cee = cee.replace('\n', '')
+            cveValue = cee
+        CVE.append(cveValue)
+
+        ms = listing.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
+        if not ms:
+            MSValue = "-1"
+        else:
+            me = " "
+            for im in ms:
+                me += im
+                me += " "
+            me = me.replace(',', ' ')
+            me = me.replace('\n', '')
+            MSValue = me
+        MS.append(MSValue)
+
+    # Populate the final variable (this should be a list with all fields scraped)
+    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image,
+                            image_vendor)
+
+
+# called by the crawler to get description links on a listing page
+# @param: beautifulsoup object that is using the correct html page (listing page)
+# return: list of description links from a listing page
+def sonanza_links_parser(soup):
+    # Returning all links that should be visited by the Crawler
+
+    href = []
+    listings = soup.findAll('div', {"class": "col-sm-4 col-md-3"})
+
+    for listing in listings:
+        a = listing.find('a', href=True)
+
+        # Adding the url to the list of urls
+        link = a.get('href')
+        href.append(link)
+
+    return href
\ No newline at end of file
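
Finally, a minimal offline harness for exercising the two parsers against a page previously stored by savePage(); the file path shown is a placeholder, and the parser functions are assumed to be importable from the module added above.

from bs4 import BeautifulSoup

from MarketPlaces.Sonanza.parser import sonanza_listing_parser, sonanza_links_parser

# placeholder path to a listing page saved by the crawler
SAVED_LISTING = "HTML_Pages/09012023/Listing/example.html"

with open(SAVED_LISTING, 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), "html.parser")

# description links the crawler would visit from this listing page
print(sonanza_links_parser(soup))

# organized product rows handed to the database layer
print(sonanza_listing_parser(soup))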