diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml index ba22e8a..08a5719 100644 --- a/.idea/DW_Pipeline_Test.iml +++ b/.idea/DW_Pipeline_Test.iml @@ -2,7 +2,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 11f1ea0..dc9ea49 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index 1f089e6..e3cc468 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -206,17 +206,17 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript def move_file(filePath, createLog, logFile): - # source = line2.replace(os.path.basename(line2), "") + filename source = filePath destination = filePath.replace(os.path.basename(filePath), "") + r'Read/' try: - shutil.move(source, destination) + shutil.move(source, destination, shutil.copytree) return True except: - print("There was a problem to move the file " + filePath) incrementError() + print("There was a problem to move the file " + filePath) + traceback.print_exc() if createLog: logFile.write( str(nError) + ". There was a problem to move the file " + filePath + "\n") diff --git a/MarketPlaces/AnonMarket/crawler_selenium.py b/MarketPlaces/AnonMarket/crawler_selenium.py new file mode 100644 index 0000000..2171d84 --- /dev/null +++ b/MarketPlaces/AnonMarket/crawler_selenium.py @@ -0,0 +1,289 @@ +__author__ = 'Helium' + +''' +Anon Market Crawler (Selenium) +''' + +from selenium import webdriver +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver.firefox.firefox_profile import FirefoxProfile +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from selenium.webdriver.firefox.service import Service +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By + +from PIL import Image +import urllib.parse as urlparse +import os, re, time +from datetime import date +import subprocess +import configparser +from bs4 import BeautifulSoup +from MarketPlaces.Initialization.prepare_parser import new_parse +from MarketPlaces.AnonMarket.parser import AnonMarket_links_parser +from MarketPlaces.Utilities.utilities import cleanHTML + +counter = 1 +baseURL = 'http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion' + + +# Opens Tor Browser, crawls the website, then parses, then closes tor +#acts like the main method for the crawler, another function at the end of this code calls this function later +def startCrawling(): + opentor() + mktName = getMKTName() + driver = getAccess() + + if driver != 'down': + try: + crawlForum(driver) + except Exception as e: + print(driver.current_url, e) + closetor(driver) + + new_parse(mktName, baseURL, True) + +# Opens Tor Browser +#prompts for ENTER input to continue +def opentor(): + from MarketPlaces.Initialization.markets_mining import config + + global pid + print("Connecting Tor...") + pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) + pid = pro.pid + time.sleep(7.5) + input('Tor Connected. 
Press ENTER to continue\n') + return + +# Returns the name of the website +#return: name of site in string type +def getMKTName(): + name = 'AnonMarket' + return name + + +# Return the base link of the website +#return: url of base site in string type +def getFixedURL(): + url = 'http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion' + return url + + +# Closes Tor Browser +#@param: current selenium driver +def closetor(driver): + # global pid + # os.system("taskkill /pid " + str(pro.pid)) + # os.system("taskkill /t /f /im tor.exe") + print('Closing Tor...') + driver.close() + time.sleep(3) + return + + +# Creates FireFox 'driver' and configure its 'Profile' +# to use Tor proxy and socket +def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) + + ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) + ff_prof.set_preference("places.history.enabled", False) + ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) + ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) + ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) + ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) + ff_prof.set_preference("signon.rememberSignons", False) + ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + ff_prof.set_preference("network.dns.disablePrefetch", True) + ff_prof.set_preference("network.http.sendRefererHeader", 0) + ff_prof.set_preference("permissions.default.image", 2) + ff_prof.set_preference("browser.download.folderList", 2) + ff_prof.set_preference("browser.download.manager.showWhenStarting", False) + ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") + ff_prof.set_preference('network.proxy.type', 1) + ff_prof.set_preference("network.proxy.socks_version", 5) + ff_prof.set_preference('network.proxy.socks', '127.0.0.1') + ff_prof.set_preference('network.proxy.socks_port', 9150) + ff_prof.set_preference('network.proxy.socks_remote_dns', True) + ff_prof.set_preference("javascript.enabled", False) + ff_prof.update_preferences() + + service = Service(config.get('TOR', 'geckodriver_path')) + + driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + + driver.maximize_window() + + return driver + + +#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' +#return: return the selenium driver or string 'down' +def getAccess(): + url = getFixedURL() + driver = createFFDriver() + try: + driver.get(url) + return driver + except: + driver.close() + return 'down' + +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) + filePath = getFullPathName(url) + os.makedirs(os.path.dirname(filePath), exist_ok=True) + open(filePath, 'wb').write(cleanPage.encode('utf-8')) + return + + +# Gets the full path of the page to be saved along with its appropriate file name +#@param: raw url as crawler crawls through every site +def getFullPathName(url): + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") + fileName = getNameFromURL(url) + if isDescriptionLink(url): + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') + else: + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') + return fullPath + + +# 
Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned +#@param: raw url as crawler crawls through every site +def getNameFromURL(url): + global counter + name = ''.join(e for e in url if e.isalnum()) + if (name == ''): + name = str(counter) + counter = counter + 1 + return name + + +# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list +#in this example, there are a couple of categories some threads fall under such as +# Guides and Tutorials, Digital Products, and Software and Malware +#as you can see they are categories of products +def getInterestedLinks(): + links = [] + # # Software + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/civil_softwares') + # # Malware + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/malware') + # # Bootkits + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/bootkits') + # # Backdoors + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/backdoors') + # # Keyloggers + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/keyloggers') + # # Wireless Trackers + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wireless_trackers') + # # Screen Scrapers + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/screen_scrapers') + # # Mobile Forensic Tools + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/mobile_forensics_tools') + # # Wifi Jammers + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wifi_jammers') + # # Carding + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/carding') + # # Worms + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/worms') + # # Viruses + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/viruses') + # # Trojans + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/trojans') + # # Botnets + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/botnets') + # # Security Technology + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security_technology') + # # Hacks + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/hacks') + # # Exploit kits + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/exploit_kit') + # # Security + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security') + + return links + + +# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through +#topic and description pages are crawled through here, where both types of pages are saved +#@param: selenium driver +def crawlForum(driver): + print("Crawling Anon Market") + + linksToCrawl = getInterestedLinks() + + for link in linksToCrawl: + print('Crawling :', link) + + has_next_page = True + + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + + html = driver.page_source + savePage(driver, html, 
link) + + # Get all product links on the current page + products_list = productPages(html) + for item in products_list: + itemURL = urlparse.urljoin(baseURL, str(item)) + try: + driver.get(itemURL) + except: + driver.refresh() + savePage(driver, driver.page_source, item) + driver.back() # Go back to listing after visiting each product + + # Find the active page number + active_page_element = driver.find_element(By.XPATH, '//div[@class="page activepage"]') + current_page = int(active_page_element.text) + + # Locate the next page link + try: + next_page_element = active_page_element.find_element(By.XPATH, 'following-sibling::a[1]') + link = next_page_element.get_attribute('href') + except NoSuchElementException: + has_next_page = False + + print("Crawling Anon Market done.") + +# Returns 'True' if the link is a description link +#@param: url of any url crawled +#return: true if is a description page, false if not +def isDescriptionLink(url): + if 'product' in url: + return True + return False + + +# Returns True if the link is a listingPage link +#@param: url of any url crawled +#return: true if is a Listing page, false if not +def isListingLink(url): + if 'category' in url: + return True + return False + + +# calling the parser to define the links, the html is the url of a link from the list of interested link list +#@param: link from interested link list ie. getInterestingLinks() +#return: list of description links that should be crawled through +def productPages(html): + soup = BeautifulSoup(html, "html.parser") + return AnonMarket_links_parser(soup) + +def crawler(): + startCrawling() + # print("Crawling and Parsing Nexus .... DONE!") + diff --git a/MarketPlaces/AnonMarket/parser.py b/MarketPlaces/AnonMarket/parser.py new file mode 100644 index 0000000..c53283c --- /dev/null +++ b/MarketPlaces/AnonMarket/parser.py @@ -0,0 +1,207 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +from MarketPlaces.Utilities.utilities import * + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + +import re + +#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs +#stores info it needs in different lists, these lists are returned after being organized +#@param: soup object looking at html page of description page +#return: 'row' that contains a variety of lists that each hold info on the description page +def AnonMarket_description_parser(soup): + + # Fields to be parsed + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + + name_of_product = soup.find("div", {"class": "heading"}).text + name = 
cleanString(name_of_product.strip()) + + description_div = soup.find("div", {"class": "tab1"}) + if description_div is None: + describe = "-1" + else: + describe = cleanString(description_div.text.strip()) + + info_div = soup.find('div', {'class': 'information'}) + table = info_div.find('table') if info_div else None + + if table: + # Find all table rows + rows = table.find_all('tr') + + # Parse each row to get relevant data + data = {} + for row in rows: + columns = row.find_all('td') + if len(columns) == 3: + key = columns[0].text.strip() + value = columns[2].text.strip() + data[key] = value + + # Extract specific data from the dictionary and assign them to individual variables + vendor = data.get('Vendor', '-1') + shipFrom = data.get('Location', '-1') + shipTo = data.get('Ships to', '-1') + category = data.get('Category', '-1') + USD = data.get('Price', '-1').split()[0] + left = data.get('Stock', '-1') + + # Populating the final variable (this should be a list with all fields scraped) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo) + + + # Sending the results + return row + + +#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs +#stores info it needs in different lists, these lists are returned after being organized +#@param: soup object looking at html page of listing page +#return: 'row' that contains a variety of lists that each hold info on the listing page +def AnonMarket_listing_parser(soup): + + # Fields to be parsed + nm = 0 # *Total_Products (Should be Integer) + mktName = "AnonMarket" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = [] # 6 Product_MS_Classification (Microsoft Security) + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + href = [] # 20 Product_Links + base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion" + + products_list = soup.find_all('div', {'class': 'item'}) + nm = 0 + for product in products_list: + try: + name_of_product = product.find("div", {"class": "title"}).text.strip() + name.append(name_of_product) + + name_of_vendor = product.find("a", {'class': 'seller'}).text.strip() + vendor.append(name_of_vendor) + + cat = soup.find("div", {'class': 'heading'}).text + category.append(cat) + + product_link_element = product.find("div", {"class": "title"}).find_parent('a') + if product_link_element: + link = product_link_element['href'] + if "/product/" in link and "/user/" not in link: + full_link = base_url + link + href.append(full_link) + else: + href.append("-1") + else: + href.append("-1") + + # Append '-1' for unavailable data + rating_vendor.append("-1") + success.append("-1") + CVE.append("-1") + MS.append("-1") + describe.append("-1") + views.append("-1") + reviews.append("-1") + 
addDate.append("-1") + BTC.append("-1") + EURO.append("-1") + sold.append("-1") + qLeft.append("-1") + shipFrom.append("-1") + shipTo.append("-1") + + nm += 1 + + except AttributeError as e: + print("I'm somewhere I don't belong. I'm going to leave") + continue + + # Populate the final variable (this should be a list with all fields scraped) + return organizeProducts( + marketplace = "AnonMarket", + nm = nm, + vendor = vendor, + rating_vendor = rating_vendor, + success_vendor = success, + nombre = name, + CVE = CVE, + MS = MS, + category = category, + describe = describe, + views = views, + reviews = reviews, + rating_item = rating_item, + addDate = addDate, + BTC = BTC, + USD = USD, + EURO = EURO, + sold = sold, + qLeft = qLeft, + shipFrom = shipFrom, + shipTo = shipTo, + href = href + ) + + +#called by the crawler to get description links on a listing page +#@param: beautifulsoup object that is using the correct html page (listing page) +#return: list of description links from a listing page +def AnonMarket_links_parser(soup): + # Base URL to prepend to each product link + base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion" + + # Returning all links that should be visited by the Crawler + href = [] + + # Using a shorter, but still unique, class name + listing = soup.find('div', {'class': 'items'}).find_all('a', href=True, attrs={'href': lambda x: "/product/" in x}) + + for a in listing: + link = a.get('href') + if link: # Checks if 'href' attribute is not None + # Prepending the base URL to the scraped link + full_link = base_url + link + href.append(full_link) + + # Filtering out any links that might not have '/product/' in them + product_links = [link for link in href if '/product/' in link] + + return product_links diff --git a/MarketPlaces/CityMarket/crawler_selenium.py b/MarketPlaces/CityMarket/crawler_selenium.py index 8dc0e4f..704b840 100644 --- a/MarketPlaces/CityMarket/crawler_selenium.py +++ b/MarketPlaces/CityMarket/crawler_selenium.py @@ -188,7 +188,7 @@ def getInterestedLinks(): # # Hiring hacker # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=3') # virus and malware - links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=15') + # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=15') # # ddos # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=16') # # software @@ -196,7 +196,7 @@ def getInterestedLinks(): # # botnets # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=18') # # hacking service - # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=31') + links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=31') return links diff --git a/MarketPlaces/CityMarket/parser.py b/MarketPlaces/CityMarket/parser.py index dbab917..e5f3575 100644 --- a/MarketPlaces/CityMarket/parser.py +++ b/MarketPlaces/CityMarket/parser.py @@ -11,7 +11,7 @@ from bs4 import BeautifulSoup #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of description page #return: 'row' that contains a variety of lists that each hold info on the description page -def darkfox_description_parser(soup): +def city_description_parser(soup): # Fields to be parsed @@ -39,68 +39,46 @@ def darkfox_description_parser(soup): success 
= "-1" # 21 Vendor_Successful_Transactions EURO = "-1" # 22 Product_EURO_SellingPrice + divmd7 = soup.find('div', {'class': "col-md-7"}) + ptag = soup.findAll('p') + # Finding Product Name - name = soup.find('h1').text - name = name.replace('\n', ' ') - name = name.replace(",", "") - name = name.strip() + # NA # Finding Vendor - vendor = soup.find('h3').find('a').text.strip() + vendor = divmd7.find('a').text.strip() # Finding Vendor Rating - rating = soup.find('span', {'class': "tag is-dark"}).text.strip() + # NA # Finding Successful Transactions - success = soup.find('h3').text - success = success.replace("Vendor: ", "") - success = success.replace(vendor, "") - success = success.replace("(", "") - success = success.replace(")", "") - success = success.strip() - - bae = soup.find('div', {'class': "box"}).find_all('ul') + success = soup.find('span', {'class': "badge-primary"}) # Finding Prices - USD = bae[1].find('strong').text.strip() + USD = soup.find('span', {'class': "total"}).text.strip() - li = bae[2].find_all('li') + BTC = soup.find('div', {'class': "text-center"}).text.strip() # Finding Escrow - escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip() + escrow = ptag[-1].text.strip() # Finding the Product Category - category = li[1].find('span', {'class': "tag is-dark"}).text.strip() + category = ptag[-2].text.strip() # Finding the Product Quantity Available - left = li[3].find('span', {'class': "tag is-dark"}).text.strip() + # NA # Finding Number Sold - sold = li[4].find('span', {'class': "tag is-dark"}).text.strip() - - li = bae[3].find_all('li') + # NA # Finding Shipment Information (Origin) - if "Ships from:" in li[-2].text: - shipFrom = li[-2].text - shipFrom = shipFrom.replace("Ships from: ", "") - # shipFrom = shipFrom.replace(",", "") - shipFrom = shipFrom.strip() + # NA # Finding Shipment Information (Destination) - shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text - shipTo = shipTo.replace("Ships to: ", "") - shipTo = shipTo.strip() - if "certain countries" in shipTo: - countries = "" - tags = li[-1].find_all('span', {'class': "tag"}) - for tag in tags: - country = tag.text.strip() - countries += country + ", " - shipTo = countries.strip(", ") + # NA # Finding the Product description - describe = soup.find('div', {'class': "pre-line"}).text + describe = soup.find('div', {'class': "text-white"}).text describe = describe.replace("\n", " ") describe = describe.strip() @@ -147,11 +125,11 @@ def darkfox_description_parser(soup): #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of listing page #return: 'row' that contains a variety of lists that each hold info on the listing page -def darkfox_listing_parser(soup): +def city_listing_parser(soup): # Fields to be parsed nm = 0 # Total_Products (Should be Integer) - mktName = "DarkFox" # 0 Marketplace_Name + mktName = "CityMarket" # 0 Marketplace_Name name = [] # 1 Product_Name CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) MS = [] # 3 Product_MS_Classification (Microsoft Security) @@ -188,7 +166,7 @@ def darkfox_listing_parser(soup): href.append(link) # Finding the Product - product = bae[1].find('p').text + product = a.find('h4', {"class": "text-center"}).text product = product.replace('\n', ' ') product = product.replace(",", "") product = product.replace("...", "") @@ -197,49 +175,29 @@ def darkfox_listing_parser(soup): bae = a.find('div', {'class': 
"media-content"}).find('div').find_all('div') - if len(bae) >= 5: - # Finding Prices - price = bae[0].text - ud = price.replace(" USD", " ") - # u = ud.replace("$","") - u = ud.replace(",", "") - u = u.strip() - USD.append(u) - # bc = (prc[1]).strip(' BTC') - # BTC.append(bc) - - # Finding the Vendor - vendor_name = bae[1].find('a').text - vendor_name = vendor_name.replace(",", "") - vendor_name = vendor_name.strip() - vendor.append(vendor_name) - - # Finding the Category - cat = bae[2].find('small').text - cat = cat.replace("Category: ", "") - cat = cat.replace(",", "") - cat = cat.strip() - category.append(cat) - - # Finding Number Sold and Quantity Left - num = bae[3].text - num = num.replace("Sold: ", "") - num = num.strip() - sold.append(num) - - quant = bae[4].find('small').text - quant = quant.replace("In stock: ", "") - quant = quant.strip() - qLeft.append(quant) - - # Finding Successful Transactions - freq = bae[1].text - freq = freq.replace(vendor_name, "") - freq = re.sub(r'Vendor Level \d+', "", freq) - freq = freq.replace("(", "") - freq = freq.replace(")", "") - freq = freq.strip() - success.append(freq) + # Finding Prices + price = a.find('div', {"class": "price"}).text + tempUSD = price.split("~")[0] + tempUSD = tempUSD.replace("$", "") + tempUSD = tempUSD.strip() + USD.append(tempUSD) + + tempBTC = price.split("~")[1] + tempBTC = tempBTC.replace("BTC", "") + tempBTC = tempBTC.strip() + BTC.append(tempBTC) + + # Finding the Vendor + # NA + + # Finding the Category + # NA + + # Finding Number Sold and Quantity Left + # NA + + # Finding Successful Transactions + # NA # Searching for CVE and MS categories cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) @@ -281,7 +239,7 @@ def city_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] - listing = soup.findAll('div', {"class": "col-12 p-0"}) + listing = soup.findAll('div', {"class": "p-4"}) for a in listing: bae = a.find('a', href=True) diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py index 7779f10..62e97f8 100644 --- a/MarketPlaces/Initialization/markets_mining.py +++ b/MarketPlaces/Initialization/markets_mining.py @@ -26,6 +26,7 @@ from MarketPlaces.Nexus.crawler_selenium import crawler as crawlerNexus from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher from MarketPlaces.DarkBazar.crawler_selenium import crawler as crawlerDarkBazar from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar +from MarketPlaces.AnonMarket.crawler_selenium import crawler as crawlerAnonMarket import configparser import os @@ -143,5 +144,7 @@ if __name__ == '__main__': crawlerDarkBazar() elif mkt == "PabloEscobarMarket": crawlerPabloEscobar() + elif mkt == "AnonMarket": + crawlerAnonMarket() print("\nScraping process completed!") diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index c7699bd..60abf80 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -23,6 +23,7 @@ from MarketPlaces.Nexus.parser import * from MarketPlaces.MikesGrandStore.parser import * from MarketPlaces.DarkBazar.parser import * from MarketPlaces.PabloEscobarMarket.parser import * +from MarketPlaces.AnonMarket.parser import * from MarketPlaces.Classifier.classify_product import predict @@ -158,6 +159,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile): rw = 
darkbazar_listing_parser(soup)
     elif marketPlace == "PabloEscobarMarket":
         rw = pabloescobarmarket_listing_parser(soup)
+    elif marketPlace == "AnonMarket":
+        rw = AnonMarket_listing_parser(soup)
     else:
         print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
         raise Exception
@@ -213,6 +216,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
         rmm = darkbazar_description_parser(soup)
     elif marketPlace == "PabloEscobarMarket":
         rmm = pabloescobarmarket_description_parser(soup)
+    elif marketPlace == "AnonMarket":
+        rmm = AnonMarket_description_parser(soup)
     else:
         print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
         raise Exception
@@ -253,17 +258,18 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript

 def move_file(filePath, createLog, logFile):

-    # source = line2.replace(os.path.basename(line2), "") + filename
+    source = filePath
     destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'

     try:
-        shutil.move(source, destination)
+        shutil.move(source, destination, shutil.copytree)
         return True
     except:
-        print("There was a problem to move the file " + filePath)
         incrementError()
+        print("There was a problem to move the file " + filePath)
+        traceback.print_exc()
         if createLog:
             logFile.write(
                 str(nError) + ". There was a problem to move the file " + filePath + "\n")
diff --git a/MarketPlaces/LionMarketplace/parser.py b/MarketPlaces/LionMarketplace/parser.py
index 81a911c..a37febf 100644
--- a/MarketPlaces/LionMarketplace/parser.py
+++ b/MarketPlaces/LionMarketplace/parser.py
@@ -56,7 +56,7 @@ def lionmarketplace_description_parser(soup):
     name = (cleanString(temp.strip()))

     # product description
-    temp = soup.find('div', {'class': "mt-4"}).find(text=True, recursive=False)
+    temp = soup.find('div', {'class': "mt-4"}).contents[-1]
     describe = cleanString(temp.strip())

     # Finding Product Image
diff --git a/MarketPlaces/MetaVerseMarket/parser.py b/MarketPlaces/MetaVerseMarket/parser.py
index 8c83293..047db35 100644
--- a/MarketPlaces/MetaVerseMarket/parser.py
+++ b/MarketPlaces/MetaVerseMarket/parser.py
@@ -11,7 +11,7 @@ from bs4 import BeautifulSoup
 # stores info it needs in different lists, these lists are returned after being organized
 # @param: soup object looking at html page of description page
 # return: 'row' that contains a variety of lists that each hold info on the description page
-def darkfox_description_parser(soup):
+def metaversemarket_description_parser(soup):

     # Fields to be parsed
     name = "-1"  # 0 Product_Name
@@ -36,67 +36,58 @@
     EURO = "-1"  # 22 Product_EURO_SellingPrice

     # Finding Product Name
-    name = soup.find('h1').text
-    name = name.replace('\n', ' ')
-    name = name.replace(",", "")
-    name = name.strip()
+    name = soup.find('div', {'class': "panel-heading"}).text.strip()

     # Finding Vendor
-    vendor = soup.find('h3').find('a').text.strip()
+    temp = soup.findAll('div', {'class': "col-xs-12 col-sm-6 mt-5"})
+    temp = temp[1].findAll('span')
+    temp = temp[1].find('b').text
+    vendor = temp.replace("@", "")

-    # Finding Vendor Rating
-    rating = soup.find('span', {'class': "tag is-dark"}).text.strip()
+    # Finding Product Reviews
+    review = soup.find('span', {'class': "badge bg-success fs-12px"}).text.strip()

     # Finding Successful Transactions
-    success = soup.find('h3').text
-    success = success.replace("Vendor: ", "")
-    success = success.replace(vendor, "")
-    success = success.replace("(", "")
-    success = success.replace(")", "")
-    success = success.strip()
+    # NA

-    bae = soup.find('div', {'class': "box"}).find_all('ul')

     # Finding Prices
-    USD = bae[1].find('strong').text.strip()
-
-    li = bae[2].find_all('li')
+    USD = soup.find('h3', {'class': "mb-2"}).text
+    USD = USD.replace("Price: $", "").strip()

     # Finding Escrow
-    escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()
+    escrow = soup.find('div', {'class': "alert alert-info text-center fw-bold"}).text
+    escrow = escrow.replace('You are protected by ', "").strip()

     # Finding the Product Category
-    category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
+    temp = soup.select('div[class="mt-2"]')
+    temp = temp[0].findAll('span')
+    category = temp[1].text.strip()

     # Finding the Product Quantity Available
-    left = li[3].find('span', {'class': "tag is-dark"}).text.strip()
+    # temp = soup.find('em', {'class': "icon ni ni-layers-fill"}).parent.parent.parent
+    # left = temp.text
+    # left = left.replace("Supply:", "")
+    # left = left.strip()
+    temp = soup.findAll('span', {'class': "badge bg-success"})
+    temp = temp[1].text.split("/")
+    left = temp[1].strip()

     # Finding Number Sold
-    sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()
+    sold = temp[0].strip()

-    li = bae[3].find_all('li')

     # Finding Shipment Information (Origin)
-    if "Ships from:" in li[-2].text:
-        shipFrom = li[-2].text
-        shipFrom = shipFrom.replace("Ships from: ", "")
-        # shipFrom = shipFrom.replace(",", "")
-        shipFrom = shipFrom.strip()
+    temp = soup.findAll('div', {'class': "alert alert-info"})
+    temp = temp[1].text.split("to")
+    shipFrom = temp[0].replace("Shipping from ", "").strip()

     # Finding Shipment Information (Destination)
-    shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
-    shipTo = shipTo.replace("Ships to: ", "")
-    shipTo = shipTo.strip()
-    if "certain countries" in shipTo:
-        countries = ""
-        tags = li[-1].find_all('span', {'class': "tag"})
-        for tag in tags:
-            country = tag.text.strip()
-            countries += country + ", "
-        shipTo = countries.strip(", ")
+    shipTo = temp[1].split("for")
+    shipTo = shipTo[0].strip()

     # Finding the Product description
-    describe = soup.find('div', {'class': "pre-line"}).text
+    describe = soup.find('p', {'class': "card-text"}).text
     describe = describe.replace("\n", " ")
     describe = describe.strip()
@@ -143,7 +134,7 @@ def darkfox_description_parser(soup):
 # stores info it needs in different lists, these lists are returned after being organized
 # @param: soup object looking at html page of listing page
 # return: 'row' that contains a variety of lists that each hold info on the listing page
-def darkfox_listing_parser(soup):
+def metaversemarket_listing_parser(soup):

     # Fields to be parsed
     nm = 0  # Total_Products (Should be Integer)
     mktName = "DarkFox"  # 0 Marketplace_Name
@@ -169,7 +160,7 @@ def darkfox_listing_parser(soup):
     success = []  # 20 Vendor_Successful_Transactions
     href = []  # 23 Product_Links (Urls)

-    listing = soup.findAll('div', {"class": "card"})
+    listing = soup.findAll('div', {"class": "col-12 col-sm-4 col-xl-3 product_item_col p-1"})

     # Populating the Number of Products
     nm = len(listing)
@@ -183,58 +174,77 @@ def darkfox_listing_parser(soup):
             href.append(link)

             # Finding the Product
-            product = bae[1].find('p').text
+            product = bae[1].find('span', {"class": "text-primary"}).text
             product = product.replace('\n', ' ')
             product = product.replace(",", "")
             product = product.replace("...", "")
             product = product.strip()
             name.append(product)

-            bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
-
-            if len(bae) >= 5:
-                # Finding Prices
-                price = bae[0].text
-                ud = price.replace(" USD", " ")
-                # u = ud.replace("$","")
-                u = ud.replace(",", "")
-                u = u.strip()
-                USD.append(u)
-                # bc = (prc[1]).strip(' BTC')
-                # BTC.append(bc)
-
-                # Finding the Vendor
-                vendor_name = bae[1].find('a').text
-                vendor_name = vendor_name.replace(",", "")
-                vendor_name = vendor_name.strip()
-                vendor.append(vendor_name)
-
-                # Finding the Category
-                cat = bae[2].find('small').text
-                cat = cat.replace("Category: ", "")
-                cat = cat.replace(",", "")
-                cat = cat.strip()
-                category.append(cat)
-
-                # Finding Number Sold and Quantity Left
-                num = bae[3].text
-                num = num.replace("Sold: ", "")
-                num = num.strip()
-                sold.append(num)
-
-                quant = bae[4].find('small').text
-                quant = quant.replace("In stock: ", "")
-                quant = quant.strip()
-                qLeft.append(quant)
-
-                # Finding Successful Transactions
-                freq = bae[1].text
-                freq = freq.replace(vendor_name, "")
-                freq = re.sub(r'Vendor Level \d+', "", freq)
-                freq = freq.replace("(", "")
-                freq = freq.replace(")", "")
-                freq = freq.strip()
-                success.append(freq)
+            # Finding Prices
+            price = a.find('strong').text
+            price = price.replace("Buy for $", "")
+            price = price.strip()
+            USD.append(price)
+
+            # Finding the Vendor
+            temp = a.find('div', {'class': "mt-1 fs-12px"})
+            temp = temp.findAll('span')
+            temp = temp[1].find('b').text
+            vendor_name = temp.replace("@", "").strip()
+            vendor.append(vendor_name)
+
+            # Finding the Category
+            cat = a.select_one('div[class="fs-12px"]')
+            cat = cat.findAll('span')[1].text
+            cat = cat.strip()
+            category.append(cat)
+
+            badge = a.findAll('span', {'class': "badge bg-success"})
+            # Finding Number Sold and Quantity Left
+            temp = badge[1].text
+            temp = temp.split("/")
+            num = temp[0]
+            num = num.strip()
+            sold.append(num)
+
+            quant = temp[1]
+            quant = quant.strip()
+            qLeft.append(quant)
+
+            # Finding Successful Transactions
+            # NA
+
+            # Finding Product review
+            review = a.find('span', {'class': "badge bg-success fs-12px"}).text
+            review = review.replace("+ ", "")
+            reviews.append(review)
+
+            # Finding Description
+            description = a.find('p', {'class': "alert alert-light text-ssbold p-1"}).text
+            description = description.replace("\n", " ")
+            description = description.strip()
+            describe.append(description)
+
+            # Finding Escrow
+            es = a.find('span', {'class': "fw-bold"}).text.strip()
+            escrow.append(es)
+
+            # Finding Number of Views
+            view = a.find('span', {'class': "badge bg-primary"}).text.strip()
+            views.append(view)
+
+            # Find where ships from
+            ships = a.find('div', {'class': "alert alert-info item_alert fs-12px p-1"})
+            ships = ships.findAll('b')
+            sFrom = ships[0].text.strip()
+            shipFrom.append(sFrom)
+
+            # Find where it ships to
+            sTo = ships[1].text.strip()
+            shipTo.append(sTo)

             # Searching for CVE and MS categories
             cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
@@ -275,7 +285,7 @@ def metaversemarket_links_parser(soup):
     # Returning all links that should be visited by the Crawler
     href = []

-    listing = soup.findAll('div', {"class": "col-12 p-0"})
+    listing = soup.findAll('div', {"class": "col-12 col-sm-4 col-xl-3 product_item_col p-1"})

     for a in listing:
         bae = a.find('a', href=True)
diff --git a/MarketPlaces/Nexus/crawler_selenium.py b/MarketPlaces/Nexus/crawler_selenium.py
index d7c84c2..4ae7cfe 100644
--- a/MarketPlaces/Nexus/crawler_selenium.py
+++ b/MarketPlaces/Nexus/crawler_selenium.py
@@ -85,8 +85,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -96,7 +96,7 @@ def createFFDriver(): ff_prof.set_preference('network.proxy.socks', '127.0.0.1') ff_prof.set_preference('network.proxy.socks_port', 9150) ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", False) + ff_prof.set_preference("javascript.enabled", True) ff_prof.update_preferences() service = Service(config.get('TOR', 'geckodriver_path')) @@ -204,6 +204,12 @@ def crawlForum(driver): driver.get(link) except: driver.refresh() + + # waiting for btc price to load + WebDriverWait(driver, 30).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[1]/div[2]/div/div/main/ul/li[1]/div/span/span[3]"))) + time.sleep(5) + html = driver.page_source savePage(driver, html, link) @@ -214,6 +220,11 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() + + # waiting for btc price to load + WebDriverWait(driver, 30).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[1]/div[2]/div/div/main/div[3]/div[2]/p/span[3]"))) + savePage(driver, driver.page_source, item) driver.back() @@ -225,8 +236,7 @@ def crawlForum(driver): break try: - link = driver.find_element(by=By.XPATH, value= - '/html/body/div[1]/div[2]/div/div/main/nav/ul/li[3]/a').get_attribute('href') + link = driver.find_element(by=By.LINK_TEXT, value='→').get_attribute('href') if link == "": raise NoSuchElementException count += 1 diff --git a/MarketPlaces/Nexus/parser.py b/MarketPlaces/Nexus/parser.py index 5b9636b..f673110 100644 --- a/MarketPlaces/Nexus/parser.py +++ b/MarketPlaces/Nexus/parser.py @@ -43,6 +43,10 @@ def nexus_description_parser(soup): name_of_product = soup.find("h1", {"class": "product_title entry-title"}).text name = cleanString(name_of_product.strip()) + # Find the BTC Price + prices = soup.find('p', {"class": "price"}).findAll('span', {"class": "cs"}) + BTC = prices[0].text + BTC = cleanNumbers(BTC.strip()) # finding the description of the product description_div = soup.find("div", {"class": "woocommerce-product-details__short-description"}) @@ -52,7 +56,7 @@ def nexus_description_parser(soup): describe = cleanString(description_div.text.strip()) # Finding Product Image - image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img') + image = soup.find('div', {'class': 'woocommerce-product-gallery__wrapper'}).find('img') image = image.get('src') image = image.split('base64,')[-1] @@ -110,56 +114,53 @@ def nexus_listing_parser(soup): image_vendor = [] # 21 Vendor_Image href = [] # 22 Product_Links - products_list = soup.find_all('li') - nm = 0 + main = soup.find('main', {'id': 'main'}) + products_list = main.find('ul', recursive=False).find_all('li', recursive=False) + nm = len(products_list) + for product in products_list: + # Finding the name of the product + name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text + name_of_product_cleaned = cleanString(name_of_product.strip()) + # 
print(name_of_product_cleaned) + name.append(name_of_product_cleaned) + #finding the URL try: - # Finding the name of the product - name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text - name_of_product_cleaned = cleanString(name_of_product.strip()) - # print(name_of_product_cleaned) - name.append(name_of_product_cleaned) - #finding the URL - try: - url = product.find("a", class_="woocommerce-loop-product__link").get('href') - href.append(url) - except AttributeError as e: - print("I can't find the link") - raise e - - # Finding Product Image - product_image = product.find('a', {'class': 'woocommerce-loop-image-link woocommerce-LoopProduct-link woocommerce-loop-product__link'}).find('img') - product_image = product_image.get('src') - product_image = product_image.split('base64,')[-1] - image.append(product_image) - - BTC.append("-1") - - #everything else appends a -1 - rating_vendor.append("-1") - USD.append("-1") - vendor.append("-1") - success.append("-1") - CVE.append("-1") - MS.append("-1") - category.append("-1") - describe.append("-1") - views.append("-1") - reviews.append("-1") - addDate.append("-1") - EURO.append("-1") - sold.append("-1") - qLeft.append("-1") - shipFrom.append("-1") - shipTo.append("-1") - image_vendor.append("-1") - # print("Done! moving onto the next product!") - # print(len(shipTo)) - nm += 1 + url = product.find("a", class_="woocommerce-loop-product__link").get('href') + href.append(url) except AttributeError as e: - print("I'm somewhere I don't belong. I'm going to leave") - continue - + print("I can't find the link") + raise e + + # Finding Product Image + product_image = product.find('a', {'class': 'woocommerce-loop-image-link woocommerce-LoopProduct-link woocommerce-loop-product__link'}).find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) + + # Finding BTC Price + prices = product.find('span', {"class": "price"}).findAll('span', {"class": "cs"}) + price = prices[0].text + BTC.append(cleanNumbers(price.strip())) + + #everything else appends a -1 + rating_vendor.append("-1") + USD.append("-1") + vendor.append('-1') + success.append("-1") + CVE.append("-1") + MS.append("-1") + category.append("-1") + describe.append("-1") + views.append("-1") + reviews.append("-1") + addDate.append("-1") + EURO.append("-1") + sold.append("-1") + qLeft.append("-1") + shipFrom.append("-1") + shipTo.append("-1") + image_vendor.append("-1") # Populate the final variable (this should be a list with all fields scraped) return organizeProducts( diff --git a/MarketPlaces/RobinhoodMarket/parser.py b/MarketPlaces/RobinhoodMarket/parser.py index 5de7a70..c036d17 100644 --- a/MarketPlaces/RobinhoodMarket/parser.py +++ b/MarketPlaces/RobinhoodMarket/parser.py @@ -51,14 +51,17 @@ def Robinhood_description_parser(soup): # Finding description desc = '' tab = soup.find('div', {"id": "tab-description"}) - for p in tab.findAll('p'): - desc += p.text + if tab is not None: + for p in tab.findAll('p'): + desc += p.text if desc == '': - desc = soup.find('div', {"class": "woocommerce-product-details__short-description"}).text + short = soup.find('div', {"class": "woocommerce-product-details__short-description"}) + if short is not None: + desc = short.text describe = cleanString(desc.strip()) # Finding Product Image - image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img') + image = soup.find('div', {'class': 
'woocommerce-product-gallery__wrapper'}).find('img') image = image.get('src') image = image.split('base64,')[-1] @@ -164,7 +167,7 @@ def Robinhood_listing_parser(soup): name.append(product) # Finding Product Image - product_image = card.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'}) + product_image = card.find('a').find('img') product_image = product_image.get('src') product_image = product_image.split('base64,')[-1] image.append(product_image)
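
The AnonMarket_links_parser introduced in MarketPlaces/AnonMarket/parser.py only needs a BeautifulSoup object, so it can be exercised offline against a listing page the crawler has already saved, without Tor or Selenium. A minimal sketch, assuming beautifulsoup4 is installed and that a saved listing page exists at the hypothetical path used below:

from bs4 import BeautifulSoup

from MarketPlaces.AnonMarket.parser import AnonMarket_links_parser

# Hypothetical path to a listing page previously saved by the crawler
sample_page = "MarketPlaces/AnonMarket/HTML_Pages/sample_listing.html"

with open(sample_page, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

# The parser returns absolute .onion URLs that contain "/product/"
for product_link in AnonMarket_links_parser(soup):
    print(product_link)

Because the function filters on "/product/" and prepends the .onion base URL itself, an empty result on a known-good page usually points at a changed "items" container class rather than at the crawler.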