From d90bf734d0ad334a87ea8530ba8b3530e6c259c2 Mon Sep 17 00:00:00 2001
From: dabadcuber5
Date: Tue, 25 Jul 2023 10:37:45 -0700
Subject: [PATCH] finished TorMarket

---
 MarketPlaces/Initialization/marketsList.txt   |   2 +-
 MarketPlaces/Initialization/prepare_parser.py |  15 +-
 MarketPlaces/TorBay/parser.py                 |   1 -
 MarketPlaces/TorMarket/crawler_selenium.py    |  32 +-
 MarketPlaces/TorMarket/parser.py              | 345 +++++++-----------
 setup.ini                                     |  14 +-
 6 files changed, 165 insertions(+), 244 deletions(-)

diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt
index 356649b..0393154 100644
--- a/MarketPlaces/Initialization/marketsList.txt
+++ b/MarketPlaces/Initialization/marketsList.txt
@@ -1 +1 @@
-LionMarketplace
\ No newline at end of file
+TorMarket
\ No newline at end of file
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index 21663fb..75090b1 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -16,7 +16,7 @@ from MarketPlaces.M00nkeyMarket.parser import *
 from MarketPlaces.DarkMatter.parser import *
 from MarketPlaces.DigitalThriftShop.parser import *
 from MarketPlaces.LionMarketplace.parser import *
-
+from MarketPlaces.TorMarket.parser import *
 
 from MarketPlaces.Classifier.classify_product import predict
 
@@ -166,7 +166,8 @@ def new_parse(marketPlace, url, createLog):
                         rmm = digitalThriftShop_description_parser(soup)
                     elif marketPlace == "LionMarketplace":
                         rmm = lionmarketplace_description_parser(soup)
-
+                    elif marketPlace == "TorMarket":
+                        rmm = tormarket_description_parser(soup)
                     # key = u"Pr:" + rmm[0].upper()[:desc_lim1] + u" Vendor:" + rmm[13].upper()[:desc_lim2]
                     key = u"Url:" + os.path.basename(line2).replace(".html", "")
 
@@ -197,8 +198,8 @@ def new_parse(marketPlace, url, createLog):
                 html = open(line1.strip('\n'))
                 soup = BeautifulSoup(html, "html.parser")
                 html.close()
-            except:
-
+            except Exception as e:
                 nError += 1
                 print("There was a problem to read the file " + line1 + " in the Listing section!")
+                print(e)
                 if createLog:
@@ -232,13 +233,16 @@ def new_parse(marketPlace, url, createLog):
                     rw = digitalThriftShop_listing_parser(soup)
                 elif marketPlace == "LionMarketplace":
                     rw = lionmarketplace_listing_parser(soup)
+                elif marketPlace == "TorMarket":
+                    rw = tormarket_listing_parser(soup)
                 else:
                     parseError = True
 
-            except:
+            except Exception as e:
                 nError += 1
                 print("There was a problem to parse the file " + line1 + " in the listing section!")
+                print(e)
                 if createLog:
                     logFile.write(
                         str(nError) + ". 
There was a problem to parse the file " + line1 + " in the Listing section.\n") diff --git a/MarketPlaces/TorBay/parser.py b/MarketPlaces/TorBay/parser.py index 25134fa..5c4f164 100644 --- a/MarketPlaces/TorBay/parser.py +++ b/MarketPlaces/TorBay/parser.py @@ -2,7 +2,6 @@ __author__ = 'DarkWeb' # Here, we are importing the auxiliary functions to clean or convert data from MarketPlaces.Utilities.utilities import * - # Here, we are importing BeautifulSoup to search through the HTML tree from bs4 import BeautifulSoup diff --git a/MarketPlaces/TorMarket/crawler_selenium.py b/MarketPlaces/TorMarket/crawler_selenium.py index 0528a05..ed94a8b 100644 --- a/MarketPlaces/TorMarket/crawler_selenium.py +++ b/MarketPlaces/TorMarket/crawler_selenium.py @@ -31,19 +31,19 @@ baseURL = 'http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion # Opens Tor Browser, crawls the website, then parses, then closes tor #acts like the main method for the crawler, another function at the end of this code calls this function later def startCrawling(): - opentor() - # mktName = getMKTName() - driver = getAccess() + # opentor() + mktName = getMKTName() + # driver = getAccess() + # + # if driver != 'down': + # try: + # login(driver) + # crawlForum(driver) + # except Exception as e: + # print(driver.current_url, e) + # closetor(driver) - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closetor(driver) - - # new_parse(forumName, baseURL, False) + new_parse(mktName, baseURL, False) # Opens Tor Browser @@ -187,9 +187,9 @@ def getInterestedLinks(): # Hacking Tutorials links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/guides-tutorials/hacking/') # # Malware - # links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/malware/') + links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/malware/') # # Hacking Services - # links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/hacking-services/') + links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/hacking-services/') return links @@ -232,8 +232,8 @@ def crawlForum(driver): break # comment out - if count == 1: - break + # if count == 1: + # break try: link = driver.find_element(by=By.XPATH, value= diff --git a/MarketPlaces/TorMarket/parser.py b/MarketPlaces/TorMarket/parser.py index 847ca50..69d680c 100644 --- a/MarketPlaces/TorMarket/parser.py +++ b/MarketPlaces/TorMarket/parser.py @@ -6,12 +6,13 @@ from MarketPlaces.Utilities.utilities import * # Here, we are importing BeautifulSoup to search through the HTML tree from bs4 import BeautifulSoup +import re #parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of description page #return: 'row' that contains a variety of lists that each hold info on the description page -def darkfox_description_parser(soup): +def tormarket_description_parser(soup): # Fields to be parsed @@ -39,101 +40,32 @@ def darkfox_description_parser(soup): success = "-1" # 21 Vendor_Successful_Transactions EURO = "-1" # 22 Product_EURO_SellingPrice - # Finding Product Name - name = soup.find('h1').text - name = name.replace('\n', ' 
')
-    name = name.replace(",", "")
-    name = name.strip()
-
-    # Finding Vendor
-    vendor = soup.find('h3').find('a').text.strip()
-
-    # Finding Vendor Rating
-    rating = soup.find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding Successful Transactions
-    success = soup.find('h3').text
-    success = success.replace("Vendor: ", "")
-    success = success.replace(vendor, "")
-    success = success.replace("(", "")
-    success = success.replace(")", "")
-    success = success.strip()
-
-    bae = soup.find('div', {'class': "box"}).find_all('ul')
-
-    # Finding Prices
-    USD = bae[1].find('strong').text.strip()
-
-    li = bae[2].find_all('li')
-
-    # Finding Escrow
-    escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding the Product Category
-    category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding the Product Quantity Available
-    left = li[3].find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding Number Sold
-    sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()
-
-    li = bae[3].find_all('li')
-
-    # Finding Shipment Information (Origin)
-    if "Ships from:" in li[-2].text:
-        shipFrom = li[-2].text
-        shipFrom = shipFrom.replace("Ships from: ", "")
-        # shipFrom = shipFrom.replace(",", "")
-        shipFrom = shipFrom.strip()
-
-    # Finding Shipment Information (Destination)
-    shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
-    shipTo = shipTo.replace("Ships to: ", "")
-    shipTo = shipTo.strip()
-    if "certain countries" in shipTo:
-        countries = ""
-        tags = li[-1].find_all('span', {'class': "tag"})
-        for tag in tags:
-            country = tag.text.strip()
-            countries += country + ", "
-        shipTo = countries.strip(", ")
-
-    # Finding the Product description
-    describe = soup.find('div', {'class': "pre-line"}).text
-    describe = describe.replace("\n", " ")
-    describe = describe.strip()
-
-    '''# Finding the Number of Product Reviews
-    tag = soup.findAll(text=re.compile('Reviews'))
-    for index in tag:
-        reviews = index
-        par = reviews.find('(')
-        if par >=0:
-            reviews = reviews.replace("Reviews (","")
-            reviews = reviews.replace(")","")
-            reviews = reviews.split(",")
-            review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
-        else :
-            review = "-1"'''
-
-    # Searching for CVE and MS categories
-    cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
-    if cve:
-        CVE = " "
-        for idx in cve:
-            CVE += (idx)
-            CVE += " "
-            CVE = CVE.replace(',', ' ')
-            CVE = CVE.replace('\n', '')
-    ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
-    if ms:
-        MS = " "
-        for im in ms:
-            MS += (im)
-            MS += " "
-        MS = MS.replace(',', ' ')
-        MS = MS.replace('\n', '')
+    # Finding the product name
+    name_of_product = soup.find("h1", {"class": "product_title entry-title"}).find("a").text
+    name = cleanString(name_of_product.strip())
+    # Finding the product description
+    description_of_product = soup.find("div", {"class": "woocommerce-product-details__short-description"}).text
+    describe = cleanString(description_of_product.strip())
+    # Finding the inquiries tab; an empty tab means zero replies
+    inquiries_about_product = soup.find("div", {"class": "woocommerce-Tabs-panel woocommerce-Tabs-panel--wcfm_enquiry_tab panel entry-content wc-tab"}).find("p").text
+    if inquiries_about_product == "There are no inquiries yet.":
+        review = "0"
+    else:
+        review = "-1"  # TODO: parse the actual number of inquiries
+
+    # Finding the terms and conditions
+    # NOTE: this reuses the enquiry-tab class; update the selector if the
+    # terms live under a different tab
+    terms_and_conditions = soup.find("div", {"class": "woocommerce-Tabs-panel woocommerce-Tabs-panel--wcfm_enquiry_tab panel entry-content wc-tab"}).find("p").text
+    terms = 
cleanString(terms_and_conditions.strip())
+
+    # Finding the vendor name
+    name_of_vendor = soup.find("div", {"class": "wcfmmp_sold_by_store"}).find("a").text
+    vendor = cleanString(name_of_vendor.strip())
+
+    # Finding the price of the item
+    price = soup.find("p", {"class": "price"}).find("bdi").text
+    price_cleaned = price[1:]  # drop the leading currency symbol (assumes a one-character symbol such as "$")
+    USD = price_cleaned.strip()
+    # Every other field keeps its default of "-1": it is not shown on this page
 
     # Populating the final variable (this should be a list with all fields scraped)
     row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
 
 
 #parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
 #stores info it needs in different lists, these lists are returned after being organized
 #@param: soup object looking at html page of listing page
 #return: 'row' that contains a variety of lists that each hold info on the listing page
-def darkfox_listing_parser(soup):
+def tormarket_listing_parser(soup):
 
     # Fields to be parsed
-    nm = 0                      # Total_Products (Should be Integer)
-    mktName = "DarkFox"         # 0 Marketplace_Name
-    name = []                   # 1 Product_Name
-    CVE = []                    # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
-    MS = []                     # 3 Product_MS_Classification (Microsoft Security)
-    category = []               # 4 Product_Category
-    describe = []               # 5 Product_Description
-    escrow = []                 # 6 Vendor_Warranty
-    views = []                  # 7 Product_Number_Of_Views
-    reviews = []                # 8 Product_Number_Of_Reviews
-    addDate = []                # 9 Product_AddDate
-    lastSeen = []               # 10 Product_LastViewDate
-    BTC = []                    # 11 Product_BTC_SellingPrice
-    USD = []                    # 12 Product_USD_SellingPrice
-    EURO = []                   # 13 Product_EURO_SellingPrice
-    sold = []                   # 14 Product_QuantitySold
-    qLeft =[]                   # 15 Product_QuantityLeft
-    shipFrom = []               # 16 Product_ShippedFrom
-    shipTo = []                 # 17 Product_ShippedTo
-    vendor = []                 # 18 Vendor
-    rating = []                 # 19 Vendor_Rating
-    success = []                # 20 Vendor_Successful_Transactions
-    href = []                   # 23 Product_Links (Urls)
-
-    listing = soup.findAll('div', {"class": "card"})
-
-    # Populating the Number of Products
-    nm = len(listing)
-
-    for a in listing:
-        bae = a.findAll('a', href=True)
-
-        # Adding the url to the list of urls
-        link = bae[0].get('href')
-        link = cleanLink(link)
-        href.append(link)
-
-        # Finding the Product
-        product = bae[1].find('p').text
-        product = product.replace('\n', ' ')
-        product = product.replace(",", "")
-        product = product.replace("...", "")
-        product = product.strip()
-        name.append(product)
-
-        bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
-
-        if len(bae) >= 5:
-            # Finding Prices
-            price = bae[0].text
-            ud = price.replace(" USD", " ")
-            # u = ud.replace("$","")
-            u = ud.replace(",", "")
-            u = u.strip()
-            USD.append(u)
-            # bc = (prc[1]).strip(' BTC')
-            # BTC.append(bc)
-
-            # Finding the Vendor
-            vendor_name = bae[1].find('a').text
-            vendor_name = vendor_name.replace(",", "")
-            vendor_name = vendor_name.strip()
-            vendor.append(vendor_name)
-
-            # Finding the Category
-            cat = bae[2].find('small').text
-            cat = cat.replace("Category: ", "")
-            cat = cat.replace(",", "")
-            cat = cat.strip()
-            category.append(cat)
-
-            # Finding Number Sold and Quantity Left
-            num = bae[3].text
-            num = num.replace("Sold: ", "")
-            num = num.strip()
-            sold.append(num)
-
-            quant = bae[4].find('small').text
-            quant = quant.replace("In stock: ", "")
-            quant = quant.strip()
-            qLeft.append(quant)
-
-            # Finding Successful Transactions
-            freq = bae[1].text
-            freq = freq.replace(vendor_name, "")
-            freq = re.sub(r'Vendor Level \d+', "", freq)
-            freq = freq.replace("(", 
"") - freq = freq.replace(")", "") - freq = freq.strip() - success.append(freq) + nm = 0 # *Total_Products (Should be Integer) + mktName = "TorMarket" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = [] # 6 Product_MS_Classification (Microsoft Security) + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + href = [] # 20 Product_Links + products_list = soup.find_all('li') + nm = 0 + for product in products_list: + try: + # Finding the name of the product + name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text + name_of_product_cleaned = cleanString(name_of_product.strip()) + print(name_of_product_cleaned) + name.append(name_of_product_cleaned) + #finding the URL + try: + url = product.find("div", {"class": "product-loop-content text-center"}).find("a").get("href") + print(url) + href.append(url) + except AttributeError as e: + print("I can't find the link") + raise e + + #finding the rating of the product + rating_score_of_product = product.find("div", {"class": "product-loop-content text-center"}).find("div").find("span").text + rating_item.append(cleanString(rating_score_of_product.strip())) + print("done") + #finding the rating of the vendors + rating_score_of_vendor = product.find("div", {"class": "wcfmmp-store-rating"}).find("strong").text + rating_vendor.append(cleanString(rating_score_of_vendor.strip())) + print("done") + #finding the cost in USD + cost = product.find("span", {"class": "woocommerce-Price-amount amount"}).text + USD.append(cost) + print("done") + #finding the name of the vendor + vendor_name = product.find("div", {"class": "wcfmmp_sold_by_wrapper"}).find("a").text + vendor.append(cleanString(vendor_name.strip())) + print("done") + #everything else appends a -1 + success.append("-1") + CVE.append("-1") + MS.append("-1") + category.append("-1") + describe.append("-1") + views.append("-1") + reviews.append("-1") + addDate.append("-1") + BTC.append("-1") + EURO.append("-1") + sold.append("-1") + qLeft.append("-1") + shipFrom.append("-1") + shipTo.append("-1") + print("Done! moving onto the next product!") + print(len(shipTo)) + nm += 1 + except AttributeError as e: + print("I'm somewhere I don't belong. 
I'm going to leave") + continue - # Searching for CVE and MS categories - cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if not cve: - cveValue="-1" - else: - cee = " " - for idx in cve: - cee += (idx) - cee += " " - cee = cee.replace(',', ' ') - cee = cee.replace('\n', '') - cveValue=cee - CVE.append(cveValue) - - ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) - if not ms: - MSValue="-1" - else: - me = " " - for im in ms: - me += (im) - me += " " - me = me.replace(',', ' ') - me = me.replace('\n', '') - MSValue=me - MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen, - BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href) + return organizeProducts( + marketplace = "TorMarket", + nm = nm, + vendor = vendor, + rating_vendor = rating_vendor, + success_vendor = success, + nombre = name, + CVE = CVE, + MS = MS, + category = category, + describe = describe, + views = views, + reviews = reviews, + rating_item = rating_item, + addDate = addDate, + BTC = BTC, + USD = USD, + EURO = EURO, + sold = sold, + qLeft = qLeft, + shipFrom = shipFrom, + shipTo = shipTo, + href = href + ) #called by the crawler to get description links on a listing page diff --git a/setup.ini b/setup.ini index 641d3f1..3ff6f3e 100644 --- a/setup.ini +++ b/setup.ini @@ -1,15 +1,17 @@ [TOR] -firefox_binary_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe -firefox_profile_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\TorBrowser\\Data\\Browser\\profile.default -geckodriver_path = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test\\selenium\\geckodriver.exe +firefox_binary_path = C:\Users\dabadcuber5\Desktop\Tor Browser\Browser\firefox.exe +firefox_profile_path = C:\Users\dabadcuber5\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default +geckodriver_path = C:\Users\dabadcuber5\dw_pipeline_test\selenium\geckodriver.exe [Project] -project_directory = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test -shared_folder = \\VBoxSvr\\Shared +project_directory = C:\Users\dabadcuber5\dw_pipeline_test +shared_folder = \\Mac\\Shared + + [PostgreSQL] ip = localhost username = postgres -password = password +password = Ilovelucky1! database = darkweb_markets_forums \ No newline at end of file