From 1d1d2e8e96781557eeee4b031c66e64881c2bff9 Mon Sep 17 00:00:00 2001
From: westernmeadow
Date: Thu, 8 Feb 2024 12:28:11 -0800
Subject: [PATCH] finished WeTheNorth and MikesGrandStore

---
 MarketPlaces/GoFish/crawler_selenium.py     |  25 ++--
 .../MikesGrandStore/crawler_selenium.py     |  29 ++---
 MarketPlaces/MikesGrandStore/parser.py      |  57 +++------
 MarketPlaces/WeTheNorth/crawler_selenium.py |  62 +++------
 MarketPlaces/WeTheNorth/parser.py           | 119 ++++++++++--------
 5 files changed, 123 insertions(+), 169 deletions(-)

diff --git a/MarketPlaces/GoFish/crawler_selenium.py b/MarketPlaces/GoFish/crawler_selenium.py
index 19612cb..aad1dc5 100644
--- a/MarketPlaces/GoFish/crawler_selenium.py
+++ b/MarketPlaces/GoFish/crawler_selenium.py
@@ -118,7 +118,9 @@ def getAccess():
 
 
 def login(driver):
-    input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
+
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '//*[@id="username"]')))
 
     # entering username and password into input boxes
     usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
@@ -131,6 +133,7 @@ def login(driver):
 
     submit = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/form/div[7]/input')
     submit.click()
+    input("Press ENTER when CAPTCHA is complete and home page has loaded\n")
 
     # wait for listing page show up (This Xpath may need to change based on different seed url)
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
@@ -175,17 +178,17 @@ def getInterestedLinks():
     links = []
 
     # Hosting and Security
-    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=84')
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=84')
     # Exploits and Kits
     links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=107')
     # Botnets and Malware
-    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
     # Other Software
-    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=108')
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=108')
     # Hacking Guide
-    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
     # Fraud (mostly carding)
-    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=128')
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=128')
 
     return links
 
@@ -222,12 +225,12 @@ def crawlForum(driver):
                     savePage(driver, driver.page_source, item)
                     driver.back()
 
-                    # comment out
-                    # break
-
-                    # comment out
+                    # # comment out
+                    # break
+                    #
+                    # # comment out
                     # if count == 1:
-                    # break
+                    #     break
 
                 try:
                     link = driver.find_element(by=By.XPATH, value='/html/body/div/div[3]/div[2]/div[2]/nav/ul/li[3]/a').get_attribute('href')
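
For reference, the explicit-wait pattern the GoFish login now opens with, restated as a
self-contained sketch. The locator and the 100-second timeout are taken from the hunk above;
the function name and the standalone framing are illustrative, not part of the patch.

import selenium.common.exceptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_login_form(driver, timeout=100):
    # Block until the username field is rendered and visible, then hand it back.
    # Raises selenium.common.exceptions.TimeoutException if it never appears.
    WebDriverWait(driver, timeout).until(EC.visibility_of_element_located(
        (By.XPATH, '//*[@id="username"]')))
    return driver.find_element(by=By.XPATH, value='//*[@id="username"]')
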
diff --git a/MarketPlaces/MikesGrandStore/crawler_selenium.py b/MarketPlaces/MikesGrandStore/crawler_selenium.py
index 492b306..cd3021e 100644
--- a/MarketPlaces/MikesGrandStore/crawler_selenium.py
+++ b/MarketPlaces/MikesGrandStore/crawler_selenium.py
@@ -26,7 +26,7 @@ from MarketPlaces.MikesGrandStore.parser import MikesGrandStore_links_parser
 from MarketPlaces.Utilities.utilities import cleanHTML
 
 counter = 1
-baseURL = 'http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion'
+baseURL = 'http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/'
 
 
 def startCrawling():
@@ -83,8 +83,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    # ff_prof.set_preference("network.dns.disablePrefetch", True)
-    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    ff_prof.set_preference("network.dns.disablePrefetch", True)
+    ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -159,11 +159,6 @@ def getFullPathName(url):
     return fullPath
 
 
-def getMKTName() -> str:
-    name = 'MikesGrandStore'
-    return name
-
-
 def getNameFromURL(url):
     global counter
     name = ''.join(e for e in url if e.isalnum())
@@ -178,6 +173,10 @@ def getInterestedLinks():
 
     # Hacking
     links.append('http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/product-category/hacking/')
+    # Carding
+    links.append('http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/product-category/carding/')
+    # Databases
+    links.append('http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/product-category/databases/')
 
     return links
 
@@ -215,11 +214,11 @@ def crawlForum(driver):
                     savePage(driver, driver.page_source, item)
                     driver.back()
 
-                    # comment out
-                    #break
-
-                    # comment out
-                    #if count == 1:
+                    # # comment out
+                    # break
+                    #
+                    # # comment out
+                    # if count == 1:
                     #     break
 
                 # go to next page
@@ -260,7 +259,3 @@ def productPages(html):
 
 def crawler():
     startCrawling()
-
-
-if __name__ == '__main__':
-    startCrawling()
\ No newline at end of file
diff --git a/MarketPlaces/MikesGrandStore/parser.py b/MarketPlaces/MikesGrandStore/parser.py
index 1207eb2..2889742 100644
--- a/MarketPlaces/MikesGrandStore/parser.py
+++ b/MarketPlaces/MikesGrandStore/parser.py
@@ -38,55 +38,40 @@ def MikesGrandStore_description_parser(soup):
 
     # Finding Product Name
     name = soup.find('h1', {'class': 'product-title product_title entry-title'}).text
-    name = name.replace('\n', ' ')
-    name = name.replace(",", "")
+    name = cleanString(name)
     name = name.strip()
 
-    divmb = soup.findAll('div', {'class': "mb-1"})
-
     # Finding Vendor
     # no vendor
     vendor = "MikesGrandStore"
 
     # Finding the Product Rating
     rating_item = soup.find('strong', {'class', 'rating'}).text
-    rating_item = rating_item.replace('\n', ' ')
-    rating_item = rating_item.replace(",", "")
+    rating_item = cleanNumbers(rating_item)
     rating_item = rating_item.strip()
 
     # Finding Number of Product Reviews
     review_container = soup.find('li', {'id': 'tab-title-reviews'})
     reviews = review_container.find('a').text
-    reviews = reviews.replace('Reviews', '')
-    reviews = reviews.replace('(', '')
-    reviews = reviews.replace(')', '')
-    reviews = reviews.replace('\n', ' ')
-    reviews = reviews.replace(",", "")
+    reviews = cleanNumbers(reviews)
     reviews = reviews.strip()
 
     # Finding Prices
-    USD = soup.find('span', {'class': 'woocommerce-Price-currencySymbol'}).next_sibling
-    USD = USD.replace('\n', ' ')
-    USD = USD.replace(",", "")
+    USD = soup.find('span', {'class': 'woocommerce-Price-currencySymbol'}).next_sibling.text
+    USD = cleanNumbers(USD)
     USD = USD.strip()
 
-    # Finding the Product Category
-    cat_container = soup.find('span', {'class': 'posted_in'})
-    cat = cat_container.findAll('a')
-    category = ""
-    for name in cat:
-        category = category + " " + name.text
-
     # Finding the Product Quantity Available
     stock = soup.find('p', {'class': 'stock in-stock'})
     if stock is not None:
         left = stock.text
-        left = left.replace("in stock", "")
+        left = cleanNumbers(left)
         left = left.strip()
 
     # Finding the Product description
-    desc_cont = soup.find('div', {'class': 'product-short-description'})
-    describe = desc_cont.find('p').text.strip()
+    describe = soup.find('div', {'id': 'tab-description'}).text
+    describe = cleanString(describe)
+    describe = describe.strip()
 
     # Finding Product Image
     image = soup.find('img', {'class': 'wp-post-image skip-lazy'})
@@ -158,29 +143,19 @@ def MikesGrandStore_listing_parser(soup):
     nm = len(listing)
 
     for a in listing:
-        bae = a.findAll('a', href=True)
-        lb = a.findAll('div', {"id": "littlebox"})
-
         # Adding the url to the list of urls
         link = a.find('a', {'class': 'woocommerce-LoopProduct-link woocommerce-loop-product__link'}).get('href')
         href.append(link)
 
         # Finding the Product
         product = a.find('a', {'class': 'woocommerce-LoopProduct-link woocommerce-loop-product__link'}).text
-        product = product.replace('\n', ' ')
-        product = product.replace(",", "")
-        product = product.replace("...", "")
+        product = cleanString(product)
         product = product.strip()
         name.append(product)
 
-        # Finding Product Image
-        product_image = a.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'})
-        product_image = product_image.get('src')
-        product_image = product_image.split('base64,')[-1]
-        image.append(product_image)
-
         # Finding Prices
-        price = a.find('span', {'class': 'woocommerce-Price-currencySymbol'}).next_sibling
+        price = a.find('span', {'class': 'woocommerce-Price-currencySymbol'}).next_sibling.text
+        price = cleanNumbers(price)
         price = price.strip()
         USD.append(price)
 
@@ -193,15 +168,16 @@ def MikesGrandStore_listing_parser(soup):
         # Finding the Category
         cat = a.find('p', {'class': 'category uppercase is-smaller no-text-overflow product-cat op-7'}).text
         cat = cat.replace("class:", "")
+        cat = cleanString(cat)
         cat = cat.strip()
         category.append(cat)
 
         # Finding product rating
         rating = a.find('strong', {'class': 'rating'}).text
+        rating = cleanNumbers(rating)
         rating = rating.strip()
         rating_item.append(rating)
 
-
         # Searching for CVE and MS categories
         cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
         if not cve:
@@ -244,11 +220,6 @@ def MikesGrandStore_links_parser(soup):
     container = soup.find('div', {"class": "products row row-small large-columns-3 medium-columns-3 small-columns-2 equalize-box"})
     listing = container.findAll('div', recursive=False)
 
-    # for a in listing:
-    #     bae = a.find('a', {"class": "text-info"}, href=True)
-    #     link = bae['href']
-    #     href.append(link)
-
     for a in listing:
         bae = a.findAll('a', href=True)
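
A minimal sketch of the WooCommerce price-extraction pattern used in the MikesGrandStore
parser above, run against made-up markup. The patch calls .next_sibling.text, which assumes
a BeautifulSoup version whose text nodes expose .text; str() is the version-independent
spelling used here. cleanNumbers is the repository's own utility helper, so the regex below
is only a stand-in for it.

import re
from bs4 import BeautifulSoup

html = ('<span class="price">'
        '<span class="woocommerce-Price-currencySymbol">$</span>1,299.00'
        '</span>')
soup = BeautifulSoup(html, 'html.parser')

symbol = soup.find('span', {'class': 'woocommerce-Price-currencySymbol'})
raw = str(symbol.next_sibling)      # the text node right after the "$" symbol
price = re.sub(r'[^\d.]', '', raw)  # stand-in for cleanNumbers()
print(price)                        # 1299.00
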
diff --git a/MarketPlaces/WeTheNorth/crawler_selenium.py b/MarketPlaces/WeTheNorth/crawler_selenium.py
index 1557688..ee5f768 100644
--- a/MarketPlaces/WeTheNorth/crawler_selenium.py
+++ b/MarketPlaces/WeTheNorth/crawler_selenium.py
@@ -27,12 +27,12 @@ from MarketPlaces.Utilities.utilities import cleanHTML
 import selenium
 
 counter = 1
-baseURL = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion'
+baseURL = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/'
 
 
 # Opens Tor Browser, crawls the website
 def startCrawling():
-    marketName = getMarketName()
+    marketName = getMKTName()
     driver = getAccess()
 
     if driver != 'down':
@@ -48,7 +48,6 @@ def startCrawling():
 
 # Login using premade account credentials and do login captcha manually
 def login(driver):
-    time.sleep(3)
     #wait for login page
     input("Press ENTER when CAPTCHA is completed\n")
 
@@ -60,32 +59,6 @@ def login(driver):
     #Password here
     passwordBox.send_keys('fishowal')
 
-
-    # wait for captcha page show up
-    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img")))
-
-    # save captcha to local
-    driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img').screenshot(
-        r'..\WeTheNorth\captcha.png')
-
-    # This method will show image in any image viewer
-    im = Image.open(r'..\WeTheNorth\captcha.png')
-
-    im.show()
-
-    # wait until input space show up
-    inputBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[4]/input')
-
-    # ask user input captcha solution in terminal
-    userIn = input("Enter solution: ")
-
-    # send user solution into the input space
-    inputBox.send_keys(userIn)
-
-    # click the verify(submit) button
-    driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div[2]/div/div[3]/form/div[5]/input").click()
-
     input("Press ENTER when CAPTCHA is completed\n")
 
     # wait for listing page show up (This Xpath may need to change based on different seed url)
@@ -93,11 +66,7 @@ def login(driver):
         (By.XPATH, '//*[@id="information"]')))
 
 # Returns the name of the website
-def getMarketName():
-    name = 'WeTheNorth'
-    return name
-
-def getMKTName() -> str:
+def getMKTName():
     name = 'WeTheNorth'
     return name
 
@@ -132,8 +101,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    # ff_prof.set_preference("network.dns.disablePrefetch", True)
-    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    ff_prof.set_preference("network.dns.disablePrefetch", True)
+    ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -201,10 +170,10 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # # Fraud Software
-    # links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=5&podcategory=3')
-    # # Guides and Tutorials - Hacking
-    # links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=3&podcategory=3')
+    # Fraud Software
+    links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=5&podcategory=3')
+    # Guides and Tutorials - Hacking
+    links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=3&podcategory=3')
     # Software and Malware
     links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=10')
 
@@ -222,7 +191,6 @@ def crawlForum(driver):
         print('Crawling :', link)
 
         try:
-            pg_counter = 1
            has_next_page = True
            count = 0
 
@@ -244,13 +212,17 @@ def crawlForum(driver):
                     savePage(driver, driver.page_source, item)
                     driver.back()
 
+                    # comment out
+                    break
+
+                # comment out
+                if count == 1:
+                    break
 
                 try:
                     nav = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[3]/div[3]/div[2]/div[7]')
-                    pg_counter += 1
-                    pg_counter_str = "p=" + str(pg_counter) + "&"
-                    a = nav.find_element(by=By.XPATH, value='//a[contains(@href,"'+pg_counter_str+'")]')
+                    a = nav.find_element(by=By.XPATH, value=f'//a[contains(@href,"p={count + 2}&")]')
                     link = a.get_attribute('href')
 
                     if link == "":
                         raise NoSuchElementException
@@ -263,7 +235,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1
 
-    input("Crawling WeTheNorth market done sucessfully. Press ENTER to continue\n")
+    print("Crawling WeTheNorth market done.")
 
 
 # Returns 'True' if the link is Topic link
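
The pagination rewrite above derives the next-page selector straight from the page counter:
assuming page one is served without a p= query parameter (which is what the count + 2 offset
implies), the crawler looks for p=2 after the first listing page, p=3 after the second, and
so on. A quick check of the XPath strings the f-string produces:

for count in range(3):
    print(f'//a[contains(@href,"p={count + 2}&")]')
# //a[contains(@href,"p=2&")]
# //a[contains(@href,"p=3&")]
# //a[contains(@href,"p=4&")]
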
diff --git a/MarketPlaces/WeTheNorth/parser.py b/MarketPlaces/WeTheNorth/parser.py
index b530c93..73941bb 100644
--- a/MarketPlaces/WeTheNorth/parser.py
+++ b/MarketPlaces/WeTheNorth/parser.py
@@ -31,39 +31,35 @@ def wethenorth_description_parser(soup):
     left = "-1"                             # 16 Product_QuantityLeft
     shipFrom = "-1"                         # 17 Product_ShippedFrom
     shipTo = "-1"                           # 18 Product_ShippedTo
+    image = "-1"                            # 19 Product_Image
+    vendor_image = "-1"                     # 20 Vendor_Image
 
     # Finding Product Name
     listDes = soup.find('div', {'class': "listDes"})
     name = listDes.find('h2').text
-    name = name.replace('\n', ' ')
-    name = name.replace(",", "")
+    name = cleanString(name)
     name = name.strip()
 
     # Finding Vendor
     vendor = listDes.find('b').text
-    vendor = vendor.replace(",", "")
-    vendor = vendor.replace("...", "")
-    vendor = vendor.replace("-", "")
+    vendor = cleanString(vendor)
     vendor = vendor.strip()
 
     # Finding Vendor Rating
-    rating = listDes.find('span',{'class':'levelSet'})
+    rating = listDes.find('span', {'class': 'levelSet'})
     rating = rating.text
-    rating = rating.replace('\n', ' ')
-    rating = rating.replace(",", "")
-    rating = rating.strip()
-
-    # Finding Successful Transactions
-    success = listDes.find_all('p')[1]
-    success = success.find('span').text
-    success = success.split()
-    success = success[0].strip()
+    rating = cleanNumbers(rating)
+    rating_vendor = rating.strip()
 
     # Finding Prices - all prices in We The North are in CAD, I left the CAD in the resulting String so that it would show CAD for all prices
-    padp = listDes.find('p',{'class':'padp'})
+    padp = listDes.find('p', {'class': 'padp'})
     USD = padp.find('span').text
     USD = USD.strip()
 
+    BTC = padp.find_next_sibling('p').text
+    BTC = cleanNumbers(BTC)
+    BTC = BTC.strip()
+
     # Finding Escrow - no escrow on WTN market
 
     shipping_info = listDes.find('tbody')
@@ -73,47 +69,32 @@ def wethenorth_description_parser(soup):
 
     # Finding Shipment Information (Origin)
     shipFrom = row1[-1].text
-    shipFrom=shipFrom.strip()
-    if shipFrom=="":
-        shipFrom="-1"
+    shipFrom = cleanString(shipFrom)
+    shipFrom = shipFrom.strip()
+    if shipFrom == "":
+        shipFrom = "-1"
 
     row2 = shipping_info[1].find_all('td')
 
     # Finding Shipment Information (Destination)
     shipTo = row2[-1].text
-    shipTo= shipTo.strip()
+    shipTo = cleanString(shipTo)
+    shipTo = shipTo.strip()
     if shipTo == "":
         shipTo = "-1"
 
     # Finding the Product description
-    describe = soup.find("div",{'class':'tabcontent'})
+    describe = soup.find("div", {'class': 'tabcontent'})
     describe = describe.find('p').text
-    describe = describe.replace("\n", " ")
-    describe = describe.replace("\r", " ")
+    describe = cleanString(describe)
     describe = describe.strip()
 
-    # cannot find any tag for these
-    '''
-    # Finding the Number of Product Reviews
-    tag = soup.findAll(text=re.compile('Reviews'))
-    for index in tag:
-        reviews = index
-        par = reviews.find('(')
-        if par >=0:
-            reviews = reviews.replace("Reviews (","")
-            reviews = reviews.replace(")","")
-            reviews = reviews.split(",")
-            review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
-        else :
-            review = "-1"
-    '''
-
     # Searching for CVE and MS categories
     # no CVE or MS for WTN market
 
     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
-           BTC, USD, EURO, sold, left, shipFrom, shipTo)
+           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
 
     # Sending the results
     return row
@@ -144,7 +125,9 @@ def wethenorth_listing_parser(soup):
     qLeft =[]                               # 17 Product_QuantityLeft
     shipFrom = []                           # 18 Product_ShippedFrom
     shipTo = []                             # 19 Product_ShippedTo
-    href = []                               # 20 Product_Links
+    image = []                              # 20 Product_Image
+    image_vendor = []                       # 21 Vendor_Image
+    href = []                               # 22 Product_Links
 
     right_content = soup.find('div', {"class": "right-content"})
     listing = right_content.findAll('div', {"class": "col-1search"})
@@ -158,20 +141,19 @@ def wethenorth_listing_parser(soup):
 
         # Adding the url to the list of urls
         link = bae[0].get('href')
-        link = cleanLink(link)
         href.append(link)
 
         # Finding the Vendor
         vendor_name = a.find('p', {'class': 'padp'})
         vendor_name = vendor_name.find('a').text
-        vendor_name = vendor_name.replace(",", "")
+        vendor_name = cleanString(vendor_name)
         vendor_name = vendor_name.strip()
         vendor.append(vendor_name)
 
         # Finding the Product
-        product = bae[0].text
-        product = product.replace('\n', ' ')
-        product = product.replace(",", "")
+        product = a.find('div', {'class': 'col-1centre'})
+        product = product.find('div', {'class': 'head'}).find('a').text
+        product = cleanString(product)
         product = product.strip()
         name.append(product)
 
@@ -179,25 +161,56 @@ def wethenorth_listing_parser(soup):
         # Finding the Category
         category_name = a.find('p', {'class': 'padp'}).text
         first_dash = category_name.find('-')
         second_dash = category_name[first_dash+1:].find('-')
-        category_name = category_name[first_dash+1:second_dash]
-        category_name=category_name.strip()
+        category_name = category_name[first_dash+1: first_dash+1+second_dash]
+        category_name = cleanString(category_name)
+        category_name = category_name.strip()
         category.append(category_name)
 
+        # Finding Successful Transactions
+        vendor_success = a.find('p', {'class': 'padp'}).text
+        first_dash = vendor_success.find('(')
+        vendor_success = vendor_success[first_dash + 1:]
+        vendor_success = cleanNumbers(vendor_success)
+        vendor_success = vendor_success.strip()
+        success.append(vendor_success)
+
         # Finding Views
         view_count = a.text
         view_count = view_count[view_count.find('Views:'): view_count.find('Sales:')]
         view_count = view_count.replace('Views:', ' ')
-        view_count = view_count.replace('/', ' ')
+        view_count = cleanNumbers(view_count)
         view_count = view_count.strip()
         views.append(view_count)
 
-        # Finding success sales
+        # Finding Quantity Sold
         sold_count = a.text
         sold_count = sold_count[sold_count.find('Sales:'): sold_count.find('Short')]
         sold_count = sold_count.replace('Sales:', ' ')
-        sold_count = sold_count.replace('/', ' ')
+        sold_count = cleanNumbers(sold_count)
         sold_count = sold_count.strip()
-        success.append(sold_count)
+        sold.append(sold_count)
+
+        right = a.find('div', {'class': 'col-1right'})
+
+        # Finding USD
+        usd = right.find('a').text
+        usd = "CAD " + usd.strip()
+        USD.append(usd)
+
+        # Finding BTC
+        btc = right.text
+        first_dash = btc.find('(')
+        second_dash = btc[first_dash + 1:].find(')')
+        btc = btc[first_dash + 1: first_dash+1+second_dash]
+        btc = cleanNumbers(btc)
+        btc = btc.strip()
+        BTC.append(btc)
+
+        # Finding Product Image
+        product_image = right.find('img')
+        product_image = product_image.get('src')
+        product_image = product_image.split('base64,')[-1]
+        image.append(product_image)
 
         # Searching for CVE and MS categories
         # no CVE or MS in WTN market
@@ -229,7 +242,7 @@ def wethenorth_listing_parser(soup):
 
     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
-                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
 
 
 def wethenorth_links_parser(soup):
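
A worked example of the delimiter slicing used in the wethenorth_listing_parser hunk above,
with made-up sample strings. second_dash is located inside the substring that starts at
first_dash + 1, so its absolute position is first_dash + 1 + second_dash; using that sum as
the slice end keeps everything up to, but not including, the second delimiter:

listing_text = "vendorName (321) - Software and Malware - 2024"
first_dash = listing_text.find('-')
second_dash = listing_text[first_dash + 1:].find('-')
category = listing_text[first_dash + 1: first_dash + 1 + second_dash]
print(category.strip())                     # Software and Malware

btc_text = "CAD 150.00 (0.00123)"
start = btc_text.find('(')
end = btc_text[start + 1:].find(')')
print(btc_text[start + 1: start + 1 + end]) # 0.00123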