From dcc07f5d2bc68099000a8ca46581afe833141204 Mon Sep 17 00:00:00 2001
From: Helium
Date: Mon, 12 Feb 2024 13:18:20 -0800
Subject: [PATCH] completed GoFish

---
 MarketPlaces/GoFish/crawler_selenium.py |  64 +++---
 MarketPlaces/GoFish/parser.py           | 261 ++++++++----------------
 2 files changed, 118 insertions(+), 207 deletions(-)

diff --git a/MarketPlaces/GoFish/crawler_selenium.py b/MarketPlaces/GoFish/crawler_selenium.py
index aad1dc5..65b2133 100644
--- a/MarketPlaces/GoFish/crawler_selenium.py
+++ b/MarketPlaces/GoFish/crawler_selenium.py
@@ -31,15 +31,15 @@ baseURL = 'http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion

 def startCrawling():
     mktName = getMKTName()
-    driver = getAccess()
+    # driver = getAccess()

-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closeDriver(driver)
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closeDriver(driver)

     new_parse(mktName, baseURL, True)

@@ -82,8 +82,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    # ff_prof.set_preference("network.dns.disablePrefetch", True)
-    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    ff_prof.set_preference("network.dns.disablePrefetch", True)
+    ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -118,9 +118,9 @@ def getAccess():


 def login(driver):
-
+    # input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, '//*[@id="username"]')))
+        (By.XPATH, '//*[@id="js-off"]')))

     # entering username and password into input boxes
     usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
@@ -129,15 +129,12 @@ def login(driver):
     passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
     # Password here
     passwordBox.send_keys('DementedBed123-')
-    # submit
-    submit = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/form/div[7]/input')
-    submit.click()

-    input("Press ENTER when CAPTCHA is complete and home page has loaded\n")
+    input("Press ENTER when the CAPTCHA is solved and you are logged in\n")

     # wait for listing page show up (This Xpath may need to change based on different seed url)
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, '/html/body/div/div[3]/div[2]/div[1]')))
+        (By.XPATH, '/html/body/div/div[3]/div[1]/div[3]')))


 def savePage(driver, page, url):
@@ -177,18 +174,20 @@ def getNameFromURL(url):

 def getInterestedLinks():
     links = []

-    # Hosting and Security
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=84')
-    # Exploits and Kits
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=107')
-    # Botnets and Malware
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
-    # Other Software
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=108')
-    # Hacking Guide
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
-    # Fraud (mostly carding)
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=128')
+    # Fraud Software
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=135')
+    # # hacking guide
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
+    # # malware tutorial
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=130')
+    # # programming tutorial
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=131')
+    # # social engineering tutorial
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=86')
+    # # botnets
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
+    # # exploits
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=107')

     return links

@@ -233,10 +232,9 @@ def crawlForum(driver):
                         # break

                     try:
-                        link = driver.find_element(by=By.XPATH, value='/html/body/div/div[3]/div[2]/div[2]/nav/ul/li[3]/a').get_attribute('href')
+                        link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
                         if link == "":
                             raise NoSuchElementException
-                        link = urlparse.urljoin(baseURL, str(link))
                         count += 1

                     except NoSuchElementException:
@@ -251,14 +249,14 @@ def crawlForum(driver):


 # Returns 'True' if the link is Topic link, may need to change for every website
 def isDescriptionLink(url):
-    if 'a=' in url:
+    if '.onion/?c' not in url:
         return True
     return False


 # Returns True if the link is a listingPage link, may need to change for every website
 def isListingLink(url):
-    if 'c=' in url:
+    if '.onion/?c' in url:
         return True
     return False

diff --git a/MarketPlaces/GoFish/parser.py b/MarketPlaces/GoFish/parser.py
index 6efbae3..5502532 100644
--- a/MarketPlaces/GoFish/parser.py
+++ b/MarketPlaces/GoFish/parser.py
@@ -2,6 +2,7 @@ __author__ = 'DarkWeb'

 # Here, we are importing the auxiliary functions to clean or convert data
 from MarketPlaces.Utilities.utilities import *
+
 # Here, we are importing BeautifulSoup to search through the HTML tree
 from bs4 import BeautifulSoup

@@ -35,105 +36,50 @@ def gofish_description_parser(soup):
     image = "-1"                          # 19 Product_Image
     vendor_image = "-1"                   # 20 Vendor_Image

-    # Finding Product Name
-
-    divmb = soup.find('div', {'class': "p-3 mb-1 fs-3 fw-bold border border-2 bg-white rounded"})
-    if divmb is None:
-        divmb = soup.find('div', {'class': "p-3 mb-1 fs-4 fw-bold border border-2 bg-white rounded"})
-
-    name = divmb.text
-    name = name.replace('\n', ' ')
-    name = name.replace('\r', ' ')
-    name = name.replace('\t', ' ')
-    name = name.replace(",", "")
-    name = name.strip()
-
-    # Finding Vendor
-    vendor = soup.find('div', {'class': 'my-1'}).find('a').text.strip()
-
-    # Finding Vendor Rating
-    # temp = soup.find('div', {'class': ""}).text
-    # temp = temp.split('(')
-    # rating = temp[0].replace("Vendor's Review : ", "")
-    # rating = rating.replace("%", "")
-    # rating_vendor = rating.strip()
-
-    # Finding the Product Rating and Number of Product Reviews
-    # reviews = temp[2].replace(" review)", "")
-    # reviews = reviews.strip()
-
-    # temp = temp[1].split(")")
-    # rating = temp[1].replace("Product Review : ", "")
-    # rating = rating.replace("%", "")
-    # rating_item = rating.strip()
-
-    # Finding Prices
-    precios = soup.findAll('td', {'class': "text-end text-nowrap"})
-    USD = precios[0].text.strip().replace('$', '')
-
-    # Finding the Product Category
-    # pmb = soup.findAll('p', {'class': "mb-1"})
-
-    # category = pmb[-1].text
-    # category = category.replace("Category: ", "").strip()
-
-    # Finding the Product Quantity Available
-    # left = divmb[-1].text
-    # left = left.split(",", 1)[1]
-    # left = left.replace("in stock", "")
-    # left = left.strip()
-
-    # Finding Number Sold
-    # sold = divmb[-1].text
-    # sold = sold.split(",", 1)[0]
-    # sold = sold.replace("sold", "")
-    # sold = sold.strip()
-
-    # Finding Shipment Information (Origin)
-    origin = soup.findAll('div', {'class': "p-3 mt-2 mb-3 border border-2 bg-white rounded"})
-    remove = origin[0].find('span').text.strip()
-    origin = origin[0].text.strip()
-    origin = origin.replace(remove, '')
-
-    shipFrom = origin.strip()
-
-    # Finding Shipment Information (Destination)
-    dest = soup.findAll('div', {'class': 'p-3 mb-3 overflow-auto border border-2 bg-white rounded'})
-    dest = dest[-1].text.strip()
-    dest = dest.replace('[', '')
-    dest = dest.replace(']', '')
-
-    shipTo = dest[1:].strip()
-
-    # Finding the Product description
-    cardbody = soup.findAll('div', {'class': "p-3 mb-3 overflow-auto border border-2 bg-white rounded"})
-    describe = cardbody[0].text
-    describe = describe.replace('\n', ' ')
-    describe = describe.strip()
+    temp = soup.find('div', {'class': 'col-lg-5'})
+
+    # find vendor name
+    vendor = temp.find('a', {'class': 'text-decoration-none fw-bold'})
+    if vendor is None:
+        print('vendor')
+    vendor = vendor.text.strip()
+
+    # find product name (last breadcrumb item)
+    temp2 = soup.find('nav', {'aria-label': 'breadcrumb'}).findAll('li', {'class': 'breadcrumb-item'})
+    name = soup.find('li', {'class': 'breadcrumb-item active text-truncate'})
+    if name is None:
+        print('name')
+    name = name.text.strip()
+
+    # find product description
+    describe = soup.find('div', {'class': 'p-3 mb-3 overflow-auto border border-2 bg-white rounded'})
+    if describe is None:
+        print('describe')
+    describe = cleanString(describe.text)
+
+    # find product category (third breadcrumb item)
+    if len(temp2) > 2:
+        category = temp2[2].text.strip()
+    else:
+        print('category')
+
+    # find USD price
+    USD = soup.find('td', {'class': 'text-end text-nowrap'})
+    if USD is None:
+        print('USD')
+    USD = USD.text.strip()
+
+    # find shipping origin and destination; a non-alphanumeric
+    # destination is treated as worldwide shipping
+    ships = soup.findAll('span', {'class': 'lh-1 me-2 fs-4'})
+    shipFrom = ships[0].text.strip()
+
+    shipTo = ships[1].text.strip() if len(ships) > 1 else shipFrom
+    if not shipTo.isalnum():
+        shipTo = 'Worldwide'

     # Finding Product Image
-    img = soup.findAll('figure', {'class': 'image-feature'})[0]
-    image = img.find('img', {'class': 'image-block rounded'})
-    image = image.get('src')
-    image = image.split('base64,')[-1]
-
-    # Searching for CVE and MS categories
-    cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
-    if cve:
-        CVE = " "
-        for idx in cve:
-            CVE += (idx)
-            CVE += " "
-            CVE = CVE.replace(',', ' ')
-            CVE = CVE.replace('\n', '')
-    ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
-    if ms:
-        MS = " "
-        for im in ms:
-            MS += (im)
-            MS += " "
-            MS = MS.replace(',', ' ')
-            MS = MS.replace('\n', '')
+    image = soup.find('figure', {"class": 'image-feature'}).find('img')
+    if image is not None:
+        image = image.get('src')
+        image = image.split('base64,')[-1]
+    else:
+        print('img')
+        image = "-1"
+

     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
@@ -175,20 +121,26 @@ def gofish_listing_parser(soup):
     image_vendor = []                       # 21 Vendor_Image
     href = []                               # 22 Product_Links

-    listing = soup.find('tbody', {"class": "border border-2 align-middle"}).findAll('tr')
+    temp = soup.find('div', {"class": "col-9"})
+    cat = temp.find('nav', {'aria-label': 'breadcrumb'}).find('li', {'class': 'breadcrumb-item active'}).text.strip()
+    cat = cleanString(cat)
+
+    listing = temp.find('tbody', {"class": 'border border-2 align-middle'}).findAll('tr')

     # Populating the Number of Products
     nm = len(listing)

     for a in listing:
-        bae = a.findAll('a', href=True)
+
+        category.append(cat)

         # Adding the url to the list of urls
-        link = bae[0].get('href')
+        link = a.find('a').get('href')
+        link = cleanLink(link)
         href.append(link)

-        # Finding the Product
-        product = bae[1].text
+        # Finding the Product name
+        product = a.find('a', {"class": 'text-decoration-none'}).text
         product = product.replace('\n', ' ')
         product = product.replace(",", "")
         product = product.replace("...", "")
@@ -196,83 +148,50 @@ def gofish_listing_parser(soup):
         name.append(product)

         # Finding Product Image
-        product_image = bae[0].find('img')
+        product_image = a.find('img')
         product_image = product_image.get('src')
         product_image = product_image.split('base64,')[-1]
         image.append(product_image)

-        # Finding Prices
-        price = a.find('span', {"class": "fw-bold text-nowrap"}).text
-        price = price.replace("$","")
-        price = price.strip()
-        USD.append(price)
-
         # Finding the Vendor
-        vendor_name = bae[-1].text
+        vendor_name = a.find('a', {"class": 'text-decoration-none fw-bold'}).text
         vendor_name = vendor_name.replace(",", "")
         vendor_name = vendor_name.strip()
         vendor.append(vendor_name)

+        # image vendor
         image_vendor.append("-1")

-        # Finding the Category
-        # cat = lb[-1].find("span").text
-        # cat = cat.replace("class:", "")
-        # cat = cat.strip()
-        # category.append(cat)
-
-        # span = lb[0].findAll("span")
-
-        # Finding Number of Views
-        # num = span[0].text
-        # num = num.replace("views:", "")
-        # num = num.strip()
-        # sold.append(num)
-
-        # Finding Number Sold
-        # num = span[2].text
-        # num = num.replace("Sold:", "")
-        # num = num.strip()
-        # sold.append(num)
-
-        # Finding Quantity Left
-        # quant = span[1].text
-        # quant = quant.replace("stock:", "")
-        # quant = quant.strip()
-        # qLeft.append(quant)
-
-        # add shipping information
-        # ship = lb[2].findAll('small')[1].findAll('span')[1].text.split("->")
-        # shipFrom.append(ship[0].replace("Ship from ", "").strip())
-        # shipTo.append(ship[1].replace("to ", "").strip())
-
-        # Searching for CVE and MS categories
-        cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
-        if not cve:
-            cveValue = "-1"
-        else:
-            cee = " "
-            for idx in cve:
-                cee += (idx)
-                cee += " "
-                cee = cee.replace(',', ' ')
-                cee = cee.replace('\n', '')
-            cveValue = cee
-        CVE.append(cveValue)
-
-        ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
-        if not ms:
-            MSValue = "-1"
+        # USD
+        usd = a.find('div', {'class': 'text-nowrap'}).find('span', {'class': 'fw-bold text-nowrap'}).text.strip()
+        USD.append(usd)
+
+        # shipping origin and destination; a non-alphanumeric
+        # destination is treated as worldwide shipping
+        spans = a.findAll('span', {'class': 'fs-4 lh-1'})
+        shipF = spans[0].text.strip()
+        shipFrom.append(shipF)
+
+        shipT = spans[1].text.strip() if len(spans) > 1 else shipF
+
+        if shipT.isalnum():
+            shipTo.append(shipT)
         else:
-            me = " "
-            for im in ms:
-                me += (im)
-                me += " "
-                me = me.replace(',', ' ')
-                me = me.replace('\n', '')
-            MSValue = me
-        MS.append(MSValue)
+            shipTo.append('Worldwide')
+
+        # fields not present on GoFish listing pages
+        rating_vendor.append('-1')
+        success.append('-1')
+        CVE.append('-1')
+        MS.append('-1')
+        describe.append('-1')
+        views.append('-1')
+        reviews.append('-1')
+        rating_item.append('-1')
+        addDate.append('-1')
+        BTC.append('-1')
+        EURO.append('-1')
+        sold.append('-1')
+        qLeft.append('-1')

     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
@@ -286,13 +205,7 @@ def gofish_links_parser(soup):

     # Returning all links that should be visited by the Crawler
     href = []

-    listing = soup.find('tbody', {'class': 'border border-2 align-middle'})
-    listing = soup.findAll('tr')
-    listing = listing[1:]
-    # for a in listing:
-    #     bae = a.find('a', {"class": "text-info"}, href=True)
-    #     link = bae['href']
-    #     href.append(link)
+    listing = soup.find('div', {"class": "col-9"}).find('tbody', {'class': 'border border-2 align-middle'}).findAll('tr')

     for a in listing:
         bae = a.findAll('a', href=True)
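
Note: the new URL routing can be sanity-checked offline, without Tor access. Below is a minimal sketch; the two predicates are copied from crawler_selenium.py above, the listing URL comes from getInterestedLinks(), and the product URL is a hypothetical example, since the patch only establishes that listing pages contain '.onion/?c' and description pages do not.

    # Offline check of the crawler's new link-classification heuristics.

    def isDescriptionLink(url):
        # A link is treated as a product (description) page
        # whenever it is not a category listing.
        if '.onion/?c' not in url:
            return True
        return False


    def isListingLink(url):
        # Category listing pages always carry the '?c=<id>' query.
        if '.onion/?c' in url:
            return True
        return False


    if __name__ == '__main__':
        base = 'http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion'
        listing = base + '/?c=135'      # Fraud Software category (from getInterestedLinks)
        product = base + '/?a=example'  # hypothetical product URL, not taken from the patch

        assert isListingLink(listing) and not isDescriptionLink(listing)
        assert isDescriptionLink(product) and not isListingLink(product)
        print('link heuristics OK')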
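Similarly, the breadcrumb-driven selectors in gofish_description_parser can be smoke-tested against synthetic markup. In the sketch below only the class names are taken from the patch; the surrounding HTML structure and the field values are assumptions, so this demonstrates the selector logic rather than the live page.

    # Smoke test for the breadcrumb-based field extraction used by the parser.
    from bs4 import BeautifulSoup

    html = """
    <nav aria-label="breadcrumb">
      <ol>
        <li class="breadcrumb-item">Home</li>
        <li class="breadcrumb-item">Market</li>
        <li class="breadcrumb-item">Fraud Software</li>
        <li class="breadcrumb-item active text-truncate">Example Product</li>
      </ol>
    </nav>
    <div class="col-lg-5">
      <a class="text-decoration-none fw-bold">example_vendor</a>
    </div>
    <table><tr><td class="text-end text-nowrap">19.99 USD</td></tr></table>
    """

    soup = BeautifulSoup(html, 'html.parser')

    crumbs = soup.find('nav', {'aria-label': 'breadcrumb'}).findAll('li', {'class': 'breadcrumb-item'})
    name = soup.find('li', {'class': 'breadcrumb-item active text-truncate'}).text.strip()
    vendor = soup.find('div', {'class': 'col-lg-5'}).find('a', {'class': 'text-decoration-none fw-bold'}).text.strip()
    category = crumbs[2].text.strip()
    usd = soup.find('td', {'class': 'text-end text-nowrap'}).text.strip()

    print(name, '|', vendor, '|', category, '|', usd)
    # Example Product | example_vendor | Fraud Software | 19.99 USD

If the live pages nest these elements differently, only the synthetic HTML needs updating; the selector expressions are the ones shipped in the patch.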