diff --git a/MarketPlaces/GoFish/crawler_selenium.py b/MarketPlaces/GoFish/crawler_selenium.py index dbdbd50..42f7bfa 100644 --- a/MarketPlaces/GoFish/crawler_selenium.py +++ b/MarketPlaces/GoFish/crawler_selenium.py @@ -41,7 +41,7 @@ def startCrawling(): print(driver.current_url, e) closeDriver(driver) - new_parse(mktName, baseURL, True) + # new_parse(mktName, baseURL, True) # Returns the name of the website @@ -121,21 +121,20 @@ def login(driver): input("Press ENTER when CAPTCHA is complete and login page has loaded\n") # entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="username"]') + usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') # Username here - usernameBox.send_keys('aliciamykeys') - passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="password"]') + usernameBox.send_keys('itsmedio') + passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') # Password here - passwordBox.send_keys('aliciawherearemykey$') - # session time - session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[4]/div/div[2]/select')) - session_select.select_by_visible_text('Session 60min') + passwordBox.send_keys('DementedBed123-') + # submit + submit = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/form/div[7]/input') + submit.click() - input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n") # wait for listing page show up (This Xpath may need to change based on different seed url) WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '//*[@id="submit"]'))) + (By.XPATH, '/html/body/div/div[3]/div[2]/div[1]'))) def savePage(driver, page, url): @@ -176,17 +175,17 @@ def getInterestedLinks(): links = [] # Hosting and Security - links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=84') + # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=84') # Exploits and Kits links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=107') # Botnets and Malware - links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97') + # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97') # Other Software - links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=108') + # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=108') # Hacking Guide - links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129') + # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129') # Fraud (mostly carding) - links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=128') + # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=128') return links @@ -212,7 +211,6 @@ def crawlForum(driver): driver.refresh() html = driver.page_source savePage(driver, html, link) - list = productPages(html) for item in list: @@ -225,16 +223,17 @@ def crawlForum(driver): driver.back() # comment out - break + # break # comment out - if count == 1: - break + # if count == 1: + # break try: - link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href') + link = driver.find_element(by=By.XPATH, value='/html/body/div/div[3]/div[2]/div[2]/nav/ul/li[3]/a').get_attribute('href') if link == "": raise NoSuchElementException + link = urlparse.urljoin(baseURL, str(link)) count += 1 except NoSuchElementException: @@ -249,14 +248,14 @@ def crawlForum(driver): # Returns 'True' if the link is Topic link, may need to change for every website def isDescriptionLink(url): - if 'item' in url: + if 'a=' in url: return True return False # Returns True if the link is a listingPage link, may need to change for every website def isListingLink(url): - if 'category=' in url: + if 'c=' in url: return True return False diff --git a/MarketPlaces/GoFish/parser.py b/MarketPlaces/GoFish/parser.py index 61cbd48..de10035 100644 --- a/MarketPlaces/GoFish/parser.py +++ b/MarketPlaces/GoFish/parser.py @@ -271,8 +271,9 @@ def gofish_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] - listing = soup.findAll('div', {"id": "itembox"}) - + listing = soup.find('tbody', {'class': 'border border-2 align-middle'}) + listing = soup.findAll('tr') + listing = listing[1:] # for a in listing: # bae = a.find('a', {"class": "text-info"}, href=True) # link = bae['href'] diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt index b656841..f81b343 100644 --- a/MarketPlaces/Initialization/marketsList.txt +++ b/MarketPlaces/Initialization/marketsList.txt @@ -1 +1 @@ -CypherMarketplace \ No newline at end of file +GoFish \ No newline at end of file diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py index e5fe69a..5f14289 100644 --- a/MarketPlaces/Initialization/markets_mining.py +++ b/MarketPlaces/Initialization/markets_mining.py @@ -15,6 +15,7 @@ from MarketPlaces.ViceCity.crawler_selenium import crawler as crawlerViceCity from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar from MarketPlaces.Ares.crawler_selenium import crawler as crawlerAres +from MarketPlaces.GoFish.crawler_selenium import crawler as crawlerGoFish import configparser import os @@ -110,5 +111,7 @@ if __name__ == '__main__': crawlerPabloEscobar() elif mkt == "Ares": crawlerAres() + elif mkt == "GoFish": + crawlerGoFish() print("\nScraping process completed!")