Browse Source

finished crawler for GoFish

main
Joshua 1 year ago
parent
commit
48b3fee56a
4 changed files with 28 additions and 25 deletions
  1. +21
    -22
      MarketPlaces/GoFish/crawler_selenium.py
  2. +3
    -2
      MarketPlaces/GoFish/parser.py
  3. +1
    -1
      MarketPlaces/Initialization/marketsList.txt
  4. +3
    -0
      MarketPlaces/Initialization/markets_mining.py

+ 21
- 22
MarketPlaces/GoFish/crawler_selenium.py View File

@ -41,7 +41,7 @@ def startCrawling():
print(driver.current_url, e)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# new_parse(mktName, baseURL, True)
# Returns the name of the website
@ -121,21 +121,20 @@ def login(driver):
input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
# entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="username"]')
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
# Username here
usernameBox.send_keys('aliciamykeys')
passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="password"]')
usernameBox.send_keys('itsmedio')
passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
# Password here
passwordBox.send_keys('aliciawherearemykey$')
# session time
session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[4]/div/div[2]/select'))
session_select.select_by_visible_text('Session 60min')
passwordBox.send_keys('DementedBed123-')
# submit
submit = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/form/div[7]/input')
submit.click()
input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n")
# wait for the listing page to show up (this XPath may need to change depending on the seed URL)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="submit"]')))
(By.XPATH, '/html/body/div/div[3]/div[2]/div[1]')))
def savePage(driver, page, url):
@ -176,17 +175,17 @@ def getInterestedLinks():
links = []
# Hosting and Security
links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=84')
# links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=84')
# Exploits and Kits
links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=107')
# Botnets and Malware
links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
# links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
# Other Software
links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=108')
# links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=108')
# Hacking Guide
links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
# links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
# Fraud (mostly carding)
links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=128')
# links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=128')
return links
@ -212,7 +211,6 @@ def crawlForum(driver):
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
list = productPages(html)
for item in list:
@ -225,16 +223,17 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:
break
# if count == 1:
# break
try:
link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
link = driver.find_element(by=By.XPATH, value='/html/body/div/div[3]/div[2]/div[2]/nav/ul/li[3]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
link = urlparse.urljoin(baseURL, str(link))
count += 1
except NoSuchElementException:
@ -249,14 +248,14 @@ def crawlForum(driver):
# Returns True if the link is a description link; may need to change for every website
def isDescriptionLink(url):
if 'item' in url:
if 'a=' in url:
return True
return False
# Returns True if the link is a listing-page link; may need to change for every website
def isListingLink(url):
if 'category=' in url:
if 'c=' in url:
return True
return False


+ 3
- 2
MarketPlaces/GoFish/parser.py View File

@ -271,8 +271,9 @@ def gofish_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"id": "itembox"})
listing = soup.find('tbody', {'class': 'border border-2 align-middle'})
listing = soup.findAll('tr')
listing = listing[1:]
# for a in listing:
# bae = a.find('a', {"class": "text-info"}, href=True)
# link = bae['href']


+ 1
- 1
MarketPlaces/Initialization/marketsList.txt View File

@ -1 +1 @@
CypherMarketplace
GoFish

+ 3
- 0
MarketPlaces/Initialization/markets_mining.py View File

@ -15,6 +15,7 @@ from MarketPlaces.ViceCity.crawler_selenium import crawler as crawlerViceCity
from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher
from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar
from MarketPlaces.Ares.crawler_selenium import crawler as crawlerAres
from MarketPlaces.GoFish.crawler_selenium import crawler as crawlerGoFish
import configparser
import os
@ -110,5 +111,7 @@ if __name__ == '__main__':
crawlerPabloEscobar()
elif mkt == "Ares":
crawlerAres()
elif mkt == "GoFish":
crawlerGoFish()
print("\nScraping process completed!")

Loading…
Cancel
Save