finished crawler for GoFish

1 year ago · 48b3fee56a
--- a/MarketPlaces/GoFish/crawler_selenium.py
+++ b/MarketPlaces/GoFish/crawler_selenium.py
@ -41,7 +41,7 @@ def startCrawling():
            print(driver.current_url, e)
        closeDriver(driver)

    new_parse(mktName, baseURL, True)
    # new_parse(mktName, baseURL, True)


 # Returns the name of the website
@ -121,21 +121,20 @@ def login(driver):
    input("Press ENTER when CAPTCHA is complete and login page has loaded\n")

    # entering username and password into input boxes
    usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="username"]')
    usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
    # Username here
    usernameBox.send_keys('aliciamykeys')
    passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="password"]')
    usernameBox.send_keys('itsmedio')
    passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
    # Password here
    passwordBox.send_keys('aliciawherearemykey$')
    # session time
    session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[4]/div/div[2]/select'))
    session_select.select_by_visible_text('Session 60min')
    passwordBox.send_keys('DementedBed123-')
    # submit
    submit = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/form/div[7]/input')
    submit.click()

    input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n")

    # wait for the listing page to show up (this XPath may need to change depending on the seed URL)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, '//*[@id="submit"]')))
        (By.XPATH, '/html/body/div/div[3]/div[2]/div[1]')))


 def savePage(driver, page, url):
@ -176,17 +175,17 @@ def getInterestedLinks():
    links = []

    # Hosting and Security
    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=84')
    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=84')
    # Exploits and Kits
    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=107')
    # Botnets and Malware
    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
    # Other Software
    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=108')
    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=108')
    # Hacking Guide
    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
    # Fraud (mostly carding)
    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=128')
    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=128')

    return links

@ -212,7 +211,6 @@ def crawlForum(driver):
                    driver.refresh()
                html = driver.page_source
                savePage(driver, html, link)

                list = productPages(html)

                for item in list:
@ -225,16 +223,17 @@ def crawlForum(driver):
                    driver.back()

                    # comment out
                    break
                    # break

                # comment out
                if count == 1:
                    break
                # if count == 1:
                    # break

                try:
                    link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
                    link = driver.find_element(by=By.XPATH, value='/html/body/div/div[3]/div[2]/div[2]/nav/ul/li[3]/a').get_attribute('href')
                    if link == "":
                        raise NoSuchElementException
                    link = urlparse.urljoin(baseURL, str(link))
                    count += 1

                except NoSuchElementException:
@ -249,14 +248,14 @@ def crawlForum(driver):

 # Returns True if the link is a description link; may need to change for every website
 def isDescriptionLink(url):
    if 'item' in url:
    if 'a=' in url:
        return True
    return False


 # Returns True if the link is a listingPage link, may need to change for every website
 def isListingLink(url):
    if 'category=' in url:
    if 'c=' in url:
        return True
    return False

--- a/MarketPlaces/GoFish/parser.py
+++ b/MarketPlaces/GoFish/parser.py
@ -271,8 +271,9 @@ def gofish_links_parser(soup):
    # Returning all links that should be visited by the Crawler

    href = []
    listing = soup.findAll('div', {"id": "itembox"})

    listing = soup.find('tbody', {'class': 'border border-2 align-middle'})
    listing = soup.findAll('tr')
    listing = listing[1:]
    # for a in listing:
    #     bae = a.find('a', {"class": "text-info"}, href=True)
    #     link = bae['href']
--- a/MarketPlaces/Initialization/marketsList.txt
+++ b/MarketPlaces/Initialization/marketsList.txt
@ -1 +1 @@
 CypherMarketplace
 GoFish
--- a/MarketPlaces/Initialization/markets_mining.py
+++ b/MarketPlaces/Initialization/markets_mining.py
@ -15,6 +15,7 @@ from MarketPlaces.ViceCity.crawler_selenium import crawler as crawlerViceCity
 from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher
 from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar
 from MarketPlaces.Ares.crawler_selenium import crawler as crawlerAres
 from MarketPlaces.GoFish.crawler_selenium import crawler as crawlerGoFish

 import configparser
 import os
@ -110,5 +111,7 @@ if __name__ == '__main__':
            crawlerPabloEscobar()
        elif mkt == "Ares":
            crawlerAres()
        elif mkt == "GoFish":
            crawlerGoFish()

    print("\nScraping process completed!")