
josh's progress on torzon market

main · Joshua · 10 months ago · commit e0e32e68b8
4 changed files with 49 additions and 18 deletions:

  1. MarketPlaces/Initialization/markets_mining.py (+3, -0)
  2. MarketPlaces/Initialization/prepare_parser.py (+5, -0)
  3. MarketPlaces/Torzon/crawler_selenium.py (+40, -17)
  4. MarketPlaces/Torzon/parser.py (+1, -1)

MarketPlaces/Initialization/markets_mining.py (+3, -0)

@@ -23,6 +23,7 @@ from MarketPlaces.Ares.crawler_selenium import crawler as crawlerAres
 from MarketPlaces.Bohemia.crawler_selenium import crawler as crawlerBohemia
 from MarketPlaces.TheDarkMarket.crawler_selenium import crawler as crawlerTheDarkMarket
 from MarketPlaces.GoFish.crawler_selenium import crawler as crawlerGoFish
+from MarketPlaces.Torzon.crawler_selenium import crawler as crawlerTorzon
 import configparser
 import os
@@ -132,5 +133,7 @@ if __name__ == '__main__':
         crawlerGoFish()
     elif mkt == "TheDarkMarket":
         crawlerTheDarkMarket()
+    elif mkt == "Torzon":
+        crawlerTorzon()
     print("\nScraping process completed!")

MarketPlaces/Initialization/prepare_parser.py (+5, -0)

@@ -24,6 +24,7 @@ from MarketPlaces.Quest.parser import *
 from MarketPlaces.Ares.parser import *
 from MarketPlaces.CypherMarketplace.parser import *
 from MarketPlaces.WeTheNorth.parser import *
+from MarketPlaces.Torzon.parser import *
 from MarketPlaces.GoFish.parser import *
 from MarketPlaces.Classifier.classify_product import predict
@@ -160,6 +161,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
         rw = wethenorth_listing_parser(soup)
     elif marketPlace == "GoFish":
         rw = gofish_listing_parser(soup)
+    elif marketPlace == "Torzon":
+        rw = torzon_listing_parser(soup)
     else:
         print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
         raise Exception
@@ -214,6 +217,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
         rmm = wethenorth_description_parser(soup)
     elif marketPlace == "GoFish":
         rmm = gofish_description_parser(soup)
+    elif marketPlace == "Torzon":
+        rmm = torzon_description_parser(soup)
     else:
         print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
         raise Exception
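
Both parse_listing and parse_description receive an already-built soup object. A minimal sketch of how such a soup could be produced from a saved page, assuming BeautifulSoup 4 (load_soup is a hypothetical helper, not part of the repository):

```python
from bs4 import BeautifulSoup

def load_soup(htmlFile: str) -> BeautifulSoup:
    # Read a previously saved HTML page and parse it for the market parsers above.
    with open(htmlFile, "r", encoding="utf-8", errors="ignore") as f:
        return BeautifulSoup(f.read(), "html.parser")

# e.g. rw = torzon_listing_parser(load_soup(listingFile))
```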


MarketPlaces/Torzon/crawler_selenium.py (+40, -17)

@@ -45,7 +45,7 @@ def startCrawling():
         print(driver.current_url, e)
         closeDriver(driver)
-    new_parse(mktName, BASE_URL, False)
+    # new_parse(mktName, BASE_URL, False)

 # Returns the name of the website
@@ -129,8 +129,35 @@ def getAccess():
 # then allows for manual solving of captcha in the terminal
 #@param: current selenium web driver
 def login(driver):
-    input("Press ENTER when CAPTCHA is completed and page is loaded\n")
+    input("Press ENTER when CAPTCHA is completed and LOGIN page is loaded\n")
+
+    # wait for the login form to show up (this XPath may need to change for a different seed URL)
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '//*[@id="username"]')))
+
+    # entering username and password into input boxes
+    usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
+    # Username here
+    usernameBox.send_keys('LordTachonky')
+    passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
+    # Password here
+    passwordBox.send_keys('BorderRanked')
+
+    input("Press ENTER when CAPTCHA is finished\n")
+
+    login = driver.find_element(by=By.XPATH, value='/html/body/div/form/input[4]')
+    login.click()
+
+    # wait for the PIN page to show up (this XPath may need to change for a different seed URL)
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '/html/body/div/center')))
+
+    pinBox = driver.find_element(by=By.XPATH, value='//*[@id="pin"]')
+    pinBox.send_keys('541236')
+    submit = driver.find_element(by=By.XPATH, value='/html/body/div/form/input[2]')
+    submit.click()
+
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '/html/body/div[1]/p')))
+    # driver.find_element(by=By.XPATH, value='/html/body/div[1]/label').click()

 # Saves the crawled html page, makes the directory path for html pages if not made
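
The new login flow repeats the same wait-then-fill pattern for each field. A small helper in the same style, assuming Selenium 4 (wait_and_type is a hypothetical utility, not part of the repository):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_and_type(driver, xpath: str, text: str, timeout: int = 100) -> None:
    # Block until the element is visible, then fill it in one step.
    WebDriverWait(driver, timeout).until(
        EC.visibility_of_element_located((By.XPATH, xpath)))
    driver.find_element(by=By.XPATH, value=xpath).send_keys(text)

# e.g. wait_and_type(driver, '//*[@id="pin"]', '541236')
```
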
@@ -210,33 +237,30 @@ def crawlForum(driver):
                 driver.get(link)
             except:
                 driver.refresh()
-            WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-                (By.XPATH, '/html/body/div[3]/div/table/tbody/tr/td[2]/center/table/tbody/tr[1]/td[1]')))
             html = driver.page_source
             savePage(driver, html, link)

             list = productPages(html)
             for item in list:
-                itemURL = urlparse.urljoin(BASE_URL, str(item))
+                itemURL = urlparse.urljoin(getFixedURL(), str(item))
                 try:
                     # time.sleep(1.5)  # to keep from detecting click speed
                     driver.get(itemURL)
                 except:
                     driver.refresh()
                 savePage(driver, driver.page_source, item)
                 # time.sleep(1.5)
                 driver.back()
                 # to keep from detecting click speed

-                # comment out
-                break
-
-            # comment out
-            if count == 1:
-                break
+                # # comment out
+                # break
+            #
+            # # comment out
+            # if count == 1:
+            #     break

             try:
-                # nav = driver.find_element(by=By.XPATH, value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]')
-                # a = nav.find_element(by=By.LINK_TEXT, value=">")
-                link = driver.find_element(by=By.LINK_TEXT, value=">").get_attribute('href')
+                link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
                 if link == "":
                     raise NoSuchElementException
                 count += 1
@@ -246,7 +270,6 @@ def crawlForum(driver):
         except Exception as e:
             print(link, e)
             # raise e

         i += 1

     input("Crawling Torzon market done successfully. Press ENTER to continue\n")


MarketPlaces/Torzon/parser.py (+1, -1)

@@ -318,7 +318,7 @@ def torzon_links_parser(soup):
     href = []

     # listing = soup.findAll('div', {"class": "card mt-1"})
-    listing = soup.find('td', {"valign": "top"}).find("table", {"border": "0"}).findAll('td', {'width': '50%'})
+    listing = soup.find('td', {"valign": "top"}).find("tbody").findAll('td', {'width': '50%'})

     for a in listing:
         bae = a.find('a', href=True)  # card-title rounded text-truncate
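
The one-line change above swaps the border="0" table match for a tbody match. A defensive sketch of the same traversal, assuming BeautifulSoup 4 (torzon_links_sketch is a hypothetical stand-in for torzon_links_parser, which chains the finds directly):

```python
from bs4 import BeautifulSoup

def torzon_links_sketch(html: str) -> list:
    # Collect product-page hrefs from the two-column listing table,
    # guarding each step of the chained lookup against a missing element.
    soup = BeautifulSoup(html, "html.parser")
    href = []
    cell = soup.find('td', {"valign": "top"})
    body = cell.find("tbody") if cell else None
    for td in (body.findAll('td', {'width': '50%'}) if body else []):
        a = td.find('a', href=True)
        if a:
            href.append(a['href'])
    return href
```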

