diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py index 29f17b5..d2b799b 100644 --- a/MarketPlaces/Initialization/markets_mining.py +++ b/MarketPlaces/Initialization/markets_mining.py @@ -24,6 +24,7 @@ from MarketPlaces.Bohemia.crawler_selenium import crawler as crawlerBohemia from MarketPlaces.TheDarkMarket.crawler_selenium import crawler as crawlerTheDarkMarket from MarketPlaces.GoFish.crawler_selenium import crawler as crawlerGoFish from MarketPlaces.ZeroDay.crawler_selenium import crawler as crawlerZeroDay +from MarketPlaces.Torzon.crawler_selenium import crawler as crawlerTorzon import configparser import os @@ -135,5 +136,7 @@ if __name__ == '__main__': crawlerTheDarkMarket() elif mkt == "ZeroDay": crawlerZeroDay() + elif mkt == "Torzon": + crawlerTorzon() print("\nScraping process completed!") diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index a5a8d2c..4cf169e 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -24,6 +24,7 @@ from MarketPlaces.Quest.parser import * from MarketPlaces.Ares.parser import * from MarketPlaces.CypherMarketplace.parser import * from MarketPlaces.WeTheNorth.parser import * +from MarketPlaces.Torzon.parser import * from MarketPlaces.GoFish.parser import * from MarketPlaces.ZeroDay.parser import * @@ -163,6 +164,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile): rw = gofish_listing_parser(soup) elif marketPlace == "ZeroDay": rw = zeroday_listing_parser(soup) + elif marketPlace == "Torzon": + rw = torzon_listing_parser(soup) else: print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!") raise Exception @@ -219,6 +222,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile): rmm = gofish_description_parser(soup) elif marketPlace == "ZeroDay": rmm = zeroday_description_parser(soup) + elif marketPlace == "Torzon": + rmm = torzon_description_parser(soup) else: print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!") raise Exception diff --git a/MarketPlaces/Torzon/crawler_selenium.py b/MarketPlaces/Torzon/crawler_selenium.py index 8560c57..0e17779 100644 --- a/MarketPlaces/Torzon/crawler_selenium.py +++ b/MarketPlaces/Torzon/crawler_selenium.py @@ -45,7 +45,7 @@ def startCrawling(): print(driver.current_url, e) closeDriver(driver) - new_parse(mktName, BASE_URL, False) + # new_parse(mktName, BASE_URL, False) # Returns the name of the website @@ -129,8 +129,35 @@ def getAccess(): # then allows for manual solving of captcha in the terminal #@param: current selenium web driver def login(driver): - input("Press ENTER when CAPTCHA is completed and page is loaded\n") + input("Press ENTER when CAPTCHA is completed and LOGIN page is loaded\n") + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="username"]'))) # wait for page to show up (This Xpath may need to change based on different seed url) + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') + # Username here + usernameBox.send_keys('LordTachonky') + passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') + # Password here + passwordBox.send_keys('BorderRanked') + + input("Press ENTER when CAPTCHA is finished\n") + + login = driver.find_element(by=By.XPATH, value='/html/body/div/form/input[4]') + login.click() + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/div/center'))) + + pinBox = driver.find_element(by=By.XPATH, value='//*[@id="pin"]') + pinBox.send_keys('541236') + submit = driver.find_element(by=By.XPATH, value='/html/body/div/form/input[2]') + submit.click() + + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/div[1]/p'))) + # driver.find_element(by=By.XPATH, value='/html/body/div[1]/label').click() # Saves the crawled html page, makes the directory path for html pages if not made @@ -210,33 +237,30 @@ def crawlForum(driver): driver.get(link) except: driver.refresh() + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/div[3]/div/table/tbody/tr/td[2]/center/table/tbody/tr[1]/td[1]'))) html = driver.page_source savePage(driver, html, link) - list = productPages(html) + for item in list: - itemURL = urlparse.urljoin(BASE_URL, str(item)) + itemURL = urlparse.urljoin(getFixedURL(), str(item)) try: - # time.sleep(1.5) # to keep from detecting click speed driver.get(itemURL) except: driver.refresh() savePage(driver, driver.page_source, item) - # time.sleep(1.5) driver.back() - # to keep from detecting click speed - - # comment out - break - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - # nav = driver.find_element(by=By.XPATH, value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]') - # a = nav.find_element(by=By.LINK_TEXT, value=">") - link = driver.find_element(by=By.LINK_TEXT, value=">").get_attribute('href') + link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href') if link == "": raise NoSuchElementException count += 1 @@ -246,7 +270,6 @@ def crawlForum(driver): except Exception as e: print(link, e) - # raise e i += 1 input("Crawling Torzon market done sucessfully. Press ENTER to continue\n") diff --git a/MarketPlaces/Torzon/parser.py b/MarketPlaces/Torzon/parser.py index edb8cc4..fb1de32 100644 --- a/MarketPlaces/Torzon/parser.py +++ b/MarketPlaces/Torzon/parser.py @@ -318,7 +318,7 @@ def torzon_links_parser(soup): href = [] # listing = soup.findAll('div', {"class": "card mt-1"}) - listing = soup.find('td', {"valign": "top"}).find("table", {"border": "0"}).findAll('td', {'width': '50%'}) + listing = soup.find('td', {"valign": "top"}).find("tbody").findAll('td', {'width': '50%'}) for a in listing: bae = a.find('a', href=True)#card-title rounded text-truncate