
josh's progress on torzon market

main · Joshua · 10 months ago · commit e0e32e68b8
4 changed files with 49 additions and 18 deletions:

  1. MarketPlaces/Initialization/markets_mining.py (+3, -0)
  2. MarketPlaces/Initialization/prepare_parser.py (+5, -0)
  3. MarketPlaces/Torzon/crawler_selenium.py (+40, -17)
  4. MarketPlaces/Torzon/parser.py (+1, -1)

MarketPlaces/Initialization/markets_mining.py (+3, -0)

@@ -23,6 +23,7 @@ from MarketPlaces.Ares.crawler_selenium import crawler as crawlerAres
 from MarketPlaces.Bohemia.crawler_selenium import crawler as crawlerBohemia
 from MarketPlaces.TheDarkMarket.crawler_selenium import crawler as crawlerTheDarkMarket
 from MarketPlaces.GoFish.crawler_selenium import crawler as crawlerGoFish
+from MarketPlaces.Torzon.crawler_selenium import crawler as crawlerTorzon
 import configparser
 import os
@@ -132,5 +133,7 @@ if __name__ == '__main__':
         crawlerGoFish()
     elif mkt == "TheDarkMarket":
         crawlerTheDarkMarket()
+    elif mkt == "Torzon":
+        crawlerTorzon()
     print("\nScraping process completed!")

MarketPlaces/Initialization/prepare_parser.py (+5, -0)

@@ -24,6 +24,7 @@ from MarketPlaces.Quest.parser import *
 from MarketPlaces.Ares.parser import *
 from MarketPlaces.CypherMarketplace.parser import *
 from MarketPlaces.WeTheNorth.parser import *
+from MarketPlaces.Torzon.parser import *
 from MarketPlaces.GoFish.parser import *
 from MarketPlaces.Classifier.classify_product import predict
@@ -160,6 +161,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
         rw = wethenorth_listing_parser(soup)
     elif marketPlace == "GoFish":
         rw = gofish_listing_parser(soup)
+    elif marketPlace == "Torzon":
+        rw = torzon_listing_parser(soup)
     else:
         print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
         raise Exception
@@ -214,6 +217,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
         rmm = wethenorth_description_parser(soup)
     elif marketPlace == "GoFish":
         rmm = gofish_description_parser(soup)
+    elif marketPlace == "Torzon":
+        rmm = torzon_description_parser(soup)
     else:
         print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
         raise Exception
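
Both parse_listing and parse_description receive an already-built soup object. A minimal sketch of how such a soup could be produced from a saved page, assuming BeautifulSoup 4 (load_soup is a hypothetical helper, not part of the repository):

```python
from bs4 import BeautifulSoup

def load_soup(htmlFile: str) -> BeautifulSoup:
    # Read a previously saved HTML page and parse it for the market parsers above.
    with open(htmlFile, "r", encoding="utf-8", errors="ignore") as f:
        return BeautifulSoup(f.read(), "html.parser")

# e.g. rw = torzon_listing_parser(load_soup(listingFile))
```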


MarketPlaces/Torzon/crawler_selenium.py (+40, -17)

@@ -45,7 +45,7 @@ def startCrawling():
         print(driver.current_url, e)
         closeDriver(driver)
-    new_parse(mktName, BASE_URL, False)
+    # new_parse(mktName, BASE_URL, False)

 # Returns the name of the website
@@ -129,8 +129,35 @@ def getAccess():
 # then allows for manual solving of captcha in the terminal
 #@param: current selenium web driver
 def login(driver):
-    input("Press ENTER when CAPTCHA is completed and page is loaded\n")
+    input("Press ENTER when CAPTCHA is completed and LOGIN page is loaded\n")
+
+    # wait for the login form to show up (this XPath may need to change for a different seed URL)
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '//*[@id="username"]')))
+
+    # entering username and password into input boxes
+    usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
+    # Username here
+    usernameBox.send_keys('LordTachonky')
+    passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
+    # Password here
+    passwordBox.send_keys('BorderRanked')
+
+    input("Press ENTER when CAPTCHA is finished\n")
+
+    login = driver.find_element(by=By.XPATH, value='/html/body/div/form/input[4]')
+    login.click()
+
+    # wait for the PIN page to show up (this XPath may need to change for a different seed URL)
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '/html/body/div/center')))
+
+    pinBox = driver.find_element(by=By.XPATH, value='//*[@id="pin"]')
+    pinBox.send_keys('541236')
+    submit = driver.find_element(by=By.XPATH, value='/html/body/div/form/input[2]')
+    submit.click()
+
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '/html/body/div[1]/p')))
+    # driver.find_element(by=By.XPATH, value='/html/body/div[1]/label').click()

 # Saves the crawled html page, makes the directory path for html pages if not made
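
The new login flow repeats the same wait-then-fill pattern for each field. A small helper in the same style, assuming Selenium 4 (wait_and_type is a hypothetical utility, not part of the repository):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_and_type(driver, xpath: str, text: str, timeout: int = 100) -> None:
    # Block until the element is visible, then fill it in one step.
    WebDriverWait(driver, timeout).until(
        EC.visibility_of_element_located((By.XPATH, xpath)))
    driver.find_element(by=By.XPATH, value=xpath).send_keys(text)

# e.g. wait_and_type(driver, '//*[@id="pin"]', '541236')
```
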
@@ -210,33 +237,30 @@ def crawlForum(driver):
                 driver.get(link)
             except:
                 driver.refresh()
-            WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-                (By.XPATH, '/html/body/div[3]/div/table/tbody/tr/td[2]/center/table/tbody/tr[1]/td[1]')))
             html = driver.page_source
             savePage(driver, html, link)

             list = productPages(html)
             for item in list:
-                itemURL = urlparse.urljoin(BASE_URL, str(item))
+                itemURL = urlparse.urljoin(getFixedURL(), str(item))
                 try:
                     # time.sleep(1.5)  # to keep from detecting click speed
                     driver.get(itemURL)
                 except:
                     driver.refresh()
                 savePage(driver, driver.page_source, item)
                 # time.sleep(1.5)
                 driver.back()
                 # to keep from detecting click speed

-                # comment out
-                break
-
-            # comment out
-            if count == 1:
-                break
+                # # comment out
+                # break
+            #
+            # # comment out
+            # if count == 1:
+            #     break

             try:
-                # nav = driver.find_element(by=By.XPATH, value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]')
-                # a = nav.find_element(by=By.LINK_TEXT, value=">")
-                link = driver.find_element(by=By.LINK_TEXT, value=">").get_attribute('href')
+                link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
                 if link == "":
                     raise NoSuchElementException
                 count += 1
@@ -246,7 +270,6 @@ def crawlForum(driver):
         except Exception as e:
             print(link, e)
             # raise e

         i += 1

     input("Crawling Torzon market done successfully. Press ENTER to continue\n")


MarketPlaces/Torzon/parser.py (+1, -1)

@@ -318,7 +318,7 @@ def torzon_links_parser(soup):
     href = []

     # listing = soup.findAll('div', {"class": "card mt-1"})
-    listing = soup.find('td', {"valign": "top"}).find("table", {"border": "0"}).findAll('td', {'width': '50%'})
+    listing = soup.find('td', {"valign": "top"}).find("tbody").findAll('td', {'width': '50%'})

     for a in listing:
         bae = a.find('a', href=True)  # card-title rounded text-truncate
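
The one-line change above swaps the border="0" table match for a tbody match. A defensive sketch of the same traversal, assuming BeautifulSoup 4 (torzon_links_sketch is a hypothetical stand-in for torzon_links_parser, which chains the finds directly):

```python
from bs4 import BeautifulSoup

def torzon_links_sketch(html: str) -> list:
    # Collect product-page hrefs from the two-column listing table,
    # guarding each step of the chained lookup against a missing element.
    soup = BeautifulSoup(html, "html.parser")
    href = []
    cell = soup.find('td', {"valign": "top"})
    body = cell.find("tbody") if cell else None
    for td in (body.findAll('td', {'width': '50%'}) if body else []):
        a = td.find('a', href=True)
        if a:
            href.append(a['href'])
    return href
```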

