
Merge branch 'main' into '0day'

# Conflicts:
#   MarketPlaces/Initialization/markets_mining.py
#   MarketPlaces/Initialization/prepare_parser.py
commit a00dd301b0 by Lokaranjan Munta, 10 months ago
4 changed files with 49 additions and 18 deletions
  1. MarketPlaces/Initialization/markets_mining.py (+3 -0)
  2. MarketPlaces/Initialization/prepare_parser.py (+5 -0)
  3. MarketPlaces/Torzon/crawler_selenium.py (+40 -17)
  4. MarketPlaces/Torzon/parser.py (+1 -1)

MarketPlaces/Initialization/markets_mining.py (+3 -0)

@@ -24,6 +24,7 @@ from MarketPlaces.Bohemia.crawler_selenium import crawler as crawlerBohemia
 from MarketPlaces.TheDarkMarket.crawler_selenium import crawler as crawlerTheDarkMarket
 from MarketPlaces.GoFish.crawler_selenium import crawler as crawlerGoFish
 from MarketPlaces.ZeroDay.crawler_selenium import crawler as crawlerZeroDay
+from MarketPlaces.Torzon.crawler_selenium import crawler as crawlerTorzon
 import configparser
 import os
@@ -135,5 +136,7 @@ if __name__ == '__main__':
         crawlerTheDarkMarket()
     elif mkt == "ZeroDay":
         crawlerZeroDay()
+    elif mkt == "Torzon":
+        crawlerTorzon()

     print("\nScraping process completed!")

MarketPlaces/Initialization/prepare_parser.py (+5 -0)

@@ -24,6 +24,7 @@ from MarketPlaces.Quest.parser import *
 from MarketPlaces.Ares.parser import *
 from MarketPlaces.CypherMarketplace.parser import *
 from MarketPlaces.WeTheNorth.parser import *
+from MarketPlaces.Torzon.parser import *
 from MarketPlaces.GoFish.parser import *
 from MarketPlaces.ZeroDay.parser import *
@@ -163,6 +164,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
         rw = gofish_listing_parser(soup)
     elif marketPlace == "ZeroDay":
         rw = zeroday_listing_parser(soup)
+    elif marketPlace == "Torzon":
+        rw = torzon_listing_parser(soup)
     else:
         print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
         raise Exception
@@ -219,6 +222,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
         rmm = gofish_description_parser(soup)
     elif marketPlace == "ZeroDay":
         rmm = zeroday_description_parser(soup)
+    elif marketPlace == "Torzon":
+        rmm = torzon_description_parser(soup)
     else:
         print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
         raise Exception
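
The same two-edit pattern applies here, once per parser type: parse_listing() routes to torzon_listing_parser() and parse_description() to torzon_description_parser(), each taking a BeautifulSoup tree and returning extracted rows. A minimal sketch of that contract, with a stub parser and fabricated markup (not Torzon's real page structure):

from bs4 import BeautifulSoup

def torzon_listing_parser(soup: BeautifulSoup) -> list:
    # stub: the real parser extracts full product rows, not just hrefs
    return [a["href"] for a in soup.find_all("a", href=True)]

def parse_listing(marketPlace: str, soup: BeautifulSoup) -> list:
    if marketPlace == "Torzon":
        return torzon_listing_parser(soup)
    print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
    raise Exception

html = "<table><tr><td><a href='/products/1'>item</a></td></tr></table>"
print(parse_listing("Torzon", BeautifulSoup(html, "html.parser")))  # ['/products/1']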


MarketPlaces/Torzon/crawler_selenium.py (+40 -17)

@@ -45,7 +45,7 @@ def startCrawling():
         print(driver.current_url, e)
     closeDriver(driver)

-    new_parse(mktName, BASE_URL, False)
+    # new_parse(mktName, BASE_URL, False)

 # Returns the name of the website
@@ -129,8 +129,35 @@ def getAccess():
 # then allows for manual solving of captcha in the terminal
 #@param: current selenium web driver
 def login(driver):
-    input("Press ENTER when CAPTCHA is completed and page is loaded\n")
+    input("Press ENTER when CAPTCHA is completed and LOGIN page is loaded\n")
+
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '//*[@id="username"]')))
     # wait for page to show up (This Xpath may need to change based on different seed url)
+
+    # entering username and password into input boxes
+    usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
+    # Username here
+    usernameBox.send_keys('LordTachonky')
+    passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
+    # Password here
+    passwordBox.send_keys('BorderRanked')
+
+    input("Press ENTER when CAPTCHA is finished\n")
+
+    login = driver.find_element(by=By.XPATH, value='/html/body/div/form/input[4]')
+    login.click()
+
+    # wait for listing page to show up (This Xpath may need to change based on different seed url)
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '/html/body/div/center')))
+
+    pinBox = driver.find_element(by=By.XPATH, value='//*[@id="pin"]')
+    pinBox.send_keys('541236')
+    submit = driver.find_element(by=By.XPATH, value='/html/body/div/form/input[2]')
+    submit.click()
+
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '/html/body/div[1]/p')))
+    # driver.find_element(by=By.XPATH, value='/html/body/div[1]/label').click()

 # Saves the crawled html page, makes the directory path for html pages if not made
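
One note on the new login flow: the username, password, and PIN are hard-coded in the function. Since the project already uses configparser elsewhere, a minimal sketch of loading them from a config file instead; the [Torzon] section, key names, and path below are all hypothetical, not the project's actual configuration:

import configparser

def load_torzon_credentials(path: str = 'config.ini') -> tuple:
    # hypothetical [Torzon] section; keys are assumptions, not project config
    config = configparser.ConfigParser()
    config.read(path)
    section = config['Torzon']
    return section['username'], section['password'], section['pin']

# username, password, pin = load_torzon_credentials()
# usernameBox.send_keys(username)   # in place of the literals above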
@@ -210,33 +237,30 @@ def crawlForum(driver):
                 driver.get(link)
             except:
                 driver.refresh()
+
+            WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+                (By.XPATH, '/html/body/div[3]/div/table/tbody/tr/td[2]/center/table/tbody/tr[1]/td[1]')))
+
             html = driver.page_source
             savePage(driver, html, link)

             list = productPages(html)
             for item in list:
-                itemURL = urlparse.urljoin(BASE_URL, str(item))
+                itemURL = urlparse.urljoin(getFixedURL(), str(item))
                 try:
-                    # time.sleep(1.5) # to keep from detecting click speed
                     driver.get(itemURL)
                 except:
                     driver.refresh()
                 savePage(driver, driver.page_source, item)
-                # time.sleep(1.5)
                 driver.back()
-                # to keep from detecting click speed

-                # comment out
-                break
-
-                # comment out
-                if count == 1:
-                    break
+                # # comment out
+                # break
+                #
+                # # comment out
+                # if count == 1:
+                #     break

             try:
-                # nav = driver.find_element(by=By.XPATH, value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]')
-                # a = nav.find_element(by=By.LINK_TEXT, value=">")
-                link = driver.find_element(by=By.LINK_TEXT, value=">").get_attribute('href')
+                link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
                 if link == "":
                     raise NoSuchElementException
                 count += 1
@@ -246,7 +270,6 @@ def crawlForum(driver):
         except Exception as e:
             print(link, e)
-            # raise e
             i += 1

     input("Crawling Torzon market done successfully. Press ENTER to continue\n")


MarketPlaces/Torzon/parser.py (+1 -1)

@@ -318,7 +318,7 @@ def torzon_links_parser(soup):
     href = []
     # listing = soup.findAll('div', {"class": "card mt-1"})
-    listing = soup.find('td', {"valign": "top"}).find("table", {"border": "0"}).findAll('td', {'width': '50%'})
+    listing = soup.find('td', {"valign": "top"}).find("tbody").findAll('td', {'width': '50%'})
     for a in listing:
         bae = a.find('a', href=True)  # card-title rounded text-truncate
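
One caveat on switching from find("table", {"border": "0"}) to find("tbody"): whether a <tbody> node exists in the soup depends on the parser backend, since many pages omit it in the raw HTML. html.parser keeps the markup as-is, while html5lib inserts <tbody> the way browsers do. A quick check with stand-in markup (not captured from the Torzon site):

# requires: pip install beautifulsoup4 html5lib
from bs4 import BeautifulSoup

html = """
<table><tr><td valign="top">
  <table border="0"><tr><td width="50%"><a href="/products/123">item</a></td></tr></table>
</td></tr></table>
"""

for backend in ("html.parser", "html5lib"):
    soup = BeautifulSoup(html, backend)
    cell = soup.find('td', {"valign": "top"})
    print(backend, "->", "tbody present" if cell.find("tbody") else "no tbody")
# html.parser -> no tbody
# html5lib    -> tbody present

If the saved pages are parsed with a backend that does not synthesize <tbody>, the new lookup returns None and the chained findAll raises, so the backend used in prepare_parser.py matters here.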

