Browse Source

moved Vortex back to test

main
westernmeadow 6 months ago
parent
commit
07e71ec2ea
2 changed files with 32 additions and 33 deletions
  1. +25
    -26
      MarketPlaces/Vortex/crawler_selenium.py
  2. +7
    -7
      MarketPlaces/Vortex/parser.py

+ 25
- 26
MarketPlaces/Vortex/crawler_selenium.py View File

@@ -26,19 +26,19 @@ from MarketPlaces.Vortex.parser import vortex_links_parser
 from MarketPlaces.Utilities.utilities import cleanHTML
 counter = 1
-baseURL = 'http://mq7ozbnrqdjc6cof3yakegs44kmo6vl3ajcyzdeya3zjtmi65jtmwqid.onion/login'
+baseURL = 'http://mq7ozbnrqdjc6cof3yakegs44kmo6vl3ajcyzdeya3zjtmi65jtmwqid.onion/'
 def startCrawling():
 mktName = getMKTName()
-# driver = getAccess()
-#
-# if driver != 'down':
-# try:
-# login(driver)
-# crawlForum(driver)
-# except Exception as e:
-# print(driver.current_url, e)
-# closeDriver(driver)
+driver = getAccess()
+if driver != 'down':
+try:
+login(driver)
+crawlForum(driver)
+except Exception as e:
+print(driver.current_url, e)
+closeDriver(driver)
 new_parse(mktName, baseURL, True)
@@ -137,15 +137,14 @@ def login(driver):
 input("Press ENTER when captcha is solved")
-try:
-agree_button = driver.find_element(by=By.NAME, value='login')
-agree_button.click()
-except Exception as e:
-print('Problem with clicking login button', e)
-WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-(By.XPATH, '//*[@id="main"]')))
+# try:
+# agree_button = driver.find_element(by=By.NAME, value='login')
+# agree_button.click()
+# except Exception as e:
+# print('Problem with clicking login button', e)
+#
+# WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+# (By.XPATH, '//*[@id="main"]')))
 def savePage(driver, page, url):
@@ -186,11 +185,11 @@ def getInterestedLinks():
 links = []
 # security and hacking
-# links.append('http://mq7ozbnrqdjc6cof3yakegs44kmo6vl3ajcyzdeya3zjtmi65jtmwqid.onion/home?cat=Security+%26+Hacking')
+links.append('http://mq7ozbnrqdjc6cof3yakegs44kmo6vl3ajcyzdeya3zjtmi65jtmwqid.onion/home?cat=Security+%26+Hacking')
 # fraud
 links.append('http://mq7ozbnrqdjc6cof3yakegs44kmo6vl3ajcyzdeya3zjtmi65jtmwqid.onion/home?cat=Fraud')
 # malware, nothing here for now
-# links.append('http://mq7ozbnrqdjc6cof3yakegs44kmo6vl3ajcyzdeya3zjtmi65jtmwqid.onion/home?cat=Cracked+softwares%26comma%3B+Botnets+%26+Malware')
+links.append('http://mq7ozbnrqdjc6cof3yakegs44kmo6vl3ajcyzdeya3zjtmi65jtmwqid.onion/home?cat=Cracked+softwares%26comma%3B+Botnets+%26+Malware')
 return links
@@ -228,12 +227,12 @@ def crawlForum(driver):
 savePage(driver, driver.page_source, item)
 driver.back()
-# comment out
-break
+# # comment out
+# break
+#
-# comment out
-if count == 1:
-break
+# # comment out
+# if count == 1:
+# break
 try:
 temp = driver.find_element(by=By.XPATH, value = '//*[@id="main"]')


+ 7
- 7
MarketPlaces/Vortex/parser.py View File

@@ -142,10 +142,10 @@ def vortex_listing_parser(soup):
 href = [] # 22 Product_Links y
 temp = soup.find('main', {'id': 'main'}).find('section', {'id':'page_container'})
-listings = temp.findAll('div', {"class": "product-card col-sm-6 col-md-3 col-xl-4 mb-5"})
+listings = temp.findAll('div', {"class": "product-card col-sm-6 col-md-3 col-xl-4 mb-0"})
-cat = soup.find('section', {'class': 'row px-md-4 mx-0 mb-3'}).find('ol').find_all('li')
-cat = cat[1].find('a').text
+# cat = soup.find('section', {'class': 'row px-md-4 mx-0 my-3'}).find('ol').find_all('li')
+# cat = cat[1].find('a').text
 # Populating the Number of Products
 nm = len(listings)
@@ -203,9 +203,9 @@ def vortex_listing_parser(soup):
 MSValue = me
 MS.append(MSValue)
-# Finding the category - check
-category_text = cleanString(cat).strip()
-category.append(category_text)
+# # Finding the category - check
+# category_text = cleanString(cat).strip()
+# category.append(category_text)
 # Finding the hrefs - check
 description_link = listing.find('h4').find('a')['href']
@@ -278,7 +278,7 @@ def vortex_links_parser(soup):
 # Returning all links that should be visited by the Crawler
 href = []
-listings = soup.find('main').findAll('div', {"class": "product-card col-sm-6 col-md-3 col-xl-4 mb-5"})
+listings = soup.find('main').findAll('div', {"class": "product-card col-sm-6 col-md-3 col-xl-4 mb-0"})
 for listing in listings:
 # Adding the url to the list of urls


Loading…
Cancel
Save