
finished torbay scraper

finished m00nkey description parser. need to work on m00nkey listing
main
Helium 1 year ago
parent commit d0ed3511f4
6 changed files with 1791 additions and 437 deletions
  1. +1439 -0     MarketPlaces/Initialization/geckodriver.log
  2. +15   -8     MarketPlaces/Initialization/prepare_parser.py
  3. +16   -16    MarketPlaces/M00nkeyMarket/crawler_selenium.py
  4. +229  -239   MarketPlaces/M00nkeyMarket/parser.py
  5. +12   -12    MarketPlaces/TorBay/crawler_selenium.py
  6. +80   -162   MarketPlaces/TorBay/parser.py

+ 1439 - 0   MarketPlaces/Initialization/geckodriver.log
File diff suppressed because it is too large


+ 15 - 8   MarketPlaces/Initialization/prepare_parser.py

@@ -10,6 +10,8 @@ from MarketPlaces.Tor2door.parser import *
 from MarketPlaces.Apocalypse.parser import *
 from MarketPlaces.ThiefWorld.parser import *
 from MarketPlaces.AnonymousMarketplace.parser import *
+from MarketPlaces.TorBay.parser import *
+from MarketPlaces.M00nkeyMarket.parser import *
 from MarketPlaces.Classifier.classify_product import predict
@@ -148,14 +150,18 @@ def new_parse(marketPlace, url, createLog):
                        rmm = thiefWorld_description_parser(soup)
                    elif marketPlace =="AnonymousMarketplace":
                        rmm = anonymousMarketplace_description_parser(soup)
+                   elif marketPlace == "TorBay":
+                       rmm = torbay_description_parser(soup)
+                   elif marketPlace == "M00nkeyMarket":
+                       rmm = m00nkey_description_parser(soup)

                    # key = u"Pr:" + rmm[0].upper()[:desc_lim1] + u" Vendor:" + rmm[13].upper()[:desc_lim2]
                    key = u"Url:" + os.path.basename(line2).replace(".html", "")

                    # save file address with description record in memory
                    detPage[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}

-               except :
+               except:

                    nError += 1
                    print("There was a problem to parse the file " + line2 + " in the Description section!")
@@ -187,7 +193,7 @@ def new_parse(marketPlace, url, createLog):
                readError = True

            if not readError:
-               print("Hello!")

                parseError = False
                try:
@@ -200,12 +206,15 @@ def new_parse(marketPlace, url, createLog):
                    elif marketPlace == "ThiefWorld":
                        rw = thiefWorld_listing_parser(soup)
                    elif marketPlace == "AnonymousMarketplace":
                        rw = anonymousMarketplace_listing_parser(soup)
+                   elif marketPlace == "TorBay":
+                       rw = torbay_listing_parser(soup)
+                   elif marketPlace == "M00nkeyMarket":
+                       rw = m00nkey_listing_parser(soup)
                    else:
                        parseError = True

-               except Exception as e:
-                   raise e
+               except:

                    nError += 1
                    print("There was a problem to parse the file " + line1 + " in the listing section!")
@@ -224,7 +233,6 @@ def new_parse(marketPlace, url, createLog):
                for rec in rw:

                    rec = rec.split(',')
-                   print(rec)

                    # if len(detPage) > 0: #It was created here just because Zeroday Market does not have Description Pages
                    #     key = rec[23]
@@ -232,7 +240,6 @@ def new_parse(marketPlace, url, createLog):
                    # key = u"Pr:" + rec[1].upper()[:list_lim1] + u" Vendor:" + rec[18].upper()[:list_lim2]
                    key = u"Url:" + cleanLink(rec[20])
-                   print(key)

                    # if the associated description page is parsed
                    if key in detPage:
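For reference, the lookup key built in both branches above is just the saved page's filename with the extension stripped; a minimal sketch of how it behaves (the path below is hypothetical, not from the repo):

    import os

    # The HTML filename (minus extension) becomes the key that later
    # matches a listing row back to its parsed description page.
    line2 = "HTML_Pages/M00nkeyMarket/Description/0ab3f2e1.html"  # hypothetical path
    key = u"Url:" + os.path.basename(line2).replace(".html", "")
    # key == u"Url:0ab3f2e1"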


+ 16 - 16   MarketPlaces/M00nkeyMarket/crawler_selenium.py

@@ -33,19 +33,19 @@ baseURL = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
-    # mktName = getMKTName()
-    driver = getAccess()
-
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
-
-    # new_parse(forumName, baseURL, False)
+    # opentor()
+    mktName = getMKTName()
+    # driver = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
+
+    new_parse(mktName, baseURL, False)


 # Opens Tor Browser
@@ -246,12 +246,12 @@ def crawlForum(driver):
                        driver.back()

                    # comment out
-                   break
+                   # break

                    # comment out
-                   # if count == 1:
-                   #     count = 0
-                   #     break
+                   if count == 1:
+                       count = 0
+                       break

            try:
                link = driver.find_element(by=By.LINK_TEXT, value='Next ›').get_attribute('href')
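The page limiter re-enabled above caps the crawl at a single listing page while the parsers are being tested; paging itself follows the site's 'Next ›' link. A minimal sketch of that lookup, assuming the same Selenium imports the crawler already uses (the helper name is illustrative):

    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    # Returns the URL of the next listing page, or None on the last page.
    def next_page_url(driver):
        try:
            return driver.find_element(by=By.LINK_TEXT, value='Next ›').get_attribute('href')
        except NoSuchElementException:
            return None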


+ 229 - 239   MarketPlaces/M00nkeyMarket/parser.py

@@ -1,4 +1,4 @@
-__author__ = 'DarkWeb'
+__author__ = 'Helium'

 # Here, we are importing the auxiliary functions to clean or convert data
 from MarketPlaces.Utilities.utilities import *

@@ -11,133 +11,168 @@ from bs4 import BeautifulSoup
 #stores info it needs in different lists, these lists are returned after being organized
 #@param: soup object looking at html page of description page
 #return: 'row' that contains a variety of lists that each hold info on the description page
-def darkfox_description_parser(soup):
+def m00nkey_description_parser(soup):

     # Fields to be parsed

-    name = "-1"            # 0 Product_Name
-    describe = "-1"        # 1 Product_Description
-    lastSeen = "-1"        # 2 Product_LastViewDate
-    rules = "-1"           # 3 NOT USED ...
-    CVE = "-1"             # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
-    MS = "-1"              # 5 Product_MS_Classification (Microsoft Security)
-    review = "-1"          # 6 Product_Number_Of_Reviews
-    category = "-1"        # 7 Product_Category
-    shipFrom = "-1"        # 8 Product_ShippedFrom
-    shipTo = "-1"          # 9 Product_ShippedTo
-    left = "-1"            # 10 Product_QuantityLeft
-    escrow = "-1"          # 11 Vendor_Warranty
-    terms = "-1"           # 12 Vendor_TermsAndConditions
-    vendor = "-1"          # 13 Vendor_Name
-    sold = "-1"            # 14 Product_QuantitySold
-    addDate = "-1"         # 15 Product_AddedDate
-    available = "-1"       # 16 NOT USED ...
-    endDate = "-1"         # 17 NOT USED ...
-    BTC = "-1"             # 18 Product_BTC_SellingPrice
-    USD = "-1"             # 19 Product_USD_SellingPrice
-    rating = "-1"          # 20 Vendor_Rating
-    success = "-1"         # 21 Vendor_Successful_Transactions
-    EURO = "-1"            # 22 Product_EURO_SellingPrice
+    vendor = "-1"          # 0 *Vendor_Name
+    success = "-1"         # 1 Vendor_Successful_Transactions
+    rating_vendor = "-1"   # 2 Vendor_Rating
+    name = "-1"            # 3 *Product_Name
+    describe = "-1"        # 4 Product_Description
+    CVE = "-1"             # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much
+    MS = "-1"              # 6 Product_MS_Classification (Microsoft Security) dont worry about that much
+    category = "-1"        # 7 Product_Category
+    views = "-1"           # 8 Product_Number_Of_Views
+    reviews = "-1"         # 9 Product_Number_Of_Reviews
+    rating_item = "-1"     # 10 Product_Rating
+    addDate = "-1"         # 11 Product_AddedDate
+    BTC = "-1"             # 12 Product_BTC_SellingPrice
+    USD = "-1"             # 13 Product_USD_SellingPrice
+    EURO = "-1"            # 14 Product_EURO_SellingPrice
+    sold = "-1"            # 15 Product_QuantitySold
+    left = "-1"            # 16 Product_QuantityLeft
+    shipFrom = "-1"        # 17 Product_ShippedFrom
+    shipTo = "-1"          # 18 Product_ShippedTo

-    # Finding Product Name
-    name = soup.find('h1').text
-    name = name.replace('\n', ' ')
-    name = name.replace(",", "")
-    name = name.strip()
-
-    # Finding Vendor
-    vendor = soup.find('h3').find('a').text.strip()
-
-    # Finding Vendor Rating
-    rating = soup.find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding Successful Transactions
-    success = soup.find('h3').text
-    success = success.replace("Vendor: ", "")
-    success = success.replace(vendor, "")
-    success = success.replace("(", "")
-    success = success.replace(")", "")
-    success = success.strip()
-
-    bae = soup.find('div', {'class': "box"}).find_all('ul')
-
-    # Finding Prices
-    USD = bae[1].find('strong').text.strip()
-
-    li = bae[2].find_all('li')
-
-    # Finding Escrow
-    escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding the Product Category
-    category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding the Product Quantity Available
-    left = li[3].find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding Number Sold
-    sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()
-
-    li = bae[3].find_all('li')
-
-    # Finding Shipment Information (Origin)
-    if "Ships from:" in li[-2].text:
-        shipFrom = li[-2].text
-        shipFrom = shipFrom.replace("Ships from: ", "")
-        # shipFrom = shipFrom.replace(",", "")
-        shipFrom = shipFrom.strip()
-
-    # Finding Shipment Information (Destination)
-    shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
-    shipTo = shipTo.replace("Ships to: ", "")
-    shipTo = shipTo.strip()
-    if "certain countries" in shipTo:
-        countries = ""
-        tags = li[-1].find_all('span', {'class': "tag"})
-        for tag in tags:
-            country = tag.text.strip()
-            countries += country + ", "
-        shipTo = countries.strip(", ")
-
-    # Finding the Product description
-    describe = soup.find('div', {'class': "pre-line"}).text
-    describe = describe.replace("\n", " ")
-    describe = describe.strip()
-
-    '''# Finding the Number of Product Reviews
-    tag = soup.findAll(text=re.compile('Reviews'))
-    for index in tag:
-        reviews = index
-        par = reviews.find('(')
-        if par >=0:
-            reviews = reviews.replace("Reviews (","")
-            reviews = reviews.replace(")","")
-            reviews = reviews.split(",")
-            review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
-        else :
-            review = "-1"'''
-
-    # Searching for CVE and MS categories
-    cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
-    if cve:
-        CVE = " "
-        for idx in cve:
-            CVE += (idx)
-            CVE += " "
-            CVE = CVE.replace(',', ' ')
-            CVE = CVE.replace('\n', '')
-    ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
-    if ms:
-        MS = " "
-        for im in ms:
-            MS += (im)
-            MS += " "
-            MS = MS.replace(',', ' ')
-            MS = MS.replace('\n', '')
+    #vendor name
+    try:
+        temp = soup.find('div', {'class': 'box rounded mb-0'}).find('a').text
+        vendor = (cleanString(temp.strip()))
+    except:
+        print("Error in vendor")
+
+    #successful transaction
+    try:
+        temp = soup.findAll('div', {'class','text-center text-truncate column-flex ml-1 mr-1'}) #card sidebar-menu mb-4 card sidebar-menu mb-4
+        temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'})
+        temp = temp2[1].text
+        success = (cleanString(temp.strip()))
+    except:
+        print("Error in successful")
+        success = "-1"
+
+    #vendor rating 5
+    try:
+        temp = soup.findAll('div', {'class', 'text-center text-truncate column-flex ml-1 mr-1'})  # card sidebar-menu mb-4 card sidebar-menu mb-4
+        temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'})
+        temp = temp2[5].text
+        rating_vendor = (cleanString(temp.strip()))
+    except:
+        print("Error in vendor rating")
+        rating_vendor = "-1"
+
+    # product name
+    try:
+        temp = soup.find('h3', {'class', 'h3 rounded card-title'}).find('span').text
+        name = (cleanString(temp.strip()))
+    except:
+        print("Error in product name")
+        name = "-1"
+
+    # product description
+    try:
+        describe = soup.find('div', {'class': "box rounded flex-fill"}).find('pre').text
+        if "\n" in describe:
+            describe = describe.replace("\n", " ")
+            describe = describe.replace("\r", " ")
+        describe = cleanString(describe.strip())
+    except:
+        print("Product description")
+        describe = "-1"
+
+    CVE = "-1"  # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much
+    MS = "-1"   # 6 Product_MS_Classification (Microsoft Security) dont worry about that much
+
+    # product category
+    try:
+        temp = soup.findAll('table', {'class', 'table table-hover'})
+        temp2 = temp[1].find('tr').findAll('td')
+        temp = temp2[1].text
+        category = cleanString(temp.strip())
+    except:
+        try:
+            temp = soup.find('table', {'class', 'table table-hover'})
+            temp2 = temp.find('tbody').find('tr').findAll('td')
+            temp = temp2[1].text
+            category = cleanString(temp.strip())
+        except:
+            print('Product category')
+            category = "-1"
+
+    # product number of view
+    try:
+        temp = soup.find('div', {'class', 'box rounded mb-0'})
+        temp2 = temp.findAll('i')
+        temp = temp2[2].text
+        views = cleanString((temp.strip()))
+    except:
+        print('Product number of view')
+        views = "-1"
+
+    reviews = "-1"      # 9 Product_Number_Of_Reviews
+    rating_item = "-1"  # 10 Product_Rating
+    addDate = "-1"      # 11 Product_AddedDate
+
+    #BTC selling price box box-rounded mt-2
+    try:
+        temp = soup.find('div', {'class', 'box box-rounded mt-2'})
+        temp2 = temp.findAll('i', {'class', 'float-right color-prices'})
+        temp = temp2[1].text
+        BTC = cleanString((temp.strip()))
+    except:
+        print('Product BTC')
+        BTC = "-1"
+
+    # USD selling price
+    try:
+        temp = soup.find('div', {'class', 'box box-rounded mt-2'})
+        temp2 = temp.findAll('center')
+        temp = temp2[1].find('i').text
+        if "$" in temp:
+            temp = temp.replace("$", "")
+        USD = cleanString((temp.strip()))
+    except:
+        print('Product USD')
+        USD = "-1"
+
+    EURO = "-1"  # 14 Product_EURO_SellingPrice
+
+    # product sold
+    try:
+        temp = soup.find('div', {'class', 'box rounded mb-0'})  # card sidebar-menu mb-4 card sidebar-menu mb-4
+        temp2 = temp.find('i')
+        temp = temp2.text
+        sold = (cleanString(temp.strip()))
+    except:
+        print("Error in successful")
+        sold = "-1"
+
+    # product quantatiy left ###ERRROR
+    try:
+        temp = soup.findAll('table', {'class', 'table table-hover'})
+        temp2 = temp[1].findAll('tr')
+        temp3 = temp2[1].findAll('td')
+        temp = temp3[1].text
+        left = cleanString(temp.strip())
+    except:
+        try:
+            temp = soup.find('table', {'class', 'table table-hover'})
+            temp2 = temp.findAll('tr')
+            temp3 = temp2[1].findAll('td')
+            temp = temp3[1].text
+            left = cleanString(temp.strip())
+        except:
+            print('Product quantity')
+            left = "-1"
+
+    shipFrom = "-1"  # 17 Product_ShippedFrom
+    shipTo = "-1"    # 18 Product_ShippedTo

     # Populating the final variable (this should be a list with all fields scraped)
-    row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
-           sold, addDate, available, endDate, BTC, USD, rating, success, EURO)
+    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
+           BTC, USD, EURO, sold, left, shipFrom, shipTo)

     # Sending the results
     return row
@@ -147,131 +182,86 @@ def darkfox_description_parser(soup):
 #stores info it needs in different lists, these lists are returned after being organized
 #@param: soup object looking at html page of listing page
 #return: 'row' that contains a variety of lists that each hold info on the listing page
-def darkfox_listing_parser(soup):
+def m00nkey_listing_parser(soup):

     # Fields to be parsed
-    nm = 0                      # Total_Products (Should be Integer)
-    mktName = "DarkFox"         # 0 Marketplace_Name
-    name = []                   # 1 Product_Name
-    CVE = []                    # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
-    MS = []                     # 3 Product_MS_Classification (Microsoft Security)
-    category = []               # 4 Product_Category
-    describe = []               # 5 Product_Description
-    escrow = []                 # 6 Vendor_Warranty
-    views = []                  # 7 Product_Number_Of_Views
-    reviews = []                # 8 Product_Number_Of_Reviews
-    addDate = []                # 9 Product_AddDate
-    lastSeen = []               # 10 Product_LastViewDate
-    BTC = []                    # 11 Product_BTC_SellingPrice
-    USD = []                    # 12 Product_USD_SellingPrice
-    EURO = []                   # 13 Product_EURO_SellingPrice
-    sold = []                   # 14 Product_QuantitySold
-    qLeft =[]                   # 15 Product_QuantityLeft
-    shipFrom = []               # 16 Product_ShippedFrom
-    shipTo = []                 # 17 Product_ShippedTo
-    vendor = []                 # 18 Vendor
-    rating = []                 # 19 Vendor_Rating
-    success = []                # 20 Vendor_Successful_Transactions
-    href = []                   # 23 Product_Links (Urls)
-
-    listing = soup.findAll('div', {"class": "card"})
+    nm = 0                      # *Total_Products (Should be Integer)
+    mktName = "M00nkeyMarket"   # 0 *Marketplace_Name
+    vendor = []                 # 1 *Vendor y
+    rating_vendor = []          # 2 Vendor_Rating
+    success = []                # 3 Vendor_Successful_Transactions
+    name = []                   # 4 *Product_Name y
+    CVE = []                    # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
+    MS = []                     # 6 Product_MS_Classification (Microsoft Security) dont worry about this
+    category = []               # 7 Product_Category y
+    describe = []               # 8 Product_Description
+    views = []                  # 9 Product_Number_Of_Views
+    reviews = []                # 10 Product_Number_Of_Reviews
+    rating_item = []            # 11 Product_Rating
+    addDate = []                # 12 Product_AddDate
+    BTC = []                    # 13 Product_BTC_SellingPrice
+    USD = []                    # 14 Product_USD_SellingPrice y
+    EURO = []                   # 15 Product_EURO_SellingPrice
+    sold = []                   # 16 Product_QuantitySold
+    qLeft = []                  # 17 Product_QuantityLeft
+    shipFrom = []               # 18 Product_ShippedFrom
+    shipTo = []                 # 19 Product_ShippedTo
+    href = []                   # 20 Product_Links
+
+    listing = soup.findAll('div', {"class": "card mt-1"})

     # Populating the Number of Products
     nm = len(listing)

     for a in listing:
-        bae = a.findAll('a', href=True)
-
-        # Adding the url to the list of urls
-        link = bae[0].get('href')
-        link = cleanLink(link)
-        href.append(link)
-
-        # Finding the Product
-        product = bae[1].find('p').text
-        product = product.replace('\n', ' ')
-        product = product.replace(",", "")
-        product = product.replace("...", "")
-        product = product.strip()
-        name.append(product)
-
-        bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
-
-        if len(bae) >= 5:
-            # Finding Prices
-            price = bae[0].text
-            ud = price.replace(" USD", " ")
-            # u = ud.replace("$","")
-            u = ud.replace(",", "")
-            u = u.strip()
-            USD.append(u)
-            # bc = (prc[1]).strip(' BTC')
-            # BTC.append(bc)
-
-            # Finding the Vendor
-            vendor_name = bae[1].find('a').text
-            vendor_name = vendor_name.replace(",", "")
-            vendor_name = vendor_name.strip()
-            vendor.append(vendor_name)
-
-            # Finding the Category
-            cat = bae[2].find('small').text
-            cat = cat.replace("Category: ", "")
-            cat = cat.replace(",", "")
-            cat = cat.strip()
-            category.append(cat)
-
-            # Finding Number Sold and Quantity Left
-            num = bae[3].text
-            num = num.replace("Sold: ", "")
-            num = num.strip()
-            sold.append(num)
-
-            quant = bae[4].find('small').text
-            quant = quant.replace("In stock: ", "")
-            quant = quant.strip()
-            qLeft.append(quant)
-
-            # Finding Successful Transactions
-            freq = bae[1].text
-            freq = freq.replace(vendor_name, "")
-            freq = re.sub(r'Vendor Level \d+', "", freq)
-            freq = freq.replace("(", "")
-            freq = freq.replace(")", "")
-            freq = freq.strip()
-            success.append(freq)
-
-        # Searching for CVE and MS categories
-        cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
-        if not cve:
-            cveValue="-1"
-        else:
-            cee = " "
-            for idx in cve:
-                cee += (idx)
-                cee += " "
-                cee = cee.replace(',', ' ')
-                cee = cee.replace('\n', '')
-            cveValue=cee
-        CVE.append(cveValue)
-
-        ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
-        if not ms:
-            MSValue="-1"
-        else:
-            me = " "
-            for im in ms:
-                me += (im)
-                me += " "
-                me = me.replace(',', ' ')
-                me = me.replace('\n', '')
-            MSValue=me
-        MS.append(MSValue)
+        # vendor
+        try:
+            temp = a.find('col-5 justify-content-between mx-auto').find('a').text
+            vendor.append(cleanString(temp.strip()))
+        except:
+            print('vendor')
+
+        #vendor rating
+
+        #successful transactions
+        try:
+            temp = a.find('col-5 justify-content-between mx-auto').find('div').text
+            success.append(cleanString(temp.strip()))
+        except:
+            print('vendor')
+
+        # product name
+        try:
+            temp = a.find('card-title rounded text-truncate').find('a').text
+            name.append(cleanString(temp.strip()))
+        except:
+            print('vendor')
+
+        CVE.append('-1')
+        MS.append('-1')
+        rating_vendor.append("-1")
+
+        category = []               # 7 Product_Category y
+        describe = []               # 8 Product_Description
+        views = []                  # 9 Product_Number_Of_Views
+        reviews = []                # 10 Product_Number_Of_Reviews
+        rating_item = []            # 11 Product_Rating
+        addDate = []                # 12 Product_AddDate
+        BTC = []                    # 13 Product_BTC_SellingPrice
+        USD = []                    # 14 Product_USD_SellingPrice y
+        EURO = []                   # 15 Product_EURO_SellingPrice
+        sold = []                   # 16 Product_QuantitySold
+        qLeft = []                  # 17 Product_QuantityLeft
+        shipFrom = []               # 18 Product_ShippedFrom
+        shipTo = []                 # 19 Product_ShippedTo
+        href = []                   # 20 Product_Links

     # Populate the final variable (this should be a list with all fields scraped)
-    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
-                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
+    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)


 #called by the crawler to get description links on a listing page
 #@param: beautifulsoup object that is using the correct html page (listing page)
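As a usage sketch, the new description parser takes a BeautifulSoup object of a saved description page and returns the 19-field tuple in the order declared at the top of the function (the file path below is hypothetical; in the pipeline, new_parse supplies pages the crawler cached):

    from bs4 import BeautifulSoup

    # Hypothetical saved page; new_parse normally reads these from disk.
    with open('m00nkey_description.html', 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    row = m00nkey_description_parser(soup)
    vendor, rating_vendor, success, name = row[0:4]  # field order matches the numbered comments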


+ 12 - 12   MarketPlaces/TorBay/crawler_selenium.py

@@ -32,19 +32,19 @@ baseURL = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
+    # opentor()
     mktName = getMKTName()
-    driver = getAccess()
-
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
-
-    # new_parse(forumName, baseURL, False)
+    # driver = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
+    #
+    new_parse(mktName, baseURL, False)


 # Opens Tor Browser
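With the crawl commented out, startCrawling here reduces to a parse-only pass over pages saved by earlier runs; the equivalent two lines, using names from this file (getMKTName presumably returns the market name used to pick the parsers):

    mktName = getMKTName()              # e.g. 'TorBay' (assumed return value)
    new_parse(mktName, baseURL, False)  # third argument is createLog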


+ 80 - 162   MarketPlaces/TorBay/parser.py

@@ -35,88 +35,51 @@ def torbay_description_parser(soup):
     shipTo = "-1"                           # 18 Product_ShippedTo

     # Finding Product Name
-    name = soup.find('div', {'class': 'product-information'}).find('h1').text.strip()
-
-    # Finding Vendor
-    vendor = soup.find('div', {"class": "profile-info"}).find('a').text.strip()
-
-    # Finding Vendor Rating
-    rating_vendor.append(-1)
-
-    # Finding Successful Transactions
-    success.append(-1)
-
-    bae = soup.find('div', {'class': "box"}).find_all('ul')
+    try:
+        product_name = soup.find('div', {'class': 'product-information'}).find('h1').text
+        name = cleanString(product_name.strip())
+    except:
+        try:
+            product_name = soup.find('div', {'class': 'profile-info'}).find('h2').text
+            name = cleanString(product_name.strip())
+        except:
+            # print(e)
+            print("product name")
+
+    # Finding Vendor FIx
+    try:
+        vendor_name = soup.find('div', {"class": "profile-info"}).find('h2').text
+        vendor = cleanString(vendor_name.strip())
+    except:
+        print("description vendor name failed\n")

     # Finding Prices
-    USD = soup.find('div', {'class': "total-price"}).find('span').text.strip()
+    try:
+        USD = soup.find('div', {'class': "total-price"}).find('span').text.strip()
+    except:
+        print("description price failed\n")

     # Finding the Product Category
-    category = soup.find('div', {'class': "profile-info"}).find('p').find('a').text.strip()
-
-    # Finding the Product Quantity Available
-    left.append(-1)
-
-    # Finding Number Sold
-    sold.append(-1)
-
-    li = bae[3].find_all('li')
-
-    # Finding Shipment Information (Origin)
-    if "Ships from:" in li[-2].text:
-        shipFrom = li[-2].text
-        shipFrom = shipFrom.replace("Ships from: ", "")
-        # shipFrom = shipFrom.replace(",", "")
-        shipFrom = shipFrom.strip()
-
-    # Finding Shipment Information (Destination)
-    shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
-    shipTo = shipTo.replace("Ships to: ", "")
-    shipTo = shipTo.strip()
-    if "certain countries" in shipTo:
-        countries = ""
-        tags = li[-1].find_all('span', {'class': "tag"})
-        for tag in tags:
-            country = tag.text.strip()
-            countries += country + ", "
-        shipTo = countries.strip(", ")
+    try:
+        cat = soup.find('div', {'class': "profile-info"}).find('p').text
+        category = cleanString(cat.strip())
+    except:
+        print("description product category failed")

     # Finding the Product description
-    describe = soup.find('div', {'class': "pre-line"}).text
-    describe = describe.replace("\n", " ")
-    describe = describe.strip()
-
-    '''# Finding the Number of Product Reviews
-    tag = soup.findAll(text=re.compile('Reviews'))
-    for index in tag:
-        reviews = index
-        par = reviews.find('(')
-        if par >=0:
-            reviews = reviews.replace("Reviews (","")
-            reviews = reviews.replace(")","")
-            reviews = reviews.split(",")
-            review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
-        else :
-            review = "-1"'''
-
-    # Searching for CVE and MS categories
-    cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
-    if cve:
-        CVE = " "
-        for idx in cve:
-            CVE += (idx)
-            CVE += " "
-            CVE = CVE.replace(',', ' ')
-            CVE = CVE.replace('\n', '')
-    ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
-    if ms:
-        MS = " "
-        for im in ms:
-            MS += (im)
-            MS += " "
-            MS = MS.replace(',', ' ')
-            MS = MS.replace('\n', '')
+    try:
+        describe = soup.find('div', {'class': "info"}).find('p').text
+        if "\n" in describe:
+            describe = describe.replace("\n", " ")
+            describe = describe.replace("\r", " ")
+        describe = cleanString(describe.strip())
+    except:
+        # print("product desc")
+        try:
+            describe = soup.find('div', {'class': 'info'}).text
+            describe = cleanString(describe.strip())
+        except:
+            print("Product description")

     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
@@ -162,93 +125,48 @@ def torbay_listing_parser(soup):
     nm = len(listing)

     for a in listing:
-        bae = a.findAll('a', href=True)
-
-        # Adding the url to the list of urls
-        link = bae[0].get('href')
-        link = cleanLink(link)
-        href.append(link)
-
-        # Finding the Product
-        product = bae[1].find('p').text
-        product = product.replace('\n', ' ')
-        product = product.replace(",", "")
-        product = product.replace("...", "")
-        product = product.strip()
-        name.append(product)
-
-        bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
-
-        if len(bae) >= 5:
-            # Finding Prices
-            price = bae[0].text
-            ud = price.replace(" USD", " ")
-            # u = ud.replace("$","")
-            u = ud.replace(",", "")
-            u = u.strip()
-            USD.append(u)
-            # bc = (prc[1]).strip(' BTC')
-            # BTC.append(bc)
-
-            # Finding the Vendor
-            vendor_name = bae[1].find('a').text
-            vendor_name = vendor_name.replace(",", "")
-            vendor_name = vendor_name.strip()
-            vendor.append(vendor_name)
-
-            # Finding the Category
-            cat = bae[2].find('small').text
-            cat = cat.replace("Category: ", "")
-            cat = cat.replace(",", "")
-            cat = cat.strip()
-            category.append(cat)
-
-            # Finding Number Sold and Quantity Left
-            num = bae[3].text
-            num = num.replace("Sold: ", "")
-            num = num.strip()
-            sold.append(num)
-
-            quant = bae[4].find('small').text
-            quant = quant.replace("In stock: ", "")
-            quant = quant.strip()
-            qLeft.append(quant)
-
-            # Finding Successful Transactions
-            freq = bae[1].text
-            freq = freq.replace(vendor_name, "")
-            freq = re.sub(r'Vendor Level \d+', "", freq)
-            freq = freq.replace("(", "")
-            freq = freq.replace(")", "")
-            freq = freq.strip()
-            success.append(freq)
-
-        # Searching for CVE and MS categories
-        cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
-        if not cve:
-            cveValue="-1"
-        else:
-            cee = " "
-            for idx in cve:
-                cee += (idx)
-                cee += " "
-                cee = cee.replace(',', ' ')
-                cee = cee.replace('\n', '')
-            cveValue=cee
-        CVE.append(cveValue)
-
-        ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
-        if not ms:
-            MSValue="-1"
-        else:
-            me = " "
-            for im in ms:
-                me += (im)
-                me += " "
-                me = me.replace(',', ' ')
-                me = me.replace('\n', '')
-            MSValue=me
-        MS.append(MSValue)
+        try:
+            product_name = a.find('p', {'class': 'name'}).text
+            name.append(cleanString(product_name.strip()))
+        except:
+            print("product name")
+        try:
+            prod = a.find('p', {'class': 'price'}).text  # price
+            USD.append(cleanString(prod.strip()))
+        except:
+            print("USD")
+        try:
+            ven = a.find('div', {'class': 'pc-footer'}).find('div').find('a').text  # pc-footer
+            vendor.append(cleanString(ven.strip()))
+            # print(ven)
+        except:
+            print("vendor")
+        try:
+            h = a.find('p', {'class': 'name'}).find('a').get('href')
+            href.append(h)
+        except:
+            print("in href")
+
+        CVE.append("-1")
+        MS.append("-1")
+        rating_vendor.append("-1")
+        success.append("-1")
+        describe.append("-1")
+        views.append("-1")
+        reviews.append("-1")
+        rating_item.append("-1")
+        addDate.append("-1")
+        BTC.append("-1")
+        EURO.append("-1")
+        sold.append("-1")
+        qLeft.append("-1")
+        shipFrom.append("-1")
+        shipTo.append("-1")
+        category.append("Hacking")

     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
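One invariant both listing parsers maintain: every field list receives exactly one append per product (with "-1" padding for fields a page does not expose), so organizeProducts can zip the lists positionally. A minimal sketch of that check (a hypothetical helper, not in the repo):

    # Each field list must stay in lockstep with the product count nm.
    def check_alignment(nm, *field_lists):
        for fl in field_lists:
            assert len(fl) == nm, "field list out of sync with product count"

    # e.g. check_alignment(nm, vendor, name, USD, href)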

