
Finished TorBay scraper.

Finished M00nkey description parser; still need to work on the M00nkey listing parser.
main
Helium 1 year ago
parent
commit
d0ed3511f4
6 changed files with 1791 additions and 437 deletions
  1. MarketPlaces/Initialization/geckodriver.log   +1439 -0
  2. MarketPlaces/Initialization/prepare_parser.py   +15 -8
  3. MarketPlaces/M00nkeyMarket/crawler_selenium.py   +16 -16
  4. MarketPlaces/M00nkeyMarket/parser.py   +229 -239
  5. MarketPlaces/TorBay/crawler_selenium.py   +12 -12
  6. MarketPlaces/TorBay/parser.py   +80 -162

+1439 -0   MarketPlaces/Initialization/geckodriver.log
File diff suppressed because it is too large


+15 -8   MarketPlaces/Initialization/prepare_parser.py

@@ -10,6 +10,8 @@ from MarketPlaces.Tor2door.parser import *
from MarketPlaces.Apocalypse.parser import *
from MarketPlaces.ThiefWorld.parser import *
from MarketPlaces.AnonymousMarketplace.parser import *
from MarketPlaces.TorBay.parser import *
from MarketPlaces.M00nkeyMarket.parser import *
from MarketPlaces.Classifier.classify_product import predict
@@ -148,14 +150,18 @@ def new_parse(marketPlace, url, createLog):
rmm = thiefWorld_description_parser(soup)
elif marketPlace =="AnonymousMarketplace":
rmm = anonymousMarketplace_description_parser(soup)
elif marketPlace == "TorBay":
rmm = torbay_description_parser(soup)
elif marketPlace == "M00nkeyMarket":
rmm = m00nkey_description_parser(soup)
# key = u"Pr:" + rmm[0].upper()[:desc_lim1] + u" Vendor:" + rmm[13].upper()[:desc_lim2]
key = u"Url:" + os.path.basename(line2).replace(".html", "")
# save file address with description record in memory
detPage[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
- except :
+ except:
nError += 1
print("There was a problem to parse the file " + line2 + " in the Description section!")
@@ -187,7 +193,7 @@ def new_parse(marketPlace, url, createLog):
readError = True
if not readError:
print("Hello!")
parseError = False
try:
@@ -200,12 +206,15 @@ def new_parse(marketPlace, url, createLog):
elif marketPlace == "ThiefWorld":
rw = thiefWorld_listing_parser(soup)
elif marketPlace == "AnonymousMarketplace":
rw = anonymousMarketplace_listing_parser(soup)
elif marketPlace == "TorBay":
rw = torbay_listing_parser(soup)
elif marketPlace == "M00nkeyMarket":
rw = m00nkey_listing_parser(soup)
else:
parseError = True
- except Exception as e:
- raise e
+ except:
nError += 1
print("There was a problem to parse the file " + line1 + " in the listing section!")
@@ -224,7 +233,6 @@ def new_parse(marketPlace, url, createLog):
for rec in rw:
rec = rec.split(',')
print(rec)
# if len(detPage) > 0: #It was created here just because Zeroday Market does not have Description Pages
# key = rec[23]
@@ -232,7 +240,6 @@ def new_parse(marketPlace, url, createLog):
# key = u"Pr:" + rec[1].upper()[:list_lim1] + u" Vendor:" + rec[18].upper()[:list_lim2]
key = u"Url:" + cleanLink(rec[20])
print(key)
# if the associated description page is parsed
if key in detPage:
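For context, new_parse joins each listing record to its description record through a URL-derived key: the description side stores rows under u"Url:" + basename of the saved file, and the listing side rebuilds the same key from the record's link column. A minimal sketch of that lookup, assuming rec[20] holds the product link and detPage maps keys to {'rmm': ..., 'filename': ...} as in the hunks above:

def merge_listing_with_description(rec, detPage):
    # rebuild the same key format used when the description pages were parsed
    key = u"Url:" + cleanLink(rec[20])
    if key in detPage:
        # parsed description fields for this product
        rmm = detPage[key]['rmm']
        return rec, rmm
    # description page missing or failed to parse
    return rec, None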


+16 -16   MarketPlaces/M00nkeyMarket/crawler_selenium.py

@@ -33,19 +33,19 @@ baseURL = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion
# Opens Tor Browser, crawls the website, then parses, then closes Tor
# acts like the main method for the crawler; another function at the end of this file calls it
def startCrawling():
- opentor()
- # mktName = getMKTName()
- driver = getAccess()
+ # opentor()
+ mktName = getMKTName()
+ # driver = getAccess()
+ #
+ # if driver != 'down':
+ # try:
+ # login(driver)
+ # crawlForum(driver)
+ # except Exception as e:
+ # print(driver.current_url, e)
+ # closetor(driver)
- if driver != 'down':
- try:
- login(driver)
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closetor(driver)
- # new_parse(forumName, baseURL, False)
+ new_parse(mktName, baseURL, False)
# Opens Tor Browser
@ -246,12 +246,12 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
- # if count == 1:
- # count = 0
- # break
+ if count == 1:
+ count = 0
+ break
try:
link = driver.find_element(by=By.LINK_TEXT, value='Next ›').get_attribute('href')
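The pagination tail shown above follows the 'Next ›' link until it disappears. A minimal sketch of that loop under the same assumptions (a live Selenium driver already on a listing page; processPage is a hypothetical stand-in for the save/parse step):

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

def paginate(driver, processPage):
    while True:
        processPage(driver.page_source)  # hypothetical save/parse hook
        try:
            link = driver.find_element(by=By.LINK_TEXT, value='Next ›').get_attribute('href')
        except NoSuchElementException:
            break  # no 'Next ›' link on the last page
        if not link:
            break
        driver.get(link)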


+229 -239   MarketPlaces/M00nkeyMarket/parser.py

@@ -1,4 +1,4 @@
- __author__ = 'DarkWeb'
+ __author__ = 'Helium'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
@@ -11,133 +11,168 @@ from bs4 import BeautifulSoup
# stores the info it needs in different lists; these lists are returned after being organized
# @param: soup object looking at the html page of a description page
# @return: 'row' that contains a variety of lists that each hold info on the description page
- def darkfox_description_parser(soup):
+ def m00nkey_description_parser(soup):
# Fields to be parsed
name = "-1" # 0 Product_Name
describe = "-1" # 1 Product_Description
lastSeen = "-1" # 2 Product_LastViewDate
rules = "-1" # 3 NOT USED ...
CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 5 Product_MS_Classification (Microsoft Security)
review = "-1" # 6 Product_Number_Of_Reviews
category = "-1" # 7 Product_Category
shipFrom = "-1" # 8 Product_ShippedFrom
shipTo = "-1" # 9 Product_ShippedTo
left = "-1" # 10 Product_QuantityLeft
escrow = "-1" # 11 Vendor_Warranty
terms = "-1" # 12 Vendor_TermsAndConditions
vendor = "-1" # 13 Vendor_Name
sold = "-1" # 14 Product_QuantitySold
addDate = "-1" # 15 Product_AddedDate
available = "-1" # 16 NOT USED ...
endDate = "-1" # 17 NOT USED ...
BTC = "-1" # 18 Product_BTC_SellingPrice
USD = "-1" # 19 Product_USD_SellingPrice
rating = "-1" # 20 Vendor_Rating
success = "-1" # 21 Vendor_Successful_Transactions
EURO = "-1" # 22 Product_EURO_SellingPrice
# Finding Product Name
name = soup.find('h1').text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
# Finding Vendor
vendor = soup.find('h3').find('a').text.strip()
# Finding Vendor Rating
rating = soup.find('span', {'class': "tag is-dark"}).text.strip()
# Finding Successful Transactions
success = soup.find('h3').text
success = success.replace("Vendor: ", "")
success = success.replace(vendor, "")
success = success.replace("(", "")
success = success.replace(")", "")
success = success.strip()
bae = soup.find('div', {'class': "box"}).find_all('ul')
# Finding Prices
USD = bae[1].find('strong').text.strip()
li = bae[2].find_all('li')
# Finding Escrow
escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Category
category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Quantity Available
left = li[3].find('span', {'class': "tag is-dark"}).text.strip()
# Finding Number Sold
sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()
li = bae[3].find_all('li')
# Finding Shipment Information (Origin)
if "Ships from:" in li[-2].text:
shipFrom = li[-2].text
shipFrom = shipFrom.replace("Ships from: ", "")
# shipFrom = shipFrom.replace(",", "")
shipFrom = shipFrom.strip()
# Finding Shipment Information (Destination)
shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
shipTo = shipTo.replace("Ships to: ", "")
shipTo = shipTo.strip()
if "certain countries" in shipTo:
countries = ""
tags = li[-1].find_all('span', {'class': "tag"})
for tag in tags:
country = tag.text.strip()
countries += country + ", "
shipTo = countries.strip(", ")
# Finding the Product description
describe = soup.find('div', {'class': "pre-line"}).text
describe = describe.replace("\n", " ")
describe = describe.strip()
'''# Finding the Number of Product Reviews
tag = soup.findAll(text=re.compile('Reviews'))
for index in tag:
reviews = index
par = reviews.find('(')
if par >=0:
reviews = reviews.replace("Reviews (","")
reviews = reviews.replace(")","")
reviews = reviews.split(",")
review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
else :
review = "-1"'''
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
#vendor name
try:
temp = soup.find('div', {'class': 'box rounded mb-0'}).find('a').text
vendor = (cleanString(temp.strip()))
except:
print("Error in vendor")
#successful transaction
try:
temp = soup.findAll('div', {'class','text-center text-truncate column-flex ml-1 mr-1'}) #card sidebar-menu mb-4 card sidebar-menu mb-4
temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'})
temp = temp2[1].text
success = (cleanString(temp.strip()))
except:
print("Error in successful")
sucess = "-1"
#vendor rating 5
try:
temp = soup.findAll('div', {'class', 'text-center text-truncate column-flex ml-1 mr-1'}) # card sidebar-menu mb-4 card sidebar-menu mb-4
temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'})
temp = temp2[5].text
rating_vendor = (cleanString(temp.strip()))
except:
print("Error in vendor rating")
rating_vendor = "-1"
# product name
try:
temp = soup.find('h3', {'class', 'h3 rounded card-title'}).find('span').text
name = (cleanString(temp.strip()))
except:
print("Error in product name")
name = "-1"
# product description
try:
describe = soup.find('div', {'class': "box rounded flex-fill"}).find('pre').text
if "\n" in describe:
describe = describe.replace("\n", " ")
describe = describe.replace("\r", " ")
describe = cleanString(describe.strip())
except:
print("Product description")
describe = "-1"
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much
# product category
try:
temp = soup.findAll('table', {'class', 'table table-hover'})
temp2 = temp[1].find('tr').findAll('td')
temp = temp2[1].text
category = cleanString(temp.strip())
except:
try:
temp = soup.find('table', {'class', 'table table-hover'})
temp2 = temp.find('tbody').find('tr').findAll('td')
temp = temp2[1].text
category = cleanString(temp.strip())
except:
print('Error in product category')
category = "-1"
# product number of views
try:
temp = soup.find('div', {'class', 'box rounded mb-0'})
temp2 = temp.findAll('i')
temp = temp2[2].text
views = cleanString((temp.strip()))
except:
print('Error in product views')
views = "-1"
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
#BTC selling price box box-rounded mt-2
try:
temp = soup.find('div', {'class', 'box box-rounded mt-2'})
temp2 = temp.findAll('i', {'class', 'float-right color-prices'})
temp = temp2[1].text
BTC = cleanString((temp.strip()))
except:
print('Error in product BTC price')
BTC = "-1"
# USD selling price
try:
temp = soup.find('div', {'class', 'box box-rounded mt-2'})
temp2 = temp.findAll('center')
temp = temp2[1].find('i').text
if "$" in temp:
temp = temp.replace("$", "")
USD = cleanString((temp.strip()))
except:
print('Error in product USD price')
USD = "-1"
EURO = "-1" # 14 Product_EURO_SellingPrice
# product sold
try:
temp = soup.find('div', {'class', 'box rounded mb-0'}) # card sidebar-menu mb-4 card sidebar-menu mb-4
temp2 = temp.find('i')
temp = temp2.text
sold = (cleanString(temp.strip()))
except:
print("Error in successful")
sold = "-1"
# product quantity left ### ERROR
try:
temp = soup.findAll('table', {'class', 'table table-hover'})
temp2 = temp[1].findAll('tr')
temp3 = temp2[1].findAll('td')
temp = temp3[1].text
left = cleanString(temp.strip())
except:
try:
temp = soup.find('table', {'class', 'table table-hover'})
temp2 = temp.findAll('tr')
temp3 = temp2[1].findAll('td')
temp = temp3[1].text
left = cleanString(temp.strip())
except:
print('Error in product quantity')
left = "-1"
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
# Populating the final variable (this should be a list with all fields scraped)
- row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
- sold, addDate, available, endDate, BTC, USD, rating, success, EURO)
+ row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
+ BTC, USD, EURO, sold, left, shipFrom, shipTo)
# Sending the results
return row
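A quick way to exercise the new description parser against a saved page (the file path here is hypothetical; the field indexes match the row tuple above):

from bs4 import BeautifulSoup

with open('m00nkey_description.html', 'r', encoding='utf-8') as f:  # hypothetical saved page
    soup = BeautifulSoup(f.read(), 'html.parser')

row = m00nkey_description_parser(soup)
print(row[0], row[3], row[13])  # vendor, product name, USD price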
@@ -147,131 +182,86 @@ def darkfox_description_parser(soup):
# stores the info it needs in different lists; these lists are returned after being organized
# @param: soup object looking at the html page of a listing page
# @return: 'row' that contains a variety of lists that each hold info on the listing page
- def darkfox_listing_parser(soup):
+ def m00nkey_listing_parser(soup):
# Fields to be parsed
nm = 0 # Total_Products (Should be Integer)
mktName = "DarkFox" # 0 Marketplace_Name
name = [] # 1 Product_Name
CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 3 Product_MS_Classification (Microsoft Security)
category = [] # 4 Product_Category
describe = [] # 5 Product_Description
escrow = [] # 6 Vendor_Warranty
views = [] # 7 Product_Number_Of_Views
reviews = [] # 8 Product_Number_Of_Reviews
addDate = [] # 9 Product_AddDate
lastSeen = [] # 10 Product_LastViewDate
BTC = [] # 11 Product_BTC_SellingPrice
USD = [] # 12 Product_USD_SellingPrice
EURO = [] # 13 Product_EURO_SellingPrice
sold = [] # 14 Product_QuantitySold
qLeft =[] # 15 Product_QuantityLeft
shipFrom = [] # 16 Product_ShippedFrom
shipTo = [] # 17 Product_ShippedTo
vendor = [] # 18 Vendor
rating = [] # 19 Vendor_Rating
success = [] # 20 Vendor_Successful_Transactions
href = [] # 23 Product_Links (Urls)
listing = soup.findAll('div', {"class": "card"})
nm = 0 # *Total_Products (Should be Integer)
mktName = "M00nkeyMarket" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) don't worry about this
MS = [] # 6 Product_MS_Classification (Microsoft Security) don't worry about this
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft = [] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
listing = soup.findAll('div', {"class": "card mt-1"})
# Populating the Number of Products
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# vendor
try:
temp = a.find(class_='col-5 justify-content-between mx-auto').find('a').text
vendor.append(cleanString(temp.strip()))
except:
print('Error in vendor')
# vendor rating
# successful transactions
try:
temp = a.find(class_='col-5 justify-content-between mx-auto').find('div').text
success.append(cleanString(temp.strip()))
except:
print('Error in successful transactions')
# product name
try:
temp = a.find(class_='card-title rounded text-truncate').find('a').text
name.append(cleanString(temp.strip()))
except:
print('Error in product name')
CVE.append('-1')
MS.append('-1')
rating_vendor.append("-1")
category.append("-1") # 7 Product_Category
describe.append("-1") # 8 Product_Description
views.append("-1") # 9 Product_Number_Of_Views
reviews.append("-1") # 10 Product_Number_Of_Reviews
rating_item.append("-1") # 11 Product_Rating
addDate.append("-1") # 12 Product_AddDate
BTC.append("-1") # 13 Product_BTC_SellingPrice
USD.append("-1") # 14 Product_USD_SellingPrice
EURO.append("-1") # 15 Product_EURO_SellingPrice
sold.append("-1") # 16 Product_QuantitySold
qLeft.append("-1") # 17 Product_QuantityLeft
shipFrom.append("-1") # 18 Product_ShippedFrom
shipTo.append("-1") # 19 Product_ShippedTo
# Finding the Product
product = bae[1].find('p').text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.replace("...", "")
product = product.strip()
name.append(product)
bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
if len(bae) >= 5:
# Finding Prices
price = bae[0].text
ud = price.replace(" USD", " ")
# u = ud.replace("$","")
u = ud.replace(",", "")
u = u.strip()
USD.append(u)
# bc = (prc[1]).strip(' BTC')
# BTC.append(bc)
# Finding the Vendor
vendor_name = bae[1].find('a').text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
# Finding the Category
cat = bae[2].find('small').text
cat = cat.replace("Category: ", "")
cat = cat.replace(",", "")
cat = cat.strip()
category.append(cat)
# Finding Number Sold and Quantity Left
num = bae[3].text
num = num.replace("Sold: ", "")
num = num.strip()
sold.append(num)
quant = bae[4].find('small').text
quant = quant.replace("In stock: ", "")
quant = quant.strip()
qLeft.append(quant)
# Finding Successful Transactions
freq = bae[1].text
freq = freq.replace(vendor_name, "")
freq = re.sub(r'Vendor Level \d+', "", freq)
freq = freq.replace("(", "")
freq = freq.replace(")", "")
freq = freq.strip()
success.append(freq)
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if not cve:
cveValue="-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue=cee
CVE.append(cveValue)
ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
if not ms:
MSValue="-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue=me
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
- return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
- BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
+ return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
+ reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
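Both new parsers repeat the same guarded-extraction idiom: try a selector chain, clean the text, and fall back to "-1" on any failure. A small helper capturing that pattern (safe_find is my naming, not part of the commit):

def safe_find(extract, default="-1"):
    # extract: zero-argument callable that digs the value out of the soup
    try:
        return cleanString(extract().strip())
    except Exception:
        return default

# e.g. vendor = safe_find(lambda: soup.find('div', {'class': 'box rounded mb-0'}).find('a').text)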
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)


+12 -12   MarketPlaces/TorBay/crawler_selenium.py

@@ -32,19 +32,19 @@ baseURL = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion
# Opens Tor Browser, crawls the website, then parses, then closes Tor
# acts like the main method for the crawler; another function at the end of this file calls it
def startCrawling():
- opentor()
+ # opentor()
mktName = getMKTName()
- driver = getAccess()
- if driver != 'down':
- try:
- login(driver)
- crawlForum(driver)
- except Exception as e:
- print(driver.current_url, e)
- closetor(driver)
- # new_parse(forumName, baseURL, False)
+ # driver = getAccess()
+ #
+ # if driver != 'down':
+ # try:
+ # login(driver)
+ # crawlForum(driver)
+ # except Exception as e:
+ # print(driver.current_url, e)
+ # closetor(driver)
+ #
+ new_parse(mktName, baseURL, False)
# Opens Tor Browser


+80 -162   MarketPlaces/TorBay/parser.py

@@ -35,88 +35,51 @@ def torbay_description_parser(soup):
shipTo = "-1" # 18 Product_ShippedTo
# Finding Product Name
name = soup.find('div', {'class': 'product-information'}).find('h1').text.strip()
# Finding Vendor
vendor = soup.find('div', {"class": "profile-info"}).find('a').text.strip()
# Finding Vendor Rating
rating_vendor.append(-1)
# Finding Successful Transactions
success.append(-1)
bae = soup.find('div', {'class': "box"}).find_all('ul')
try:
product_name = soup.find('div', {'class': 'product-information'}).find('h1').text
name = cleanString(product_name.strip())
except:
try:
product_name = soup.find('div', {'class': 'profile-info'}).find('h2').text
name = cleanString(product_name.strip())
except:
# print(e)
print("product name")
# Finding Vendor FIx
try:
vendor_name = soup.find('div', {"class": "profile-info"}).find('h2').text
vendor = cleanString(vendor_name.strip())
except:
print("description vendor name failed\n")
# Finding Prices
USD = soup.find('div', {'class': "total-price"}).find('span').text.strip()
try:
USD = soup.find('div', {'class': "total-price"}).find('span').text.strip()
except:
print("description price failed\n")
# Finding the Product Category
category = soup.find('div', {'class': "profile-info"}).find('p').find('a').text.strip()
# Finding the Product Quantity Available
left.append(-1)
# Finding Number Sold
sold.append(-1)
li = bae[3].find_all('li')
# Finding Shipment Information (Origin)
if "Ships from:" in li[-2].text:
shipFrom = li[-2].text
shipFrom = shipFrom.replace("Ships from: ", "")
# shipFrom = shipFrom.replace(",", "")
shipFrom = shipFrom.strip()
# Finding Shipment Information (Destination)
shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
shipTo = shipTo.replace("Ships to: ", "")
shipTo = shipTo.strip()
if "certain countries" in shipTo:
countries = ""
tags = li[-1].find_all('span', {'class': "tag"})
for tag in tags:
country = tag.text.strip()
countries += country + ", "
shipTo = countries.strip(", ")
try:
cat = soup.find('div', {'class': "profile-info"}).find('p').text
category = cleanString(cat.strip())
except:
print("description product category failed")
# Finding the Product description
describe = soup.find('div', {'class': "pre-line"}).text
describe = describe.replace("\n", " ")
describe = describe.strip()
'''# Finding the Number of Product Reviews
tag = soup.findAll(text=re.compile('Reviews'))
for index in tag:
reviews = index
par = reviews.find('(')
if par >=0:
reviews = reviews.replace("Reviews (","")
reviews = reviews.replace(")","")
reviews = reviews.split(",")
review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
else :
review = "-1"'''
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
try:
describe = soup.find('div', {'class': "info"}).find('p').text
if "\n" in describe:
describe = describe.replace("\n", " ")
describe = describe.replace("\r", " ")
describe = cleanString(describe.strip())
except:
# print("product desc")
try:
describe = soup.find('div', {'class': 'info'}).text
describe = cleanString(describe.strip())
except:
print("Product description")
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
@@ -162,93 +125,48 @@ def torbay_listing_parser(soup):
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# Finding the Product
product = bae[1].find('p').text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.replace("...", "")
product = product.strip()
name.append(product)
bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
if len(bae) >= 5:
# Finding Prices
price = bae[0].text
ud = price.replace(" USD", " ")
# u = ud.replace("$","")
u = ud.replace(",", "")
u = u.strip()
USD.append(u)
# bc = (prc[1]).strip(' BTC')
# BTC.append(bc)
# Finding the Vendor
vendor_name = bae[1].find('a').text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
# Finding the Category
cat = bae[2].find('small').text
cat = cat.replace("Category: ", "")
cat = cat.replace(",", "")
cat = cat.strip()
category.append(cat)
# Finding Number Sold and Quantity Left
num = bae[3].text
num = num.replace("Sold: ", "")
num = num.strip()
sold.append(num)
quant = bae[4].find('small').text
quant = quant.replace("In stock: ", "")
quant = quant.strip()
qLeft.append(quant)
# Finding Successful Transactions
freq = bae[1].text
freq = freq.replace(vendor_name, "")
freq = re.sub(r'Vendor Level \d+', "", freq)
freq = freq.replace("(", "")
freq = freq.replace(")", "")
freq = freq.strip()
success.append(freq)
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if not cve:
cveValue="-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue=cee
CVE.append(cveValue)
ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
if not ms:
MSValue="-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue=me
MS.append(MSValue)
try:
product_name = a.find('p', {'class': 'name'}).text
name.append(cleanString(product_name.strip()))
except:
print("product name")
try:
prod = a.find('p', {'class': 'price'}).text # price
USD.append(cleanString(prod.strip()))
except:
print("USD")
try:
ven = a.find('div', {'class': 'pc-footer'}).find('div').find('a').text # pc-footer
vendor.append(cleanString(ven.strip()))
# print(ven)
except:
print("vendor")
try:
h = a.find('p', {'class': 'name'}).find('a').get('href')
href.append(h)
except:
print("in href")
CVE.append("-1")
MS.append("-1")
rating_vendor.append("-1")
success.append("-1")
describe.append("-1")
views.append("-1")
reviews.append("-1")
rating_item.append("-1")
addDate.append("-1")
BTC.append("-1")
EURO.append("-1")
sold.append("-1")
qLeft.append("-1")
shipFrom.append("-1")
shipTo.append("-1")
category.append("Hacking")
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
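The TorBay listing selectors above use BeautifulSoup attribute-dict lookups; equivalent spellings with class_ and CSS selectors (my variation, not in the commit) can make the nested digs shorter:

# for one listing card `a`:
product_name = a.find('p', class_='name').text            # same as a.find('p', {'class': 'name'})
price = a.select_one('p.price').text                      # CSS selector alternative
vendor_link = a.select_one('div.pc-footer div a').text    # nested dig in one step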

