|
@ -8,7 +8,7 @@ from bs4 import BeautifulSoup |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) |
|
|
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) |
|
|
def tor2door_description_parser(soup): |
|
|
|
|
|
|
|
|
def vicecity_description_parser(soup): |
|
|
# Fields to be parsed |
|
|
# Fields to be parsed |
|
|
|
|
|
|
|
|
vendor = "-1" # 0 *Vendor_Name |
|
|
vendor = "-1" # 0 *Vendor_Name |
|
@ -31,62 +31,68 @@ def tor2door_description_parser(soup): |
|
|
shipFrom = "-1" # 17 Product_ShippedFrom |
|
|
shipFrom = "-1" # 17 Product_ShippedFrom |
|
|
shipTo = "-1" # 18 Product_ShippedTo |
|
|
shipTo = "-1" # 18 Product_ShippedTo |
|
|
|
|
|
|
|
|
bae = soup.find('div', {'class': "col-9"}) |
|
|
|
|
|
|
|
|
|
|
|
# Finding Product Name |
|
|
# Finding Product Name |
|
|
name = bae.find('h2').text |
|
|
|
|
|
|
|
|
name = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"}) |
|
|
|
|
|
name = name.find('span', {'style': "font-size:18px;font-weight: bold;color: #fff"}).text |
|
|
name = name.replace('\n', ' ') |
|
|
name = name.replace('\n', ' ') |
|
|
name = name.replace(",", "") |
|
|
name = name.replace(",", "") |
|
|
name = name.strip() |
|
|
name = name.strip() |
|
|
|
|
|
|
|
|
mb = bae.findAll('div', {"class": "mb-1"}) |
|
|
|
|
|
|
|
|
|
|
|
# Finding Vendor |
|
|
# Finding Vendor |
|
|
vendor = mb[0].text |
|
|
|
|
|
vendor = vendor.replace(",", "") |
|
|
|
|
|
vendor = vendor.replace("Sold by:", "") |
|
|
|
|
|
vendor = vendor.strip() |
|
|
|
|
|
|
|
|
vendor = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"}).find('a').text.strip() |
|
|
|
|
|
|
|
|
# # Finding Vendor Rating |
|
|
|
|
|
# full_stars = bae[2].find_all('i', {'class': "fas fa-star"}) |
|
|
|
|
|
# half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"}) |
|
|
|
|
|
# rating = len(full_stars) + (0.5 if half_star is not None else 0) |
|
|
|
|
|
|
|
|
# Finding Vendor Rating |
|
|
|
|
|
rating = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"}).find('a').get('title') |
|
|
|
|
|
rating = str(re.match(r"\d+%", rating)).strip() |
|
|
|
|
|
|
|
|
# Finding Quantity Sold and Left |
|
|
# Finding Quantity Sold and Left |
|
|
temp = mb[4].text.split(',') |
|
|
|
|
|
|
|
|
# temp = mb[4].text.split(',') |
|
|
|
|
|
# |
|
|
|
|
|
# sold = temp[0].replace("sold", "") |
|
|
|
|
|
# sold = sold.strip() |
|
|
|
|
|
# |
|
|
|
|
|
# left = temp[1].replace("in stock", "") |
|
|
|
|
|
# left = left.strip() |
|
|
|
|
|
|
|
|
sold = temp[0].replace("sold", "") |
|
|
|
|
|
sold = sold.strip() |
|
|
|
|
|
|
|
|
# Finding Successful Transactions |
|
|
|
|
|
success = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"}).find('a').get('title') |
|
|
|
|
|
success = str(re.compile(r"\d+(?= sales)", success)).strip() |
|
|
|
|
|
|
|
|
left = temp[1].replace("in stock", "") |
|
|
|
|
|
left = left.strip() |
|
|
|
|
|
|
|
|
bae = soup.find('pre') |
|
|
|
|
|
|
|
|
# Finding USD |
|
|
# Finding USD |
|
|
USD = bae.find('div', {"class": "h3 text-secondary"}).text |
|
|
|
|
|
USD = USD.replace("$", "") |
|
|
|
|
|
USD = USD.strip() |
|
|
|
|
|
|
|
|
USD = bae.find('span').text |
|
|
|
|
|
USD = str(re.compile(r"\$\d+(?:\.\d+)?", USD)) |
|
|
|
|
|
USD = USD.replace("$", "").strip() |
|
|
|
|
|
|
|
|
# Finding BTC |
|
|
# Finding BTC |
|
|
temp = bae.find('div', {"class": "small"}).text.split("BTC") |
|
|
|
|
|
|
|
|
BTC = bae.findall('span') |
|
|
|
|
|
BTC = str(re.compile(r"\d+(?:\.\d+)?", BTC[1].text)).strip() |
|
|
|
|
|
|
|
|
BTC = temp[0].strip() |
|
|
|
|
|
|
|
|
# Finding the Product Category |
|
|
|
|
|
category = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"}) |
|
|
|
|
|
category = category.find('span', {'style': "font-size:15px;color: #a1a1a1"}).text |
|
|
|
|
|
category = category.replace("Category:", "").strip() |
|
|
|
|
|
|
|
|
# shipping_info = bae[4].text |
|
|
|
|
|
# if "Digital" not in shipping_info: |
|
|
|
|
|
# shipping_info = shipping_info.split(" ") |
|
|
|
|
|
# |
|
|
|
|
|
# # Finding Shipment Information (Origin) |
|
|
|
|
|
# shipFrom = shipping_info[0].strip() |
|
|
|
|
|
# |
|
|
|
|
|
# # Finding Shipment Information (Destination) |
|
|
|
|
|
# shipTo = shipping_info[1].strip() |
|
|
|
|
|
|
|
|
li = bae.find('span', {'style': "float:right"}).find_all('span') |
|
|
|
|
|
|
|
|
|
|
|
# Finding Shipment Information (Origin) |
|
|
|
|
|
shipFrom = li[1].text.strip() |
|
|
|
|
|
|
|
|
|
|
|
# Finding Shipment Information (Destination) |
|
|
|
|
|
shipTo = li[-2].text.strip() |
|
|
|
|
|
|
|
|
# Finding the Product description |
|
|
# Finding the Product description |
|
|
describe = bae.find('div', {"class": "card border-top-0"}).text |
|
|
|
|
|
|
|
|
describe = soup.find('p', { |
|
|
|
|
|
'style': "width:705px;margin-left:-305px;background-color: #242424;border-radius: 3px;border: 1px solid #373737;padding: 5px;"}).text |
|
|
describe = describe.replace("\n", " ") |
|
|
describe = describe.replace("\n", " ") |
|
|
describe = describe.replace("\r", " ") |
|
|
|
|
|
describe = describe.strip() |
|
|
describe = describe.strip() |
|
|
|
|
|
|
|
|
|
|
|
# Finding the Number of Product Reviews |
|
|
|
|
|
li = soup.find_all('label', {'class': "tc_label threetabs"}) |
|
|
|
|
|
review = li[1].text |
|
|
|
|
|
review = str(re.compile(r"\d+", review)).strip() |
|
|
|
|
|
|
|
|
# Searching for CVE and MS categories |
|
|
# Searching for CVE and MS categories |
|
|
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) |
|
|
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) |
|
|
if cve: |
|
|
if cve: |
|
@ -114,10 +120,10 @@ def tor2door_description_parser(soup): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# This is the method to parse the Listing Pages |
|
|
# This is the method to parse the Listing Pages |
|
|
def tor2door_listing_parser(soup): |
|
|
|
|
|
|
|
|
def vicecity_listing_parser(soup): |
|
|
# Fields to be parsed |
|
|
# Fields to be parsed |
|
|
nm = 0 # *Total_Products (Should be Integer) |
|
|
nm = 0 # *Total_Products (Should be Integer) |
|
|
mktName = "Tor2door" # 0 *Marketplace_Name |
|
|
|
|
|
|
|
|
mktName = "ViceCity" # 0 *Marketplace_Name |
|
|
vendor = [] # 1 *Vendor y |
|
|
vendor = [] # 1 *Vendor y |
|
|
rating_vendor = [] # 2 Vendor_Rating |
|
|
rating_vendor = [] # 2 Vendor_Rating |
|
|
success = [] # 3 Vendor_Successful_Transactions |
|
|
success = [] # 3 Vendor_Successful_Transactions |
|
@ -139,56 +145,69 @@ def tor2door_listing_parser(soup): |
|
|
shipTo = [] # 19 Product_ShippedTo |
|
|
shipTo = [] # 19 Product_ShippedTo |
|
|
href = [] # 20 Product_Links |
|
|
href = [] # 20 Product_Links |
|
|
|
|
|
|
|
|
listing = soup.findAll('div', {"class": "card product-card mb-3"}) |
|
|
|
|
|
|
|
|
listing = soup.find('div', {"class": "frontpage"}).findAll('div', {"class": "wLf"}) |
|
|
|
|
|
|
|
|
# Populating the Number of Products |
|
|
# Populating the Number of Products |
|
|
nm = len(listing) |
|
|
nm = len(listing) |
|
|
|
|
|
|
|
|
# Finding Category |
|
|
|
|
|
cat = soup.find("div", {"class": "col-9"}) |
|
|
|
|
|
cat = cat.find("h2").text |
|
|
|
|
|
cat = cat.replace("Category: ", "") |
|
|
|
|
|
cat = cat.replace(",", "") |
|
|
|
|
|
cat = cat.strip() |
|
|
|
|
|
|
|
|
# # Finding Category |
|
|
|
|
|
# cat = soup.find("div", {"class": "col-9"}) |
|
|
|
|
|
# cat = cat.find("h2").text |
|
|
|
|
|
# cat = cat.replace("Category: ", "") |
|
|
|
|
|
# cat = cat.replace(",", "") |
|
|
|
|
|
# cat = cat.strip() |
|
|
|
|
|
|
|
|
for card in listing: |
|
|
|
|
|
category.append(cat) |
|
|
|
|
|
|
|
|
for a in listing: |
|
|
|
|
|
# category.append(cat) |
|
|
|
|
|
|
|
|
bae = card.findAll('a') |
|
|
|
|
|
|
|
|
# bae = card.findAll('a') |
|
|
|
|
|
|
|
|
# Adding the url to the list of urls |
|
|
# Adding the url to the list of urls |
|
|
link = bae[0].get('href') |
|
|
|
|
|
|
|
|
link = a.find('div', {"class": "wLfLeft"}).find('a', href=True).get('href') |
|
|
|
|
|
link = cleanLink(link) |
|
|
href.append(link) |
|
|
href.append(link) |
|
|
|
|
|
|
|
|
# Finding Product Name |
|
|
|
|
|
product = bae[1].text |
|
|
|
|
|
|
|
|
# Finding the Product Name |
|
|
|
|
|
product = a.find('div', {"class": "wLfName"}).find('a').text |
|
|
product = product.replace('\n', ' ') |
|
|
product = product.replace('\n', ' ') |
|
|
product = product.replace(",", "") |
|
|
product = product.replace(",", "") |
|
|
|
|
|
product = product.replace("...", "") |
|
|
product = product.strip() |
|
|
product = product.strip() |
|
|
name.append(product) |
|
|
name.append(product) |
|
|
|
|
|
|
|
|
# Finding Vendor |
|
|
|
|
|
vendor_name = bae[2].text |
|
|
|
|
|
|
|
|
# Finding the Vendor |
|
|
|
|
|
vendor_name = a.find('div', {"class": "wLfVendor"}).find('a').text |
|
|
vendor_name = vendor_name.replace(",", "") |
|
|
vendor_name = vendor_name.replace(",", "") |
|
|
vendor_name = vendor_name.strip() |
|
|
vendor_name = vendor_name.strip() |
|
|
vendor.append(vendor_name) |
|
|
vendor.append(vendor_name) |
|
|
|
|
|
|
|
|
# Finding USD |
|
|
|
|
|
usd = card.find('div', {"class": "mb-1"}).text |
|
|
|
|
|
usd = usd.replace("$", "") |
|
|
|
|
|
usd = usd.strip() |
|
|
|
|
|
USD.append(usd) |
|
|
|
|
|
|
|
|
|
|
|
# Finding Reviews |
|
|
|
|
|
num = card.find("span", {"class": "rate-count"}).text |
|
|
|
|
|
num = num.replace("(", "") |
|
|
|
|
|
num = num.replace("review)", "") |
|
|
|
|
|
num = num.replace("reviews)", "") |
|
|
|
|
|
num = num.strip() |
|
|
|
|
|
reviews.append(num) |
|
|
|
|
|
|
|
|
# Finding Prices |
|
|
|
|
|
price = a.find('div', {"class": "wLfPrice"}).find_all('span') |
|
|
|
|
|
ud = price[0].text.replace(" USD", " ") |
|
|
|
|
|
# u = ud.replace("$","") |
|
|
|
|
|
u = ud.replace(",", "") |
|
|
|
|
|
u = u.strip() |
|
|
|
|
|
USD.append(u) |
|
|
|
|
|
bc = price[1].text |
|
|
|
|
|
bc = str(re.compile(r"\d+(?:\.\d+)?", bc)) |
|
|
|
|
|
BTC.append(bc) |
|
|
|
|
|
|
|
|
|
|
|
# # Finding Reviews |
|
|
|
|
|
# num = card.find("span", {"class": "rate-count"}).text |
|
|
|
|
|
# num = num.replace("(", "") |
|
|
|
|
|
# num = num.replace("review)", "") |
|
|
|
|
|
# num = num.replace("reviews)", "") |
|
|
|
|
|
# num = num.strip() |
|
|
|
|
|
# reviews.append(num) |
|
|
|
|
|
|
|
|
|
|
|
# Finding Successful Transactions |
|
|
|
|
|
freq = a.find('div', {"class": "wLfVendor"}).find('a').get('title') |
|
|
|
|
|
freq = re.compile(r'\d+(?= sales)', freq) |
|
|
|
|
|
freq = freq.strip() |
|
|
|
|
|
success.append(freq) |
|
|
|
|
|
|
|
|
# Searching for CVE and MS categories |
|
|
# Searching for CVE and MS categories |
|
|
cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}')) |
|
|
|
|
|
|
|
|
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) |
|
|
if not cve: |
|
|
if not cve: |
|
|
cveValue = "-1" |
|
|
cveValue = "-1" |
|
|
else: |
|
|
else: |
|
@ -201,7 +220,7 @@ def tor2door_listing_parser(soup): |
|
|
cveValue = cee |
|
|
cveValue = cee |
|
|
CVE.append(cveValue) |
|
|
CVE.append(cveValue) |
|
|
|
|
|
|
|
|
ms = card.findAll(text=re.compile('MS\d{2}-\d{3}')) |
|
|
|
|
|
|
|
|
ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) |
|
|
if not ms: |
|
|
if not ms: |
|
|
MSValue = "-1" |
|
|
MSValue = "-1" |
|
|
else: |
|
|
else: |
|
@ -219,14 +238,15 @@ def tor2door_listing_parser(soup): |
|
|
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) |
|
|
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def vicecity_links_parser(soup):
    """Collect the product links found on a listing page.

    Every product card is a ``div`` with class ``wLf`` located inside the
    ``div`` with class ``frontpage``. The link of a card is the ``href`` of
    the first anchor carrying an ``href`` attribute inside the card's
    ``div`` with class ``wLfLeft``.

    Args:
        soup: Parsed listing page (BeautifulSoup document).

    Returns:
        list: The ``href`` values exactly as they appear in the markup, one
        per product card, in document order. Cards that have no ``wLfLeft``
        block or no linked anchor are skipped, and an empty list is returned
        when the page has no ``frontpage`` container.
    """
    # Returning all links that should be visited by the Crawler
    href = []

    frontpage = soup.find('div', {"class": "frontpage"})
    if frontpage is None:
        # No product container on this page, so there is nothing to visit.
        return href

    for card in frontpage.find_all('div', {"class": "wLf"}):
        left_block = card.find('div', {"class": "wLfLeft"})
        if left_block is None:
            continue

        # href=True restricts the match to anchors that actually carry a link.
        anchor = left_block.find('a', href=True)
        if anchor is None:
            continue

        href.append(anchor['href'])

    return href