Added SilkRoad4 Parser and Crawler

main
Isabelle Wang 6 months ago
commit dc4ca1d879
1 changed file with 13 additions and 13 deletions:
  1. MarketPlaces/SilkRoad4/parser.py  +13 -13

@@ -58,7 +58,7 @@ def silkroad4_description_parser(soup):
     USD = price_list[0].replace("$", "").strip()
     BTC = price_list[1].replace("BTC","").strip()
-    #Not extracted into databse/PGAdmin yet!
+    # Not stored into databse/PGAdmin yet!
     LTC = price_list[2].replace("LTC", "").strip()
     XMR = price_list[3].replace("XMR", "").strip()
     #print(USD, "", BTC,"", LTC,"", XMR)
@@ -74,13 +74,13 @@ def silkroad4_description_parser(soup):
     describe = soup.find('div', {'style': 'color:#555;font-weight:normal;font-size:12px'}).text
     describe = cleanString(describe.strip())
-    #Finding Rating
+    # Finding Rating
     rate = soup.find('div', {'style': 'padding:0px; margin-bottom:10px; font-size:12px;'})
     rate = rate.find('p')
     if rate is not None:
         rate = rate.text.strip()
-    #Some descriptions has 0 rating as 'No Rating yet', can convert to -1 for consistency in database
+    # Some descriptions have rating as 'No Rating yet', can convert to -1 for consistency in database
     #if(rate is 'No rating yet'):
     #    rating_item = -1
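
The commented-out lines hint at mapping 'No rating yet' to -1 for the database. A minimal standalone sketch of that normalization (the function name and behavior are my assumption, not code from this commit):

def normalize_rating(rate):
    # rate is assumed to be the stripped text of the rating <p>, e.g. "4.8" or "No rating yet"
    if rate is None or rate.strip().lower() == 'no rating yet':
        return -1   # placeholder for "no rating", kept consistent in the database
    return rate
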
@@ -89,7 +89,7 @@ def silkroad4_description_parser(soup):
-    #Finding Number of Reviews
+    # Finding Number of Reviews
     table = soup.find('div', {'class': 'table-responsive'})
     if table is not None:
         num_rev = table.findAll('tr')
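
For context, the rows found here could be reduced to a review count roughly as below; the assumption that the first <tr> is a header row is mine and is not confirmed by this hunk:

from bs4 import BeautifulSoup

# Made-up review table markup
html = "<div class='table-responsive'><table><tr><th>rating</th></tr><tr><td>5/5</td></tr><tr><td>4/5</td></tr></table></div>"
soup = BeautifulSoup(html, 'html.parser')

table = soup.find('div', {'class': 'table-responsive'})
reviews = 0
if table is not None:
    num_rev = table.findAll('tr')
    reviews = max(len(num_rev) - 1, 0)   # subtract the assumed header row
print(reviews)   # 2
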
@@ -177,14 +177,14 @@ def silkroad4_listing_parser(soup):
         link = cleanLink(link)
         href.append(link)
-        #Finding Price
+        # Finding Price
         temp = a.find('b', {'style': 'color:#333'})
         temp = temp.text
         price = temp.split("/")
         USD.append(price[0].replace('$', '').strip())
         BTC.append(price[1].replace('BTC', '').strip())
-        #LTC and XMR will not be stored in Pgadmin as of now
+        # LTC and XMR will not be stored in Pgadmin as of now
         LTC.append(price[2].replace('LTC', '').strip())
         XMR.append(price[3].replace('XMR', '').strip())
         #print(USD, " ", BTC, ' ', LTC, '', XMR)
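
A quick illustration of the split("/") above on a made-up listing price string (the real listing text may be formatted differently):

temp = "$49.99/0.00124BTC/0.61LTC/0.31XMR"   # hypothetical value of temp
price = temp.split("/")                      # ['$49.99', '0.00124BTC', '0.61LTC', '0.31XMR']
usd = price[0].replace('$', '').strip()      # '49.99'
btc = price[1].replace('BTC', '').strip()    # '0.00124'
ltc = price[2].replace('LTC', '').strip()    # '0.61' (LTC/XMR not stored in PGAdmin for now)
xmr = price[3].replace('XMR', '').strip()    # '0.31'
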
@@ -204,11 +204,11 @@ def silkroad4_listing_parser(soup):
         name.append(product)
-        #Finding ShipFrom
+        # Finding ShipFrom
         shipf = bae[len(bae)-1].text.strip()
         shipFrom.append(shipf)
-        #Finding image
+        # Finding image
         product_image = a.find('img')
         product_image = product_image.get('src')
         product_image = product_image.split('base64,')[-1]
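
The lines above suggest the listing thumbnail src is an inline data URI, and only the base64 payload after 'base64,' is kept. A hedged sketch of decoding that payload back to bytes (the src value here is fabricated):

import base64

src = "data:image/png;base64," + base64.b64encode(b"fake image bytes").decode()  # made-up src
payload = src.split('base64,')[-1]
image_bytes = base64.b64decode(payload)   # b'fake image bytes'
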
@@ -252,24 +252,24 @@ def silkroad4_links_parser(soup):
     # Returning all links that should be visited by the Crawler
     href = []
-    #finds all div with id, vp
+    # finds all div with id, vp
     divs = soup.findAll('div', {"id": "vp"})
     listing = []
-    #for all div with id:vp, find first a with href
+    # for all div with id:vp, find first a with href
     for div in divs:
         a_s = div.find('a', href=True)
-        #if div has no href True, then move on. otherwise, store the a with href into listing
+        # if div has no href True, then move on. otherwise, store the a with href into listing
         if a_s is not None:
             listing.append(a_s)
-    #loop through listing with a with href and extract/return the href
+    # loop through listing with a with href and extract/return the href
     for a in listing:
         link = a['href']
-        #if '?listing' is in link, then it is a product, so append to the href list
+        # if '?listing' is in link, then it is a product, so append to the href list. Otherwise, move on.
         if "?listing" in link:
             href.append(link)
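
A self-contained run of this hunk's link-extraction logic against made-up markup (the real SilkRoad4 pages are not reproduced here):

from bs4 import BeautifulSoup

# Made-up listing page fragment with three "vp" divs
html = """
<div id="vp"><a href="/index.php?listing=123">product</a></div>
<div id="vp"><span>no anchor here</span></div>
<div id="vp"><a href="/index.php?page=2">next page</a></div>
"""
soup = BeautifulSoup(html, 'html.parser')

href = []
listing = []
for div in soup.findAll('div', {"id": "vp"}):
    a_s = div.find('a', href=True)
    if a_s is not None:
        listing.append(a_s)
for a in listing:
    link = a['href']
    if "?listing" in link:
        href.append(link)

print(href)   # ['/index.php?listing=123']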
