|
|
@ -58,7 +58,7 @@ def silkroad4_description_parser(soup): |
|
|
|
USD = price_list[0].replace("$", "").strip() |
|
|
|
BTC = price_list[1].replace("BTC","").strip() |
|
|
|
|
|
|
|
#Not extracted into database/PGAdmin yet! |
|
|
|
# Not stored into database/PGAdmin yet! |
|
|
|
LTC = price_list[2].replace("LTC", "").strip() |
|
|
|
XMR = price_list[3].replace("XMR", "").strip() |
|
|
|
#print(USD, "", BTC,"", LTC,"", XMR) |
|
|
@ -74,13 +74,13 @@ def silkroad4_description_parser(soup): |
|
|
|
describe = soup.find('div', {'style': 'color:#555;font-weight:normal;font-size:12px'}).text |
|
|
|
describe = cleanString(describe.strip()) |
|
|
|
|
|
|
|
#Finding Rating |
|
|
|
# Finding Rating |
|
|
|
rate = soup.find('div', {'style': 'padding:0px; margin-bottom:10px; font-size:12px;'}) |
|
|
|
rate = rate.find('p') |
|
|
|
if rate is not None: |
|
|
|
rate = rate.text.strip() |
|
|
|
|
|
|
|
#Some descriptions has 0 rating as 'No Rating yet', can convert to -1 for consistency in database |
|
|
|
#Some descriptions show the rating as 'No Rating yet'; this can be converted to -1 for consistency in the database |
|
|
|
#if(rate is 'No rating yet'): |
|
|
|
# rating_item = -1 |
|
|
|
|
|
|
@ -89,7 +89,7 @@ def silkroad4_description_parser(soup): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#Finding Number of Reviews |
|
|
|
# Finding Number of Reviews |
|
|
|
table = soup.find('div', {'class': 'table-responsive'}) |
|
|
|
if table is not None: |
|
|
|
num_rev = table.findAll('tr') |
|
|
@ -177,14 +177,14 @@ def silkroad4_listing_parser(soup): |
|
|
|
link = cleanLink(link) |
|
|
|
href.append(link) |
|
|
|
|
|
|
|
#Finding Price |
|
|
|
# Finding Price |
|
|
|
temp = a.find('b', {'style': 'color:#333'}) |
|
|
|
temp = temp.text |
|
|
|
price = temp.split("/") |
|
|
|
USD.append(price[0].replace('$', '').strip()) |
|
|
|
BTC.append(price[1].replace('BTC', '').strip()) |
|
|
|
|
|
|
|
#LTC and XMR will not be stored in Pgadmin as of now |
|
|
|
# LTC and XMR will not be stored in Pgadmin as of now |
|
|
|
LTC.append(price[2].replace('LTC', '').strip()) |
|
|
|
XMR.append(price[3].replace('XMR', '').strip()) |
|
|
|
#print(USD, " ", BTC, ' ', LTC, '', XMR) |
|
|
@ -204,11 +204,11 @@ def silkroad4_listing_parser(soup): |
|
|
|
name.append(product) |
|
|
|
|
|
|
|
|
|
|
|
#Finding ShipFrom |
|
|
|
# Finding ShipFrom |
|
|
|
shipf = bae[len(bae)-1].text.strip() |
|
|
|
shipFrom.append(shipf) |
|
|
|
|
|
|
|
#Finding image |
|
|
|
# Finding image |
|
|
|
product_image = a.find('img') |
|
|
|
product_image = product_image.get('src') |
|
|
|
product_image = product_image.split('base64,')[-1] |
|
|
@ -252,24 +252,24 @@ def silkroad4_links_parser(soup): |
|
|
|
# Returning all links that should be visited by the Crawler |
|
|
|
href = [] |
|
|
|
|
|
|
|
#finds all div with id, vp |
|
|
|
# finds all div with id, vp |
|
|
|
divs = soup.findAll('div', {"id": "vp"}) |
|
|
|
|
|
|
|
listing = [] |
|
|
|
|
|
|
|
#for all div with id:vp, find first a with href |
|
|
|
# for all div with id:vp, find first a with href |
|
|
|
for div in divs: |
|
|
|
a_s = div.find('a', href=True) |
|
|
|
|
|
|
|
#if div has no href True, then move on. otherwise, store the a with href into listing |
|
|
|
# if div has no href True, then move on. otherwise, store the a with href into listing |
|
|
|
if a_s is not None: |
|
|
|
listing.append(a_s) |
|
|
|
|
|
|
|
#loop through listing with a with href and extract/return the href |
|
|
|
# loop through listing with a with href and extract/return the href |
|
|
|
for a in listing: |
|
|
|
link = a['href'] |
|
|
|
|
|
|
|
#if '?listing' is in link, then it is a product, so append to the href list |
|
|
|
# if '?listing' is in link, then it is a product, so append to the href list. Otherwise, move on. |
|
|
|
if "?listing" in link: |
|
|
|
href.append(link) |
|
|
|
|