
Merge remote-tracking branch 'origin/main'

main
Helium committed 11 months ago
commit 7b552c8d70
5 changed files with 123 additions and 169 deletions
  1. MarketPlaces/GoFish/crawler_selenium.py            +14  -11
  2. MarketPlaces/MikesGrandStore/crawler_selenium.py   +12  -17
  3. MarketPlaces/MikesGrandStore/parser.py             +14  -43
  4. MarketPlaces/WeTheNorth/crawler_selenium.py        +17  -45
  5. MarketPlaces/WeTheNorth/parser.py                  +66  -53

MarketPlaces/GoFish/crawler_selenium.py  (+14 -11)

@@ -118,7 +118,9 @@ def getAccess():
 def login(driver):
+    input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '//*[@id="username"]')))
     # entering username and password into input boxes
     usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')

@@ -131,6 +133,7 @@ def login(driver):
     submit = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/form/div[7]/input')
     submit.click()
+    input("Press ENTER when CAPTCHA is complete and home page has loaded\n")
     # wait for listing page show up (This Xpath may need to change based on different seed url)
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(

@@ -175,17 +178,17 @@ def getInterestedLinks():
     links = []

     # Hosting and Security
-    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=84')
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=84')
     # Exploits and Kits
     links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=107')
     # Botnets and Malware
-    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
     # Other Software
-    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=108')
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=108')
     # Hacking Guide
-    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
     # Fraud (mostly carding)
-    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=128')
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=128')

     return links

@@ -222,12 +225,12 @@ def crawlForum(driver):
                        savePage(driver, driver.page_source, item)
                        driver.back()

-                    # comment out
-                    # break
-                    # comment out
+                    # # comment out
+                    # break
+                    #
+                    # # comment out
                     # if count == 1:
-                    # break
+                    #     break

                try:
                    link = driver.find_element(by=By.XPATH, value='/html/body/div/div[3]/div[2]/div[2]/nav/ul/li[3]/a').get_attribute('href')
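Aside: the GoFish login change above is the usual manual-CAPTCHA pattern for Tor crawlers: block on operator input, then still wait for a concrete element before typing, since onion page loads can lag well past the operator's ENTER. A minimal sketch of that pattern (the XPaths and credentials here are placeholders, not GoFish's real ones):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    def login(driver):
        # Block until the human operator has solved the CAPTCHA in the browser.
        input("Press ENTER when CAPTCHA is complete and login page has loaded\n")

        # Then wait (up to 100 s) until the login form is actually visible.
        WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
            (By.XPATH, '//*[@id="username"]')))

        driver.find_element(by=By.XPATH, value='//*[@id="username"]').send_keys('user')
        driver.find_element(by=By.XPATH, value='//*[@id="password"]').send_keys('pass')
        driver.find_element(by=By.XPATH, value='//form//input[@type="submit"]').click()

        # Second manual pause: the post-login page may show another CAPTCHA.
        input("Press ENTER when CAPTCHA is complete and home page has loaded\n")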


MarketPlaces/MikesGrandStore/crawler_selenium.py  (+12 -17)

@@ -26,7 +26,7 @@ from MarketPlaces.MikesGrandStore.parser import MikesGrandStore_links_parser
 from MarketPlaces.Utilities.utilities import cleanHTML

 counter = 1
-baseURL = 'http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion'
+baseURL = 'http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/'

 def startCrawling():

@@ -83,8 +83,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    # ff_prof.set_preference("network.dns.disablePrefetch", True)
-    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    ff_prof.set_preference("network.dns.disablePrefetch", True)
+    ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)

@@ -159,11 +159,6 @@ def getFullPathName(url):
     return fullPath

-def getMKTName() -> str:
-    name = 'MikesGrandStore'
-    return name
-
 def getNameFromURL(url):
     global counter
     name = ''.join(e for e in url if e.isalnum())

@@ -178,6 +173,10 @@ def getInterestedLinks():
     # Hacking
     links.append('http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/product-category/hacking/')
+    # Carding
+    links.append('http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/product-category/carding/')
+    # Databases
+    links.append('http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/product-category/databases/')

     return links

@@ -215,11 +214,11 @@ def crawlForum(driver):
                        savePage(driver, driver.page_source, item)
                        driver.back()

-                    # comment out
-                    #break
-
-                    # comment out
-                    #if count == 1:
+                    # # comment out
+                    # break
+                    #
+                    # # comment out
+                    # if count == 1:
                     #     break

                # go to next page

@@ -260,7 +259,3 @@ def crawler():
 def crawler():
     startCrawling()
-
-if __name__ == '__main__':
-    startCrawling()
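Aside: the two preferences un-commented in createFFDriver() matter for Tor crawling: disabling DNS prefetch keeps Firefox from resolving hostnames outside the SOCKS tunnel, and suppressing the Referer header avoids leaking the previously visited onion URL. A minimal sketch of the same hardening with Selenium 4's Options API (the local SOCKS proxy address is an assumption, not taken from this repo):

    from selenium.webdriver import Firefox
    from selenium.webdriver.firefox.options import Options

    def make_driver():
        opts = Options()
        opts.set_preference("network.dns.disablePrefetch", True)   # no DNS prefetch leaks
        opts.set_preference("network.http.sendRefererHeader", 0)   # never send Referer
        opts.set_preference("permissions.default.image", 3)        # block images (faster over Tor)
        opts.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
        opts.set_preference("signon.rememberSignons", False)
        # Route through a local Tor SOCKS proxy (assumed at 127.0.0.1:9150).
        opts.set_preference("network.proxy.type", 1)
        opts.set_preference("network.proxy.socks", "127.0.0.1")
        opts.set_preference("network.proxy.socks_port", 9150)
        opts.set_preference("network.proxy.socks_remote_dns", True)
        return Firefox(options=opts)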

MarketPlaces/MikesGrandStore/parser.py  (+14 -43)

@@ -38,55 +38,40 @@ def MikesGrandStore_description_parser(soup):
     # Finding Product Name
     name = soup.find('h1', {'class': 'product-title product_title entry-title'}).text
-    name = name.replace('\n', ' ')
-    name = name.replace(",", "")
+    name = cleanString(name)
     name = name.strip()

-    divmb = soup.findAll('div', {'class': "mb-1"})
-
     # Finding Vendor
     # no vendor
     vendor = "MikesGrandStore"

     # Finding the Product Rating
     rating_item = soup.find('strong', {'class', 'rating'}).text
-    rating_item = rating_item.replace('\n', ' ')
-    rating_item = rating_item.replace(",", "")
+    rating_item = cleanNumbers(rating_item)
     rating_item = rating_item.strip()

     # Finding Number of Product Reviews
     review_container = soup.find('li', {'id': 'tab-title-reviews'})
     reviews = review_container.find('a').text
-    reviews = reviews.replace('Reviews', '')
-    reviews = reviews.replace('(', '')
-    reviews = reviews.replace(')', '')
-    reviews = reviews.replace('\n', ' ')
-    reviews = reviews.replace(",", "")
+    reviews = cleanNumbers(reviews)
     reviews = reviews.strip()

     # Finding Prices
-    USD = soup.find('span', {'class': 'woocommerce-Price-currencySymbol'}).next_sibling
-    USD = USD.replace('\n', ' ')
-    USD = USD.replace(",", "")
+    USD = soup.find('span', {'class': 'woocommerce-Price-currencySymbol'}).next_sibling.text
+    USD = cleanNumbers(USD)
     USD = USD.strip()

-    # Finding the Product Category
-    cat_container = soup.find('span', {'class': 'posted_in'})
-    cat = cat_container.findAll('a')
-    category = ""
-    for name in cat:
-        category = category + " " + name.text
-
     # Finding the Product Quantity Available
     stock = soup.find('p', {'class': 'stock in-stock'})
     if stock is not None:
         left = stock.text
-        left = left.replace("in stock", "")
+        left = cleanNumbers(left)
         left = left.strip()

     # Finding the Product description
-    desc_cont = soup.find('div', {'class': 'product-short-description'})
-    describe = desc_cont.find('p').text.strip()
+    describe = soup.find('div', {'id': 'tab-description'}).text
+    describe = cleanString(describe)
+    describe = describe.strip()

     # Finding Product Image
     image = soup.find('img', {'class': 'wp-post-image skip-lazy'})

@@ -158,29 +143,19 @@ def MikesGrandStore_listing_parser(soup):
     nm = len(listing)

     for a in listing:
-        bae = a.findAll('a', href=True)
-        lb = a.findAll('div', {"id": "littlebox"})
-
         # Adding the url to the list of urls
         link = a.find('a', {'class': 'woocommerce-LoopProduct-link woocommerce-loop-product__link'}).get('href')
         href.append(link)

         # Finding the Product
         product = a.find('a', {'class': 'woocommerce-LoopProduct-link woocommerce-loop-product__link'}).text
-        product = product.replace('\n', ' ')
-        product = product.replace(",", "")
-        product = product.replace("...", "")
+        product = cleanString(product)
         product = product.strip()
         name.append(product)

-        # Finding Product Image
-        product_image = a.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'})
-        product_image = product_image.get('src')
-        product_image = product_image.split('base64,')[-1]
-        image.append(product_image)
-
         # Finding Prices
-        price = a.find('span', {'class': 'woocommerce-Price-currencySymbol'}).next_sibling
+        price = a.find('span', {'class': 'woocommerce-Price-currencySymbol'}).next_sibling.text
+        price = cleanNumbers(price)
         price = price.strip()
         USD.append(price)

@@ -193,15 +168,16 @@ def MikesGrandStore_listing_parser(soup):
         # Finding the Category
         cat = a.find('p', {'class': 'category uppercase is-smaller no-text-overflow product-cat op-7'}).text
         cat = cat.replace("class:", "")
+        cat = cleanString(cat)
         cat = cat.strip()
         category.append(cat)

         # Finding product rating
         rating = a.find('strong', {'class': 'rating'}).text
+        rating = cleanNumbers(rating)
         rating = rating.strip()
         rating_item.append(rating)

         # Searching for CVE and MS categories
         cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
         if not cve:

@@ -244,11 +220,6 @@ def MikesGrandStore_links_parser(soup):
     container = soup.find('div', {"class": "products row row-small large-columns-3 medium-columns-3 small-columns-2 equalize-box"})
     listing = container.findAll('div', recursive=False)

-    # for a in listing:
-    #     bae = a.find('a', {"class": "text-info"}, href=True)
-    #     link = bae['href']
-    #     href.append(link)
-
     for a in listing:
         bae = a.findAll('a', href=True)
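Aside: both price fixes above hinge on the same WooCommerce markup detail: the dollar amount is the text node immediately following the currency-symbol span, so the old code that called string methods on the bare next_sibling was fragile. A small self-contained sketch against hypothetical HTML, with the project's cleanNumbers() utility approximated by a regex since its real definition lives in Utilities:

    import re
    from bs4 import BeautifulSoup

    # Hypothetical WooCommerce-style price markup.
    html = '<span class="price"><span class="woocommerce-Price-currencySymbol">$</span>1,337.00</span>'
    soup = BeautifulSoup(html, 'html.parser')

    # The amount is the text node right after the currency-symbol span; the
    # parser calls .text on it, str() is equivalent and works on any bs4 version.
    amount = str(soup.find('span', {'class': 'woocommerce-Price-currencySymbol'}).next_sibling)

    # Rough stand-in for cleanNumbers(): keep digits and dots only.
    usd = ''.join(re.findall(r'[\d.]+', amount)).strip()
    print(usd)  # -> 1337.00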


MarketPlaces/WeTheNorth/crawler_selenium.py  (+17 -45)

@@ -27,12 +27,12 @@ from MarketPlaces.Utilities.utilities import cleanHTML
 import selenium

 counter = 1
-baseURL = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion'
+baseURL = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/'

 # Opens Tor Browser, crawls the website
 def startCrawling():
-    marketName = getMarketName()
+    marketName = getMKTName()
     driver = getAccess()

     if driver != 'down':

@@ -48,7 +48,6 @@ def startCrawling():
 # Login using premade account credentials and do login captcha manually
 def login(driver):
-    time.sleep(3)
     #wait for login page
     input("Press ENTER when CAPTCHA is completed\n")

@@ -60,32 +59,6 @@ def login(driver):
     #Password here
     passwordBox.send_keys('fishowal')

-    # wait for captcha page show up
-    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img")))
-
-    # save captcha to local
-    driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img').screenshot(
-        r'..\WeTheNorth\captcha.png')
-
-    # This method will show image in any image viewer
-    im = Image.open(r'..\WeTheNorth\captcha.png')
-    im.show()
-
-    # wait until input space show up
-    inputBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[4]/input')
-
-    # ask user input captcha solution in terminal
-    userIn = input("Enter solution: ")
-
-    # send user solution into the input space
-    inputBox.send_keys(userIn)
-
-    # click the verify(submit) button
-    driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div[2]/div/div[3]/form/div[5]/input").click()
-
     input("Press ENTER when CAPTCHA is completed\n")

     # wait for listing page show up (This Xpath may need to change based on different seed url)

@@ -93,11 +66,7 @@ def login(driver):
         (By.XPATH, '//*[@id="information"]')))

 # Returns the name of the website
-def getMarketName():
-    name = 'WeTheNorth'
-    return name
-
-def getMKTName() -> str:
+def getMKTName():
     name = 'WeTheNorth'
     return name

@@ -132,8 +101,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    # ff_prof.set_preference("network.dns.disablePrefetch", True)
-    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    ff_prof.set_preference("network.dns.disablePrefetch", True)
+    ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)

@@ -201,10 +170,10 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []

-    # # Fraud Software
-    # links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=5&podcategory=3')
-    # # Guides and Tutorials - Hacking
-    # links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=3&podcategory=3')
+    # Fraud Software
+    links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=5&podcategory=3')
+    # Guides and Tutorials - Hacking
+    links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=3&podcategory=3')
     # Software and Malware
     links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=10')

@@ -222,7 +191,6 @@ def crawlForum(driver):
     print('Crawling :', link)
     try:
-        pg_counter = 1
         has_next_page = True
         count = 0

@@ -244,13 +212,17 @@ def crawlForum(driver):
                        savePage(driver, driver.page_source, item)
                        driver.back()

+                    # comment out
+                    break
+
+                # comment out
+                if count == 1:
+                    break

                try:
                    nav = driver.find_element(by=By.XPATH, value=
                        '/html/body/div[2]/div[3]/div[3]/div[2]/div[7]')
-                    pg_counter += 1
-                    pg_counter_str = "p=" + str(pg_counter) + "&"
-                    a = nav.find_element(by=By.XPATH, value = '//a[contains(@href,"'+pg_counter_str+'")]')
+                    a = nav.find_element(by=By.XPATH, value=f'//a[contains(@href,"p={count + 2}&")]')
                    link = a.get_attribute('href')
                    if link == "":
                        raise NoSuchElementException

@@ -263,7 +235,7 @@ def crawlForum(driver):
         print(link, e)
         i += 1

-    input("Crawling WeTheNorth market done sucessfully. Press ENTER to continue\n")
+    print("Crawling WeTheNorth market done.")

# Returns 'True' if the link is Topic link
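Aside: the pagination rewrite drops the separate pg_counter and derives the next page directly from the loop counter, so the counter can no longer drift from the page actually loaded. Since page 1 carries no "p=" parameter, after finishing page count + 1 the crawler looks for a link whose href contains "p={count + 2}&". A sketch of that lookup (the nav_xpath argument is a placeholder):

    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    def next_page_link(driver, nav_xpath, count):
        # Locate the pagination container, then the anchor whose href
        # points at the page after the one just finished.
        nav = driver.find_element(by=By.XPATH, value=nav_xpath)
        a = nav.find_element(by=By.XPATH, value=f'//a[contains(@href,"p={count + 2}&")]')
        link = a.get_attribute('href')
        if not link:
            raise NoSuchElementException
        return link

One piece of mutable state fewer, and the page-number arithmetic sits next to the place it is used.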


MarketPlaces/WeTheNorth/parser.py  (+66 -53)

@@ -31,39 +31,35 @@ def wethenorth_description_parser(soup):
     left = "-1"          # 16 Product_QuantityLeft
     shipFrom = "-1"      # 17 Product_ShippedFrom
     shipTo = "-1"        # 18 Product_ShippedTo
+    image = "-1"         # 19 Product_Image
+    vendor_image = "-1"  # 20 Vendor_Image

     # Finding Product Name
     listDes = soup.find('div', {'class': "listDes"})
     name = listDes.find('h2').text
-    name = name.replace('\n', ' ')
-    name = name.replace(",", "")
+    name = cleanString(name)
     name = name.strip()

     # Finding Vendor
     vendor = listDes.find('b').text
-    vendor = vendor.replace(",", "")
-    vendor = vendor.replace("...", "")
-    vendor = vendor.replace("-", "")
+    vendor = cleanString(vendor)
     vendor = vendor.strip()

     # Finding Vendor Rating
-    rating = listDes.find('span',{'class':'levelSet'})
+    rating = listDes.find('span', {'class': 'levelSet'})
     rating = rating.text
-    rating = rating.replace('\n', ' ')
-    rating = rating.replace(",", "")
-    rating = rating.strip()
-
-    # Finding Successful Transactions
-    success = listDes.find_all('p')[1]
-    success = success.find('span').text
-    success = success.split()
-    success = success[0].strip()
+    rating = cleanNumbers(rating)
+    rating_vendor = rating.strip()

     # Finding Prices - all prices in We The North are in CAD, I left the CAD in the resulting String so that it would show CAD for all prices
-    padp = listDes.find('p',{'class':'padp'})
+    padp = listDes.find('p', {'class': 'padp'})
     USD = padp.find('span').text
     USD = USD.strip()

+    BTC = padp.find_next_sibling('p').text
+    BTC = cleanNumbers(BTC)
+    BTC = BTC.strip()

     # Finding Escrow - no escrow on WTN market
     shipping_info = listDes.find('tbody')

@@ -73,47 +69,32 @@ def wethenorth_description_parser(soup):
     # Finding Shipment Information (Origin)
     shipFrom = row1[-1].text
-    shipFrom=shipFrom.strip()
-    if shipFrom=="":
-        shipFrom="-1"
+    shipFrom = cleanString(shipFrom)
+    shipFrom = shipFrom.strip()
+    if shipFrom == "":
+        shipFrom = "-1"

     row2 = shipping_info[1].find_all('td')

     # Finding Shipment Information (Destination)
     shipTo = row2[-1].text
-    shipTo= shipTo.strip()
+    shipTo = cleanString(shipTo)
+    shipTo = shipTo.strip()
     if shipTo == "":
         shipTo = "-1"

     # Finding the Product description
-    describe = soup.find("div",{'class':'tabcontent'})
+    describe = soup.find("div", {'class': 'tabcontent'})
     describe = describe.find('p').text
-    describe = describe.replace("\n", " ")
-    describe = describe.replace("\r", " ")
+    describe = cleanString(describe)
     describe = describe.strip()

-    # cannot find any tag for these
-    '''
-    # Finding the Number of Product Reviews
-    tag = soup.findAll(text=re.compile('Reviews'))
-    for index in tag:
-        reviews = index
-        par = reviews.find('(')
-        if par >= 0:
-            reviews = reviews.replace("Reviews (", "")
-            reviews = reviews.replace(")", "")
-            reviews = reviews.split(",")
-            review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
-        else:
-            review = "-1"
-    '''

     # Searching for CVE and MS categories
     # no CVE or MS for WTN market

     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
-           BTC, USD, EURO, sold, left, shipFrom, shipTo)
+           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)

     # Sending the results
     return row

@@ -144,7 +125,9 @@ def wethenorth_listing_parser(soup):
     qLeft = []         # 17 Product_QuantityLeft
     shipFrom = []      # 18 Product_ShippedFrom
     shipTo = []        # 19 Product_ShippedTo
-    href = []          # 20 Product_Links
+    image = []         # 20 Product_Image
+    image_vendor = []  # 21 Vendor_Image
+    href = []          # 22 Product_Links

     right_content = soup.find('div', {"class": "right-content"})
     listing = right_content.findAll('div', {"class": "col-1search"})

@@ -158,20 +141,19 @@ def wethenorth_listing_parser(soup):
         # Adding the url to the list of urls
         link = bae[0].get('href')
-        link = cleanLink(link)
         href.append(link)

         # Finding the Vendor
         vendor_name = a.find('p', {'class': 'padp'})
         vendor_name = vendor_name.find('a').text
-        vendor_name = vendor_name.replace(",", "")
+        vendor_name = cleanString(vendor_name)
         vendor_name = vendor_name.strip()
         vendor.append(vendor_name)

         # Finding the Product
-        product = bae[0].text
-        product = product.replace('\n', ' ')
-        product = product.replace(",", "")
+        product = a.find('div', {'class': 'col-1centre'})
+        product = product.find('div', {'class': 'head'}).find('a').text
+        product = cleanString(product)
         product = product.strip()
         name.append(product)

@@ -179,25 +161,56 @@ def wethenorth_listing_parser(soup):
         category_name = a.find('p', {'class': 'padp'}).text
         first_dash = category_name.find('-')
         second_dash = category_name[first_dash+1:].find('-')
-        category_name = category_name[first_dash+1:second_dash]
-        category_name=category_name.strip()
+        category_name = category_name[first_dash+1: first_dash + second_dash]
+        category_name = cleanString(category_name)
+        category_name = category_name.strip()
         category.append(category_name)

+        # Finding Success Transactions
+        vendor_success = a.find('p', {'class': 'padp'}).text
+        first_dash = vendor_success.find('(')
+        vendor_success = vendor_success[first_dash + 1:]
+        vendor_success = cleanNumbers(vendor_success)
+        vendor_success = vendor_success.strip()
+        success.append(vendor_success)

         # Finding Views
         view_count = a.text
         view_count = view_count[view_count.find('Views:'): view_count.find('Sales:')]
         view_count = view_count.replace('Views:', ' ')
+        view_count = view_count.replace('/', ' ')
+        view_count = cleanNumbers(view_count)
         view_count = view_count.strip()
         views.append(view_count)

-        # Finding success sales
+        # Finding Quantity Sold
         sold_count = a.text
         sold_count = sold_count[sold_count.find('Sales:'): sold_count.find('Short')]
         sold_count = sold_count.replace('Sales:', ' ')
+        sold_count = sold_count.replace('/', ' ')
+        sold_count = cleanNumbers(sold_count)
         sold_count = sold_count.strip()
-        success.append(sold_count)
+        sold.append(sold_count)

+        right = a.find('div', {'class': 'col-1right'})

+        # Finding USD
+        usd = right.find('a').text
+        usd = "CAD " + usd.strip()
+        USD.append(usd)

+        # Finding BTC
+        btc = right.text
+        first_dash = btc.find('(')
+        second_dash = btc[first_dash + 1:].find(')')
+        btc = btc[first_dash + 1: first_dash + second_dash]
+        btc = cleanNumbers(btc)
+        btc = btc.strip()
+        BTC.append(btc)

+        # Finding Product Image
+        product_image = right.find('img')
+        product_image = product_image.get('src')
+        product_image = product_image.split('base64,')[-1]
+        image.append(product_image)

         # Searching for CVE and MS categories
         # no CVE or MS in WTN market

@@ -229,7 +242,7 @@ def wethenorth_listing_parser(soup):
     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
-                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)

def wethenorth_links_parser(soup):
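Aside: the new image fields capture thumbnails that the market embeds as data: URIs, keeping only the payload after "base64,". A sketch with a hypothetical snippet showing that the stored string decodes back to raw image bytes:

    import base64
    from bs4 import BeautifulSoup

    # Hypothetical listing fragment with an inline data-URI thumbnail.
    html = '<div class="col-1right"><img src="data:image/png;base64,iVBORw0KGgo="></div>'
    soup = BeautifulSoup(html, 'html.parser')

    product_image = soup.find('img').get('src')
    product_image = product_image.split('base64,')[-1]  # strip the data-URI prefix

    raw = base64.b64decode(product_image)               # back to PNG bytes
    print(len(raw), "bytes")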

