Browse Source

debugged LionMarketplace, Nexus, and RobinhoodMarket

main
westernmeadow 1 year ago
parent
commit
803f4b47a2
6 changed files with 80 additions and 65 deletions
  1. +3
    -3
      Forums/Initialization/prepare_parser.py
  2. +4
    -3
      MarketPlaces/Initialization/prepare_parser.py
  3. +1
    -1
      MarketPlaces/LionMarketplace/parser.py
  4. +15
    -5
      MarketPlaces/Nexus/crawler_selenium.py
  5. +49
    -48
      MarketPlaces/Nexus/parser.py
  6. +8
    -5
      MarketPlaces/RobinhoodMarket/parser.py

+ 3
- 3
Forums/Initialization/prepare_parser.py View File

@ -206,17 +206,17 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript
def move_file(filePath, createLog, logFile):
# source = line2.replace(os.path.basename(line2), "") + filename
source = filePath
destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
try:
shutil.move(source, destination)
shutil.move(source, destination, shutil.copytree)
return True
except:
print("There was a problem to move the file " + filePath)
incrementError()
print("There was a problem to move the file " + filePath)
traceback.print_exc()
if createLog:
logFile.write(
str(nError) + ". There was a problem to move the file " + filePath + "\n")


+ 4
- 3
MarketPlaces/Initialization/prepare_parser.py View File

@ -253,17 +253,18 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript
def move_file(filePath, createLog, logFile):
# source = line2.replace(os.path.basename(line2), "") + filename
source = filePath
destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
try:
shutil.move(source, destination)
shutil.move(source, destination, shutil.copytree)
return True
except:
print("There was a problem to move the file " + filePath)
incrementError()
print("There was a problem to move the file " + filePath)
traceback.print_exc()
if createLog:
logFile.write(
str(nError) + ". There was a problem to move the file " + filePath + "\n")


+ 1
- 1
MarketPlaces/LionMarketplace/parser.py View File

@ -56,7 +56,7 @@ def lionmarketplace_description_parser(soup):
name = (cleanString(temp.strip()))
# product description
temp = soup.find('div', {'class': "mt-4"}).find(text=True, recursive=False)
temp = soup.find('div', {'class': "mt-4"}).contents[-1]
describe = cleanString(temp.strip())
# Finding Product Image


+ 15
- 5
MarketPlaces/Nexus/crawler_selenium.py View File

@ -85,8 +85,8 @@ def createFFDriver():
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@ -96,7 +96,7 @@ def createFFDriver():
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
@ -204,6 +204,12 @@ def crawlForum(driver):
driver.get(link)
except:
driver.refresh()
# waiting for btc price to load
WebDriverWait(driver, 30).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[1]/div[2]/div/div/main/ul/li[1]/div/span/span[3]")))
time.sleep(5)
html = driver.page_source
savePage(driver, html, link)
@ -214,6 +220,11 @@ def crawlForum(driver):
driver.get(itemURL)
except:
driver.refresh()
# waiting for btc price to load
WebDriverWait(driver, 30).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[1]/div[2]/div/div/main/div[3]/div[2]/p/span[3]")))
savePage(driver, driver.page_source, item)
driver.back()
@ -225,8 +236,7 @@ def crawlForum(driver):
break
try:
link = driver.find_element(by=By.XPATH, value=
'/html/body/div[1]/div[2]/div/div/main/nav/ul/li[3]/a').get_attribute('href')
link = driver.find_element(by=By.LINK_TEXT, value='').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1


+ 49
- 48
MarketPlaces/Nexus/parser.py View File

@ -43,6 +43,10 @@ def nexus_description_parser(soup):
name_of_product = soup.find("h1", {"class": "product_title entry-title"}).text
name = cleanString(name_of_product.strip())
# Find the BTC Price
prices = soup.find('p', {"class": "price"}).findAll('span', {"class": "cs"})
BTC = prices[0].text
BTC = cleanNumbers(BTC.strip())
# finding the description of the product
description_div = soup.find("div", {"class": "woocommerce-product-details__short-description"})
@ -52,7 +56,7 @@ def nexus_description_parser(soup):
describe = cleanString(description_div.text.strip())
# Finding Product Image
image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img')
image = soup.find('div', {'class': 'woocommerce-product-gallery__wrapper'}).find('img')
image = image.get('src')
image = image.split('base64,')[-1]
@ -110,56 +114,53 @@ def nexus_listing_parser(soup):
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
products_list = soup.find_all('li')
nm = 0
main = soup.find('main', {'id': 'main'})
products_list = main.find('ul', recursive=False).find_all('li', recursive=False)
nm = len(products_list)
for product in products_list:
# Finding the name of the product
name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
name_of_product_cleaned = cleanString(name_of_product.strip())
# print(name_of_product_cleaned)
name.append(name_of_product_cleaned)
#finding the URL
try:
# Finding the name of the product
name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
name_of_product_cleaned = cleanString(name_of_product.strip())
# print(name_of_product_cleaned)
name.append(name_of_product_cleaned)
#finding the URL
try:
url = product.find("a", class_="woocommerce-loop-product__link").get('href')
href.append(url)
except AttributeError as e:
print("I can't find the link")
raise e
# Finding Product Image
product_image = product.find('a', {'class': 'woocommerce-loop-image-link woocommerce-LoopProduct-link woocommerce-loop-product__link'}).find('img')
product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)
BTC.append("-1")
#everything else appends a -1
rating_vendor.append("-1")
USD.append("-1")
vendor.append(mktName)
success.append("-1")
CVE.append("-1")
MS.append("-1")
category.append("-1")
describe.append("-1")
views.append("-1")
reviews.append("-1")
addDate.append("-1")
EURO.append("-1")
sold.append("-1")
qLeft.append("-1")
shipFrom.append("-1")
shipTo.append("-1")
image_vendor.append("-1")
# print("Done! moving onto the next product!")
# print(len(shipTo))
nm += 1
url = product.find("a", class_="woocommerce-loop-product__link").get('href')
href.append(url)
except AttributeError as e:
print("I'm somewhere I don't belong. I'm going to leave")
continue
print("I can't find the link")
raise e
# Finding Product Image
product_image = product.find('a', {'class': 'woocommerce-loop-image-link woocommerce-LoopProduct-link woocommerce-loop-product__link'}).find('img')
product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)
# Finding BTC Price
prices = product.find('span', {"class": "price"}).findAll('span', {"class": "cs"})
price = prices[0].text
BTC.append(cleanNumbers(price.strip()))
#everything else appends a -1
rating_vendor.append("-1")
USD.append("-1")
vendor.append('-1')
success.append("-1")
CVE.append("-1")
MS.append("-1")
category.append("-1")
describe.append("-1")
views.append("-1")
reviews.append("-1")
addDate.append("-1")
EURO.append("-1")
sold.append("-1")
qLeft.append("-1")
shipFrom.append("-1")
shipTo.append("-1")
image_vendor.append("-1")
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(


+ 8
- 5
MarketPlaces/RobinhoodMarket/parser.py View File

@ -51,14 +51,17 @@ def Robinhood_description_parser(soup):
# Finding description
desc = ''
tab = soup.find('div', {"id": "tab-description"})
for p in tab.findAll('p'):
desc += p.text
if tab is not None:
for p in tab.findAll('p'):
desc += p.text
if desc == '':
desc = soup.find('div', {"class": "woocommerce-product-details__short-description"}).text
short = soup.find('div', {"class": "woocommerce-product-details__short-description"})
if short is not None:
desc = short.text
describe = cleanString(desc.strip())
# Finding Product Image
image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img')
image = soup.find('div', {'class': 'woocommerce-product-gallery__wrapper'}).find('img')
image = image.get('src')
image = image.split('base64,')[-1]
@ -164,7 +167,7 @@ def Robinhood_listing_parser(soup):
name.append(product)
# Finding Product Image
product_image = card.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'})
product_image = card.find('a').find('img')
product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)


Loading…
Cancel
Save