diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index 1f089e6..e3cc468 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -206,17 +206,17 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript def move_file(filePath, createLog, logFile): - # source = line2.replace(os.path.basename(line2), "") + filename source = filePath destination = filePath.replace(os.path.basename(filePath), "") + r'Read/' try: - shutil.move(source, destination) + shutil.move(source, destination, shutil.copytree) return True except: - print("There was a problem to move the file " + filePath) incrementError() + print("There was a problem to move the file " + filePath) + traceback.print_exc() if createLog: logFile.write( str(nError) + ". There was a problem to move the file " + filePath + "\n") diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index c7699bd..c50bb1a 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -253,17 +253,18 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript def move_file(filePath, createLog, logFile): - # source = line2.replace(os.path.basename(line2), "") + filename + source = filePath destination = filePath.replace(os.path.basename(filePath), "") + r'Read/' try: - shutil.move(source, destination) + shutil.move(source, destination, shutil.copytree) return True except: - print("There was a problem to move the file " + filePath) incrementError() + print("There was a problem to move the file " + filePath) + traceback.print_exc() if createLog: logFile.write( str(nError) + ". There was a problem to move the file " + filePath + "\n") diff --git a/MarketPlaces/LionMarketplace/parser.py b/MarketPlaces/LionMarketplace/parser.py index 81a911c..a37febf 100644 --- a/MarketPlaces/LionMarketplace/parser.py +++ b/MarketPlaces/LionMarketplace/parser.py @@ -56,7 +56,7 @@ def lionmarketplace_description_parser(soup): name = (cleanString(temp.strip())) # product description - temp = soup.find('div', {'class': "mt-4"}).find(text=True, recursive=False) + temp = soup.find('div', {'class': "mt-4"}).contents[-1] describe = cleanString(temp.strip()) # Finding Product Image diff --git a/MarketPlaces/Nexus/crawler_selenium.py b/MarketPlaces/Nexus/crawler_selenium.py index d7c84c2..4ae7cfe 100644 --- a/MarketPlaces/Nexus/crawler_selenium.py +++ b/MarketPlaces/Nexus/crawler_selenium.py @@ -85,8 +85,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -96,7 +96,7 @@ def createFFDriver(): ff_prof.set_preference('network.proxy.socks', '127.0.0.1') ff_prof.set_preference('network.proxy.socks_port', 9150) ff_prof.set_preference('network.proxy.socks_remote_dns', True) - ff_prof.set_preference("javascript.enabled", False) + ff_prof.set_preference("javascript.enabled", True) ff_prof.update_preferences() service = Service(config.get('TOR', 'geckodriver_path')) @@ -204,6 +204,12 @@ def crawlForum(driver): driver.get(link) except: driver.refresh() + + # waiting for btc price to load + WebDriverWait(driver, 30).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[1]/div[2]/div/div/main/ul/li[1]/div/span/span[3]"))) + time.sleep(5) + html = driver.page_source savePage(driver, html, link) @@ -214,6 +220,11 @@ def crawlForum(driver): driver.get(itemURL) except: driver.refresh() + + # waiting for btc price to load + WebDriverWait(driver, 30).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[1]/div[2]/div/div/main/div[3]/div[2]/p/span[3]"))) + savePage(driver, driver.page_source, item) driver.back() @@ -225,8 +236,7 @@ def crawlForum(driver): break try: - link = driver.find_element(by=By.XPATH, value= - '/html/body/div[1]/div[2]/div/div/main/nav/ul/li[3]/a').get_attribute('href') + link = driver.find_element(by=By.LINK_TEXT, value='→').get_attribute('href') if link == "": raise NoSuchElementException count += 1 diff --git a/MarketPlaces/Nexus/parser.py b/MarketPlaces/Nexus/parser.py index 093188e..f673110 100644 --- a/MarketPlaces/Nexus/parser.py +++ b/MarketPlaces/Nexus/parser.py @@ -43,6 +43,10 @@ def nexus_description_parser(soup): name_of_product = soup.find("h1", {"class": "product_title entry-title"}).text name = cleanString(name_of_product.strip()) + # Find the BTC Price + prices = soup.find('p', {"class": "price"}).findAll('span', {"class": "cs"}) + BTC = prices[0].text + BTC = cleanNumbers(BTC.strip()) # finding the description of the product description_div = soup.find("div", {"class": "woocommerce-product-details__short-description"}) @@ -52,7 +56,7 @@ def nexus_description_parser(soup): describe = cleanString(description_div.text.strip()) # Finding Product Image - image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img') + image = soup.find('div', {'class': 'woocommerce-product-gallery__wrapper'}).find('img') image = image.get('src') image = image.split('base64,')[-1] @@ -110,56 +114,53 @@ def nexus_listing_parser(soup): image_vendor = [] # 21 Vendor_Image href = [] # 22 Product_Links - products_list = soup.find_all('li') - nm = 0 + main = soup.find('main', {'id': 'main'}) + products_list = main.find('ul', recursive=False).find_all('li', recursive=False) + nm = len(products_list) + for product in products_list: + # Finding the name of the product + name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text + name_of_product_cleaned = cleanString(name_of_product.strip()) + # print(name_of_product_cleaned) + name.append(name_of_product_cleaned) + #finding the URL try: - # Finding the name of the product - name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text - name_of_product_cleaned = cleanString(name_of_product.strip()) - # print(name_of_product_cleaned) - name.append(name_of_product_cleaned) - #finding the URL - try: - url = product.find("a", class_="woocommerce-loop-product__link").get('href') - href.append(url) - except AttributeError as e: - print("I can't find the link") - raise e - - # Finding Product Image - product_image = product.find('a', {'class': 'woocommerce-loop-image-link woocommerce-LoopProduct-link woocommerce-loop-product__link'}).find('img') - product_image = product_image.get('src') - product_image = product_image.split('base64,')[-1] - image.append(product_image) - - BTC.append("-1") - - #everything else appends a -1 - rating_vendor.append("-1") - USD.append("-1") - vendor.append(mktName) - success.append("-1") - CVE.append("-1") - MS.append("-1") - category.append("-1") - describe.append("-1") - views.append("-1") - reviews.append("-1") - addDate.append("-1") - EURO.append("-1") - sold.append("-1") - qLeft.append("-1") - shipFrom.append("-1") - shipTo.append("-1") - image_vendor.append("-1") - # print("Done! moving onto the next product!") - # print(len(shipTo)) - nm += 1 + url = product.find("a", class_="woocommerce-loop-product__link").get('href') + href.append(url) except AttributeError as e: - print("I'm somewhere I don't belong. I'm going to leave") - continue - + print("I can't find the link") + raise e + + # Finding Product Image + product_image = product.find('a', {'class': 'woocommerce-loop-image-link woocommerce-LoopProduct-link woocommerce-loop-product__link'}).find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) + + # Finding BTC Price + prices = product.find('span', {"class": "price"}).findAll('span', {"class": "cs"}) + price = prices[0].text + BTC.append(cleanNumbers(price.strip())) + + #everything else appends a -1 + rating_vendor.append("-1") + USD.append("-1") + vendor.append('-1') + success.append("-1") + CVE.append("-1") + MS.append("-1") + category.append("-1") + describe.append("-1") + views.append("-1") + reviews.append("-1") + addDate.append("-1") + EURO.append("-1") + sold.append("-1") + qLeft.append("-1") + shipFrom.append("-1") + shipTo.append("-1") + image_vendor.append("-1") # Populate the final variable (this should be a list with all fields scraped) return organizeProducts( diff --git a/MarketPlaces/RobinhoodMarket/parser.py b/MarketPlaces/RobinhoodMarket/parser.py index 5de7a70..c036d17 100644 --- a/MarketPlaces/RobinhoodMarket/parser.py +++ b/MarketPlaces/RobinhoodMarket/parser.py @@ -51,14 +51,17 @@ def Robinhood_description_parser(soup): # Finding description desc = '' tab = soup.find('div', {"id": "tab-description"}) - for p in tab.findAll('p'): - desc += p.text + if tab is not None: + for p in tab.findAll('p'): + desc += p.text if desc == '': - desc = soup.find('div', {"class": "woocommerce-product-details__short-description"}).text + short = soup.find('div', {"class": "woocommerce-product-details__short-description"}) + if short is not None: + desc = short.text describe = cleanString(desc.strip()) # Finding Product Image - image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img') + image = soup.find('div', {'class': 'woocommerce-product-gallery__wrapper'}).find('img') image = image.get('src') image = image.split('base64,')[-1] @@ -164,7 +167,7 @@ def Robinhood_listing_parser(soup): name.append(product) # Finding Product Image - product_image = card.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'}) + product_image = card.find('a').find('img') product_image = product_image.get('src') product_image = product_image.split('base64,')[-1] image.append(product_image)