debugged LionMarketplace, Nexus, and RobinhoodMarket

1 year ago · 803f4b47a2
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -206,17 +206,17 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript

 def move_file(filePath, createLog, logFile):

    # source = line2.replace(os.path.basename(line2), "") + filename
    source = filePath
    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'

    try:
        shutil.move(source, destination)
        shutil.move(source, destination, shutil.copytree)
        return True
    except:

        print("There was a problem to move the file " + filePath)
        incrementError()
        print("There was a problem to move the file " + filePath)
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem to move the file " + filePath + "\n")
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -253,17 +253,18 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript


 def move_file(filePath, createLog, logFile):
    # source = line2.replace(os.path.basename(line2), "") + filename

    source = filePath
    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'

    try:
        shutil.move(source, destination)
        shutil.move(source, destination, shutil.copytree)
        return True
    except:

        print("There was a problem to move the file " + filePath)
        incrementError()
        print("There was a problem to move the file " + filePath)
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem to move the file " + filePath + "\n")
--- a/MarketPlaces/LionMarketplace/parser.py
+++ b/MarketPlaces/LionMarketplace/parser.py
@@ -56,7 +56,7 @@ def lionmarketplace_description_parser(soup):
    name = (cleanString(temp.strip()))

    # product description
    temp = soup.find('div', {'class': "mt-4"}).find(text=True, recursive=False)
    temp = soup.find('div', {'class': "mt-4"}).contents[-1]
    describe = cleanString(temp.strip())

    # Finding Product Image
--- a/MarketPlaces/Nexus/crawler_selenium.py
+++ b/MarketPlaces/Nexus/crawler_selenium.py
@@ -85,8 +85,8 @@ def createFFDriver():
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    ff_prof.set_preference("network.dns.disablePrefetch", True)
    ff_prof.set_preference("network.http.sendRefererHeader", 0)
    # ff_prof.set_preference("network.dns.disablePrefetch", True)
    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -96,7 +96,7 @@ def createFFDriver():
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.set_preference("javascript.enabled", True)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))
@@ -204,6 +204,12 @@ def crawlForum(driver):
                    driver.get(link)
                except:
                    driver.refresh()

                # waiting for btc price to load
                WebDriverWait(driver, 30).until(EC.visibility_of_element_located(
                    (By.XPATH, "/html/body/div[1]/div[2]/div/div/main/ul/li[1]/div/span/span[3]")))
                time.sleep(5)

                html = driver.page_source
                savePage(driver, html, link)

@@ -214,6 +220,11 @@ def crawlForum(driver):
                        driver.get(itemURL)
                    except:
                        driver.refresh()

                    # waiting for btc price to load
                    WebDriverWait(driver, 30).until(EC.visibility_of_element_located(
                        (By.XPATH, "/html/body/div[1]/div[2]/div/div/main/div[3]/div[2]/p/span[3]")))

                    savePage(driver, driver.page_source, item)
                    driver.back()

@@ -225,8 +236,7 @@ def crawlForum(driver):
                    break

                try:
                    link = driver.find_element(by=By.XPATH, value=
                        '/html/body/div[1]/div[2]/div/div/main/nav/ul/li[3]/a').get_attribute('href')
                    link = driver.find_element(by=By.LINK_TEXT, value='→').get_attribute('href')
                    if link == "":
                        raise NoSuchElementException
                    count += 1
--- a/MarketPlaces/Nexus/parser.py
+++ b/MarketPlaces/Nexus/parser.py
@@ -43,6 +43,10 @@ def nexus_description_parser(soup):
    name_of_product = soup.find("h1", {"class": "product_title entry-title"}).text
    name = cleanString(name_of_product.strip())

    # Find the BTC Price
    prices = soup.find('p', {"class": "price"}).findAll('span', {"class": "cs"})
    BTC = prices[0].text
    BTC = cleanNumbers(BTC.strip())

    # finding the description of the product
    description_div = soup.find("div", {"class": "woocommerce-product-details__short-description"})
@@ -52,7 +56,7 @@ def nexus_description_parser(soup):
        describe = cleanString(description_div.text.strip())

    # Finding Product Image
    image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img')
    image = soup.find('div', {'class': 'woocommerce-product-gallery__wrapper'}).find('img')
    image = image.get('src')
    image = image.split('base64,')[-1]

@@ -110,56 +114,53 @@ def nexus_listing_parser(soup):
    image_vendor = []                         # 21 Vendor_Image
    href = []                                 # 22 Product_Links

    products_list = soup.find_all('li')
    nm = 0
    main = soup.find('main', {'id': 'main'})
    products_list = main.find('ul', recursive=False).find_all('li', recursive=False)
    nm = len(products_list)

    for product in products_list:
        # Finding the name of the product
        name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
        name_of_product_cleaned = cleanString(name_of_product.strip())
        # print(name_of_product_cleaned)
        name.append(name_of_product_cleaned)
        #finding the URL
        try:
            # Finding the name of the product
            name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
            name_of_product_cleaned = cleanString(name_of_product.strip())
            # print(name_of_product_cleaned)
            name.append(name_of_product_cleaned)
            #finding the URL
            try:
                url = product.find("a", class_="woocommerce-loop-product__link").get('href')
                href.append(url)
            except AttributeError as e:
                print("I can't find the link")
                raise e

            # Finding Product Image
            product_image = product.find('a', {'class': 'woocommerce-loop-image-link woocommerce-LoopProduct-link woocommerce-loop-product__link'}).find('img')
            product_image = product_image.get('src')
            product_image = product_image.split('base64,')[-1]
            image.append(product_image)

            BTC.append("-1")

            #everything else appends a -1
            rating_vendor.append("-1")
            USD.append("-1")
            vendor.append(mktName)
            success.append("-1")
            CVE.append("-1")
            MS.append("-1")
            category.append("-1")
            describe.append("-1")
            views.append("-1")
            reviews.append("-1")
            addDate.append("-1")
            EURO.append("-1")
            sold.append("-1")
            qLeft.append("-1")
            shipFrom.append("-1")
            shipTo.append("-1")
            image_vendor.append("-1")
            # print("Done! moving onto the next product!")
            # print(len(shipTo))
            nm += 1
            url = product.find("a", class_="woocommerce-loop-product__link").get('href')
            href.append(url)
        except AttributeError as e:
            print("I'm somewhere I don't belong. I'm going to leave")
            continue

            print("I can't find the link")
            raise e

        # Finding Product Image
        product_image = product.find('a', {'class': 'woocommerce-loop-image-link woocommerce-LoopProduct-link woocommerce-loop-product__link'}).find('img')
        product_image = product_image.get('src')
        product_image = product_image.split('base64,')[-1]
        image.append(product_image)

        # Finding BTC Price
        prices = product.find('span', {"class": "price"}).findAll('span', {"class": "cs"})
        price = prices[0].text
        BTC.append(cleanNumbers(price.strip()))

        #everything else appends a -1
        rating_vendor.append("-1")
        USD.append("-1")
        vendor.append('-1')
        success.append("-1")
        CVE.append("-1")
        MS.append("-1")
        category.append("-1")
        describe.append("-1")
        views.append("-1")
        reviews.append("-1")
        addDate.append("-1")
        EURO.append("-1")
        sold.append("-1")
        qLeft.append("-1")
        shipFrom.append("-1")
        shipTo.append("-1")
        image_vendor.append("-1")

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(
--- a/MarketPlaces/RobinhoodMarket/parser.py
+++ b/MarketPlaces/RobinhoodMarket/parser.py
@@ -51,14 +51,17 @@ def Robinhood_description_parser(soup):
    # Finding description
    desc = ''
    tab = soup.find('div', {"id": "tab-description"})
    for p in tab.findAll('p'):
        desc += p.text
    if tab is not None:
        for p in tab.findAll('p'):
            desc += p.text
    if desc == '':
        desc = soup.find('div', {"class": "woocommerce-product-details__short-description"}).text
        short = soup.find('div', {"class": "woocommerce-product-details__short-description"})
        if short is not None:
            desc = short.text
    describe = cleanString(desc.strip())

    # Finding Product Image
    image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img')
    image = soup.find('div', {'class': 'woocommerce-product-gallery__wrapper'}).find('img')
    image = image.get('src')
    image = image.split('base64,')[-1]

@@ -164,7 +167,7 @@ def Robinhood_listing_parser(soup):
        name.append(product)

        # Finding Product Image
        product_image = card.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'})
        product_image = card.find('a').find('img')
        product_image = product_image.get('src')
        product_image = product_image.split('base64,')[-1]
        image.append(product_image)