fully ran Apocalypse and DarkBazar, fixed move bug

main
westernmeadow committed 1 year ago
parent · commit 0844b120bc

9 changed files with 107 additions and 72 deletions
  1. .idea/DW_Pipeline_Test.iml (+1 -0)
  2. Forums/Initialization/prepare_parser.py (+25 -22)
  3. MarketPlaces/Apocalypse/crawler_selenium.py (+14 -14)
  4. MarketPlaces/Apocalypse/parser.py (+4 -1)
  5. MarketPlaces/DarkBazar/crawler_selenium.py (+8 -8)
  6. MarketPlaces/Initialization/prepare_parser.py (+22 -13)
  7. MarketPlaces/Tor2door/crawler_selenium.py (+10 -10)
  8. MarketPlaces/Tor2door/parser.py (+22 -3)
  9. setup.ini (+1 -1)

.idea/DW_Pipeline_Test.iml (+1 -0)

@@ -29,6 +29,7 @@
         <option value="$MODULE_DIR$/Forums/Procrax" />
         <option value="$MODULE_DIR$/MarketPlaces/DarkBazar" />
         <option value="$MODULE_DIR$/MarketPlaces/AnonMarket" />
+        <option value="$MODULE_DIR$/MarketPlaces/Tor2door" />
       </list>
     </option>
   </component>

Forums/Initialization/prepare_parser.py (+25 -22)

@@ -105,7 +105,7 @@ def read_file(filePath, createLog, logFile):
         print("There was a problem to read the file " + filePath)
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to read the file " + filePath + "\n")
+                str(nError) + ". There was a problem to read the file " + filePath + "\n" + traceback.format_exc() + "\n")

     return None

@@ -141,7 +141,8 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n")
+                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n"
+                + traceback.format_exc() + "\n")

     return None

@@ -177,7 +178,8 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n")
+                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n"
+                + traceback.format_exc() + "\n")

     return None

@@ -191,17 +193,14 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript
         con.rollback()

-        trace = traceback.format_exc()
-        if trace.find("already exists") == -1:
-            incrementError()
-            print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
-            traceback.print_exc()
-            if createLog:
-                logFile.write(str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
-            return False
-        else:
-            return True
+        incrementError()
+        print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
+        traceback.print_exc()
+        if createLog:
+            logFile.write(
+                str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n"
+                + traceback.format_exc() + "\n")
+        return False


 def move_file(filePath, createLog, logFile):

@@ -210,17 +209,21 @@
     destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'

     try:
-        shutil.move(source, destination, shutil.copytree)
+        shutil.move(source, destination, shutil.copy2)
         return True
     except:
-        incrementError()
-        print("There was a problem to move the file " + filePath)
-        traceback.print_exc()
-        if createLog:
-            logFile.write(
-                str(nError) + ". There was a problem to move the file " + filePath + "\n")
-        return False
+        try:
+            shutil.move(source, destination, shutil.copytree)
+            return True
+        except:
+            incrementError()
+            print("There was a problem to move the file " + filePath)
+            traceback.print_exc()
+            if createLog:
+                logFile.write(
+                    str(nError) + ". There was a problem to move the file " + filePath + "\n" + traceback.format_exc() + "\n")
+            return False


 #main method for this program, what actually gets the parsed info from the parser, and persists them into the db
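
Note: shutil.move()'s optional third argument is copy_function, which is only used when the move has to fall back to copy-and-delete (for example, across filesystems). shutil.copytree fails on regular files, so the hunk above tries shutil.copy2 first and keeps copytree only as a directory fallback. A minimal sketch of that pattern; safe_move and the paths are hypothetical, not names from this repo:

import shutil

def safe_move(source, destination):
    # Try the file-appropriate copy function first; copy2 preserves metadata.
    try:
        shutil.move(source, destination, shutil.copy2)
        return True
    except (shutil.Error, OSError):
        # Fall back for directories, where copytree is the right copier.
        try:
            shutil.move(source, destination, shutil.copytree)
            return True
        except (shutil.Error, OSError):
            return False

# Hypothetical usage:
# safe_move('scraped/listing.html', 'scraped/Read/')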


MarketPlaces/Apocalypse/crawler_selenium.py (+14 -14)

@@ -189,12 +189,12 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []

-    # # Digital Goods
-    # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/74')
-    # # Fraud
-    # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/75')
-    # # Services
-    # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/76')
+    # Digital Goods
+    links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/74')
+    # Fraud
+    links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/75')
+    # Services
+    links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/76')
     # software and malware
     links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/30')

@@ -239,16 +239,16 @@ def crawlForum(driver):
                     except:
                         driver.refresh()

-                    # comment out
-                    # break
-
-                # comment out
-                if count == 1:
-                    break
+                    # # comment out
+                    # break
+                    #
+                # # comment out
+                # if count == 1:
+                #     break

                 try:
-                    link = driver.find_element(by=By.XPATH, value=
-                        '/html/body/div[1]/div/div[2]/nav/ul/li[5]/a').get_attribute('href')
+                    nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/div[2]/nav')
+                    link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='»').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
                     count += 1
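
Note: the pagination fix above replaces a fully absolute XPath whose li[5] index points at a different element whenever the pager renders a different number of page links. Scoping to the pager nav and finding the '»' anchor by its visible text is stable across page counts. A sketch of the same idea, assuming Selenium 4; get_next_page_link is a hypothetical name and driver setup is elided:

from selenium.webdriver.common.by import By

def get_next_page_link(driver):
    # Scope to the pager, then find the next-page anchor by its text, which
    # stays the same no matter how many numbered links the pager shows.
    nav = driver.find_element(by=By.XPATH, value='//nav')
    return nav.find_element(by=By.PARTIAL_LINK_TEXT, value='»').get_attribute('href')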


MarketPlaces/Apocalypse/parser.py (+4 -1)

@@ -113,7 +113,10 @@ def apocalypse_listing_parser(soup: Tag):
     image_vendor = []  # 21 Vendor_Image
     href = []  # 22 Product_Links

-    listings: ResultSet[Tag] = soup.find("div", {"class": "col-lg-9 my-4"}).find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})
+    table = soup.find("div", {"class": "col-lg-9 my-4"})
+    if table is None:
+        table = soup.find("div", {"class": "col-lg-9"})
+    listings: ResultSet[Tag] = table.find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})

     for prod in listings:
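
Note: the guard above suggests some Apocalypse listing pages render the results column without the my-4 class, which made the old chained soup.find(...).find_all(...) raise AttributeError on None. A self-contained sketch of the fallback against made-up HTML:

from bs4 import BeautifulSoup

html = '<div class="col-lg-9"><div class="col-lg-4 col-md-6 mb-1">item</div></div>'
soup = BeautifulSoup(html, 'html.parser')

# A multi-word class filter matches the full class string, so the strict
# selector misses here and the looser one catches the container.
table = soup.find("div", {"class": "col-lg-9 my-4"})
if table is None:
    table = soup.find("div", {"class": "col-lg-9"})
print(len(table.find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})))  # 1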


MarketPlaces/DarkBazar/crawler_selenium.py (+8 -8)

@@ -175,8 +175,8 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []

-    # # Digital Goods
-    # links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=4')
+    # Digital Goods
+    links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=3')
     # Services
     links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=5')

@@ -216,12 +216,12 @@ def crawlForum(driver):
                     savePage(driver, driver.page_source, item)
                     driver.back()

-                    # comment out
-                    # break
-
-                # comment out
-                if count == 1:
-                    break
+                    # # comment out
+                    # break
+                    #
+                # # comment out
+                # if count == 1:
+                #     break

                 try:
                     link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
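
Note: the "# comment out" blocks in these crawlers appear to gate a test mode: the first break stops after the first product, and the count == 1 break stops after the second listing page. Commenting both out here and in the Apocalypse hunk lets the crawl run to completion, matching the commit message. A self-contained sketch of the page-level toggle; all names are hypothetical stand-ins for the crawler's internals:

def crawl(pages, test_mode=False):
    count = 0
    for page in pages:
        print('saving', page)          # stands in for savePage(...)
        # comment out (test mode stops the crawl after the second page)
        if test_mode and count == 1:
            break
        count += 1

crawl(['page1', 'page2', 'page3'], test_mode=True)   # saves page1, page2
crawl(['page1', 'page2', 'page3'])                   # saves all three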


MarketPlaces/Initialization/prepare_parser.py (+22 -13)

@@ -4,6 +4,8 @@ import glob
 import os
 import codecs
 import shutil
+import traceback
+
 from MarketPlaces.DB_Connection.db_connection import *
 from MarketPlaces.DarkFox.parser import *
 from MarketPlaces.Tor2door.parser import *

@@ -118,7 +120,7 @@ def read_file(filePath, createLog, logFile):
         print("There was a problem to read the file " + filePath)
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to read the file " + filePath + "\n")
+                str(nError) + ". There was a problem to read the file " + filePath + "\n" + traceback.format_exc() + "\n")

     return None

@@ -179,7 +181,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n")
+                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n"
+                + traceback.format_exc() + "\n")

     return None

@@ -240,7 +243,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n")
+                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n"
+                + traceback.format_exc() + "\n")

     return None

@@ -258,27 +262,32 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
+                str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n"
+                + traceback.format_exc() + "\n")
         return False


 def move_file(filePath, createLog, logFile):
     source = filePath
-    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
+    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/' + os.path.basename(filePath)

     try:
-        shutil.move(source, destination, shutil.copytree)
+        shutil.move(source, destination, shutil.copy2)
         return True
     except:
-        incrementError()
-        print("There was a problem to move the file " + filePath)
-        traceback.print_exc()
-        if createLog:
-            logFile.write(
-                str(nError) + ". There was a problem to move the file " + filePath + "\n")
-        return False
+        try:
+            shutil.move(source, destination, shutil.copytree)
+            return True
+        except:
+            incrementError()
+            print("There was a problem to move the file " + filePath)
+            traceback.print_exc()
+            if createLog:
+                logFile.write(
+                    str(nError) + ". There was a problem to move the file " + filePath + "\n" + traceback.format_exc() + "\n")
+            return False


 def new_parse(marketPlace, url, createLog):
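
Note: this hunk looks like the "fixed move bug" from the commit message, as far as the diff shows. shutil.move() into the bare Read/ directory raises shutil.Error ("Destination path ... already exists") whenever a file of the same name is already there from an earlier run; appending the basename makes the destination explicit, so the move falls through to its copy path and overwrites instead. A self-contained sketch reproducing the fixed behavior in a temporary directory:

import os
import shutil
import tempfile

root = tempfile.mkdtemp()
os.mkdir(os.path.join(root, 'Read'))

for run in range(2):  # simulate two pipeline runs over the same page
    src = os.path.join(root, 'page.html')
    with open(src, 'w') as f:
        f.write('html from run %d' % run)
    # Explicit destination path: the second run overwrites instead of raising.
    dst = os.path.join(root, 'Read', os.path.basename(src))
    shutil.move(src, dst, shutil.copy2)

print(os.listdir(os.path.join(root, 'Read')))  # ['page.html']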


MarketPlaces/Tor2door/crawler_selenium.py (+10 -10)

@@ -24,7 +24,7 @@ from MarketPlaces.Tor2door.parser import tor2door_links_parser
 from MarketPlaces.Utilities.utilities import cleanHTML

 counter = 1
-baseURL = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion'
+baseURL = 'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion'


 # Opens Tor Browser, crawls the website

@@ -98,7 +98,7 @@ def getMKTName():

 # Return the link of the website
 def getFixedURL():
-    url = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/login'
+    url = 'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/login'

     return url

@@ -129,8 +129,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    ff_prof.set_preference("network.dns.disablePrefetch", True)
-    ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    # ff_prof.set_preference("network.dns.disablePrefetch", True)
+    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)

@@ -199,15 +199,15 @@ def getInterestedLinks():
     links = []

     # # Digital - Guides - Hacking
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=55')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=55')
     # # Digital - Guides - Others
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=57')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=57')
     # # Digital - Software
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=60')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=60')
     # Software - Malware
-    links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=69')
+    links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=69')
     # # Software - Others
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=78')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=78')

     return links

@@ -244,7 +244,7 @@ def crawlForum(driver):
                     driver.back()

                     # comment out
-                    break
+                    # break

                 # comment out
                 if count == 1:


MarketPlaces/Tor2door/parser.py (+22 -3)

@@ -31,6 +31,8 @@ def tor2door_description_parser(soup):
     left = "-1"  # 16 Product_QuantityLeft
     shipFrom = "-1"  # 17 Product_ShippedFrom
     shipTo = "-1"  # 18 Product_ShippedTo
+    image = "-1"  # 19 Product_Image
+    vendor_image = "-1"  # 20 Vendor_Image

     bae = soup.find('div', {'class': "col-9"})

@@ -106,9 +108,12 @@ def tor2door_description_parser(soup):
         MS = MS.replace(',', ' ')
         MS = MS.replace('\n', '')

+    image = bae.find('div', {"class": "product-primary"}).find('img')
+    image = image.get('src').split('base64,')[-1]
+
     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
-           BTC, USD, EURO, sold, left, shipFrom, shipTo)
+           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)

     # Sending the results
     return row

@@ -139,7 +144,9 @@ def tor2door_listing_parser(soup):
     qLeft =[]  # 17 Product_QuantityLeft
     shipFrom = []  # 18 Product_ShippedFrom
     shipTo = []  # 19 Product_ShippedTo
-    href = []  # 20 Product_Links
+    image = []  # 20 Product_Image
+    image_vendor = []  # 21 Vendor_Image
+    href = []  # 22 Product_Links

     listing = soup.findAll('div', {"class": "card product-card mb-3"})

@@ -181,6 +188,15 @@ def tor2door_listing_parser(soup):
             usd = usd.strip()
             USD.append(usd)

+            # Finding Rating
+            stars = card.find("ul", {"class": "star-list"})
+            full = stars.findAll('i', {"class": "fas fa-star star star-active"})
+            half = stars.find('i', {"class": "fas fa-star-half star star-active"})
+            rating = len(full)
+            if half is not None:
+                rating += 0.5
+            rating_item.append(str(rating))
+
             # Finding Reviews
             num = card.find("span", {"class": "rate-count"}).text
             num = num.replace("(", "")

@@ -216,9 +232,12 @@ def tor2door_listing_parser(soup):
             MSValue=me
         MS.append(MSValue)

+        image = bae[0].find('img')
+        image = image.get('src').split('base64,')[-1]
+
     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
-                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)


 def tor2door_links_parser(soup):
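
Note: two parsing additions above are worth a closer look: the rating is recovered by counting full star icons plus an optional half star, and the product image is reduced to the payload of its base64 data URI. A self-contained sketch against made-up HTML in the shape the parser appears to expect:

from bs4 import BeautifulSoup

html = '''<ul class="star-list">
  <i class="fas fa-star star star-active"></i>
  <i class="fas fa-star star star-active"></i>
  <i class="fas fa-star-half star star-active"></i>
</ul>
<img src="data:image/png;base64,iVBORw0KGgo=">'''
soup = BeautifulSoup(html, 'html.parser')

# Two full stars plus one half star -> 2.5
stars = soup.find('ul', {'class': 'star-list'})
full = stars.findAll('i', {'class': 'fas fa-star star star-active'})
half = stars.find('i', {'class': 'fas fa-star-half star star-active'})
rating = len(full) + (0.5 if half is not None else 0)
print(rating)  # 2.5

# Keep only the base64 payload of the data URI
image = soup.find('img').get('src').split('base64,')[-1]
print(image)   # iVBORw0KGgo=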


setup.ini (+1 -1)

@@ -6,7 +6,7 @@ geckodriver_path = C:\calsyslab\Project\dw_pipeline_test\selenium\geckodriver.ex

 [Project]
 project_directory = C:\calsyslab\Project\dw_pipeline_test
-shared_folder = \\VBoxSvr\\Shared
+shared_folder = \\VBoxSvr\Shared

 [PostgreSQL]
 ip = localhost
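
Note: configparser takes backslashes literally, so the old value really was \\VBoxSvr\\Shared, with a doubled separator before the share name that Windows path handling generally rejects; only the leading pair of a UNC path should be doubled. A quick check (the \\\\ and \\ below are Python string escapes, not ini syntax):

import configparser

cfg = configparser.ConfigParser()
cfg.read_string('[Project]\nshared_folder = \\\\VBoxSvr\\Shared\n')
print(cfg['Project']['shared_folder'])   # \\VBoxSvr\Shared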

