From 0844b120bc2a52159e66392d61605436aa1ea69f Mon Sep 17 00:00:00 2001
From: westernmeadow
Date: Fri, 8 Sep 2023 15:50:09 -0700
Subject: [PATCH] fully ran Apocalypse and DarkBazar, fixed move bug

---
 .idea/DW_Pipeline_Test.iml                    |  1 +
 Forums/Initialization/prepare_parser.py       | 47 ++++++++++---------
 MarketPlaces/Apocalypse/crawler_selenium.py   | 28 +++++------
 MarketPlaces/Apocalypse/parser.py             |  5 +-
 MarketPlaces/DarkBazar/crawler_selenium.py    | 16 +++----
 MarketPlaces/Initialization/prepare_parser.py | 35 +++++++++-----
 MarketPlaces/Tor2door/crawler_selenium.py     | 20 ++++----
 MarketPlaces/Tor2door/parser.py               | 25 ++++++++--
 setup.ini                                     |  2 +-
 9 files changed, 107 insertions(+), 72 deletions(-)

diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml
index 9ee2f4c..cd99e29 100644
--- a/.idea/DW_Pipeline_Test.iml
+++ b/.idea/DW_Pipeline_Test.iml
@@ -29,6 +29,7 @@
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index e3cc468..82e08da 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -105,7 +105,7 @@ def read_file(filePath, createLog, logFile):
         print("There was a problem to read the file " + filePath)
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to read the file " + filePath + "\n")
+                str(nError) + ". There was a problem to read the file " + filePath + "\n" + traceback.format_exc() + "\n")
 
     return None
 
@@ -141,7 +141,8 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n")
+                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n"
+                + traceback.format_exc() + "\n")
 
     return None
 
@@ -177,7 +178,8 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n")
+                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n"
+                + traceback.format_exc() + "\n")
 
     return None
 
@@ -191,17 +193,14 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript
 
         con.rollback()
 
-        trace = traceback.format_exc()
-
-        if trace.find("already exists") == -1:
-            incrementError()
-            print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
-            traceback.print_exc()
-            if createLog:
-                logFile.write(str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
-            return False
-        else:
-            return True
+        incrementError()
+        print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
+        traceback.print_exc()
+        if createLog:
+            logFile.write(
+                str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n"
+                + traceback.format_exc() + "\n")
+        return False
 
 
 def move_file(filePath, createLog, logFile):
@@ -210,17 +209,21 @@
     source = filePath
     destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
 
     try:
-        shutil.move(source, destination, shutil.copytree)
+        shutil.move(source, destination, shutil.copy2)
         return True
     except:
-        incrementError()
-        print("There was a problem to move the file " + filePath)
-        traceback.print_exc()
-        if createLog:
-            logFile.write(
-                str(nError) + ". There was a problem to move the file " + filePath + "\n")
-        return False
+        try:
+            shutil.move(source, destination, shutil.copytree)
+            return True
+        except:
+            incrementError()
+            print("There was a problem to move the file " + filePath)
+            traceback.print_exc()
+            if createLog:
+                logFile.write(
+                    str(nError) + ". There was a problem to move the file " + filePath + "\n" + traceback.format_exc() + "\n")
+            return False
 
 
 #main method for this program, what actually gets the parsed info from the parser, and persists them into the db
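
Note on the error-logging change above: every except path in the forum pipeline now appends traceback.format_exc() to the log entry, so the log carries the full stack trace rather than only the numbered one-line message. persist_record also loses the old special case that treated an "already exists" rollback as success; every failed insert now increments the error counter and returns False. A minimal sketch of the logging pattern, with log_error as a hypothetical helper (the patch inlines the same expression at each call site):

    import traceback

    def log_error(logFile, nError, message):
        # One numbered line for grepping, then the full stack trace of
        # the exception currently being handled.
        logFile.write(str(nError) + ". " + message + "\n" + traceback.format_exc() + "\n")
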
diff --git a/MarketPlaces/Apocalypse/crawler_selenium.py b/MarketPlaces/Apocalypse/crawler_selenium.py
index 7a684df..b91bf0e 100644
--- a/MarketPlaces/Apocalypse/crawler_selenium.py
+++ b/MarketPlaces/Apocalypse/crawler_selenium.py
@@ -189,12 +189,12 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # # Digital Goods
-    # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/74')
-    # # Fraud
-    # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/75')
-    # # Services
-    # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/76')
+    # Digital Goods
+    links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/74')
+    # Fraud
+    links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/75')
+    # Services
+    links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/76')
     # software and malware
     links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/30')
 
@@ -239,16 +239,16 @@ def crawlForum(driver):
                     except:
                         driver.refresh()
 
-                # comment out
-                # break
-
-                # comment out
-                if count == 1:
-                    break
+                # # comment out
+                # break
+                #
+                # # comment out
+                # if count == 1:
+                #     break
 
                 try:
-                    link = driver.find_element(by=By.XPATH, value=
-                        '/html/body/div[1]/div/div[2]/nav/ul/li[5]/a').get_attribute('href')
+                    nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/div[2]/nav')
+                    link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='»').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
                     count += 1
diff --git a/MarketPlaces/Apocalypse/parser.py b/MarketPlaces/Apocalypse/parser.py
index 8cd3a5b..6610cc6 100644
--- a/MarketPlaces/Apocalypse/parser.py
+++ b/MarketPlaces/Apocalypse/parser.py
@@ -113,7 +113,10 @@ def apocalypse_listing_parser(soup: Tag):
     image_vendor = []                       # 21 Vendor_Image
     href = []                               # 22 Product_Links
 
-    listings: ResultSet[Tag] = soup.find("div", {"class": "col-lg-9 my-4"}).find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})
+    table = soup.find("div", {"class": "col-lg-9 my-4"})
+    if table is None:
+        table = soup.find("div", {"class": "col-lg-9"})
+    listings: ResultSet[Tag] = table.find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})
 
     for prod in listings:
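
The pagination fix above swaps the absolute XPath /html/body/div[1]/div/div[2]/nav/ul/li[5]/a for a text lookup: the next-page anchor is found by its visible "»" label inside the nav bar, so the crawler no longer breaks when the number of page links shifts the anchor out of the fifth list item. A sketch of the pattern, assuming driver is an open Selenium session on a listing page:

    from selenium.webdriver.common.by import By

    # Scope the search to the pagination bar, then pick the next-page
    # anchor by its "»" text instead of by its position in the list.
    nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/div[2]/nav')
    next_url = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='»').get_attribute('href')

The parser change just below is the same kind of hardening: apocalypse_listing_parser retries the container lookup without the "my-4" class before dereferencing it, avoiding an AttributeError on pages styled without the spacing class.
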
diff --git a/MarketPlaces/DarkBazar/crawler_selenium.py b/MarketPlaces/DarkBazar/crawler_selenium.py
index 4a8f4e5..fdfb640 100644
--- a/MarketPlaces/DarkBazar/crawler_selenium.py
+++ b/MarketPlaces/DarkBazar/crawler_selenium.py
@@ -175,8 +175,8 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # # Digital Goods
-    # links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=4')
+    # Digital Goods
+    links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=3')
     # Services
     links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=5')
 
@@ -216,12 +216,12 @@ def crawlForum(driver):
                         savePage(driver, driver.page_source, item)
                         driver.back()
 
-                # comment out
-                # break
-
-                # comment out
-                if count == 1:
-                    break
+                # # comment out
+                # break
+                #
+                # # comment out
+                # if count == 1:
+                #     break
 
                 try:
                     link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index 1cc5af5..985ef69 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -4,6 +4,8 @@ import glob
 import os
 import codecs
 import shutil
+import traceback
+
 from MarketPlaces.DB_Connection.db_connection import *
 from MarketPlaces.DarkFox.parser import *
 from MarketPlaces.Tor2door.parser import *
@@ -118,7 +120,7 @@ def read_file(filePath, createLog, logFile):
         print("There was a problem to read the file " + filePath)
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to read the file " + filePath + "\n")
+                str(nError) + ". There was a problem to read the file " + filePath + "\n" + traceback.format_exc() + "\n")
 
     return None
 
@@ -179,7 +181,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n")
+                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n"
+                + traceback.format_exc() + "\n")
 
     return None
 
@@ -240,7 +243,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n")
+                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n"
+                + traceback.format_exc() + "\n")
 
     return None
 
@@ -258,27 +262,32 @@
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
+                str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n"
+                + traceback.format_exc() + "\n")
 
     return False
 
 
 def move_file(filePath, createLog, logFile):
     source = filePath
-    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
+    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/' + os.path.basename(filePath)
 
     try:
-        shutil.move(source, destination, shutil.copytree)
+        shutil.move(source, destination, shutil.copy2)
         return True
     except:
-        incrementError()
-        print("There was a problem to move the file " + filePath)
-        traceback.print_exc()
-        if createLog:
-            logFile.write(
-                str(nError) + ". There was a problem to move the file " + filePath + "\n")
-        return False
+        try:
+            shutil.move(source, destination, shutil.copytree)
+            return True
+        except:
+            incrementError()
+            print("There was a problem to move the file " + filePath)
+            traceback.print_exc()
+            if createLog:
+                logFile.write(
+                    str(nError) + ". There was a problem to move the file " + filePath + "\n" + traceback.format_exc() + "\n")
+            return False
 
 
 def new_parse(marketPlace, url, createLog):
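
This hunk carries the move bug named in the subject line (the Forums copy above gets the same treatment): shutil.move() only invokes its copy_function when a plain rename is not possible, for example when source and destination sit on different filesystems, and the old code passed shutil.copytree, which fails on regular files. The new code tries shutil.copy2 first, the correct copy function for single files, and keeps copytree as a fallback. This version also appends the file name to the destination, so the target is an explicit file path inside Read/ rather than the directory itself, presumably letting a re-run overwrite an already-moved file instead of raising "Destination path already exists". A condensed sketch under those assumptions, with safe_move as a hypothetical name:

    import os
    import shutil

    def safe_move(filePath):
        # Explicit destination file path inside the sibling Read/ folder.
        destination = filePath.replace(os.path.basename(filePath), "") + r'Read/' + os.path.basename(filePath)
        try:
            shutil.move(filePath, destination, shutil.copy2)     # regular files
        except OSError:
            shutil.move(filePath, destination, shutil.copytree)  # directories
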
diff --git a/MarketPlaces/Tor2door/crawler_selenium.py b/MarketPlaces/Tor2door/crawler_selenium.py
index a2df655..36a3e63 100644
--- a/MarketPlaces/Tor2door/crawler_selenium.py
+++ b/MarketPlaces/Tor2door/crawler_selenium.py
@@ -24,7 +24,7 @@
 from MarketPlaces.Tor2door.parser import tor2door_links_parser
 from MarketPlaces.Utilities.utilities import cleanHTML
 
 counter = 1
-baseURL = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion'
+baseURL = 'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion'
 
 
 # Opens Tor Browser, crawls the website
@@ -98,7 +98,7 @@ def getMKTName():
 
 # Return the link of the website
 def getFixedURL():
-    url = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/login'
+    url = 'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/login'
 
     return url
 
@@ -129,8 +129,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    ff_prof.set_preference("network.dns.disablePrefetch", True)
-    ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    # ff_prof.set_preference("network.dns.disablePrefetch", True)
+    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -199,15 +199,15 @@ def getInterestedLinks():
     links = []
 
     # # Digital - Guides - Hacking
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=55')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=55')
     # # Digital - Guides - Others
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=57')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=57')
     # # Digital - Software
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=60')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=60')
     # Software - Malware
-    links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=69')
+    links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=69')
     # # Software - Others
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=78')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=78')
 
     return links
 
@@ -244,7 +244,7 @@
                     driver.back()
 
                 # comment out
-                break
+                # break
 
                 # comment out
                 if count == 1:
diff --git a/MarketPlaces/Tor2door/parser.py b/MarketPlaces/Tor2door/parser.py
index f4a4c07..49e0a93 100644
--- a/MarketPlaces/Tor2door/parser.py
+++ b/MarketPlaces/Tor2door/parser.py
@@ -31,6 +31,8 @@ def tor2door_description_parser(soup):
     left = "-1"                             # 16 Product_QuantityLeft
     shipFrom = "-1"                         # 17 Product_ShippedFrom
     shipTo = "-1"                           # 18 Product_ShippedTo
+    image = "-1"                            # 19 Product_Image
+    vendor_image = "-1"                     # 20 Vendor_Image
 
     bae = soup.find('div', {'class': "col-9"})
 
@@ -106,9 +108,12 @@ def tor2door_description_parser(soup):
         MS = MS.replace(',', ' ')
         MS = MS.replace('\n', '')
 
+    image = bae.find('div', {"class": "product-primary"}).find('img')
+    image = image.get('src').split('base64,')[-1]
+
     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
-           BTC, USD, EURO, sold, left, shipFrom, shipTo)
+           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
 
     # Sending the results
     return row
 
@@ -139,7 +144,9 @@ def tor2door_listing_parser(soup):
     qLeft =[]                       # 17 Product_QuantityLeft
     shipFrom = []                   # 18 Product_ShippedFrom
     shipTo = []                     # 19 Product_ShippedTo
-    href = []                       # 20 Product_Links
+    image = []                      # 20 Product_Image
+    image_vendor = []               # 21 Vendor_Image
+    href = []                       # 22 Product_Links
 
     listing = soup.findAll('div', {"class": "card product-card mb-3"})
 
@@ -181,6 +188,15 @@ def tor2door_listing_parser(soup):
             usd = usd.strip()
             USD.append(usd)
 
+            # Finding Rating
+            stars = card.find("ul", {"class": "star-list"})
+            full = stars.findAll('i', {"class": "fas fa-star star star-active"})
+            half = stars.find('i', {"class": "fas fa-star-half star star-active"})
+            rating = len(full)
+            if half is not None:
+                rating += 0.5
+            rating_item.append(str(rating))
+
             # Finding Reviews
             num = card.find("span", {"class": "rate-count"}).text
             num = num.replace("(", "")
@@ -216,9 +232,12 @@ def tor2door_listing_parser(soup):
             MSValue=me
         MS.append(MSValue)
 
+        image = bae[0].find('img')
+        image = image.get('src').split('base64,')[-1]
+
     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
-                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
 
 
 def tor2door_links_parser(soup):
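
The new rating block in tor2door_listing_parser reads the score straight off each card's star widget: count the active full-star icons and add 0.5 when a half-star icon is present. A condensed sketch of that logic, with parse_star_rating as a hypothetical helper, assuming card is one product-card element from the BeautifulSoup listing:

    from bs4.element import Tag

    def parse_star_rating(card: Tag) -> str:
        # Count active full stars, add 0.5 if a half star is rendered.
        stars = card.find("ul", {"class": "star-list"})
        full = stars.findAll('i', {"class": "fas fa-star star star-active"})
        half = stars.find('i', {"class": "fas fa-star-half star star-active"})
        rating = len(full) + (0.5 if half is not None else 0)
        return str(rating)
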
diff --git a/setup.ini b/setup.ini
index f4c18df..29997a6 100644
--- a/setup.ini
+++ b/setup.ini
@@ -6,7 +6,7 @@ geckodriver_path = C:\calsyslab\Project\dw_pipeline_test\selenium\geckodriver.ex
 
 [Project]
 project_directory = C:\calsyslab\Project\dw_pipeline_test
-shared_folder = \\VBoxSvr\\Shared
+shared_folder = \\VBoxSvr\Shared
 
 [PostgreSQL]
 ip = localhost
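
The setup.ini fix removes a doubled backslash that looks like a leftover from Python string escaping: .ini values are read literally, so \\VBoxSvr\\Shared would yield a share name with a stray backslash, while \\VBoxSvr\Shared is the UNC path Windows expects. A quick way to confirm the value comes through unmangled, assuming the pipeline loads this file with configparser:

    import configparser

    config = configparser.ConfigParser()
    config.read('setup.ini')
    # configparser performs no backslash unescaping, so this prints the
    # value exactly as written in the file: \\VBoxSvr\Shared
    print(config['Project']['shared_folder'])
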