From 0844b120bc2a52159e66392d61605436aa1ea69f Mon Sep 17 00:00:00 2001
From: westernmeadow
Date: Fri, 8 Sep 2023 15:50:09 -0700
Subject: [PATCH] fully ran Apocalypse and DarkBazar, fixed move bug

---
 .idea/DW_Pipeline_Test.iml                    |  1 +
 Forums/Initialization/prepare_parser.py       | 47 ++++++++++---------
 MarketPlaces/Apocalypse/crawler_selenium.py   | 28 +++++------
 MarketPlaces/Apocalypse/parser.py             |  5 +-
 MarketPlaces/DarkBazar/crawler_selenium.py    | 16 +++----
 MarketPlaces/Initialization/prepare_parser.py | 35 +++++++++-----
 MarketPlaces/Tor2door/crawler_selenium.py     | 20 ++++----
 MarketPlaces/Tor2door/parser.py               | 25 ++++++++--
 setup.ini                                     |  2 +-
 9 files changed, 107 insertions(+), 72 deletions(-)

diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml
index 9ee2f4c..cd99e29 100644
--- a/.idea/DW_Pipeline_Test.iml
+++ b/.idea/DW_Pipeline_Test.iml
@@ -29,6 +29,7 @@
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index e3cc468..82e08da 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -105,7 +105,7 @@ def read_file(filePath, createLog, logFile):
         print("There was a problem to read the file " + filePath)
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to read the file " + filePath + "\n")
+                str(nError) + ". There was a problem to read the file " + filePath + "\n" + traceback.format_exc() + "\n")
 
     return None
 
@@ -141,7 +141,8 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n")
+                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n"
+                + traceback.format_exc() + "\n")
 
     return None
 
@@ -177,7 +178,8 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n")
+                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n"
+                + traceback.format_exc() + "\n")
 
     return None
 
@@ -191,17 +193,14 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript
 
         con.rollback()
 
-        trace = traceback.format_exc()
-
-        if trace.find("already exists") == -1:
-            incrementError()
-            print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
-            traceback.print_exc()
-            if createLog:
-                logFile.write(str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
-            return False
-        else:
-            return True
+        incrementError()
+        print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
+        traceback.print_exc()
+        if createLog:
+            logFile.write(
+                str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n"
+                + traceback.format_exc() + "\n")
+        return False
 
 
 def move_file(filePath, createLog, logFile):
@@ -210,17 +209,21 @@
     source = filePath
     destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
 
     try:
-        shutil.move(source, destination, shutil.copytree)
+        shutil.move(source, destination, shutil.copy2)
         return True
     except:
-        incrementError()
-        print("There was a problem to move the file " + filePath)
-        traceback.print_exc()
-        if createLog:
-            logFile.write(
-                str(nError) + ". There was a problem to move the file " + filePath + "\n")
-        return False
+        try:
+            shutil.move(source, destination, shutil.copytree)
+            return True
+        except:
+            incrementError()
+            print("There was a problem to move the file " + filePath)
+            traceback.print_exc()
+            if createLog:
+                logFile.write(
+                    str(nError) + ". There was a problem to move the file " + filePath + "\n" + traceback.format_exc() + "\n")
+            return False
 
 
 #main method for this program, what actually gets the parsed info from the parser, and persists them into the db
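
Note on the error-logging change above: every except path in the forum pipeline now appends traceback.format_exc() to the log entry, so the log carries the full stack trace rather than only the numbered one-line message. persist_record also loses the old special case that treated an "already exists" rollback as success; every failed insert now increments the error counter and returns False. A minimal sketch of the logging pattern, with log_error as a hypothetical helper (the patch inlines the same expression at each call site):

    import traceback

    def log_error(logFile, nError, message):
        # One numbered line for grepping, then the full stack trace of
        # the exception currently being handled.
        logFile.write(str(nError) + ". " + message + "\n" + traceback.format_exc() + "\n")
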
diff --git a/MarketPlaces/Apocalypse/crawler_selenium.py b/MarketPlaces/Apocalypse/crawler_selenium.py
index 7a684df..b91bf0e 100644
--- a/MarketPlaces/Apocalypse/crawler_selenium.py
+++ b/MarketPlaces/Apocalypse/crawler_selenium.py
@@ -189,12 +189,12 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # # Digital Goods
-    # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/74')
-    # # Fraud
-    # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/75')
-    # # Services
-    # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/76')
+    # Digital Goods
+    links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/74')
+    # Fraud
+    links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/75')
+    # Services
+    links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/76')
     # software and malware
     links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/30')
 
@@ -239,16 +239,16 @@ def crawlForum(driver):
                     except:
                         driver.refresh()
 
-                # comment out
-                # break
-
-                # comment out
-                if count == 1:
-                    break
+                # # comment out
+                # break
+                #
+                # # comment out
+                # if count == 1:
+                #     break
 
                 try:
-                    link = driver.find_element(by=By.XPATH, value=
-                        '/html/body/div[1]/div/div[2]/nav/ul/li[5]/a').get_attribute('href')
+                    nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/div[2]/nav')
+                    link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='»').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
                     count += 1
diff --git a/MarketPlaces/Apocalypse/parser.py b/MarketPlaces/Apocalypse/parser.py
index 8cd3a5b..6610cc6 100644
--- a/MarketPlaces/Apocalypse/parser.py
+++ b/MarketPlaces/Apocalypse/parser.py
@@ -113,7 +113,10 @@ def apocalypse_listing_parser(soup: Tag):
     image_vendor = []                       # 21 Vendor_Image
     href = []                               # 22 Product_Links
 
-    listings: ResultSet[Tag] = soup.find("div", {"class": "col-lg-9 my-4"}).find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})
+    table = soup.find("div", {"class": "col-lg-9 my-4"})
+    if table is None:
+        table = soup.find("div", {"class": "col-lg-9"})
+    listings: ResultSet[Tag] = table.find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})
 
     for prod in listings:
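
The pagination fix above swaps the absolute XPath /html/body/div[1]/div/div[2]/nav/ul/li[5]/a for a text lookup: the next-page anchor is found by its visible "»" label inside the nav bar, so the crawler no longer breaks when the number of page links shifts the anchor out of the fifth list item. A sketch of the pattern, assuming driver is an open Selenium session on a listing page:

    from selenium.webdriver.common.by import By

    # Scope the search to the pagination bar, then pick the next-page
    # anchor by its "»" text instead of by its position in the list.
    nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/div[2]/nav')
    next_url = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='»').get_attribute('href')

The parser change just below is the same kind of hardening: apocalypse_listing_parser retries the container lookup without the "my-4" class before dereferencing it, avoiding an AttributeError on pages styled without the spacing class.
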
diff --git a/MarketPlaces/DarkBazar/crawler_selenium.py b/MarketPlaces/DarkBazar/crawler_selenium.py
index 4a8f4e5..fdfb640 100644
--- a/MarketPlaces/DarkBazar/crawler_selenium.py
+++ b/MarketPlaces/DarkBazar/crawler_selenium.py
@@ -175,8 +175,8 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # # Digital Goods
-    # links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=4')
+    # Digital Goods
+    links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=3')
     # Services
     links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=5')
 
@@ -216,12 +216,12 @@ def crawlForum(driver):
                         savePage(driver, driver.page_source, item)
                         driver.back()
 
-                # comment out
-                # break
-
-                # comment out
-                if count == 1:
-                    break
+                # # comment out
+                # break
+                #
+                # # comment out
+                # if count == 1:
+                #     break
 
                 try:
                     link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index 1cc5af5..985ef69 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -4,6 +4,8 @@ import glob
 import os
 import codecs
 import shutil
+import traceback
+
 from MarketPlaces.DB_Connection.db_connection import *
 from MarketPlaces.DarkFox.parser import *
 from MarketPlaces.Tor2door.parser import *
@@ -118,7 +120,7 @@ def read_file(filePath, createLog, logFile):
         print("There was a problem to read the file " + filePath)
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to read the file " + filePath + "\n")
+                str(nError) + ". There was a problem to read the file " + filePath + "\n" + traceback.format_exc() + "\n")
 
     return None
 
@@ -179,7 +181,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n")
+                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n"
+                + traceback.format_exc() + "\n")
 
     return None
 
@@ -240,7 +243,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n")
+                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n"
+                + traceback.format_exc() + "\n")
 
     return None
 
@@ -258,27 +262,32 @@
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
+                str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n"
+                + traceback.format_exc() + "\n")
 
     return False
 
 
 def move_file(filePath, createLog, logFile):
     source = filePath
-    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
+    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/' + os.path.basename(filePath)
 
     try:
-        shutil.move(source, destination, shutil.copytree)
+        shutil.move(source, destination, shutil.copy2)
         return True
     except:
-        incrementError()
-        print("There was a problem to move the file " + filePath)
-        traceback.print_exc()
-        if createLog:
-            logFile.write(
-                str(nError) + ". There was a problem to move the file " + filePath + "\n")
-        return False
+        try:
+            shutil.move(source, destination, shutil.copytree)
+            return True
+        except:
+            incrementError()
+            print("There was a problem to move the file " + filePath)
+            traceback.print_exc()
+            if createLog:
+                logFile.write(
+                    str(nError) + ". There was a problem to move the file " + filePath + "\n" + traceback.format_exc() + "\n")
+            return False
 
 
 def new_parse(marketPlace, url, createLog):
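
This hunk carries the move bug named in the subject line (the Forums copy above gets the same treatment): shutil.move() only invokes its copy_function when a plain rename is not possible, for example when source and destination sit on different filesystems, and the old code passed shutil.copytree, which fails on regular files. The new code tries shutil.copy2 first, the correct copy function for single files, and keeps copytree as a fallback. This version also appends the file name to the destination, so the target is an explicit file path inside Read/ rather than the directory itself, presumably letting a re-run overwrite an already-moved file instead of raising "Destination path already exists". A condensed sketch under those assumptions, with safe_move as a hypothetical name:

    import os
    import shutil

    def safe_move(filePath):
        # Explicit destination file path inside the sibling Read/ folder.
        destination = filePath.replace(os.path.basename(filePath), "") + r'Read/' + os.path.basename(filePath)
        try:
            shutil.move(filePath, destination, shutil.copy2)     # regular files
        except OSError:
            shutil.move(filePath, destination, shutil.copytree)  # directories
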
diff --git a/MarketPlaces/Tor2door/crawler_selenium.py b/MarketPlaces/Tor2door/crawler_selenium.py
index a2df655..36a3e63 100644
--- a/MarketPlaces/Tor2door/crawler_selenium.py
+++ b/MarketPlaces/Tor2door/crawler_selenium.py
@@ -24,7 +24,7 @@
 from MarketPlaces.Tor2door.parser import tor2door_links_parser
 from MarketPlaces.Utilities.utilities import cleanHTML
 
 counter = 1
-baseURL = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion'
+baseURL = 'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion'
 
 
 # Opens Tor Browser, crawls the website
@@ -98,7 +98,7 @@ def getMKTName():
 
 # Return the link of the website
 def getFixedURL():
-    url = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/login'
+    url = 'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/login'
 
     return url
 
@@ -129,8 +129,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    ff_prof.set_preference("network.dns.disablePrefetch", True)
-    ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    # ff_prof.set_preference("network.dns.disablePrefetch", True)
+    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -199,15 +199,15 @@ def getInterestedLinks():
     links = []
 
     # # Digital - Guides - Hacking
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=55')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=55')
     # # Digital - Guides - Others
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=57')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=57')
     # # Digital - Software
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=60')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=60')
     # Software - Malware
-    links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=69')
+    links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=69')
     # # Software - Others
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=78')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=78')
 
     return links
 
@@ -244,7 +244,7 @@
                     driver.back()
 
                 # comment out
-                break
+                # break
 
                 # comment out
                 if count == 1:
diff --git a/MarketPlaces/Tor2door/parser.py b/MarketPlaces/Tor2door/parser.py
index f4a4c07..49e0a93 100644
--- a/MarketPlaces/Tor2door/parser.py
+++ b/MarketPlaces/Tor2door/parser.py
@@ -31,6 +31,8 @@ def tor2door_description_parser(soup):
     left = "-1"                             # 16 Product_QuantityLeft
     shipFrom = "-1"                         # 17 Product_ShippedFrom
     shipTo = "-1"                           # 18 Product_ShippedTo
+    image = "-1"                            # 19 Product_Image
+    vendor_image = "-1"                     # 20 Vendor_Image
 
     bae = soup.find('div', {'class': "col-9"})
 
@@ -106,9 +108,12 @@ def tor2door_description_parser(soup):
         MS = MS.replace(',', ' ')
         MS = MS.replace('\n', '')
 
+    image = bae.find('div', {"class": "product-primary"}).find('img')
+    image = image.get('src').split('base64,')[-1]
+
     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
-           BTC, USD, EURO, sold, left, shipFrom, shipTo)
+           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
 
     # Sending the results
     return row
 
@@ -139,7 +144,9 @@ def tor2door_listing_parser(soup):
     qLeft =[]                       # 17 Product_QuantityLeft
     shipFrom = []                   # 18 Product_ShippedFrom
     shipTo = []                     # 19 Product_ShippedTo
-    href = []                       # 20 Product_Links
+    image = []                      # 20 Product_Image
+    image_vendor = []               # 21 Vendor_Image
+    href = []                       # 22 Product_Links
 
     listing = soup.findAll('div', {"class": "card product-card mb-3"})
 
@@ -181,6 +188,15 @@ def tor2door_listing_parser(soup):
             usd = usd.strip()
             USD.append(usd)
 
+            # Finding Rating
+            stars = card.find("ul", {"class": "star-list"})
+            full = stars.findAll('i', {"class": "fas fa-star star star-active"})
+            half = stars.find('i', {"class": "fas fa-star-half star star-active"})
+            rating = len(full)
+            if half is not None:
+                rating += 0.5
+            rating_item.append(str(rating))
+
             # Finding Reviews
             num = card.find("span", {"class": "rate-count"}).text
             num = num.replace("(", "")
@@ -216,9 +232,12 @@ def tor2door_listing_parser(soup):
             MSValue=me
         MS.append(MSValue)
 
+        image = bae[0].find('img')
+        image = image.get('src').split('base64,')[-1]
+
     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
-                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
 
 
 def tor2door_links_parser(soup):
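
The new rating block in tor2door_listing_parser reads the score straight off each card's star widget: count the active full-star icons and add 0.5 when a half-star icon is present. A condensed sketch of that logic, with parse_star_rating as a hypothetical helper, assuming card is one product-card element from the BeautifulSoup listing:

    from bs4.element import Tag

    def parse_star_rating(card: Tag) -> str:
        # Count active full stars, add 0.5 if a half star is rendered.
        stars = card.find("ul", {"class": "star-list"})
        full = stars.findAll('i', {"class": "fas fa-star star star-active"})
        half = stars.find('i', {"class": "fas fa-star-half star star-active"})
        rating = len(full) + (0.5 if half is not None else 0)
        return str(rating)
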
diff --git a/setup.ini b/setup.ini
index f4c18df..29997a6 100644
--- a/setup.ini
+++ b/setup.ini
@@ -6,7 +6,7 @@ geckodriver_path = C:\calsyslab\Project\dw_pipeline_test\selenium\geckodriver.ex
 
 [Project]
 project_directory = C:\calsyslab\Project\dw_pipeline_test
-shared_folder = \\VBoxSvr\\Shared
+shared_folder = \\VBoxSvr\Shared
 
 [PostgreSQL]
 ip = localhost
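
The setup.ini fix removes a doubled backslash that looks like a leftover from Python string escaping: .ini values are read literally, so \\VBoxSvr\\Shared would yield a share name with a stray backslash, while \\VBoxSvr\Shared is the UNC path Windows expects. A quick way to confirm the value comes through unmangled, assuming the pipeline loads this file with configparser:

    import configparser

    config = configparser.ConfigParser()
    config.read('setup.ini')
    # configparser performs no backslash unescaping, so this prints the
    # value exactly as written in the file: \\VBoxSvr\Shared
    print(config['Project']['shared_folder'])
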