diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml
index 9ee2f4c..cd99e29 100644
--- a/.idea/DW_Pipeline_Test.iml
+++ b/.idea/DW_Pipeline_Test.iml
@@ -29,6 +29,7 @@
+
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index e3cc468..82e08da 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -105,7 +105,7 @@ def read_file(filePath, createLog, logFile):
print("There was a problem to read the file " + filePath)
if createLog:
logFile.write(
- str(nError) + ". There was a problem to read the file " + filePath + "\n")
+ str(nError) + ". There was a problem to read the file " + filePath + "\n" + traceback.format_exc() + "\n")
return None
@@ -141,7 +141,8 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
traceback.print_exc()
if createLog:
logFile.write(
- str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n")
+ str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n"
+ + traceback.format_exc() + "\n")
return None
@@ -177,7 +178,8 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
traceback.print_exc()
if createLog:
logFile.write(
- str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n")
+ str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n"
+ + traceback.format_exc() + "\n")
return None
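The logging changes in this file all follow one pattern: traceback.format_exc() returns the same text that traceback.print_exc() writes to stderr, so the log file now captures the full stack trace next to the numbered error message instead of the bare message alone. A minimal sketch of the pattern in isolation (log_failure is a hypothetical helper; the patch inlines the equivalent at each call site):

    import traceback

    def log_failure(logFile, nError, message):
        # numbered message plus the stack trace of the exception being handled
        logFile.write(str(nError) + ". " + message + "\n" + traceback.format_exc() + "\n")

    try:
        1 / 0
    except ZeroDivisionError:
        with open('error.log', 'a') as logFile:
            log_failure(logFile, 1, "There was a problem to parse the file listing.html")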
@@ -191,17 +193,14 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript
con.rollback()
- trace = traceback.format_exc()
-
- if trace.find("already exists") == -1:
- incrementError()
- print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
- traceback.print_exc()
- if createLog:
- logFile.write(str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
- return False
- else:
- return True
+ incrementError()
+ print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
+ traceback.print_exc()
+ if createLog:
+ logFile.write(
+ str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n"
+ + traceback.format_exc() + "\n")
+ return False
def move_file(filePath, createLog, logFile):
@@ -210,17 +209,21 @@ def move_file(filePath, createLog, logFile):
destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
try:
- shutil.move(source, destination, shutil.copytree)
+ shutil.move(source, destination, shutil.copy2)
return True
except:
- incrementError()
- print("There was a problem to move the file " + filePath)
- traceback.print_exc()
- if createLog:
- logFile.write(
- str(nError) + ". There was a problem to move the file " + filePath + "\n")
- return False
+ try:
+ shutil.move(source, destination, shutil.copytree)
+ return True
+ except:
+ incrementError()
+ print("There was a problem to move the file " + filePath)
+ traceback.print_exc()
+ if createLog:
+ logFile.write(
+ str(nError) + ". There was a problem to move the file " + filePath + "\n" + traceback.format_exc() + "\n")
+ return False
#main method for this program, what actually gets the parsed info from the parser, and persists them into the db
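The move_file rework reads more easily outside diff form. shutil.move's third argument is copy_function, which is only invoked when a plain os.rename fails (typically a cross-filesystem move); shutil.copy2 is the appropriate choice for a single file, while shutil.copytree only makes sense for directories, so the patch tries copy2 first and keeps copytree as a fallback. A sketch of the resulting control flow, with logging trimmed and the bare except narrowed for illustration:

    import shutil

    def move_with_fallback(source, destination):
        try:
            shutil.move(source, destination, shutil.copy2)         # single files
            return True
        except (shutil.Error, OSError):
            try:
                shutil.move(source, destination, shutil.copytree)  # directory trees
                return True
            except (shutil.Error, OSError):
                return False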
diff --git a/MarketPlaces/Apocalypse/crawler_selenium.py b/MarketPlaces/Apocalypse/crawler_selenium.py
index 7a684df..b91bf0e 100644
--- a/MarketPlaces/Apocalypse/crawler_selenium.py
+++ b/MarketPlaces/Apocalypse/crawler_selenium.py
@@ -189,12 +189,12 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
- # # Digital Goods
- # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/74')
- # # Fraud
- # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/75')
- # # Services
- # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/76')
+ # Digital Goods
+ links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/74')
+ # Fraud
+ links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/75')
+ # Services
+ links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/76')
# software and malware
links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/30')
@@ -239,16 +239,16 @@ def crawlForum(driver):
except:
driver.refresh()
- # comment out
- # break
-
- # comment out
- if count == 1:
- break
+ # # comment out
+ # break
+ #
+ # # comment out
+ # if count == 1:
+ # break
try:
- link = driver.find_element(by=By.XPATH, value=
- '/html/body/div[1]/div/div[2]/nav/ul/li[5]/a').get_attribute('href')
+ nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/div[2]/nav')
+ link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='»').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
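The pagination change swaps a brittle absolute XPath (the fifth li of the pager, whose position shifts as page links accumulate) for a lookup keyed on the pager's '»' next-page label, scoped to the nav element. The same lookup in isolation, assuming the Selenium 4 API used elsewhere in the crawler:

    from selenium.webdriver.common.by import By
    from selenium.common.exceptions import NoSuchElementException

    nav = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/nav')
    link = nav.find_element(By.PARTIAL_LINK_TEXT, '»').get_attribute('href')
    if link == "":
        raise NoSuchElementException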
diff --git a/MarketPlaces/Apocalypse/parser.py b/MarketPlaces/Apocalypse/parser.py
index 8cd3a5b..6610cc6 100644
--- a/MarketPlaces/Apocalypse/parser.py
+++ b/MarketPlaces/Apocalypse/parser.py
@@ -113,7 +113,10 @@ def apocalypse_listing_parser(soup: Tag):
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
- listings: ResultSet[Tag] = soup.find("div", {"class": "col-lg-9 my-4"}).find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})
+ table = soup.find("div", {"class": "col-lg-9 my-4"})
+ if table is None:
+ table = soup.find("div", {"class": "col-lg-9"})
+ listings: ResultSet[Tag] = table.find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})
for prod in listings:
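The new container lookup tolerates a layout variant: pages without the col-lg-9 my-4 wrapper fall back to the broader col-lg-9 class before the product cards are collected. Since BeautifulSoup's find returns None on a miss, the same guard can be written as a single expression (a sketch, not a behavioral change):

    table = soup.find("div", {"class": "col-lg-9 my-4"}) or soup.find("div", {"class": "col-lg-9"})
    listings = table.find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})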
diff --git a/MarketPlaces/DarkBazar/crawler_selenium.py b/MarketPlaces/DarkBazar/crawler_selenium.py
index 4a8f4e5..fdfb640 100644
--- a/MarketPlaces/DarkBazar/crawler_selenium.py
+++ b/MarketPlaces/DarkBazar/crawler_selenium.py
@@ -175,8 +175,8 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
- # # Digital Goods
- # links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=4')
+ # Digital Goods
+ links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=3')
# Services
links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=5')
@@ -216,12 +216,12 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item)
driver.back()
- # comment out
- # break
-
- # comment out
- if count == 1:
- break
+ # # comment out
+ # break
+ #
+ # # comment out
+ # if count == 1:
+ # break
try:
link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index 1cc5af5..985ef69 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -4,6 +4,8 @@ import glob
import os
import codecs
import shutil
+import traceback
+
from MarketPlaces.DB_Connection.db_connection import *
from MarketPlaces.DarkFox.parser import *
from MarketPlaces.Tor2door.parser import *
@@ -118,7 +120,7 @@ def read_file(filePath, createLog, logFile):
print("There was a problem to read the file " + filePath)
if createLog:
logFile.write(
- str(nError) + ". There was a problem to read the file " + filePath + "\n")
+ str(nError) + ". There was a problem to read the file " + filePath + "\n" + traceback.format_exc() + "\n")
return None
@@ -179,7 +181,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
traceback.print_exc()
if createLog:
logFile.write(
- str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n")
+ str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n"
+ + traceback.format_exc() + "\n")
return None
@@ -240,7 +243,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
traceback.print_exc()
if createLog:
logFile.write(
- str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n")
+ str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n"
+ + traceback.format_exc() + "\n")
return None
@@ -258,27 +262,32 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript
traceback.print_exc()
if createLog:
logFile.write(
- str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
+ str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n"
+ + traceback.format_exc() + "\n")
return False
def move_file(filePath, createLog, logFile):
source = filePath
- destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
+ destination = filePath.replace(os.path.basename(filePath), "") + r'Read/' + os.path.basename(filePath)
try:
- shutil.move(source, destination, shutil.copytree)
+ shutil.move(source, destination, shutil.copy2)
return True
except:
- incrementError()
- print("There was a problem to move the file " + filePath)
- traceback.print_exc()
- if createLog:
- logFile.write(
- str(nError) + ". There was a problem to move the file " + filePath + "\n")
- return False
+ try:
+ shutil.move(source, destination, shutil.copytree)
+ return True
+ except:
+ incrementError()
+ print("There was a problem to move the file " + filePath)
+ traceback.print_exc()
+ if createLog:
+ logFile.write(
+ str(nError) + ". There was a problem to move the file " + filePath + "\n" + traceback.format_exc() + "\n")
+ return False
def new_parse(marketPlace, url, createLog):
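Unlike the Forums version, move_file here also appends the basename to the destination, which changes shutil.move's failure mode: given a bare directory target, shutil.move raises shutil.Error when Read/ already contains a file of the same name (as happens on a re-run), whereas an explicit full target path lets the move overwrite the stale copy. A sketch with a hypothetical path:

    import os
    import shutil

    filePath = r'C:\calsyslab\Project\pages\listing1.html'      # hypothetical
    read_dir = filePath.replace(os.path.basename(filePath), "") + r'Read/'

    # old: shutil.move(filePath, read_dir) raises shutil.Error if Read/listing1.html exists
    # new: the explicit target overwrites any stale copy left by an earlier run
    shutil.move(filePath, read_dir + os.path.basename(filePath), shutil.copy2)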
diff --git a/MarketPlaces/Tor2door/crawler_selenium.py b/MarketPlaces/Tor2door/crawler_selenium.py
index a2df655..36a3e63 100644
--- a/MarketPlaces/Tor2door/crawler_selenium.py
+++ b/MarketPlaces/Tor2door/crawler_selenium.py
@@ -24,7 +24,7 @@ from MarketPlaces.Tor2door.parser import tor2door_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
-baseURL = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion'
+baseURL = 'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion'
# Opens Tor Browser, crawls the website
@@ -98,7 +98,7 @@ def getMKTName():
# Return the link of the website
def getFixedURL():
- url = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/login'
+ url = 'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/login'
return url
@@ -129,8 +129,8 @@ def createFFDriver():
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
- ff_prof.set_preference("network.dns.disablePrefetch", True)
- ff_prof.set_preference("network.http.sendRefererHeader", 0)
+ # ff_prof.set_preference("network.dns.disablePrefetch", True)
+ # ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -199,15 +199,15 @@ def getInterestedLinks():
links = []
# # Digital - Guides - Hacking
- # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=55')
+ # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=55')
# # Digital - Guides - Others
- # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=57')
+ # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=57')
# # Digital - Software
- # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=60')
+ # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=60')
# Software - Malware
- links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=69')
+ links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=69')
# # Software - Others
- # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=78')
+ # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=78')
return links
@@ -244,7 +244,7 @@ def crawlForum(driver):
driver.back()
# comment out
- break
+ # break
# comment out
if count == 1:
diff --git a/MarketPlaces/Tor2door/parser.py b/MarketPlaces/Tor2door/parser.py
index f4a4c07..49e0a93 100644
--- a/MarketPlaces/Tor2door/parser.py
+++ b/MarketPlaces/Tor2door/parser.py
@@ -31,6 +31,8 @@ def tor2door_description_parser(soup):
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
+ image = "-1" # 19 Product_Image
+ vendor_image = "-1" # 20 Vendor_Image
bae = soup.find('div', {'class': "col-9"})
@@ -106,9 +108,12 @@ def tor2door_description_parser(soup):
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
+ image = bae.find('div', {"class": "product-primary"}).find('img')
+ image = image.get('src').split('base64,')[-1]
+
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
- BTC, USD, EURO, sold, left, shipFrom, shipTo)
+ BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
@@ -139,7 +144,9 @@ def tor2door_listing_parser(soup):
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
- href = [] # 20 Product_Links
+ image = [] # 20 Product_Image
+ image_vendor = [] # 21 Vendor_Image
+ href = [] # 22 Product_Links
listing = soup.findAll('div', {"class": "card product-card mb-3"})
@@ -181,6 +188,15 @@ def tor2door_listing_parser(soup):
usd = usd.strip()
USD.append(usd)
+ # Finding Rating
+ stars = card.find("ul", {"class": "star-list"})
+ full = stars.findAll('i', {"class": "fas fa-star star star-active"})
+ half = stars.find('i', {"class": "fas fa-star-half star star-active"})
+ rating = len(full)
+ if half is not None:
+ rating += 0.5
+ rating_item.append(str(rating))
+
# Finding Reviews
num = card.find("span", {"class": "rate-count"}).text
num = num.replace("(", "")
@@ -216,9 +232,12 @@ def tor2door_listing_parser(soup):
MSValue=me
MS.append(MSValue)
+ img = card.find('img')
+ image.append(img.get('src').split('base64,')[-1])
+
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
- reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+ reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
def tor2door_links_parser(soup):
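Two of the additions above are worth unpacking. The rating block counts full-star icons and adds 0.5 when a half-star icon is present, and the image lines keep only the payload of a base64 data URI (everything after 'base64,'). A worked sketch against the HTML shape implied by the class names in the patch (the markup and the truncated base64 string are assumptions):

    from bs4 import BeautifulSoup

    html = '''
    <ul class="star-list">
      <i class="fas fa-star star star-active"></i>
      <i class="fas fa-star star star-active"></i>
      <i class="fas fa-star star star-active"></i>
      <i class="fas fa-star star star-active"></i>
      <i class="fas fa-star-half star star-active"></i>
    </ul>
    <img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUg">
    '''
    soup = BeautifulSoup(html, 'html.parser')

    stars = soup.find('ul', {'class': 'star-list'})
    full = stars.findAll('i', {'class': 'fas fa-star star star-active'})
    half = stars.find('i', {'class': 'fas fa-star-half star star-active'})
    rating = len(full) + (0.5 if half is not None else 0)
    print(rating)   # 4.5

    image = soup.find('img').get('src').split('base64,')[-1]
    print(image)    # iVBORw0KGgoAAAANSUhEUg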
diff --git a/setup.ini b/setup.ini
index f4c18df..29997a6 100644
--- a/setup.ini
+++ b/setup.ini
@@ -6,7 +6,7 @@ geckodriver_path = C:\calsyslab\Project\dw_pipeline_test\selenium\geckodriver.ex
[Project]
project_directory = C:\calsyslab\Project\dw_pipeline_test
-shared_folder = \\VBoxSvr\\Shared
+shared_folder = \\VBoxSvr\Shared
[PostgreSQL]
ip = localhost