fully ran Apocalypse and DarkBazar, fixed move bug

main
westernmeadow committed 1 year ago
parent · commit 0844b120bc

9 changed files with 107 additions and 72 deletions
  1. .idea/DW_Pipeline_Test.iml (+1 -0)
  2. Forums/Initialization/prepare_parser.py (+25 -22)
  3. MarketPlaces/Apocalypse/crawler_selenium.py (+14 -14)
  4. MarketPlaces/Apocalypse/parser.py (+4 -1)
  5. MarketPlaces/DarkBazar/crawler_selenium.py (+8 -8)
  6. MarketPlaces/Initialization/prepare_parser.py (+22 -13)
  7. MarketPlaces/Tor2door/crawler_selenium.py (+10 -10)
  8. MarketPlaces/Tor2door/parser.py (+22 -3)
  9. setup.ini (+1 -1)

.idea/DW_Pipeline_Test.iml (+1 -0)

@@ -29,6 +29,7 @@
         <option value="$MODULE_DIR$/Forums/Procrax" />
         <option value="$MODULE_DIR$/MarketPlaces/DarkBazar" />
         <option value="$MODULE_DIR$/MarketPlaces/AnonMarket" />
+        <option value="$MODULE_DIR$/MarketPlaces/Tor2door" />
       </list>
     </option>
   </component>

Forums/Initialization/prepare_parser.py (+25 -22)

@@ -105,7 +105,7 @@ def read_file(filePath, createLog, logFile):
         print("There was a problem to read the file " + filePath)
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to read the file " + filePath + "\n")
+                str(nError) + ". There was a problem to read the file " + filePath + "\n" + traceback.format_exc() + "\n")

     return None

@@ -141,7 +141,8 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n")
+                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n"
+                + traceback.format_exc() + "\n")

     return None

@@ -177,7 +178,8 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n")
+                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n"
+                + traceback.format_exc() + "\n")

     return None

@@ -191,17 +193,14 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript
         con.rollback()

-        trace = traceback.format_exc()
-        if trace.find("already exists") == -1:
-            incrementError()
-            print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
-            traceback.print_exc()
-            if createLog:
-                logFile.write(str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
-            return False
-        else:
-            return True
+        incrementError()
+        print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
+        traceback.print_exc()
+        if createLog:
+            logFile.write(
+                str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n"
+                + traceback.format_exc() + "\n")
+        return False


 def move_file(filePath, createLog, logFile):

@@ -210,17 +209,21 @@
     destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'

     try:
-        shutil.move(source, destination, shutil.copytree)
+        shutil.move(source, destination, shutil.copy2)
         return True
     except:
-        incrementError()
-        print("There was a problem to move the file " + filePath)
-        traceback.print_exc()
-        if createLog:
-            logFile.write(
-                str(nError) + ". There was a problem to move the file " + filePath + "\n")
-        return False
+        try:
+            shutil.move(source, destination, shutil.copytree)
+            return True
+        except:
+            incrementError()
+            print("There was a problem to move the file " + filePath)
+            traceback.print_exc()
+            if createLog:
+                logFile.write(
+                    str(nError) + ". There was a problem to move the file " + filePath + "\n" + traceback.format_exc() + "\n")
+            return False


 #main method for this program, what actually gets the parsed info from the parser, and persists them into the db
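
Note: shutil.move()'s optional third argument is copy_function, which is only used when the move has to fall back to copy-and-delete (for example, across filesystems). shutil.copytree fails on regular files, so the hunk above tries shutil.copy2 first and keeps copytree only as a directory fallback. A minimal sketch of that pattern; safe_move and the paths are hypothetical, not names from this repo:

import shutil

def safe_move(source, destination):
    # Try the file-appropriate copy function first; copy2 preserves metadata.
    try:
        shutil.move(source, destination, shutil.copy2)
        return True
    except (shutil.Error, OSError):
        # Fall back for directories, where copytree is the right copier.
        try:
            shutil.move(source, destination, shutil.copytree)
            return True
        except (shutil.Error, OSError):
            return False

# Hypothetical usage:
# safe_move('scraped/listing.html', 'scraped/Read/')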


MarketPlaces/Apocalypse/crawler_selenium.py (+14 -14)

@@ -189,12 +189,12 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []

-    # # Digital Goods
-    # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/74')
-    # # Fraud
-    # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/75')
-    # # Services
-    # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/76')
+    # Digital Goods
+    links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/74')
+    # Fraud
+    links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/75')
+    # Services
+    links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/76')
     # software and malware
     links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/30')

@@ -239,16 +239,16 @@ def crawlForum(driver):
                     except:
                         driver.refresh()

-                    # comment out
-                    # break
-
-                # comment out
-                if count == 1:
-                    break
+                    # # comment out
+                    # break
+                    #
+                # # comment out
+                # if count == 1:
+                #     break

                 try:
-                    link = driver.find_element(by=By.XPATH, value=
-                        '/html/body/div[1]/div/div[2]/nav/ul/li[5]/a').get_attribute('href')
+                    nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/div[2]/nav')
+                    link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='»').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
                     count += 1
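
Note: the pagination fix above replaces a fully absolute XPath whose li[5] index points at a different element whenever the pager renders a different number of page links. Scoping to the pager nav and finding the '»' anchor by its visible text is stable across page counts. A sketch of the same idea, assuming Selenium 4; get_next_page_link is a hypothetical name and driver setup is elided:

from selenium.webdriver.common.by import By

def get_next_page_link(driver):
    # Scope to the pager, then find the next-page anchor by its text, which
    # stays the same no matter how many numbered links the pager shows.
    nav = driver.find_element(by=By.XPATH, value='//nav')
    return nav.find_element(by=By.PARTIAL_LINK_TEXT, value='»').get_attribute('href')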


MarketPlaces/Apocalypse/parser.py (+4 -1)

@@ -113,7 +113,10 @@ def apocalypse_listing_parser(soup: Tag):
     image_vendor = []  # 21 Vendor_Image
     href = []  # 22 Product_Links

-    listings: ResultSet[Tag] = soup.find("div", {"class": "col-lg-9 my-4"}).find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})
+    table = soup.find("div", {"class": "col-lg-9 my-4"})
+    if table is None:
+        table = soup.find("div", {"class": "col-lg-9"})
+    listings: ResultSet[Tag] = table.find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})

     for prod in listings:
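
Note: the guard above suggests some Apocalypse listing pages render the results column without the my-4 class, which made the old chained soup.find(...).find_all(...) raise AttributeError on None. A self-contained sketch of the fallback against made-up HTML:

from bs4 import BeautifulSoup

html = '<div class="col-lg-9"><div class="col-lg-4 col-md-6 mb-1">item</div></div>'
soup = BeautifulSoup(html, 'html.parser')

# A multi-word class filter matches the full class string, so the strict
# selector misses here and the looser one catches the container.
table = soup.find("div", {"class": "col-lg-9 my-4"})
if table is None:
    table = soup.find("div", {"class": "col-lg-9"})
print(len(table.find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})))  # 1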


MarketPlaces/DarkBazar/crawler_selenium.py (+8 -8)

@@ -175,8 +175,8 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []

-    # # Digital Goods
-    # links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=4')
+    # Digital Goods
+    links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=3')
     # Services
     links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=5')

@@ -216,12 +216,12 @@ def crawlForum(driver):
                     savePage(driver, driver.page_source, item)
                     driver.back()

-                    # comment out
-                    # break
-
-                # comment out
-                if count == 1:
-                    break
+                    # # comment out
+                    # break
+                    #
+                # # comment out
+                # if count == 1:
+                #     break

                 try:
                     link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
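
Note: the "# comment out" blocks in these crawlers appear to gate a test mode: the first break stops after the first product, and the count == 1 break stops after the second listing page. Commenting both out here and in the Apocalypse hunk lets the crawl run to completion, matching the commit message. A self-contained sketch of the page-level toggle; all names are hypothetical stand-ins for the crawler's internals:

def crawl(pages, test_mode=False):
    count = 0
    for page in pages:
        print('saving', page)          # stands in for savePage(...)
        # comment out (test mode stops the crawl after the second page)
        if test_mode and count == 1:
            break
        count += 1

crawl(['page1', 'page2', 'page3'], test_mode=True)   # saves page1, page2
crawl(['page1', 'page2', 'page3'])                   # saves all three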


MarketPlaces/Initialization/prepare_parser.py (+22 -13)

@@ -4,6 +4,8 @@ import glob
 import os
 import codecs
 import shutil
+import traceback
+
 from MarketPlaces.DB_Connection.db_connection import *
 from MarketPlaces.DarkFox.parser import *
 from MarketPlaces.Tor2door.parser import *

@@ -118,7 +120,7 @@ def read_file(filePath, createLog, logFile):
         print("There was a problem to read the file " + filePath)
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to read the file " + filePath + "\n")
+                str(nError) + ". There was a problem to read the file " + filePath + "\n" + traceback.format_exc() + "\n")

     return None

@@ -179,7 +181,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n")
+                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n"
+                + traceback.format_exc() + "\n")

     return None

@@ -240,7 +243,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n")
+                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n"
+                + traceback.format_exc() + "\n")

     return None

@@ -258,27 +262,32 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
+                str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n"
+                + traceback.format_exc() + "\n")
         return False


 def move_file(filePath, createLog, logFile):
     source = filePath
-    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
+    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/' + os.path.basename(filePath)

     try:
-        shutil.move(source, destination, shutil.copytree)
+        shutil.move(source, destination, shutil.copy2)
         return True
     except:
-        incrementError()
-        print("There was a problem to move the file " + filePath)
-        traceback.print_exc()
-        if createLog:
-            logFile.write(
-                str(nError) + ". There was a problem to move the file " + filePath + "\n")
-        return False
+        try:
+            shutil.move(source, destination, shutil.copytree)
+            return True
+        except:
+            incrementError()
+            print("There was a problem to move the file " + filePath)
+            traceback.print_exc()
+            if createLog:
+                logFile.write(
+                    str(nError) + ". There was a problem to move the file " + filePath + "\n" + traceback.format_exc() + "\n")
+            return False


 def new_parse(marketPlace, url, createLog):
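
Note: this hunk looks like the "fixed move bug" from the commit message, as far as the diff shows. shutil.move() into the bare Read/ directory raises shutil.Error ("Destination path ... already exists") whenever a file of the same name is already there from an earlier run; appending the basename makes the destination explicit, so the move falls through to its copy path and overwrites instead. A self-contained sketch reproducing the fixed behavior in a temporary directory:

import os
import shutil
import tempfile

root = tempfile.mkdtemp()
os.mkdir(os.path.join(root, 'Read'))

for run in range(2):  # simulate two pipeline runs over the same page
    src = os.path.join(root, 'page.html')
    with open(src, 'w') as f:
        f.write('html from run %d' % run)
    # Explicit destination path: the second run overwrites instead of raising.
    dst = os.path.join(root, 'Read', os.path.basename(src))
    shutil.move(src, dst, shutil.copy2)

print(os.listdir(os.path.join(root, 'Read')))  # ['page.html']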


MarketPlaces/Tor2door/crawler_selenium.py (+10 -10)

@@ -24,7 +24,7 @@ from MarketPlaces.Tor2door.parser import tor2door_links_parser
 from MarketPlaces.Utilities.utilities import cleanHTML

 counter = 1
-baseURL = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion'
+baseURL = 'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion'


 # Opens Tor Browser, crawls the website

@@ -98,7 +98,7 @@ def getMKTName():

 # Return the link of the website
 def getFixedURL():
-    url = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/login'
+    url = 'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/login'

     return url

@@ -129,8 +129,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    ff_prof.set_preference("network.dns.disablePrefetch", True)
-    ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    # ff_prof.set_preference("network.dns.disablePrefetch", True)
+    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)

@@ -199,15 +199,15 @@ def getInterestedLinks():
     links = []

     # # Digital - Guides - Hacking
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=55')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=55')
     # # Digital - Guides - Others
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=57')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=57')
     # # Digital - Software
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=60')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=60')
     # Software - Malware
-    links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=69')
+    links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=69')
     # # Software - Others
-    # links.append('http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion/en/products?category=78')
+    # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=78')

     return links

@@ -244,7 +244,7 @@ def crawlForum(driver):
                     driver.back()

                     # comment out
-                    break
+                    # break

                 # comment out
                 if count == 1:


MarketPlaces/Tor2door/parser.py (+22 -3)

@@ -31,6 +31,8 @@ def tor2door_description_parser(soup):
     left = "-1"  # 16 Product_QuantityLeft
     shipFrom = "-1"  # 17 Product_ShippedFrom
     shipTo = "-1"  # 18 Product_ShippedTo
+    image = "-1"  # 19 Product_Image
+    vendor_image = "-1"  # 20 Vendor_Image

     bae = soup.find('div', {'class': "col-9"})

@@ -106,9 +108,12 @@ def tor2door_description_parser(soup):
         MS = MS.replace(',', ' ')
         MS = MS.replace('\n', '')

+    image = bae.find('div', {"class": "product-primary"}).find('img')
+    image = image.get('src').split('base64,')[-1]
+
     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
-           BTC, USD, EURO, sold, left, shipFrom, shipTo)
+           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)

     # Sending the results
     return row

@@ -139,7 +144,9 @@ def tor2door_listing_parser(soup):
     qLeft =[]  # 17 Product_QuantityLeft
     shipFrom = []  # 18 Product_ShippedFrom
     shipTo = []  # 19 Product_ShippedTo
-    href = []  # 20 Product_Links
+    image = []  # 20 Product_Image
+    image_vendor = []  # 21 Vendor_Image
+    href = []  # 22 Product_Links

     listing = soup.findAll('div', {"class": "card product-card mb-3"})

@@ -181,6 +188,15 @@ def tor2door_listing_parser(soup):
             usd = usd.strip()
             USD.append(usd)

+            # Finding Rating
+            stars = card.find("ul", {"class": "star-list"})
+            full = stars.findAll('i', {"class": "fas fa-star star star-active"})
+            half = stars.find('i', {"class": "fas fa-star-half star star-active"})
+            rating = len(full)
+            if half is not None:
+                rating += 0.5
+            rating_item.append(str(rating))
+
             # Finding Reviews
             num = card.find("span", {"class": "rate-count"}).text
             num = num.replace("(", "")

@@ -216,9 +232,12 @@ def tor2door_listing_parser(soup):
             MSValue=me
         MS.append(MSValue)

+        image = bae[0].find('img')
+        image = image.get('src').split('base64,')[-1]
+
     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
-                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)


 def tor2door_links_parser(soup):
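
Note: two parsing additions above are worth a closer look: the rating is recovered by counting full star icons plus an optional half star, and the product image is reduced to the payload of its base64 data URI. A self-contained sketch against made-up HTML in the shape the parser appears to expect:

from bs4 import BeautifulSoup

html = '''<ul class="star-list">
  <i class="fas fa-star star star-active"></i>
  <i class="fas fa-star star star-active"></i>
  <i class="fas fa-star-half star star-active"></i>
</ul>
<img src="data:image/png;base64,iVBORw0KGgo=">'''
soup = BeautifulSoup(html, 'html.parser')

# Two full stars plus one half star -> 2.5
stars = soup.find('ul', {'class': 'star-list'})
full = stars.findAll('i', {'class': 'fas fa-star star star-active'})
half = stars.find('i', {'class': 'fas fa-star-half star star-active'})
rating = len(full) + (0.5 if half is not None else 0)
print(rating)  # 2.5

# Keep only the base64 payload of the data URI
image = soup.find('img').get('src').split('base64,')[-1]
print(image)   # iVBORw0KGgo=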


setup.ini (+1 -1)

@@ -6,7 +6,7 @@ geckodriver_path = C:\calsyslab\Project\dw_pipeline_test\selenium\geckodriver.ex

 [Project]
 project_directory = C:\calsyslab\Project\dw_pipeline_test
-shared_folder = \\VBoxSvr\\Shared
+shared_folder = \\VBoxSvr\Shared

 [PostgreSQL]
 ip = localhost
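
Note: configparser takes backslashes literally, so the old value really was \\VBoxSvr\\Shared, with a doubled separator before the share name that Windows path handling generally rejects; only the leading pair of a UNC path should be doubled. A quick check (the \\\\ and \\ below are Python string escapes, not ini syntax):

import configparser

cfg = configparser.ConfigParser()
cfg.read_string('[Project]\nshared_folder = \\\\VBoxSvr\\Shared\n')
print(cfg['Project']['shared_folder'])   # \\VBoxSvr\Shared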

