
procrax, cardingleaks - crawler completed

altenens - debugging crawler

torbay - parser in progress
main
Helium, 1 year ago
commit 86bdddc30e
15 changed files with 6397 additions and 3485 deletions
  1. Forums/Altenens/crawler_selenium.py (+18, -19)
  2. Forums/Initialization/forums_mining.py (+10, -5)
  3. Forums/Initialization/geckodriver.log (+2768, -223)
  4. Forums/OnniForums/crawler_selenium.py (+16, -16)
  5. Forums/Procrax/crawler_selenium.py (+319, -0)
  6. Forums/Procrax/parser.py (+264, -0)
  7. MarketPlaces/Initialization/geckodriver.log (+2952, -159)
  8. MarketPlaces/M00nkeyMarket/HTML_Pages/06272023/Description/listings3102.html (+0, -997)
  9. MarketPlaces/M00nkeyMarket/HTML_Pages/06272023/Listing/httpmoonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wydonionsearchsubcategoriessubcategory30.html (+0, -2010)
  10. MarketPlaces/M00nkeyMarket/crawler_selenium.py (+11, -8)
  11. MarketPlaces/M00nkeyMarket/parser.py (+1, -1)
  12. MarketPlaces/ThiefWorld/crawler_selenium.py (+12, -12)
  13. MarketPlaces/TorBay/crawler_selenium.py (+8, -8)
  14. MarketPlaces/TorBay/parser.py (+13, -22)
  15. setup.ini (+5, -5)

Forums/Altenens/crawler_selenium.py (+18, -19)

@@ -60,24 +60,24 @@ def opentor():
# Login using premade account credentials and do login captcha manually
def login(driver):
#click login button
login = driver.find_element(by=By.XPATH, value='//*[@id="top"]/div[1]/div/div/div/div[1]/a[1]')
login.click()
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="_xfUid-1-1688066635"]')
#Username here
usernameBox.send_keys('mylittlepony45')#sends string to the username box
passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="_xfUid-2-1688066635"]')
#Password here
passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox
# #click login button
# login_link = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]').get_attribute('href')
# driver.get(login_link) # open tab with url
#
# #entering username and password into input boxes
# usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[1]/dd/input')
# #Username here
# usernameBox.send_keys('mylittlepony45')#sends string to the username box
# passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[2]/dd/div/div/input')
# #Password here
# passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox
input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
# wait for 50 sec until id = tab_content is found, then cont
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '///html/body/div[1]/div[4]/div/div/div[3]/div/div/div[4]/div/div/div[1]/div/div[1]/div[2]/ol/li[1]span>')))
# WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
# (By.XPATH, '/html/body/div[1]/div[4]/div/div/div[3]/div/div/div[4]/div/div/div[1]/div/div[1]/div[2]/ol/li[1]/a')))
# Returns the name of the website
@@ -241,9 +241,9 @@ def crawlForum(driver):
savePage(driver.page_source, item)
# if there is a next page then go and save....
# next page in the topic?
# specific
try:
item = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/div/div[3]/div/div/div[1]/div[1]/div[1]/nav/div[1]/a').get_attribute('href')
item = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
if item == "":
raise NoSuchElementException
@@ -256,8 +256,8 @@ def crawlForum(driver):
#end of loop
for i in range(counter):
driver.back()
# comment out
break
# # comment out
# break
# comment out
if count == 1:
@@ -265,8 +265,7 @@ def crawlForum(driver):
break
try:# change depending on web page, #next page
link = driver.find_element(by=By.XPATH, value = '/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/div[1]/div[1]/nav/div[1]/a').get_attribute('href')
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
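
The Altenens hunks above replace brittle absolute XPaths for the next-page link with By.LINK_TEXT lookups. As a rough illustration of that pattern (not part of the commit), here is a minimal pagination sketch, assuming Selenium 4, geckodriver resolvable by Selenium, and a board whose next-page anchor is labelled exactly "Next":

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Firefox()
driver.get('https://example.com/forums/some-board/')   # placeholder URL, not from the commit

while True:
    # ... save or parse driver.page_source here ...
    try:
        # LINK_TEXT survives layout changes that break absolute XPaths
        next_url = driver.find_element(By.LINK_TEXT, 'Next').get_attribute('href')
    except NoSuchElementException:
        break                     # no "Next" anchor: last page reached
    if not next_url:
        break
    driver.get(next_url)

driver.quit()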


Forums/Initialization/forums_mining.py (+10, -5)

@@ -1,7 +1,7 @@
__author__ = 'DarkWeb'
'''
Starting point of the Darkweb Mining Platform
'''
import os
@@ -10,8 +10,10 @@ from Forums.BestCardingWorld.crawler_selenium import crawler as crawlerBestCardi
from Forums.CryptBB.crawler_selenium import crawler as crawlerCryptBB
from Forums.OnniForums.crawler_selenium import crawler as crawlerOnniForums
from Forums.AbyssForum.crawler_selenium import crawler as crawlerAbyssForum
from Forums.Altenens.crawler_selenium import crawler as crawlerAltenensForum
from Forums.Procrax.crawler_selenium import crawler as crawlerProcraxForum
from Forums.HiddenAnswers.crawler_selenium import crawler as crawlerHiddenAnswers
from Forums.Cardingleaks.crawler_selenium import crawler as crawlerCardingleaks
from Forums.Altenens.crawler_selenium import crawler as crawlerAltenens
import configparser
import time
@@ -113,9 +115,12 @@ if __name__ == '__main__':
crawlerHiddenAnswers()
elif forum == "Altenens":
crawlerAltenensForum()
elif forum == 'Procrax':
crawlerProcraxForum()
elif forum == 'Cardingleaks':
crawlerCardingleaks()
elif forum == 'Altenens':
crawlerAltenens()
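
The forums_mining.py hunks extend the if/elif chain that maps each forum name from the config to its crawler entry point. As an aside (not part of the commit), the same dispatch can be written as a dictionary; the sketch below reuses the crawler aliases imported in the diff and assumes the same package layout:

from Forums.Procrax.crawler_selenium import crawler as crawlerProcraxForum
from Forums.Cardingleaks.crawler_selenium import crawler as crawlerCardingleaks
from Forums.Altenens.crawler_selenium import crawler as crawlerAltenens

# Dictionary-based equivalent of the elif chain shown above
CRAWLERS = {
    'Procrax': crawlerProcraxForum,
    'Cardingleaks': crawlerCardingleaks,
    'Altenens': crawlerAltenens,
}

def run_forum(forum):
    try:
        CRAWLERS[forum]()                     # call the matching crawler
    except KeyError:
        print('No crawler registered for forum:', forum)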


Forums/Initialization/geckodriver.log (+2768, -223)
File diff suppressed because it is too large


Forums/OnniForums/crawler_selenium.py (+16, -16)

@@ -31,19 +31,19 @@ baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion
# Opens Tor Browser, crawls the website
def startCrawling():
# opentor()
opentor()
forumName = getForumName()
# driver = getAccess()
driver = getAccess()
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
new_parse(forum=forumName, url=baseURL, createLog=False)
# new_parse(forum=forumName, url=baseURL, createLog=False)
# Opens Tor Browser
@@ -190,9 +190,9 @@ def getInterestedLinks():
links = []
# # Hacking & Cracking tutorials
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-tutorials')
links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-tutorials')
# Hacking & Cracking questions
links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-questions')
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-questions')
# # Exploit PoCs
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Exploit-PoCs')
# # Cracked software
@@ -280,12 +280,12 @@ def crawlForum(driver):
driver.back()
# comment out, one topic per page
# break
break
# comment out, go through all pages
# if count == 1:
# count = 0
# break
if count == 1:
count = 0
break
try:
temp = driver.find_element(by=By.XPATH, value=


Forums/Procrax/crawler_selenium.py (+319, -0)

@@ -0,0 +1,319 @@
__author__ = 'Helium'
'''
Procrax Forum Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import configparser
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.Procrax.parser import procrax_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'https://procrax.cx/'
# Opens Tor Browser, crawls the website
def startCrawling():
opentor()
# forumName = getForumName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
# new_parse(forumName, False)
# Opens Tor Browser
def opentor():
from Forums.Initialization.forums_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Login using premade account credentials and do login captcha manually
def login(driver):
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div[3]/div[2]/div[1]/form/div/div/div/dl[4]/dd/div/div[2]/button/span')))
# #entering username and password into input boxes
# usernameBox = driver.find_element(by=By.NAME, value='login')
# #Username here
# usernameBox.send_keys('cheese_pizza_man')#sends string to the username box
# passwordBox = driver.find_element(by=By.NAME, value='password')
# #Password here
# passwordBox.send_keys('Gr33nSp@m&3ggs')# sends string to passwordBox
#
# clicker = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div[2]/div[1]/form/div/div/div/dl[4]/dd/div/div[2]/button/span')
# clicker.click()
#
# # # wait for listing page show up (This Xpath may need to change based on different seed url)
# # # wait for 50 sec until id = tab_content is found, then cont
# WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
# (By.XPATH, '/html/body/div[1]/div[3]/div[2]/div[3]/div[1]/div/div[1]/div')))
# Returns the name of the website
def getForumName():
name = 'Procrax'
return name
# Return the link of the website
def getFixedURL():
url = 'https://procrax.cx/'
return url
# Closes Tor Browser
def closetor(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close() #close tab
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from Forums.Initialization.forums_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)# open url in browser
return driver
except:
driver.close()# close tab
return 'down'
# Saves the crawled html page
def savePage(page, url):
cleanPage = cleanHTML(page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
def getInterestedLinks():
links = []
# # general hacking
# links.append('https://procrax.cx/forums/general-hacking.24/')
# # hacking security tools
# links.append('https://procrax.cx/forums/hacking-security-tools.20/')
# # hacktube
# links.append('https://procrax.cx/forums/hacktube.22/')
# # cardable
# links.append('https://procrax.cx/forums/cardable-websites.28/')
# # tools
# links.append('https://procrax.cx/forums/tools-bots-validators.73/')
# general forum
links.append('https://procrax.cx/forums/forum-discussions-updates.7/')
return links
def crawlForum(driver):
print("Crawling the Procrax forum")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)# open
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
#loop through the topics
while has_next_page:
list = topicPages(html)# for multiple pages
for item in list:
#variable to check if there is a next page for the topic
has_next_topic_page = True
counter = 1
# check if there is a next page for the topics
while has_next_topic_page:
# try to access next page of the topic
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
# if there is a next page then go and save....
# specific
try:
# temp = driver.find_element(By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div/div')
item = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
if item == "":
raise NoSuchElementException
has_next_topic_page = False
else:
counter += 1
except NoSuchElementException:
has_next_topic_page = False
#end of loop
for i in range(counter):
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# count = 0
# break
try:# change depending on web page, #general
# /html/body/div[1]/div[3]/div[2]/div[3]/div/div/div/div[1]/div/nav/div[1]
# temp = driver.find_element(By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div/div/div/div[1]/div/nav/div[1]')
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling Procrax forum done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link, may need to change for every website
def isDescriptionLink(url):
if 'threads' in url:
return True
return False
# Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url):
if 'forums' in url:
return True
return False
# calling the parser to define the links
def topicPages(html):
soup = BeautifulSoup(html, "html.parser")
#print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
return procrax_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")
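
The new Procrax crawler routes Firefox through Tor via the SOCKS preferences set in createFFDriver(). The condensed sketch below (not part of the commit) shows the same proxy preferences expressed with Selenium 4's Options API, assuming geckodriver is resolvable and Tor Browser is listening on 127.0.0.1:9150:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options

opts = Options()
opts.set_preference('network.proxy.type', 1)                 # manual proxy configuration
opts.set_preference('network.proxy.socks', '127.0.0.1')
opts.set_preference('network.proxy.socks_port', 9150)
opts.set_preference('network.proxy.socks_version', 5)
opts.set_preference('network.proxy.socks_remote_dns', True)  # resolve .onion names through Tor
opts.set_preference('permissions.default.image', 3)          # image preference value copied from the diff

driver = webdriver.Firefox(options=opts)                      # Selenium 4.6+ locates geckodriver itself
driver.get('https://check.torproject.org/')
print(driver.title)
driver.quit()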

Forums/Procrax/parser.py (+264, -0)

@@ -0,0 +1,264 @@
__author__ = 'Helium'
# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import date
from datetime import timedelta
import re
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def cryptBB_description_parser(soup):
# Fields to be parsed
topic = "-1" # topic name
user = [] # all users of each post
addDate = [] # all dated of each post
feedback = [] # all feedbacks of each vendor (this was found in just one Forum and with a number format)
status = [] # all user's authority in each post such as (adm, member, dangerous)
reputation = [] # all user's karma in each post (usually found as a number)
sign = [] # all user's signature in each post (usually a standard message after the content of the post)
post = [] # all messages of each post
interest = [] # all user's interest in each post
# Finding the topic (should be just one coming from the Listing Page)
li = soup.find("td", {"class": "thead"}).find('strong')
topic = li.text
topic = re.sub("\[\w*\]", '', topic)
topic = topic.replace(",","")
topic = topic.replace("\n","")
topic = cleanString(topic.strip())
# Finding the repeated tag that corresponds to the listing of posts
# try:
posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
'div', {"class": "post"})
# For each message (post), get all the fields we are interested to:
for ipost in posts:
# Finding a first level of the HTML page
post_wrapper = ipost.find('span', {"class": "largetext"})
# Finding the author (user) of the post
author = post_wrapper.text.strip()
user.append(cleanString(author)) # Remember to clean the problematic characters
# Finding the status of the author
smalltext = ipost.find('div', {"class": "post_author"})
'''
# Testing here two possibilities to find this status and combine them
if ipost.find('div', {"class": "deleted_post_author"}):
status.append(-1)
interest.append(-1)
reputation.append(-1)
addDate.append(-1)
post.append("THIS POST HAS BEEN REMOVED!")
sign.append(-1)
feedback.append(-1)
continue
'''
# CryptBB does have membergroup and postgroup
membergroup = smalltext.find('div', {"class": "profile-rank"})
postgroup = smalltext.find('div', {"class": "postgroup"})
if membergroup != None:
membergroup = membergroup.text.strip()
if postgroup != None:
postgroup = postgroup.text.strip()
membergroup = membergroup + " - " + postgroup
else:
if postgroup != None:
membergroup = postgroup.text.strip()
else:
membergroup = "-1"
status.append(cleanString(membergroup))
# Finding the interest of the author
# CryptBB does not have blurb
blurb = smalltext.find('li', {"class": "blurb"})
if blurb != None:
blurb = blurb.text.strip()
else:
blurb = "-1"
interest.append(cleanString(blurb))
# Finding the reputation of the user
# CryptBB does have reputation
author_stats = smalltext.find('div', {"class": "author_statistics"})
karma = author_stats.find('strong')
if karma != None:
karma = karma.text
karma = karma.replace("Community Rating: ", "")
karma = karma.replace("Karma: ", "")
karma = karma.strip()
else:
karma = "-1"
reputation.append(cleanString(karma))
# Getting here another good tag to find the post date, post content and users' signature
postarea = ipost.find('div', {"class": "post_content"})
dt = postarea.find('span', {"class": "post_date"}).text
# dt = dt.strip().split()
dt = dt.strip()
day=date.today()
if "Yesterday" in dt:
yesterday = day - timedelta(days=1)
yesterday = yesterday.strftime('%m-%d-%Y')
stime = dt.replace('Yesterday,','').strip()
date_time_obj = yesterday+ ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
elif "hours ago" in dt:
day = day.strftime('%m-%d-%Y')
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
else:
date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
stime = date_time_obj.strftime('%b %d, %Y')
sdate = date_time_obj.strftime('%I:%M %p')
addDate.append(date_time_obj)
# Finding the post
inner = postarea.find('div', {"class": "post_body scaleimages"})
inner = inner.text.strip()
post.append(cleanString(inner))
# Finding the user's signature
# signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
signature = ipost.find('div', {"class": "signature scaleimages"})
if signature != None:
signature = signature.text.strip()
# print(signature)
else:
signature = "-1"
sign.append(cleanString(signature))
# As no information about user's feedback was found, just assign "-1" to the variable
feedback.append("-1")
'''
except:
if soup.find('td', {"class": "trow1"}).text == " You do not have permission to access this page. ":
user.append("-1")
status.append(-1)
interest.append(-1)
reputation.append(-1)
addDate.append(-1)
post.append("NO ACCESS TO THIS PAGE!")
sign.append(-1)
feedback.append(-1)
'''
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
# Sending the results
return row
# This is the method to parse the Listing Pages (one page with many posts)
def cryptBB_listing_parser(soup):
board = "-1" # board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
nm = 0 # this variable should receive the number of topics
topic = [] # all topics
author = [] # all authors of each topic
views = [] # number of views of each topic
posts = [] # number of posts of each topic
addDate = [] # when the topic was created (difficult to find)
href = [] # this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
# Finding the board (should be just one)
board = soup.find('span', {"class": "active"}).text
board = cleanString(board.strip())
# Finding the repeated tag that corresponds to the listing of topics
itopics = soup.find_all('tr',{"class": "inline_row"})
for itopic in itopics:
# For each topic found, the structure to get the rest of the information can be of two types. Testing all of them
# so we don't miss any topic
# Adding the topic to the topic list
try:
topics = itopic.find('span', {"class": "subject_old"}).find('a').text
except:
topics = itopic.find('span', {"class": "subject_new"}).find('a').text
topics = re.sub("\[\w*\]", '', topics)
topic.append(cleanString(topics))
# Counting how many topics we have found so far
nm = len(topic)
# Adding the url to the list of urls
try:
link = itopic.find('span', {"class": "subject_old"}).find('a').get('href')
except:
link = itopic.find('span',{"class": "subject_new"}).find('a').get('href')
link = cleanLink(link)
href.append(link)
# Finding the author of the topic
ps = itopic.find('div', {"class":"author smalltext"}).find('a').text
user = ps.strip()
author.append(cleanString(user))
# Finding the number of replies
columns = itopic.findChildren('td',recursive=False)
replies = columns[3].text
posts.append(cleanString(replies))
# Finding the number of Views
tview = columns[4].text
views.append(cleanString(tview))
# If no information about when the topic was added, just assign "-1" to the variable
addDate.append("-1")
return organizeTopics("CryptBB", nm, topic, board, author, views, posts, href, addDate)
def procrax_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find_all('div', {"class": "structItem-title"})
for a in listing:
link = a.find('a', {'class': ''}).get('href')
href.append(link)
return href
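
procrax_links_parser() collects the href of the title anchor inside every div.structItem-title on a board listing page. A self-contained sketch of that lookup against toy markup (the HTML below is illustrative, not captured from the site, and the anchor lookup is simplified to the first <a> in each block):

from bs4 import BeautifulSoup

html = """
<div class="structItem-title"><a href="/threads/example-topic-one.123/">Example topic one</a></div>
<div class="structItem-title"><a href="/threads/example-topic-two.456/">Example topic two</a></div>
"""

soup = BeautifulSoup(html, 'html.parser')
href = []
for block in soup.find_all('div', {'class': 'structItem-title'}):
    link = block.find('a')                # simplified: first anchor in the title block
    if link is not None:
        href.append(link.get('href'))

print(href)   # ['/threads/example-topic-one.123/', '/threads/example-topic-two.456/']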

MarketPlaces/Initialization/geckodriver.log (+2952, -159)
File diff suppressed because it is too large


MarketPlaces/M00nkeyMarket/HTML_Pages/06272023/Description/listings3102.html (+0, -997)
File diff suppressed because it is too large


MarketPlaces/M00nkeyMarket/HTML_Pages/06272023/Listing/httpmoonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wydonionsearchsubcategoriessubcategory30.html (+0, -2010)
File diff suppressed because it is too large


MarketPlaces/M00nkeyMarket/crawler_selenium.py (+11, -8)

@@ -2,7 +2,8 @@ __author__ = 'Helium'
'''
M00nkeyMarket Forum Crawler (Selenium) incomplete
having trouble checking it due to the captcha
might be impossible to crawl
'''
from selenium import webdriver
@@ -154,11 +155,11 @@ def login(driver):
# Password here
passwordBox.send_keys('genie_show_metheWorld')
input("Press ENTER when CAPTCHA and anti-phishing is completed\n")
input("Press ENTER when CAPTCHA and exit pressed is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[1]/div/div/div[2]/div[3]/div/ul/li[2]/ul/li[5]/a")))
(By.XPATH, "/html/body/div/div[1]/div/div/div[2]/div[3]/div")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(page, url):
@@ -248,13 +249,15 @@ def crawlForum(driver):
break
# comment out
if count == 1:
count = 0
break
# if count == 1:
# count = 0
# break
try:
temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[1]/div/div/div[3]/div/div[3]/nav')
link = temp.find_element(by=By.CLASS_NAME, value='next ml-1 mt-1').get_attribute('href')
temp = driver.find_element(by=By.CLASS_NAME, value='col-lg-12 flex-fill ml-auto text-right mb-1')
temp2 = temp.find_element(by=By.CLASS_NAME, value="next ml-1 mt-1")
link = temp2.find_element(By.TAG_NAME, value='a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
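
The M00nkeyMarket hunk above walks the pagination bar with nested By.CLASS_NAME lookups that pass space-separated class lists. Selenium's class-name strategy accepts only a single class, so an alternative (not part of the commit) is to join the classes into a CSS selector; the class names below are taken from the diff and the page structure is assumed:

from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def next_page_href(driver):
    """Return the href of the 'next' pagination anchor, or None if absent."""
    try:
        nav = driver.find_element(By.CSS_SELECTOR,
                                  '.col-lg-12.flex-fill.ml-auto.text-right.mb-1')
        anchor = nav.find_element(By.CSS_SELECTOR, '.next.ml-1.mt-1 a')
        return anchor.get_attribute('href') or None
    except NoSuchElementException:
        return None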


MarketPlaces/M00nkeyMarket/parser.py (+1, -1)

@@ -284,7 +284,7 @@ def m00nkey_links_parser(soup):
listing = soup.findAll('h5', {"class": "card-title rounded text-truncate"})
for a in listing:
bae = a.find('a', href=True)
bae = a.find('a', href=True)#card-title rounded text-truncate
link = bae['href']
href.append(link)

MarketPlaces/ThiefWorld/crawler_selenium.py (+12, -12)

@@ -31,19 +31,19 @@ baseURL = 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
opentor()
mktName = getMKTName()
# driver = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
new_parse(mktName, baseURL, False)
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
# new_parse(mktName, baseURL, False)
# Opens Tor Browser


MarketPlaces/TorBay/crawler_selenium.py (+8, -8)

@@ -64,7 +64,7 @@ def opentor():
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'TorBay Market'
name = 'TorBay'
return name
@@ -226,13 +226,13 @@ def crawlForum(driver):
savePage(driver.page_source, item)
driver.back()
#comment out
break
# # comment out
if count == 1:
count = 0
break
# #comment out
# break
#
# # # comment out
# if count == 1:
# count = 0
# break
try:
link = driver.find_element(by=By.XPATH, value=


MarketPlaces/TorBay/parser.py (+13, -22)

@@ -11,7 +11,7 @@ from bs4 import BeautifulSoup
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of description page
#return: 'row' that contains a variety of lists that each hold info on the description page
def darkfox_description_parser(soup):
def torbay_description_parser(soup):
# Fields to be parsed
@@ -40,43 +40,34 @@ def darkfox_description_parser(soup):
EURO = "-1" # 22 Product_EURO_SellingPrice
# Finding Product Name
name = soup.find('h1').text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
name = soup.find('div', {'class': 'product-information'}).find('h1').text.strip()
# Finding Vendor
vendor = soup.find('h3').find('a').text.strip()
vendor = soup.find('div', {"class": "profile-info"}).find('a').text.strip()
# Finding Vendor Rating
rating = soup.find('span', {'class': "tag is-dark"}).text.strip()
rating.append(-1)
# Finding Successful Transactions
success = soup.find('h3').text
success = success.replace("Vendor: ", "")
success = success.replace(vendor, "")
success = success.replace("(", "")
success = success.replace(")", "")
success = success.strip()
success.append(-1)
bae = soup.find('div', {'class': "box"}).find_all('ul')
# Finding Prices
USD = bae[1].find('strong').text.strip()
USD = soup.find('div', {'class': "total-price"}).find('span').text.strip()
li = bae[2].find_all('li')
# Finding Escrow
# Finding Escrow NEED WORK
escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Category
category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
category = soup.find('div', {'class': "profile-info"}).find('p').find('a').text.strip()
# Finding the Product Quantity Available
left = li[3].find('span', {'class': "tag is-dark"}).text.strip()
left.append(-1)
# Finding Number Sold
sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()
sold.append(-1)
li = bae[3].find_all('li')
@@ -147,11 +138,11 @@ def darkfox_description_parser(soup):
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def darkfox_listing_parser(soup):
def torbay_listing_parser(soup):
# Fields to be parsed
nm = 0 # Total_Products (Should be Integer)
mktName = "DarkFox" # 0 Marketplace_Name
mktName = "TorBay" # 0 Marketplace_Name
name = [] # 1 Product_Name
CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 3 Product_MS_Classification (Microsoft Security)
@@ -174,7 +165,7 @@ def darkfox_listing_parser(soup):
success = [] # 20 Vendor_Successful_Transactions
href = [] # 23 Product_Links (Urls)
listing = soup.findAll('div', {"class": "card"})
listing = soup.findAll('div', {"class": "product-card"})
# Populating the Number of Products
nm = len(listing)
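
The TorBay parser rewrite swaps the old DarkFox selectors for TorBay's own markup (product-information, profile-info, total-price, product-card). A toy extraction sketch using those class names (the HTML below is illustrative, not captured from the site):

from bs4 import BeautifulSoup

html = """
<div class="product-information"><h1> Example product </h1></div>
<div class="profile-info"><a>example_vendor</a><p><a>Software</a></p></div>
<div class="total-price"><span>19.99</span></div>
"""

soup = BeautifulSoup(html, 'html.parser')
name = soup.find('div', {'class': 'product-information'}).find('h1').text.strip()
vendor = soup.find('div', {'class': 'profile-info'}).find('a').text.strip()
category = soup.find('div', {'class': 'profile-info'}).find('p').find('a').text.strip()
usd = soup.find('div', {'class': 'total-price'}).find('span').text.strip()
print(name, vendor, category, usd)   # Example product example_vendor Software 19.99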


setup.ini (+5, -5)

@@ -1,12 +1,12 @@
[TOR]
firefox_binary_path = C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe
firefox_profile_path = C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\TorBrowser\\Data\\Browser\\profile.default
geckodriver_path = C:\\NSF-REU\\dw_pipeline_test\\selenium\\geckodriver.exe
firefox_binary_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe
firefox_profile_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\TorBrowser\\Data\\Browser\\profile.default
geckodriver_path = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test\\selenium\\geckodriver.exe
[Project]
project_directory = C:\\NSF-REU\\dw_pipeline_test
shared_folder = \\VBoxSvr\\VM_Files_(shared)
project_directory = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test
shared_folder = \\VBoxSvr\\Shared
[PostgreSQL]
ip = localhost

