Browse Source

Completed and tested all parsers for Procrax

main
Khoi 1 year ago
parent
commit
a6bdb89850
60 changed files with 279 additions and 246 deletions
  1. +1
    -0
      .gitignore
  2. BIN
      Forums/AbyssForum/__pycache__/crawler_selenium.cpython-310.pyc
  3. BIN
      Forums/AbyssForum/__pycache__/crawler_selenium.cpython-311.pyc
  4. BIN
      Forums/AbyssForum/__pycache__/parser.cpython-310.pyc
  5. BIN
      Forums/AbyssForum/__pycache__/parser.cpython-311.pyc
  6. BIN
      Forums/Altenens/__pycache__/crawler_selenium.cpython-310.pyc
  7. BIN
      Forums/Altenens/__pycache__/crawler_selenium.cpython-311.pyc
  8. BIN
      Forums/Altenens/__pycache__/parser.cpython-310.pyc
  9. BIN
      Forums/Altenens/__pycache__/parser.cpython-311.pyc
  10. BIN
      Forums/CryptBB/__pycache__/__init__.cpython-311.pyc
  11. BIN
      Forums/CryptBB/__pycache__/crawler_selenium.cpython-310.pyc
  12. BIN
      Forums/CryptBB/__pycache__/crawler_selenium.cpython-311.pyc
  13. BIN
      Forums/CryptBB/__pycache__/parser.cpython-310.pyc
  14. BIN
      Forums/CryptBB/__pycache__/parser.cpython-311.pyc
  15. BIN
      Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-310.pyc
  16. BIN
      Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-311.pyc
  17. BIN
      Forums/HiddenAnswers/__pycache__/parser.cpython-310.pyc
  18. BIN
      Forums/HiddenAnswers/__pycache__/parser.cpython-311.pyc
  19. BIN
      Forums/Initialization/__pycache__/__init__.cpython-310.pyc
  20. BIN
      Forums/Initialization/__pycache__/__init__.cpython-311.pyc
  21. BIN
      Forums/Initialization/__pycache__/forums_mining.cpython-310.pyc
  22. BIN
      Forums/Initialization/__pycache__/forums_mining.cpython-311.pyc
  23. BIN
      Forums/Initialization/__pycache__/prepare_parser.cpython-310.pyc
  24. BIN
      Forums/Initialization/__pycache__/prepare_parser.cpython-311.pyc
  25. +3
    -3
      Forums/Initialization/forums_mining.py
  26. +77
    -0
      Forums/Initialization/geckodriver.log
  27. +5
    -0
      Forums/Initialization/prepare_parser.py
  28. BIN
      Forums/OnniForums/__pycache__/crawler_selenium.cpython-310.pyc
  29. BIN
      Forums/OnniForums/__pycache__/crawler_selenium.cpython-311.pyc
  30. BIN
      Forums/OnniForums/__pycache__/parser.cpython-310.pyc
  31. BIN
      Forums/OnniForums/__pycache__/parser.cpython-311.pyc
  32. BIN
      Forums/OnniForums/__pycache__/parser_script.cpython-311.pyc
  33. +27
    -24
      Forums/Procrax/crawler_selenium.py
  34. +73
    -194
      Forums/Procrax/parser.py
  35. BIN
      MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-310.pyc
  36. BIN
      MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc
  37. BIN
      MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-310.pyc
  38. BIN
      MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc
  39. BIN
      MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc
  40. BIN
      MarketPlaces/Initialization/__pycache__/__init__.cpython-310.pyc
  41. BIN
      MarketPlaces/Initialization/__pycache__/__init__.cpython-311.pyc
  42. BIN
      MarketPlaces/Initialization/__pycache__/markets_mining.cpython-310.pyc
  43. BIN
      MarketPlaces/Initialization/__pycache__/markets_mining.cpython-311.pyc
  44. BIN
      MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-310.pyc
  45. BIN
      MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc
  46. +70
    -0
      MarketPlaces/Initialization/geckodriver.log
  47. +1
    -1
      MarketPlaces/Initialization/marketsList.txt
  48. BIN
      MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-310.pyc
  49. BIN
      MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc
  50. BIN
      MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-310.pyc
  51. BIN
      MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc
  52. +22
    -24
      MarketPlaces/M00nkeyMarket/crawler_selenium.py
  53. BIN
      MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-310.pyc
  54. BIN
      MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-311.pyc
  55. BIN
      MarketPlaces/Tor2door/__pycache__/parser.cpython-310.pyc
  56. BIN
      MarketPlaces/Tor2door/__pycache__/parser.cpython-311.pyc
  57. BIN
      MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-310.pyc
  58. BIN
      MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc
  59. BIN
      MarketPlaces/TorBay/__pycache__/parser.cpython-310.pyc
  60. BIN
      MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc

+ 1
- 0
.gitignore View File

@ -2,6 +2,7 @@
/shelf/
.idea/workspace.xml
selenium/geckodriver.exe
__pycache__
setup.ini
*.html
*.log

BIN
Forums/AbyssForum/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
Forums/AbyssForum/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
Forums/AbyssForum/__pycache__/parser.cpython-310.pyc View File


BIN
Forums/AbyssForum/__pycache__/parser.cpython-311.pyc View File


BIN
Forums/Altenens/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
Forums/Altenens/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
Forums/Altenens/__pycache__/parser.cpython-310.pyc View File


BIN
Forums/Altenens/__pycache__/parser.cpython-311.pyc View File


BIN
Forums/CryptBB/__pycache__/__init__.cpython-311.pyc View File


BIN
Forums/CryptBB/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
Forums/CryptBB/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
Forums/CryptBB/__pycache__/parser.cpython-310.pyc View File


BIN
Forums/CryptBB/__pycache__/parser.cpython-311.pyc View File


BIN
Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
Forums/HiddenAnswers/__pycache__/parser.cpython-310.pyc View File


BIN
Forums/HiddenAnswers/__pycache__/parser.cpython-311.pyc View File


BIN
Forums/Initialization/__pycache__/__init__.cpython-310.pyc View File


BIN
Forums/Initialization/__pycache__/__init__.cpython-311.pyc View File


BIN
Forums/Initialization/__pycache__/forums_mining.cpython-310.pyc View File


BIN
Forums/Initialization/__pycache__/forums_mining.cpython-311.pyc View File


BIN
Forums/Initialization/__pycache__/prepare_parser.cpython-310.pyc View File


BIN
Forums/Initialization/__pycache__/prepare_parser.cpython-311.pyc View File


+ 3
- 3
Forums/Initialization/forums_mining.py View File

@ -99,9 +99,9 @@ if __name__ == '__main__':
forum = forum.replace('\n','')
print("Creating listing and description directories ... for " + forum)
createDirectory(forum)
time.sleep(5) # wait for directories to be created
input("Directories created successfully. Press ENTER to continue\n")
# createDirectory(forum)
# time.sleep(5) # wait for directories to be created
# input("Directories created successfully. Press ENTER to continue\n")
if forum == "BestCardingWorld":


+ 77
- 0
Forums/Initialization/geckodriver.log View File

@ -11198,3 +11198,80 @@ unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689622469580 geckodriver INFO Listening on 127.0.0.1:58866
1689622474728 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "58867" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofile5gOLDP"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689622475417 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:58867/devtools/browser/9a3a8de2-439e-425e-b415-f975abd86b65
1689622476941 Marionette INFO Listening on port 58873
1689622477054 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofile5gOLDP\thumbnails) because it does not exist
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: undefined, line 0: Error: Missing host permission for the tab
JavaScript error: undefined, line 0: Error: Missing host permission for the tab
1689624030995 Marionette INFO Stopped listening on port 58873
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofile5gOLDP\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1689624031467 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689624276336 geckodriver INFO Listening on 127.0.0.1:59792
1689624280979 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "59793" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofileSTe5EC"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689624281509 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:59793/devtools/browser/222a61fa-a958-4978-8048-bb632f658131
1689624283001 Marionette INFO Listening on port 59799
1689624283405 RemoteAgent WARN TLS certificate errors will be ignored for this session
1689624692072 Marionette INFO Stopped listening on port 59799
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofileSTe5EC\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1689624692916 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138

+ 5
- 0
Forums/Initialization/prepare_parser.py View File

@ -9,6 +9,7 @@ from Forums.BestCardingWorld.parser import *
from Forums.CryptBB.parser import *
from Forums.OnniForums.parser import *
from Forums.Altenens.parser import *
from Forums.Procrax.parser import *
from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@ -154,6 +155,8 @@ def new_parse(forum, url, createLog):
rmm = onniForums_description_parser(soup)
elif forum == "Altenens":
rmm = altenens_description_parser(soup)
elif forum == "Procrax":
rmm = procrax_description_parser(soup)
# key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
key = u"Url:" + os.path.basename(line2).replace(".html", "")
@ -233,6 +236,8 @@ def new_parse(forum, url, createLog):
rw = onniForums_listing_parser(soup)
elif forum == "Altenens":
rw = altenens_listing_parser(soup)
elif forum == "Procrax":
rw = procrax_listing_parser(soup)
except:


BIN
Forums/OnniForums/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
Forums/OnniForums/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
Forums/OnniForums/__pycache__/parser.cpython-310.pyc View File


BIN
Forums/OnniForums/__pycache__/parser.cpython-311.pyc View File


BIN
Forums/OnniForums/__pycache__/parser_script.cpython-311.pyc View File


+ 27
- 24
Forums/Procrax/crawler_selenium.py View File

@ -26,24 +26,28 @@ from Forums.Procrax.parser import procrax_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'https://procrax.cx/'
BASE_URL = 'https://procrax.cx/'
FORUM_NAME = 'Procrax'
# Opens Tor Browser, crawls the website
def startCrawling():
opentor()
# forumName = getForumName()
driver = getAccess()
# opentor()
# driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
# new_parse(forumName, False)
new_parse(
forum=FORUM_NAME,
url=BASE_URL,
createLog=False
)
# Opens Tor Browser
@ -139,10 +143,9 @@ def createFFDriver():
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)# open url in browser
driver.get(BASE_URL)# open url in browser
return driver
except:
driver.close()# close tab
@ -162,7 +165,7 @@ def savePage(page, url):
def getFullPathName(url):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + FORUM_NAME + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
@ -185,17 +188,17 @@ def getInterestedLinks():
links = []
# # general hacking
# links.append('https://procrax.cx/forums/general-hacking.24/')
links.append('https://procrax.cx/forums/general-hacking.24/')
# # hacking security tools
# links.append('https://procrax.cx/forums/hacking-security-tools.20/')
links.append('https://procrax.cx/forums/hacking-security-tools.20/')
# # hacktube
# links.append('https://procrax.cx/forums/hacktube.22/')
links.append('https://procrax.cx/forums/hacktube.22/')
# # cardable
# links.append('https://procrax.cx/forums/cardable-websites.28/')
# # tools
# links.append('https://procrax.cx/forums/tools-bots-validators.73/')
# general forum
links.append('https://procrax.cx/forums/forum-discussions-updates.7/')
# links.append('https://procrax.cx/forums/forum-discussions-updates.7/')
return links
@ -229,7 +232,7 @@ def crawlForum(driver):
page = topic
while has_next_topic_page:
itemURL = urlparse.urljoin(baseURL, str(page))
itemURL = urlparse.urljoin(BASE_URL, str(page))
try:
driver.get(itemURL)
except:
@ -237,8 +240,8 @@ def crawlForum(driver):
savePage(driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
break
# if counter == 2:
# break
try:
page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
@ -254,10 +257,10 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:
if count == 20:
break
try:


+ 73
- 194
Forums/Procrax/parser.py View File

@ -7,11 +7,12 @@ from datetime import timedelta
import re
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, ResultSet, Tag
# This is the method to parse the Description Pages (one page for each topic in the Listing Pages)
def cryptBB_description_parser(soup):
def procrax_description_parser(soup: Tag):
# Fields to be parsed
@ -27,146 +28,36 @@ def cryptBB_description_parser(soup):
# Finding the topic (should be just one coming from the Listing Page)
li = soup.find("td", {"class": "thead"}).find('strong')
li = soup.find("h1", {"class": "p-title-value"})
topic = li.text
topic = re.sub("\[\w*\]", '', topic)
topic = topic.replace(",","")
topic = topic.replace("\n","")
topic = cleanString(topic.strip())
# Finding the repeated tag that corresponds to the listing of posts
# try:
posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
'div', {"class": "post"})
# For each message (post), get all the fields we are interested in:
for ipost in posts:
# Finding a first level of the HTML page
post_wrapper = ipost.find('span', {"class": "largetext"})
# Finding the author (user) of the post
author = post_wrapper.text.strip()
user.append(cleanString(author)) # Remember to clean the problematic characters
# Finding the status of the author
smalltext = ipost.find('div', {"class": "post_author"})
'''
# Testing here two possibilities to find this status and combine them
if ipost.find('div', {"class": "deleted_post_author"}):
status.append(-1)
interest.append(-1)
reputation.append(-1)
addDate.append(-1)
post.append("THIS POST HAS BEEN REMOVED!")
sign.append(-1)
feedback.append(-1)
continue
'''
# CryptBB does have membergroup and postgroup
membergroup = smalltext.find('div', {"class": "profile-rank"})
postgroup = smalltext.find('div', {"class": "postgroup"})
if membergroup != None:
membergroup = membergroup.text.strip()
if postgroup != None:
postgroup = postgroup.text.strip()
membergroup = membergroup + " - " + postgroup
else:
if postgroup != None:
membergroup = postgroup.text.strip()
else:
membergroup = "-1"
status.append(cleanString(membergroup))
# Finding the interest of the author
# CryptBB does not have blurb
blurb = smalltext.find('li', {"class": "blurb"})
if blurb != None:
blurb = blurb.text.strip()
else:
blurb = "-1"
interest.append(cleanString(blurb))
# Finding the reputation of the user
# CryptBB does have reputation
author_stats = smalltext.find('div', {"class": "author_statistics"})
karma = author_stats.find('strong')
if karma != None:
karma = karma.text
karma = karma.replace("Community Rating: ", "")
karma = karma.replace("Karma: ", "")
karma = karma.strip()
else:
karma = "-1"
reputation.append(cleanString(karma))
# Getting another useful tag here to find the post date, the post content and the user's signature
postarea = ipost.find('div', {"class": "post_content"})
dt = postarea.find('span', {"class": "post_date"}).text
# dt = dt.strip().split()
dt = dt.strip()
day=date.today()
if "Yesterday" in dt:
yesterday = day - timedelta(days=1)
yesterday = yesterday.strftime('%m-%d-%Y')
stime = dt.replace('Yesterday,','').strip()
date_time_obj = yesterday+ ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
elif "hours ago" in dt:
day = day.strftime('%m-%d-%Y')
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
else:
date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
stime = date_time_obj.strftime('%b %d, %Y')
sdate = date_time_obj.strftime('%I:%M %p')
addDate.append(date_time_obj)
# Finding the post
inner = postarea.find('div', {"class": "post_body scaleimages"})
inner = inner.text.strip()
post.append(cleanString(inner))
# Finding the user's signature
# signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
signature = ipost.find('div', {"class": "signature scaleimages"})
if signature != None:
signature = signature.text.strip()
# print(signature)
else:
signature = "-1"
sign.append(cleanString(signature))
# As no information about user's feedback was found, just assign "-1" to the variable
thread: ResultSet[Tag] = soup.find("div", {"class": "block-body js-replyNewMessageContainer"}).find_all("article", {"data-author": True})
for ipost in thread:
username = ipost.find("h4", {"class": "message-name"}).text
user.append(cleanString(username.strip()))
date_posted = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime")
datetime_obj = datetime.strptime(date_posted, "%Y-%m-%dT%H:%M:%S%z")
addDate.append(datetime_obj)
feedback.append("-1")
'''
except:
if soup.find('td', {"class": "trow1"}).text == " You do not have permission to access this page. ":
user.append("-1")
status.append(-1)
interest.append(-1)
reputation.append(-1)
addDate.append(-1)
post.append("NO ACCESS TO THIS PAGE!")
sign.append(-1)
feedback.append(-1)
'''
user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text
status.append(cleanString(user_status.strip()))
user_lvl = ipost.find("div", {"class": "afAwardLevel"}).text
reputation.append(cleanString(user_lvl.strip()))
sign.append("-1")
user_post = ipost.find("article", {"class": "message-body js-selectToQuote"}).text
post.append(cleanString(user_post.strip()))
interest.append("-1")
# Populate the final variable (this should be a list with all fields scraped)
@ -178,7 +69,7 @@ def cryptBB_description_parser(soup):
# This is the method to parse the Listing Pages (one page with many posts)
def cryptBB_listing_parser(soup):
def procrax_listing_parser(soup: Tag):
board = "-1" # board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
@ -193,59 +84,47 @@ def cryptBB_listing_parser(soup):
# Listing and Description pages)
# Finding the board (should be just one)
board = soup.find('span', {"class": "active"}).text
board = cleanString(board.strip())
# Finding the repeated tag that corresponds to the listing of topics
itopics = soup.find_all('tr',{"class": "inline_row"})
for itopic in itopics:
# For each topic found, the structure used to get the rest of the information can be one of two types.
# Both are tried so that no topic is missed
# Adding the topic to the topic list
try:
topics = itopic.find('span', {"class": "subject_old"}).find('a').text
except:
topics = itopic.find('span', {"class": "subject_new"}).find('a').text
topics = re.sub("\[\w*\]", '', topics)
topic.append(cleanString(topics))
# Counting how many topics we have found so far
nm = len(topic)
# Adding the url to the list of urls
try:
link = itopic.find('span', {"class": "subject_old"}).find('a').get('href')
except:
link = itopic.find('span',{"class": "subject_new"}).find('a').get('href')
link = cleanLink(link)
href.append(link)
# Finding the author of the topic
ps = itopic.find('div', {"class":"author smalltext"}).find('a').text
user = ps.strip()
author.append(cleanString(user))
# Finding the number of replies
columns = itopic.findChildren('td',recursive=False)
replies = columns[3].text
posts.append(cleanString(replies))
# Finding the number of Views
tview = columns[4].text
views.append(cleanString(tview))
# If there is no information about when the topic was added, just assign "-1" to the variable
addDate.append("-1")
return organizeTopics("CryptBB", nm, topic, board, author, views, posts, href, addDate)
li = soup.find("h1", {"class": "p-title-value"})
board = cleanString(li.text.strip())
threads_list: ResultSet[Tag] = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True})
nm = len(threads_list)
for thread in threads_list:
thread_title = thread.find("div", {"class": "structItem-title"}).text
topic.append(cleanString(thread_title.strip()))
thread_author = thread.get("data-author")
author.append(cleanString(thread_author))
thread_views = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find('dd').text
views.append(cleanString(thread_views.strip()))
thread_replies = thread.find("dl", {"class": "pairs pairs--justified"}).find('dd').text
# Every thread consists of one opening (topic) post plus its replies, so total posts = 1 + replies
thread_total_posts = str(1 + int(thread_replies))
posts.append(thread_total_posts)
thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime")
datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z")
addDate.append(datetime_obj)
thread_link = thread.find("div", {"class": "structItem-title"}).find('a').get('href')
href.append(thread_link)
return organizeTopics(
forum="Procrax",
nm=nm,
board=board,
author=author,
topic=topic,
views=views,
posts=posts,
addDate=addDate,
href=href
)
def procrax_links_parser(soup):


BIN
MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-310.pyc View File


BIN
MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc View File


BIN
MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc View File


BIN
MarketPlaces/Initialization/__pycache__/__init__.cpython-310.pyc View File


BIN
MarketPlaces/Initialization/__pycache__/__init__.cpython-311.pyc View File


BIN
MarketPlaces/Initialization/__pycache__/markets_mining.cpython-310.pyc View File


BIN
MarketPlaces/Initialization/__pycache__/markets_mining.cpython-311.pyc View File


BIN
MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-310.pyc View File


BIN
MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc View File


+ 70
- 0
MarketPlaces/Initialization/geckodriver.log View File

@ -15617,3 +15617,73 @@ unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689619116242 geckodriver INFO Listening on 127.0.0.1:57366
1689619118954 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "57367" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofile0Dg5aD"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689619119382 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:57367/devtools/browser/26c42825-1d86-4c6a-ad3b-817e084e0b36
1689619120284 Marionette INFO Listening on port 57373
1689619120428 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofile0Dg5aD\thumbnails) because it does not exist
1689619308722 Marionette INFO Stopped listening on port 57373
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofile0Dg5aD\thumbnails) because it does not exist
Crash Annotation GraphicsCriticalError: |[C0][GFX1-]: Receive IPC close with reason=AbnormalShutdown (t=1960.99)
###!!! [Child][MessageChannel] Error: (msgtype=0x3900E5,name=PContent::Msg_GraphicsError) Channel closing: too late to send/recv, messages will be lost
[GFX1-]: Receive IPC close with reason=AbnormalShutdown
1689619309292 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689619356214 geckodriver INFO Listening on 127.0.0.1:57526
1689619360407 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "57527" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofileUEfwdk"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689619360903 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:57527/devtools/browser/85530b1c-e7e2-4313-8c36-704d0f5ce7da
1689619362005 Marionette INFO Listening on port 57534
1689619362321 RemoteAgent WARN TLS certificate errors will be ignored for this session
1689619608554 Marionette INFO Stopped listening on port 57534
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofileUEfwdk\thumbnails) because it does not exist
1689619609120 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138

+ 1
- 1
MarketPlaces/Initialization/marketsList.txt View File

@ -1 +1 @@
AnonymousMarketplace
M00nkeyMarket

BIN
MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-310.pyc View File


BIN
MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc View File


+ 22
- 24
MarketPlaces/M00nkeyMarket/crawler_selenium.py View File

@ -27,25 +27,24 @@ from MarketPlaces.M00nkeyMarket.parser import m00nkey_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/'
BASE_URL = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/'
MARKET_NAME = 'M00nkeyMarket'
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
# driver = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
opentor()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
new_parse(mktName, baseURL, False)
new_parse(MARKET_NAME, BASE_URL, False)
# Opens Tor Browser
@ -64,16 +63,16 @@ def opentor():
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'M00nkeyMarket'
return name
# def getMKTName():
# name = 'M00nkeyMarket'
# return name
# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
url = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/'
return url
# def getFixedURL():
# url = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/'
# return url
# Closes Tor Browser
@ -127,10 +126,9 @@ def createFFDriver():
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
driver.get(BASE_URL)
return driver
except:
driver.close()
@ -175,7 +173,7 @@ def savePage(page, url):
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + MARKET_NAME + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
@ -237,7 +235,7 @@ def crawlForum(driver):
while has_next_page:
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
itemURL = urlparse.urljoin(BASE_URL, str(item))
try:
driver.get(itemURL)
except:


BIN
MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
MarketPlaces/Tor2door/__pycache__/parser.cpython-310.pyc View File


BIN
MarketPlaces/Tor2door/__pycache__/parser.cpython-311.pyc View File


BIN
MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-310.pyc View File


BIN
MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
MarketPlaces/TorBay/__pycache__/parser.cpython-310.pyc View File


BIN
MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc View File


Loading…
Cancel
Save