Browse Source

save html to shared folder examples

main
westernmeadow 1 year ago
parent
commit
7493fb30bb
14 changed files with 97 additions and 381 deletions
  1. +1
    -0
      .idea/.gitignore
  2. +1
    -3
      .idea/DW_Pipeline_Test.iml
  3. +1
    -1
      .idea/misc.xml
  4. +14
    -11
      Forums/CryptBB/crawler_selenium.py
  5. +2
    -2
      Forums/DB_Connection/db_connection.py
  6. +5
    -2
      Forums/Initialization/forums_mining.py
  7. +21
    -316
      Forums/Initialization/geckodriver.log
  8. +12
    -13
      Forums/Initialization/prepare_parser.py
  9. +2
    -2
      MarketPlaces/DB_Connection/db_connection.py
  10. +1
    -1
      MarketPlaces/Initialization/marketsList.txt
  11. +5
    -3
      MarketPlaces/Initialization/markets_mining.py
  12. +5
    -3
      MarketPlaces/Initialization/prepare_parser.py
  13. +21
    -17
      MarketPlaces/ThiefWorld/crawler_selenium.py
  14. +6
    -7
      setup.ini

+ 1
- 0
.idea/.gitignore View File

@ -2,6 +2,7 @@
/shelf/
/workspace.xml
/selenium/geckodriver.exe
setup.ini
*.html
*.log
*.png

+ 1
- 3
.idea/DW_Pipeline_Test.iml View File

@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.11" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="C:\Users\calsyslab\anaconda3" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyNamespacePackagesService">
@ -12,8 +12,6 @@
<option value="$MODULE_DIR$/Forums/CryptBB" />
<option value="$MODULE_DIR$/MarketPlaces/DarkFox" />
<option value="$MODULE_DIR$/MarketPlaces/Tor2door" />
<option value="$MODULE_DIR$/Forums/OnniForums" />
<option value="$MODULE_DIR$/MarketPlaces/ThiefWorld" />
</list>
</option>
</component>

+ 1
- 1
.idea/misc.xml View File

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="C:\Users\calsyslab\anaconda3" project-jdk-type="Python SDK" />
</project>

+ 14
- 11
Forums/CryptBB/crawler_selenium.py View File

@ -17,24 +17,21 @@ from PIL import Image
import urllib.parse as urlparse
import os, re, time
import subprocess
import configparser
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.CryptBB.parser import cryptBB_links_parser
from Forums.Utilities.utilities import cleanHTML
config = configparser.ConfigParser()
config.read('../../setup.ini')
counter = 1
baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/'
# Opens Tor Browser, crawls the website
def startCrawling():
forumName = getForumName()
# opentor()
forumName = getForumName()
# driver = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
@ -48,6 +45,8 @@ def startCrawling():
# Opens Tor Browser
def opentor():
from Forums.Initialization.forums_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
@ -133,6 +132,8 @@ def closetor(driver):
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from Forums.Initialization.forums_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@ -186,12 +187,14 @@ def savePage(page, url):
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
from Forums.Initialization.forums_mining import CURRENT_DATE
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = r'..\\CryptBB\\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = r'..\\CryptBB\\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
@ -208,10 +211,10 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
# Beginner Programming
# # Beginner Programming
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86')
# # Beginner Carding and Fraud
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=91')
# links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=91')
# # Beginner Hacking
# links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=87')
# # Newbie
@ -287,9 +290,9 @@ def crawlForum(driver):
if item == "":
raise NoSuchElementException
has_next_topic_page = False
else:
counter += 1
except NoSuchElementException:
has_next_topic_page = False


+ 2
- 2
Forums/DB_Connection/db_connection.py View File

@ -9,8 +9,8 @@ def connectDataBase():
try:
config = configparser.ConfigParser()
config.read('../../setup.ini')
from Forums.Initialization.forums_mining import config
ip = config.get('PostgreSQL', 'ip')
username = config.get('PostgreSQL', 'username')
password = config.get('PostgreSQL', 'password')


+ 5
- 2
Forums/Initialization/forums_mining.py View File

@ -13,9 +13,11 @@ from Forums.AbyssForum.crawler_selenium import crawler as crawlerAbyssForum
from Forums.Altenens.crawler_selenium import crawler as crawlerAltenensForum
from Forums.HiddenAnswers.crawler_selenium import crawler as crawlerHiddenAnswers
import configparser
import time
config = configparser.ConfigParser()
config.read('../../setup.ini')
CURRENT_DATE = str("%02d" % date.today().month) + str("%02d" % date.today().day) + str("%04d" % date.today().year)
@ -34,7 +36,8 @@ def createDirectory(forum):
if forum == 'Reddits':
pagesMainDir = '../' + forum
else:
pagesMainDir = '../' + forum + "/HTML_Pages"
# pagesMainDir = '../' + forum + "/HTML_Pages"
pagesMainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + forum + "/HTML_Pages")
if not os.path.isdir(pagesMainDir):
os.makedirs(pagesMainDir)


+ 21
- 316
Forums/Initialization/geckodriver.log View File

@ -5841,8 +5841,8 @@ unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687895546413 geckodriver INFO Listening on 127.0.0.1:52237
1687895550932 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "52238" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileiOR21Q"
1687896430885 geckodriver INFO Listening on 127.0.0.1:50135
1687896434527 mozrunner::runner INFO Running command: "C:\\Users\\calsyslab\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" ... "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\CALSYS~1\\AppData\\Local\\Temp\\rust_mozprofilenQCzgp"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
@ -5851,7 +5851,7 @@ console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687895551675 Marionette INFO Marionette enabled
1687896435185 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
@ -5859,240 +5859,30 @@ JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't fin
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:52238/devtools/browser/ad1dc524-5cad-4983-9dd6-c7f6f3d5caee
1687895553974 Marionette INFO Listening on port 52243
1687895554561 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileiOR21Q\thumbnails) because it does not exist
1687895804567 Marionette INFO Stopped listening on port 52243
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileiOR21Q\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687895804907 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687977218822 geckodriver INFO Listening on 127.0.0.1:51022
1687977226564 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51023" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileikuU2J"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687977228948 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51023/devtools/browser/3b0200ed-8dcd-4975-a337-55ca97127f81
1687977234067 Marionette INFO Listening on port 51028
1687977234672 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687977449724 Marionette INFO Stopped listening on port 51028
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileikuU2J\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687977450647 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687977513313 geckodriver INFO Listening on 127.0.0.1:51084
1687977521019 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51085" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileWUrtuT"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687977523015 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51085/devtools/browser/64d878ac-9491-4b68-8378-3cdcd42b86f9
1687977528316 Marionette INFO Listening on port 51090
1687977529126 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687978083314 Marionette INFO Stopped listening on port 51090
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileWUrtuT\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687978083874 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687978133464 geckodriver INFO Listening on 127.0.0.1:51172
1687978141034 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51173" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileu5IdWT"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687978143085 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51173/devtools/browser/92c771f3-77dc-4ad5-9787-19e461c45ad6
1687978148067 Marionette INFO Listening on port 51178
1687978148324 RemoteAgent WARN TLS certificate errors will be ignored for this session
###!!! [Child][MessageChannel] Error: (msgtype=0x390097,name=PContent::Msg_InitBackground) Channel closing: too late to send/recv, messages will be lost
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileu5IdWT\thumbnails) because it does not exist
1687984051859 Marionette INFO Stopped listening on port 51178
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileu5IdWT\thumbnails) because it does not exist
[Parent 6808, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687984052405 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
O Listening on port 51239
1687978539391 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileodTbYM\thumbnails) because it does not exist
1687984050773 Marionette INFO Stopped listening on port 51239
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileodTbYM\thumbnails) because it does not exist
[Parent 2612, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
1687984051727 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
s://localhost:51280/devtools/browser/d4d6f9cc-7d5f-45e3-8873-a460f62cc4cf
1687978926427 Marionette INFO Listening on port 51285
1687978926534 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687979030758 Marionette INFO Stopped listening on port 51285
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileanrFrL\thumbnails) because it does not exist
1687979031575 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687979050690 geckodriver INFO Listening on 127.0.0.1:51360
1687979053723 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51361" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofile0hAG1R"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687979054534 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51361/devtools/browser/92278a26-d591-4e02-9b50-6d94f582bba6
1687979056856 Marionette INFO Listening on port 51366
1687979057092 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687979258295 Marionette INFO Stopped listening on port 51366
DevTools listening on ws://localhost:50136/devtools/browser/773adaec-44e1-4b13-9fac-c38bfb170221
1687896436579 Marionette INFO Listening on port 50142
1687896436612 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\calsyslab\AppData\Local\Temp\rust_mozprofilenQCzgp\thumbnails) because it does not exist
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/, line 2: ReferenceError: $ is not defined
1687896481968 Marionette INFO Stopped listening on port 50142
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofile0hAG1R\thumbnails) because it does not exist
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\calsyslab\AppData\Local\Temp\rust_mozprofilenQCzgp\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687979258801 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687979267242 geckodriver INFO Listening on 127.0.0.1:51432
1687979271790 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51433" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofilexKgOT4"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687979272999 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51433/devtools/browser/cda6fecb-bd37-4670-968b-8a378fded89f
1687979276192 Marionette INFO Listening on port 51444
1687979276461 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
1687979332888 Marionette INFO Stopped listening on port 51444
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilexKgOT4\thumbnails) because it does not exist
[Parent 4980, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687979333650 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
1687896482482 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687979430724 geckodriver INFO Listening on 127.0.0.1:51502
1687979436324 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51503" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofilegHC201"
1687897304511 geckodriver INFO Listening on 127.0.0.1:50201
1687897308111 mozrunner::runner INFO Running command: "C:\\Users\\calsyslab\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" ... "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\CALSYS~1\\AppData\\Local\\Temp\\rust_mozprofile2TNTj7"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
@ -6101,7 +5891,7 @@ console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687979437856 Marionette INFO Marionette enabled
1687897308686 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
@ -6109,101 +5899,16 @@ JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't fin
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51503/devtools/browser/103a6f45-7bf6-46d2-8040-cefffb477152
1687979442204 Marionette INFO Listening on port 51508
1687979442652 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilegHC201\thumbnails) because it does not exist
1687984048079 Marionette INFO Stopped listening on port 51508
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilegHC201\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687984048659 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
vaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51622/devtools/browser/8b6f89c5-5489-4aa7-84ae-816a519ac6d2
1687983200540 Marionette INFO Listening on port 51627
1687983200642 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687984043915 Marionette INFO Stopped listening on port 51627
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofiletQSuzW\thumbnails) because it does not exist
[Parent 1532, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
1687984044451 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687989865551 geckodriver INFO Listening on 127.0.0.1:49687
1687989870785 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "49688" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileNUIghb"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687989872437 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:49688/devtools/browser/05e0b61d-92e1-4c2b-ac81-164fc698ee43
1687989876314 Marionette INFO Listening on port 49693
1687989876583 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687989882290 Marionette INFO Stopped listening on port 49693
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileNUIghb\thumbnails) because it does not exist
1687989883656 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687989967990 geckodriver INFO Listening on 127.0.0.1:53543
1687989972970 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "53544" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofile50PiiS"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687989974728 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:53544/devtools/browser/574837da-6642-43f8-a689-8dbe14b1e254
1687989978232 Marionette INFO Listening on port 53549
1687989978914 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687990165288 Marionette INFO Stopped listening on port 53549
DevTools listening on ws://localhost:50202/devtools/browser/c30256b0-c71f-40da-a95f-bb1313b3e35e
1687897310328 Marionette INFO Listening on port 50208
1687897310788 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/, line 2: ReferenceError: $ is not defined
1687897315273 Marionette INFO Stopped listening on port 50208
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
[Parent 8704, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
1687990165952 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
1687897315776 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109


+ 12
- 13
Forums/Initialization/prepare_parser.py View File

@ -2,12 +2,11 @@ __author__ = 'DarkWeb'
import codecs
import glob
import os
import os, re
import shutil
from Forums.DB_Connection.db_connection import *
from Forums.BestCardingWorld.parser import *
from Forums.CryptBB.parser import *
import re
from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@ -80,7 +79,7 @@ def persist_data(url, row, cur):
#calls the different parser methods here depending on the type of html page
def new_parse(forum, url, createLog):
from Forums.Initialization.forums_mining import CURRENT_DATE
from Forums.Initialization.forums_mining import config, CURRENT_DATE
print("Parsing The " + forum + " Forum and conduct data classification to store the information in the database.")
@ -110,12 +109,14 @@ def new_parse(forum, url, createLog):
" in the _Logs folder to read files from this Forum of this date again.")
raise SystemExit
# Reading the Listing Html Pages -> to memory
for fileListing in glob.glob(os.path.join("..\\" + forum + "\\HTML_Pages\\" + CURRENT_DATE + "\\Listing", '*.html')):
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + forum + "/HTML_Pages")
# Reading the Listing Html Pages
for fileListing in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')):
lines.append(fileListing)
# Reading the Description Html Pages -> to memory
for fileDescription in glob.glob(os.path.join("..\\" + forum + "\\HTML_Pages\\" + CURRENT_DATE + "\\Description" ,'*.html')):
# Reading the Description Html Pages
for fileDescription in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", '*.html')):
lns.append(fileDescription)
# Parsing the Description Pages and put the tag's content into a dictionary (Hash table)
@ -124,8 +125,8 @@ def new_parse(forum, url, createLog):
print("Reading description folder of '" + forum + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns)))
try:
html = codecs.open(line2.strip('\n'), encoding='utf8')#trying t open them in utf8 format
soup = BeautifulSoup(html, "html.parser")#throw into beautiful soup
html = codecs.open(line2.strip('\n'), encoding='utf8')
soup = BeautifulSoup(html, "html.parser")
html.close()
except:
@ -142,16 +143,14 @@ def new_parse(forum, url, createLog):
continue
try:
#Where actual parsing occurs
if forum == "BestCardingWorld":
rmm = bestcardingworld_description_parser(soup)
elif forum == "CryptBB":
rmm = cryptBB_description_parser(soup)
# key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
#essentially filename and url
key = u"Url:" + os.path.basename(line2).replace(".html", "")#should end with either no(page+num) or no page+num
key = u"Url:" + os.path.basename(line2).replace(".html", "")
# check if page or page exists at the end of a string followed by a series of numbers
#if yes add to other if no add to first page dictionary


+ 2
- 2
MarketPlaces/DB_Connection/db_connection.py View File

@ -9,8 +9,8 @@ def connectDataBase():
try:
config = configparser.ConfigParser()
config.read('../../setup.ini')
from MarketPlaces.Initialization.markets_mining import config
ip = config.get('PostgreSQL', 'ip')
username = config.get('PostgreSQL', 'username')
password = config.get('PostgreSQL', 'password')


+ 1
- 1
MarketPlaces/Initialization/marketsList.txt View File

@ -1 +1 @@
DarkMatter
ThiefWorld

+ 5
- 3
MarketPlaces/Initialization/markets_mining.py View File

@ -21,10 +21,11 @@ from MarketPlaces.CityMarket.crawler_selenium import crawler as crawlerCityMarke
from MarketPlaces.DarkMatter.crawler_selenium import crawler as crawlerDarkMatter
from MarketPlaces.M00nkeyMarket.crawler_selenium import crawler as crawlerM00nkeyMarket
import configparser
import time
config = configparser.ConfigParser()
config.read('../../setup.ini')
CURRENT_DATE = str("%02d" % date.today().month) + str("%02d" % date.today().day) + str("%04d" % date.today().year)
@ -40,7 +41,8 @@ def getMarkets():
def createDirectory(mkt):
# Package should already be there, holding crawler and parser
pagesDir = '../' + mkt + '/HTML_Pages'
# pagesDir = '../' + mkt + '/HTML_Pages'
pagesDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + mkt + "/HTML_Pages")
if not os.path.isdir(pagesDir):
os.makedirs(pagesDir)


+ 5
- 3
MarketPlaces/Initialization/prepare_parser.py View File

@ -71,7 +71,7 @@ def persist_data(url, row, cur):
def new_parse(marketPlace, url, createLog):
from MarketPlaces.Initialization.markets_mining import CURRENT_DATE
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
print("Parsing the " + marketPlace + " marketplace and conduct data classification to store the information in the database.")
@ -100,12 +100,14 @@ def new_parse(marketPlace, url, createLog):
" in the _Logs folder to read files from this Market Place of this date again.")
raise SystemExit
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + marketPlace + "/HTML_Pages")
# Reading the Listing Html Pages
for fileListing in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\" + CURRENT_DATE + "\\Listing", '*.html')):
for fileListing in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')):
lines.append(fileListing)
# Reading the Description Html Pages
for fileDescription in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\" + CURRENT_DATE + "\\Description", '*.html')):
for fileDescription in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", '*.html')):
lns.append(fileDescription)
# Parsing the Description Pages and put the tag's content into a dictionary (Hash table)


+ 21
- 17
MarketPlaces/ThiefWorld/crawler_selenium.py View File

@ -24,8 +24,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.ThiefWorld.parser import thiefworld_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
config = configparser.ConfigParser()
config.read('../../setup.ini')
counter = 1
baseURL = 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/'
@ -33,24 +31,26 @@ baseURL = 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
opentor()
# mktName = getMKTName()
driver = getAccess()
# opentor()
mktName = getMKTName()
# driver = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
# new_parse(forumName, baseURL, False)
new_parse(mktName, baseURL, False)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
@ -89,6 +89,8 @@ def closetor(driver):
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@ -161,12 +163,14 @@ def savePage(page, url):
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import CURRENT_DATE
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = r'..\ThiefWorld\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = r'..\ThiefWorld\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath


+ 6
- 7
setup.ini View File

@ -1,15 +1,14 @@
[TOR]
firefox_binary_path = C:\Users\\dabadcuber5\Desktop\Tor Browser\Browser\firefox.exe
firefox_profile_path = C:\Users\\dabadcuber5\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
geckodriver_path = C:\Users\\dabadcuber5\\PycharmProjects\dw_pipeline_test\selenium\geckodriver.exe
firefox_binary_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\firefox.exe
firefox_profile_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
geckodriver_path = C:\Users\calsyslab\Projects\dw_pipeline_test\selenium\geckodriver.exe
[Project]
project_directory = C:\Users\dabadcuber5\\PycharmProjects\dw_pipeline_test
shared_folder = \\VBoxSvr\VM_Files_(shared)
project_directory = C:\Users\calsyslab\Projects\dw_pipeline_test
shared_folder = \\VBoxSvr\Shared
[PostgreSQL]
ip = localhost
username = postgres
password = Ilovelucky1!
password = password
database = darkweb_markets_forums

Loading…
Cancel
Save