Browse Source

added more documentation, finished darkmatter marketplace crawler, finished newparser changes now will add topic pages to the first page library

main
Helium 1 year ago
parent
commit
0227f96aac
9 changed files with 1456 additions and 83 deletions
  1. +7
    -7
      Forums/AbyssForum/crawler_selenium.py
  2. +13
    -13
      Forums/CryptBB/crawler_selenium.py
  3. +1
    -1
      Forums/Initialization/forumsList.txt
  4. +332
    -0
      Forums/Initialization/geckodriver.log
  5. +58
    -23
      Forums/Initialization/prepare_parser.py
  6. +9
    -9
      Forums/OnniForums/crawler_selenium.py
  7. +12
    -12
      MarketPlaces/DarkMatter/crawler_selenium.py
  8. +2
    -2
      MarketPlaces/DarkMatter/parser.py
  9. +1022
    -16
      MarketPlaces/Initialization/geckodriver.log

+ 7
- 7
Forums/AbyssForum/crawler_selenium.py View File

@ -170,19 +170,19 @@ def getInterestedLinks():
links = [] links = []
# Hacked Database # Hacked Database
links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=26&sid=6f7add746810784861a7ec31703a3757')
# links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=26')
# Hire a Hacker # Hire a Hacker
# links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=27&sid=6f7add746810784861a7ec31703a3757')
links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=27')
# # Hacking Tools # # Hacking Tools
# links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=28&sid=6f7add746810784861a7ec31703a3757')
# links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=28')
# # Carding Forums # # Carding Forums
# links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=30&sid=6f7add746810784861a7ec31703a3757')
# links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=30')
# # Social Media Hacking # # Social Media Hacking
# links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=32&sid=6f7add746810784861a7ec31703a3757')
# links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=32')
# # Hacking Tutorials # # Hacking Tutorials
# links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=12&sid=6f7add746810784861a7ec31703a3757')
# links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=12')
# # Cracking Tutorials # # Cracking Tutorials
# links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=13&sid=6f7add746810784861a7ec31703a3757')
# links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=13')
return links return links


+ 13
- 13
Forums/CryptBB/crawler_selenium.py View File

@ -31,17 +31,17 @@ baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion
# Opens Tor Browser, crawls the website # Opens Tor Browser, crawls the website
def startCrawling(): def startCrawling():
opentor()
forumName = getForumName() forumName = getForumName()
driver = getAccess()
# opentor()
# driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
new_parse(forumName, baseURL, False) new_parse(forumName, baseURL, False)
@ -208,10 +208,10 @@ def getNameFromURL(url):
def getInterestedLinks(): def getInterestedLinks():
links = [] links = []
# # Beginner Programming
# Beginner Programming
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86') links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86')
# # Beginner Carding and Fraud # # Beginner Carding and Fraud
# links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=91')
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=91')
# # Beginner Hacking # # Beginner Hacking
# links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=87') # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=87')
# # Newbie # # Newbie
@ -264,7 +264,7 @@ def crawlForum(driver):
driver.refresh() driver.refresh()
savePage(driver.page_source, item) savePage(driver.page_source, item)
driver.back() driver.back()
'''
#variable to check if there is a next page for the topic #variable to check if there is a next page for the topic
has_next_topic_page = True has_next_topic_page = True
counter = 1 counter = 1
@ -296,7 +296,7 @@ def crawlForum(driver):
# end of loop # end of loop
for i in range(counter): for i in range(counter):
driver.back() driver.back()
'''
# comment out # comment out
break break


+ 1
- 1
Forums/Initialization/forumsList.txt View File

@ -1 +1 @@
OnniForums
CryptBB

+ 332
- 0
Forums/Initialization/geckodriver.log View File

@ -5877,3 +5877,335 @@ unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109 destructor()@TargetList.jsm:109
stop()@CDP.jsm:104 stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138 close()@RemoteAgent.jsm:138
1687977218822 geckodriver INFO Listening on 127.0.0.1:51022
1687977226564 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51023" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileikuU2J"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687977228948 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51023/devtools/browser/3b0200ed-8dcd-4975-a337-55ca97127f81
1687977234067 Marionette INFO Listening on port 51028
1687977234672 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687977449724 Marionette INFO Stopped listening on port 51028
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileikuU2J\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687977450647 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687977513313 geckodriver INFO Listening on 127.0.0.1:51084
1687977521019 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51085" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileWUrtuT"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687977523015 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51085/devtools/browser/64d878ac-9491-4b68-8378-3cdcd42b86f9
1687977528316 Marionette INFO Listening on port 51090
1687977529126 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687978083314 Marionette INFO Stopped listening on port 51090
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileWUrtuT\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687978083874 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687978133464 geckodriver INFO Listening on 127.0.0.1:51172
1687978141034 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51173" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileu5IdWT"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687978143085 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51173/devtools/browser/92c771f3-77dc-4ad5-9787-19e461c45ad6
1687978148067 Marionette INFO Listening on port 51178
1687978148324 RemoteAgent WARN TLS certificate errors will be ignored for this session
###!!! [Child][MessageChannel] Error: (msgtype=0x390097,name=PContent::Msg_InitBackground) Channel closing: too late to send/recv, messages will be lost
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileu5IdWT\thumbnails) because it does not exist
1687984051859 Marionette INFO Stopped listening on port 51178
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileu5IdWT\thumbnails) because it does not exist
[Parent 6808, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687984052405 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
O Listening on port 51239
1687978539391 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileodTbYM\thumbnails) because it does not exist
1687984050773 Marionette INFO Stopped listening on port 51239
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileodTbYM\thumbnails) because it does not exist
[Parent 2612, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
1687984051727 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
s://localhost:51280/devtools/browser/d4d6f9cc-7d5f-45e3-8873-a460f62cc4cf
1687978926427 Marionette INFO Listening on port 51285
1687978926534 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687979030758 Marionette INFO Stopped listening on port 51285
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileanrFrL\thumbnails) because it does not exist
1687979031575 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687979050690 geckodriver INFO Listening on 127.0.0.1:51360
1687979053723 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51361" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofile0hAG1R"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687979054534 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51361/devtools/browser/92278a26-d591-4e02-9b50-6d94f582bba6
1687979056856 Marionette INFO Listening on port 51366
1687979057092 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687979258295 Marionette INFO Stopped listening on port 51366
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofile0hAG1R\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687979258801 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687979267242 geckodriver INFO Listening on 127.0.0.1:51432
1687979271790 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51433" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofilexKgOT4"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687979272999 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51433/devtools/browser/cda6fecb-bd37-4670-968b-8a378fded89f
1687979276192 Marionette INFO Listening on port 51444
1687979276461 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
1687979332888 Marionette INFO Stopped listening on port 51444
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilexKgOT4\thumbnails) because it does not exist
[Parent 4980, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687979333650 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687979430724 geckodriver INFO Listening on 127.0.0.1:51502
1687979436324 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51503" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofilegHC201"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687979437856 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51503/devtools/browser/103a6f45-7bf6-46d2-8040-cefffb477152
1687979442204 Marionette INFO Listening on port 51508
1687979442652 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilegHC201\thumbnails) because it does not exist
1687984048079 Marionette INFO Stopped listening on port 51508
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilegHC201\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687984048659 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
vaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51622/devtools/browser/8b6f89c5-5489-4aa7-84ae-816a519ac6d2
1687983200540 Marionette INFO Listening on port 51627
1687983200642 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687984043915 Marionette INFO Stopped listening on port 51627
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofiletQSuzW\thumbnails) because it does not exist
[Parent 1532, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
1687984044451 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687989865551 geckodriver INFO Listening on 127.0.0.1:49687
1687989870785 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "49688" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileNUIghb"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687989872437 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:49688/devtools/browser/05e0b61d-92e1-4c2b-ac81-164fc698ee43
1687989876314 Marionette INFO Listening on port 49693
1687989876583 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687989882290 Marionette INFO Stopped listening on port 49693
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileNUIghb\thumbnails) because it does not exist
1687989883656 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687989967990 geckodriver INFO Listening on 127.0.0.1:53543
1687989972970 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "53544" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofile50PiiS"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687989974728 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:53544/devtools/browser/574837da-6642-43f8-a689-8dbe14b1e254
1687989978232 Marionette INFO Listening on port 53549
1687989978914 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687990165288 Marionette INFO Stopped listening on port 53549
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
[Parent 8704, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
1687990165952 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138

+ 58
- 23
Forums/Initialization/prepare_parser.py View File

@ -7,6 +7,7 @@ import shutil
from Forums.DB_Connection.db_connection import * from Forums.DB_Connection.db_connection import *
from Forums.BestCardingWorld.parser import * from Forums.BestCardingWorld.parser import *
from Forums.CryptBB.parser import * from Forums.CryptBB.parser import *
import re
from Forums.Classifier.classify_product import predict from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@ -96,7 +97,8 @@ def new_parse(forum, url, createLog):
lines = [] # listing pages lines = [] # listing pages
lns = [] # description pages lns = [] # description pages
detPage = {}
detPage = {} # first pages
other = {} # other pages
# Creating the log file for each Forum # Creating the log file for each Forum
if createLog: if createLog:
@ -108,11 +110,11 @@ def new_parse(forum, url, createLog):
" in the _Logs folder to read files from this Forum of this date again.") " in the _Logs folder to read files from this Forum of this date again.")
raise SystemExit raise SystemExit
# Reading the Listing Html Pages
# Reading the Listing Html Pages -> to memory
for fileListing in glob.glob(os.path.join("..\\" + forum + "\\HTML_Pages\\" + CURRENT_DATE + "\\Listing", '*.html')): for fileListing in glob.glob(os.path.join("..\\" + forum + "\\HTML_Pages\\" + CURRENT_DATE + "\\Listing", '*.html')):
lines.append(fileListing) lines.append(fileListing)
# Reading the Description Html Pages
# Reading the Description Html Pages -> to memory
for fileDescription in glob.glob(os.path.join("..\\" + forum + "\\HTML_Pages\\" + CURRENT_DATE + "\\Description" ,'*.html')): for fileDescription in glob.glob(os.path.join("..\\" + forum + "\\HTML_Pages\\" + CURRENT_DATE + "\\Description" ,'*.html')):
lns.append(fileDescription) lns.append(fileDescription)
@ -122,8 +124,8 @@ def new_parse(forum, url, createLog):
print("Reading description folder of '" + forum + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns))) print("Reading description folder of '" + forum + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns)))
try: try:
html = codecs.open(line2.strip('\n'), encoding='utf8')
soup = BeautifulSoup(html, "html.parser")
html = codecs.open(line2.strip('\n'), encoding='utf8')#trying t open them in utf8 format
soup = BeautifulSoup(html, "html.parser")#throw into beautiful soup
html.close() html.close()
except: except:
@ -140,17 +142,28 @@ def new_parse(forum, url, createLog):
continue continue
try: try:
#Where actual parsing occurs
if forum == "BestCardingWorld": if forum == "BestCardingWorld":
rmm = bestcardingworld_description_parser(soup) rmm = bestcardingworld_description_parser(soup)
elif forum == "CryptBB": elif forum == "CryptBB":
rmm = cryptBB_description_parser(soup) rmm = cryptBB_description_parser(soup)
# key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip() # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
key = u"Url:" + os.path.basename(line2).replace(".html", "")
#essentially filename and url
key = u"Url:" + os.path.basename(line2).replace(".html", "")#should end with either no(page+num) or no page+num
# check if page or page exists at the end of a string followed by a series of numbers
#if yes add to other if no add to first page dictionary
# save descritions into record in memory
check = re.compile(r'(?<=Page|page)[0-9]*')
if check.search(key):
# print(key, 'is an other page\n')
other[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
else:
# print(key, 'is a first page\n')
detPage[key] = {'rmm': rmm, 'files': [os.path.basename(line2)]}
# save file address with description record in memory
detPage[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
except: except:
@ -159,6 +172,27 @@ def new_parse(forum, url, createLog):
if createLog: if createLog:
logFile.write(str(nError) + ". There was a problem to parse the file " + line2 + " in the Description section.\n") logFile.write(str(nError) + ". There was a problem to parse the file " + line2 + " in the Description section.\n")
# goes through keys from detPage and other, checks if the keys match.
# if yes adds other[key] values to detPage w/o overwritting
for key in detPage.keys():
for k in list(other.keys()):
checkkey = str(key[4:])
checkk = str(k[4:])
if checkkey in checkk:
detPage[key]['rmm'][1].extend(other[k]['rmm'][1])
detPage[key]['rmm'][2].extend(other[k]['rmm'][2])
detPage[key]['rmm'][3].extend(other[k]['rmm'][3])
detPage[key]['rmm'][4].extend(other[k]['rmm'][4])
detPage[key]['rmm'][5].extend(other[k]['rmm'][5])
detPage[key]['rmm'][6].extend(other[k]['rmm'][6])
detPage[key]['rmm'][7].extend(other[k]['rmm'][7])
detPage[key]['rmm'][8].extend(other[k]['rmm'][8])
detPage[key]['files'].append(other[k]['filename'])
other.pop(k)
# Parsing the Listing Pages and put the tag's content into a list # Parsing the Listing Pages and put the tag's content into a list
for index, line1 in enumerate(lines): for index, line1 in enumerate(lines):
@ -251,20 +285,21 @@ def new_parse(forum, url, createLog):
if not persistError: if not persistError:
# move description files of completed folder # move description files of completed folder
source = line2.replace(os.path.basename(line2), "") + detPage[key]['filename']
destination = line2.replace(os.path.basename(line2), "") + r'Read/'
try:
shutil.move(source, destination)
num_persisted_moved += 1
except:
print("There was a problem to move the file " + detPage[key]['filename'] + " in the Description section!")
nError += 1
if createLog:
logFile.write(
str(nError) + ". There was a problem to move the file " + detPage[key]['filename'] + " in the Description section!.\n")
moveError = True
for filename in detPage[key]['files']:
source = line2.replace(os.path.basename(line2), "") + filename
destination = line2.replace(os.path.basename(line2), "") + r'Read/'
try:
shutil.move(source, destination)
num_persisted_moved += 1
except:
print("There was a problem to move the file " + filename + " in the Description section!")
nError += 1
if createLog:
logFile.write(
str(nError) + ". There was a problem to move the file " + filename + " in the Description section!.\n")
moveError = True
# if the associated description page is not read or not parsed # if the associated description page is not read or not parsed
else: else:


+ 9
- 9
Forums/OnniForums/crawler_selenium.py View File

@ -189,10 +189,10 @@ def getNameFromURL(url):
def getInterestedLinks(): def getInterestedLinks():
links = [] links = []
# Hacking & Cracking tutorials
links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-tutorials')
# # Hacking & Cracking tutorials
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-tutorials')
# Hacking & Cracking questions # Hacking & Cracking questions
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-questions')
links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-questions')
# # Exploit PoCs # # Exploit PoCs
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Exploit-PoCs') # links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Exploit-PoCs')
# # Cracked software # # Cracked software
@ -282,12 +282,12 @@ def crawlForum(driver):
# comment out, one topic per page # comment out, one topic per page
break
# comment out, go through all pages
if count == 1:
count = 0
break
# break
#
# # comment out, go through all pages
# if count == 1:
# count = 0
# break
try: try:
temp = driver.find_element(by=By.XPATH, value= temp = driver.find_element(by=By.XPATH, value=


+ 12
- 12
MarketPlaces/DarkMatter/crawler_selenium.py View File

@ -2,7 +2,8 @@ __author__ = 'Helium'
''' '''
DarkMatter Marketplace Crawler (Selenium) DarkMatter Marketplace Crawler (Selenium)
this is a small marketplace so next page links are not coded in
website has connection issues
not working still trying to debug
''' '''
from selenium import webdriver from selenium import webdriver
@ -180,14 +181,12 @@ def getNameFromURL(url):
def getInterestedLinks(): def getInterestedLinks():
links = [] links = []
# fraud software
links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=76')
# other
links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=133')
# # hacking
# digital
links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=73')
# # hack guides
# links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=94') # links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=94')
# # carding
# links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=118')
# # services
# links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=117')
# # software/malware # # software/malware
# links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=121') # links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=121')
@ -220,6 +219,7 @@ def crawlForum(driver):
has_next_page = True has_next_page = True
while has_next_page: while has_next_page:
list = productPages(html) list = productPages(html)
for item in list: for item in list:
itemURL = urlparse.urljoin(baseURL, str(item)) itemURL = urlparse.urljoin(baseURL, str(item))
try: try:
@ -238,9 +238,9 @@ def crawlForum(driver):
break break
try: try:
temp = driver.find_element(by=By.XPATH, value=
'/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]/tbody/tr')
link = temp.find_element(by=By.CLASS_NAME, value='button page-num').get_attribute('href')
nav = driver.find_element(by=By.XPATH, value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]')
a = nav.find_element(by=By.LINK_TEXT, value=">")
link = a.get_attribute('href')
if link == "": if link == "":
raise NoSuchElementException raise NoSuchElementException
try: try:
@ -268,7 +268,7 @@ def crawlForum(driver):
#@param: url of any url crawled #@param: url of any url crawled
#return: true if is a description page, false if not #return: true if is a description page, false if not
def isDescriptionLink(url): def isDescriptionLink(url):
if 'product/' in url and '/products/?category' not in url:
if 'products/' in url and '/products/?category' not in url:
return True return True
return False return False


+ 2
- 2
MarketPlaces/DarkMatter/parser.py View File

@ -281,10 +281,10 @@ def darkmatter_links_parser(soup):
# Returning all links that should be visited by the Crawler # Returning all links that should be visited by the Crawler
href = [] href = []
listing = soup.findAll('td', {"class": "lefted"})
listing = soup.findAll('td', {"class": "lefted", 'colspan': '2'})
for a in listing: for a in listing:
bae = a.find('a', {"class": "lg bold"}, href=True)
bae = a.find('a', href=True)
link = bae['href'] link = bae['href']
href.append(link) href.append(link)

+ 1022
- 16
MarketPlaces/Initialization/geckodriver.log
File diff suppressed because it is too large
View File


Loading…
Cancel
Save