
finished more crawlers

main
Helium committed 1 year ago
parent commit 6b8ea82e6c
18 changed files with 8874 additions and 4 deletions
 1. +1    -1  Forums/Initialization/forumsList.txt
 2. +3    -0  Forums/Initialization/forums_mining.py
 3. +283  -0  Forums/Initialization/geckodriver.log
 4. +298  -0  MarketPlaces/AnonymousMarketplace/crawler_selenium.py
 5. +1483 -0  MarketPlaces/AnonymousMarketplace/geckodriver.log
 6. +291  -0  MarketPlaces/AnonymousMarketplace/parser.py
 7. +312  -0  MarketPlaces/CypherMarketplace/crawler_selenium.py
 8. +1483 -0  MarketPlaces/CypherMarketplace/geckodriver.log
 9. +291  -0  MarketPlaces/CypherMarketplace/parser.py
10. +298  -0  MarketPlaces/DarkTor/crawler_selenium.py
11. +1483 -0  MarketPlaces/DarkTor/geckodriver.log
12. +291  -0  MarketPlaces/DarkTor/parser.py
13. +303  -0  MarketPlaces/DigitalThriftShop/crawler_selenium.py
14. +1483 -0  MarketPlaces/DigitalThriftShop/geckodriver.log
15. +291  -0  MarketPlaces/DigitalThriftShop/parser.py
16. +270  -2  MarketPlaces/Initialization/geckodriver.log
17. +1    -1  MarketPlaces/Initialization/marketsList.txt
18. +9    -0  MarketPlaces/Initialization/markets_mining.py

+1 -1  Forums/Initialization/forumsList.txt

@@ -1 +1 @@
-AbyssForum
+CryptBB

+3 -0  Forums/Initialization/forums_mining.py

@@ -106,6 +106,9 @@ if __name__ == '__main__':
    elif forum == "Altenens":
        crawlerAltenensForum()
    print("Scraping process completed successfully!")


+283 -0  Forums/Initialization/geckodriver.log

@@ -5432,3 +5432,286 @@ unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687455077757 geckodriver INFO Listening on 127.0.0.1:51921
1687455083576 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51922" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileiMnt9Y"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687455084579 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51922/devtools/browser/50fc5309-4448-4643-b620-1944f94d9ab0
1687455087004 Marionette INFO Listening on port 51927
1687455087586 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687455216600 Marionette INFO Stopped listening on port 51927
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
###!!! [Child][MessageChannel] Error: (msgtype=0x59000C,name=PHttpChannel::Msg___delete__) Channel closing: too late to send/recv, messages will be lost
###!!! [Child][MessageChannel] Error: (msgtype=0x690023,name=PNecko::Msg_RemoveRequestContext) Channel closing: too late to send/recv, messages will be lost
1687455216907 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687455226201 geckodriver INFO Listening on 127.0.0.1:51997
1687455228295 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51998" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileexb7nL"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687455228607 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51998/devtools/browser/888a7285-4a85-4e1a-8fbe-55e6a2adbf66
1687455229445 Marionette INFO Listening on port 52003
1687455229702 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://fonts.googleapis.com/css?family=Nunito:300,300i,400,400i,500,500i,600,600i,700,700i without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/341e4a57816af3ba440d891ca87450ff_framework.css.263e0b2a021157e94abb049d32a231b2.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/05e81b71abe4f22d6eb8d1a929494829_responsive.css.2b22c143d0e5ca765c62096277bba194.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/20446cf2d164adcc029377cb04d43d17_flags.css.758a08424164d23f112b6ec2481369a2.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/5e53142098efe430fe4640eabd79b3c6_menuicons.css.a918687a524788f5b223ba01803d2eab.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/79bfb05544daeca4ec6d8979c07ae88e_button.css.aa1bb91a1e01bc96d9b98955b914d67b.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/72f83c71d84327caf917932c957d1930_tthumb.css.3c2c897b5cd04a111e36f36918bec493.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/24bd4821c74747f952801a16eb6c868a_main.css.b1810b27e513fa774b6890ab866909df.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/13136374e1b49ed2a6386fef2a59fb0c_skins.css.72d5c72729ec031009e6afe8feb39e0a.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/90eb5adf50a8c640f633d47fd7eb1778_core.css.e526a33eaadabb033be3513cebde0838.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/5a0da001ccc2200dc5625c3f3934497d_core_responsive.css.d69517154f8e6fde7b033562b64c824f.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/62e269ced0fdab7e30e026f1d30ae516_forums.css.4050a065985ae566d2046d5e66d93755.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/76e62c573090645fb99a15a363d8620e_forums_responsive.css.788fdb90de5b90f1bad497cfcdaf3343.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/258adbb6e4f3e83cd3b355f84e3fa002_custom.css.eadf39a11fb30964439d233cba16e549.css?v=026d4e282a without a browsingContextID set
console.error: (new TypeError("container.editor is undefined", "resource://devtools/client/inspector/markup/markup.js", 1619))
console.warn: "Resource of root-node was not found."
console.warn: "Resource of root-node was not found."
console.warn: "Resource of root-node was not found."
console.warn: "Resource of root-node was not found."
console.warn: "Resource of root-node was not found."
console.error: (new TypeError("currentNode is null", "resource://devtools/client/inspector/shared/style-change-tracker.js", 66))
TypeError: currentNode is null: canMutationImpactCurrentStyles@resource://devtools/client/inspector/shared/style-change-tracker.js:66:20
onMutations@resource://devtools/client/inspector/shared/style-change-tracker.js:84:41
_emit@resource://devtools/shared/event-emitter.js:226:34
emit@resource://devtools/shared/event-emitter.js:172:18
emit@resource://devtools/shared/event-emitter.js:324:18
getMutations@resource://devtools/client/fronts/walker.js:311:10
1687455743435 Marionette INFO Stopped listening on port 52003
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileexb7nL\thumbnails) because it does not exist
1687455743784 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687455784593 geckodriver INFO Listening on 127.0.0.1:52087
1687455788326 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "52088" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileR6GZWe"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687455788927 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:52088/devtools/browser/e0042277-a371-4056-af3d-85ce0875f158
1687455790458 Marionette INFO Listening on port 52093
1687455791011 RemoteAgent WARN TLS certificate errors will be ignored for this session
SourceActor threw an exception: [Exception... "Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/44357726-27b9-4d16-b4e7-4e6f59447a8b'" nsresult: "0x805303f4 (<unknown>)" location: "JS frame :: resource://devtools/shared/DevToolsUtils.js :: mainThreadFetch/< :: line 670" data: yes]
Stack: mainThreadFetch/<@resource://devtools/shared/DevToolsUtils.js:670:15
mainThreadFetch@resource://devtools/shared/DevToolsUtils.js:516:10
_fetchURLContents@resource://devtools/server/actors/utils/sources-manager.js:442:22
urlContents@resource://devtools/server/actors/utils/sources-manager.js:406:17
_resurrectSource@resource://devtools/server/actors/thread.js:2142:35
addAllSources@resource://devtools/server/actors/thread.js:1509:14
watch@resource://devtools/server/actors/resources/sources.js:52:17
watchResources@resource://devtools/server/actors/resources/index.js:239:19
_watchTargetResources@resource://devtools/server/actors/targets/target-actor-mixin.js:156:24
addWatcherDataEntry@resource://devtools/server/actors/targets/target-actor-mixin.js:47:20
_addWatcherDataEntry@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:483:24
receiveMessage@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:425:21
Line: 670, column: 0
console.error: ({})
SourceActor threw an exception: [Exception... "Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/d0bda8df-b93e-42d3-a860-8f679aef9868'" nsresult: "0x805303f4 (<unknown>)" location: "JS frame :: resource://devtools/shared/DevToolsUtils.js :: mainThreadFetch/< :: line 670" data: yes]
Stack: mainThreadFetch/<@resource://devtools/shared/DevToolsUtils.js:670:15
mainThreadFetch@resource://devtools/shared/DevToolsUtils.js:516:10
_fetchURLContents@resource://devtools/server/actors/utils/sources-manager.js:442:22
urlContents@resource://devtools/server/actors/utils/sources-manager.js:406:17
_resurrectSource@resource://devtools/server/actors/thread.js:2142:35
addAllSources@resource://devtools/server/actors/thread.js:1509:14
watch@resource://devtools/server/actors/resources/sources.js:52:17
watchResources@resource://devtools/server/actors/resources/index.js:239:19
_watchTargetResources@resource://devtools/server/actors/targets/target-actor-mixin.js:156:24
addWatcherDataEntry@resource://devtools/server/actors/targets/target-actor-mixin.js:47:20
_addWatcherDataEntry@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:483:24
receiveMessage@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:425:21
Line: 670, column: 0
console.error: ({})
JavaScript error: resource://devtools/shared/DevToolsUtils.js, line 670: Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/44357726-27b9-4d16-b4e7-4e6f59447a8b'
JavaScript error: resource://devtools/shared/DevToolsUtils.js, line 670: Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/d0bda8df-b93e-42d3-a860-8f679aef9868'
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://fonts.googleapis.com/css?family=Nunito:300,300i,400,400i,500,500i,600,600i,700,700i without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/341e4a57816af3ba440d891ca87450ff_framework.css.263e0b2a021157e94abb049d32a231b2.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/05e81b71abe4f22d6eb8d1a929494829_responsive.css.2b22c143d0e5ca765c62096277bba194.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/20446cf2d164adcc029377cb04d43d17_flags.css.758a08424164d23f112b6ec2481369a2.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/5e53142098efe430fe4640eabd79b3c6_menuicons.css.a918687a524788f5b223ba01803d2eab.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/79bfb05544daeca4ec6d8979c07ae88e_button.css.aa1bb91a1e01bc96d9b98955b914d67b.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/72f83c71d84327caf917932c957d1930_tthumb.css.3c2c897b5cd04a111e36f36918bec493.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/24bd4821c74747f952801a16eb6c868a_main.css.b1810b27e513fa774b6890ab866909df.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/13136374e1b49ed2a6386fef2a59fb0c_skins.css.72d5c72729ec031009e6afe8feb39e0a.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/90eb5adf50a8c640f633d47fd7eb1778_core.css.e526a33eaadabb033be3513cebde0838.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/5a0da001ccc2200dc5625c3f3934497d_core_responsive.css.d69517154f8e6fde7b033562b64c824f.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/62e269ced0fdab7e30e026f1d30ae516_forums.css.4050a065985ae566d2046d5e66d93755.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/76e62c573090645fb99a15a363d8620e_forums_responsive.css.788fdb90de5b90f1bad497cfcdaf3343.css?v=026d4e282a without a browsingContextID set
JavaScript error: resource://devtools/server/actors/network-monitor/network-event-actor.js, line 109: Error: Got a request https://crackingall.com/uploads/css_built_5/258adbb6e4f3e83cd3b355f84e3fa002_custom.css.eadf39a11fb30964439d233cba16e549.css?v=026d4e282a without a browsingContextID set
console.warn: "Resource of root-node was not found."
console.warn: "Resource of root-node was not found."
console.warn: "Resource of root-node was not found."
console.warn: "Resource of root-node was not found."
console.warn: "Resource of root-node was not found."
console.error: "Error writing request: getGrids"
console.error: "Error writing request: getGrids"
console.warn: "Resource of root-node was not found."
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileR6GZWe\thumbnails) because it does not exist
1687456242405 Marionette INFO Stopped listening on port 52093
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileR6GZWe\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687456242603 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687456258163 geckodriver INFO Listening on 127.0.0.1:52192
1687456260149 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "52193" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileQo9Zp0"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687456260474 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:52193/devtools/browser/d308bc8a-df10-4763-824a-6f23a5c493b7
1687456261290 Marionette INFO Listening on port 52198
1687456261497 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS" was already initialized
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID" was already initialized
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: ""
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: ""
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
JavaScript error: undefined, line 0: Error: Invalid tab ID: 1
JavaScript error: undefined, line 0: Error: Invalid tab ID: 1
JavaScript error: undefined, line 0: Error: Invalid tab ID: 1
JavaScript error: undefined, line 0: Error: Invalid tab ID: 1
1687456791586 Marionette INFO Stopped listening on port 52198
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687456791708 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687467349686 geckodriver INFO Listening on 127.0.0.1:51629
1687467357094 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51630" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofilePDgGsl"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687467359136 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51630/devtools/browser/85f253a2-0be5-4d19-a5ad-367d7a91bfb6
1687467364474 Marionette INFO Listening on port 51635
1687467365127 RemoteAgent WARN TLS certificate errors will be ignored for this session
###!!! [Child][MessageChannel] Error: (msgtype=0x59000C,name=PHttpChannel::Msg___delete__) Channel closing: too late to send/recv, messages will be lost
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS" was already initialized
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID" was already initialized
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: ""
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: ""
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS" was already initialized
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID" was already initialized
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: ""
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: ""
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
1687467402685 Marionette INFO Stopped listening on port 51635
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilePDgGsl\thumbnails) because it does not exist
[Parent 4292, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
1687467404034 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138

+298 -0  MarketPlaces/AnonymousMarketplace/crawler_selenium.py

@@ -0,0 +1,298 @@
__author__ = 'Helium'

'''
AnonymousMarketplace Forum Crawler (Selenium)
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.AnonymousMarketplace.parser import anonymous_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML

config = configparser.ConfigParser()
config.read('../../setup.ini')
counter = 1
baseURL = 'http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/'


# Opens Tor Browser, crawls the website, then parses it, then closes Tor;
# acts like the main method for the crawler (another function at the end of this file calls it)
def startCrawling():
    opentor()
    # mktName = getMKTName()
    driver = getAccess()

    if driver != 'down':
        try:
            login(driver)
            crawlForum(driver)
        except Exception as e:
            print(driver.current_url, e)
        closetor(driver)

    # new_parse(forumName, baseURL, False)


# Opens Tor Browser
# prompts for ENTER input to continue
def opentor():
    global pid
    print("Connecting Tor...")
    pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
    pid = pro.pid
    time.sleep(7.5)
    input('Tor Connected. Press ENTER to continue\n')
    return


# Returns the name of the website
#return: name of site in string type
def getMKTName():
    name = 'AnonymousMarketplace'
    return name


# Returns the base link of the website
#return: url of base site in string type
def getFixedURL():
    url = 'http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/'
    return url


# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return


# Creates the FireFox 'driver' and configures its 'Profile'
# to use the Tor proxy and socket
def createFFDriver():
    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    ff_prof.set_preference("network.dns.disablePrefetch", True)
    ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 2)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

    return driver


#the driver 'gets' the url; if it can't access the site, returns 'down'
#return: the selenium driver or the string 'down'
def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except:
        driver.close()
        return 'down'


# Manual captcha solver: waits for a specific element so that the whole page loads, finds the input box,
# gets a screenshot of the captcha, then allows for manual solving of the captcha in the terminal
#@param: current selenium web driver
def login(driver):
    # wait for page to show up (This Xpath may need to change based on different seed url)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.ID, "woocommerce_product_categories-2")))


# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(page, url):
    cleanPage = cleanHTML(page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    open(filePath, 'wb').write(cleanPage.encode('utf-8'))
    return


# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
    from MarketPlaces.Initialization.markets_mining import CURRENT_DATE

    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = r'..\AnonymousMarketplace\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html'
    else:
        fullPath = r'..\AnonymousMarketplace\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html'
    return fullPath


# Creates the file name from the passed URL, gives a distinct name if it can't be made unique after cleaning
#@param: raw url as crawler crawls through every site
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if (name == ''):
        name = str(counter)
        counter = counter + 1
    return name


# returns the list of urls of interest; the crawler runs through this list
#in this example, there are a couple of categories some threads fall under, such as
#Guides and Tutorials, Digital Products, and Software and Malware
#as you can see, they are categories of products
def getInterestedLinks():
    links = []

    # carding
    links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/carding/')
    # # hacked paypal
    # links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacked-paypal-accounts/')
    # # hacking services
    # links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacking-services/')

    return links


# iterates through the list of links of interest; each link is visited and crawled
#listing and description pages are crawled here, and both types of pages are saved
#@param: selenium driver
def crawlForum(driver):
    print("Crawling the AnonymousMarketplace market")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    count = 0
    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            try:
                driver.get(link)
            except:
                driver.refresh()
            html = driver.page_source
            savePage(html, link)

            has_next_page = True
            while has_next_page:
                list = productPages(html)
                for item in list:
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver.page_source, item)
                    driver.back()

                    # comment out
                    break

                # comment out
                if count == 1:
                    count = 0
                    break

                try:
                    # no next-page link is located yet, so pagination always stops after the first listing page
                    link = ""
                    if link == "":
                        raise NoSuchElementException
                    try:
                        driver.get(link)
                    except:
                        driver.refresh()
                    html = driver.page_source
                    savePage(html, link)
                    count += 1

                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    # finalTime = time.time()
    # print finalTime - initialTime

    input("Crawling AnonymousMarketplace forum done successfully. Press ENTER to continue\n")


# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if it is a description page, false if not
def isDescriptionLink(url):
    if 'product/' in url:
        return True
    return False


# Returns 'True' if the link is a listing page link
#@param: url of any url crawled
#return: true if it is a listing page, false if not
def isListingLink(url):
    if 'product-' in url:
        return True
    return False


# calls the parser to extract the description links from a listing page's html
#@param: html of a link from the interested-links list, i.e. getInterestedLinks()
#return: list of description links that should be crawled through
def productPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return anonymous_links_parser(soup)


# Drop links that "signout"
# def isSignOut(url):
#     #absURL = urlparse.urljoin(url.base_url, url.url)
#     if 'signout' in url.lower() or 'logout' in url.lower():
#         return True
#
#     return False


def crawler():
    startCrawling()
    # print("Crawling and Parsing BestCardingWorld .... DONE!")

+1483 -0  MarketPlaces/AnonymousMarketplace/geckodriver.log
(file diff suppressed because it is too large)


+291 -0  MarketPlaces/AnonymousMarketplace/parser.py

@@ -0,0 +1,291 @@
__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# re is needed for the CVE/MS regular expressions below
import re

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup


#parses description pages: takes the html page of a description page as a soup object and parses it for the info it needs
#stores the info in different variables, which are returned after being organized
#@param: soup object looking at the html page of a description page
#return: 'row' that contains a variety of fields, each holding info on the description page
def darkfox_description_parser(soup):

    # Fields to be parsed

    name = "-1"       # 0 Product_Name
    describe = "-1"   # 1 Product_Description
    lastSeen = "-1"   # 2 Product_LastViewDate
    rules = "-1"      # 3 NOT USED ...
    CVE = "-1"        # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"         # 5 Product_MS_Classification (Microsoft Security)
    review = "-1"     # 6 Product_Number_Of_Reviews
    category = "-1"   # 7 Product_Category
    shipFrom = "-1"   # 8 Product_ShippedFrom
    shipTo = "-1"     # 9 Product_ShippedTo
    left = "-1"       # 10 Product_QuantityLeft
    escrow = "-1"     # 11 Vendor_Warranty
    terms = "-1"      # 12 Vendor_TermsAndConditions
    vendor = "-1"     # 13 Vendor_Name
    sold = "-1"       # 14 Product_QuantitySold
    addDate = "-1"    # 15 Product_AddedDate
    available = "-1"  # 16 NOT USED ...
    endDate = "-1"    # 17 NOT USED ...
    BTC = "-1"        # 18 Product_BTC_SellingPrice
    USD = "-1"        # 19 Product_USD_SellingPrice
    rating = "-1"     # 20 Vendor_Rating
    success = "-1"    # 21 Vendor_Successful_Transactions
    EURO = "-1"       # 22 Product_EURO_SellingPrice

    # Finding Product Name
    name = soup.find('h1').text
    name = name.replace('\n', ' ')
    name = name.replace(",", "")
    name = name.strip()

    # Finding Vendor
    vendor = soup.find('h3').find('a').text.strip()

    # Finding Vendor Rating
    rating = soup.find('span', {'class': "tag is-dark"}).text.strip()

    # Finding Successful Transactions
    success = soup.find('h3').text
    success = success.replace("Vendor: ", "")
    success = success.replace(vendor, "")
    success = success.replace("(", "")
    success = success.replace(")", "")
    success = success.strip()

    bae = soup.find('div', {'class': "box"}).find_all('ul')

    # Finding Prices
    USD = bae[1].find('strong').text.strip()

    li = bae[2].find_all('li')

    # Finding Escrow
    escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()

    # Finding the Product Category
    category = li[1].find('span', {'class': "tag is-dark"}).text.strip()

    # Finding the Product Quantity Available
    left = li[3].find('span', {'class': "tag is-dark"}).text.strip()

    # Finding Number Sold
    sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()

    li = bae[3].find_all('li')

    # Finding Shipment Information (Origin)
    if "Ships from:" in li[-2].text:
        shipFrom = li[-2].text
        shipFrom = shipFrom.replace("Ships from: ", "")
        # shipFrom = shipFrom.replace(",", "")
        shipFrom = shipFrom.strip()

    # Finding Shipment Information (Destination)
    shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
    shipTo = shipTo.replace("Ships to: ", "")
    shipTo = shipTo.strip()
    if "certain countries" in shipTo:
        countries = ""
        tags = li[-1].find_all('span', {'class': "tag"})
        for tag in tags:
            country = tag.text.strip()
            countries += country + ", "
        shipTo = countries.strip(", ")

    # Finding the Product description
    describe = soup.find('div', {'class': "pre-line"}).text
    describe = describe.replace("\n", " ")
    describe = describe.strip()

    '''# Finding the Number of Product Reviews
    tag = soup.findAll(text=re.compile('Reviews'))
    for index in tag:
        reviews = index
        par = reviews.find('(')
        if par >= 0:
            reviews = reviews.replace("Reviews (", "")
            reviews = reviews.replace(")", "")
            reviews = reviews.split(",")
            review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
        else:
            review = "-1"'''

    # Searching for CVE and MS categories
    cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
    if cve:
        CVE = " "
        for idx in cve:
            CVE += (idx)
            CVE += " "
            CVE = CVE.replace(',', ' ')
            CVE = CVE.replace('\n', '')
    ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
    if ms:
        MS = " "
        for im in ms:
            MS += (im)
            MS += " "
            MS = MS.replace(',', ' ')
            MS = MS.replace('\n', '')

    # Populating the final variable (this should be a list with all fields scraped)
    row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
           sold, addDate, available, endDate, BTC, USD, rating, success, EURO)

    # Sending the results
    return row


#parses listing pages: takes the html page of a listing page as a soup object and parses it for the info it needs
#stores the info in different lists, which are returned after being organized
#@param: soup object looking at the html page of a listing page
#return: 'row' that contains a variety of lists, each holding info on the listing page
def darkfox_listing_parser(soup):

    # Fields to be parsed
    nm = 0                 # Total_Products (Should be Integer)
    mktName = "DarkFox"    # 0 Marketplace_Name
    name = []              # 1 Product_Name
    CVE = []               # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                # 3 Product_MS_Classification (Microsoft Security)
    category = []          # 4 Product_Category
    describe = []          # 5 Product_Description
    escrow = []            # 6 Vendor_Warranty
    views = []             # 7 Product_Number_Of_Views
    reviews = []           # 8 Product_Number_Of_Reviews
    addDate = []           # 9 Product_AddDate
    lastSeen = []          # 10 Product_LastViewDate
    BTC = []               # 11 Product_BTC_SellingPrice
    USD = []               # 12 Product_USD_SellingPrice
    EURO = []              # 13 Product_EURO_SellingPrice
    sold = []              # 14 Product_QuantitySold
    qLeft = []             # 15 Product_QuantityLeft
    shipFrom = []          # 16 Product_ShippedFrom
    shipTo = []            # 17 Product_ShippedTo
    vendor = []            # 18 Vendor
    rating = []            # 19 Vendor_Rating
    success = []           # 20 Vendor_Successful_Transactions
    href = []              # 23 Product_Links (Urls)

    listing = soup.findAll('div', {"class": "card"})

    # Populating the Number of Products
    nm = len(listing)

    for a in listing:
        bae = a.findAll('a', href=True)

        # Adding the url to the list of urls
        link = bae[0].get('href')
        link = cleanLink(link)
        href.append(link)

        # Finding the Product
        product = bae[1].find('p').text
        product = product.replace('\n', ' ')
        product = product.replace(",", "")
        product = product.replace("...", "")
        product = product.strip()
        name.append(product)

        bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')

        if len(bae) >= 5:
            # Finding Prices
            price = bae[0].text
            ud = price.replace(" USD", " ")
            # u = ud.replace("$","")
            u = ud.replace(",", "")
            u = u.strip()
            USD.append(u)
            # bc = (prc[1]).strip(' BTC')
            # BTC.append(bc)

            # Finding the Vendor
            vendor_name = bae[1].find('a').text
            vendor_name = vendor_name.replace(",", "")
            vendor_name = vendor_name.strip()
            vendor.append(vendor_name)

            # Finding the Category
            cat = bae[2].find('small').text
            cat = cat.replace("Category: ", "")
            cat = cat.replace(",", "")
            cat = cat.strip()
            category.append(cat)

            # Finding Number Sold and Quantity Left
            num = bae[3].text
            num = num.replace("Sold: ", "")
            num = num.strip()
            sold.append(num)

            quant = bae[4].find('small').text
            quant = quant.replace("In stock: ", "")
            quant = quant.strip()
            qLeft.append(quant)

            # Finding Successful Transactions
            freq = bae[1].text
            freq = freq.replace(vendor_name, "")
            freq = re.sub(r'Vendor Level \d+', "", freq)
            freq = freq.replace("(", "")
            freq = freq.replace(")", "")
            freq = freq.strip()
            success.append(freq)

        # Searching for CVE and MS categories
        cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            cee = " "
            for idx in cve:
                cee += (idx)
                cee += " "
                cee = cee.replace(',', ' ')
                cee = cee.replace('\n', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " "
            for im in ms:
                me += (im)
                me += " "
                me = me.replace(',', ' ')
                me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)


#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
def anonymous_links_parser(soup):
    # Returning all links that should be visited by the Crawler
    href = []
    listing = soup.find('ul', {"class": "products columns-4"}).findAll('li')

    for a in listing:
        bae = a.find('a', {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}, href=True)
        link = bae['href']
        href.append(link)

    return href
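
To illustrate what anonymous_links_parser() expects, here is a quick sketch against made-up HTML that mimics the WooCommerce classes it targets (the example.onion URLs are placeholders):

from bs4 import BeautifulSoup

html = '''
<ul class="products columns-4">
  <li><a class="woocommerce-LoopProduct-link woocommerce-loop-product__link"
         href="http://example.onion/product/item-one/">item one</a></li>
  <li><a class="woocommerce-LoopProduct-link woocommerce-loop-product__link"
         href="http://example.onion/product/item-two/">item two</a></li>
</ul>
'''

soup = BeautifulSoup(html, 'html.parser')
print(anonymous_links_parser(soup))
# ['http://example.onion/product/item-one/', 'http://example.onion/product/item-two/']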

+312 -0  MarketPlaces/CypherMarketplace/crawler_selenium.py

@@ -0,0 +1,312 @@
__author__ = 'Helium'
'''
CypherMarketplace Forum Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.CypherMarketplace.parser import cyphermarketplace_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
config = configparser.ConfigParser()
config.read('../../setup.ini')
counter = 1
baseURL = 'http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion/'
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
opentor()
# mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
# new_parse(forumName, baseURL, False)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'CypherMarketplace'
return name
# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
url = 'http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion/'
return url
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 2)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha
# then allows for manual solving of captcha in the terminal
#@param: current selenium web driver
def login(driver):
# wait for page to show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[1]/div/div[1]/div[1]/ul")))
# entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input')
# Username here
usernameBox.send_keys('beachyoga278') # sends string to the username box
passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input')
# Password here
passwordBox.send_keys('sunfish278') # sends string to passwordBox
input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[2]/div/div/div[1]/div/div/div[1]/div[2]/ul/li[8]/a")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(page, url):
cleanPage = cleanHTML(page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
with open(filePath, 'wb') as file:  # close the file deterministically instead of leaving it to the GC
    file.write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import CURRENT_DATE
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = r'..\CypherMarketplace\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html'
else:
fullPath = r'..\CypherMarketplace\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html'
return fullPath
# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
#@param: raw url as crawler crawls through every site
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
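# Illustration (hypothetical URL, not part of the crawler):
#   getNameFromURL('http://example.onion/product/abc-123')
#   returns 'httpexampleonionproductabc123'; a URL that cleans to an empty
#   string instead gets the current value of the global counter as its name.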
# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
#in this example, the links are product categories: Software is crawled and
#Guides is left commented out
def getInterestedLinks():
links = []
# software
links.append('http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion/category/040ca140-3cfc-11ea-9364-87edd8c0a63f')
# # guides
# links.append('http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion/category/35a35d10-3cfb-11ea-9b14-65b8930c1372')
return links
# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
#listing and description pages are crawled through here, where both types of pages are saved
#@param: selenium driver
def crawlForum(driver):
print("Crawling the CypherMarketplace market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
while has_next_page:
productList = productPages(html)  # renamed from 'list', which shadowed the built-in
for item in productList:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
# comment out
break
# comment out
if count == 1:
count = 0
break
try:
temp = driver.find_element(by=By.XPATH, value=
'/html/body/div[2]/div/div/div[2]/div/nav/ul')
# 'page-link' is a CSS class on the pagination anchors, not a tag name
link = temp.find_element(by=By.CLASS_NAME, value='page-link').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling CypherMarketplace forum done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
def isDescriptionLink(url):
if 'product' in url:
return True
return False
# Returns True if the link is a listingPage link
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
if 'category' in url:
return True
return False
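# Illustration (hypothetical URLs): '...onion/product/0a1b2c3d' is treated as
# a description page, '...onion/category/040ca140-...' as a listing page; any
# URL matching neither is still saved under Listing by getFullPathName above.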
# calling the parser to define the links, the html is the url of a link from the list of interested link list
#@param: link from interested link list ie. getInterestingLinks()
#return: list of description links that should be crawled through
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return cyphermarketplace_links_parser(soup)
# Drop links that "signout"
# def isSignOut(url):
# #absURL = urlparse.urljoin(url.base_url, url.url)
# if 'signout' in url.lower() or 'logout' in url.lower():
# return True
#
# return False
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+ 1483
- 0
MarketPlaces/CypherMarketplace/geckodriver.log
File diff suppressed because it is too large
View File


+ 291
- 0
MarketPlaces/CypherMarketplace/parser.py View File

@ -0,0 +1,291 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
import re  # the CVE/MS searches below rely on re; make the dependency explicit
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of description page
#return: 'row' that contains a variety of lists that each hold info on the description page
def darkfox_description_parser(soup):
# Fields to be parsed
name = "-1" # 0 Product_Name
describe = "-1" # 1 Product_Description
lastSeen = "-1" # 2 Product_LastViewDate
rules = "-1" # 3 NOT USED ...
CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 5 Product_MS_Classification (Microsoft Security)
review = "-1" # 6 Product_Number_Of_Reviews
category = "-1" # 7 Product_Category
shipFrom = "-1" # 8 Product_ShippedFrom
shipTo = "-1" # 9 Product_ShippedTo
left = "-1" # 10 Product_QuantityLeft
escrow = "-1" # 11 Vendor_Warranty
terms = "-1" # 12 Vendor_TermsAndConditions
vendor = "-1" # 13 Vendor_Name
sold = "-1" # 14 Product_QuantitySold
addDate = "-1" # 15 Product_AddedDate
available = "-1" # 16 NOT USED ...
endDate = "-1" # 17 NOT USED ...
BTC = "-1" # 18 Product_BTC_SellingPrice
USD = "-1" # 19 Product_USD_SellingPrice
rating = "-1" # 20 Vendor_Rating
success = "-1" # 21 Vendor_Successful_Transactions
EURO = "-1" # 22 Product_EURO_SellingPrice
# Finding Product Name
name = soup.find('h1').text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
# Finding Vendor
vendor = soup.find('h3').find('a').text.strip()
# Finding Vendor Rating
rating = soup.find('span', {'class': "tag is-dark"}).text.strip()
# Finding Successful Transactions
success = soup.find('h3').text
success = success.replace("Vendor: ", "")
success = success.replace(vendor, "")
success = success.replace("(", "")
success = success.replace(")", "")
success = success.strip()
bae = soup.find('div', {'class': "box"}).find_all('ul')
# Finding Prices
USD = bae[1].find('strong').text.strip()
li = bae[2].find_all('li')
# Finding Escrow
escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Category
category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Quantity Available
left = li[3].find('span', {'class': "tag is-dark"}).text.strip()
# Finding Number Sold
sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()
li = bae[3].find_all('li')
# Finding Shipment Information (Origin)
if "Ships from:" in li[-2].text:
shipFrom = li[-2].text
shipFrom = shipFrom.replace("Ships from: ", "")
# shipFrom = shipFrom.replace(",", "")
shipFrom = shipFrom.strip()
# Finding Shipment Information (Destination)
shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
shipTo = shipTo.replace("Ships to: ", "")
shipTo = shipTo.strip()
if "certain countries" in shipTo:
countries = ""
tags = li[-1].find_all('span', {'class': "tag"})
for tag in tags:
country = tag.text.strip()
countries += country + ", "
shipTo = countries.strip(", ")
# Finding the Product description
describe = soup.find('div', {'class': "pre-line"}).text
describe = describe.replace("\n", " ")
describe = describe.strip()
'''# Finding the Number of Product Reviews
tag = soup.findAll(text=re.compile('Reviews'))
for index in tag:
reviews = index
par = reviews.find('(')
if par >=0:
reviews = reviews.replace("Reviews (","")
reviews = reviews.replace(")","")
reviews = reviews.split(",")
review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
else :
review = "-1"'''
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
# Populating the final variable (this should be a list with all fields scraped)
row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
sold, addDate, available, endDate, BTC, USD, rating, success, EURO)
# Sending the results
return row
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def darkfox_listing_parser(soup):
# Fields to be parsed
nm = 0 # Total_Products (Should be Integer)
mktName = "DarkFox" # 0 Marketplace_Name
name = [] # 1 Product_Name
CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 3 Product_MS_Classification (Microsoft Security)
category = [] # 4 Product_Category
describe = [] # 5 Product_Description
escrow = [] # 6 Vendor_Warranty
views = [] # 7 Product_Number_Of_Views
reviews = [] # 8 Product_Number_Of_Reviews
addDate = [] # 9 Product_AddDate
lastSeen = [] # 10 Product_LastViewDate
BTC = [] # 11 Product_BTC_SellingPrice
USD = [] # 12 Product_USD_SellingPrice
EURO = [] # 13 Product_EURO_SellingPrice
sold = [] # 14 Product_QuantitySold
qLeft =[] # 15 Product_QuantityLeft
shipFrom = [] # 16 Product_ShippedFrom
shipTo = [] # 17 Product_ShippedTo
vendor = [] # 18 Vendor
rating = [] # 19 Vendor_Rating
success = [] # 20 Vendor_Successful_Transactions
href = [] # 23 Product_Links (Urls)
listing = soup.findAll('div', {"class": "card"})
# Populating the Number of Products
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# Finding the Product
product = bae[1].find('p').text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.replace("...", "")
product = product.strip()
name.append(product)
bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
if len(bae) >= 5:
# Finding Prices
price = bae[0].text
ud = price.replace(" USD", " ")
# u = ud.replace("$","")
u = ud.replace(",", "")
u = u.strip()
USD.append(u)
# bc = (prc[1]).strip(' BTC')
# BTC.append(bc)
# Finding the Vendor
vendor_name = bae[1].find('a').text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
# Finding the Category
cat = bae[2].find('small').text
cat = cat.replace("Category: ", "")
cat = cat.replace(",", "")
cat = cat.strip()
category.append(cat)
# Finding Number Sold and Quantity Left
num = bae[3].text
num = num.replace("Sold: ", "")
num = num.strip()
sold.append(num)
quant = bae[4].find('small').text
quant = quant.replace("In stock: ", "")
quant = quant.strip()
qLeft.append(quant)
# Finding Successful Transactions
freq = bae[1].text
freq = freq.replace(vendor_name, "")
freq = re.sub(r'Vendor Level \d+', "", freq)
freq = freq.replace("(", "")
freq = freq.replace(")", "")
freq = freq.strip()
success.append(freq)
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if not cve:
cveValue="-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue=cee
CVE.append(cveValue)
ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if not ms:
MSValue="-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue=me
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
def cyphermarketplace_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"class": "card-body"})
for a in listing:
bae = a.find('a', {"class": "text-info"}, href=True)
link = bae['href']
href.append(link)
return href
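# Minimal usage sketch (illustrative only; the HTML snippet below is a
# hypothetical stand-in for a saved CypherMarketplace listing page):
if __name__ == '__main__':
    sample_html = '''
    <div class="card-body">
      <a class="text-info" href="/product/0a1b2c3d">Example listing</a>
    </div>
    '''
    sample_soup = BeautifulSoup(sample_html, "html.parser")
    print(cyphermarketplace_links_parser(sample_soup))  # -> ['/product/0a1b2c3d']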

+ 298
- 0
MarketPlaces/DarkTor/crawler_selenium.py View File

@ -0,0 +1,298 @@
__author__ = 'Helium'
'''
DarkTor Marketplace Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkTor.parser import darktor_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
config = configparser.ConfigParser()
config.read('../../setup.ini')
counter = 1
baseURL = 'http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/'
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts as the main method for the crawler; the crawler() function at the end of this file calls it
def startCrawling():
opentor()
# mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
# new_parse(forumName, baseURL, False)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'DarkTor'
return name
# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
url = 'http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/'
return url
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates the FireFox 'driver' and configures its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 2)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# No login is required for this market; waits for a specific element so that
# the whole page loads before crawling begins
#@param: current selenium web driver
def login(driver):
# wait for page to show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[1]/div/div/div[2]/main/div/div/section[5]/div/div[1]/div")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(page, url):
cleanPage = cleanHTML(page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
with open(filePath, 'wb') as file:  # close the file deterministically instead of leaving it to the GC
    file.write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import CURRENT_DATE
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = r'..\DarkTor\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html'
else:
fullPath = r'..\DarkTor\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html'
return fullPath
# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
#@param: raw url as crawler crawls through every site
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
#in this example, the links are the Hacking Services, Carding, and
#Hacked PayPal Accounts product categories
def getInterestedLinks():
links = []
# Hacking
links.append('http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/product-category/hacking-services/')
# Carding
links.append('http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/product-category/carding/')
# hacked paypals
links.append('http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/product-category/hacked-paypal-accounts/')
return links
# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
#listing and description pages are crawled through here, where both types of pages are saved
#@param: selenium driver
def crawlForum(driver):
print("Crawling the DarkTor market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
while has_next_page:
productList = productPages(html)  # renamed from 'list', which shadowed the built-in
for item in productList:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
# comment out
break
# comment out
if count == 1:
count = 0
break
try:
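# Note: no pagination selector has been implemented for DarkTor yet; leaving
# link empty triggers NoSuchElementException below, ending the loop after the
# first page of each category.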
link = ""
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling DarkTor forum done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
def isDescriptionLink(url):
if 'product/' in url:
return True
return False
# Returns True if the link is a listingPage link
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
if 'product-' in url:
return True
return False
# calling the parser to define the links, the html is the url of a link from the list of interested link list
#@param: link from interested link list ie. getInterestingLinks()
#return: list of description links that should be crawled through
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return darktor_links_parser(soup)
# Drop links that "signout"
# def isSignOut(url):
# #absURL = urlparse.urljoin(url.base_url, url.url)
# if 'signout' in url.lower() or 'logout' in url.lower():
# return True
#
# return False
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+ 1483
- 0
MarketPlaces/DarkTor/geckodriver.log
File diff suppressed because it is too large
View File


+ 291
- 0
MarketPlaces/DarkTor/parser.py View File

@ -0,0 +1,291 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
import re  # the CVE/MS searches below rely on re; make the dependency explicit
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of description page
#return: 'row' that contains a variety of lists that each hold info on the description page
def darkfox_description_parser(soup):
# Fields to be parsed
name = "-1" # 0 Product_Name
describe = "-1" # 1 Product_Description
lastSeen = "-1" # 2 Product_LastViewDate
rules = "-1" # 3 NOT USED ...
CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 5 Product_MS_Classification (Microsoft Security)
review = "-1" # 6 Product_Number_Of_Reviews
category = "-1" # 7 Product_Category
shipFrom = "-1" # 8 Product_ShippedFrom
shipTo = "-1" # 9 Product_ShippedTo
left = "-1" # 10 Product_QuantityLeft
escrow = "-1" # 11 Vendor_Warranty
terms = "-1" # 12 Vendor_TermsAndConditions
vendor = "-1" # 13 Vendor_Name
sold = "-1" # 14 Product_QuantitySold
addDate = "-1" # 15 Product_AddedDate
available = "-1" # 16 NOT USED ...
endDate = "-1" # 17 NOT USED ...
BTC = "-1" # 18 Product_BTC_SellingPrice
USD = "-1" # 19 Product_USD_SellingPrice
rating = "-1" # 20 Vendor_Rating
success = "-1" # 21 Vendor_Successful_Transactions
EURO = "-1" # 22 Product_EURO_SellingPrice
# Finding Product Name
name = soup.find('h1').text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
# Finding Vendor
vendor = soup.find('h3').find('a').text.strip()
# Finding Vendor Rating
rating = soup.find('span', {'class': "tag is-dark"}).text.strip()
# Finding Successful Transactions
success = soup.find('h3').text
success = success.replace("Vendor: ", "")
success = success.replace(vendor, "")
success = success.replace("(", "")
success = success.replace(")", "")
success = success.strip()
bae = soup.find('div', {'class': "box"}).find_all('ul')
# Finding Prices
USD = bae[1].find('strong').text.strip()
li = bae[2].find_all('li')
# Finding Escrow
escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Category
category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Quantity Available
left = li[3].find('span', {'class': "tag is-dark"}).text.strip()
# Finding Number Sold
sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()
li = bae[3].find_all('li')
# Finding Shipment Information (Origin)
if "Ships from:" in li[-2].text:
shipFrom = li[-2].text
shipFrom = shipFrom.replace("Ships from: ", "")
# shipFrom = shipFrom.replace(",", "")
shipFrom = shipFrom.strip()
# Finding Shipment Information (Destination)
shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
shipTo = shipTo.replace("Ships to: ", "")
shipTo = shipTo.strip()
if "certain countries" in shipTo:
countries = ""
tags = li[-1].find_all('span', {'class': "tag"})
for tag in tags:
country = tag.text.strip()
countries += country + ", "
shipTo = countries.strip(", ")
# Finding the Product description
describe = soup.find('div', {'class': "pre-line"}).text
describe = describe.replace("\n", " ")
describe = describe.strip()
'''# Finding the Number of Product Reviews
tag = soup.findAll(text=re.compile('Reviews'))
for index in tag:
reviews = index
par = reviews.find('(')
if par >=0:
reviews = reviews.replace("Reviews (","")
reviews = reviews.replace(")","")
reviews = reviews.split(",")
review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
else :
review = "-1"'''
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
# Populating the final variable (this should be a list with all fields scraped)
row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
sold, addDate, available, endDate, BTC, USD, rating, success, EURO)
# Sending the results
return row
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def darkfox_listing_parser(soup):
# Fields to be parsed
nm = 0 # Total_Products (Should be Integer)
mktName = "DarkFox" # 0 Marketplace_Name
name = [] # 1 Product_Name
CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 3 Product_MS_Classification (Microsoft Security)
category = [] # 4 Product_Category
describe = [] # 5 Product_Description
escrow = [] # 6 Vendor_Warranty
views = [] # 7 Product_Number_Of_Views
reviews = [] # 8 Product_Number_Of_Reviews
addDate = [] # 9 Product_AddDate
lastSeen = [] # 10 Product_LastViewDate
BTC = [] # 11 Product_BTC_SellingPrice
USD = [] # 12 Product_USD_SellingPrice
EURO = [] # 13 Product_EURO_SellingPrice
sold = [] # 14 Product_QuantitySold
qLeft =[] # 15 Product_QuantityLeft
shipFrom = [] # 16 Product_ShippedFrom
shipTo = [] # 17 Product_ShippedTo
vendor = [] # 18 Vendor
rating = [] # 19 Vendor_Rating
success = [] # 20 Vendor_Successful_Transactions
href = [] # 23 Product_Links (Urls)
listing = soup.findAll('div', {"class": "card"})
# Populating the Number of Products
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# Finding the Product
product = bae[1].find('p').text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.replace("...", "")
product = product.strip()
name.append(product)
bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
if len(bae) >= 5:
# Finding Prices
price = bae[0].text
ud = price.replace(" USD", " ")
# u = ud.replace("$","")
u = ud.replace(",", "")
u = u.strip()
USD.append(u)
# bc = (prc[1]).strip(' BTC')
# BTC.append(bc)
# Finding the Vendor
vendor_name = bae[1].find('a').text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
# Finding the Category
cat = bae[2].find('small').text
cat = cat.replace("Category: ", "")
cat = cat.replace(",", "")
cat = cat.strip()
category.append(cat)
# Finding Number Sold and Quantity Left
num = bae[3].text
num = num.replace("Sold: ", "")
num = num.strip()
sold.append(num)
quant = bae[4].find('small').text
quant = quant.replace("In stock: ", "")
quant = quant.strip()
qLeft.append(quant)
# Finding Successful Transactions
freq = bae[1].text
freq = freq.replace(vendor_name, "")
freq = re.sub(r'Vendor Level \d+', "", freq)
freq = freq.replace("(", "")
freq = freq.replace(")", "")
freq = freq.strip()
success.append(freq)
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if not cve:
cveValue="-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue=cee
CVE.append(cveValue)
ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if not ms:
MSValue="-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue=me
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
def darktor_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"class": "product-block"})
for a in listing:
bae = a.find('a', {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}, href=True)
link = bae['href']
href.append(link)
return href
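# Note: the 'product-block' container and the WooCommerce loop-product link
# class targeted above suggest DarkTor runs a stock WooCommerce storefront;
# if the site's theme changes, this function is the only place to update.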

+ 303
- 0
MarketPlaces/DigitalThriftShop/crawler_selenium.py View File

@ -0,0 +1,303 @@
__author__ = 'Helium'
'''
DigitalThriftShop Marketplace Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DigitalThriftShop.parser import digitalthriftshop_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
config = configparser.ConfigParser()
config.read('../../setup.ini')
counter = 1
baseURL = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/'
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts as the main method for the crawler; the crawler() function at the end of this file calls it
def startCrawling():
opentor()
# mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
# new_parse(forumName, baseURL, False)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'DigitalThriftShop'
return name
# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
url = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/'
return url
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates the FireFox 'driver' and configures its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 2)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# No login is required for this market; waits for the WooCommerce category
# widget so that the whole page loads before crawling begins
#@param: current selenium web driver
def login(driver):
# wait for page to show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.ID, "woocommerce_product_categories-2")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(page, url):
cleanPage = cleanHTML(page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
with open(filePath, 'wb') as file:  # close the file deterministically instead of leaving it to the GC
    file.write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import CURRENT_DATE
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = r'..\DigitalThriftShop\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html'
else:
fullPath = r'..\DigitalThriftShop\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html'
return fullPath
# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
#@param: raw url as crawler crawls through every site
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
#in this example, Botnets and Databases are crawled, while Data Leak,
#Ransomware, and RATs are left commented out
def getInterestedLinks():
links = []
# Bot nets
links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/botnets/')
# # data leak
# links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/dataleak/')
# databases
links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/databases/')
# # ransomware
# links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/ransomware/')
# # rats
# links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/rats/')
return links
# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
#listing and description pages are crawled through here, where both types of pages are saved
#@param: selenium driver
def crawlForum(driver):
print("Crawling the DigitalThriftShop market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
while has_next_page:
productList = productPages(html)  # renamed from 'list', which shadowed the built-in
for item in productList:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
# comment out
break
# comment out
if count == 1:
count = 0
break
try:
link = driver.find_element(by=By.XPATH, value=
'/html/body/div[1]/div[2]/div/div[2]/main/div[1]/nav/ul/li[5]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling DigitalThriftShop forum done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
def isDescriptionLink(url):
if 'product/' in url:
return True
return False
# Returns True if the link is a listingPage link
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
if 'product-' in url:
return True
return False
# calling the parser to define the links, the html is the url of a link from the list of interested link list
#@param: link from interested link list ie. getInterestingLinks()
#return: list of description links that should be crawled through
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return digitalthriftshop_links_parser(soup)
# Drop links that "signout"
# def isSignOut(url):
# #absURL = urlparse.urljoin(url.base_url, url.url)
# if 'signout' in url.lower() or 'logout' in url.lower():
# return True
#
# return False
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+ 1483
- 0
MarketPlaces/DigitalThriftShop/geckodriver.log
File diff suppressed because it is too large
View File


+ 291
- 0
MarketPlaces/DigitalThriftShop/parser.py View File

@ -0,0 +1,291 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
import re  # the CVE/MS searches below rely on re; make the dependency explicit
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of description page
#return: 'row' that contains a variety of lists that each hold info on the description page
def darkfox_description_parser(soup):
# Fields to be parsed
name = "-1" # 0 Product_Name
describe = "-1" # 1 Product_Description
lastSeen = "-1" # 2 Product_LastViewDate
rules = "-1" # 3 NOT USED ...
CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 5 Product_MS_Classification (Microsoft Security)
review = "-1" # 6 Product_Number_Of_Reviews
category = "-1" # 7 Product_Category
shipFrom = "-1" # 8 Product_ShippedFrom
shipTo = "-1" # 9 Product_ShippedTo
left = "-1" # 10 Product_QuantityLeft
escrow = "-1" # 11 Vendor_Warranty
terms = "-1" # 12 Vendor_TermsAndConditions
vendor = "-1" # 13 Vendor_Name
sold = "-1" # 14 Product_QuantitySold
addDate = "-1" # 15 Product_AddedDate
available = "-1" # 16 NOT USED ...
endDate = "-1" # 17 NOT USED ...
BTC = "-1" # 18 Product_BTC_SellingPrice
USD = "-1" # 19 Product_USD_SellingPrice
rating = "-1" # 20 Vendor_Rating
success = "-1" # 21 Vendor_Successful_Transactions
EURO = "-1" # 22 Product_EURO_SellingPrice
# Finding Product Name
name = soup.find('h1').text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
# Finding Vendor
vendor = soup.find('h3').find('a').text.strip()
# Finding Vendor Rating
rating = soup.find('span', {'class': "tag is-dark"}).text.strip()
# Finding Successful Transactions
success = soup.find('h3').text
success = success.replace("Vendor: ", "")
success = success.replace(vendor, "")
success = success.replace("(", "")
success = success.replace(")", "")
success = success.strip()
bae = soup.find('div', {'class': "box"}).find_all('ul')
# Finding Prices
USD = bae[1].find('strong').text.strip()
li = bae[2].find_all('li')
# Finding Escrow
escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Category
category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Quantity Available
left = li[3].find('span', {'class': "tag is-dark"}).text.strip()
# Finding Number Sold
sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()
li = bae[3].find_all('li')
# Finding Shipment Information (Origin)
if "Ships from:" in li[-2].text:
shipFrom = li[-2].text
shipFrom = shipFrom.replace("Ships from: ", "")
# shipFrom = shipFrom.replace(",", "")
shipFrom = shipFrom.strip()
# Finding Shipment Information (Destination)
shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
shipTo = shipTo.replace("Ships to: ", "")
shipTo = shipTo.strip()
if "certain countries" in shipTo:
countries = ""
tags = li[-1].find_all('span', {'class': "tag"})
for tag in tags:
country = tag.text.strip()
countries += country + ", "
shipTo = countries.strip(", ")
# Finding the Product description
describe = soup.find('div', {'class': "pre-line"}).text
describe = describe.replace("\n", " ")
describe = describe.strip()
'''# Finding the Number of Product Reviews
tag = soup.findAll(text=re.compile('Reviews'))
for index in tag:
reviews = index
par = reviews.find('(')
if par >=0:
reviews = reviews.replace("Reviews (","")
reviews = reviews.replace(")","")
reviews = reviews.split(",")
review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
else :
review = "-1"'''
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
# Populating the final variable (this should be a list with all fields scraped)
row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
sold, addDate, available, endDate, BTC, USD, rating, success, EURO)
# Sending the results
return row
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def darkfox_listing_parser(soup):
# Fields to be parsed
nm = 0 # Total_Products (Should be Integer)
mktName = "DarkFox" # 0 Marketplace_Name
name = [] # 1 Product_Name
CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 3 Product_MS_Classification (Microsoft Security)
category = [] # 4 Product_Category
describe = [] # 5 Product_Description
escrow = [] # 6 Vendor_Warranty
views = [] # 7 Product_Number_Of_Views
reviews = [] # 8 Product_Number_Of_Reviews
addDate = [] # 9 Product_AddDate
lastSeen = [] # 10 Product_LastViewDate
BTC = [] # 11 Product_BTC_SellingPrice
USD = [] # 12 Product_USD_SellingPrice
EURO = [] # 13 Product_EURO_SellingPrice
sold = [] # 14 Product_QuantitySold
qLeft =[] # 15 Product_QuantityLeft
shipFrom = [] # 16 Product_ShippedFrom
shipTo = [] # 17 Product_ShippedTo
vendor = [] # 18 Vendor
rating = [] # 19 Vendor_Rating
success = [] # 20 Vendor_Successful_Transactions
href = [] # 23 Product_Links (Urls)
listing = soup.findAll('div', {"class": "card"})
# Populating the Number of Products
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# Finding the Product
product = bae[1].find('p').text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.replace("...", "")
product = product.strip()
name.append(product)
bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
if len(bae) >= 5:
# Finding Prices
price = bae[0].text
ud = price.replace(" USD", " ")
# u = ud.replace("$","")
u = ud.replace(",", "")
u = u.strip()
USD.append(u)
# bc = (prc[1]).strip(' BTC')
# BTC.append(bc)
# Finding the Vendor
vendor_name = bae[1].find('a').text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
# Finding the Category
cat = bae[2].find('small').text
cat = cat.replace("Category: ", "")
cat = cat.replace(",", "")
cat = cat.strip()
category.append(cat)
# Finding Number Sold and Quantity Left
num = bae[3].text
num = num.replace("Sold: ", "")
num = num.strip()
sold.append(num)
quant = bae[4].find('small').text
quant = quant.replace("In stock: ", "")
quant = quant.strip()
qLeft.append(quant)
# Finding Successful Transactions
freq = bae[1].text
freq = freq.replace(vendor_name, "")
freq = re.sub(r'Vendor Level \d+', "", freq)
freq = freq.replace("(", "")
freq = freq.replace(")", "")
freq = freq.strip()
success.append(freq)
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if not cve:
cveValue="-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue=cee
CVE.append(cveValue)
ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if not ms:
MSValue="-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue=me
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
def digitalthriftshop_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find('ul', {"class": "products columns-5"}).findAll('li')
for a in listing:
bae = a.find('a', href=True)
link = bae['href']
href.append(link)
return href
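# Defensive sketch (suggestion, not in the original): soup.find() returns None
# when the products <ul> is missing (e.g. an error page), which would raise an
# AttributeError above; a guard such as
#   container = soup.find('ul', {"class": "products columns-5"})
#   listing = container.findAll('li') if container else []
# would let the crawler skip such pages gracefully.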

+ 270
- 2
MarketPlaces/Initialization/geckodriver.log View File

@ -8604,8 +8604,8 @@ DevTools listening on ws://localhost:56159/devtools/browser/da664e77-92c9-4160-8
1687385553572 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
1687385720138 geckodriver INFO Listening on 127.0.0.1:56246
1687385723484 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "56247" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileuCc0P6"
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileR8jT8Z\thumbnails) because it does not exist
-debugging-port" "56247" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileuCc0P6"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
@ -8641,3 +8641,271 @@ unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687449887317 geckodriver INFO Listening on 127.0.0.1:51004
1687449890873 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51005" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileKCABtF"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687449891534 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51005/devtools/browser/6d865809-fbc7-4760-bc49-9b9aab0e2bf7
1687449893058 Marionette INFO Listening on port 51014
1687449893582 RemoteAgent WARN TLS certificate errors will be ignored for this session
SourceActor threw an exception: [Exception... "Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/9d56152f-51c2-42be-9fbd-fe4e215c085c'" nsresult: "0x805303f4 (<unknown>)" location: "JS frame :: resource://devtools/shared/DevToolsUtils.js :: mainThreadFetch/< :: line 670" data: yes]
Stack: mainThreadFetch/<@resource://devtools/shared/DevToolsUtils.js:670:15
mainThreadFetch@resource://devtools/shared/DevToolsUtils.js:516:10
_fetchURLContents@resource://devtools/server/actors/utils/sources-manager.js:442:22
urlContents@resource://devtools/server/actors/utils/sources-manager.js:406:17
_resurrectSource@resource://devtools/server/actors/thread.js:2142:35
addAllSources@resource://devtools/server/actors/thread.js:1509:14
watch@resource://devtools/server/actors/resources/sources.js:52:17
watchResources@resource://devtools/server/actors/resources/index.js:239:19
_watchTargetResources@resource://devtools/server/actors/targets/target-actor-mixin.js:156:24
addWatcherDataEntry@resource://devtools/server/actors/targets/target-actor-mixin.js:47:20
_addWatcherDataEntry@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:483:24
receiveMessage@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:425:21
Line: 670, column: 0
console.error: ({})
SourceActor threw an exception: [Exception... "Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/861a57ef-0b59-4882-b8a4-f722933956f0'" nsresult: "0x805303f4 (<unknown>)" location: "JS frame :: resource://devtools/shared/DevToolsUtils.js :: mainThreadFetch/< :: line 670" data: yes]
Stack: mainThreadFetch/<@resource://devtools/shared/DevToolsUtils.js:670:15
mainThreadFetch@resource://devtools/shared/DevToolsUtils.js:516:10
_fetchURLContents@resource://devtools/server/actors/utils/sources-manager.js:442:22
urlContents@resource://devtools/server/actors/utils/sources-manager.js:406:17
_resurrectSource@resource://devtools/server/actors/thread.js:2142:35
addAllSources@resource://devtools/server/actors/thread.js:1509:14
watch@resource://devtools/server/actors/resources/sources.js:52:17
watchResources@resource://devtools/server/actors/resources/index.js:239:19
_watchTargetResources@resource://devtools/server/actors/targets/target-actor-mixin.js:156:24
addWatcherDataEntry@resource://devtools/server/actors/targets/target-actor-mixin.js:47:20
_addWatcherDataEntry@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:483:24
receiveMessage@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:425:21
Line: 670, column: 0
console.error: ({})
JavaScript error: resource://devtools/shared/DevToolsUtils.js, line 670: Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/9d56152f-51c2-42be-9fbd-fe4e215c085c'
JavaScript error: resource://devtools/shared/DevToolsUtils.js, line 670: Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/861a57ef-0b59-4882-b8a4-f722933956f0'
console.warn: "Resource of root-node was not found."
console.warn: "Resource of root-node was not found."
console.warn: "Resource of root-node was not found."
SourceActor threw an exception: [Exception... "Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/b31f8585-55eb-4fae-993e-0637e6545dd7'" nsresult: "0x805303f4 (<unknown>)" location: "JS frame :: resource://devtools/shared/DevToolsUtils.js :: mainThreadFetch/< :: line 670" data: yes]
Stack: mainThreadFetch/<@resource://devtools/shared/DevToolsUtils.js:670:15
mainThreadFetch@resource://devtools/shared/DevToolsUtils.js:516:10
_fetchURLContents@resource://devtools/server/actors/utils/sources-manager.js:442:22
urlContents@resource://devtools/server/actors/utils/sources-manager.js:406:17
_resurrectSource@resource://devtools/server/actors/thread.js:2142:35
addAllSources@resource://devtools/server/actors/thread.js:1509:14
_onWindowReady@resource://devtools/server/actors/thread.js:1818:12
_emit@resource://devtools/shared/event-emitter.js:226:34
emit@resource://devtools/shared/event-emitter.js:172:18
emit@resource://devtools/shared/event-emitter.js:324:18
_windowReady@resource://devtools/server/actors/targets/browsing-context.js:1406:10
DebuggerProgressListener.prototype.onWindowCreated<@resource://devtools/server/actors/targets/browsing-context.js:1761:23
exports.makeInfallible/<@resource://devtools/shared/ThreadSafeDevToolsUtils.js:103:22
Line: 670, column: 0
console.error: ({})
SourceActor threw an exception: [Exception... "Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/703518f2-0e44-4fa7-8fda-056e24dfc731'" nsresult: "0x805303f4 (<unknown>)" location: "JS frame :: resource://devtools/shared/DevToolsUtils.js :: mainThreadFetch/< :: line 670" data: yes]
Stack: mainThreadFetch/<@resource://devtools/shared/DevToolsUtils.js:670:15
mainThreadFetch@resource://devtools/shared/DevToolsUtils.js:516:10
_fetchURLContents@resource://devtools/server/actors/utils/sources-manager.js:442:22
urlContents@resource://devtools/server/actors/utils/sources-manager.js:406:17
_resurrectSource@resource://devtools/server/actors/thread.js:2142:35
addAllSources@resource://devtools/server/actors/thread.js:1509:14
_onWindowReady@resource://devtools/server/actors/thread.js:1818:12
_emit@resource://devtools/shared/event-emitter.js:226:34
emit@resource://devtools/shared/event-emitter.js:172:18
emit@resource://devtools/shared/event-emitter.js:324:18
_windowReady@resource://devtools/server/actors/targets/browsing-context.js:1406:10
DebuggerProgressListener.prototype.onWindowCreated<@resource://devtools/server/actors/targets/browsing-context.js:1761:23
exports.makeInfallible/<@resource://devtools/shared/ThreadSafeDevToolsUtils.js:103:22
Line: 670, column: 0
console.error: ({})
JavaScript error: resource://devtools/shared/DevToolsUtils.js, line 670: Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/b31f8585-55eb-4fae-993e-0637e6545dd7'
JavaScript error: resource://devtools/shared/DevToolsUtils.js, line 670: Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/703518f2-0e44-4fa7-8fda-056e24dfc731'
console.warn: "Async method failed in safeAsyncMethod" (new Error("Connection closed, pending request to server0.conn0.child4/domwalker24, type cancelPick fail1687450492998 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51115" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileXamevd"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687450493482 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51115/devtools/browser/44be2243-5f4e-4b1a-9b9e-793f909eb5b6
1687450494706 Marionette INFO Listening on port 51123
1687450494734 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687450804355 Marionette INFO Stopped listening on port 51123
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileXamevd\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687450804629 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51188/devtools/browser/a3878037-b0ca-4217-a088-f448c2fcb69d
1687450681407 Marionette INFO Listening on port 51193
1687450681628 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687450806179 Marionette INFO Stopped listening on port 51193
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
[Parent 8520, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
1687450806439 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687452282130 geckodriver INFO Listening on 127.0.0.1:51476
1687452286160 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51477" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileZ1gqjQ"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687452286803 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51477/devtools/browser/d96c7e14-7dc2-46f0-b4cd-35743e71c63f
1687452288663 Marionette INFO Listening on port 51482
1687452288955 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS" was already initialized
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID" was already initialized
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: ""
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: ""
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS" was already initialized
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID" was already initialized
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: ""
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: ""
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
1687452391476 Marionette INFO Stopped listening on port 51482
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687452391783 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687453361999 geckodriver INFO Listening on 127.0.0.1:51726
1687453364756 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51727" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofilemnOa7V"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687453365268 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51727/devtools/browser/cd3d5b10-01ee-4fb9-9982-649fadd5a624
1687453366340 Marionette INFO Listening on port 51732
1687453366698 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
1687453449959 Marionette INFO Stopped listening on port 51732
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
1687453450550 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687453473007 geckodriver INFO Listening on 127.0.0.1:51777
1687453476313 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51778" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofilewh57OX"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687453476948 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51778/devtools/browser/74e919f9-8fdf-410c-8061-37137c444056
1687453478140 Marionette INFO Listening on port 51783
1687453478502 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
1687453663389 Marionette INFO Stopped listening on port 51783
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
[Parent 4800, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687453663611 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
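
Each block above follows the same lifecycle: geckodriver starts listening, launches Tor Browser's firefox.exe with --marionette and a throwaway rust_mozprofile, the crawl runs, then shutdown produces the "Stopped listening" and "unable to stop listener" lines. For reference, a minimal sketch (not the repository's exact code) of how a crawler_selenium.py presumably drives this with Selenium 4; the geckodriver path, the SOCKS port 9150, and the onion URL are assumptions:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

TOR_FIREFOX = r"C:\Users\Helium\Desktop\Tor Browser\Browser\firefox.exe"  # path seen in the log
GECKODRIVER = r"C:\Users\Helium\Desktop\geckodriver.exe"                  # assumed location

def create_tor_driver():
    opts = Options()
    opts.binary_location = TOR_FIREFOX
    # Route traffic through the Tor SOCKS proxy bundled with Tor Browser
    # (9150 is the bundle's default port; an assumption here).
    opts.set_preference("network.proxy.type", 1)
    opts.set_preference("network.proxy.socks", "127.0.0.1")
    opts.set_preference("network.proxy.socks_port", 9150)
    opts.set_preference("network.proxy.socks_remote_dns", True)
    service = Service(executable_path=GECKODRIVER)
    # Selenium creates the temporary rust_mozprofile profile automatically.
    return webdriver.Firefox(service=service, options=opts)

driver = create_tor_driver()            # geckodriver logs "Listening on 127.0.0.1:<port>"
driver.get("http://example.onion")      # placeholder address, not a real target
driver.quit()                           # triggers the "Stopped listening" shutdown lines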

+ 1
- 1
MarketPlaces/Initialization/marketsList.txt View File

@ -1 +1 @@
MikesGrandStore
AnonymousMarketplace

+ 9
- 0
MarketPlaces/Initialization/markets_mining.py View File

@ -13,6 +13,9 @@ from MarketPlaces.TorBay.crawler_selenium import crawler as crawlerTorBay
from MarketPlaces.LionMarketplace.crawler_selenium import crawler as crawlerLionMarketplace
from MarketPlaces.TorMarket.crawler_selenium import crawler as crawlerTorMarket
from MarketPlaces.MikesGrandStore.crawler_selenium import crawler as crawlerMikesGrandStore
from MarketPlaces.DarkTor.crawler_selenium import crawler as crawlerDarkTor
from MarketPlaces.DigitalThriftShop.crawler_selenium import crawler as crawlerDigitalThriftShop
from MarketPlaces.AnonymousMarketplace.crawler_selenium import crawler as crawlerAnonymousMarketplace
import time
@ -82,6 +85,12 @@ if __name__ == '__main__':
        crawlerTorMarket()
    elif mkt == "MikesGrandStore":
        crawlerMikesGrandStore()
    elif mkt == "DarkTor":
        crawlerDarkTor()
    elif mkt == "DigitalThriftShop":
        crawlerDigitalThriftShop()
    elif mkt == "AnonymousMarketplace":
        crawlerAnonymousMarketplace()
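
The hunk above extends a simple name-to-crawler dispatch: markets_mining.py reads marketplace names from marketsList.txt and calls the matching crawler. A rough sketch of that pattern; the helper getMarkets() and the exact loop shape are assumptions, while the imports and elif branches mirror the diff:

from MarketPlaces.DarkTor.crawler_selenium import crawler as crawlerDarkTor
from MarketPlaces.DigitalThriftShop.crawler_selenium import crawler as crawlerDigitalThriftShop
from MarketPlaces.AnonymousMarketplace.crawler_selenium import crawler as crawlerAnonymousMarketplace

def getMarkets(path="MarketPlaces/Initialization/marketsList.txt"):
    # One marketplace name per line, e.g. "AnonymousMarketplace" after this commit.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

if __name__ == '__main__':
    for mkt in getMarkets():
        if mkt == "DarkTor":
            crawlerDarkTor()
        elif mkt == "DigitalThriftShop":
            crawlerDigitalThriftShop()
        elif mkt == "AnonymousMarketplace":
            crawlerAnonymousMarketplace()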

