
fixed altenens

main
Helium 1 year ago
parent
commit
33a5379c3e
8 changed files with 937 additions and 99 deletions
  1. +1   -1    .gitignore
  2. +19  -18   Forums/Altenens/crawler_selenium.py
  3. +2   -2    Forums/Altenens/parser.py
  4. +1   -1    Forums/Initialization/forumsList.txt
  5. +291 -21   Forums/Initialization/geckodriver.log
  6. +330 -0    Forums/Libre/crawler_selenium.py
  7. +247 -0    Forums/Libre/parser.py
  8. +46  -56   MarketPlaces/TorBay/parser.py

.gitignore  (+1, -1)

@@ -1,6 +1,6 @@
# Default ignored files
/shelf/
workspace.xml
.idea/workspace.xml
selenium/geckodriver.exe
setup.ini
*.html


Forums/Altenens/crawler_selenium.py  (+19, -18)

@@ -1,7 +1,8 @@
__author__ = 'Helium'
'''
Altenens Forum Crawler (Selenium)
Altenens Forum Crawler (Selenium);
Untested due to CAPTCHAs and network blocking
'''
from selenium import webdriver
@@ -36,7 +37,7 @@ def startCrawling():
if driver != 'down':
try:
# login(driver)
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
@@ -60,24 +61,24 @@ def opentor():
# Login using premade account credentials and do login captcha manually
def login(driver):
# #click login button
# login_link = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]').get_attribute('href')
# driver.get(login_link) # open tab with url
#
# #entering username and password into input boxes
# usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[1]/dd/input')
# #Username here
# usernameBox.send_keys('mylittlepony45')#sends string to the username box
# passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[2]/dd/div/div/input')
# #Password here
# passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox
input("Press ENTER when CAPTCHA is completed\n")
#click login button
login_link = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]').get_attribute('href')
driver.get(login_link) # open tab with url
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[1]/dd/input')
#Username here
usernameBox.send_keys('mylittlepony45')#sends string to the username box
passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[2]/dd/div/div/input')
#Password here
passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox
input("Press ENTER when you complete the CAPTCHA and press login\n")
# wait for the listing page to show up (this XPath may need to change based on the seed url)
# wait up to 50 sec until id = tab_content is found, then continue
# WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
# (By.XPATH, '/html/body/div[1]/div[4]/div/div/div[3]/div/div/div[4]/div/div/div[1]/div/div[1]/div[2]/ol/li[1]/a')))
# (By.XPATH, '/html/body/div[1]/div[4]/div/div/div[3]/div/div/div[4]/div/div/div[1]/div/div[1]')))
# Returns the name of the website
@@ -256,8 +257,8 @@ def crawlForum(driver):
#end of loop
for i in range(counter):
driver.back()
# # comment out
# break
# comment out
break
# comment out
if count == 1:
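The net effect of this hunk is that the scripted credential entry is switched back on while the CAPTCHA (and the final login click) stays manual. As a minimal sketch of that pattern outside the crawler (the XPaths, username, and password below are placeholders, not the committed values):

from selenium.webdriver.common.by import By

def manual_captcha_login(driver, login_link_xpath, user_xpath, pass_xpath, username, password):
    # follow the header link to the login form
    login_link = driver.find_element(by=By.XPATH, value=login_link_xpath).get_attribute('href')
    driver.get(login_link)
    # fill the credential boxes automatically
    driver.find_element(by=By.XPATH, value=user_xpath).send_keys(username)
    driver.find_element(by=By.XPATH, value=pass_xpath).send_keys(password)
    # hand over to the operator for the CAPTCHA and the login click
    input("Press ENTER when you complete the CAPTCHA and press login\n")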


Forums/Altenens/parser.py  (+2, -2)

@@ -129,10 +129,10 @@ def altenens_links_parser(soup):
href = []
listing = soup.find('div', {"class": "structItemContainer-group js-threadList"}).find_all('div', {"class": "structItem structItem--thread is-unread js-inlineModContainer js-threadListItem-1843963"})
listing = soup.find_all('div', {"class": "structItem-cell structItem-cell--main"})
for a in listing:
link = a.find('div', {"class": "structItem-title"}).find('a').get('href')
link = a.find('a', {"class": ""}).get('href')
href.append(link)
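The change above drops the selector tied to one specific thread id in favour of the generic structItem-cell--main cells. A quick offline sanity check of the new lookup against a saved listing page could look like this (a sketch only; the file name is a placeholder and the markup is assumed to match what the selector targets):

from bs4 import BeautifulSoup

with open('altenens_listing_sample.html', encoding='utf-8') as f:  # placeholder path to a saved listing page
    soup = BeautifulSoup(f.read(), 'html.parser')

href = []
for cell in soup.find_all('div', {'class': 'structItem-cell structItem-cell--main'}):
    a = cell.find('a', {'class': ''})  # same lookup as the new parser line
    if a is not None:
        href.append(a.get('href'))
print(len(href), 'thread links')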

Forums/Initialization/forumsList.txt  (+1, -1)

@@ -1 +1 @@
Procrax
Altenens

Forums/Initialization/geckodriver.log  (+291, -21)

@@ -10617,8 +10617,69 @@ JavaScript error: https://cardingleaks.ws/js/xenconcept/hidebbcode/message.min.j
JavaScript error: https://cardingleaks.ws/js/xenconcept/hidebbcode/message.min.js?_v=516cdbc2, line 1: TypeError: XF.QuickReply is undefined
JavaScript error: https://cardingleaks.ws/js/xenconcept/hidebbcode/message.min.js?_v=516cdbc2, line 1: TypeError: XF.QuickReply is undefined
JavaScript error: resource:///actors/ContentMetaChild.jsm, line 179: InvalidStateError: JSWindowActorChild.sendAsyncMessage: JSWindowActorChild cannot send at the moment
1689179769906 geckodriver INFO Listening on 127.0.0.1:51129
1689179773543 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51130" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileDq8hqJ"
1689180161238 Marionette INFO Stopped listening on port 50658
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilekL4y3u\thumbnails) because it does not exist
###!!! [Parent][MessageChannel] Error: (msgtype=0x390076,name=PContent::Msg_DestroyBrowsingContextGroup) Closed channel: cannot send/recv
1689180161433 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
DevTools listening on ws://localhost:51130/devtools/browser/87b39fd8-e9cf-4e5c-8a69-4196731c2231
1689179776083 Marionette INFO Listening on port 51146
1689179776371 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource:///modules/FaviconLoader.jsm, line 596: InvalidStateError: JSWindowActorChild.sendAsyncMessage: JSWindowActorChild cannot send at the moment
JavaScript error: resource:///modules/FaviconLoader.jsm, line 596: InvalidStateError: JSWindowActorChild.sendAsyncMessage: JSWindowActorChild cannot send at the moment
1689180162121 Marionette INFO Stopped listening on port 51146
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileDq8hqJ\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1689180162324 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
ctory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51223/devtools/browser/2a3bd08d-43d3-43a7-9990-6cffd4aa8592
1689179897585 Marionette INFO Listening on port 51228
1689179897947 RemoteAgent WARN TLS certificate errors will be ignored for this session
1689179914813 Marionette INFO Stopped listening on port 51228
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
###!!! [Child][MessageChannel] Error: (msgtype=0x590004,name=PHttpChannel::Msg_Cancel) Channel closing: too late to send/recv, messages will be lost
###!!! [Child][MessageChannel] Error: (msgtype=0x590008,name=PHttpChannel::Msg_DeletingChannel) Channel closing: too late to send/recv, messages will be lost
###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1689179915151 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689180110189 geckodriver INFO Listening on 127.0.0.1:51280
1689180114688 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51281" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileJMOfij"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
@@ -10627,21 +10688,81 @@ console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689179774228 Marionette INFO Marionette enabled
1689180115400 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
DevTools listening on ws://localhost:51130/devtools/browser/87b39fd8-e9cf-4e5c-8a69-4196731c2231
1689179776083 Marionette INFO Listening on port 51146
1689179776371 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource:///modules/FaviconLoader.jsm, line 596: InvalidStateError: JSWindowActorChild.sendAsyncMessage: JSWindowActorChild cannot send at the moment
JavaScript error: resource:///modules/FaviconLoader.jsm, line 596: InvalidStateError: JSWindowActorChild.sendAsyncMessage: JSWindowActorChild cannot send at the moment
1689179892485 geckodriver INFO Listening on 127.0.0.1:51222
1689179895575 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51223" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileUnlIAI"
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51281/devtools/browser/3c5f01d6-7eb9-4793-9b53-135aabd01ddd
1689180117797 Marionette INFO Listening on port 51286
1689180118211 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
SourceActor threw an exception: [Exception... "Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/74d6fd32-ea0d-4971-a7c2-b52a97624ba0'" nsresult: "0x805303f4 (<unknown>)" location: "JS frame :: resource://devtools/shared/DevToolsUtils.js :: mainThreadFetch/< :: line 670" data: yes]
Stack: mainThreadFetch/<@resource://devtools/shared/DevToolsUtils.js:670:15
mainThreadFetch@resource://devtools/shared/DevToolsUtils.js:516:10
_fetchURLContents@resource://devtools/server/actors/utils/sources-manager.js:442:22
urlContents@resource://devtools/server/actors/utils/sources-manager.js:406:17
_resurrectSource@resource://devtools/server/actors/thread.js:2142:35
addAllSources@resource://devtools/server/actors/thread.js:1509:14
watch@resource://devtools/server/actors/resources/sources.js:52:17
watchResources@resource://devtools/server/actors/resources/index.js:239:19
_watchTargetResources@resource://devtools/server/actors/targets/target-actor-mixin.js:156:24
addWatcherDataEntry@resource://devtools/server/actors/targets/target-actor-mixin.js:47:20
_addWatcherDataEntry@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:483:24
receiveMessage@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:425:21
Line: 670, column: 0
console.error: ({})
SourceActor threw an exception: [Exception... "Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/cf6221b6-18cd-4bcf-af60-f2867cbd50c1'" nsresult: "0x805303f4 (<unknown>)" location: "JS frame :: resource://devtools/shared/DevToolsUtils.js :: mainThreadFetch/< :: line 670" data: yes]
Stack: mainThreadFetch/<@resource://devtools/shared/DevToolsUtils.js:670:15
mainThreadFetch@resource://devtools/shared/DevToolsUtils.js:516:10
_fetchURLContents@resource://devtools/server/actors/utils/sources-manager.js:442:22
urlContents@resource://devtools/server/actors/utils/sources-manager.js:406:17
_resurrectSource@resource://devtools/server/actors/thread.js:2142:35
addAllSources@resource://devtools/server/actors/thread.js:1509:14
watch@resource://devtools/server/actors/resources/sources.js:52:17
watchResources@resource://devtools/server/actors/resources/index.js:239:19
_watchTargetResources@resource://devtools/server/actors/targets/target-actor-mixin.js:156:24
addWatcherDataEntry@resource://devtools/server/actors/targets/target-actor-mixin.js:47:20
_addWatcherDataEntry@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:483:24
receiveMessage@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:425:21
Line: 670, column: 0
console.error: ({})
JavaScript error: resource://devtools/shared/DevToolsUtils.js, line 670: Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/74d6fd32-ea0d-4971-a7c2-b52a97624ba0'
JavaScript error: resource://devtools/shared/DevToolsUtils.js, line 670: Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/cf6221b6-18cd-4bcf-af60-f2867cbd50c1'
console.error: (new TypeError("container.editor is undefined", "resource://devtools/client/inspector/markup/markup.js", 1619))
JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex
JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex
JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex
JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex
JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex
JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex
JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex
JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex
JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex
JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex
console.warn: "Resource of root-node was not found."
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
1689180410579 Marionette INFO Stopped listening on port 51286
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileJMOfij\thumbnails) because it does not exist
###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1689180410980 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689180448418 geckodriver INFO Listening on 127.0.0.1:51361
1689180451948 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51362" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileBfMnOf"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
@@ -10650,7 +10771,7 @@ console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689179896157 Marionette INFO Marionette enabled
1689180452513 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
@@ -10658,24 +10779,173 @@ JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't fin
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51223/devtools/browser/2a3bd08d-43d3-43a7-9990-6cffd4aa8592
1689179897585 Marionette INFO Listening on port 51228
1689179897947 RemoteAgent WARN TLS certificate errors will be ignored for this session
1689179914813 Marionette INFO Stopped listening on port 51228
DevTools listening on ws://localhost:51362/devtools/browser/eb2fd724-6b62-4272-b880-64fe24b85796
1689180454170 Marionette INFO Listening on port 51367
1689180454226 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
1689180468239 Marionette INFO Stopped listening on port 51367
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileBfMnOf\thumbnails) because it does not exist
###!!! [Child][MessageChannel] Error: (msgtype=0x590004,name=PHttpChannel::Msg_Cancel) Channel closing: too late to send/recv, messages will be lost
###!!! [Parent][MessageChannel] Error: (msgtype=0x390076,name=PContent::Msg_DestroyBrowsingContextGroup) Closed channel: cannot send/recv
###!!! [Child][MessageChannel] Error: (msgtype=0x590008,name=PHttpChannel::Msg_DeletingChannel) Channel closing: too late to send/recv, messages will be lost
###!!! [Child][MessageChannel] Error: (msgtype=0x390143,name=PContent::Msg_AsyncMessage) Channel closing: too late to send/recv, messages will be lost
JavaScript error: chrome://remote/content/marionette/driver.js, line 2326: NS_ERROR_ILLEGAL_VALUE: Component returned failure code: 0x80070057 (NS_ERROR_ILLEGAL_VALUE) [nsIObserverService.removeObserver]
1689180468546 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689180490645 geckodriver INFO Listening on 127.0.0.1:51415
1689180493200 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51416" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofilepO8zPo"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689180493612 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51416/devtools/browser/5ddcd908-4e81-40c1-97c8-d4f4154501b0
1689180494832 Marionette INFO Listening on port 51432
1689180494904 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
1689180505120 Marionette INFO Stopped listening on port 51432
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilepO8zPo\thumbnails) because it does not exist
###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1689179915151 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
1689180505486 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689180540950 geckodriver INFO Listening on 127.0.0.1:51490
1689180545385 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51491" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofile1MaJMH"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689180546062 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51491/devtools/browser/4f6afa29-9a04-4acf-9aaa-6861e3d43365
1689180548320 Marionette INFO Listening on port 51496
1689180548448 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
1689180700106 Marionette INFO Stopped listening on port 51496
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
###!!! [Child][MessageChannel] Error: (msgtype=0x5D0005,name=PImageBridge::Msg_WillClose) Channel error: cannot send/recv
1689180700402 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689180728020 geckodriver INFO Listening on 127.0.0.1:51574
1689180732018 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51575" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileGWyS76"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689180732717 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51575/devtools/browser/3954f2de-919c-463f-a460-3d7733b86d37
1689180734784 Marionette INFO Listening on port 51580
1689180735341 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
1689180856914 Marionette INFO Stopped listening on port 51580
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
[Parent 5764, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
1689180857081 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689180891598 geckodriver INFO Listening on 127.0.0.1:51650
1689180894673 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51651" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofile12mWih"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689180895137 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:51651/devtools/browser/b4589447-7d7d-4ce8-a378-1871d24ce1c8
1689180896726 Marionette INFO Listening on port 51656
1689180896788 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null
1689181002227 Marionette INFO Stopped listening on port 51656
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
1689181002518 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109


Forums/Libre/crawler_selenium.py  (+330, -0)

@@ -0,0 +1,330 @@
__author__ = 'DarkWeb'
'''
Libre Forum Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.Libre.parser import libre_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/'
# Opens Tor Browser, crawls the website
def startCrawling():
opentor()
forumName = getForumName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
new_parse(forumName, baseURL, False)
# Opens Tor Browser
def opentor():
from Forums.Initialization.forums_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Login using premade account credentials and do login captcha manually
def login(driver):
input('Press enter when CAPTCHA is completed, and you\'re at the login page')
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.NAME, value='login')
#Username here
usernameBox.send_keys('ct1234')#sends string to the username box
passwordBox = driver.find_element(by=By.NAME, value='password')
#Password here
passwordBox.send_keys('r5o0wqmw')# sends string to passwordBox
login = driver.find_element(by=By.CLASS_NAME, value='block-container')
login_link = login.find_element(by=By.TAG_NAME, value='button')
login_link.click()
# input('input')
# wait for the listing page to show up (this XPath may need to change based on the seed url)
# wait up to 50 sec until id = tab_content is found, then continue
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.TAG_NAME, 'nav')))
# click link to correct forum board
login_link = driver.find_element(by=By.XPATH, value='/html/body/nav/div[1]/a[3]').get_attribute('href')
driver.get(login_link) # open tab with url
# wait for the listing page to show up (this XPath may need to change based on the seed url)
# wait up to 50 sec until id = tab_content is found, then continue
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '/html/body/div/div/div[3]/div[5]')))
# Returns the name of the website
def getForumName() -> str:
name = 'Libre'
return name
# Return the link of the website
def getFixedURL():
url = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/'
return url
# Closes Tor Browser
def closetor(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close() #close tab
time.sleep(3)
return
# Creates the Firefox 'driver' and configures its 'Profile'
# to use the Tor proxy and socket
def createFFDriver():
from Forums.Initialization.forums_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Saves the crawled html page
def savePage(page, url):
cleanPage = cleanHTML(page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if name == '':
name = str(counter)
counter = counter + 1
return name
def getInterestedLinks():
links = []
# # cyber security
links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/CyberSecurity')
# # services
# links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/Services')
# # programming
# links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/Programming')
return links
def crawlForum(driver):
print("Crawling the Libre forum")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
while has_next_page:
list = topicPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
#variable to check if there is a next page for the topic
# has_next_topic_page = True
# counter = 1
# # check if there is a next page for the topics
# while has_next_topic_page:
# # try to access the next page of the topic
# itemURL = urlparse.urljoin(baseURL, str(item))
# try:
# driver.get(itemURL)
# except:
# driver.refresh()
# savePage(driver.page_source, item)
#
# # if there is a next page then go and save....
# # Spec
# try:
# # temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div[1]/div[1]/div/nav/div[1]') # /html/body/div/div[2]/div/div[2]/div/
# item = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div
#
# if item == "":
# raise NoSuchElementException
# else:
# counter += 1
#
# except NoSuchElementException:
# has_next_topic_page = False
#
# # end of loop
# for i in range(counter):
# driver.back()
# comment out
break
# comment out
if count == 1:
count = 0
break
try:
# temp = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div/div[1]/div/nav/div[1]')
link = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling Libre forum done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is a Topic link; may need to change for every website
def isDescriptionLink(url):
if '/p/' in url:
return True
return False
# Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url):
if '/c/' in url:
return True
return False
# calling the parser to define the links
def topicPages(html):
soup = BeautifulSoup(html, "html.parser")
return libre_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

Forums/Libre/parser.py  (+247, -0)

@@ -0,0 +1,247 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import date
from datetime import timedelta
import re
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def cryptBB_description_parser(soup):
# Fields to be parsed
topic = "-1" # 0 *topic name
user = [] # 1 *all users of each post
status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
reputation = [] # 3 all user's karma in each post (usually found as a number)
interest = [] # 4 all user's interest in each post
sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
# Finding the topic (should be just one coming from the Listing Page)
li = soup.find("td", {"class": "thead"}).find('strong')
topic = li.text
topic = re.sub("\[\w*\]", '', topic)
topic = topic.replace(",","")
topic = topic.replace("\n","")
topic = cleanString(topic.strip())
# Finding the repeated tag that corresponds to the listing of posts
# try:
posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
'div', {"class": "post"})
# For each message (post), get all the fields we are interested to:
for ipost in posts:
# Finding a first level of the HTML page
post_wrapper = ipost.find('span', {"class": "largetext"})
# Finding the author (user) of the post
author = post_wrapper.text.strip()
user.append(cleanString(author)) # Remember to clean the problematic characters
# Finding the status of the author
smalltext = ipost.find('div', {"class": "post_author"})
'''
# Testing here two possibilities to find this status and combine them
if ipost.find('div', {"class": "deleted_post_author"}):
status.append(-1)
interest.append(-1)
reputation.append(-1)
addDate.append(-1)
post.append("THIS POST HAS BEEN REMOVED!")
sign.append(-1)
feedback.append(-1)
continue
'''
# CryptBB does have membergroup and postgroup
membergroup = smalltext.find('div', {"class": "profile-rank"})
postgroup = smalltext.find('div', {"class": "postgroup"})
if membergroup != None:
membergroup = membergroup.text.strip()
if postgroup != None:
postgroup = postgroup.text.strip()
membergroup = membergroup + " - " + postgroup
else:
if postgroup != None:
membergroup = postgroup.text.strip()
else:
membergroup = "-1"
status.append(cleanString(membergroup))
# Finding the interest of the author
# CryptBB does not have blurb
blurb = smalltext.find('li', {"class": "blurb"})
if blurb != None:
blurb = blurb.text.strip()
else:
blurb = "-1"
interest.append(cleanString(blurb))
# Finding the reputation of the user
# CryptBB does have reputation
author_stats = smalltext.find('div', {"class": "author_statistics"})
karma = author_stats.find('strong')
if karma != None:
karma = karma.text
karma = karma.replace("Community Rating: ", "")
karma = karma.replace("Karma: ", "")
karma = karma.strip()
else:
karma = "-1"
reputation.append(cleanString(karma))
# Getting here another good tag to find the post date, post content and users' signature
postarea = ipost.find('div', {"class": "post_content"})
dt = postarea.find('span', {"class": "post_date"}).text
# dt = dt.strip().split()
dt = dt.strip()
day=date.today()
if "Yesterday" in dt:
yesterday = day - timedelta(days=1)
yesterday = yesterday.strftime('%m-%d-%Y')
stime = dt.replace('Yesterday,','').strip()
date_time_obj = yesterday+ ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
elif "hours ago" in dt:
day = day.strftime('%m-%d-%Y')
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
else:
date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
stime = date_time_obj.strftime('%b %d, %Y')
sdate = date_time_obj.strftime('%I:%M %p')
addDate.append(date_time_obj)
# Finding the post
inner = postarea.find('div', {"class": "post_body scaleimages"})
inner = inner.text.strip()
post.append(cleanString(inner))
# Finding the user's signature
# signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
signature = ipost.find('div', {"class": "signature scaleimages"})
if signature != None:
signature = signature.text.strip()
# print(signature)
else:
signature = "-1"
sign.append(cleanString(signature))
# As no information about user's feedback was found, just assign "-1" to the variable
feedback.append("-1")
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
# Sending the results
return row
# This is the method to parse the Listing Pages (one page with many posts)
def cryptBB_listing_parser(soup):
nm = 0 # *this variable should receive the number of topics
forum = "OnniForums" # 0 *forum name
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
author = [] # 2 *all authors of each topic
topic = [] # 3 *all topics
views = [] # 4 number of views of each topic
posts = [] # 5 number of posts of each topic
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
# Finding the board (should be just one)
board = soup.find('span', {"class": "active"}).text
board = cleanString(board.strip())
# Finding the repeated tag that corresponds to the listing of topics
itopics = soup.find_all('tr',{"class": "inline_row"})
for itopic in itopics:
# For each topic found, the structure to get the rest of the information can be of two types. Testing all of them
# so we don't miss any topic
# Adding the topic to the topic list
try:
topics = itopic.find('span', {"class": "subject_old"}).find('a').text
except:
topics = itopic.find('span', {"class": "subject_new"}).find('a').text
topics = re.sub("\[\w*\]", '', topics)
topic.append(cleanString(topics))
# Counting how many topics we have found so far
nm = len(topic)
# Adding the url to the list of urls
try:
link = itopic.find('span', {"class": "subject_old"}).find('a').get('href')
except:
link = itopic.find('span',{"class": "subject_new"}).find('a').get('href')
href.append(link)
# Finding the author of the topic
ps = itopic.find('div', {"class":"author smalltext"}).find('a').text
user = ps.strip()
author.append(cleanString(user))
# Finding the number of replies
columns = itopic.findChildren('td',recursive=False)
replies = columns[3].text
posts.append(cleanString(replies))
# Finding the number of Views
tview = columns[4].text
views.append(cleanString(tview))
# If no information about when the topic was added, just assign "-1" to the variable
addDate.append("-1")
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
def libre_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find_all('div', {"class": "flex-grow p-2 text-justify"})
for a in listing:
link = a.find('div', {'class': 'flex space-x-2 items-center'}).find('a').get('href')
href.append(link)
return href
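The description parser above (still carrying its CryptBB-era function names) normalizes relative timestamps such as 'Yesterday, 10:32 PM' or 'x hours ago' before appending them to addDate. Its core logic, pulled out as a standalone sketch (the sample strings are illustrative):

from datetime import datetime, date, timedelta

def normalize_post_date(dt, title_date=None):
    dt = dt.strip()
    if 'Yesterday' in dt:
        yesterday = (date.today() - timedelta(days=1)).strftime('%m-%d-%Y')
        stime = dt.replace('Yesterday,', '').strip()
        return datetime.strptime(yesterday + ', ' + stime, '%m-%d-%Y, %I:%M %p')
    elif 'hours ago' in dt:
        # in the real parser the absolute time comes from the span's title attribute
        return datetime.strptime(title_date, '%m-%d-%Y, %I:%M %p')
    return datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')

print(normalize_post_date('Yesterday, 10:32 PM'))
print(normalize_post_date('07-12-2023, 09:05 AM'))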

MarketPlaces/TorBay/parser.py  (+46, -56)

@@ -14,30 +14,25 @@ from bs4 import BeautifulSoup
def torbay_description_parser(soup):
# Fields to be parsed
name = "-1" # 0 Product_Name
describe = "-1" # 1 Product_Description
lastSeen = "-1" # 2 Product_LastViewDate
rules = "-1" # 3 NOT USED ...
CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 5 Product_MS_Classification (Microsoft Security)
review = "-1" # 6 Product_Number_Of_Reviews
category = "-1" # 7 Product_Category
shipFrom = "-1" # 8 Product_ShippedFrom
shipTo = "-1" # 9 Product_ShippedTo
left = "-1" # 10 Product_QuantityLeft
escrow = "-1" # 11 Vendor_Warranty
terms = "-1" # 12 Vendor_TermsAndConditions
vendor = "-1" # 13 Vendor_Name
sold = "-1" # 14 Product_QuantitySold
addDate = "-1" # 15 Product_AddedDate
available = "-1" # 16 NOT USED ...
endDate = "-1" # 17 NOT USED ...
BTC = "-1" # 18 Product_BTC_SellingPrice
USD = "-1" # 19 Product_USD_SellingPrice
rating = "-1" # 20 Vendor_Rating
success = "-1" # 21 Vendor_Successful_Transactions
EURO = "-1" # 22 Product_EURO_SellingPrice
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
# Finding Product Name
name = soup.find('div', {'class': 'product-information'}).find('h1').text.strip()
@@ -46,7 +41,7 @@ def torbay_description_parser(soup):
vendor = soup.find('div', {"class": "profile-info"}).find('a').text.strip()
# Finding Vendor Rating
rating.append(-1)
rating_vendor.append(-1)
# Finding Successful Transactions
success.append(-1)
@@ -57,9 +52,6 @@ def torbay_description_parser(soup):
USD = soup.find('div', {'class': "total-price"}).find('span').text.strip()
# Finding Escrow NEED WORK
escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Category
category = soup.find('div', {'class': "profile-info"}).find('p').find('a').text.strip()
@@ -127,8 +119,8 @@ def torbay_description_parser(soup):
MS = MS.replace('\n', '')
# Populating the final variable (this should be a list with all fields scraped)
row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
sold, addDate, available, endDate, BTC, USD, rating, success, EURO)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo)
# Sending the results
return row
@@ -141,29 +133,28 @@ def torbay_description_parser(soup):
def torbay_listing_parser(soup):
# Fields to be parsed
nm = 0 # Total_Products (Should be Integer)
mktName = "TorBay" # 0 Marketplace_Name
name = [] # 1 Product_Name
CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 3 Product_MS_Classification (Microsoft Security)
category = [] # 4 Product_Category
describe = [] # 5 Product_Description
escrow = [] # 6 Vendor_Warranty
views = [] # 7 Product_Number_Of_Views
reviews = [] # 8 Product_Number_Of_Reviews
addDate = [] # 9 Product_AddDate
lastSeen = [] # 10 Product_LastViewDate
BTC = [] # 11 Product_BTC_SellingPrice
USD = [] # 12 Product_USD_SellingPrice
EURO = [] # 13 Product_EURO_SellingPrice
sold = [] # 14 Product_QuantitySold
qLeft =[] # 15 Product_QuantityLeft
shipFrom = [] # 16 Product_ShippedFrom
shipTo = [] # 17 Product_ShippedTo
vendor = [] # 18 Vendor
rating = [] # 19 Vendor_Rating
success = [] # 20 Vendor_Successful_Transactions
href = [] # 23 Product_Links (Urls)
nm = 0 # *Total_Products (Should be Integer)
mktName = "TorBay" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) don't worry about this
MS = [] # 6 Product_MS_Classification (Microsoft Security) don't worry about this
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft = [] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
listing = soup.findAll('div', {"class": "product-card"})
@@ -260,9 +251,8 @@ def torbay_listing_parser(soup):
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
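For reference, the reworked TorBay parsers now emit fields in the shared vendor-first order used by the other marketplace modules. Unpacking the 19-element description row into a dict is an easy way to keep the positions straight (a sketch; the field names simply mirror the comments above and the example values are placeholders):

TORBAY_DESCRIPTION_FIELDS = (
    'vendor', 'rating_vendor', 'success', 'name', 'describe', 'CVE', 'MS',
    'category', 'views', 'reviews', 'rating_item', 'addDate', 'BTC', 'USD',
    'EURO', 'sold', 'left', 'shipFrom', 'shipTo',
)

def row_to_dict(row):
    # pair each positional value with its field name
    return dict(zip(TORBAY_DESCRIPTION_FIELDS, row))

example = row_to_dict(('someVendor', '-1', '-1', 'someProduct') + ('-1',) * 15)
print(example['name'], example['USD'])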

