diff --git a/.gitignore b/.gitignore index 7ea5cec..fe0eeb1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ # Default ignored files /shelf/ -workspace.xml +.idea/workspace.xml selenium/geckodriver.exe setup.ini *.html diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py index 0dadd2d..b1c110a 100644 --- a/Forums/Altenens/crawler_selenium.py +++ b/Forums/Altenens/crawler_selenium.py @@ -1,7 +1,8 @@ __author__ = 'Helium' ''' -Altenens Forum Crawler (Selenium) +Altenens Forum Crawler (Selenium); +Untested due to CAPTCHAS and blocking the network ''' from selenium import webdriver @@ -36,7 +37,7 @@ def startCrawling(): if driver != 'down': try: - # login(driver) + login(driver) crawlForum(driver) except Exception as e: print(driver.current_url, e) @@ -60,24 +61,24 @@ def opentor(): # Login using premade account credentials and do login captcha manually def login(driver): - # #click login button - # login_link = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]').get_attribute('href') - # driver.get(login_link) # open tab with url - # - # #entering username and password into input boxes - # usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[1]/dd/input') - # #Username here - # usernameBox.send_keys('mylittlepony45')#sends string to the username box - # passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[2]/dd/div/div/input') - # #Password here - # passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox - - input("Press ENTER when CAPTCHA is completed\n") + #click login button + login_link = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]').get_attribute('href') + driver.get(login_link) # open tab with url + + #entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[1]/dd/input') + #Username here + usernameBox.send_keys('mylittlepony45')#sends string to the username box + passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[2]/dd/div/div/input') + #Password here + passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox + + input("Press ENTER when you complete the CAPTCHA and press login\n") # wait for listing page show up (This Xpath may need to change based on different seed url) # wait for 50 sec until id = tab_content is found, then cont # WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - # (By.XPATH, '/html/body/div[1]/div[4]/div/div/div[3]/div/div/div[4]/div/div/div[1]/div/div[1]/div[2]/ol/li[1]/a'))) + # (By.XPATH, '/html/body/div[1]/div[4]/div/div/div[3]/div/div/div[4]/div/div/div[1]/div/div[1]'))) # Returns the name of the website @@ -256,8 +257,8 @@ def crawlForum(driver): #end of loop for i in range(counter): driver.back() - # # comment out - # break + # comment out + break # comment out if count == 1: diff --git a/Forums/Altenens/parser.py b/Forums/Altenens/parser.py index 8a3c3e7..5c5effd 100644 --- a/Forums/Altenens/parser.py +++ b/Forums/Altenens/parser.py @@ -129,10 +129,10 @@ def altenens_links_parser(soup): href = [] - listing = soup.find('div', {"class": "structItemContainer-group js-threadList"}).find_all('div', {"class": "structItem structItem--thread is-unread js-inlineModContainer js-threadListItem-1843963"}) + listing = soup.find_all('div', {"class": "structItem-cell structItem-cell--main"}) for a in listing: - link = a.find('div', {"class": "structItem-title"}).find('a').get('href') + link = a.find('a', {"class": ""}).get('href') href.append(link) diff --git a/Forums/Initialization/forumsList.txt b/Forums/Initialization/forumsList.txt index d3ba91a..801a104 100644 --- a/Forums/Initialization/forumsList.txt +++ b/Forums/Initialization/forumsList.txt @@ -1 +1 @@ -Procrax \ No newline at end of file +Altenens \ No newline at end of file diff --git a/Forums/Initialization/geckodriver.log b/Forums/Initialization/geckodriver.log index e737acf..80a0c5a 100644 --- a/Forums/Initialization/geckodriver.log +++ b/Forums/Initialization/geckodriver.log @@ -10617,8 +10617,69 @@ JavaScript error: https://cardingleaks.ws/js/xenconcept/hidebbcode/message.min.j JavaScript error: https://cardingleaks.ws/js/xenconcept/hidebbcode/message.min.js?_v=516cdbc2, line 1: TypeError: XF.QuickReply is undefined JavaScript error: https://cardingleaks.ws/js/xenconcept/hidebbcode/message.min.js?_v=516cdbc2, line 1: TypeError: XF.QuickReply is undefined JavaScript error: resource:///actors/ContentMetaChild.jsm, line 179: InvalidStateError: JSWindowActorChild.sendAsyncMessage: JSWindowActorChild cannot send at the moment -1689179769906 geckodriver INFO Listening on 127.0.0.1:51129 -1689179773543 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51130" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileDq8hqJ" +1689180161238 Marionette INFO Stopped listening on port 50658 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilekL4y3u\thumbnails) because it does not exist + +###!!! [Parent][MessageChannel] Error: (msgtype=0x390076,name=PContent::Msg_DestroyBrowsingContextGroup) Closed channel: cannot send/recv + +1689180161433 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +DevTools listening on ws://localhost:51130/devtools/browser/87b39fd8-e9cf-4e5c-8a69-4196731c2231 +1689179776083 Marionette INFO Listening on port 51146 +1689179776371 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: resource:///modules/FaviconLoader.jsm, line 596: InvalidStateError: JSWindowActorChild.sendAsyncMessage: JSWindowActorChild cannot send at the moment +JavaScript error: resource:///modules/FaviconLoader.jsm, line 596: InvalidStateError: JSWindowActorChild.sendAsyncMessage: JSWindowActorChild cannot send at the moment +1689180162121 Marionette INFO Stopped listening on port 51146 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileDq8hqJ\thumbnails) because it does not exist + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1689180162324 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +ctory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:51223/devtools/browser/2a3bd08d-43d3-43a7-9990-6cffd4aa8592 +1689179897585 Marionette INFO Listening on port 51228 +1689179897947 RemoteAgent WARN TLS certificate errors will be ignored for this session +1689179914813 Marionette INFO Stopped listening on port 51228 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished + +###!!! [Child][MessageChannel] Error: (msgtype=0x590004,name=PHttpChannel::Msg_Cancel) Channel closing: too late to send/recv, messages will be lost + + +###!!! [Child][MessageChannel] Error: (msgtype=0x590008,name=PHttpChannel::Msg_DeletingChannel) Channel closing: too late to send/recv, messages will be lost + + +###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1689179915151 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689180110189 geckodriver INFO Listening on 127.0.0.1:51280 +1689180114688 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51281" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileJMOfij" console.log: "TorSettings: loadFromPrefs()" console.log: "TorConnect: init()" console.log: "TorConnect: Entering Initial state" @@ -10627,21 +10688,81 @@ console.log: "TorConnect: Observing topic 'TorProcessExited'" console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" console.log: "TorConnect: Observing topic 'torsettings:ready'" console.log: "TorSettings: Observed profile-after-change" -1689179774228 Marionette INFO Marionette enabled +1689180115400 Marionette INFO Marionette enabled console.log: "TorConnect: Will load after bootstrap => [about:blank]" console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" -JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. -DevTools listening on ws://localhost:51130/devtools/browser/87b39fd8-e9cf-4e5c-8a69-4196731c2231 -1689179776083 Marionette INFO Listening on port 51146 -1689179776371 RemoteAgent WARN TLS certificate errors will be ignored for this session -JavaScript error: resource:///modules/FaviconLoader.jsm, line 596: InvalidStateError: JSWindowActorChild.sendAsyncMessage: JSWindowActorChild cannot send at the moment -JavaScript error: resource:///modules/FaviconLoader.jsm, line 596: InvalidStateError: JSWindowActorChild.sendAsyncMessage: JSWindowActorChild cannot send at the moment -1689179892485 geckodriver INFO Listening on 127.0.0.1:51222 -1689179895575 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51223" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileUnlIAI" +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:51281/devtools/browser/3c5f01d6-7eb9-4793-9b53-135aabd01ddd +1689180117797 Marionette INFO Listening on port 51286 +1689180118211 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +SourceActor threw an exception: [Exception... "Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/74d6fd32-ea0d-4971-a7c2-b52a97624ba0'" nsresult: "0x805303f4 ()" location: "JS frame :: resource://devtools/shared/DevToolsUtils.js :: mainThreadFetch/< :: line 670" data: yes] +Stack: mainThreadFetch/<@resource://devtools/shared/DevToolsUtils.js:670:15 +mainThreadFetch@resource://devtools/shared/DevToolsUtils.js:516:10 +_fetchURLContents@resource://devtools/server/actors/utils/sources-manager.js:442:22 +urlContents@resource://devtools/server/actors/utils/sources-manager.js:406:17 +_resurrectSource@resource://devtools/server/actors/thread.js:2142:35 +addAllSources@resource://devtools/server/actors/thread.js:1509:14 +watch@resource://devtools/server/actors/resources/sources.js:52:17 +watchResources@resource://devtools/server/actors/resources/index.js:239:19 +_watchTargetResources@resource://devtools/server/actors/targets/target-actor-mixin.js:156:24 +addWatcherDataEntry@resource://devtools/server/actors/targets/target-actor-mixin.js:47:20 +_addWatcherDataEntry@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:483:24 +receiveMessage@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:425:21 +Line: 670, column: 0 +console.error: ({}) +SourceActor threw an exception: [Exception... "Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/cf6221b6-18cd-4bcf-af60-f2867cbd50c1'" nsresult: "0x805303f4 ()" location: "JS frame :: resource://devtools/shared/DevToolsUtils.js :: mainThreadFetch/< :: line 670" data: yes] +Stack: mainThreadFetch/<@resource://devtools/shared/DevToolsUtils.js:670:15 +mainThreadFetch@resource://devtools/shared/DevToolsUtils.js:516:10 +_fetchURLContents@resource://devtools/server/actors/utils/sources-manager.js:442:22 +urlContents@resource://devtools/server/actors/utils/sources-manager.js:406:17 +_resurrectSource@resource://devtools/server/actors/thread.js:2142:35 +addAllSources@resource://devtools/server/actors/thread.js:1509:14 +watch@resource://devtools/server/actors/resources/sources.js:52:17 +watchResources@resource://devtools/server/actors/resources/index.js:239:19 +_watchTargetResources@resource://devtools/server/actors/targets/target-actor-mixin.js:156:24 +addWatcherDataEntry@resource://devtools/server/actors/targets/target-actor-mixin.js:47:20 +_addWatcherDataEntry@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:483:24 +receiveMessage@resource://devtools/server/connectors/js-window-actor/DevToolsFrameChild.jsm:425:21 +Line: 670, column: 0 +console.error: ({}) +JavaScript error: resource://devtools/shared/DevToolsUtils.js, line 670: Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/74d6fd32-ea0d-4971-a7c2-b52a97624ba0' +JavaScript error: resource://devtools/shared/DevToolsUtils.js, line 670: Failed to open input source 'blob:moz-extension://3064b8c5-bffd-4bf8-b2f1-210b12185538/cf6221b6-18cd-4bcf-af60-f2867cbd50c1' +console.error: (new TypeError("container.editor is undefined", "resource://devtools/client/inspector/markup/markup.js", 1619)) +JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex +JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex +JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex +JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex +JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex +JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex +JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex +JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex +JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex +JavaScript error: resource://gre/actors/AutoCompleteChild.jsm, line 125: Error: Invalid autocomplete selectedIndex +console.warn: "Resource of root-node was not found." +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +1689180410579 Marionette INFO Stopped listening on port 51286 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileJMOfij\thumbnails) because it does not exist + +###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1689180410980 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689180448418 geckodriver INFO Listening on 127.0.0.1:51361 +1689180451948 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51362" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileBfMnOf" console.log: "TorSettings: loadFromPrefs()" console.log: "TorConnect: init()" console.log: "TorConnect: Entering Initial state" @@ -10650,7 +10771,7 @@ console.log: "TorConnect: Observing topic 'TorProcessExited'" console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" console.log: "TorConnect: Observing topic 'torsettings:ready'" console.log: "TorSettings: Observed profile-after-change" -1689179896157 Marionette INFO Marionette enabled +1689180452513 Marionette INFO Marionette enabled console.log: "TorConnect: Will load after bootstrap => [about:blank]" console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. @@ -10658,24 +10779,173 @@ JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't fin JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined -DevTools listening on ws://localhost:51223/devtools/browser/2a3bd08d-43d3-43a7-9990-6cffd4aa8592 -1689179897585 Marionette INFO Listening on port 51228 -1689179897947 RemoteAgent WARN TLS certificate errors will be ignored for this session -1689179914813 Marionette INFO Stopped listening on port 51228 +DevTools listening on ws://localhost:51362/devtools/browser/eb2fd724-6b62-4272-b880-64fe24b85796 +1689180454170 Marionette INFO Listening on port 51367 +1689180454226 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +1689180468239 Marionette INFO Stopped listening on port 51367 JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] !!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData] JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. -JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileBfMnOf\thumbnails) because it does not exist -###!!! [Child][MessageChannel] Error: (msgtype=0x590004,name=PHttpChannel::Msg_Cancel) Channel closing: too late to send/recv, messages will be lost +###!!! [Parent][MessageChannel] Error: (msgtype=0x390076,name=PContent::Msg_DestroyBrowsingContextGroup) Closed channel: cannot send/recv -###!!! [Child][MessageChannel] Error: (msgtype=0x590008,name=PHttpChannel::Msg_DeletingChannel) Channel closing: too late to send/recv, messages will be lost +###!!! [Child][MessageChannel] Error: (msgtype=0x390143,name=PContent::Msg_AsyncMessage) Channel closing: too late to send/recv, messages will be lost +JavaScript error: chrome://remote/content/marionette/driver.js, line 2326: NS_ERROR_ILLEGAL_VALUE: Component returned failure code: 0x80070057 (NS_ERROR_ILLEGAL_VALUE) [nsIObserverService.removeObserver] +1689180468546 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689180490645 geckodriver INFO Listening on 127.0.0.1:51415 +1689180493200 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51416" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofilepO8zPo" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689180493612 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:51416/devtools/browser/5ddcd908-4e81-40c1-97c8-d4f4154501b0 +1689180494832 Marionette INFO Listening on port 51432 +1689180494904 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +1689180505120 Marionette INFO Stopped listening on port 51432 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilepO8zPo\thumbnails) because it does not exist -###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost -1689179915151 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +1689180505486 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689180540950 geckodriver INFO Listening on 127.0.0.1:51490 +1689180545385 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51491" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofile1MaJMH" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689180546062 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:51491/devtools/browser/4f6afa29-9a04-4acf-9aaa-6861e3d43365 +1689180548320 Marionette INFO Listening on port 51496 +1689180548448 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +1689180700106 Marionette INFO Stopped listening on port 51496 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished + +###!!! [Child][MessageChannel] Error: (msgtype=0x5D0005,name=PImageBridge::Msg_WillClose) Channel error: cannot send/recv + +1689180700402 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689180728020 geckodriver INFO Listening on 127.0.0.1:51574 +1689180732018 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51575" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileGWyS76" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689180732717 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:51575/devtools/browser/3954f2de-919c-463f-a460-3d7733b86d37 +1689180734784 Marionette INFO Listening on port 51580 +1689180735341 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +1689180856914 Marionette INFO Stopped listening on port 51580 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished +[Parent 5764, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167 +1689180857081 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689180891598 geckodriver INFO Listening on 127.0.0.1:51650 +1689180894673 mozrunner::runner INFO Running command: "C:\\\\Users\\\\Helium\\\\Desktop\\\\Tor Browser\\\\Browser\\\\firefox.exe" "--marionette" "--remote-debugging-port" "51651" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofile12mWih" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689180895137 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:51651/devtools/browser/b4589447-7d7d-4ce8-a378-1871d24ce1c8 +1689180896726 Marionette INFO Listening on port 51656 +1689180896788 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +JavaScript error: https://altenens.is/js/xenmake/headroom.min.js, line 155: TypeError: this.elem is null +1689181002227 Marionette INFO Stopped listening on port 51656 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished +1689181002518 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 unwatchForTabs()@TargetList.jsm:70 unwatchForTargets()@TargetList.jsm:37 destructor()@TargetList.jsm:109 diff --git a/Forums/Libre/crawler_selenium.py b/Forums/Libre/crawler_selenium.py new file mode 100644 index 0000000..59cea94 --- /dev/null +++ b/Forums/Libre/crawler_selenium.py @@ -0,0 +1,330 @@ +__author__ = 'DarkWeb' + +''' +Libre Forum Crawler (Selenium) +''' + +from selenium import webdriver +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver.firefox.firefox_profile import FirefoxProfile +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from selenium.webdriver.firefox.service import Service +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait + +from PIL import Image +import urllib.parse as urlparse +import os, re, time +import subprocess +from bs4 import BeautifulSoup +from Forums.Initialization.prepare_parser import new_parse +from Forums.Libre.parser import libre_links_parser +from Forums.Utilities.utilities import cleanHTML + +counter = 1 +baseURL = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/' + + +# Opens Tor Browser, crawls the website +def startCrawling(): + opentor() + forumName = getForumName() + driver = getAccess() + + if driver != 'down': + try: + login(driver) + crawlForum(driver) + except Exception as e: + print(driver.current_url, e) + closetor(driver) + + new_parse(forumName, baseURL, False) + + +# Opens Tor Browser +def opentor(): + from Forums.Initialization.forums_mining import config + + global pid + print("Connecting Tor...") + pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) + pid = pro.pid + time.sleep(7.5) + input('Tor Connected. Press ENTER to continue\n') + return + + +# Login using premade account credentials and do login captcha manually +def login(driver): + + input('Press enter when CAPTCHA is completed, and you\'re at the login page') + + #entering username and password into input boxes + usernameBox = driver.find_element(by=By.NAME, value='login') + #Username here + usernameBox.send_keys('ct1234')#sends string to the username box + passwordBox = driver.find_element(by=By.NAME, value='password') + #Password here + passwordBox.send_keys('r5o0wqmw')# sends string to passwordBox + + login = driver.find_element(by=By.CLASS_NAME, value='block-container') + login_link = login.find_element(by=By.TAG_NAME, value='button') + login_link.click() + + # input('input') + + # wait for listing page show up (This Xpath may need to change based on different seed url) + # wait for 50 sec until id = tab_content is found, then cont + WebDriverWait(driver, 50).until(EC.visibility_of_element_located( + (By.TAG_NAME, 'nav'))) + + # click link to correct forum board + login_link = driver.find_element(by=By.XPATH, value='/html/body/nav/div[1]/a[3]').get_attribute('href') + driver.get(login_link) # open tab with url + + # wait for listing page show up (This Xpath may need to change based on different seed url) + # wait for 50 sec until id = tab_content is found, then cont + WebDriverWait(driver, 50).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/div/div/div[3]/div[5]'))) + + +# Returns the name of the website +def getForumName() -> str: + name = 'Libre' + return name + + +# Return the link of the website +def getFixedURL(): + url = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/' + return url + + +# Closes Tor Browser +def closetor(driver): + # global pid + # os.system("taskkill /pid " + str(pro.pid)) + # os.system("taskkill /t /f /im tor.exe") + print('Closing Tor...') + driver.close() #close tab + time.sleep(3) + return + + +# Creates FireFox 'driver' and configure its 'Profile' +# to use Tor proxy and socket +def createFFDriver(): + from Forums.Initialization.forums_mining import config + + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) + + ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) + ff_prof.set_preference("places.history.enabled", False) + ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) + ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) + ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) + ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) + ff_prof.set_preference("signon.rememberSignons", False) + ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + ff_prof.set_preference("network.dns.disablePrefetch", True) + ff_prof.set_preference("network.http.sendRefererHeader", 0) + ff_prof.set_preference("permissions.default.image", 3) + ff_prof.set_preference("browser.download.folderList", 2) + ff_prof.set_preference("browser.download.manager.showWhenStarting", False) + ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") + ff_prof.set_preference('network.proxy.type', 1) + ff_prof.set_preference("network.proxy.socks_version", 5) + ff_prof.set_preference('network.proxy.socks', '127.0.0.1') + ff_prof.set_preference('network.proxy.socks_port', 9150) + ff_prof.set_preference('network.proxy.socks_remote_dns', True) + ff_prof.set_preference("javascript.enabled", True) + ff_prof.update_preferences() + + service = Service(config.get('TOR', 'geckodriver_path')) + + driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + + return driver + + +def getAccess(): + url = getFixedURL() + driver = createFFDriver() + try: + driver.get(url) + return driver + except: + driver.close() + return 'down' + + +# Saves the crawled html page +def savePage(page, url): + cleanPage = cleanHTML(page) + filePath = getFullPathName(url) + os.makedirs(os.path.dirname(filePath), exist_ok=True) + open(filePath, 'wb').write(cleanPage.encode('utf-8')) + return + + +# Gets the full path of the page to be saved along with its appropriate file name +def getFullPathName(url): + from Forums.Initialization.forums_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") + fileName = getNameFromURL(url) + if isDescriptionLink(url): + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') + else: + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') + return fullPath + + +# Creates the file name from passed URL +def getNameFromURL(url): + global counter + name = ''.join(e for e in url if e.isalnum()) + if name == '': + name = str(counter) + counter = counter + 1 + return name + + +def getInterestedLinks(): + links = [] + + # # cyber security + links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/CyberSecurity') + # # services + # links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/Services') + # # programming + # links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/Programming') + + return links + + +def crawlForum(driver): + print("Crawling the Libre forum") + + linksToCrawl = getInterestedLinks() + visited = set(linksToCrawl) + initialTime = time.time() + + i = 0 + count = 0 + while i < len(linksToCrawl): + link = linksToCrawl[i] + print('Crawling :', link) + try: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + has_next_page = True + + while has_next_page: + list = topicPages(html) + for item in list: + itemURL = urlparse.urljoin(baseURL, str(item)) + try: + driver.get(itemURL) + except: + driver.refresh() + savePage(driver.page_source, item) + driver.back() + + #variable to check if there is a next page for the topic + # has_next_topic_page = True + # counter = 1 + + # # check if there is a next page for the topics + # while has_next_topic_page: + # # try to access next page of th topic + # itemURL = urlparse.urljoin(baseURL, str(item)) + # try: + # driver.get(itemURL) + # except: + # driver.refresh() + # savePage(driver.page_source, item) + # + # # if there is a next page then go and save.... + # # Spec + # try: + # # temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div[1]/div[1]/div/nav/div[1]') # /html/body/div/div[2]/div/div[2]/div/ + # item = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div + # + # if item == "": + # raise NoSuchElementException + # else: + # counter += 1 + # + # except NoSuchElementException: + # has_next_topic_page = False + # + # # end of loop + # for i in range(counter): + # driver.back() + + # comment out + break + + # comment out + if count == 1: + count = 0 + break + + try: + # temp = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[4]/div/div[5]/div[2]/div/div/div[1]/div/nav/div[1]') + link = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href') + + if link == "": + raise NoSuchElementException + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + count += 1 + + except NoSuchElementException: + has_next_page = False + + except Exception as e: + print(link, e) + i += 1 + + # finalTime = time.time() + # print finalTime - initialTime + + input("Crawling Libre forum done successfully. Press ENTER to continue\n") + + +# Returns 'True' if the link is Topic link, may need to change for every website +def isDescriptionLink(url): + if '/p/' in url: + return True + return False + + +# Returns True if the link is a listingPage link, may need to change for every website +def isListingLink(url): + if '/c/' in url: + return True + return False + + +# calling the parser to define the links +def topicPages(html): + soup = BeautifulSoup(html, "html.parser") + return libre_links_parser(soup) + + +def crawler(): + startCrawling() + # print("Crawling and Parsing BestCardingWorld .... DONE!") diff --git a/Forums/Libre/parser.py b/Forums/Libre/parser.py new file mode 100644 index 0000000..093c671 --- /dev/null +++ b/Forums/Libre/parser.py @@ -0,0 +1,247 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +from Forums.Utilities.utilities import * +from datetime import date +from datetime import timedelta +import re + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + +# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) + + +def cryptBB_description_parser(soup): + + # Fields to be parsed + + topic = "-1" # 0 *topic name + user = [] # 1 *all users of each post + status = [] # 2 all user's authority in each post such as (adm, member, dangerous) + reputation = [] # 3 all user's karma in each post (usually found as a number) + interest = [] # 4 all user's interest in each post + sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) + post = [] # 6 all messages of each post + feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) + addDate = [] # 8 all dates of each post + + # Finding the topic (should be just one coming from the Listing Page) + + li = soup.find("td", {"class": "thead"}).find('strong') + topic = li.text + topic = re.sub("\[\w*\]", '', topic) + + topic = topic.replace(",","") + topic = topic.replace("\n","") + topic = cleanString(topic.strip()) + + # Finding the repeated tag that corresponds to the listing of posts + + # try: + posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all( + 'div', {"class": "post"}) + + # For each message (post), get all the fields we are interested to: + + for ipost in posts: + + # Finding a first level of the HTML page + + post_wrapper = ipost.find('span', {"class": "largetext"}) + + # Finding the author (user) of the post + + author = post_wrapper.text.strip() + user.append(cleanString(author)) # Remember to clean the problematic characters + + # Finding the status of the author + + smalltext = ipost.find('div', {"class": "post_author"}) + + ''' + # Testing here two possibilities to find this status and combine them + if ipost.find('div', {"class": "deleted_post_author"}): + status.append(-1) + interest.append(-1) + reputation.append(-1) + addDate.append(-1) + post.append("THIS POST HAS BEEN REMOVED!") + sign.append(-1) + feedback.append(-1) + continue + ''' + + # CryptBB does have membergroup and postgroup + + membergroup = smalltext.find('div', {"class": "profile-rank"}) + postgroup = smalltext.find('div', {"class": "postgroup"}) + if membergroup != None: + membergroup = membergroup.text.strip() + if postgroup != None: + postgroup = postgroup.text.strip() + membergroup = membergroup + " - " + postgroup + else: + if postgroup != None: + membergroup = postgroup.text.strip() + else: + membergroup = "-1" + status.append(cleanString(membergroup)) + + # Finding the interest of the author + # CryptBB does not have blurb + blurb = smalltext.find('li', {"class": "blurb"}) + if blurb != None: + blurb = blurb.text.strip() + else: + blurb = "-1" + interest.append(cleanString(blurb)) + + # Finding the reputation of the user + # CryptBB does have reputation + author_stats = smalltext.find('div', {"class": "author_statistics"}) + karma = author_stats.find('strong') + if karma != None: + karma = karma.text + karma = karma.replace("Community Rating: ", "") + karma = karma.replace("Karma: ", "") + karma = karma.strip() + else: + karma = "-1" + reputation.append(cleanString(karma)) + + # Getting here another good tag to find the post date, post content and users' signature + + postarea = ipost.find('div', {"class": "post_content"}) + + dt = postarea.find('span', {"class": "post_date"}).text + # dt = dt.strip().split() + dt = dt.strip() + day=date.today() + if "Yesterday" in dt: + yesterday = day - timedelta(days=1) + yesterday = yesterday.strftime('%m-%d-%Y') + stime = dt.replace('Yesterday,','').strip() + date_time_obj = yesterday+ ', '+stime + date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p') + elif "hours ago" in dt: + day = day.strftime('%m-%d-%Y') + date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title'] + date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p') + else: + date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p') + stime = date_time_obj.strftime('%b %d, %Y') + sdate = date_time_obj.strftime('%I:%M %p') + addDate.append(date_time_obj) + + # Finding the post + + inner = postarea.find('div', {"class": "post_body scaleimages"}) + inner = inner.text.strip() + post.append(cleanString(inner)) + + # Finding the user's signature + + # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"}) + signature = ipost.find('div', {"class": "signature scaleimages"}) + if signature != None: + signature = signature.text.strip() + # print(signature) + else: + signature = "-1" + sign.append(cleanString(signature)) + + # As no information about user's feedback was found, just assign "-1" to the variable + + feedback.append("-1") + + # Populate the final variable (this should be a list with all fields scraped) + + row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) + + # Sending the results + + return row + +# This is the method to parse the Listing Pages (one page with many posts) + +def cryptBB_listing_parser(soup): + + nm = 0 # *this variable should receive the number of topics + forum = "OnniForums" # 0 *forum name + board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. + # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) + author = [] # 2 *all authors of each topic + topic = [] # 3 *all topics + views = [] # 4 number of views of each topic + posts = [] # 5 number of posts of each topic + href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between + # Listing and Description pages) + addDate = [] # 7 when the topic was created (difficult to find) + + # Finding the board (should be just one) + + board = soup.find('span', {"class": "active"}).text + board = cleanString(board.strip()) + + # Finding the repeated tag that corresponds to the listing of topics + + itopics = soup.find_all('tr',{"class": "inline_row"}) + + for itopic in itopics: + + # For each topic found, the structure to get the rest of the information can be of two types. Testing all of them + # to don't miss any topic + + # Adding the topic to the topic list + try: + topics = itopic.find('span', {"class": "subject_old"}).find('a').text + except: + topics = itopic.find('span', {"class": "subject_new"}).find('a').text + topics = re.sub("\[\w*\]", '', topics) + topic.append(cleanString(topics)) + + # Counting how many topics we have found so far + + nm = len(topic) + + # Adding the url to the list of urls + try: + link = itopic.find('span', {"class": "subject_old"}).find('a').get('href') + except: + link = itopic.find('span',{"class": "subject_new"}).find('a').get('href') + href.append(link) + + # Finding the author of the topic + ps = itopic.find('div', {"class":"author smalltext"}).find('a').text + user = ps.strip() + author.append(cleanString(user)) + + # Finding the number of replies + columns = itopic.findChildren('td',recursive=False) + replies = columns[3].text + + posts.append(cleanString(replies)) + + # Finding the number of Views + tview = columns[4].text + views.append(cleanString(tview)) + + # If no information about when the topic was added, just assign "-1" to the variable + + addDate.append("-1") + + return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate) + + +def libre_links_parser(soup): + # Returning all links that should be visited by the Crawler + href = [] + listing = soup.find_all('div', {"class": "flex-grow p-2 text-justify"}) + + for a in listing: + link = a.find('div', {'class': 'flex space-x-2 items-center'}).find('a').get('href') + + href.append(link) + + return href diff --git a/MarketPlaces/TorBay/parser.py b/MarketPlaces/TorBay/parser.py index 9b8481f..2e6aeea 100644 --- a/MarketPlaces/TorBay/parser.py +++ b/MarketPlaces/TorBay/parser.py @@ -14,30 +14,25 @@ from bs4 import BeautifulSoup def torbay_description_parser(soup): # Fields to be parsed - - name = "-1" # 0 Product_Name - describe = "-1" # 1 Product_Description - lastSeen = "-1" # 2 Product_LastViewDate - rules = "-1" # 3 NOT USED ... - CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 5 Product_MS_Classification (Microsoft Security) - review = "-1" # 6 Product_Number_Of_Reviews - category = "-1" # 7 Product_Category - shipFrom = "-1" # 8 Product_ShippedFrom - shipTo = "-1" # 9 Product_ShippedTo - left = "-1" # 10 Product_QuantityLeft - escrow = "-1" # 11 Vendor_Warranty - terms = "-1" # 12 Vendor_TermsAndConditions - vendor = "-1" # 13 Vendor_Name - sold = "-1" # 14 Product_QuantitySold - addDate = "-1" # 15 Product_AddedDate - available = "-1" # 16 NOT USED ... - endDate = "-1" # 17 NOT USED ... - BTC = "-1" # 18 Product_BTC_SellingPrice - USD = "-1" # 19 Product_USD_SellingPrice - rating = "-1" # 20 Vendor_Rating - success = "-1" # 21 Vendor_Successful_Transactions - EURO = "-1" # 22 Product_EURO_SellingPrice + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo # Finding Product Name name = soup.find('div', {'class': 'product-information'}).find('h1').text.strip() @@ -46,7 +41,7 @@ def torbay_description_parser(soup): vendor = soup.find('div', {"class": "profile-info"}).find('a').text.strip() # Finding Vendor Rating - rating.append(-1) + rating_vendor.append(-1) # Finding Successful Transactions success.append(-1) @@ -57,9 +52,6 @@ def torbay_description_parser(soup): USD = soup.find('div', {'class': "total-price"}).find('span').text.strip() - # Finding Escrow NEED WORK - escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip() - # Finding the Product Category category = soup.find('div', {'class': "profile-info"}).find('p').find('a').text.strip() @@ -127,8 +119,8 @@ def torbay_description_parser(soup): MS = MS.replace('\n', '') # Populating the final variable (this should be a list with all fields scraped) - row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor, - sold, addDate, available, endDate, BTC, USD, rating, success, EURO) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo) # Sending the results return row @@ -141,29 +133,28 @@ def torbay_description_parser(soup): def torbay_listing_parser(soup): # Fields to be parsed - nm = 0 # Total_Products (Should be Integer) - mktName = "TorBay" # 0 Marketplace_Name - name = [] # 1 Product_Name - CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 3 Product_MS_Classification (Microsoft Security) - category = [] # 4 Product_Category - describe = [] # 5 Product_Description - escrow = [] # 6 Vendor_Warranty - views = [] # 7 Product_Number_Of_Views - reviews = [] # 8 Product_Number_Of_Reviews - addDate = [] # 9 Product_AddDate - lastSeen = [] # 10 Product_LastViewDate - BTC = [] # 11 Product_BTC_SellingPrice - USD = [] # 12 Product_USD_SellingPrice - EURO = [] # 13 Product_EURO_SellingPrice - sold = [] # 14 Product_QuantitySold - qLeft =[] # 15 Product_QuantityLeft - shipFrom = [] # 16 Product_ShippedFrom - shipTo = [] # 17 Product_ShippedTo - vendor = [] # 18 Vendor - rating = [] # 19 Vendor_Rating - success = [] # 20 Vendor_Successful_Transactions - href = [] # 23 Product_Links (Urls) + nm = 0 # *Total_Products (Should be Integer) + mktName = "TorBay" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + href = [] # 20 Product_Links listing = soup.findAll('div', {"class": "product-card"}) @@ -260,9 +251,8 @@ def torbay_listing_parser(soup): MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen, - BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href) - + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) #called by the crawler to get description links on a listing page #@param: beautifulsoup object that is using the correct html page (listing page)