diff --git a/Forums/AbyssForum/crawler_selenium.py b/Forums/AbyssForum/crawler_selenium.py index 7fe1184..03f48dc 100644 --- a/Forums/AbyssForum/crawler_selenium.py +++ b/Forums/AbyssForum/crawler_selenium.py @@ -208,6 +208,7 @@ def crawlForum(driver): savePage(html, link) has_next_page = True + while has_next_page: list = topicPages(html) for item in list: @@ -218,18 +219,49 @@ def crawlForum(driver): driver.refresh() savePage(driver.page_source, item) driver.back() - # comment out, one topic per page + ''' + #variable to check if there is a next page for the topic + has_next_topic_page = True + counter = 1 + + # check if there is a next page for the topics + while has_next_topic_page: + # try to access next page of th topic + itemURL = urlparse.urljoin(baseURL, str(item)) + try: + driver.get(itemURL) + except: + driver.refresh() + savePage(driver.page_source, item) + + # if there is a next page then go and save.... + # next page in the topic? + try: + temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') # /html/body/div/div[2]/div/div[2]/div/ + item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div + + if item == "": + raise NoSuchElementException + has_next_topic_page = False + else: + counter += 1 + except NoSuchElementException: + has_next_topic_page = False + + # end of loop + for i in range(counter): + driver.back() + ''' + # comment out break - # comment out, go through all pages + # comment out if count == 1: - count = 0 - break + count = 0 + break try: - temp = driver.find_element(by=By.XPATH, value= - '/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[3]/div - link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') + link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href') if link == "": raise NoSuchElementException @@ -251,6 +283,7 @@ def crawlForum(driver): # finalTime = time.time() # print finalTime - initialTime + input("Crawling AbyssForum forum done sucessfully. Press ENTER to continue\n") diff --git a/Forums/AbyssForum/parser.py b/Forums/AbyssForum/parser.py index 49f8e37..2b34933 100644 --- a/Forums/AbyssForum/parser.py +++ b/Forums/AbyssForum/parser.py @@ -342,10 +342,10 @@ def abyssForum_links_parser(soup): href = [] #print(soup.find('table', {"class": "tborder clear"}).find( # 'tbody').find_all('tr', {"class": "inline_row"})) - listing = soup.find_all('tr', {"class": "list-inner"}) + listing = soup.find_all('dl', {"class": "row-item topic_read"}) for a in listing: - link = a.find('a', {"class": "topictitle"}).get('href') + link = a.find('div', {"class": "list-inner"}).find('a').get('href') href.append(link) diff --git a/Forums/Initialization/forumsList.txt b/Forums/Initialization/forumsList.txt index cdfc2c4..af264a2 100644 --- a/Forums/Initialization/forumsList.txt +++ b/Forums/Initialization/forumsList.txt @@ -1,2 +1 @@ -OnniForums AbyssForum \ No newline at end of file diff --git a/Forums/Initialization/geckodriver.log b/Forums/Initialization/geckodriver.log index 622aee0..69aa452 100644 --- a/Forums/Initialization/geckodriver.log +++ b/Forums/Initialization/geckodriver.log @@ -5631,3 +5631,98 @@ JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}" JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: "" JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}" +JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS" was already initialized +JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID" was already initialized +JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: "" +JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}" +JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: "" +JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}" +JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS" was already initialized +JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID" was already initialized +JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: "" +JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}" +JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: "" +JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}" +1687304344006 Marionette INFO Stopped listening on port 51875 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished +[Parent 8700, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167 + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1687304344268 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1687358079328 geckodriver INFO Listening on 127.0.0.1:50041 +1687358085334 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "50042" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileUuDmLT" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1687358086786 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +JavaScript error: resource://gre/modules/ExtensionContent.jsm, line 575: TypeError: PrecompiledScript.executeInGlobal: Argument 1 is not an object. +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:50042/devtools/browser/69d941a8-6fb6-4ff4-b6a0-dd37e13b70d0 +1687358088685 Marionette INFO Listening on port 50056 +1687358089153 RemoteAgent WARN TLS certificate errors will be ignored for this session +1687358109988 Marionette INFO Stopped listening on port 50056 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileUuDmLT\thumbnails) because it does not exist +1687358110790 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1687358514267 geckodriver INFO Listening on 127.0.0.1:50841 +1687358516175 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "50842" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileHvl1ks" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1687358516555 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:50842/devtools/browser/32fc545b-0ce6-4daf-9fb1-b239ce500a0d +1687358517502 Marionette INFO Listening on port 50847 +1687358517524 RemoteAgent WARN TLS certificate errors will be ignored for this session +1687358538772 Marionette INFO Stopped listening on port 50847 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileHvl1ks\thumbnails) because it does not exist + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1687358539082 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138