Browse Source

completed abyssforum crawler

main
Helium 1 year ago
parent
commit
2963490d6a
4 changed files with 137 additions and 10 deletions
  1. +40
    -7
      Forums/AbyssForum/crawler_selenium.py
  2. +2
    -2
      Forums/AbyssForum/parser.py
  3. +0
    -1
      Forums/Initialization/forumsList.txt
  4. +95
    -0
      Forums/Initialization/geckodriver.log

+ 40
- 7
Forums/AbyssForum/crawler_selenium.py View File

@ -208,6 +208,7 @@ def crawlForum(driver):
savePage(html, link)
has_next_page = True
while has_next_page:
list = topicPages(html)
for item in list:
@ -218,18 +219,49 @@ def crawlForum(driver):
driver.refresh()
savePage(driver.page_source, item)
driver.back()
# comment out, one topic per page
'''
#variable to check if there is a next page for the topic
has_next_topic_page = True
counter = 1
# check if there is a next page for the topics
while has_next_topic_page:
# try to access next page of th topic
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
# if there is a next page then go and save....
# next page in the topic?
try:
temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') # /html/body/div/div[2]/div/div[2]/div/
item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div
if item == "":
raise NoSuchElementException
has_next_topic_page = False
else:
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# end of loop
for i in range(counter):
driver.back()
'''
# comment out
break
# comment out, go through all pages
# comment out
if count == 1:
count = 0
break
count = 0
break
try:
temp = driver.find_element(by=By.XPATH, value=
'/html/body/div/div[2]/div/div[3]/div') # /html/body/div/div[2]/div/div[3]/div
link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
@ -251,6 +283,7 @@ def crawlForum(driver):
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling AbyssForum forum done sucessfully. Press ENTER to continue\n")


+ 2
- 2
Forums/AbyssForum/parser.py View File

@ -342,10 +342,10 @@ def abyssForum_links_parser(soup):
href = []
#print(soup.find('table', {"class": "tborder clear"}).find(
# 'tbody').find_all('tr', {"class": "inline_row"}))
listing = soup.find_all('tr', {"class": "list-inner"})
listing = soup.find_all('dl', {"class": "row-item topic_read"})
for a in listing:
link = a.find('a', {"class": "topictitle"}).get('href')
link = a.find('div', {"class": "list-inner"}).find('a').get('href')
href.append(link)

+ 0
- 1
Forums/Initialization/forumsList.txt View File

@ -1,2 +1 @@
OnniForums
AbyssForum

+ 95
- 0
Forums/Initialization/geckodriver.log View File

@ -5631,3 +5631,98 @@ JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: ""
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS" was already initialized
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID" was already initialized
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: ""
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: ""
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS" was already initialized
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: key "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID" was already initialized
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: ""
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: ""
JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}"
1687304344006 Marionette INFO Stopped listening on port 51875
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
[Parent 8700, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687304344268 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687358079328 geckodriver INFO Listening on 127.0.0.1:50041
1687358085334 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "50042" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileUuDmLT"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687358086786 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
JavaScript error: resource://gre/modules/ExtensionContent.jsm, line 575: TypeError: PrecompiledScript.executeInGlobal: Argument 1 is not an object.
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:50042/devtools/browser/69d941a8-6fb6-4ff4-b6a0-dd37e13b70d0
1687358088685 Marionette INFO Listening on port 50056
1687358089153 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687358109988 Marionette INFO Stopped listening on port 50056
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileUuDmLT\thumbnails) because it does not exist
1687358110790 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1687358514267 geckodriver INFO Listening on 127.0.0.1:50841
1687358516175 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "50842" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofileHvl1ks"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1687358516555 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:50842/devtools/browser/32fc545b-0ce6-4daf-9fb1-b239ce500a0d
1687358517502 Marionette INFO Listening on port 50847
1687358517524 RemoteAgent WARN TLS certificate errors will be ignored for this session
1687358538772 Marionette INFO Stopped listening on port 50847
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofileHvl1ks\thumbnails) because it does not exist
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1687358539082 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138

Loading…
Cancel
Save