Browse Source

Updated parser code for CryptBB

main
Khoi 1 year ago
parent
commit
b1a69d3e05
13 changed files with 228 additions and 28 deletions
  1. BIN
      Forums/BestCardingWorld/__pycache__/crawler_selenium.cpython-311.pyc
  2. +28
    -0
      Forums/CryptBB/HTML_Pages/06142023/Description/Read/showthreadphptid15446.html
  3. +28
    -0
      Forums/CryptBB/HTML_Pages/06142023/Description/Read/showthreadphptid16545.html
  4. +8
    -0
      Forums/CryptBB/HTML_Pages/06142023/Listing/httpcryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebidonionforumdisplayphpfid88.html
  5. +8
    -0
      Forums/CryptBB/HTML_Pages/06142023/Listing/httpcryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebidonionforumdisplayphpfid88page2.html
  6. BIN
      Forums/CryptBB/__pycache__/crawler_selenium.cpython-311.pyc
  7. BIN
      Forums/CryptBB/__pycache__/parser.cpython-311.pyc
  8. +13
    -13
      Forums/CryptBB/crawler_selenium.py
  9. +1
    -1
      Forums/CryptBB/parser.py
  10. BIN
      Forums/Initialization/__pycache__/prepare_parser.cpython-311.pyc
  11. +1
    -12
      Forums/Initialization/forums_mining.py
  12. +139
    -0
      Forums/Initialization/geckodriver.log
  13. +2
    -2
      Forums/Initialization/prepare_parser.py

BIN
Forums/BestCardingWorld/__pycache__/crawler_selenium.cpython-311.pyc View File


+ 28
- 0
Forums/CryptBB/HTML_Pages/06142023/Description/Read/showthreadphptid15446.html
File diff suppressed because it is too large
View File


+ 28
- 0
Forums/CryptBB/HTML_Pages/06142023/Description/Read/showthreadphptid16545.html
File diff suppressed because it is too large
View File


+ 8
- 0
Forums/CryptBB/HTML_Pages/06142023/Listing/httpcryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebidonionforumdisplayphpfid88.html
File diff suppressed because it is too large
View File


+ 8
- 0
Forums/CryptBB/HTML_Pages/06142023/Listing/httpcryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebidonionforumdisplayphpfid88page2.html
File diff suppressed because it is too large
View File


BIN
Forums/CryptBB/__pycache__/crawler_selenium.cpython-311.pyc View File


BIN
Forums/CryptBB/__pycache__/parser.cpython-311.pyc View File


+ 13
- 13
Forums/CryptBB/crawler_selenium.py View File

@ -29,19 +29,19 @@ baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion
# Opens Tor Browser, crawls the website
def startCrawling():
opentor()
# forumName = getForumName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
# new_parse(forumName, False)
# opentor()
forumName = getForumName()
# driver = getAccess()
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
new_parse(forumName, False)
# Opens Tor Browser


+ 1
- 1
Forums/CryptBB/parser.py View File

@ -351,4 +351,4 @@ def cryptBB_links_parser(soup):
href.append(link)
return href
return href

BIN
Forums/Initialization/__pycache__/prepare_parser.cpython-311.pyc View File


+ 1
- 12
Forums/Initialization/forums_mining.py View File

@ -11,6 +11,7 @@ from Forums.CryptBB.crawler_selenium import crawler as crawlerCryptBB
import time
@ -80,7 +81,6 @@ def createSubdirectories(pagesDir):
if __name__ == '__main__':
print("HelloWorld")
forumsList = getForums()
for forum in forumsList:
@ -95,17 +95,6 @@ if __name__ == '__main__':
crawlerBestCardingWorld()
elif forum == "CryptBB":
crawlerCryptBB()
<<<<<<< HEAD
# elif forum == "DWForums":
# crawlerDWForums()
# elif forum == "Dread":
# crawlerDread()
# elif forum == "Helium":
# crawlerHelium()
# elif forum == "Nulled":
# crawlerNulled()
=======
>>>>>>> 57f679ee7770f9425fa2725f80f96b9c7e0d0794
print("Scraping process completed successfully!")


+ 139
- 0
Forums/Initialization/geckodriver.log View File

@ -1630,3 +1630,142 @@ JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't fin
1686603752801 Marionette INFO Stopped listening on port 52208
JavaScript error: resource://gre/modules/AsyncShutdown.jsm, line 575: uncaught exception: SessionFileInternal.getWriter() called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 718: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
1686700263325 geckodriver INFO Listening on 127.0.0.1:53021
1686700267125 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "53022" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofilew7iA4s"
console.info: SecurityLevel: Listening for messages from NoScript.
console.info: SecurityLevel: Initializing security-prefs.js
console.info: SecurityLevel: security-prefs.js initialization complete
console.info: TorProtocolService: SOCKS port type: TCP
console.info: TorProtocolService: SOCKS host: 127.0.0.1
console.info: TorProtocolService: SOCKS port: 9150
1686700267942 Marionette INFO Marionette enabled
1686700267951 Marionette INFO Listening on port 53029
WebDriver BiDi listening on ws://localhost:53022
console.debug: TorProtocolService:
TorProtocolService initialized
Read port: 53029
console.debug: TorMonitorService:
TorMonitorService initialized
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.error: "getTorFile: cannot get torrc-defaults" (new Error("torrc-defaults file not found: C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\TorBrowser\\Tor\\torrc-defaults", "resource://gre/modules/TorLauncherUtil.jsm", 245))
console.error: TorProcess:
startTor error:
Message: TypeError: torrcDefaultsFile is null
Stack:
_makeArgs@resource://gre/modules/TorProcess.jsm:296:23
start@resource://gre/modules/TorProcess.jsm:81:12
console.error: TorMonitorService:
Tor not running, not starting to monitor it.
1686700268118 RemoteAgent WARN TLS certificate errors will be ignored for this session
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:53022/devtools/browser/c673c878-2958-4481-8950-4fd6f5d3b948
1686700272178 Marionette WARN Ignoring event 'pageshow' because document has an invalid readyState of 'uninitialized'.
1686700272375 Marionette INFO Stopped listening on port 53029
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/AsyncShutdown.jsm, line 575: uncaught exception: SessionFileInternal.getWriter() called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\MINHKH~1\AppData\Local\Temp\rust_mozprofilew7iA4s\thumbnails) because it does not exist
1686700398479 geckodriver INFO Listening on 127.0.0.1:53150
1686700409460 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "53151" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofileUkq06Y"
console.info: SecurityLevel: Listening for messages from NoScript.
console.info: SecurityLevel: Initializing security-prefs.js
console.info: SecurityLevel: security-prefs.js initialization complete
console.info: TorProtocolService: SOCKS port type: TCP
console.info: TorProtocolService: SOCKS host: 127.0.0.1
console.info: TorProtocolService: SOCKS port: 9150
1686700410130 Marionette INFO Marionette enabled
1686700410138 Marionette INFO Listening on port 53161
WebDriver BiDi listening on ws://localhost:53151
Read port: 53161
console.debug: TorProtocolService:
TorProtocolService initialized
console.debug: TorMonitorService:
TorMonitorService initialized
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.error: "getTorFile: cannot get torrc-defaults" (new Error("torrc-defaults file not found: C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\TorBrowser\\Tor\\torrc-defaults", "resource://gre/modules/TorLauncherUtil.jsm", 245))
console.error: TorProcess:
startTor error:
Message: TypeError: torrcDefaultsFile is null
Stack:
_makeArgs@resource://gre/modules/TorProcess.jsm:296:23
start@resource://gre/modules/TorProcess.jsm:81:12
console.error: TorMonitorService:
Tor not running, not starting to monitor it.
1686700410307 RemoteAgent WARN TLS certificate errors will be ignored for this session
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:53151/devtools/browser/2c2722d3-4863-403d-a64a-f78e13ef52f4
console.warn: LoginRecipes: "getRecipes: falling back to a synchronous message for:" "http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion"
console.warn: LoginRecipes: "getRecipes: falling back to a synchronous message for:" "http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion"
JavaScript error: resource://gre/modules/LoginManagerParent.jsm, line 136: TypeError: gRecipeManager is null
console.warn: LoginRecipes: "getRecipes: falling back to a synchronous message for:" "http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion"
JavaScript error: resource://gre/modules/LoginManagerParent.jsm, line 136: TypeError: gRecipeManager is null
console.warn: LoginRecipes: "getRecipes: falling back to a synchronous message for:" "http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion"
JavaScript error: resource://gre/modules/LoginManagerParent.jsm, line 136: TypeError: gRecipeManager is null
1686700505593 Marionette INFO Stopped listening on port 53161
JavaScript error: resource://gre/modules/AsyncShutdown.jsm, line 575: uncaught exception: SessionFileInternal.getWriter() called too early! Please read the session file from disk first.
JavaScript error: resource://gre/modules/PageThumbs.jsm, line 718: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished
[Parent 5020, IPC I/O Parent] WARNING: pipe error: 232: file /var/tmp/build/firefox-cc1bd3d61c87/ipc/chromium/src/chrome/common/ipc_channel_win.cc:554
1686700833340 geckodriver INFO Listening on 127.0.0.1:53267
1686700837481 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "53268" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofileVl5ZBn"
console.info: SecurityLevel: Listening for messages from NoScript.
console.info: SecurityLevel: Initializing security-prefs.js
console.info: SecurityLevel: security-prefs.js initialization complete
console.info: TorProtocolService: SOCKS port type: TCP
console.info: TorProtocolService: SOCKS host: 127.0.0.1
console.info: TorProtocolService: SOCKS port: 9150
1686700838381 Marionette INFO Marionette enabled
1686700838388 Marionette INFO Listening on port 53275
WebDriver BiDi listening on ws://localhost:53268
console.debug: TorProtocolService:
TorProtocolService initialized
console.debug: TorMonitorService:
TorMonitorService initialized
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.error: "getTorFile: cannot get torrc-defaults" (new Error("torrc-defaults file not found: C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\TorBrowser\\Tor\\torrc-defaults", "resource://gre/modules/TorLauncherUtil.jsm", 245))
console.error: TorProcess:
startTor error:
Message: TypeError: torrcDefaultsFile is null
Stack:
_makeArgs@resource://gre/modules/TorProcess.jsm:296:23
start@resource://gre/modules/TorProcess.jsm:81:12
console.error: TorMonitorService:
Tor not running, not starting to monitor it.
Read port: 53275
1686700838658 RemoteAgent WARN TLS certificate errors will be ignored for this session
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:53268/devtools/browser/87406a4b-45a9-4dde-be46-ed420afc3449

+ 2
- 2
Forums/Initialization/prepare_parser.py View File

@ -112,11 +112,11 @@ def new_parse(forum, createLog):
raise SystemExit
# Reading the Listing Html Pages
for fileListing in glob.glob(os.path.join (os.getcwd().replace("Initialization","") + forum + "\\HTML_Pages\\" + str("%02d" %crawlerDate.month) + str("%02d" %crawlerDate.day) + str("%04d" %crawlerDate.year) + "\\Listing" ,'*.html')):
for fileListing in glob.glob(os.path.join (os.getcwd().replace("initialization","") + forum + "\\HTML_Pages\\" + str("%02d" %crawlerDate.month) + str("%02d" %crawlerDate.day) + str("%04d" %crawlerDate.year) + "\\Listing" ,'*.html')):
lines.append(fileListing)
# Reading the Description Html Pages
for fileDescription in glob.glob(os.path.join (os.getcwd().replace("Initialization","") + forum + "\\HTML_Pages\\" + str("%02d" %crawlerDate.month) + str("%02d" %crawlerDate.day) + str("%04d" %crawlerDate.year) + "\\Description" ,'*.html')):
for fileDescription in glob.glob(os.path.join (os.getcwd().replace("initialization","") + forum + "\\HTML_Pages\\" + str("%02d" %crawlerDate.month) + str("%02d" %crawlerDate.day) + str("%04d" %crawlerDate.year) + "\\Description" ,'*.html')):
lns.append(fileDescription)
# Parsing the Description Pages and put the tag's content into a dictionary (Hash table)


|||||||
x
 
000:0
Loading…
Cancel
Save