Browse Source

save html to shared folder rest of forums edits

main
westernmeadow 1 year ago
parent
commit
3d30e70e3f
4 changed files with 42 additions and 43 deletions
  1. +9
    -9
      Forums/AbyssForum/crawler_selenium.py
  2. +9
    -8
      Forums/Altenens/crawler_selenium.py
  3. +9
    -9
      Forums/HiddenAnswers/crawler_selenium.py
  4. +15
    -17
      Forums/OnniForums/crawler_selenium.py

+ 9
- 9
Forums/AbyssForum/crawler_selenium.py View File

@ -24,8 +24,6 @@ from Forums.Initialization.prepare_parser import new_parse
from Forums.AbyssForum.parser import abyssForum_links_parser from Forums.AbyssForum.parser import abyssForum_links_parser
from Forums.Utilities.utilities import cleanHTML from Forums.Utilities.utilities import cleanHTML
config = configparser.ConfigParser()
config.read('../../setup.ini')
counter = 1 counter = 1
baseURL = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/' baseURL = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/'
@ -49,6 +47,8 @@ def startCrawling():
# Opens Tor Browser # Opens Tor Browser
def opentor(): def opentor():
from Forums.Initialization.forums_mining import config
global pid global pid
print("Connecting Tor...") print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
@ -91,6 +91,8 @@ def closetor(driver):
# Creates FireFox 'driver' and configure its 'Profile' # Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket # to use Tor proxy and socket
def createFFDriver(): def createFFDriver():
from Forums.Initialization.forums_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@ -143,16 +145,14 @@ def savePage(page, url):
# Gets the full path of the page to be saved along with its appropriate file name # Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url): def getFullPathName(url):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
fileName = getNameFromURL(url) fileName = getNameFromURL(url)
if isDescriptionLink(url): if isDescriptionLink(url):
#..\CryptBB\HTML_Pages\\
fullPath = r'..\AbyssForum\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else: else:
fullPath = r'..\AbyssForum\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath return fullPath


+ 9
- 8
Forums/Altenens/crawler_selenium.py View File

@ -24,8 +24,6 @@ from Forums.Initialization.prepare_parser import new_parse
from Forums.Altenens.parser import altenens_links_parser from Forums.Altenens.parser import altenens_links_parser
from Forums.Utilities.utilities import cleanHTML from Forums.Utilities.utilities import cleanHTML
config = configparser.ConfigParser()
config.read('../../setup.ini')
counter = 1 counter = 1
baseURL = 'https://altenens.is/' baseURL = 'https://altenens.is/'
@ -49,6 +47,8 @@ def startCrawling():
# Opens Tor Browser # Opens Tor Browser
def opentor(): def opentor():
from Forums.Initialization.forums_mining import config
global pid global pid
print("Connecting Tor...") print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
@ -108,6 +108,8 @@ def closetor(driver):
# Creates FireFox 'driver' and configure its 'Profile' # Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket # to use Tor proxy and socket
def createFFDriver(): def createFFDriver():
from Forums.Initialization.forums_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@ -160,15 +162,14 @@ def savePage(page, url):
# Gets the full path of the page to be saved along with its appropriate file name # Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url): def getFullPathName(url):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
fileName = getNameFromURL(url) fileName = getNameFromURL(url)
if isDescriptionLink(url): if isDescriptionLink(url):
fullPath = r'..\\Altenens\\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else: else:
fullPath = r'..\\Altenens\\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath return fullPath


+ 9
- 9
Forums/HiddenAnswers/crawler_selenium.py View File

@ -24,8 +24,6 @@ from Forums.Initialization.prepare_parser import new_parse
from Forums.HiddenAnswers.parser import hiddenanswers_links_parser from Forums.HiddenAnswers.parser import hiddenanswers_links_parser
from Forums.Utilities.utilities import cleanHTML from Forums.Utilities.utilities import cleanHTML
config = configparser.ConfigParser()
config.read('../../setup.ini')
counter = 1 counter = 1
baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/' baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/'
@ -49,6 +47,8 @@ def startCrawling():
# Opens Tor Browser # Opens Tor Browser
def opentor(): def opentor():
from Forums.Initialization.forums_mining import config
global pid global pid
print("Connecting Tor...") print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
@ -91,6 +91,8 @@ def closetor(driver):
# Creates FireFox 'driver' and configure its 'Profile' # Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket # to use Tor proxy and socket
def createFFDriver(): def createFFDriver():
from Forums.Initialization.forums_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@ -143,16 +145,14 @@ def savePage(page, url):
# Gets the full path of the page to be saved along with its appropriate file name # Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url): def getFullPathName(url):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
fileName = getNameFromURL(url) fileName = getNameFromURL(url)
if isDescriptionLink(url): if isDescriptionLink(url):
#..\CryptBB\HTML_Pages\\
fullPath = r'..\HiddenAnswers\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else: else:
fullPath = r'..\HiddenAnswers\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath return fullPath


+ 15
- 17
Forums/OnniForums/crawler_selenium.py View File

@ -25,8 +25,6 @@ from Forums.Initialization.prepare_parser import new_parse
from Forums.OnniForums.parser import onniForums_links_parser from Forums.OnniForums.parser import onniForums_links_parser
from Forums.Utilities.utilities import cleanHTML from Forums.Utilities.utilities import cleanHTML
config = configparser.ConfigParser()
config.read('../../setup.ini')
counter = 1 counter = 1
baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/' baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/'
@ -50,6 +48,8 @@ def startCrawling():
# Opens Tor Browser # Opens Tor Browser
def opentor(): def opentor():
from Forums.Initialization.forums_mining import config
global pid global pid
print("Connecting Tor...") print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
@ -109,6 +109,8 @@ def closetor(driver):
# Creates FireFox 'driver' and configure its 'Profile' # Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket # to use Tor proxy and socket
def createFFDriver(): def createFFDriver():
from Forums.Initialization.forums_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@ -163,16 +165,14 @@ def savePage(page, url):
# Gets the full path of the page to be saved along with its appropriate file name # Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url): def getFullPathName(url):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
fileName = getNameFromURL(url) fileName = getNameFromURL(url)
if isDescriptionLink(url): if isDescriptionLink(url):
#..\CryptBB\HTML_Pages\\
fullPath = r'..\OnniForums\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else: else:
fullPath = r'..\OnniForums\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath return fullPath
@ -279,15 +279,13 @@ def crawlForum(driver):
for i in range(counter): for i in range(counter):
driver.back() driver.back()
# comment out, one topic per page # comment out, one topic per page
# break
#
# # comment out, go through all pages
# if count == 1:
# count = 0
# break
break
# comment out, go through all pages
if count == 1:
count = 0
break
try: try:
temp = driver.find_element(by=By.XPATH, value= temp = driver.find_element(by=By.XPATH, value=


Loading…
Cancel
Save