diff --git a/Forums/AbyssForum/crawler_selenium.py b/Forums/AbyssForum/crawler_selenium.py index 9e53696..573cd13 100644 --- a/Forums/AbyssForum/crawler_selenium.py +++ b/Forums/AbyssForum/crawler_selenium.py @@ -24,8 +24,6 @@ from Forums.Initialization.prepare_parser import new_parse from Forums.AbyssForum.parser import abyssForum_links_parser from Forums.Utilities.utilities import cleanHTML -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 baseURL = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/' @@ -49,6 +47,8 @@ def startCrawling(): # Opens Tor Browser def opentor(): + from Forums.Initialization.forums_mining import config + global pid print("Connecting Tor...") pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) @@ -91,6 +91,8 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from Forums.Initialization.forums_mining import config + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -143,16 +145,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name def getFullPathName(url): + from Forums.Initialization.forums_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - #..\CryptBB\HTML_Pages\\ - fullPath = r'..\AbyssForum\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\AbyssForum\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py index 9670014..20bf4c5 100644 --- a/Forums/Altenens/crawler_selenium.py +++ b/Forums/Altenens/crawler_selenium.py @@ -24,8 +24,6 @@ from Forums.Initialization.prepare_parser import new_parse from Forums.Altenens.parser import altenens_links_parser from Forums.Utilities.utilities import cleanHTML -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 baseURL = 'https://altenens.is/' @@ -49,6 +47,8 @@ def startCrawling(): # Opens Tor Browser def opentor(): + from Forums.Initialization.forums_mining import config + global pid print("Connecting Tor...") pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) @@ -108,6 +108,8 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from Forums.Initialization.forums_mining import config + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -160,15 +162,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name def getFullPathName(url): + from Forums.Initialization.forums_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\\Altenens\\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\\Altenens\\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath diff --git a/Forums/HiddenAnswers/crawler_selenium.py b/Forums/HiddenAnswers/crawler_selenium.py index 34f3c07..66085a3 100644 --- a/Forums/HiddenAnswers/crawler_selenium.py +++ b/Forums/HiddenAnswers/crawler_selenium.py @@ -24,8 +24,6 @@ from Forums.Initialization.prepare_parser import new_parse from Forums.HiddenAnswers.parser import hiddenanswers_links_parser from Forums.Utilities.utilities import cleanHTML -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/' @@ -49,6 +47,8 @@ def startCrawling(): # Opens Tor Browser def opentor(): + from Forums.Initialization.forums_mining import config + global pid print("Connecting Tor...") pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) @@ -91,6 +91,8 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from Forums.Initialization.forums_mining import config + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -143,16 +145,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name def getFullPathName(url): + from Forums.Initialization.forums_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - #..\CryptBB\HTML_Pages\\ - fullPath = r'..\HiddenAnswers\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\HiddenAnswers\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath diff --git a/Forums/OnniForums/crawler_selenium.py b/Forums/OnniForums/crawler_selenium.py index 6e2445b..857993a 100644 --- a/Forums/OnniForums/crawler_selenium.py +++ b/Forums/OnniForums/crawler_selenium.py @@ -25,8 +25,6 @@ from Forums.Initialization.prepare_parser import new_parse from Forums.OnniForums.parser import onniForums_links_parser from Forums.Utilities.utilities import cleanHTML -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/' @@ -50,6 +48,8 @@ def startCrawling(): # Opens Tor Browser def opentor(): + from Forums.Initialization.forums_mining import config + global pid print("Connecting Tor...") pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) @@ -109,6 +109,8 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from Forums.Initialization.forums_mining import config + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -163,16 +165,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name def getFullPathName(url): + from Forums.Initialization.forums_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - #..\CryptBB\HTML_Pages\\ - fullPath = r'..\OnniForums\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\OnniForums\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath @@ -279,15 +279,13 @@ def crawlForum(driver): for i in range(counter): driver.back() - - # comment out, one topic per page - # break - # - # # comment out, go through all pages - # if count == 1: - # count = 0 - # break + break + + # comment out, go through all pages + if count == 1: + count = 0 + break try: temp = driver.find_element(by=By.XPATH, value=