|
@ -25,8 +25,6 @@ from Forums.Initialization.prepare_parser import new_parse |
|
|
from Forums.OnniForums.parser import onniForums_links_parser |
|
|
from Forums.OnniForums.parser import onniForums_links_parser |
|
|
from Forums.Utilities.utilities import cleanHTML |
|
|
from Forums.Utilities.utilities import cleanHTML |
|
|
|
|
|
|
|
|
config = configparser.ConfigParser() |
|
|
|
|
|
config.read('../../setup.ini') |
|
|
|
|
|
counter = 1 |
|
|
counter = 1 |
|
|
baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/' |
|
|
baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/' |
|
|
|
|
|
|
|
@ -50,6 +48,8 @@ def startCrawling(): |
|
|
|
|
|
|
|
|
# Opens Tor Browser |
|
|
# Opens Tor Browser |
|
|
def opentor(): |
|
|
def opentor(): |
|
|
|
|
|
from Forums.Initialization.forums_mining import config |
|
|
|
|
|
|
|
|
global pid |
|
|
global pid |
|
|
print("Connecting Tor...") |
|
|
print("Connecting Tor...") |
|
|
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) |
|
|
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) |
|
@ -109,6 +109,8 @@ def closetor(driver): |
|
|
# Creates FireFox 'driver' and configure its 'Profile' |
|
|
# Creates FireFox 'driver' and configure its 'Profile' |
|
|
# to use Tor proxy and socket |
|
|
# to use Tor proxy and socket |
|
|
def createFFDriver(): |
|
|
def createFFDriver(): |
|
|
|
|
|
from Forums.Initialization.forums_mining import config |
|
|
|
|
|
|
|
|
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) |
|
|
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) |
|
|
|
|
|
|
|
|
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) |
|
|
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) |
|
@ -163,16 +165,14 @@ def savePage(page, url): |
|
|
|
|
|
|
|
|
# Gets the full path of the page to be saved along with its appropriate file name |
|
|
# Gets the full path of the page to be saved along with its appropriate file name |
|
|
def getFullPathName(url): |
|
|
def getFullPathName(url): |
|
|
|
|
|
from Forums.Initialization.forums_mining import config, CURRENT_DATE |
|
|
|
|
|
|
|
|
|
|
|
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") |
|
|
fileName = getNameFromURL(url) |
|
|
fileName = getNameFromURL(url) |
|
|
if isDescriptionLink(url): |
|
|
if isDescriptionLink(url): |
|
|
#..\CryptBB\HTML_Pages\\ |
|
|
|
|
|
fullPath = r'..\OnniForums\HTML_Pages\\' + str( |
|
|
|
|
|
"%02d" % date.today().month) + str("%02d" % date.today().day) + str( |
|
|
|
|
|
"%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' |
|
|
|
|
|
|
|
|
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') |
|
|
else: |
|
|
else: |
|
|
fullPath = r'..\OnniForums\HTML_Pages\\' + str( |
|
|
|
|
|
"%02d" % date.today().month) + str("%02d" % date.today().day) + str( |
|
|
|
|
|
"%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' |
|
|
|
|
|
|
|
|
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') |
|
|
return fullPath |
|
|
return fullPath |
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -279,15 +279,13 @@ def crawlForum(driver): |
|
|
for i in range(counter): |
|
|
for i in range(counter): |
|
|
driver.back() |
|
|
driver.back() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# comment out, one topic per page |
|
|
# comment out, one topic per page |
|
|
# break |
|
|
|
|
|
# |
|
|
|
|
|
# # comment out, go through all pages |
|
|
|
|
|
# if count == 1: |
|
|
|
|
|
# count = 0 |
|
|
|
|
|
# break |
|
|
|
|
|
|
|
|
break |
|
|
|
|
|
|
|
|
|
|
|
# comment out, go through all pages |
|
|
|
|
|
if count == 1: |
|
|
|
|
|
count = 0 |
|
|
|
|
|
break |
|
|
|
|
|
|
|
|
try: |
|
|
try: |
|
|
temp = driver.find_element(by=By.XPATH, value= |
|
|
temp = driver.find_element(by=By.XPATH, value= |
|
|